124 lines
5.0 KiB
Bash
124 lines
5.0 KiB
Bash
|
|
#!/bin/bash
|
|||
|
|
set -euo pipefail
|
|||
|
|
|
|||
|
|
# ==================== Configuration ====================
# Each scalar keeps its previous hard-coded value as the default but can now be
# overridden from the environment, e.g.:
#   TEST_END=512MB LOG_FILE=/tmp/nccl.log ./this_script.sh
TEST_BEGIN="${TEST_BEGIN:-1MB}"                  # first message size of the sweep
TEST_END="${TEST_END:-1GB}"                      # last message size of the sweep
TEST_FACTOR="${TEST_FACTOR:-2}"                  # multiplication factor between sizes
GPU_PER_PROC="${GPU_PER_PROC:-1}"                # number of GPUs bound to each process
# '-' rather than ':-' on purpose: an explicitly empty MPI_ARGS means
# "pass no extra mpirun flags" instead of falling back to the default.
MPI_ARGS=${MPI_ARGS-"--allow-run-as-root"}       # extra mpirun arguments
ALL_GPUS=(0 1 2 3 4 5 6 7)                       # indices of every GPU to be tested
LOG_FILE="${LOG_FILE:-gpu_all_reduce_test.log}"  # test log file

# ==================== Runtime state ====================
declare -a AVAILABLE_GPUS=()  # GPUs that have passed so far (faulty cards excluded)
declare -a FAULTY_GPUS=()     # faulty card records, format: index:BUS-ID:serial
CURRENT_TEST_INDEX=0          # position in ALL_GPUS of the card under test
|
|||
|
|
|
|||
|
|
# ==================== 工具函数 ====================
|
|||
|
|
#######################################
# 1. Verify that every external command used by this script can be resolved.
# All missing commands are reported before giving up, not only the first one.
# Globals:   none
# Arguments: none
# Outputs:   one error line per missing command on stderr;
#            a confirmation line on stdout when nothing is missing
# Returns:   0 when every command is found; otherwise exits the shell with 1
#######################################
check_dependencies() {
  local -a deps=("nvidia-smi" "mpirun" "all_reduce_perf")
  local dep          # kept local so the loop does not clobber a caller's $dep
  local missing=0

  for dep in "${deps[@]}"; do
    if ! command -v "$dep" > /dev/null 2>&1; then
      echo "❌ 错误:未找到依赖命令 $dep,请先安装或配置环境变量" >&2
      missing=1
    fi
  done

  if (( missing )); then
    exit 1
  fi

  echo "✅ 所有依赖命令已就绪"
}
|
|||
|
|
|
|||
|
|
#######################################
# 2. Print the PCI bus id of one GPU.
# The previous implementation parsed `nvidia-smi -L`, whose lines start with
# the word "GPU" and contain no bus id, so it always printed nothing. The
# structured query interface is used instead.
# Arguments: $1 - GPU index as understood by `nvidia-smi -i`
# Outputs:   the bus id on stdout (nothing when the query yields no line)
# Returns:   status of the nvidia-smi/awk pipeline
#######################################
get_gpu_busid() {
  local gpu_idx=$1
  # awk trims the padding nvidia-smi may put around CSV fields.
  nvidia-smi --query-gpu=pci.bus_id --format=csv,noheader -i "$gpu_idx" \
    | awk 'NF { print $1 }'
}
|
|||
|
|
|
|||
|
|
#######################################
# 3. Print the board serial number of one GPU.
# The previous `grep -A1 ... | grep -v ...` chain discarded the
# "Serial Number" line itself and printed the first word of the line after
# it. The structured query interface returns the value directly.
# Arguments: $1 - GPU index as understood by `nvidia-smi -i`
# Outputs:   the serial value reported by nvidia-smi on stdout
# Returns:   status of the nvidia-smi/awk pipeline
#######################################
get_gpu_sn() {
  local gpu_idx=$1
  # awk trims the padding nvidia-smi may put around CSV fields.
  nvidia-smi --query-gpu=serial --format=csv,noheader -i "$gpu_idx" \
    | awk 'NF { print $1 }'
}
|
|||
|
|
|
|||
|
|
#######################################
# 4. Run one all_reduce_perf round on the given set of GPUs.
# Globals:   MPI_ARGS, TEST_BEGIN, TEST_END, TEST_FACTOR, GPU_PER_PROC,
#            LOG_FILE (all read)
# Arguments: GPU indices to test, one per argument (at least one)
# Outputs:   banner and benchmark output on stdout; benchmark output is also
#            appended to LOG_FILE
# Returns:   exit status of mpirun (0 = pass, non-zero = fail);
#            2 when called without any GPU index
#######################################
run_all_reduce_test() {
  local -a test_gpus=("$@")
  local gpu_count=${#test_gpus[@]}
  local gpu_str
  local -a stage_status=()

  if (( gpu_count == 0 )); then
    echo "❌ 错误:未指定待测试的GPU" >&2
    return 2
  fi

  # Join the indices with commas inside a subshell so the IFS change cannot
  # affect the word splitting of MPI_ARGS below.
  gpu_str=$(IFS=,; printf '%s' "${test_gpus[*]}")

  echo -e "\n=================================================="
  echo "📌 开始测试:当前测试GPU组合 = [${gpu_str}](共${gpu_count}张卡)"
  echo "=================================================="

  # Run the benchmark, showing its output and appending it to the log.
  # MPI_ARGS is intentionally unquoted: it may hold several flags.
  # The "&& ... || ..." pair records PIPESTATUS on both outcomes and keeps a
  # failing pipeline from aborting the script under `set -e`.
  # shellcheck disable=SC2086,SC2015
  NCCL_DEBUG=INFO CUDA_VISIBLE_DEVICES="${gpu_str}" \
    mpirun ${MPI_ARGS} -n "${gpu_count}" all_reduce_perf \
      -b "${TEST_BEGIN}" -e "${TEST_END}" -f "${TEST_FACTOR}" -g "${GPU_PER_PROC}" \
      2>&1 | tee -a "${LOG_FILE}" \
    && stage_status=("${PIPESTATUS[@]}") \
    || stage_status=("${PIPESTATUS[@]}")

  # Report mpirun's own status rather than tee's, so the result does not
  # depend on the caller having enabled `pipefail`.
  return "${stage_status[0]}"
}
|
|||
|
|
|
|||
|
|
# ==================== 主流程 ====================
|
|||
|
|
#######################################
# Main flow: grow the set of known-good GPUs one card at a time.
# Each card from ALL_GPUS is tested together with the cards that already
# passed. A card whose round fails is recorded as faulty and left out.
# Globals:   ALL_GPUS, LOG_FILE, TEST_BEGIN, TEST_END, TEST_FACTOR (read);
#            AVAILABLE_GPUS, FAULTY_GPUS, CURRENT_TEST_INDEX (written)
# Arguments: none used
# Outputs:   progress and the final summary on stdout; LOG_FILE is truncated
#            and then receives per-card result lines
# Returns:   0 after the summary is printed
#######################################
main() {
  local current_gpu test_rc
  local busid="" sn=""
  local faulty idx rest
  local -a temp_test_gpus=()

  # 1. Start a fresh log with the run header.
  echo "📅 测试开始时间:$(date '+%Y-%m-%d %H:%M:%S')" > "${LOG_FILE}"
  echo "🔧 测试配置:BEGIN=${TEST_BEGIN}, END=${TEST_END}, FACTOR=${TEST_FACTOR}" >> "${LOG_FILE}"

  # 2. Make sure the required tools exist.
  check_dependencies

  # 3. Incremental test: add the cards of ALL_GPUS one by one.
  while (( CURRENT_TEST_INDEX < ${#ALL_GPUS[@]} )); do
    current_gpu=${ALL_GPUS[CURRENT_TEST_INDEX]}

    # Candidate set = cards that already passed + the card under test.
    # The ${arr[@]+...} form keeps an empty array from tripping `set -u`
    # on bash releases older than 4.4.
    temp_test_gpus=(${AVAILABLE_GPUS[@]+"${AVAILABLE_GPUS[@]}"} "${current_gpu}")

    test_rc=0
    run_all_reduce_test "${temp_test_gpus[@]}" || test_rc=$?

    # Identity lookup is best effort: a failing nvidia-smi must not abort
    # the whole run, it only leaves the field empty.
    busid=$(get_gpu_busid "${current_gpu}") || true
    sn=$(get_gpu_sn "${current_gpu}") || true

    if (( test_rc == 0 )); then
      # Passed: the card joins the available list.
      AVAILABLE_GPUS=("${temp_test_gpus[@]}")
      echo -e "✅ 测试成功:GPU ${current_gpu} 加入可用列表"
      echo "✅ GPU ${current_gpu} 测试成功(BUS-ID: ${busid}, SN: ${sn})" >> "${LOG_FILE}"
    else
      # Failed: remember the card's identity and leave it out.
      FAULTY_GPUS+=("${current_gpu}:${busid}:${sn}")
      echo -e "❌ 测试失败:GPU ${current_gpu} 为故障卡,已跳过"
      echo "❌ GPU ${current_gpu} 测试失败(BUS-ID: ${busid}, SN: ${sn})" >> "${LOG_FILE}"
    fi

    # Move on to the next card.
    CURRENT_TEST_INDEX=$((CURRENT_TEST_INDEX + 1))
  done

  # 4. Final summary.
  echo -e "\n=================================================="
  echo "📊 测试完成!最终汇总结果"
  echo "=================================================="
  echo "✅ 可用GPU数量:${#AVAILABLE_GPUS[@]} 张"
  echo "✅ 可用GPU列表:[${AVAILABLE_GPUS[*]-}]"
  echo -e "\n❌ 故障GPU数量:${#FAULTY_GPUS[@]} 张"

  if (( ${#FAULTY_GPUS[@]} > 0 )); then
    echo "❌ 故障GPU详情:"
    echo "--------------------------------------------------"
    echo -e "序号\tBUS-ID\t\tSN号"
    echo "--------------------------------------------------"
    for faulty in "${FAULTY_GPUS[@]}"; do
      # Record layout is index:BUS-ID:serial, but a PCI bus id itself
      # contains colons, so peel the index off the front and the serial off
      # the back instead of splitting on every ':'.
      # Assumes the serial contains no colon.
      idx=${faulty%%:*}
      rest=${faulty#*:}
      sn=${rest##*:}
      busid=${rest%:*}
      printf '%s\t%s\t%s\n' "${idx}" "${busid}" "${sn}"
    done
  else
    echo "❌ 无故障GPU,所有卡测试通过!"
  fi

  echo -e "\n📄 详细测试日志已保存到:${LOG_FILE}"
  echo "📅 测试结束时间:$(date '+%Y-%m-%d %H:%M:%S')"
}
|
|||
|
|
|
|||
|
|
# Start the main flow, forwarding any command-line arguments.
main "$@"
|
|||
|
|
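# Reference: manual equivalent of one full 8-GPU round. There must be no ';'
# after NCCL_DEBUG=INFO, otherwise the variable is not passed on to mpirun.
# NCCL_DEBUG=INFO CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 mpirun -n 8 --allow-run-as-root all_reduce_perf -b 1MB -e 1GB -f 2 -g 1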
|