diff --git a/scripts/local_allreduce_gpu.sh b/scripts/local_allreduce_gpu.sh
new file mode 100644
index 0000000..13ec7c6
--- /dev/null
+++ b/scripts/local_allreduce_gpu.sh
@@ -0,0 +1,124 @@
+#!/bin/bash
+set -euo pipefail
+
+# ==================== Configuration (adjust as needed) ====================
+TEST_BEGIN="1MB"                    # Starting message size
+TEST_END="1GB"                      # Ending message size
+TEST_FACTOR="2"                     # Message size multiplication factor
+GPU_PER_PROC="1"                    # GPUs bound to each process
+MPI_ARGS="--allow-run-as-root"      # Extra mpirun arguments
+ALL_GPUS=(0 1 2 3 4 5 6 7)          # Indices of all GPUs to test
+LOG_FILE="gpu_all_reduce_test.log"  # Test log file
+
+# ==================== State variables ====================
+declare -a AVAILABLE_GPUS=()        # GPUs that have passed so far (faulty cards excluded)
+declare -a FAULTY_GPUS=()           # Faulty card records (format: index:BUS-ID:serial)
+CURRENT_TEST_INDEX=0                # Index of the GPU under test (starts at ALL_GPUS[0])
+
+# ==================== Helper functions ====================
+# 1. Check that all required commands exist
+check_dependencies() {
+    local deps=("nvidia-smi" "mpirun" "all_reduce_perf")
+    for dep in "${deps[@]}"; do
+        if ! command -v "$dep" &> /dev/null; then
+            echo "❌ Error: required command $dep not found; install it or fix your PATH"
+            exit 1
+        fi
+    done
+    echo "✅ All required commands are available"
+}
+
+# 2. Get the PCI bus ID of the given GPU
+get_gpu_busid() {
+    local gpu_idx=$1
+    nvidia-smi --query-gpu=pci.bus_id --format=csv,noheader -i "${gpu_idx}"
+}
+
+# 3. Get the serial number of the given GPU
+get_gpu_sn() {
+    local gpu_idx=$1
+    nvidia-smi --query-gpu=serial --format=csv,noheader -i "${gpu_idx}"
+}
+
+# 4. Run one all_reduce test round over the given GPU set
+run_all_reduce_test() {
+    local test_gpus=("$@")
+    local gpu_count=${#test_gpus[@]}
+    local gpu_str=$(IFS=,; echo "${test_gpus[*]}")
+
+    echo -e "\n=================================================="
+    echo "📌 Starting test: GPU set = [${gpu_str}] (${gpu_count} cards)"
+    echo "=================================================="
+
+    # Run the test (append output to the log file while also printing to the terminal)
+    NCCL_DEBUG=INFO CUDA_VISIBLE_DEVICES="${gpu_str}" mpirun ${MPI_ARGS} -n "${gpu_count}" all_reduce_perf \
+        -b "${TEST_BEGIN}" -e "${TEST_END}" -f "${TEST_FACTOR}" -g "${GPU_PER_PROC}" \
+        2>&1 | tee -a "${LOG_FILE}"
+
+    # Return the test result (0 = success, non-zero = failure); pipefail makes the pipeline status reflect mpirun
+    return $?
+}
+
+# ==================== Main flow ====================
+main() {
+    # 1. Initialize the log
+    echo "📅 Test start time: $(date '+%Y-%m-%d %H:%M:%S')" > "${LOG_FILE}"
+    echo "🔧 Test configuration: BEGIN=${TEST_BEGIN}, END=${TEST_END}, FACTOR=${TEST_FACTOR}" >> "${LOG_FILE}"
+
+    # 2. Check dependencies
+    check_dependencies
+
+    # 3. Incremental per-card test (add cards from ALL_GPUS one at a time)
+    while [ ${CURRENT_TEST_INDEX} -lt ${#ALL_GPUS[@]} ]; do
+        local current_gpu=${ALL_GPUS[${CURRENT_TEST_INDEX}]}
+
+        # Tentatively add the current card to the test set
+        local temp_test_gpus=("${AVAILABLE_GPUS[@]}" "${current_gpu}")
+
+        # Run the test
+        if run_all_reduce_test "${temp_test_gpus[@]}"; then
+            # Test passed: keep the current card in the good list
+            AVAILABLE_GPUS=("${temp_test_gpus[@]}")
+            echo -e "✅ Test passed: GPU ${current_gpu} added to the good list"
+            echo "✅ GPU ${current_gpu} passed (BUS-ID: $(get_gpu_busid ${current_gpu}), SN: $(get_gpu_sn ${current_gpu}))" >> "${LOG_FILE}"
+        else
+            # Test failed: record the faulty card and do not add it to the good list
+            local busid=$(get_gpu_busid ${current_gpu})
+            local sn=$(get_gpu_sn ${current_gpu})
+            FAULTY_GPUS+=("${current_gpu}:${busid}:${sn}")
+            echo -e "❌ Test failed: GPU ${current_gpu} flagged as faulty and skipped"
+            echo "❌ GPU ${current_gpu} failed (BUS-ID: ${busid}, SN: ${sn})" >> "${LOG_FILE}"
+        fi
+
+        # Move on to the next card
+        CURRENT_TEST_INDEX=$((CURRENT_TEST_INDEX + 1))
+    done
+
+    # 4. Print the final summary
+    echo -e "\n=================================================="
+    echo "📊 Testing complete! Final summary"
+    echo "=================================================="
+    echo "✅ Good GPU count: ${#AVAILABLE_GPUS[@]}"
+    echo "✅ Good GPU list: [${AVAILABLE_GPUS[*]}]"
+    echo -e "\n❌ Faulty GPU count: ${#FAULTY_GPUS[@]}"
+
+    if [ ${#FAULTY_GPUS[@]} -gt 0 ]; then
+        echo "❌ Faulty GPU details:"
+        echo "--------------------------------------------------"
+        echo -e "Index\tBUS-ID\t\tSerial"
+        echo "--------------------------------------------------"
+        for faulty in "${FAULTY_GPUS[@]}"; do
+            IFS=':' read -r idx busid sn <<< "$faulty"
+            echo -e "${idx}\t${busid}\t${sn}"
+        done
+    else
+        echo "✅ No faulty GPUs; all cards passed!"
+    fi
+
+    echo -e "\n📄 Full test log saved to: ${LOG_FILE}"
+    echo "📅 Test end time: $(date '+%Y-%m-%d %H:%M:%S')"
}
+
+# Run the main flow
+main
+# Reference one-shot command: NCCL_DEBUG=INFO CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 mpirun -n 8 --allow-run-as-root all_reduce_perf -b 1MB -e 1GB -f 2 -g 1
\ No newline at end of file
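Usage sketch (assumptions not stated in the diff: a single-node run from the repository root, with nccl-tests' all_reduce_perf and mpirun already on PATH, which the script's dependency check enforces; the --allow-run-as-root in MPI_ARGS is an Open MPI flag and only matters when the script is launched as root):

    chmod +x scripts/local_allreduce_gpu.sh
    ./scripts/local_allreduce_gpu.sh
    tail -f gpu_all_reduce_test.log    # per-round NCCL / all_reduce_perf output accumulates here

Note on the design: each round adds exactly one GPU from ALL_GPUS to the previously passing set, and a failing round attributes the failure to the newly added card only. A failure caused by the interconnect between two otherwise healthy cards would therefore also be recorded against the new card, so flagged cards may warrant a manual re-test in isolation.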