ansible-devops/scripts/local_allreduce_gpu.sh

124 lines
5.0 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
set -o errexit -o nounset -o pipefail
# ==================== Configuration (adjust to your environment) ====================
# Message-size sweep passed to all_reduce_perf (-b / -e / -f).
TEST_BEGIN=1MB
TEST_END=1GB
TEST_FACTOR=2
# Number of GPUs bound to each process (all_reduce_perf -g).
GPU_PER_PROC=1
# Extra arguments handed to mpirun.
MPI_ARGS=--allow-run-as-root
# Indices of every GPU to be tested.
ALL_GPUS=({0..7})
# File that collects the test log.
LOG_FILE=gpu_all_reduce_test.log
# ==================== Initial state ====================
# GPUs that have passed so far (faulty cards excluded).
AVAILABLE_GPUS=()
# Faulty-card records, one per entry, formatted as index:BUS-ID:serial.
FAULTY_GPUS=()
# Position in ALL_GPUS of the GPU currently under test (starts at ALL_GPUS[0]).
CURRENT_TEST_INDEX=0
# ==================== Utility functions ====================
# 1. Verify that every external command the script relies on is available.
# Globals:   none
# Arguments: none
# Outputs:   a confirmation line on stdout when everything is present;
#            one error line per missing command on stderr otherwise
# Returns:   0 on success; terminates the script with exit status 1 when at
#            least one command is missing
check_dependencies() {
  local -a deps=("nvidia-smi" "mpirun" "all_reduce_perf")
  local -a missing=()
  local dep

  for dep in "${deps[@]}"; do
    if ! command -v "${dep}" &> /dev/null; then
      missing+=("${dep}")
    fi
  done

  # Report all missing commands in one pass so the user does not have to
  # fix and re-run once per tool.
  if (( ${#missing[@]} > 0 )); then
    for dep in "${missing[@]}"; do
      echo "❌ 错误:未找到依赖命令 ${dep},请先安装或配置环境变量" >&2
    done
    exit 1
  fi

  echo "✅ 所有依赖命令已就绪"
}
# 2. Print the PCI bus id of one GPU.
# Arguments: $1 - GPU index as understood by `nvidia-smi -i`
# Outputs:   the bus id (e.g. 00000000:3B:00.0) on stdout
# Returns:   0 on success; 1 (with nothing on stdout) when nvidia-smi fails
get_gpu_busid() {
  local gpu_idx=$1
  local bus_id

  # `nvidia-smi -L` lines have the form "GPU <n>: <name> (UUID: ...)" and do
  # not carry the bus id, so ask for the field directly. On failure the tool's
  # stdout is discarded so that an error message is never mistaken for an id.
  bus_id=$(nvidia-smi --query-gpu=pci.bus_id --format=csv,noheader -i "${gpu_idx}") \
    || return 1
  printf '%s\n' "${bus_id}"
}
# 3. Print the board serial number of one GPU.
# Arguments: $1 - GPU index as understood by `nvidia-smi -i`
# Outputs:   the serial number on stdout, exactly as reported by nvidia-smi
# Returns:   0 on success; 1 (with nothing on stdout) when nvidia-smi fails
get_gpu_sn() {
  local gpu_idx=$1
  local serial

  # In `nvidia-smi -q` the value sits on the same line as the "Serial Number"
  # label, so scraping the following line yields the wrong field. Query the
  # serial directly instead.
  serial=$(nvidia-smi --query-gpu=serial --format=csv,noheader -i "${gpu_idx}") \
    || return 1
  printf '%s\n' "${serial}"
}
# 4. Run one all_reduce round on the given set of GPUs.
# Globals:   MPI_ARGS, TEST_BEGIN, TEST_END, TEST_FACTOR, GPU_PER_PROC,
#            LOG_FILE (all read)
# Arguments: the GPU indices taking part in this round, one per argument
# Outputs:   a banner plus the combined stdout/stderr of mpirun on stdout;
#            the mpirun output is also appended to LOG_FILE
# Returns:   0 when the round succeeds; otherwise the status of the rightmost
#            failing stage of the mpirun | tee pipeline
run_all_reduce_test() {
  local -a test_gpus=("$@")
  local gpu_count=${#test_gpus[@]}
  local gpu_str
  local rc=0

  # Join the indices with commas for CUDA_VISIBLE_DEVICES.
  gpu_str=$(IFS=,; echo "${test_gpus[*]}")

  echo -e "\n=================================================="
  echo "📌 开始测试当前测试GPU组合 = [${gpu_str}](共${gpu_count}张卡)"
  echo "=================================================="

  # The pipeline ends in tee, whose status would otherwise hide an mpirun
  # failure. Enable pipefail inside a subshell so the result does not depend
  # on the caller's shell options.
  (
    set -o pipefail
    # MPI_ARGS is deliberately left unquoted so it can hold several arguments.
    # shellcheck disable=SC2086
    NCCL_DEBUG=INFO CUDA_VISIBLE_DEVICES="${gpu_str}" \
      mpirun ${MPI_ARGS} -n "${gpu_count}" all_reduce_perf \
      -b "${TEST_BEGIN}" -e "${TEST_END}" -f "${TEST_FACTOR}" -g "${GPU_PER_PROC}" \
      2>&1 | tee -a "${LOG_FILE}"
  ) || rc=$?

  return "${rc}"
}
# ==================== Main flow ====================
# Incrementally probe every GPU in ALL_GPUS and print a summary.
#
# Each round runs all_reduce on the GPUs that have passed so far plus one
# candidate. The candidate joins AVAILABLE_GPUS when the round succeeds and is
# recorded in FAULTY_GPUS (as index:BUS-ID:serial) when it fails.
#
# Globals:   ALL_GPUS, LOG_FILE, TEST_BEGIN, TEST_END, TEST_FACTOR (read)
#            AVAILABLE_GPUS, FAULTY_GPUS, CURRENT_TEST_INDEX (written)
# Arguments: none
# Outputs:   progress and the final summary on stdout; per-GPU verdicts are
#            appended to LOG_FILE
main() {
  local current_gpu busid sn passed
  local faulty idx rest
  local -a temp_test_gpus=()

  # 1. Start a fresh log (any previous log is truncated).
  echo "📅 测试开始时间:$(date '+%Y-%m-%d %H:%M:%S')" > "${LOG_FILE}"
  echo "🔧 测试配置BEGIN=${TEST_BEGIN}, END=${TEST_END}, FACTOR=${TEST_FACTOR}" >> "${LOG_FILE}"

  # 2. Make sure the required tools exist.
  check_dependencies

  # 3. Add the cards from ALL_GPUS one at a time.
  while (( CURRENT_TEST_INDEX < ${#ALL_GPUS[@]} )); do
    current_gpu=${ALL_GPUS[CURRENT_TEST_INDEX]}

    # Candidate set = cards that already passed + the card under test.
    # The ${arr[@]+...} form keeps `set -u` from aborting on the still-empty
    # array with bash releases older than 4.4.
    temp_test_gpus=(${AVAILABLE_GPUS[@]+"${AVAILABLE_GPUS[@]}"} "${current_gpu}")

    if run_all_reduce_test "${temp_test_gpus[@]}"; then
      passed=1
    else
      passed=0
    fi

    # Identification is best effort: a failing lookup must not abort the
    # whole run under `set -e`, so its status is deliberately ignored.
    busid=$(get_gpu_busid "${current_gpu}") || true
    sn=$(get_gpu_sn "${current_gpu}") || true

    if (( passed )); then
      # Success: keep the card in the available list.
      AVAILABLE_GPUS=("${temp_test_gpus[@]}")
      echo -e "✅ 测试成功GPU ${current_gpu} 加入可用列表"
      echo "✅ GPU ${current_gpu} 测试成功BUS-ID: ${busid}, SN: ${sn}" >> "${LOG_FILE}"
    else
      # Failure: record the card as faulty and leave it out.
      FAULTY_GPUS+=("${current_gpu}:${busid}:${sn}")
      echo -e "❌ 测试失败GPU ${current_gpu} 为故障卡,已跳过"
      echo "❌ GPU ${current_gpu} 测试失败BUS-ID: ${busid}, SN: ${sn}" >> "${LOG_FILE}"
    fi

    # Move on to the next card.
    CURRENT_TEST_INDEX=$((CURRENT_TEST_INDEX + 1))
  done

  # 4. Final summary.
  echo -e "\n=================================================="
  echo "📊 测试完成!最终汇总结果"
  echo "=================================================="
  echo "✅ 可用GPU数量${#AVAILABLE_GPUS[@]}"
  echo "✅ 可用GPU列表[${AVAILABLE_GPUS[*]-}]"
  echo -e "\n❌ 故障GPU数量${#FAULTY_GPUS[@]}"
  if (( ${#FAULTY_GPUS[@]} > 0 )); then
    echo "❌ 故障GPU详情"
    echo "--------------------------------------------------"
    echo -e "序号\tBUS-ID\t\tSN号"
    echo "--------------------------------------------------"
    for faulty in "${FAULTY_GPUS[@]}"; do
      # A PCI bus id contains colons itself (e.g. 00000000:3B:00.0), so the
      # record cannot be split on every ':'. Take the index up to the first
      # colon and the serial after the last one; the bus id is what remains.
      idx=${faulty%%:*}
      rest=${faulty#*:}
      sn=${rest##*:}
      busid=${rest%:*}
      printf '%s\t%s\t%s\n' "${idx}" "${busid}" "${sn}"
    done
  else
    echo "❌ 无故障GPU所有卡测试通过"
  fi
  echo -e "\n📄 详细测试日志已保存到:${LOG_FILE}"
  echo "📅 测试结束时间:$(date '+%Y-%m-%d %H:%M:%S')"
}
# Start the main flow.
main
# Reference only: manual command for a single round across all eight GPUs.
# Both variables are written as command prefixes so that mpirun receives them.
#NCCL_DEBUG=INFO CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 mpirun -n 8 --allow-run-as-root all_reduce_perf -b 1MB -e 1GB -f 2 -g 1