#!/bin/bash
#
# Launches the all_reduce_perf benchmark through mpirun on the hosts passed via
# --host. The total process count is derived as (number of hosts) x 8.

# Strict mode: abort on a failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

#######################################
# Print the help text and terminate the script.
# Arguments:
#   $1 - exit status to terminate with (optional, defaults to 1 so that
#        existing argument-less calls keep signalling an error).
# Outputs:
#   The help text, on stdout when the status is 0 (help was requested) and
#   on stderr otherwise (the invocation was rejected).
# Returns:
#   Never returns; exits the shell with the chosen status.
#######################################
usage() {
  local status=${1:-1}
  local script=${0##*/}
  local fd=2

  if [[ "$status" == 0 ]]; then
    fd=1
  fi

  # The here-document body is user-facing text and is kept at column 0.
  cat >&"$fd" << EOF
Usage: ${script} --host=<host_list>

Description:
自动执行MPI all_reduce性能测试,支持指定主机列表,自动计算进程数(np = 服务器数量 × 8)

Required Options:
--host=<host_list> 指定主机列表,格式为"ip1[:count],ip2[:count],..."(例如:10.0.21.131:8,10.0.21.132:8)
注:每个主机默认对应8个进程,count字段不影响np计算,仅用于MPI主机分配

Example:
bash ${script} --host=10.0.21.131:8,10.0.21.132:8
EOF
  exit "$status"
}

# Parse command-line arguments.
HOST_LIST=""

#######################################
# Scan the given arguments for the options this script understands.
# Globals:
#   HOST_LIST (written) - text following "--host="; the last occurrence wins.
# Arguments:
#   The script's command-line arguments.
# Outputs:
#   An error message on stderr for an unrecognized argument.
#######################################
parse_args() {
  local arg

  # The loop iterates over a snapshot of the argument list, so no `shift`
  # is needed (and it would have no effect on the iteration).
  for arg in "$@"; do
    case "$arg" in
      --host=*)
        # Strip everything up to and including the first '='.
        HOST_LIST="${arg#*=}"
        ;;
      -h | --help)
        # Show the help text; usage terminates the script.
        usage 0
        ;;
      *)
        echo "错误:未知参数 '$arg'" >&2
        usage
        ;;
    esac
  done
}

parse_args "$@"

#######################################
# Split a comma-separated host list and count its entries.
# A list that is empty, or that contains an empty entry (leading, trailing
# or doubled comma), is rejected so that blank entries are never counted as
# servers.
# Globals:
#   HOST_ARRAY   (written) - one element per host entry
#   SERVER_COUNT (written) - number of entries; 0 when the list is rejected
# Arguments:
#   $1 - host list, e.g. "ip1[:count],ip2[:count]"
# Returns:
#   0 when at least one entry was found and none is empty, non-zero otherwise.
#######################################
split_host_list() {
  local list=${1-}

  HOST_ARRAY=()
  SERVER_COUNT=0

  case "$list" in
    '' | ,* | *, | *,,*) return 1 ;;
  esac

  IFS=',' read -ra HOST_ARRAY <<< "$list" || return 1
  SERVER_COUNT=${#HOST_ARRAY[@]}
  ((SERVER_COUNT > 0))
}

# Validate the arguments, then derive the server count from the host list.
if [[ -z "${HOST_LIST:-}" ]]; then
  echo "错误:必须通过--host指定主机列表" >&2
  usage
elif ! split_host_list "$HOST_LIST"; then
  echo "错误:主机列表为空,请检查--host参数格式" >&2
  usage
fi

# Total process count: a fixed number of processes for every listed server.
# The per-server figure is named once so the arithmetic and the message below
# cannot drift apart.
readonly PROCS_PER_NODE=8
NP=$((SERVER_COUNT * PROCS_PER_NODE))

#######################################
# Print the effective run configuration so it can be checked before launch.
# Globals:
#   HOST_LIST, SERVER_COUNT, NP, PROCS_PER_NODE (read)
# Outputs:
#   A banner on stdout followed by an empty line.
#######################################
print_run_config() {
  echo "======================================"
  echo "MPI运行配置信息"
  echo "--------------------------------------"
  echo "主机列表: $HOST_LIST"
  echo "服务器数量: $SERVER_COUNT 台"
  echo "总进程数(np): $NP (每台服务器${PROCS_PER_NODE}个进程)"
  echo "======================================"
  echo
}

print_run_config

# Launch the benchmark through mpirun.
# The arguments are collected in an array so that each one can be annotated
# and the exact command can be echoed before it runs.
mpirun_args=(
  -np "$NP"
  # NOTE(review): --bind-to is given twice with conflicting values ("numa"
  # here, "none" further down). Both are kept in their original order because
  # the intended binding policy cannot be determined from this script; confirm
  # which one is wanted and delete the other.
  --bind-to numa
  # 8 processes per node; must stay in step with the per-server factor used
  # to compute NP.
  --map-by ppr:8:node
  --host "$HOST_LIST"
  --allow-run-as-root
  -x LD_LIBRARY_PATH
  --mca pml ob1
  --mca coll '^hcoll'
  --mca btl tcp,self
  --mca btl_tcp_if_include bond0
  --mca oob_tcp_if_include bond0
  --bind-to none
  # The value deliberately starts with '=': NCCL documents a leading '=' in
  # NCCL_IB_HCA as an exact-match prefix, so "==" here is not a typo.
  -x NCCL_IB_HCA==mlx5_bond_2,mlx5_bond_3,mlx5_bond_4,mlx5_bond_5,mlx5_bond_6,mlx5_bond_7,mlx5_bond_8,mlx5_bond_9
  -x NCCL_IB_QPS_PER_CONNECTION=8
  -x NCCL_MIN_NCHANNELS=32
  -x NCCL_SOCKET_IFNAME=bond0
  all_reduce_perf
  --minbytes=16G
  --maxbytes=32G
  --stepfactor=2
  --datatype=float
  --iters=100
)

echo "开始执行MPI测试..."
echo "命令: mpirun ... (详细参数见下方)"
# Print the full, shell-quoted command announced by the line above.
printf '%q ' mpirun "${mpirun_args[@]}"
printf '\n'
echo

# Report the outcome. mpirun is run as the `if` condition because, with
# `set -e` active, a failing mpirun as a plain statement would abort the
# script before any later `$?` check could run.
if mpirun "${mpirun_args[@]}"; then
  echo "======================================"
  echo "MPI测试执行完成,结果已输出"
else
  echo "======================================"
  echo "错误:MPI测试执行失败,请检查日志" >&2
  exit 1
fi