ansible-devops/scripts/mpi_allreduce_perf_test.sh

109 lines
2.8 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
set -eo pipefail # Exit immediately when a command fails; a pipeline fails if any of its stages fails
# usage
#   Print the command-line help text to stdout, then terminate the script
#   with exit status 1. Takes no arguments; the script name shown in the
#   text is derived from $0.
usage() {
  local script_name
  # A basename failure must not abort the help output, hence `|| true`.
  script_name=$(basename "$0") || true

  # One argument per output line; only the first and last lines interpolate.
  printf '%s\n' \
    "Usage: ${script_name} --host=<host_list>" \
    'Description:' \
    '自动执行MPI all_reduce性能测试支持指定主机列表自动计算进程数(np = 服务器数量 × 8)' \
    'Required Options:' \
    '--host=<host_list> 指定主机列表,格式为"ip1[:count],ip2[:count],..."例如10.0.21.131:8,10.0.21.132:8' \
    '每个主机默认对应8个进程count字段不影响np计算仅用于MPI主机分配' \
    'Example:' \
    "bash ${script_name} --host=10.0.21.131:8,10.0.21.132:8"

  exit 1
}
# Parse command-line arguments.
#
# HOST_LIST receives the text after "--host="; it stays empty when the
# option is not given.
HOST_LIST=""

# parse_args ARG...
#   Recognised arguments:
#     --host=<list>  store <list> in the global HOST_LIST (last one wins)
#     -h, --help     show the usage text (via usage, which ends the script)
#   Any other argument prints an error line followed by the usage text.
#   The loop iterates over a snapshot of its arguments, so no `shift` is
#   needed (the previous in-loop `shift` had no effect on iteration).
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --host=*)
        # Strip everything up to and including the first '='.
        HOST_LIST="${arg#*=}"
        ;;
      -h | --help)
        usage
        ;;
      *)
        echo "错误:未知参数 '$arg'"
        usage
        ;;
    esac
  done
}

parse_args "$@"
# Guard clause: a non-empty HOST_LIST is mandatory. When it is missing,
# print the error line and hand over to usage.
[[ -n "$HOST_LIST" ]] || {
  echo "错误:必须通过--host指定主机列表"
  usage
}
# Split the comma-separated host list, count the servers and derive the
# total number of MPI processes.

# split_host_list LIST
#   Fill the global array HOST_ARRAY with the non-empty, comma-separated
#   entries of LIST. Empty entries (as in "a,,b" or ",") are skipped so they
#   are not counted as servers.
split_host_list() {
  local -a raw_entries=()
  local entry
  HOST_ARRAY=()
  IFS=',' read -ra raw_entries <<< "$1"
  for entry in "${raw_entries[@]}"; do
    if [ -n "$entry" ]; then
      HOST_ARRAY+=("$entry")
    fi
  done
}

split_host_list "$HOST_LIST"
SERVER_COUNT=${#HOST_ARRAY[@]} # number of servers = number of non-empty entries
if [ "$SERVER_COUNT" -eq 0 ]; then
  echo "错误:主机列表为空,请检查--host参数格式"
  usage
fi

# Total process count: 8 processes per server.
NP=$((SERVER_COUNT * 8))
# Show the effective run configuration (host list, server count, total
# process count) so it can be checked, then announce the start of the MPI
# run. Each printf argument becomes one output line; "" yields a blank line.
printf '%s\n' \
  "======================================" \
  "MPI运行配置信息" \
  "--------------------------------------" \
  "主机列表: $HOST_LIST" \
  "服务器数量: $SERVER_COUNT" \
  "总进程数(np): $NP (每台服务器8个进程)" \
  "======================================" \
  "" \
  "开始执行MPI测试..." \
  "命令: mpirun ... (详细参数见下方)" \
  ""
# Launch all_reduce_perf through mpirun and report the outcome.
#
# The script runs under `set -e`, so a bare failing `mpirun` followed by a
# separate `$?` check would abort the script before the failure message is
# printed. The status is therefore captured with `|| MPI_RC=$?`, which keeps
# errexit from firing and lets the failure branch below actually run.
MPIRUN_ARGS=(
  -np "$NP"
  --bind-to numa
  --map-by ppr:8:node
  --host "$HOST_LIST"
  --allow-run-as-root
  -x LD_LIBRARY_PATH
  --mca pml ob1
  --mca coll ^hcoll
  --mca btl tcp,self
  --mca btl_tcp_if_include bond0
  --mca oob_tcp_if_include bond0
  # NOTE(review): --bind-to is passed twice (numa above, none here); which
  # one takes effect depends on the mpirun implementation - confirm and drop
  # the unwanted one.
  --bind-to none
  # NOTE(review): the value begins with '=' (NCCL_IB_HCA==...); presumably
  # this is NCCL's exact-match prefix rather than a typo - confirm.
  -x NCCL_IB_HCA==mlx5_bond_2,mlx5_bond_3,mlx5_bond_4,mlx5_bond_5,mlx5_bond_6,mlx5_bond_7,mlx5_bond_8,mlx5_bond_9
  -x NCCL_IB_QPS_PER_CONNECTION=8
  -x NCCL_MIN_NCHANNELS=32
  -x NCCL_SOCKET_IFNAME=bond0
  all_reduce_perf
  --minbytes=16G
  --maxbytes=32G
  --stepfactor=2
  --datatype=float
  --iters=100
)

MPI_RC=0
mpirun "${MPIRUN_ARGS[@]}" || MPI_RC=$?

# Report the result; on failure the script exits with mpirun's own status.
echo "======================================"
if [ "$MPI_RC" -eq 0 ]; then
  echo "MPI测试执行完成结果已输出"
else
  echo "错误MPI测试执行失败请检查日志" >&2
  exit "$MPI_RC"
fi