ansible-devops/scripts/gpu_bench_auto.sh

455 lines
18 KiB
Bash
Raw Normal View History

2025-11-05 13:57:54 +08:00
#!/bin/bash
set -euo pipefail
# ======================================================
# Core configuration (kept in one place so it is easy to change)
# ======================================================
# Package lookup order: 1. LOCAL_PACKAGE (placed manually by the user),
# 2. DOWNLOAD_CACHE (left by a previous download), 3. REMOTE_PACKAGE_URL.
LOCAL_PACKAGE="/opt/gpu-test.tgz"   # package placed manually by the user
DOWNLOAD_CACHE="/tmp/gpu-test.tgz"  # download cache path
REMOTE_PACKAGE_URL="http://116.205.97.109/scripts/gpu-test.tgz" # remote download URL
TOOL_DIR="/test/"                   # dedicated tool directory (never point this at a system directory)
DEFAULT_LOG_DIR="/opt/gpu-test-logs" # default log directory
TGZ_SUB_DIR="gpu-test"              # sub-directory the tgz package creates when extracted
# Directories that must never be used as TOOL_DIR (its contents get wiped with rm -rf).
DANGER_DIRS=("/" "/opt" "/usr" "/bin" "/sbin" "/lib" "/lib64" "/etc" "/home" "/root")
# Supported tests. Each value has the form
#   "<command> ; <success condition> ; <failure description>"
# and {LOG_FILE} in the success condition is replaced with the test's log path.
# The continuation lines inside the quotes are kept flush-left on purpose:
# any leading whitespace would become part of the stored value.
declare -rA TEST_CONFIGS=(
  ["bandwidthTest"]="\
./bandwidthTest \
; grep -q 'Result = PASS' {LOG_FILE} \
; 显存带宽测试未通过(日志无 PASS 标记)"
  ["deviceQuery"]="\
./deviceQuery \
; [ \$? -eq 0 ] \
; GPU设备查询失败返回码非0"
  ["gpu_burn"]="\
./gpu_burn 600 \
; [ \$? -eq 0 ] && ! grep -qE 'ERROR|FATAL|No clients are alive|Couldn'\''t init' {LOG_FILE} \
; GPU稳定性测试失败崩溃/无存活客户端/初始化失败)"
  ["p2pBandwidthLatencyTest"]="\
./p2pBandwidthLatencyTest \
; [ \$? -eq 0 ] \
; GPU P2P带宽测试失败返回码非0"
  # batchCUBLAS entry (command: ./batchCUBLAS --device=all) is currently disabled.
  # NOTE(review): it appears to have been commented out in a later revision;
  # re-enable by uncommenting the four lines below.
  #["batchCUBLAS"]="\
  # ./batchCUBLAS --device=all \
  # ; [ \$? -eq 0 ] && ! grep -qE 'ERROR|FATAL|Failed|Assertion failed' {LOG_FILE} \
  # ; CUBLAS批量计算测试失败返回码非0或日志含错误信息"
)
# System tools the script invokes; a missing one is reported before any work starts.
REQUIRED_TOOLS=("wget" "tar" "grep" "sed" "mkdir" "chmod" "mv" "realpath" "cut" "tr" "awk" "du")
# Files that must exist after extraction: one binary per configured test plus
# compare.ptx (needed by gpu_burn).
REQUIRED_TOOL_FILES=("${!TEST_CONFIGS[@]}" "compare.ptx")
# ======================================================
# Logging helpers: one colour per severity so output is easy to scan.
# Each helper takes the message as $1; backslash escapes in the message
# (e.g. \n) are interpreted, exactly as `echo -e` would.
# ======================================================
# Green success line, preceded by a blank line.
print_success() {
  printf '\n\033[32m✅ %b\033[0m\n' "$1"
}
# Red error line, preceded by a blank line.
print_error() {
  printf '\n\033[31m❌ %b\033[0m\n' "$1"
}
# Blue informational line.
print_info() {
  printf '\033[34m %b\033[0m\n' "$1"
}
# Yellow warning line.
print_warn() {
  printf '\033[33m⚠ %b\033[0m\n' "$1"
}
# ======================================================
# Safety check: refuse to run when TOOL_DIR is a system-critical directory.
# Globals:  TOOL_DIR (read), DANGER_DIRS (read)
# Outputs:  progress / error messages via the print_* helpers
# Exits:    1 when TOOL_DIR is empty, cannot be resolved, or equals / lies
#           inside any entry of DANGER_DIRS; returns normally otherwise
# ======================================================
check_danger_dir() {
  print_info "开始校验工具目录安全性..."
  local tool_dir_real=""
  local danger
  # Fail closed: declaring and assigning separately keeps realpath's exit
  # status visible. An empty or unresolvable path must never pass, because
  # later steps run `rm -rf "$TOOL_DIR"/*`.
  if [[ -z "${TOOL_DIR:-}" ]] \
    || ! tool_dir_real=$(realpath -m -- "$TOOL_DIR" 2>/dev/null) \
    || [[ -z "$tool_dir_real" ]]; then
    print_error "无法解析工具目录:${TOOL_DIR:-}(为空或 realpath 不可用),为安全起见终止执行"
    exit 1
  fi
  # Reject the dangerous directory itself and anything beneath it.
  for danger in "${DANGER_DIRS[@]}"; do
    if [[ "$tool_dir_real" == "$danger" || "$tool_dir_real" == "$danger/"* ]]; then
      print_error "工具目录 $TOOL_DIR 是系统关键目录!禁止使用,避免误删文件"
      # Only suggest locations that this very check accepts.
      print_info "建议工具目录:/test、/tmp/gpu-test-tools 等独立目录"
      exit 1
    fi
  done
  print_success "工具目录安全性校验通过(非系统关键目录)"
}
# ======================================================
# Step 1: make sure every tool in REQUIRED_TOOLS is on PATH.
# Globals:  REQUIRED_TOOLS (read)
# Outputs:  the list of missing tools plus install hints
# Exits:    1 when at least one tool is missing; returns normally otherwise
# ======================================================
check_system_deps() {
  print_info "开始检查系统依赖工具..."
  local -a missing_tools=()
  local tool # kept local so the caller's variables are not clobbered
  for tool in "${REQUIRED_TOOLS[@]}"; do
    if ! command -v "$tool" &> /dev/null; then
      missing_tools+=("$tool")
    fi
  done
  if (( ${#missing_tools[@]} != 0 )); then
    print_error "缺失必要工具:${missing_tools[*]}"
    print_info "CentOS/RHEL 安装yum install -y ${missing_tools[*]}"
    print_info "Debian/Ubuntu 安装apt install -y ${missing_tools[*]}"
    exit 1
  fi
  print_success "所有系统依赖工具均已安装"
}
# ======================================================
# Step 2: parse command-line arguments (--tests, --logs, --help/-h).
# Globals:   TEST_CONFIGS, DEFAULT_LOG_DIR (read);
#            SELECTED_TESTS, LOG_DIR (written, readonly)
# Arguments: the script's command-line arguments
# Exits:     via print_help on --help, an unknown argument or a missing
#            --tests; 1 on an unsupported test name, an empty test list, or
#            a log directory that cannot be created / written
# ======================================================
parse_arguments() {
  print_info "开始解析命令行参数..."
  local target_tests=""
  local log_dir="${DEFAULT_LOG_DIR}"
  local arg raw_test trimmed_test
  local -a selected_tests=()
  local -a valid_tests=()
  # Collect the options.
  for arg in "$@"; do
    case "$arg" in
      --tests=*) target_tests="${arg#--tests=}" ;;
      --logs=*) log_dir="${arg#--logs=}" ;;
      # An explicit help request is not an error, so ask for exit status 0.
      --help | -h) print_help 0 ;;
      *)
        print_error "未知参数:$arg"
        print_help
        ;;
    esac
  done
  # --tests is mandatory.
  if [[ -z "$target_tests" ]]; then
    print_error "缺少必选参数 --tests请指定压测项"
    print_help
  fi
  # Split the comma-separated list and validate every entry.
  IFS=',' read -r -a selected_tests <<< "$target_tests"
  for raw_test in "${selected_tests[@]}"; do
    # Trim surrounding whitespace with parameter expansion (no echo|sed, so
    # values such as "-n" are handled literally).
    trimmed_test="${raw_test#"${raw_test%%[![:space:]]*}"}"
    trimmed_test="${trimmed_test%"${trimmed_test##*[![:space:]]}"}"
    if [[ -z "$trimmed_test" ]]; then
      print_warn "忽略空压测项(参数中多余逗号)"
      continue
    fi
    # ${arr[key]+set} expands the user-supplied key exactly once; the previous
    # `[[ -v "TEST_CONFIGS[$key]" ]]` form lets Bash re-evaluate the subscript.
    if [[ -z "${TEST_CONFIGS[$trimmed_test]+set}" ]]; then
      print_error "不支持的压测项:$trimmed_test"
      print_info "支持的压测项:$(echo "${!TEST_CONFIGS[@]}" | tr ' ' ', ')"
      exit 1
    fi
    valid_tests+=("$trimmed_test")
  done
  # At least one valid test is required.
  if (( ${#valid_tests[@]} == 0 )); then
    print_error "未指定有效压测项(仅输入了逗号?)"
    exit 1
  fi
  # Create the log directory when it does not exist yet.
  if [[ ! -d "$log_dir" ]]; then
    print_info "日志目录 $log_dir 不存在,开始创建..."
    if ! mkdir -p "$log_dir" 2>/dev/null; then
      print_error "创建日志目录失败:权限不足或路径非法"
      exit 1
    fi
    print_success "日志目录创建成功:$log_dir"
  fi
  if [[ ! -w "$log_dir" ]]; then
    print_error "日志目录 $log_dir 无写入权限"
    exit 1
  fi
  # Publish the results as read-only globals so later steps cannot alter them.
  readonly SELECTED_TESTS=("${valid_tests[@]}")
  readonly LOG_DIR="$log_dir"
  print_success "参数解析完成"
  print_info "目标压测项:${SELECTED_TESTS[*]}"
  print_info "日志存储目录:${LOG_DIR}"
}
# ======================================================
# Step 3: locate the test package (local file, then download cache, then
# remote download), extract it into TOOL_DIR and flatten the tgz sub-directory.
# Globals:  LOCAL_PACKAGE, DOWNLOAD_CACHE, REMOTE_PACKAGE_URL, TOOL_DIR,
#           TGZ_SUB_DIR, REQUIRED_TOOL_FILES, TEST_CONFIGS (all read)
# Effects:  may write DOWNLOAD_CACHE; wipes and repopulates TOOL_DIR
# Exits:    1 on download, extraction, move or completeness failures
# NOTE(review): the package is fetched over plain HTTP and cached under a
# predictable /tmp path with no checksum verification — confirm this is
# acceptable for the environment.
# ======================================================
process_test_package() {
  print_info "开始处理压测包..."
  local package_path=""
  # Download into a side file and rename only on success, so an interrupted
  # download can never be mistaken for a valid cache on the next run.
  local download_tmp="${DOWNLOAD_CACHE}.part"
  local sub_dir_path tool_file
  # Priority 1: package placed manually by the user.
  if [[ -f "$LOCAL_PACKAGE" && -s "$LOCAL_PACKAGE" ]]; then
    package_path="$LOCAL_PACKAGE"
    print_info "找到本地压测包:$package_path(优先使用)"
  # Priority 2: cache left by an earlier download.
  elif [[ -f "$DOWNLOAD_CACHE" && -s "$DOWNLOAD_CACHE" ]]; then
    package_path="$DOWNLOAD_CACHE"
    print_info "找到下载缓存:$package_path(复用,不重复下载)"
  # Priority 3: remote download.
  else
    print_info "本地无压测包,开始从远程下载..."
    print_info "下载地址:$REMOTE_PACKAGE_URL"
    print_info "下载缓存:$DOWNLOAD_CACHE"
    rm -f -- "$download_tmp" # drop any stale partial file first
    if ! wget -c --no-check-certificate -O "$download_tmp" "$REMOTE_PACKAGE_URL" 2>&1; then
      print_error "压测包下载失败(网络问题或地址无效)"
      print_info "手动验证地址curl -I $REMOTE_PACKAGE_URL"
      rm -f -- "$download_tmp" # remove the incomplete file
      exit 1
    fi
    if [[ ! -s "$download_tmp" ]]; then
      print_error "下载的压测包为空文件(地址错误?)"
      rm -f -- "$download_tmp"
      exit 1
    fi
    if ! mv -f -- "$download_tmp" "$DOWNLOAD_CACHE"; then
      print_error "无法写入下载缓存:$DOWNLOAD_CACHE"
      rm -f -- "$download_tmp"
      exit 1
    fi
    package_path="$DOWNLOAD_CACHE"
    print_success "压测包下载成功:$package_path(大小:$(du -sh "$package_path" | awk '{print $1}')"
  fi
  # Extract. Only the directory's contents are removed, never the directory.
  print_info "开始解压压测包到:$TOOL_DIR"
  mkdir -p "$TOOL_DIR" 2>/dev/null || { print_error "创建工具目录失败:权限不足"; exit 1; }
  # ${TOOL_DIR:?} aborts instead of expanding to "/*" should TOOL_DIR be empty.
  if [[ $(ls -A "$TOOL_DIR" 2>/dev/null) ]]; then
    print_info "清理工具目录内旧文件:$TOOL_DIR"
    rm -rf "${TOOL_DIR:?}"/* 2>/dev/null || { print_error "清理旧文件失败:权限不足"; exit 1; }
  fi
  if ! tar -zxf "$package_path" -C "$TOOL_DIR" 2>&1; then
    print_error "压测包解压失败(文件损坏或格式错误)"
    rm -rf "${TOOL_DIR:?}"/* # remove the partially extracted files
    # A corrupt cache would otherwise be reused forever; drop it so the next
    # run downloads a fresh copy. A user-supplied LOCAL_PACKAGE is left alone.
    if [[ "$package_path" == "$DOWNLOAD_CACHE" ]]; then
      print_warn "已删除损坏的下载缓存:$DOWNLOAD_CACHE(下次运行将重新下载)"
      rm -f -- "$DOWNLOAD_CACHE"
    fi
    exit 1
  fi
  # The tgz unpacks into TGZ_SUB_DIR/; move its contents up into TOOL_DIR.
  sub_dir_path="$TOOL_DIR/$TGZ_SUB_DIR"
  if [[ -d "$sub_dir_path" ]]; then
    print_info "识别到tgz子目录$sub_dir_path,开始移动工具到根目录"
    mv -f "$sub_dir_path"/* "$TOOL_DIR/" 2>/dev/null || {
      print_error "移动工具失败(子目录内无工具?)"
      rm -rf "${TOOL_DIR:?}"/*
      exit 1
    }
    rm -rf "$sub_dir_path" # remove the now-empty sub-directory
  else
    print_warn "未找到tgz子目录 $sub_dir_path,默认工具已在根目录"
  fi
  # Every file listed in REQUIRED_TOOL_FILES must be present.
  for tool_file in "${REQUIRED_TOOL_FILES[@]}"; do
    if [[ ! -f "$TOOL_DIR/$tool_file" ]]; then
      print_error "解压后未找到必要文件:$tool_file(压测包不完整)"
      rm -rf "${TOOL_DIR:?}"/*
      exit 1
    fi
  done
  # Make the extracted tools executable.
  chmod +x "$TOOL_DIR/"* 2>/dev/null || print_warn "部分工具已具备执行权限"
  print_success "压测包处理完成!工具列表:$(echo "${!TEST_CONFIGS[@]}" | tr ' ' ', ') + compare.ptx"
}
# ======================================================
# Step 4: verify that each selected test binary exists and is executable,
# and that gpu_burn's companion file compare.ptx is present and non-empty.
# Globals:  SELECTED_TESTS, TOOL_DIR (read)
# Effects:  adds the execute bit where missing; wipes TOOL_DIR's contents
#           when a tool or dependency is missing
# Exits:    1 on a missing tool, a failed chmod, or a missing dependency
# ======================================================
verify_tools() {
  print_info "开始验证压测工具及依赖..."
  local test tool_path ptx_path # local so the caller's loop variable survives
  for test in "${SELECTED_TESTS[@]}"; do
    tool_path="$TOOL_DIR/$test"
    # The tool must exist.
    if [[ ! -f "$tool_path" ]]; then
      print_error "压测工具不存在:$tool_path"
      # ${TOOL_DIR:?} aborts rather than expanding to "/*" if TOOL_DIR is empty.
      rm -rf "${TOOL_DIR:?}"/*
      exit 1
    fi
    # The tool must be executable.
    if [[ ! -x "$tool_path" ]]; then
      print_info "给工具添加执行权限:$test"
      chmod +x "$tool_path" 2>/dev/null || { print_error "添加权限失败"; exit 1; }
    fi
    # gpu_burn additionally needs a non-empty compare.ptx next to it.
    if [[ "$test" == "gpu_burn" ]]; then
      ptx_path="$TOOL_DIR/compare.ptx"
      if [[ ! -f "$ptx_path" || ! -s "$ptx_path" ]]; then
        print_error "gpu_burn 缺失依赖文件:$ptx_path请确保tgz包中包含"
        rm -rf "${TOOL_DIR:?}"/*
        exit 1
      fi
      print_info "gpu_burn 依赖文件已验证:$ptx_path"
    fi
  done
  print_success "所有压测工具及依赖均有效"
}
# ======================================================
# Step 5: run one test and check its result.
# ======================================================
# Print $1 with leading and trailing whitespace removed.
_gpu_bench_trim() {
  local text="$1"
  text="${text#"${text%%[![:space:]]*}"}"
  text="${text%"${text##*[![:space:]]}"}"
  printf '%s' "$text"
}

#######################################
# Run the test named $1 from inside TOOL_DIR, capture its output in
# LOG_DIR/<name>.log, then evaluate the configured success condition.
# Globals:   TEST_CONFIGS, LOG_DIR, TOOL_DIR (read)
# Arguments: $1 - test name (a key of TEST_CONFIGS)
# Returns:   0 when the command exits 0 and the success condition holds,
#            1 otherwise
#######################################
run_single_test() {
  local test_name="$1"
  local config="${TEST_CONFIGS[$test_name]}"
  local log_file="${LOG_DIR}/${test_name}.log"
  local raw_cmd="" raw_cond="" raw_desc=""
  local exec_cmd success_cond fail_desc quoted_log
  # A config is "<command> ; <success condition> ; <failure description>".
  IFS=';' read -r raw_cmd raw_cond raw_desc <<< "$config"
  exec_cmd=$(_gpu_bench_trim "$raw_cmd")
  success_cond=$(_gpu_bench_trim "$raw_cond")
  fail_desc=$(_gpu_bench_trim "$raw_desc")
  # The condition is eval'd, so insert the log path shell-quoted; this keeps
  # paths containing spaces or other special characters intact.
  printf -v quoted_log '%q' "$log_file"
  success_cond=${success_cond//'{LOG_FILE}'/"$quoted_log"}
  print_info "\n=================================================="
  print_info "开始执行压测项:$test_name"
  print_info "执行命令cd $TOOL_DIR && $exec_cmd"
  print_info "日志路径:$log_file"
  print_info "=================================================="
  if ! ( cd "$TOOL_DIR" ) 2>/dev/null; then
    print_error "无法进入工具目录:$TOOL_DIR"
    return 1
  fi
  # Run inside a subshell so the caller's working directory is untouched:
  # a relative LOG_DIR keeps resolving against the directory the script was
  # started from, both for the redirection below and for later steps.
  # exec_cmd is intentionally left unquoted so it splits into command + args.
  # shellcheck disable=SC2086
  if ! ( cd "$TOOL_DIR" && $exec_cmd ) > "$log_file" 2>&1; then
    print_error "压测项 $test_name 执行异常返回码非0"
    print_error "失败描述:$fail_desc"
    print_info "详细日志:$log_file"
    return 1
  fi
  # The command already exited 0 here, so a `[ $? -eq 0 ]` condition is
  # trivially true; grep-based conditions inspect the log file.
  print_info "正在校验 $test_name 压测结果..."
  if eval "$success_cond"; then
    print_success "$test_name 压测执行成功"
    return 0
  else
    print_error "$test_name 压测结果不达标"
    print_error "失败描述:$fail_desc"
    print_info "详细日志:$log_file"
    return 1
  fi
}
# ======================================================
# Step 6: print a per-test summary and overall statistics.
# Globals:  SELECTED_TESTS, LOG_DIR (read)
# Outputs:  one line per selected test plus totals on stdout
# A test counts as successful when its log file exists and either contains
# one of the markers SUCCESS / "Result = PASS" / "completed successfully",
# or (batchCUBLAS only) contains none of ERROR / FATAL / Failed.
# A test without a log file (e.g. never run because an earlier test failed)
# counts as failed.
# ======================================================
print_summary() {
  print_info "\n\n======================================================"
  echo -e " 压测结果汇总 "
  print_info "======================================================"
  local total=${#SELECTED_TESTS[@]}
  local success=0
  local failed=0
  local test log_file
  for test in "${SELECTED_TESTS[@]}"; do
    # Use the loop variable: `test_name` is not defined in this function and
    # aborts the script under `set -u`.
    log_file="${LOG_DIR}/${test}.log"
    if [[ -f "$log_file" ]] && {
      grep -qE 'SUCCESS|Result = PASS|completed successfully' "$log_file" 2>/dev/null \
        || { [[ "$test" == "batchCUBLAS" ]] && ! grep -qE 'ERROR|FATAL|Failed' "$log_file" 2>/dev/null; }
    }; then
      echo -e "✅ \033[32m${test}:成功\033[0m → 日志:$log_file"
      success=$((success + 1))
    else
      echo -e "❌ \033[31m${test}:失败\033[0m → 日志:$log_file"
      failed=$((failed + 1))
    fi
  done
  print_info "\n📊 统计信息"
  echo -e "总压测项:$total"
  echo -e "成功项数:\033[32m$success 项\033[0m"
  echo -e "失败项数:\033[31m$failed 项\033[0m"
  print_info "======================================================\n"
  if [[ $failed -eq 0 ]]; then
    print_success "所有压测项均执行成功!"
  else
    print_error "共有 $failed 项压测失败,请查看日志排查问题"
  fi
}
# ======================================================
# Step 7: print usage information and exit.
# Globals:   TEST_CONFIGS, TGZ_SUB_DIR, DEFAULT_LOG_DIR, LOCAL_PACKAGE,
#            DOWNLOAD_CACHE, REMOTE_PACKAGE_URL, TOOL_DIR (read)
# Arguments: $1 - optional exit status (default 1, the historical behaviour;
#                 pass 0 for an explicit --help request)
# Outputs:   the help text on stdout
# The supported-test list and the "run everything" example are generated from
# TEST_CONFIGS, and the batchCUBLAS description is printed only while that
# test is configured, so the help never advertises an unavailable test.
# ======================================================
print_help() {
  local exit_code="${1:-1}"
  local supported
  local batch_line=""
  # Comma-separated list of the configured test names.
  supported=$(IFS=','; printf '%s' "${!TEST_CONFIGS[*]}")
  if [[ -n "${TEST_CONFIGS[batchCUBLAS]+set}" ]]; then
    batch_line=$'\n- batchCUBLASCUBLAS批量计算测试--device=all校验无错误日志'
  fi
  cat << EOF
用法:$(basename "$0") --tests=压测项1,压测项2,... [--logs=日志目录] [--help]
核心功能:自动处理压测包(优先本地→无则下载)→ 解压→验证→执行压测,失败即终止
适配tgz包结构解压后生成 $TGZ_SUB_DIR/ 子目录,自动移动工具到根目录
必选参数:
--tests=TEST1,TEST2,... 指定压测项(多个用逗号分隔,无空格)
支持的压测项:${supported}
可选参数:
--logs=LOG_DIR 日志存储目录(默认:$DEFAULT_LOG_DIR,不存在则自动创建)
--help/-h 查看帮助信息
压测包优先级:
1. 本地包:$LOCAL_PACKAGE(用户手动放置,优先使用)
2. 缓存包:$DOWNLOAD_CACHE(之前下载过,复用)
3. 远程包:$REMOTE_PACKAGE_URL(无本地包时自动下载)
各压测项说明:
- bandwidthTest显存带宽测试校验日志 PASS 标记)
- deviceQueryGPU设备信息查询校验返回码0
- gpu_burnGPU稳定性测试600秒校验无崩溃日志
- p2pBandwidthLatencyTestGPU P2P带宽延迟测试校验返回码0${batch_line}
安全提示:
❗ 工具目录默认:$TOOL_DIR(独立目录,避免误删系统文件)
❗ 禁止修改 TOOL_DIR 为 /opt、/usr 等系统关键目录!
示例:
1. 执行 deviceQuery + gpu_burn 测试(自定义日志目录)
$(basename "$0") --tests=deviceQuery,gpu_burn --logs=/tmp/gpu-logs
2. 执行所有压测项(使用默认日志目录)
$(basename "$0") --tests=${supported}
3. 查看帮助
$(basename "$0") -h
EOF
  exit "$exit_code"
}
# ======================================================
# Entry point: runs every stage in a fixed, linear order.
# Arguments: the script's command-line arguments (handed to parse_arguments)
# Exits:     0 when all selected tests pass; 1 as soon as one test fails
#            (the summary is still printed first)
# ======================================================
main() {
  local rule="======================================================"
  local current
  printf '%b\n' "$rule"
  printf '%b\n' " GPU 压测脚本最终适配版v1.4 "
  printf '%b\n' "${rule}\n"
  # Refuse to run against a dangerous tool directory before anything else.
  check_danger_dir
  # 1. system dependencies
  check_system_deps
  # 2. command-line arguments
  parse_arguments "$@"
  # 3. test package (download / extract / flatten)
  process_test_package
  # 4. tools and their dependencies
  verify_tools
  # 5. run the tests, stopping at the first failure
  print_info "\n开始执行压测流程失败即终止..."
  for current in "${SELECTED_TESTS[@]}"; do
    if run_single_test "$current"; then
      continue
    fi
    print_summary
    exit 1
  done
  # 6. summary
  print_summary
  exit 0
}
# 启动主流程(接收所有命令行参数)
main "$@"