ansible-devops/scripts/gpu_bench_auto.sh

455 lines
18 KiB
Bash
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
set -euo pipefail
# ======================================================
# Core configuration (kept in one place so it is easy to change)
# ======================================================
# Package lookup order: 1. /opt/gpu-test.tgz (placed by hand),
# 2. /tmp/gpu-test.tgz (download cache).
readonly LOCAL_PACKAGE="/opt/gpu-test.tgz"    # package placed manually by the user
readonly DOWNLOAD_CACHE="/tmp/gpu-test.tgz"   # where a downloaded package is cached
# NOTE(review): plain HTTP from a raw IP address and no checksum; the payload
# is unpacked and executed later, so confirm this source is trusted.
readonly REMOTE_PACKAGE_URL="http://116.205.97.109/scripts/gpu-test.tgz"
readonly TOOL_DIR="/test/"                    # dedicated tool directory; never point it at a system directory
readonly DEFAULT_LOG_DIR="/opt/gpu-test-logs" # default log directory
readonly TGZ_SUB_DIR="gpu-test"               # sub-directory the tgz package unpacks into
# Directories that must never be used as TOOL_DIR (its contents get deleted).
readonly -a DANGER_DIRS=("/" "/opt" "/usr" "/bin" "/sbin" "/lib" "/lib64" "/etc" "/home" "/root")
# Supported benchmark items. Every value has three ';'-separated fields:
#   command ; success condition ; failure description
# The fields are split on ';', so none of them may contain a ';' itself.
# {LOG_FILE} in the success condition stands for the log file of that test.
declare -rA TEST_CONFIGS=(
  ["bandwidthTest"]="./bandwidthTest ; grep -q 'Result = PASS' {LOG_FILE} ; 显存带宽测试未通过(日志无 PASS 标记)"
  ["deviceQuery"]="./deviceQuery ; [ \$? -eq 0 ] ; GPU设备查询失败返回码非0"
  ["gpu_burn"]="./gpu_burn 600 ; [ \$? -eq 0 ] && ! grep -qE 'ERROR|FATAL|No clients are alive|Couldn'\''t init' {LOG_FILE} ; GPU稳定性测试失败崩溃/无存活客户端/初始化失败)"
  ["p2pBandwidthLatencyTest"]="./p2pBandwidthLatencyTest ; [ \$? -eq 0 ] ; GPU P2P带宽测试失败返回码非0"
  # batchCUBLAS (command: ./batchCUBLAS --device=all) is currently disabled.
  # To enable it, uncomment the entry below:
  # ["batchCUBLAS"]="./batchCUBLAS --device=all ; [ \$? -eq 0 ] && ! grep -qE 'ERROR|FATAL|Failed|Assertion failed' {LOG_FILE} ; CUBLAS批量计算测试失败返回码非0或日志含错误信息"
)
# External commands the script invokes; a missing one aborts the run with an
# install hint. The list covers every external program called in this file.
readonly -a REQUIRED_TOOLS=(
  "wget" "tar" "grep" "sed" "mkdir" "chmod" "mv"
  "rm" "ls" "cat" "cut" "tr" "awk" "du" "basename" "realpath"
)
# Files that must exist after unpacking: one binary per enabled benchmark
# plus compare.ptx.
readonly -a REQUIRED_TOOL_FILES=("${!TEST_CONFIGS[@]}" "compare.ptx")
# ======================================================
# Utility functions: log output (colour-coded for readability)
# ======================================================
# Print $1 in green with a check mark, preceded by a blank line.
# Backslash escapes contained in $1 are interpreted.
print_success() {
  local msg="$1"
  printf '%b\n' "\n\033[32m✅ ${msg}\033[0m"
}
# Print $1 in red with a cross mark, preceded by a blank line.
# Backslash escapes contained in $1 are interpreted.
print_error() {
  local msg="$1"
  printf '%b\n' "\n\033[31m❌ ${msg}\033[0m"
}
# Print $1 in blue as an informational line.
# Backslash escapes contained in $1 are interpreted (callers pass "\n...").
print_info() {
  local msg="$1"
  printf '%b\n' "\033[34m ${msg}\033[0m"
}
# Print $1 in yellow with a warning sign.
# Backslash escapes contained in $1 are interpreted.
print_warn() {
  local msg="$1"
  printf '%b\n' "\033[33m⚠ ${msg}\033[0m"
}
# ======================================================
# Safety check: refuse to run when TOOL_DIR is a protected directory
# ======================================================
# Resolves TOOL_DIR to an absolute path and exits 1 when it equals, or lies
# beneath, any entry of DANGER_DIRS. Other steps of this script run `rm -rf`
# inside TOOL_DIR, so an empty or unresolvable value is rejected too instead
# of silently passing the check.
# Globals:  TOOL_DIR (read), DANGER_DIRS (read)
# Returns:  0 when the directory is acceptable; exits 1 otherwise
check_danger_dir() {
  print_info "开始校验工具目录安全性..."
  # Declared separately from the assignment so a realpath failure is not
  # masked by the exit status of `local`.
  local resolved_dir=""
  local danger
  if [[ -z "${TOOL_DIR:-}" ]] \
    || ! resolved_dir=$(realpath -m -- "$TOOL_DIR") \
    || [[ -z "$resolved_dir" ]]; then
    print_error "无法解析工具目录:${TOOL_DIR:-}"
    exit 1
  fi
  for danger in "${DANGER_DIRS[@]}"; do
    # Reject the protected directory itself and anything below it.
    if [[ "$resolved_dir" == "$danger" || "$resolved_dir" == "$danger/"* ]]; then
      print_error "工具目录 $TOOL_DIR 是系统关键目录!禁止使用,避免误删文件"
      # The suggestions must themselves pass this check, so nothing below
      # /opt is recommended here.
      print_info "建议工具目录:/test、/tmp/gpu-test-tools 等独立目录"
      exit 1
    fi
  done
  print_success "工具目录安全性校验通过(非系统关键目录)"
}
# ======================================================
# Step 1: check the required system tools (exit when any is missing)
# ======================================================
# Looks up every entry of REQUIRED_TOOLS with `command -v`. When at least
# one cannot be found, the missing names and install hints are printed and
# the script exits with status 1.
check_system_deps() {
  print_info "开始检查系统依赖工具..."
  local absent=()
  for tool in "${REQUIRED_TOOLS[@]}"; do
    command -v "$tool" > /dev/null 2>&1 || absent+=("$tool")
  done
  # Nothing missing: report success and leave early.
  if (( ${#absent[@]} == 0 )); then
    print_success "所有系统依赖工具均已安装"
    return
  fi
  print_error "缺失必要工具:${absent[*]}"
  print_info "CentOS/RHEL 安装yum install -y ${absent[*]}"
  print_info "Debian/Ubuntu 安装apt install -y ${absent[*]}"
  exit 1
}
# ======================================================
# Step 2: parse the command line (--tests, --logs, --help)
# ======================================================
# Arguments: "$@" of the script
#   --tests=A,B,...  required; comma-separated benchmark names, each of
#                    which must be a key of TEST_CONFIGS
#   --logs=DIR       optional; defaults to DEFAULT_LOG_DIR, created on demand
#   --help | -h      print the help text and exit 0
# Globals:  TEST_CONFIGS, DEFAULT_LOG_DIR (read);
#           SELECTED_TESTS, LOG_DIR (set, readonly)
# Exits 1 on an unknown argument, a missing or unsupported test name, or an
# unusable log directory.
parse_arguments() {
  print_info "开始解析命令行参数..."
  local target_tests=""
  local log_dir="${DEFAULT_LOG_DIR}"
  local arg
  for arg in "$@"; do
    case "$arg" in
      --tests=*) target_tests="${arg#--tests=}" ;;
      --logs=*) log_dir="${arg#--logs=}" ;;
      --help | -h)
        # print_help terminates with a failure status; run it in a subshell
        # so that an explicit request for help ends with status 0.
        (print_help) || true
        exit 0
        ;;
      *)
        print_error "未知参数:$arg"
        print_help
        exit 1
        ;;
    esac
  done
  # --tests is mandatory.
  if [[ -z "$target_tests" ]]; then
    print_error "缺少必选参数 --tests请指定压测项"
    print_help
    exit 1
  fi
  # Split the list and validate every entry.
  local -a requested=()
  local -a valid_tests=()
  local raw name known is_known supported
  IFS=',' read -r -a requested <<< "$target_tests"
  for raw in "${requested[@]}"; do
    # Strip leading and trailing whitespace.
    name="${raw#"${raw%%[![:space:]]*}"}"
    name="${name%"${name##*[![:space:]]}"}"
    if [[ -z "$name" ]]; then
      print_warn "忽略空压测项(参数中多余逗号)"
      continue
    fi
    # Compare against the known keys as plain strings. The user-supplied
    # name is never used as an array subscript here, so text such as
    # '$(...)' or '@' cannot be evaluated or match "all elements".
    is_known=0
    for known in "${!TEST_CONFIGS[@]}"; do
      if [[ "$known" == "$name" ]]; then
        is_known=1
        break
      fi
    done
    if (( is_known == 0 )); then
      supported=$(IFS=','; printf '%s' "${!TEST_CONFIGS[*]}")
      print_error "不支持的压测项:$name"
      print_info "支持的压测项:$supported"
      exit 1
    fi
    valid_tests+=("$name")
  done
  # At least one usable entry is required.
  if [[ ${#valid_tests[@]} -eq 0 ]]; then
    print_error "未指定有效压测项(仅输入了逗号?)"
    exit 1
  fi
  # Create the log directory when it does not exist yet.
  if [[ ! -d "$log_dir" ]]; then
    print_info "日志目录 $log_dir 不存在,开始创建..."
    if ! mkdir -p "$log_dir" 2>/dev/null; then
      print_error "创建日志目录失败:权限不足或路径非法"
      exit 1
    fi
    print_success "日志目录创建成功:$log_dir"
  fi
  if [[ ! -w "$log_dir" ]]; then
    print_error "日志目录 $log_dir 无写入权限"
    exit 1
  fi
  # Publish the results as read-only globals.
  readonly SELECTED_TESTS=("${valid_tests[@]}")
  readonly LOG_DIR="$log_dir"
  print_success "参数解析完成"
  print_info "目标压测项:${SELECTED_TESTS[*]}"
  print_info "日志存储目录:${LOG_DIR}"
}
# ======================================================
# Step 3: obtain and unpack the benchmark package
# (local copy first, then download cache, then remote download)
# ======================================================
# Remove every entry inside TOOL_DIR, hidden ones included, while keeping
# the directory itself. "${TOOL_DIR:?}" aborts instead of letting an empty
# value turn the glob into "/*". Runs in a subshell so the shopt changes do
# not leak into the caller.
_purge_tool_dir() {
  (
    shopt -s dotglob nullglob
    local -a entries=("${TOOL_DIR:?}"/*)
    if (( ${#entries[@]} > 0 )); then
      rm -rf -- "${entries[@]}"
    fi
  )
}

# Locate the package, unpack it into TOOL_DIR, flatten the TGZ_SUB_DIR
# sub-directory when present, and verify every REQUIRED_TOOL_FILES entry.
# Globals:  LOCAL_PACKAGE, DOWNLOAD_CACHE, REMOTE_PACKAGE_URL, TOOL_DIR,
#           TGZ_SUB_DIR, REQUIRED_TOOL_FILES, TEST_CONFIGS (read)
# Exits 1 on any failure; files unpacked so far are removed first.
process_test_package() {
  print_info "开始处理压测包..."
  local package_path=""
  if [[ -f "$LOCAL_PACKAGE" && -s "$LOCAL_PACKAGE" ]]; then
    # Priority 1: package placed by the user.
    package_path="$LOCAL_PACKAGE"
    print_info "找到本地压测包:${package_path}(优先使用)"
  elif [[ -f "$DOWNLOAD_CACHE" && -s "$DOWNLOAD_CACHE" ]]; then
    # Priority 2: package downloaded by an earlier run.
    package_path="$DOWNLOAD_CACHE"
    print_info "找到下载缓存:${package_path}(复用,不重复下载)"
  else
    # Priority 3: download. The data goes to a ".part" file that is renamed
    # only after a complete transfer, so an interrupted download can never
    # be mistaken for a valid cache by a later run.
    # NOTE(review): the cache lives at a fixed path under /tmp and the
    # download is not integrity-checked; confirm this is acceptable.
    local partial="${DOWNLOAD_CACHE}.part"
    print_info "本地无压测包,开始从远程下载..."
    print_info "下载地址:$REMOTE_PACKAGE_URL"
    print_info "下载缓存:$DOWNLOAD_CACHE"
    if ! wget -c --no-check-certificate -O "$partial" "$REMOTE_PACKAGE_URL" 2>&1; then
      print_error "压测包下载失败(网络问题或地址无效)"
      print_info "手动验证地址curl -I $REMOTE_PACKAGE_URL"
      rm -f -- "$partial" # drop the incomplete file
      exit 1
    fi
    if [[ ! -s "$partial" ]]; then
      print_error "下载的压测包为空文件(地址错误?)"
      rm -f -- "$partial"
      exit 1
    fi
    if ! mv -f -- "$partial" "$DOWNLOAD_CACHE"; then
      print_error "无法保存下载缓存:$DOWNLOAD_CACHE"
      rm -f -- "$partial"
      exit 1
    fi
    package_path="$DOWNLOAD_CACHE"
    print_success "压测包下载成功:${package_path}(大小:$(du -sh "$package_path" | awk '{print $1}')"
  fi

  # Unpack into a clean TOOL_DIR (only its contents are removed).
  print_info "开始解压压测包到:$TOOL_DIR"
  mkdir -p "$TOOL_DIR" 2>/dev/null || { print_error "创建工具目录失败:权限不足"; exit 1; }
  if [[ -n "$(ls -A "$TOOL_DIR" 2>/dev/null)" ]]; then
    print_info "清理工具目录内旧文件:$TOOL_DIR"
    _purge_tool_dir 2>/dev/null || { print_error "清理旧文件失败:权限不足"; exit 1; }
  fi
  if ! tar -zxf "$package_path" -C "$TOOL_DIR" 2>&1; then
    print_error "压测包解压失败(文件损坏或格式错误)"
    _purge_tool_dir # remove the partially unpacked files
    exit 1
  fi

  # The package may unpack into TGZ_SUB_DIR/; move its content up one level.
  local sub_dir_path="$TOOL_DIR/$TGZ_SUB_DIR"
  if [[ -d "$sub_dir_path" ]]; then
    print_info "识别到tgz子目录${sub_dir_path},开始移动工具到根目录"
    mv -f -- "$sub_dir_path"/* "$TOOL_DIR/" 2>/dev/null || {
      print_error "移动工具失败(子目录内无工具?)"
      _purge_tool_dir
      exit 1
    }
    rm -rf -- "$sub_dir_path" # the sub-directory is no longer needed
  else
    print_warn "未找到tgz子目录 ${sub_dir_path},默认工具已在根目录"
  fi

  # Every required file must be present after unpacking.
  local tool_file
  for tool_file in "${REQUIRED_TOOL_FILES[@]}"; do
    if [[ ! -f "$TOOL_DIR/$tool_file" ]]; then
      print_error "解压后未找到必要文件:${tool_file}(压测包不完整)"
      _purge_tool_dir
      exit 1
    fi
  done

  # Make the tools executable; a chmod failure only produces a warning.
  chmod +x "$TOOL_DIR/"* 2>/dev/null || print_warn "部分工具已具备执行权限"
  local tool_list
  tool_list=$(IFS=','; printf '%s' "${!TEST_CONFIGS[*]}")
  print_success "压测包处理完成!工具列表:${tool_list} + compare.ptx"
}
# ======================================================
# Step 4: verify the selected benchmark tools and their dependencies
# ======================================================
# For every entry of SELECTED_TESTS: the binary must exist in TOOL_DIR and
# is made executable when it is not; gpu_burn additionally needs a non-empty
# compare.ptx next to it.
# Globals:  SELECTED_TESTS, TOOL_DIR (read)
# Exits 1 on a missing file (after clearing TOOL_DIR) or when chmod fails.
verify_tools() {
  print_info "开始验证压测工具及依赖..."
  local test tool_path ptx_path
  for test in "${SELECTED_TESTS[@]}"; do
    tool_path="$TOOL_DIR/$test"
    # The tool must exist.
    if [[ ! -f "$tool_path" ]]; then
      print_error "压测工具不存在:$tool_path"
      # ":?" aborts rather than expanding to "/*" should TOOL_DIR be empty.
      rm -rf -- "${TOOL_DIR:?}"/*
      exit 1
    fi
    # The tool must be executable.
    if [[ ! -x "$tool_path" ]]; then
      print_info "给工具添加执行权限:$test"
      chmod +x "$tool_path" 2>/dev/null || { print_error "添加权限失败"; exit 1; }
    fi
    # gpu_burn depends on compare.ptx.
    if [[ "$test" == "gpu_burn" ]]; then
      ptx_path="$TOOL_DIR/compare.ptx"
      if [[ ! -f "$ptx_path" || ! -s "$ptx_path" ]]; then
        print_error "gpu_burn 缺失依赖文件:${ptx_path}请确保tgz包中包含"
        rm -rf -- "${TOOL_DIR:?}"/*
        exit 1
      fi
      print_info "gpu_burn 依赖文件已验证:$ptx_path"
    fi
  done
  print_success "所有压测工具及依赖均有效"
}
# ======================================================
# Step 5: run one benchmark item and record its outcome
# ======================================================
# Outcome of every benchmark executed by run_single_test in this process,
# keyed by test name ("pass" or "fail"). print_summary reads it, so the
# summary always agrees with what run_single_test decided.
declare -A GPU_BENCH_RESULTS=()

# Run the benchmark named $1 as described by its TEST_CONFIGS entry
# ("command ; success condition ; failure description").
# The command runs inside TOOL_DIR with stdout and stderr captured in
# LOG_DIR/<name>.log. It executes in a subshell, so the caller's working
# directory is untouched and a relative LOG_DIR keeps pointing at the same
# place.
# Globals:  TEST_CONFIGS, TOOL_DIR, LOG_DIR (read); GPU_BENCH_RESULTS (written)
# Returns:  0 when the command succeeds and the success condition holds,
#           1 otherwise
run_single_test() {
  local test_name="$1"
  local config="${TEST_CONFIGS[$test_name]}"
  local log_file="${LOG_DIR}/${test_name}.log"
  # Assume failure until every check below has passed.
  GPU_BENCH_RESULTS["$test_name"]="fail"

  # Split the entry on ';' and trim whitespace around each field.
  local -a fields=()
  IFS=';' read -r -a fields <<< "$config"
  local idx part
  for idx in 0 1 2; do
    part="${fields[idx]:-}"
    part="${part#"${part%%[![:space:]]*}"}"
    part="${part%"${part##*[![:space:]]}"}"
    fields[idx]="$part"
  done
  local exec_cmd="${fields[0]}"
  local success_cond="${fields[1]}"
  local fail_desc="${fields[2]}"

  # {LOG_FILE} becomes a quoted reference to $log_file that is expanded when
  # the condition is evaluated, so paths containing spaces or other special
  # characters are handled correctly.
  local placeholder='{LOG_FILE}'
  local log_ref='"${log_file}"'
  success_cond="${success_cond//"$placeholder"/$log_ref}"

  print_info "\n=================================================="
  print_info "开始执行压测项:$test_name"
  print_info "执行命令cd $TOOL_DIR && $exec_cmd"
  print_info "日志路径:$log_file"
  print_info "=================================================="

  if ! (cd "$TOOL_DIR") 2>/dev/null; then
    print_error "无法进入工具目录:$TOOL_DIR"
    return 1
  fi

  # Word-split the command line into an argument vector (no glob expansion).
  local -a cmd_argv=()
  read -r -a cmd_argv <<< "$exec_cmd"
  # The redirection is opened before the subshell changes directory.
  if (( ${#cmd_argv[@]} == 0 )) \
    || ! (cd "$TOOL_DIR" && "${cmd_argv[@]}") > "$log_file" 2>&1; then
    print_error "压测项 $test_name 执行异常返回码非0"
    print_error "失败描述:$fail_desc"
    print_info "详细日志:$log_file"
    return 1
  fi

  print_info "正在校验 $test_name 压测结果..."
  # The condition comes from the TEST_CONFIGS table defined in this script,
  # never from user input. A "[ $? -eq 0 ]" in it sees the status of the
  # print_info call above; the command's own exit status was already
  # enforced by the check before it.
  if eval "$success_cond"; then
    GPU_BENCH_RESULTS["$test_name"]="pass"
    print_success "$test_name 压测执行成功"
    return 0
  fi
  print_error "$test_name 压测结果不达标"
  print_error "失败描述:$fail_desc"
  print_info "详细日志:$log_file"
  return 1
}
# ======================================================
# Step 6: print the result summary
# ======================================================
# Lists every entry of SELECTED_TESTS with the outcome recorded by
# run_single_test, followed by totals. A test with no recorded "pass"
# (failed, or never started because an earlier test failed) is counted as
# failed; leftover log files from earlier runs are not consulted.
# Globals:  SELECTED_TESTS, LOG_DIR, GPU_BENCH_RESULTS (read)
print_summary() {
  print_info "\n\n======================================================"
  echo -e " 压测结果汇总 "
  print_info "======================================================"
  local total=${#SELECTED_TESTS[@]}
  local success=0
  local failed=0
  local test log_file
  for test in "${SELECTED_TESTS[@]}"; do
    log_file="${LOG_DIR}/${test}.log"
    if [[ "${GPU_BENCH_RESULTS[$test]:-}" == "pass" ]]; then
      echo -e "✅ \033[32m$test:成功\033[0m → 日志:$log_file"
      success=$((success + 1))
    else
      echo -e "❌ \033[31m$test:失败\033[0m → 日志:$log_file"
      failed=$((failed + 1))
    fi
  done
  print_info "\n📊 统计信息"
  echo -e "总压测项:$total"
  echo -e "成功项数:\033[32m$success 项\033[0m"
  echo -e "失败项数:\033[31m$failed 项\033[0m"
  print_info "======================================================\n"
  if [[ $failed -eq 0 ]]; then
    print_success "所有压测项均执行成功!"
  else
    print_error "共有 $failed 项压测失败,请查看日志排查问题"
  fi
}
# ======================================================
# Step 7: help text
# ======================================================
# Print the usage text to stdout and terminate the script.
# Arguments: $1 - exit status to terminate with (optional, default 1)
# Globals:   TEST_CONFIGS, TGZ_SUB_DIR, DEFAULT_LOG_DIR, LOCAL_PACKAGE,
#            DOWNLOAD_CACHE, REMOTE_PACKAGE_URL, TOOL_DIR (read)
# NOTE(review): the text below still describes batchCUBLAS and uses it in
# the examples; it is only accepted when TEST_CONFIGS has a batchCUBLAS
# entry - confirm whether the text or the table should change.
print_help() {
  local exit_status="${1:-1}"
  local script_name="${0##*/}"
  local supported
  # Comma-joined list of the benchmark names that are actually configured.
  supported=$(IFS=','; printf '%s' "${!TEST_CONFIGS[*]}")
  # Variables are braced because several are followed directly by CJK text.
  cat << EOF
用法:${script_name} --tests=压测项1,压测项2,... [--logs=日志目录] [--help]
核心功能:自动处理压测包(优先本地→无则下载)→ 解压→验证→执行压测,失败即终止
适配tgz包结构解压后生成 ${TGZ_SUB_DIR}/ 子目录,自动移动工具到根目录
必选参数:
--tests=TEST1,TEST2,... 指定压测项(多个用逗号分隔,无空格)
支持的压测项:${supported}
可选参数:
--logs=LOG_DIR 日志存储目录(默认:${DEFAULT_LOG_DIR}不存在则自动创建
--help/-h 查看帮助信息
压测包优先级:
1. 本地包:${LOCAL_PACKAGE}用户手动放置优先使用
2. 缓存包:${DOWNLOAD_CACHE}之前下载过复用
3. 远程包:${REMOTE_PACKAGE_URL}无本地包时自动下载
各压测项说明:
- bandwidthTest显存带宽测试校验日志 PASS 标记)
- deviceQueryGPU设备信息查询校验返回码0
- gpu_burnGPU稳定性测试600秒校验无崩溃日志
- p2pBandwidthLatencyTestGPU P2P带宽延迟测试校验返回码0
- batchCUBLASCUBLAS批量计算测试--device=all校验无错误日志
安全提示:
❗ 工具目录默认:${TOOL_DIR}独立目录避免误删系统文件
❗ 禁止修改 TOOL_DIR 为 /opt、/usr 等系统关键目录!
示例:
1. 执行 batchCUBLAS + gpu_burn 测试(自定义日志目录)
${script_name} --tests=batchCUBLAS,gpu_burn --logs=/tmp/gpu-logs
2. 执行所有压测项(使用默认日志目录)
${script_name} --tests=bandwidthTest,deviceQuery,gpu_burn,p2pBandwidthLatencyTest,batchCUBLAS
3. 查看帮助
${script_name} -h
EOF
  exit "$exit_status"
}
# ======================================================
# Main flow (steps run strictly in order)
# ======================================================
# Arguments: "$@" of the script, handed to parse_arguments unchanged.
# Runs the preparation steps, then each selected benchmark in turn and stops
# at the first failure. The summary is printed in both cases; the script
# exits 1 after a failure and 0 otherwise.
main() {
  printf '%b\n' \
    "======================================================" \
    " GPU 压测脚本最终适配版v1.4 " \
    "======================================================\n"
  check_danger_dir      # safety gate for TOOL_DIR
  check_system_deps     # 1. required system tools
  parse_arguments "$@"  # 2. command-line arguments
  process_test_package  # 3. fetch and unpack the package
  verify_tools          # 4. tools and their dependencies
  # 5. run the benchmarks, stopping at the first failure
  print_info "\n开始执行压测流程失败即终止..."
  local exit_status=0
  for test in "${SELECTED_TESTS[@]}"; do
    run_single_test "$test" && continue
    exit_status=1
    break
  done
  # 6. summary, then exit with the overall status
  print_summary
  exit "$exit_status"
}
# Entry point: start the main flow, forwarding every command-line argument.
main "$@"