#!/bin/bash
#
# gpu_bench_auto.sh - unpack a bundle of prebuilt GPU test binaries and run
# the selected ones, writing one log file per test and a final summary.
#
# Usage:
#   gpu_bench_auto.sh --tests=TEST1,TEST2,... [--logs=LOG_DIR] [--help]
#
# Exit status: 0 when every selected test passed (or help was requested),
# 1 on any usage error, setup error or test failure.
#
# Provenance: reconstructed from a `git format-patch` mail
#   commit  3e65f478d1291038d33a586cef982b8bcee8a885
#   author  joy, Wed, 5 Nov 2025 13:57:54 +0800
#   subject "Add scripts/gpu_bench_auto.sh" (new file scripts/gpu_bench_auto.sh)
#
# NOTE(review): the test bundle is fetched over plain HTTP from a bare IP and
# its binaries are executed without any checksum/signature verification, and
# the download cache lives at a predictable path under /tmp. Pin a checksum
# (or ship the bundle via LOCAL_PACKAGE) before using this on shared hosts.
set -euo pipefail

# ======================================================
# Core configuration (kept in one place)
# ======================================================
# Bundle lookup order: 1. LOCAL_PACKAGE (placed by the operator),
# 2. DOWNLOAD_CACHE (left by an earlier run), 3. REMOTE_PACKAGE_URL.
readonly LOCAL_PACKAGE="/opt/gpu-test.tgz"
readonly DOWNLOAD_CACHE="/tmp/gpu-test.tgz"
readonly REMOTE_PACKAGE_URL="http://116.205.97.109/scripts/gpu-test.tgz"
# Directory the bundle is unpacked into. Its contents are wiped on every run,
# so it must never be (or live under) one of DANGER_DIRS.
readonly TOOL_DIR="/test/"
readonly DEFAULT_LOG_DIR="/opt/gpu-test-logs"
# Name of the top-level directory the tgz is expected to contain.
readonly TGZ_SUB_DIR="gpu-test"

# Directories (and everything below them) that TOOL_DIR must not resolve to.
readonly -a DANGER_DIRS=("/" "/opt" "/usr" "/bin" "/sbin" "/lib" "/lib64" "/etc" "/home" "/root")

# Supported tests, in the order they are listed to the user.
readonly -a SUPPORTED_TESTS=(
  "bandwidthTest"
  "deviceQuery"
  "gpu_burn"
  "p2pBandwidthLatencyTest"
  "batchCUBLAS"
)

# Command line of each test, run from inside TOOL_DIR (split on whitespace).
declare -rA TEST_COMMANDS=(
  ["bandwidthTest"]="./bandwidthTest"
  ["deviceQuery"]="./deviceQuery"
  ["gpu_burn"]="./gpu_burn 600"
  ["p2pBandwidthLatencyTest"]="./p2pBandwidthLatencyTest"
  ["batchCUBLAS"]="./batchCUBLAS --device=all"
)

# Basic regex that MUST appear in the log for the test to pass (optional).
declare -rA TEST_PASS_PATTERNS=(
  ["bandwidthTest"]='Result = PASS'
)

# Extended regex that must NOT appear in the log for the test to pass
# (optional). A zero exit status is required for every test regardless.
declare -rA TEST_FAIL_PATTERNS=(
  ["gpu_burn"]="ERROR|FATAL|No clients are alive|Couldn't init"
  ["batchCUBLAS"]='ERROR|FATAL|Failed|Assertion failed'
)

# Human-readable failure description per test.
declare -rA TEST_FAIL_DESCS=(
  ["bandwidthTest"]="显存带宽测试未通过(日志无 PASS 标记)"
  ["deviceQuery"]="GPU设备查询失败(返回码非0)"
  ["gpu_burn"]="GPU稳定性测试失败(崩溃/无存活客户端/初始化失败)"
  ["p2pBandwidthLatencyTest"]="GPU P2P带宽测试失败(返回码非0)"
  ["batchCUBLAS"]="CUBLAS批量计算测试失败(返回码非0或日志含错误信息)"
)

# Outcome of each executed test ("pass" / "fail"); absent key = not run.
declare -A TEST_RESULTS=()

# External commands this script calls.
readonly -a REQUIRED_TOOLS=("wget" "tar" "grep" "mkdir" "chmod" "mv" "rm" "du" "realpath")

# Files that must exist in TOOL_DIR after unpacking (compare.ptx is needed by
# gpu_burn).
readonly -a REQUIRED_TOOL_FILES=("${SUPPORTED_TESTS[@]}" "compare.ptx")

# ======================================================
# Helpers: coloured log output (messages may contain \n escapes)
# ======================================================
print_success() { printf '\n\033[32m✅ %b\033[0m\n' "$1"; }
print_error() { printf '\n\033[31m❌ %b\033[0m\n' "$1"; }
print_info() { printf '\033[34mℹ️ %b\033[0m\n' "$1"; }
print_warn() { printf '\033[33m⚠️ %b\033[0m\n' "$1"; }

#######################################
# Join all arguments with ", " and write the result to stdout.
#######################################
join_list() {
  local joined=""
  if (($# > 0)); then
    printf -v joined '%s, ' "$@"
  fi
  printf '%s' "${joined%, }"
}

#######################################
# Arguments: $1 - needle, $2.. - candidate strings
# Returns:   0 if the needle equals one of the candidates, 1 otherwise
#######################################
list_contains() {
  local needle="$1"
  local item
  shift
  for item in "$@"; do
    if [[ "$item" == "$needle" ]]; then
      return 0
    fi
  done
  return 1
}

#######################################
# Delete everything inside TOOL_DIR (the directory itself is kept).
# The :? guard aborts instead of expanding to "/*" if TOOL_DIR were empty.
#######################################
clear_tool_dir() {
  rm -rf -- "${TOOL_DIR:?}"/*
}

# ======================================================
# Safety check: refuse a TOOL_DIR that is a system directory
# ======================================================
check_danger_dir() {
  print_info "开始校验工具目录安全性..."

  # Assign separately from `local` so a realpath failure is not masked; an
  # empty result must never be allowed to slip past the comparison below.
  local tool_dir_real
  if ! tool_dir_real=$(realpath -m -- "$TOOL_DIR") || [[ -z "$tool_dir_real" ]]; then
    print_error "无法解析工具目录路径:${TOOL_DIR}"
    exit 1
  fi

  local danger
  for danger in "${DANGER_DIRS[@]}"; do
    # Reject the directory itself and anything below it. For "/" the prefix
    # pattern is "//*", which never matches, so only an exact "/" is refused.
    if [[ "$tool_dir_real" == "$danger" || "$tool_dir_real" == "$danger/"* ]]; then
      print_error "工具目录 ${TOOL_DIR} 是系统关键目录!禁止使用,避免误删文件"
      print_info "建议工具目录:/test、/tmp/gpu-test-tools 等独立目录"
      exit 1
    fi
  done

  print_success "工具目录安全性校验通过(非系统关键目录)"
}

# ======================================================
# Step 1: make sure every required external command exists
# ======================================================
check_system_deps() {
  print_info "开始检查系统依赖工具..."
  local -a missing_tools=()
  local tool

  for tool in "${REQUIRED_TOOLS[@]}"; do
    if ! command -v "$tool" &> /dev/null; then
      missing_tools+=("$tool")
    fi
  done

  if ((${#missing_tools[@]} != 0)); then
    print_error "缺失必要工具:${missing_tools[*]}"
    print_info "CentOS/RHEL 安装:yum install -y ${missing_tools[*]}"
    print_info "Debian/Ubuntu 安装:apt install -y ${missing_tools[*]}"
    exit 1
  fi

  print_success "所有系统依赖工具均已安装"
}

# ======================================================
# Step 2: parse --tests / --logs
# Globals written: SELECTED_TESTS (readonly array), LOG_DIR (readonly,
# absolute path)
# ======================================================
parse_arguments() {
  print_info "开始解析命令行参数..."
  local target_tests=""
  local log_dir="${DEFAULT_LOG_DIR}"
  local arg

  for arg in "$@"; do
    case "$arg" in
      --tests=*) target_tests="${arg#--tests=}" ;;
      --logs=*) log_dir="${arg#--logs=}" ;;
      --help | -h) print_help 0 ;;
      *)
        print_error "未知参数:$arg"
        print_help
        ;;
    esac
  done

  if [[ -z "$target_tests" ]]; then
    print_error "缺少必选参数 --tests!请指定压测项"
    print_help
  fi

  # Split on commas, trim each entry, validate against the supported list.
  local -a requested=()
  local -a valid_tests=()
  local raw name
  IFS=',' read -r -a requested <<< "$target_tests"
  for raw in "${requested[@]}"; do
    name="${raw#"${raw%%[![:space:]]*}"}"  # strip leading whitespace
    name="${name%"${name##*[![:space:]]}"}" # strip trailing whitespace
    if [[ -z "$name" ]]; then
      print_warn "忽略空压测项(参数中多余逗号)"
      continue
    fi
    # Plain string comparison: user input is never used as an array subscript
    # before it has been validated.
    if ! list_contains "$name" "${SUPPORTED_TESTS[@]}"; then
      print_error "不支持的压测项:$name"
      print_info "支持的压测项:$(join_list "${SUPPORTED_TESTS[@]}")"
      exit 1
    fi
    # ${arr[@]+...} keeps `set -u` happy on an empty array in older bash.
    if list_contains "$name" ${valid_tests[@]+"${valid_tests[@]}"}; then
      print_warn "忽略重复压测项:$name"
      continue
    fi
    valid_tests+=("$name")
  done

  if ((${#valid_tests[@]} == 0)); then
    print_error "未指定有效压测项(仅输入了逗号?)"
    exit 1
  fi

  # Create the log directory on demand.
  if [[ ! -d "$log_dir" ]]; then
    print_info "日志目录 $log_dir 不存在,开始创建..."
    if ! mkdir -p -- "$log_dir" 2> /dev/null; then
      print_error "创建日志目录失败:权限不足或路径非法"
      exit 1
    fi
    print_success "日志目录创建成功:$log_dir"
  fi
  if [[ ! -w "$log_dir" ]]; then
    print_error "日志目录 $log_dir 无写入权限"
    exit 1
  fi
  # Make the path absolute so a relative --logs value keeps pointing at the
  # same directory no matter what the working directory is later on.
  if ! log_dir=$(realpath -- "$log_dir"); then
    print_error "无法解析日志目录路径"
    exit 1
  fi

  # `readonly` inside a function creates read-only globals.
  readonly SELECTED_TESTS=("${valid_tests[@]}")
  readonly LOG_DIR="$log_dir"

  print_success "参数解析完成"
  print_info "目标压测项:${SELECTED_TESTS[*]}"
  print_info "日志存储目录:${LOG_DIR}"
}

# ======================================================
# Step 3: locate (or download) the bundle and unpack it into TOOL_DIR
# ======================================================
process_test_package() {
  print_info "开始处理压测包..."
  local package_path=""

  if [[ -f "$LOCAL_PACKAGE" && -s "$LOCAL_PACKAGE" ]]; then
    # Priority 1: bundle placed by the operator.
    package_path="$LOCAL_PACKAGE"
    print_info "找到本地压测包:$package_path(优先使用)"
  elif [[ -f "$DOWNLOAD_CACHE" && -s "$DOWNLOAD_CACHE" ]]; then
    # Priority 2: bundle downloaded by an earlier run.
    package_path="$DOWNLOAD_CACHE"
    print_info "找到下载缓存:$package_path(复用,不重复下载)"
  else
    # Priority 3: download.
    print_info "本地无压测包,开始从远程下载..."
    print_info "下载地址:${REMOTE_PACKAGE_URL}"
    print_info "下载缓存:${DOWNLOAD_CACHE}"

    # Download to a side file and rename on success, so an interrupted run
    # can never leave a truncated file that later runs mistake for a cache.
    # NOTE(review): --no-check-certificate is a no-op for the current http://
    # URL but would disable TLS verification if the URL became https://.
    local partial="${DOWNLOAD_CACHE}.part"
    if ! wget -c --no-check-certificate -O "$partial" "$REMOTE_PACKAGE_URL" 2>&1; then
      print_error "压测包下载失败(网络问题或地址无效)"
      print_info "手动验证地址:curl -I ${REMOTE_PACKAGE_URL}"
      rm -f -- "$partial"
      exit 1
    fi
    if [[ ! -s "$partial" ]]; then
      print_error "下载的压测包为空文件(地址错误?)"
      rm -f -- "$partial"
      exit 1
    fi
    if ! mv -f -- "$partial" "$DOWNLOAD_CACHE"; then
      print_error "保存下载缓存失败:${DOWNLOAD_CACHE}"
      rm -f -- "$partial"
      exit 1
    fi

    package_path="$DOWNLOAD_CACHE"
    local pkg_size="?"
    read -r pkg_size _ < <(du -sh -- "$package_path" 2> /dev/null) || pkg_size="?"
    print_success "压测包下载成功:$package_path(大小:${pkg_size})"
  fi

  print_info "开始解压压测包到:${TOOL_DIR}"
  if ! mkdir -p -- "$TOOL_DIR" 2> /dev/null; then
    print_error "创建工具目录失败:权限不足"
    exit 1
  fi

  # Remove leftovers of a previous run (directory contents only).
  if [[ -n "$(ls -A -- "$TOOL_DIR" 2> /dev/null)" ]]; then
    print_info "清理工具目录内旧文件:${TOOL_DIR}"
    if ! clear_tool_dir 2> /dev/null; then
      print_error "清理旧文件失败:权限不足"
      exit 1
    fi
  fi

  if ! tar -zxf "$package_path" -C "$TOOL_DIR" 2>&1; then
    print_error "压测包解压失败(文件损坏或格式错误)"
    clear_tool_dir
    # A corrupt cached download would otherwise be reused forever.
    if [[ "$package_path" == "$DOWNLOAD_CACHE" ]]; then
      rm -f -- "$DOWNLOAD_CACHE"
      print_info "已删除损坏的下载缓存,下次运行将重新下载"
    fi
    exit 1
  fi

  # The tgz normally unpacks into TGZ_SUB_DIR/; flatten it into TOOL_DIR.
  local sub_dir_path="${TOOL_DIR}/${TGZ_SUB_DIR}"
  if [[ -d "$sub_dir_path" ]]; then
    print_info "识别到tgz子目录:$sub_dir_path,开始移动工具到根目录"
    if ! mv -f -- "$sub_dir_path"/* "$TOOL_DIR/" 2> /dev/null; then
      print_error "移动工具失败(子目录内无工具?)"
      clear_tool_dir
      exit 1
    fi
    rm -rf -- "$sub_dir_path"
  else
    print_warn "未找到tgz子目录 $sub_dir_path,默认工具已在根目录"
  fi

  local tool_file
  for tool_file in "${REQUIRED_TOOL_FILES[@]}"; do
    if [[ ! -f "${TOOL_DIR}/${tool_file}" ]]; then
      print_error "解压后未找到必要文件:$tool_file(压测包不完整)"
      clear_tool_dir
      exit 1
    fi
  done

  # Best effort; verify_tools re-checks the tools that will actually run.
  chmod +x "$TOOL_DIR/"* 2> /dev/null || print_warn "部分工具添加执行权限失败"

  print_success "压测包处理完成!工具列表:$(join_list "${SUPPORTED_TESTS[@]}") + compare.ptx"
}

# ======================================================
# Step 4: make sure every selected tool exists and is executable
# ======================================================
verify_tools() {
  print_info "开始验证压测工具及依赖..."
  local name tool_path ptx_path

  for name in "${SELECTED_TESTS[@]}"; do
    tool_path="${TOOL_DIR}/${name}"
    if [[ ! -f "$tool_path" ]]; then
      print_error "压测工具不存在:$tool_path"
      clear_tool_dir
      exit 1
    fi
    if [[ ! -x "$tool_path" ]]; then
      print_info "给工具添加执行权限:$name"
      if ! chmod +x "$tool_path" 2> /dev/null; then
        print_error "添加权限失败"
        exit 1
      fi
    fi
    # gpu_burn additionally needs a non-empty compare.ptx next to it.
    if [[ "$name" == "gpu_burn" ]]; then
      ptx_path="${TOOL_DIR}/compare.ptx"
      if [[ ! -f "$ptx_path" || ! -s "$ptx_path" ]]; then
        print_error "gpu_burn 缺失依赖文件:$ptx_path(请确保tgz包中包含)"
        clear_tool_dir
        exit 1
      fi
      print_info "gpu_burn 依赖文件已验证:$ptx_path"
    fi
  done

  print_success "所有压测工具及依赖均有效"
}

# ======================================================
# Step 5: run one test
# Arguments: $1 - test name (a key of TEST_COMMANDS)
# Outputs:   tool stdout+stderr go to LOG_DIR/<name>.log
# Returns:   0 if the tool exited 0 and its log satisfies the pass/fail
#            patterns configured for the test, 1 otherwise
# ======================================================
run_single_test() {
  local test_name="$1"

  if ! list_contains "$test_name" "${SUPPORTED_TESTS[@]}"; then
    print_error "不支持的压测项:$test_name"
    return 1
  fi

  local log_file="${LOG_DIR}/${test_name}.log"
  local fail_desc="${TEST_FAIL_DESCS[$test_name]:-}"
  local pass_pattern="${TEST_PASS_PATTERNS[$test_name]:-}"
  local fail_pattern="${TEST_FAIL_PATTERNS[$test_name]:-}"
  local -a exec_cmd=()
  read -r -a exec_cmd <<< "${TEST_COMMANDS[$test_name]}"

  print_info "\n=================================================="
  print_info "开始执行压测项:$test_name"
  print_info "执行命令:cd ${TOOL_DIR} && ${exec_cmd[*]}"
  print_info "日志路径:$log_file"
  print_info "=================================================="

  if [[ ! -d "$TOOL_DIR" || ! -x "$TOOL_DIR" ]]; then
    print_error "无法进入工具目录:${TOOL_DIR}"
    return 1
  fi

  # Run inside a subshell so the script's own working directory is untouched.
  local rc=0
  (cd -- "$TOOL_DIR" && exec "${exec_cmd[@]}") > "$log_file" 2>&1 || rc=$?
  if ((rc != 0)); then
    print_error "压测项 $test_name 执行异常(返回码非0)"
    print_error "失败描述:$fail_desc"
    print_info "返回码:$rc"
    print_info "详细日志:$log_file"
    return 1
  fi

  print_info "正在校验 $test_name 压测结果..."
  local verdict=0
  if [[ -n "$pass_pattern" ]] && ! grep -q -- "$pass_pattern" "$log_file"; then
    verdict=1
  fi
  if [[ -n "$fail_pattern" ]] && grep -qE -- "$fail_pattern" "$log_file"; then
    verdict=1
  fi

  if ((verdict == 0)); then
    print_success "$test_name 压测执行成功"
    return 0
  fi
  print_error "$test_name 压测结果不达标"
  print_error "失败描述:$fail_desc"
  print_info "详细日志:$log_file"
  return 1
}

# ======================================================
# Step 6: print the summary
# Reads SELECTED_TESTS, LOG_DIR and the outcomes recorded in TEST_RESULTS, so
# the summary always agrees with what run_single_test decided; tests that
# never ran (fail-fast) are reported as such.
# ======================================================
print_summary() {
  print_info "\n\n======================================================"
  printf '%s\n' " 压测结果汇总 "
  print_info "======================================================"

  local total=${#SELECTED_TESTS[@]}
  local success=0
  local failed=0
  local skipped=0
  local name log_file

  for name in "${SELECTED_TESTS[@]}"; do
    log_file="${LOG_DIR}/${name}.log"
    case "${TEST_RESULTS[$name]:-skipped}" in
      pass)
        printf '✅ \033[32m%s:成功\033[0m → 日志:%s\n' "$name" "$log_file"
        success=$((success + 1))
        ;;
      fail)
        printf '❌ \033[31m%s:失败\033[0m → 日志:%s\n' "$name" "$log_file"
        failed=$((failed + 1))
        ;;
      *)
        printf '⏭️ \033[33m%s:未执行\033[0m\n' "$name"
        skipped=$((skipped + 1))
        ;;
    esac
  done

  print_info "\n📊 统计信息"
  printf '总压测项:%s 项\n' "$total"
  printf '成功项数:\033[32m%s 项\033[0m\n' "$success"
  printf '失败项数:\033[31m%s 项\033[0m\n' "$failed"
  printf '未执行项数:\033[33m%s 项\033[0m\n' "$skipped"
  print_info "======================================================\n"

  if ((failed > 0)); then
    print_error "共有 $failed 项压测失败,请查看日志排查问题"
  elif ((skipped > 0)); then
    print_warn "共有 $skipped 项压测未执行"
  else
    print_success "所有压测项均执行成功!"
  fi
}

# ======================================================
# Step 7: usage text
# Arguments: $1 - exit status to terminate with (default 1, i.e. usage error)
# ======================================================
print_help() {
  local exit_code="${1:-1}"
  local script_name="${0##*/}"
  local supported
  supported=$(join_list "${SUPPORTED_TESTS[@]}")

  cat << EOF
用法:${script_name} --tests=压测项1,压测项2,... [--logs=日志目录] [--help]

核心功能:自动处理压测包(优先本地→无则下载)→ 解压→验证→执行压测,失败即终止
适配tgz包结构:解压后生成 ${TGZ_SUB_DIR}/ 子目录,自动移动工具到根目录

必选参数:
  --tests=TEST1,TEST2,...  指定压测项(多个用逗号分隔,无空格)
                           支持的压测项:${supported}

可选参数:
  --logs=LOG_DIR           日志存储目录(默认:${DEFAULT_LOG_DIR},不存在则自动创建)
  --help/-h                查看帮助信息

压测包优先级:
  1. 本地包:${LOCAL_PACKAGE}(用户手动放置,优先使用)
  2. 缓存包:${DOWNLOAD_CACHE}(之前下载过,复用)
  3. 远程包:${REMOTE_PACKAGE_URL}(无本地包时自动下载)

各压测项说明:
  - bandwidthTest:显存带宽测试(校验日志 PASS 标记)
  - deviceQuery:GPU设备信息查询(校验返回码0)
  - gpu_burn:GPU稳定性测试(600秒,校验无崩溃日志)
  - p2pBandwidthLatencyTest:GPU P2P带宽延迟测试(校验返回码0)
  - batchCUBLAS:CUBLAS批量计算测试(--device=all,校验无错误日志)

安全提示:
  ❗ 工具目录默认:${TOOL_DIR}(独立目录,避免误删系统文件)
  ❗ 禁止修改 TOOL_DIR 为 /opt、/usr 等系统关键目录(含其子目录)!

示例:
  1. 执行 batchCUBLAS + gpu_burn 测试(自定义日志目录)
     ${script_name} --tests=batchCUBLAS,gpu_burn --logs=/tmp/gpu-logs

  2. 执行所有压测项(使用默认日志目录)
     ${script_name} --tests=bandwidthTest,deviceQuery,gpu_burn,p2pBandwidthLatencyTest,batchCUBLAS

  3. 查看帮助
     ${script_name} -h
EOF
  exit "$exit_code"
}

# ======================================================
# Main flow
# ======================================================
main() {
  printf '%s\n' "======================================================"
  printf '%s\n' " GPU 压测脚本(最终适配版)v1.4 "
  printf '%s\n\n' "======================================================"

  # Serve --help before anything that can fail on a half-configured host.
  local arg
  for arg in "$@"; do
    case "$arg" in
      --help | -h) print_help 0 ;;
    esac
  done

  # 1. External commands first: the safety check below depends on realpath.
  check_system_deps

  # Safety: refuse to work inside a system directory.
  check_danger_dir

  # 2. Command-line arguments.
  parse_arguments "$@"

  # 3. Locate/download and unpack the bundle.
  process_test_package

  # 4. Verify the selected tools.
  verify_tools

  # 5. Run the tests, stopping at the first failure.
  print_info "\n开始执行压测流程(失败即终止)..."
  local name
  for name in "${SELECTED_TESTS[@]}"; do
    if run_single_test "$name"; then
      TEST_RESULTS[$name]="pass"
    else
      TEST_RESULTS[$name]="fail"
      print_summary
      exit 1
    fi
  done

  # 6. Summary.
  print_summary
  exit 0
}

main "$@"