#!/bin/bash
#
# GPU stress-test driver.
#
# Locates (or downloads) a tarball of CUDA sample / stress binaries, unpacks it
# into a dedicated tool directory, then runs the tests selected with --tests
# one after another, stopping at the first failure. Each test writes its output
# to <log dir>/<test name>.log and a summary is printed at the end.
#
# Usage: see print_help (run the script with --help).
#
# Requires bash 4+ (associative arrays).

set -euo pipefail

# ======================================================
# Core configuration (kept in one place so it is easy to change)
# ======================================================
# Package lookup priority:
#   1. LOCAL_PACKAGE   (placed manually by the user)
#   2. DOWNLOAD_CACHE  (left over from a previous download)
#   3. REMOTE_PACKAGE_URL
# NOTE(review): the remote package is fetched over plain HTTP with no checksum
# and the cache lives at a predictable path under /tmp. Anything that can
# tamper with either gets its binaries executed by this script. Consider HTTPS
# plus a pinned checksum, and a cache location that is not world-writable.
readonly LOCAL_PACKAGE="/opt/gpu-test.tgz"
readonly DOWNLOAD_CACHE="/tmp/gpu-test.tgz"
readonly REMOTE_PACKAGE_URL="http://116.205.97.109/scripts/gpu-test.tgz"
# Dedicated tool directory. Its contents are wiped before every extraction,
# so it must never point at (or below) a system directory.
readonly TOOL_DIR="/test/"
readonly DEFAULT_LOG_DIR="/opt/gpu-test-logs"
# Name of the sub-directory the tarball is expected to create when extracted.
readonly TGZ_SUB_DIR="gpu-test"

# Directories that must never be used as the tool directory. Anything below
# them is rejected as well (the bare "/" entry only matches "/" itself).
readonly -a DANGER_DIRS=(
  "/" "/opt" "/usr" "/bin" "/sbin" "/lib" "/lib64" "/etc" "/home" "/root"
)

# Supported tests. Each value has three ';'-separated fields:
#   <command> ; <success condition> ; <failure description>
# The command is run from inside TOOL_DIR. The success condition is a shell
# snippet evaluated only after the command exited with status 0; inside it
#   {LOG_FILE} is replaced with the shell-quoted path of the test's log file
#   $?         is replaced with the command's exit status
declare -rA TEST_CONFIGS=(
  ["bandwidthTest"]="\
    ./bandwidthTest \
    ; grep -q 'Result = PASS' {LOG_FILE} \
    ; 显存带宽测试未通过(日志无 PASS 标记)"
  ["deviceQuery"]="\
    ./deviceQuery \
    ; [ \$? -eq 0 ] \
    ; GPU设备查询失败(返回码非0)"
  ["gpu_burn"]="\
    ./gpu_burn 600 \
    ; [ \$? -eq 0 ] && ! grep -qE 'ERROR|FATAL|No clients are alive|Couldn'\''t init' {LOG_FILE} \
    ; GPU稳定性测试失败(崩溃/无存活客户端/初始化失败)"
  ["p2pBandwidthLatencyTest"]="\
    ./p2pBandwidthLatencyTest \
    ; [ \$? -eq 0 ] \
    ; GPU P2P带宽测试失败(返回码非0)"
  # batchCUBLAS is currently disabled. Uncomment to enable it; the binary then
  # also becomes a required file of the package.
  #["batchCUBLAS"]="\
  #  ./batchCUBLAS --device=all \
  #  ; [ \$? -eq 0 ] && ! grep -qE 'ERROR|FATAL|Failed|Assertion failed' {LOG_FILE} \
  #  ; CUBLAS批量计算测试失败(返回码非0或日志含错误信息)"
)

# System tools that must be installed before anything else is attempted.
readonly -a REQUIRED_TOOLS=(
  "wget" "tar" "grep" "sed" "mkdir" "chmod" "mv" "realpath"
)

# Files that must exist in the tool directory after extraction: one binary per
# enabled test plus compare.ptx (needed by gpu_burn).
readonly -a REQUIRED_TOOL_FILES=("${!TEST_CONFIGS[@]}" "compare.ptx")

# Outcome of every test that was actually started: name -> "pass" | "fail".
# Filled in by run_single_test and read by print_summary.
declare -A TEST_RESULTS=()

# ======================================================
# Helpers: coloured log output
# ======================================================
# Messages are expanded with %b so callers may embed "\n" escapes.
# Errors and warnings go to stderr; success/info go to stdout.
print_success() { printf '\n\033[32m✅ %b\033[0m\n' "$1"; }
print_error() { printf '\n\033[31m❌ %b\033[0m\n' "$1" >&2; }
print_info() { printf '\033[34mℹ️ %b\033[0m\n' "$1"; }
print_warn() { printf '\033[33m⚠️ %b\033[0m\n' "$1" >&2; }

# Prints $1 with leading and trailing whitespace removed.
trim() {
  local s="$1"
  s="${s#"${s%%[![:space:]]*}"}"
  s="${s%"${s##*[![:space:]]}"}"
  printf '%s' "$s"
}

# Prints the names of all enabled tests, separated by ", ".
supported_tests() {
  local out="" name
  for name in "${!TEST_CONFIGS[@]}"; do
    out+="${out:+, }${name}"
  done
  printf '%s' "$out"
}

# Succeeds when TOOL_DIR contains at least one entry (dotfiles included).
tool_dir_has_entries() {
  (
    shopt -s dotglob nullglob
    entries=("${TOOL_DIR:?}"/*)
    ((${#entries[@]} > 0))
  )
}

# Removes everything inside TOOL_DIR (dotfiles included) but keeps the
# directory itself. ${TOOL_DIR:?} aborts instead of expanding to "/*" should
# the variable ever be empty.
clean_tool_dir() {
  [[ -d "${TOOL_DIR:?}" ]] || return 0
  (
    shopt -s dotglob nullglob
    entries=("${TOOL_DIR:?}"/*)
    if ((${#entries[@]} > 0)); then
      rm -rf -- "${entries[@]}"
    fi
  )
}

# ======================================================
# Safety check: refuse to use a system directory as the tool directory
# ======================================================
# Exits with status 1 when TOOL_DIR resolves to one of DANGER_DIRS or to a
# path below one of them, or when the path cannot be resolved at all.
check_danger_dir() {
  print_info "开始校验工具目录安全性..."

  # Resolve to an absolute, symlink-free path. Declared separately so that a
  # realpath failure is not masked by 'local'.
  local tool_dir_real
  if ! tool_dir_real=$(realpath -m -- "$TOOL_DIR") || [[ -z "$tool_dir_real" ]]; then
    print_error "无法解析工具目录路径:$TOOL_DIR"
    exit 1
  fi

  local danger
  for danger in "${DANGER_DIRS[@]}"; do
    if [[ "$tool_dir_real" == "$danger" || "$tool_dir_real" == "$danger/"* ]]; then
      print_error "工具目录 $TOOL_DIR 是系统关键目录!禁止使用,避免误删文件"
      print_info "建议工具目录:/test、/tmp/gpu-test-tools 等独立目录"
      exit 1
    fi
  done

  print_success "工具目录安全性校验通过(非系统关键目录)"
}

# ======================================================
# Step 1: check required system tools (exit if any is missing)
# ======================================================
check_system_deps() {
  print_info "开始检查系统依赖工具..."

  local -a missing_tools=()
  local tool
  for tool in "${REQUIRED_TOOLS[@]}"; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      missing_tools+=("$tool")
    fi
  done

  if ((${#missing_tools[@]} != 0)); then
    print_error "缺失必要工具:${missing_tools[*]}"
    print_info "CentOS/RHEL 安装:yum install -y ${missing_tools[*]}"
    print_info "Debian/Ubuntu 安装:apt install -y ${missing_tools[*]}"
    exit 1
  fi

  print_success "所有系统依赖工具均已安装"
}

# ======================================================
# Step 2: parse command-line arguments (--tests, --logs, --help)
# ======================================================
# Arguments: the script's command-line arguments.
# Globals written: SELECTED_TESTS (array), LOG_DIR (absolute path); both are
# made read-only.
# Exits on unknown arguments, unsupported tests or an unusable log directory.
parse_arguments() {
  print_info "开始解析命令行参数..."

  local target_tests=""
  local log_dir="${DEFAULT_LOG_DIR}"
  local arg

  for arg in "$@"; do
    case "$arg" in
      --tests=*)
        target_tests="${arg#--tests=}"
        ;;
      --logs=*)
        log_dir="${arg#--logs=}"
        ;;
      --help | -h)
        # An explicit help request is not an error.
        print_help 0
        ;;
      *)
        print_error "未知参数:$arg"
        print_help
        ;;
    esac
  done

  # --tests is mandatory.
  if [[ -z "$target_tests" ]]; then
    print_error "缺少必选参数 --tests!请指定压测项"
    print_help
  fi

  # Split the comma-separated list and validate every entry.
  local -a raw_tests=()
  IFS=',' read -r -a raw_tests <<<"$target_tests"

  local -a valid_tests=()
  local raw name
  for raw in "${raw_tests[@]}"; do
    name=$(trim "$raw")
    if [[ -z "$name" ]]; then
      print_warn "忽略空压测项(参数中多余逗号)"
      continue
    fi
    # ${array[key]+set} tests for existence without re-evaluating the
    # user-supplied key the way [[ -v "array[key]" ]] does.
    if [[ -z "${TEST_CONFIGS[$name]+set}" ]]; then
      print_error "不支持的压测项:$name"
      print_info "支持的压测项:$(supported_tests)"
      exit 1
    fi
    valid_tests+=("$name")
  done

  if ((${#valid_tests[@]} == 0)); then
    print_error "未指定有效压测项(仅输入了逗号?)"
    exit 1
  fi

  # Create the log directory when it does not exist yet.
  if [[ ! -d "$log_dir" ]]; then
    print_info "日志目录 $log_dir 不存在,开始创建..."
    if ! mkdir -p -- "$log_dir" 2>/dev/null; then
      print_error "创建日志目录失败:权限不足或路径非法"
      exit 1
    fi
    print_success "日志目录创建成功:$log_dir"
  fi

  if [[ ! -w "$log_dir" ]]; then
    print_error "日志目录 $log_dir 无写入权限"
    exit 1
  fi

  # Make the path absolute: tests are executed from inside TOOL_DIR, so a
  # relative --logs value would otherwise point somewhere else later on.
  if ! log_dir=$(cd -- "$log_dir" && pwd); then
    print_error "日志目录 $log_dir 无法访问"
    exit 1
  fi

  # Publish the results as read-only globals.
  readonly SELECTED_TESTS=("${valid_tests[@]}")
  readonly LOG_DIR="$log_dir"

  print_success "参数解析完成"
  print_info "目标压测项:${SELECTED_TESTS[*]}"
  print_info "日志存储目录:${LOG_DIR}"
}

# ======================================================
# Step 3: obtain and unpack the test package
# ======================================================
# Uses LOCAL_PACKAGE if present, else DOWNLOAD_CACHE, else downloads from
# REMOTE_PACKAGE_URL. The package is extracted into a freshly emptied TOOL_DIR;
# if it contains a TGZ_SUB_DIR directory, that directory's contents are moved
# up into TOOL_DIR. Exits with status 1 on any failure.
process_test_package() {
  print_info "开始处理压测包..."

  local package_path=""

  if [[ -f "$LOCAL_PACKAGE" && -s "$LOCAL_PACKAGE" ]]; then
    # Priority 1: package placed manually by the user.
    package_path="$LOCAL_PACKAGE"
    print_info "找到本地压测包:$package_path(优先使用)"
  elif [[ -f "$DOWNLOAD_CACHE" && -s "$DOWNLOAD_CACHE" ]]; then
    # Priority 2: cache from an earlier download.
    package_path="$DOWNLOAD_CACHE"
    print_info "找到下载缓存:$package_path(复用,不重复下载)"
  else
    # Priority 3: download. The data goes to a ".part" file first and is only
    # renamed to the cache path once complete, so an interrupted download can
    # never be mistaken for a valid cache on the next run.
    local partial="${DOWNLOAD_CACHE}.part"
    print_info "本地无压测包,开始从远程下载..."
    print_info "下载地址:$REMOTE_PACKAGE_URL"
    print_info "下载缓存:$DOWNLOAD_CACHE"

    if ! wget -c --no-check-certificate -O "$partial" "$REMOTE_PACKAGE_URL" 2>&1; then
      print_error "压测包下载失败(网络问题或地址无效)"
      print_info "手动验证地址:curl -I $REMOTE_PACKAGE_URL"
      rm -f -- "$partial"
      exit 1
    fi

    if [[ ! -s "$partial" ]]; then
      print_error "下载的压测包为空文件(地址错误?)"
      rm -f -- "$partial"
      exit 1
    fi

    if ! mv -f -- "$partial" "$DOWNLOAD_CACHE"; then
      print_error "压测包下载失败(网络问题或地址无效)"
      rm -f -- "$partial"
      exit 1
    fi

    package_path="$DOWNLOAD_CACHE"

    # The size is informational only; fall back to "?" if du is unavailable.
    local size
    size=$(du -sh -- "$package_path" 2>/dev/null) || size="?"
    size="${size%%[[:space:]]*}"
    print_success "压测包下载成功:$package_path(大小:${size})"
  fi

  print_info "开始解压压测包到:$TOOL_DIR"
  mkdir -p -- "$TOOL_DIR" 2>/dev/null || {
    print_error "创建工具目录失败:权限不足"
    exit 1
  }

  # Only the directory's contents are removed, never the directory itself.
  if tool_dir_has_entries; then
    print_info "清理工具目录内旧文件:$TOOL_DIR"
    clean_tool_dir 2>/dev/null || {
      print_error "清理旧文件失败:权限不足"
      exit 1
    }
  fi

  if ! tar -zxf "$package_path" -C "$TOOL_DIR" 2>&1; then
    print_error "压测包解压失败(文件损坏或格式错误)"
    clean_tool_dir || true
    # A broken cache would otherwise be picked up again on every later run.
    if [[ "$package_path" == "$DOWNLOAD_CACHE" ]]; then
      rm -f -- "$DOWNLOAD_CACHE"
    fi
    exit 1
  fi

  # Flatten the tarball layout: move <TOOL_DIR>/<TGZ_SUB_DIR>/* up one level.
  local sub_dir_path="${TOOL_DIR%/}/${TGZ_SUB_DIR}"
  if [[ -d "$sub_dir_path" ]]; then
    print_info "识别到tgz子目录:$sub_dir_path,开始移动工具到根目录"
    mv -f -- "$sub_dir_path"/* "$TOOL_DIR/" 2>/dev/null || {
      print_error "移动工具失败(子目录内无工具?)"
      clean_tool_dir || true
      exit 1
    }
    rm -rf -- "$sub_dir_path"
  else
    print_warn "未找到tgz子目录 $sub_dir_path,默认工具已在根目录"
  fi

  # Every enabled test binary and compare.ptx must be present.
  local tool_file
  for tool_file in "${REQUIRED_TOOL_FILES[@]}"; do
    if [[ ! -f "${TOOL_DIR%/}/${tool_file}" ]]; then
      print_error "解压后未找到必要文件:$tool_file(压测包不完整)"
      clean_tool_dir || true
      exit 1
    fi
  done

  # Best effort; verify_tools re-checks the selected binaries individually.
  chmod +x -- "${TOOL_DIR%/}"/* 2>/dev/null \
    || print_warn "部分文件添加执行权限失败"

  print_success "压测包处理完成!工具列表:$(supported_tests) + compare.ptx"
}

# ======================================================
# Step 4: verify the selected test binaries and their dependencies
# ======================================================
# Checks that each selected test has an executable file in TOOL_DIR and that
# gpu_burn's compare.ptx exists and is non-empty. Exits with status 1 otherwise.
verify_tools() {
  print_info "开始验证压测工具及依赖..."

  local name tool_path ptx_path
  for name in "${SELECTED_TESTS[@]}"; do
    tool_path="${TOOL_DIR%/}/${name}"

    if [[ ! -f "$tool_path" ]]; then
      print_error "压测工具不存在:$tool_path"
      clean_tool_dir || true
      exit 1
    fi

    if [[ ! -x "$tool_path" ]]; then
      print_info "给工具添加执行权限:$name"
      chmod +x -- "$tool_path" 2>/dev/null || {
        print_error "添加权限失败"
        exit 1
      }
    fi

    # gpu_burn needs compare.ptx next to the binary.
    if [[ "$name" == "gpu_burn" ]]; then
      ptx_path="${TOOL_DIR%/}/compare.ptx"
      if [[ ! -f "$ptx_path" || ! -s "$ptx_path" ]]; then
        print_error "gpu_burn 缺失依赖文件:$ptx_path(请确保tgz包中包含)"
        clean_tool_dir || true
        exit 1
      fi
      print_info "gpu_burn 依赖文件已验证:$ptx_path"
    fi
  done

  print_success "所有压测工具及依赖均有效"
}

# ======================================================
# Step 5: run a single test
# ======================================================
# Arguments: $1 - test name (a key of TEST_CONFIGS)
# Globals:   reads TOOL_DIR, LOG_DIR, TEST_CONFIGS; writes TEST_RESULTS[$1]
# Returns:   0 when the command exited 0 and its success condition holds,
#            1 otherwise.
run_single_test() {
  local test_name="$1"
  local config="${TEST_CONFIGS[$test_name]}"
  local log_file="${LOG_DIR}/${test_name}.log"

  # Split "<command> ; <success condition> ; <failure description>".
  local exec_cmd success_cond fail_desc
  IFS=';' read -r exec_cmd success_cond fail_desc <<<"$config"
  exec_cmd=$(trim "$exec_cmd")
  success_cond=$(trim "$success_cond")
  fail_desc=$(trim "$fail_desc")

  # Assume failure until the test has demonstrably passed.
  TEST_RESULTS["$test_name"]="fail"

  print_info "\n=================================================="
  print_info "开始执行压测项:$test_name"
  print_info "执行命令:cd $TOOL_DIR && $exec_cmd"
  print_info "日志路径:$log_file"
  print_info "=================================================="

  if [[ ! -d "$TOOL_DIR" || ! -x "$TOOL_DIR" ]]; then
    print_error "无法进入工具目录:$TOOL_DIR"
    return 1
  fi

  # Turn the command string into an argument vector (split on whitespace,
  # no glob expansion).
  local -a cmd_argv=()
  read -r -a cmd_argv <<<"$exec_cmd"
  if ((${#cmd_argv[@]} == 0)); then
    print_error "压测项 $test_name 执行异常(返回码非0)"
    print_error "失败描述:$fail_desc"
    return 1
  fi

  # Run from inside TOOL_DIR in a subshell so the caller's working directory
  # is untouched; all output goes to the log file.
  local rc=0
  (cd -- "$TOOL_DIR" && "${cmd_argv[@]}") >"$log_file" 2>&1 || rc=$?

  if ((rc != 0)); then
    print_error "压测项 $test_name 执行异常(返回码 $rc)"
    print_error "失败描述:$fail_desc"
    print_info "详细日志:$log_file"
    return 1
  fi

  # Fill in the placeholders of the success condition. The log path is
  # shell-quoted because the condition is evaluated with eval; "$?" is
  # replaced by the command's real exit status, since by now $? would only
  # reflect the most recent helper call.
  local quoted_log
  printf -v quoted_log '%q' "$log_file"
  success_cond=${success_cond//'{LOG_FILE}'/"$quoted_log"}
  success_cond=${success_cond//'$?'/"$rc"}

  print_info "正在校验 $test_name 压测结果..."
  # eval is applied to the trusted TEST_CONFIGS snippet only.
  if eval "$success_cond"; then
    TEST_RESULTS["$test_name"]="pass"
    print_success "$test_name 压测执行成功"
    return 0
  fi

  print_error "$test_name 压测结果不达标"
  print_error "失败描述:$fail_desc"
  print_info "详细日志:$log_file"
  return 1
}

# ======================================================
# Step 6: print the result summary
# ======================================================
# Reports every selected test as passed, failed or not run, based on the
# outcomes recorded in TEST_RESULTS by run_single_test. Tests that were never
# started (because an earlier one failed) are listed as not run.
print_summary() {
  print_info "\n\n======================================================"
  echo -e "                     压测结果汇总                     "
  print_info "======================================================"

  local total=${#SELECTED_TESTS[@]}
  local success=0
  local failed=0
  local skipped=0
  local name log_file

  for name in "${SELECTED_TESTS[@]}"; do
    log_file="${LOG_DIR}/${name}.log"
    case "${TEST_RESULTS[$name]:-skipped}" in
      pass)
        echo -e "✅ \033[32m$name:成功\033[0m → 日志:$log_file"
        success=$((success + 1))
        ;;
      fail)
        echo -e "❌ \033[31m$name:失败\033[0m → 日志:$log_file"
        failed=$((failed + 1))
        ;;
      *)
        echo -e "⏭️ \033[33m$name:未执行\033[0m(前序压测失败,已终止)"
        skipped=$((skipped + 1))
        ;;
    esac
  done

  print_info "\n📊 统计信息"
  echo -e "总压测项:$total 项"
  echo -e "成功项数:\033[32m$success 项\033[0m"
  echo -e "失败项数:\033[31m$failed 项\033[0m"
  echo -e "未执行项数:\033[33m$skipped 项\033[0m"
  print_info "======================================================\n"

  if ((failed == 0 && skipped == 0)); then
    print_success "所有压测项均执行成功!"
  else
    print_error "共有 $failed 项压测失败,请查看日志排查问题"
  fi
}

# ======================================================
# Step 7: help text
# ======================================================
# Arguments: $1 - exit status to terminate with (optional, default 1)
# Prints the usage text and exits.
print_help() {
  local exit_code="${1:-1}"
  local prog="${0##*/}"

  cat <<EOF
用法:${prog} --tests=压测项1,压测项2,... [--logs=日志目录] [--help]

核心功能:自动处理压测包(优先本地→无则下载)→ 解压→验证→执行压测,失败即终止
适配tgz包结构:解压后生成 $TGZ_SUB_DIR/ 子目录,自动移动工具到根目录

必选参数:
  --tests=TEST1,TEST2,...  指定压测项(多个用逗号分隔,无空格)
                           支持的压测项:$(supported_tests)

可选参数:
  --logs=LOG_DIR           日志存储目录(默认:$DEFAULT_LOG_DIR,不存在则自动创建)
  --help/-h                查看帮助信息

压测包优先级:
  1. 本地包:$LOCAL_PACKAGE(用户手动放置,优先使用)
  2. 缓存包:$DOWNLOAD_CACHE(之前下载过,复用)
  3. 远程包:$REMOTE_PACKAGE_URL(无本地包时自动下载)

各压测项说明:
  - bandwidthTest:显存带宽测试(校验日志 PASS 标记)
  - deviceQuery:GPU设备信息查询(校验返回码0)
  - gpu_burn:GPU稳定性测试(600秒,校验无崩溃日志)
  - p2pBandwidthLatencyTest:GPU P2P带宽延迟测试(校验返回码0)
  - batchCUBLAS:CUBLAS批量计算测试(--device=all,校验无错误日志;默认未启用)

安全提示:
  ❗ 工具目录默认:$TOOL_DIR(独立目录,避免误删系统文件)
  ❗ 禁止修改 TOOL_DIR 为 /opt、/usr 等系统关键目录!

示例:
  1. 执行 deviceQuery + gpu_burn 测试(自定义日志目录)
     ${prog} --tests=deviceQuery,gpu_burn --logs=/tmp/gpu-logs
  2. 执行所有压测项(使用默认日志目录)
     ${prog} --tests=bandwidthTest,deviceQuery,gpu_burn,p2pBandwidthLatencyTest
  3. 查看帮助
     ${prog} -h
EOF
  exit "$exit_code"
}

# ======================================================
# Main flow: the steps run strictly in order
# ======================================================
# Arguments: the script's command-line arguments.
# Exits 0 when every selected test passed, 1 otherwise.
main() {
  echo -e "======================================================"
  echo -e "          GPU 压测脚本(最终适配版)v1.4          "
  echo -e "======================================================\n"

  # 1. Required system tools. Runs first because the directory safety check
  #    below depends on realpath.
  check_system_deps

  # Safety check: never operate on a system directory.
  check_danger_dir

  # 2. Command-line arguments.
  parse_arguments "$@"

  # 3. Obtain and unpack the test package.
  process_test_package

  # 4. Verify the selected tools and their dependencies.
  verify_tools

  # 5. Run the tests, stopping at the first failure.
  print_info "\n开始执行压测流程(失败即终止)..."
  local name
  for name in "${SELECTED_TESTS[@]}"; do
    if ! run_single_test "$name"; then
      print_summary
      exit 1
    fi
  done

  # 6. Summary.
  print_summary
  exit 0
}

# Entry point: forward all command-line arguments.
main "$@"