#!/bin/bash
#
# GPU stress-test driver: locates (or downloads) the test package, unpacks it
# into TOOL_DIR, validates the tools and runs the tests chosen with --tests.
set -euo pipefail

# ======================================================
# Core configuration (kept in one place so it is easy to change)
# ======================================================
# Package lookup order:
#   1. /opt/gpu-test.tgz  (placed there by hand)
#   2. /tmp/gpu-test.tgz  (download cache)
readonly LOCAL_PACKAGE="/opt/gpu-test.tgz"  # package placed manually by the user
readonly DOWNLOAD_CACHE="/tmp/gpu-test.tgz" # where a downloaded package is cached
# NOTE(review): plain-HTTP download with no checksum; the archive's binaries are
# executed later. Consider HTTPS plus a pinned digest.
readonly REMOTE_PACKAGE_URL="http://116.205.97.109/scripts/gpu-test.tgz" # remote download address
readonly TOOL_DIR="/test/"                  # dedicated directory (must never be a system directory)
readonly DEFAULT_LOG_DIR="/opt/gpu-test-logs" # default log directory
readonly TGZ_SUB_DIR="gpu-test"             # sub-directory the tgz creates when unpacked

# Directories that must never be used as TOOL_DIR (its contents get wiped).
readonly -a DANGER_DIRS=("/" "/opt" "/usr" "/bin" "/sbin" "/lib" "/lib64" "/etc" "/home" "/root")
|
|
|
|
|
|
|
|
|
|
|
|
# Supported stress tests. Each value packs three ';'-separated fields, in order:
#   <command to run> ; <success condition> ; <failure description>
# The backslash-newlines inside the double quotes are removed by the shell, so
# every value is a single line. "{LOG_FILE}" is a placeholder that the runner
# replaces with the test's log path before evaluating the success condition.
# NOTE(review): the "[ $? -eq 0 ]" conditions are evaluated by the runner after
# it has already rejected a non-zero exit, so they look redundant - confirm.
declare -rA TEST_CONFIGS=(
  ["bandwidthTest"]="\
./bandwidthTest \
; grep -q 'Result = PASS' {LOG_FILE} \
; 显存带宽测试未通过(日志无 PASS 标记)"
  ["deviceQuery"]="\
./deviceQuery \
; [ \$? -eq 0 ] \
; GPU设备查询失败(返回码非0)"
  ["gpu_burn"]="\
./gpu_burn 600 \
; [ \$? -eq 0 ] && ! grep -qE 'ERROR|FATAL|No clients are alive|Couldn'\''t init' {LOG_FILE} \
; GPU稳定性测试失败(崩溃/无存活客户端/初始化失败)"
  ["p2pBandwidthLatencyTest"]="\
./p2pBandwidthLatencyTest \
; [ \$? -eq 0 ] \
; GPU P2P带宽测试失败(返回码非0)"
  # batchCUBLAS (command: ./batchCUBLAS --device=all) is currently disabled:
  # the entry below is commented out, so it is NOT a supported test.
  #["batchCUBLAS"]="\
  # ./batchCUBLAS --device=all \
  # ; [ \$? -eq 0 ] && ! grep -qE 'ERROR|FATAL|Failed|Assertion failed' {LOG_FILE} \
  # ; CUBLAS批量计算测试失败(返回码非0或日志含错误信息)"
)
|
|
|
|
|
|
|
|
|
|
|
|
# System commands the script invokes; check_system_deps aborts when any is
# missing. Includes realpath/cut/tr/awk/du/rm, which the script also calls.
REQUIRED_TOOLS=("wget" "tar" "grep" "sed" "mkdir" "chmod" "mv" "realpath" "cut" "tr" "awk" "du" "rm")

# Files that must exist in the unpacked package: one binary per configured
# test (the keys of TEST_CONFIGS) plus compare.ptx.
REQUIRED_TOOL_FILES=("${!TEST_CONFIGS[@]}" "compare.ptx")
|
|
|
|
|
|
|
|
|
|
|
|
# ======================================================
# Logging helpers (colour-coded so the output is easy to scan)
# ======================================================
# Each helper prints the message given as $1 to stdout, wrapped in an ANSI
# colour sequence. Backslash escapes inside the message are interpreted
# (callers pass strings such as "\n====="). "${1-}" keeps a call without a
# message from aborting the script under `set -u`.
print_success() { printf '\n\033[32m✅ %b\033[0m\n' "${1-}"; }
print_error() { printf '\n\033[31m❌ %b\033[0m\n' "${1-}"; }
print_info() { printf '\033[34mℹ️ %b\033[0m\n' "${1-}"; }
print_warn() { printf '\033[33m⚠️ %b\033[0m\n' "${1-}"; }
|
|
|
|
|
|
|
|
|
|
|
|
# ======================================================
# Safety check: refuse to use a system-critical directory as TOOL_DIR
# ======================================================
# Globals:  TOOL_DIR (read), DANGER_DIRS (read)
# Outputs:  progress / error messages via the print_* helpers
# Exits:    1 when TOOL_DIR cannot be resolved, or when it equals or lies
#           below any entry of DANGER_DIRS
check_danger_dir() {
  print_info "开始校验工具目录安全性..."

  # Resolve to an absolute path (handles relative paths). Declaration and
  # assignment are split so a failing realpath is not masked by `local`;
  # an empty result must fail closed instead of slipping past every check.
  local tool_dir_real
  if ! tool_dir_real=$(realpath -m -- "${TOOL_DIR-}" 2>/dev/null) || [[ -z "$tool_dir_real" ]]; then
    print_error "无法解析工具目录路径:${TOOL_DIR-}(realpath 不可用或路径为空),为安全起见终止执行"
    exit 1
  fi

  # Reject the dangerous directories themselves and anything beneath them.
  # NOTE(review): this also rejects /opt/gpu-test-tools, which the hint below
  # recommends - confirm whether sub-directories of /opt should be allowed.
  local danger
  for danger in "${DANGER_DIRS[@]}"; do
    if [[ "$tool_dir_real" == "$danger" || "$tool_dir_real" == "$danger/"* ]]; then
      print_error "工具目录 $TOOL_DIR 是系统关键目录!禁止使用,避免误删文件"
      print_info "建议工具目录:/opt/gpu-test-tools、/tmp/gpu-test-tools 等独立目录"
      exit 1
    fi
  done

  print_success "工具目录安全性校验通过(非系统关键目录)"
}
|
|
|
|
|
|
|
|
|
|
|
|
# ======================================================
# Step 1: check required system tools (exit when any is missing)
# ======================================================
# Globals:  REQUIRED_TOOLS (read)
# Outputs:  progress / error messages via the print_* helpers
# Exits:    1 when at least one command in REQUIRED_TOOLS is not found
check_system_deps() {
  print_info "开始检查系统依赖工具..."

  local tool # kept local so the loop variable does not leak into global scope
  local -a missing_tools=()

  for tool in "${REQUIRED_TOOLS[@]}"; do
    if ! command -v "$tool" > /dev/null 2>&1; then
      missing_tools+=("$tool")
    fi
  done

  if ((${#missing_tools[@]} != 0)); then
    print_error "缺失必要工具:${missing_tools[*]}"
    print_info "CentOS/RHEL 安装:yum install -y ${missing_tools[*]}"
    print_info "Debian/Ubuntu 安装:apt install -y ${missing_tools[*]}"
    exit 1
  fi

  print_success "所有系统依赖工具均已安装"
}
|
|
|
|
|
|
|
|
|
|
|
|
# ======================================================
# Step 2: parse command-line arguments (--tests, --logs, --help)
# ======================================================
# Globals:   TEST_CONFIGS, DEFAULT_LOG_DIR (read);
#            SELECTED_TESTS, LOG_DIR (written, read-only afterwards)
# Arguments: the script's command-line arguments
# Exits:     via print_help on --help/-h (status 0 requested), on an unknown
#            argument or when --tests is missing; 1 on an unsupported test, an
#            empty test list, or an unusable log directory
parse_arguments() {
  print_info "开始解析命令行参数..."

  local target_tests=""
  local log_dir="${DEFAULT_LOG_DIR}"
  local arg raw_item item
  local -a requested=() valid_tests=()

  for arg in "$@"; do
    case "$arg" in
      --tests=*) target_tests="${arg#--tests=}" ;;
      --logs=*) log_dir="${arg#--logs=}" ;;
      # An explicit help request is not an error, so ask for exit status 0.
      --help | -h) print_help 0 ;;
      *)
        print_error "未知参数:$arg"
        print_help
        ;;
    esac
  done

  # --tests is mandatory.
  if [[ -z "$target_tests" ]]; then
    print_error "缺少必选参数 --tests!请指定压测项"
    print_help
  fi

  # Split the comma-separated list and validate every entry.
  IFS=',' read -r -a requested <<< "$target_tests"
  for raw_item in "${requested[@]}"; do
    # Trim surrounding whitespace with parameter expansion (no sed subprocess).
    item="${raw_item#"${raw_item%%[![:space:]]*}"}"
    item="${item%"${item##*[![:space:]]}"}"

    if [[ -z "$item" ]]; then
      print_warn "忽略空压测项(参数中多余逗号)"
      continue
    fi
    # "${arr[key]+x}" tests key existence without re-evaluating user input as a
    # subscript expression, unlike [[ -v "arr[$key]" ]].
    if [[ -z "${TEST_CONFIGS[$item]+x}" ]]; then
      print_error "不支持的压测项:$item"
      print_info "支持的压测项:$(echo "${!TEST_CONFIGS[@]}" | tr ' ' ', ')"
      exit 1
    fi
    valid_tests+=("$item")
  done

  # At least one valid test must remain.
  if ((${#valid_tests[@]} == 0)); then
    print_error "未指定有效压测项(仅输入了逗号?)"
    exit 1
  fi

  # Create the log directory when it does not exist yet.
  if [[ ! -d "$log_dir" ]]; then
    print_info "日志目录 $log_dir 不存在,开始创建..."
    if ! mkdir -p "$log_dir" 2> /dev/null; then
      print_error "创建日志目录失败:权限不足或路径非法"
      exit 1
    fi
    print_success "日志目录创建成功:$log_dir"
  fi
  if [[ ! -w "$log_dir" ]]; then
    print_error "日志目录 $log_dir 无写入权限"
    exit 1
  fi

  # Publish the results as read-only globals so later steps cannot alter them.
  readonly SELECTED_TESTS=("${valid_tests[@]}")
  readonly LOG_DIR="$log_dir"

  print_success "参数解析完成"
  print_info "目标压测项:${SELECTED_TESTS[*]}"
  print_info "日志存储目录:${LOG_DIR}"
}
|
|
|
|
|
|
|
|
|
|
|
|
# ======================================================
# Step 3: obtain and unpack the test package
#         (local copy first, then cached download, then remote download)
# ======================================================

# Remove everything inside TOOL_DIR, dotfiles included, but keep the directory.
# "${TOOL_DIR:?}" aborts instead of expanding to "/*" if TOOL_DIR is empty.
_purge_tool_dir() {
  local entry
  local -a entries=()
  for entry in "${TOOL_DIR:?}"/* "${TOOL_DIR:?}"/.[!.]* "${TOOL_DIR:?}"/..?*; do
    # Unmatched glob patterns stay literal; keep only paths that really exist.
    if [[ -e "$entry" || -L "$entry" ]]; then
      entries+=("$entry")
    fi
  done
  if ((${#entries[@]} == 0)); then
    return 0
  fi
  rm -rf -- "${entries[@]}"
}

# Globals:  LOCAL_PACKAGE, DOWNLOAD_CACHE, REMOTE_PACKAGE_URL, TOOL_DIR,
#           TGZ_SUB_DIR, REQUIRED_TOOL_FILES, TEST_CONFIGS (all read)
# Effects:  may download to DOWNLOAD_CACHE; wipes and repopulates TOOL_DIR
# Exits:    1 on download, extraction, move or completeness failures
process_test_package() {
  print_info "开始处理压测包..."

  local package_path=""
  # Download to a side file first: an interrupted transfer must never be
  # mistaken for a valid cache on the next run.
  local partial_download="${DOWNLOAD_CACHE}.part"
  local sub_dir_path tool_file

  if [[ -f "$LOCAL_PACKAGE" && -s "$LOCAL_PACKAGE" ]]; then
    # Priority 1: package placed manually by the user.
    package_path="$LOCAL_PACKAGE"
    print_info "找到本地压测包:${package_path}(优先使用)"
  elif [[ -f "$DOWNLOAD_CACHE" && -s "$DOWNLOAD_CACHE" ]]; then
    # Priority 2: package cached by an earlier download.
    package_path="$DOWNLOAD_CACHE"
    print_info "找到下载缓存:${package_path}(复用,不重复下载)"
  else
    # Priority 3: remote download.
    print_info "本地无压测包,开始从远程下载..."
    print_info "下载地址:$REMOTE_PACKAGE_URL"
    print_info "下载缓存:$DOWNLOAD_CACHE"

    rm -f -- "$partial_download" # never resume a stale partial file
    # NOTE(review): --no-check-certificate disables TLS verification and the
    # archive is not checksummed; kept as in the original - confirm intent.
    if ! wget -c --no-check-certificate -O "$partial_download" "$REMOTE_PACKAGE_URL" 2>&1; then
      print_error "压测包下载失败(网络问题或地址无效)"
      print_info "手动验证地址:curl -I $REMOTE_PACKAGE_URL"
      rm -f -- "$partial_download" # drop the incomplete file
      exit 1
    fi

    if [[ ! -s "$partial_download" ]]; then
      print_error "下载的压测包为空文件(地址错误?)"
      rm -f -- "$partial_download"
      exit 1
    fi

    # Publish the finished download under the cache name in one step.
    if ! mv -f -- "$partial_download" "$DOWNLOAD_CACHE"; then
      print_error "下载缓存写入失败:$DOWNLOAD_CACHE"
      rm -f -- "$partial_download"
      exit 1
    fi

    package_path="$DOWNLOAD_CACHE"
    print_success "压测包下载成功:${package_path}(大小:$(du -sh "$package_path" | awk '{print $1}'))"
  fi

  # Unpack. Only the directory's contents are removed, never the directory.
  print_info "开始解压压测包到:$TOOL_DIR"
  mkdir -p "$TOOL_DIR" 2> /dev/null || {
    print_error "创建工具目录失败:权限不足"
    exit 1
  }

  if [[ $(ls -A "$TOOL_DIR" 2> /dev/null) ]]; then
    print_info "清理工具目录内旧文件:$TOOL_DIR"
    _purge_tool_dir 2> /dev/null || {
      print_error "清理旧文件失败:权限不足"
      exit 1
    }
  fi

  if ! tar -zxf "$package_path" -C "$TOOL_DIR" 2>&1; then
    print_error "压测包解压失败(文件损坏或格式错误)"
    _purge_tool_dir # remove the partially extracted files
    # A corrupt cached download would otherwise fail again on every run.
    if [[ "$package_path" == "$DOWNLOAD_CACHE" ]]; then
      rm -f -- "$DOWNLOAD_CACHE"
      print_warn "已删除损坏的下载缓存:$DOWNLOAD_CACHE"
    fi
    exit 1
  fi

  # The tgz unpacks into a sub-directory; move its contents up into TOOL_DIR.
  sub_dir_path="$TOOL_DIR/$TGZ_SUB_DIR"
  if [[ -d "$sub_dir_path" ]]; then
    print_info "识别到tgz子目录:${sub_dir_path},开始移动工具到根目录"
    mv -f "$sub_dir_path"/* "$TOOL_DIR/" 2> /dev/null || {
      print_error "移动工具失败(子目录内无工具?)"
      _purge_tool_dir
      exit 1
    }
    rm -rf -- "$sub_dir_path" # remove the now-empty sub-directory
  else
    print_warn "未找到tgz子目录 ${sub_dir_path},默认工具已在根目录"
  fi

  # Every file named in REQUIRED_TOOL_FILES must be present.
  for tool_file in "${REQUIRED_TOOL_FILES[@]}"; do
    if [[ ! -f "$TOOL_DIR/$tool_file" ]]; then
      print_error "解压后未找到必要文件:${tool_file}(压测包不完整)"
      _purge_tool_dir
      exit 1
    fi
  done

  # Make everything in TOOL_DIR executable (best effort).
  chmod +x "$TOOL_DIR/"* 2> /dev/null || print_warn "部分工具已具备执行权限"

  print_success "压测包处理完成!工具列表:$(echo "${!TEST_CONFIGS[@]}" | tr ' ' ', ') + compare.ptx"
}
|
|
|
|
|
|
|
|
|
|
|
|
# ======================================================
# Step 4: verify the selected test tools and their dependencies
# ======================================================
# Globals:  SELECTED_TESTS, TOOL_DIR (read)
# Effects:  adds the execute bit to a selected tool that lacks it; on a
#           missing tool or dependency the contents of TOOL_DIR are removed
# Exits:    1 when a tool is missing, cannot be made executable, or gpu_burn's
#           compare.ptx is missing or empty
verify_tools() {
  print_info "开始验证压测工具及依赖..."

  local test tool_path ptx_path
  for test in "${SELECTED_TESTS[@]}"; do
    tool_path="$TOOL_DIR/$test"

    # The tool binary must exist.
    if [[ ! -f "$tool_path" ]]; then
      print_error "压测工具不存在:$tool_path"
      # ":?" aborts instead of expanding to "/*" should TOOL_DIR be empty.
      rm -rf -- "${TOOL_DIR:?}"/*
      exit 1
    fi

    # The tool binary must be executable.
    if [[ ! -x "$tool_path" ]]; then
      print_info "给工具添加执行权限:$test"
      chmod +x "$tool_path" 2> /dev/null || {
        print_error "添加权限失败"
        exit 1
      }
    fi

    # gpu_burn additionally needs a non-empty compare.ptx next to it.
    if [[ "$test" == "gpu_burn" ]]; then
      ptx_path="$TOOL_DIR/compare.ptx"
      if [[ ! -f "$ptx_path" || ! -s "$ptx_path" ]]; then
        print_error "gpu_burn 缺失依赖文件:${ptx_path}(请确保tgz包中包含)"
        rm -rf -- "${TOOL_DIR:?}"/*
        exit 1
      fi
      print_info "gpu_burn 依赖文件已验证:$ptx_path"
    fi
  done

  print_success "所有压测工具及依赖均有效"
}
|
|
|
|
|
|
|
|
|
|
|
|
# ======================================================
|
|
|
|
|
|
# 步骤5:执行单个压测项(独立函数,逻辑清晰,含 batchCUBLAS)
|
|
|
|
|
|
# ======================================================
|
|
|
|
|
|
run_single_test() {
|
|
|
|
|
|
local test_name="$1"
|
|
|
|
|
|
local config="${TEST_CONFIGS[$test_name]}"
|
|
|
|
|
|
local log_file="${LOG_DIR}/${test_name}.log"
|
|
|
|
|
|
|
|
|
|
|
|
# 解析配置:执行命令、成功条件、失败描述
|
|
|
|
|
|
local exec_cmd=$(echo "$config" | cut -d';' -f1 | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')
|
|
|
|
|
|
local success_cond=$(echo "$config" | cut -d';' -f2 | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' -e "s/{LOG_FILE}/${log_file//\//\\/}/g")
|
|
|
|
|
|
local fail_desc=$(echo "$config" | cut -d';' -f3 | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')
|
|
|
|
|
|
|
|
|
|
|
|
print_info "\n=================================================="
|
|
|
|
|
|
print_info "开始执行压测项:$test_name"
|
|
|
|
|
|
print_info "执行命令:cd $TOOL_DIR && $exec_cmd"
|
|
|
|
|
|
print_info "日志路径:$log_file"
|
|
|
|
|
|
print_info "=================================================="
|
|
|
|
|
|
|
|
|
|
|
|
# 切换目录执行(避免相对路径问题)
|
|
|
|
|
|
cd "$TOOL_DIR" 2>/dev/null || { print_error "无法进入工具目录:$TOOL_DIR"; return 1; }
|
|
|
|
|
|
|
|
|
|
|
|
# 执行压测(输出重定向到日志)
|
|
|
|
|
|
if ! $exec_cmd > "$log_file" 2>&1; then
|
|
|
|
|
|
print_error "压测项 $test_name 执行异常(返回码非0)"
|
|
|
|
|
|
print_error "失败描述:$fail_desc"
|
|
|
|
|
|
print_info "详细日志:$log_file"
|
|
|
|
|
|
return 1
|
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
|
|
# 校验压测结果
|
|
|
|
|
|
print_info "正在校验 $test_name 压测结果..."
|
|
|
|
|
|
if eval "$success_cond"; then
|
|
|
|
|
|
print_success "$test_name 压测执行成功"
|
|
|
|
|
|
return 0
|
|
|
|
|
|
else
|
|
|
|
|
|
print_error "$test_name 压测结果不达标"
|
|
|
|
|
|
print_error "失败描述:$fail_desc"
|
|
|
|
|
|
print_info "详细日志:$log_file"
|
|
|
|
|
|
return 1
|
|
|
|
|
|
fi
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
# ======================================================
# Step 6: print the summary of all selected tests
# ======================================================
# Globals:  SELECTED_TESTS, LOG_DIR (read)
# Outputs:  one pass/fail line per selected test plus totals, on stdout
# A test counts as passed when its log file exists and contains one of the
# success markers; batchCUBLAS also passes when its existing log carries no
# error marker. A missing log always counts as failed.
# NOTE(review): these markers are independent of the success conditions in
# TEST_CONFIGS, so a test the runner accepted may presumably still be listed
# as failed here if its output has none of the markers - confirm.
print_summary() {
  print_info "\n\n======================================================"
  echo -e " 压测结果汇总 "
  print_info "======================================================"

  local total=${#SELECTED_TESTS[@]}
  local success=0
  local failed=0
  local test log_file passed

  for test in "${SELECTED_TESTS[@]}"; do
    # Uses the loop variable; "test_name" is not defined in this function and
    # would abort the script under `set -u`.
    log_file="${LOG_DIR}/${test}.log"
    passed=0

    if [[ -f "$log_file" ]]; then
      if grep -qE 'SUCCESS|Result = PASS|completed successfully' "$log_file" 2> /dev/null; then
        passed=1
      elif [[ "$test" == "batchCUBLAS" ]] \
        && ! grep -qE 'ERROR|FATAL|Failed' "$log_file" 2> /dev/null; then
        passed=1
      fi
    fi

    if ((passed)); then
      echo -e "✅ \033[32m${test}:成功\033[0m → 日志:$log_file"
      success=$((success + 1))
    else
      echo -e "❌ \033[31m${test}:失败\033[0m → 日志:$log_file"
      failed=$((failed + 1))
    fi
  done

  print_info "\n📊 统计信息"
  echo -e "总压测项:$total 项"
  echo -e "成功项数:\033[32m$success 项\033[0m"
  echo -e "失败项数:\033[31m$failed 项\033[0m"
  print_info "======================================================\n"

  if [[ $failed -eq 0 ]]; then
    print_success "所有压测项均执行成功!"
  else
    print_error "共有 $failed 项压测失败,请查看日志排查问题"
  fi
}
|
|
|
|
|
|
|
|
|
|
|
|
# ======================================================
# Step 7: usage / help text
# ======================================================
# Globals:   TEST_CONFIGS, TGZ_SUB_DIR, DEFAULT_LOG_DIR, LOCAL_PACKAGE,
#            DOWNLOAD_CACHE, REMOTE_PACKAGE_URL, TOOL_DIR (read)
# Arguments: $1 - optional exit status (default 1, i.e. "usage error");
#                 pass 0 when help was explicitly requested
# Outputs:   the help text on stdout
# Exits:     always, with the status described above
# batchCUBLAS is not listed: its TEST_CONFIGS entry is commented out, so
# advertising it would produce examples that fail with "unsupported test".
print_help() {
  local exit_code="${1:-1}"
  if [[ ! "$exit_code" =~ ^[0-9]+$ ]]; then
    exit_code=1
  fi

  local supported
  supported=$(echo "${!TEST_CONFIGS[@]}" | tr ' ' ', ')

  cat << EOF
用法:$(basename "$0") --tests=压测项1,压测项2,... [--logs=日志目录] [--help]

核心功能:自动处理压测包(优先本地→无则下载)→ 解压→验证→执行压测,失败即终止
适配tgz包结构:解压后生成 $TGZ_SUB_DIR/ 子目录,自动移动工具到根目录

必选参数:
--tests=TEST1,TEST2,... 指定压测项(多个用逗号分隔,无空格)
支持的压测项:${supported}

可选参数:
--logs=LOG_DIR 日志存储目录(默认:$DEFAULT_LOG_DIR,不存在则自动创建)
--help/-h 查看帮助信息

压测包优先级:
1. 本地包:$LOCAL_PACKAGE(用户手动放置,优先使用)
2. 缓存包:$DOWNLOAD_CACHE(之前下载过,复用)
3. 远程包:$REMOTE_PACKAGE_URL(无本地包时自动下载)

各压测项说明:
- bandwidthTest:显存带宽测试(校验日志 PASS 标记)
- deviceQuery:GPU设备信息查询(校验返回码0)
- gpu_burn:GPU稳定性测试(600秒,校验无崩溃日志)
- p2pBandwidthLatencyTest:GPU P2P带宽延迟测试(校验返回码0)

安全提示:
❗ 工具目录默认:$TOOL_DIR(独立目录,避免误删系统文件)
❗ 禁止修改 TOOL_DIR 为 /opt、/usr 等系统关键目录!

示例:
1. 执行 deviceQuery + gpu_burn 测试(自定义日志目录)
$(basename "$0") --tests=deviceQuery,gpu_burn --logs=/tmp/gpu-logs

2. 执行所有压测项(使用默认日志目录)
$(basename "$0") --tests=bandwidthTest,deviceQuery,gpu_burn,p2pBandwidthLatencyTest

3. 查看帮助
$(basename "$0") -h
EOF

  exit "$exit_code"
}
|
|
|
|
|
|
|
|
|
|
|
|
# ======================================================
# Main flow (strictly linear, one step after another)
# ======================================================
# Arguments: the script's command-line arguments
# Exits:     0 when every selected test passes; 1 as soon as one test fails
#            (after printing the summary) or when any earlier step exits
main() {
  local arg test

  echo -e "======================================================"
  echo -e " GPU 压测脚本(最终适配版)v1.4 "
  echo -e "======================================================\n"

  # Answer an explicit help request before any environment check, so help is
  # available even on a machine that lacks the required tools.
  for arg in "$@"; do
    case "$arg" in
      --help | -h) print_help 0 ;;
    esac
  done

  # 1. System dependencies come first: the directory safety check relies on
  #    an external command (realpath).
  check_system_deps

  # 2. Safety check: refuse dangerous tool directories.
  check_danger_dir

  # 3. Parse command-line arguments.
  parse_arguments "$@"

  # 4. Obtain and unpack the test package.
  process_test_package

  # 5. Verify tools and their dependencies.
  verify_tools

  # 6. Run the tests; stop at the first failure.
  print_info "\n开始执行压测流程(失败即终止)..."
  for test in "${SELECTED_TESTS[@]}"; do
    if ! run_single_test "$test"; then
      print_summary
      exit 1
    fi
  done

  # 7. Print the summary.
  print_summary
  exit 0
}
|
|
|
|
|
|
|
|
|
|
|
|
# 启动主流程(接收所有命令行参数)
|
|
|
|
|
|
# Entry point: hand every command-line argument to main unchanged.
main "$@"
|