更新 scripts/deploy_gpu_monitor.sh
This commit is contained in:
parent
0a6d4bb107
commit
59170b15c6
|
|
@ -1,102 +1,64 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# 功能:监控 GPU 温度、功率、使用率、数量及在线状态(固定8卡:0~7)
|
|
||||||
# 路径:/usr/local/bin/gpu_monitor.sh
|
|
||||||
|
|
||||||
METRICS_FILE="/var/lib/node_exporter/textfile_collector/gpu_metrics.prom"
|
# 脚本部署工具 - 下载并配置GPU监控脚本
|
||||||
mkdir -p "$(dirname "${METRICS_FILE}")"
|
set -euo pipefail
|
||||||
|
IFS=$'\n\t'
|
||||||
|
|
||||||
# 预期的8张GPU编号
|
# 配置信息
|
||||||
EXPECTED_GPUS="0 1 2 3 4 5 6 7"
|
# Deployment settings. Each value can be overridden through the matching
# GPU_MONITOR_* environment variable; the defaults are the values that were
# previously hard-coded, so an unconfigured run behaves as before.
# NOTE(review): the default download URL is plain HTTP and the fetched file is
# later made executable and run - consider HTTPS or a checksum verification.

# Where the monitoring script is downloaded from.
SCRIPT_URL="${GPU_MONITOR_SCRIPT_URL:-http://116.205.97.109/scripts/gpu_metrics.sh}"
# Directory the script is stored in.
TARGET_DIR="${GPU_MONITOR_TARGET_DIR:-/opt/scripts}"
# Full path of the installed script.
TARGET_SCRIPT="${TARGET_DIR}/gpu_monitor.sh"
# Cron schedule of the periodic run (default: every two minutes).
CRON_SCHEDULE="${GPU_MONITOR_CRON_SCHEDULE:-*/2 * * * *}"
# File that receives the cron job's stdout and stderr.
LOG_FILE="${GPU_MONITOR_LOG_FILE:-/var/log/gpu_monitor.log}"
|
||||||
|
|
||||||
# 临时文件存储当前检测到的 GPU 编号和使用率
|
# 创建目标目录
|
||||||
TMP_GPU_IDS=$(mktemp)
|
mkdir -p "${TARGET_DIR}"
|
||||||
TMP_GPU_UTIL=$(mktemp)
|
cd "${TARGET_DIR}"
|
||||||
|
|
||||||
# 1. 收集所有在线 GPU 基础指标,并记录在线编号和使用率
|
# 下载脚本
|
||||||
{
|
echo "正在下载GPU监控脚本..."
|
||||||
echo "# HELP gpu_temperature GPU 核心温度 (℃)"
|
wget -q -O "${TARGET_SCRIPT}" "${SCRIPT_URL}" || {
|
||||||
echo "# TYPE gpu_temperature gauge"
|
echo "错误: 无法从 ${SCRIPT_URL} 下载脚本" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
echo "# HELP gpu_power_limit GPU 额定功率(最大功率限制,W)"
|
# 赋予执行权限
|
||||||
echo "# TYPE gpu_power_limit gauge"
|
chmod +x "${TARGET_SCRIPT}"
|
||||||
|
|
||||||
echo "# HELP gpu_power_current GPU 当前功率(实时功耗,W)"
|
# 修改脚本内容(示例:将GPU总数设置为8)
|
||||||
echo "# TYPE gpu_power_current gauge"
|
echo "正在修改脚本配置..."
|
||||||
|
sed -i 's/BASELINE_GPUS=".*"/BASELINE_GPUS="0 1 2 3 4 5 6 7"/' "${TARGET_SCRIPT}"
|
||||||
|
|
||||||
echo "# HELP gpu_utilization GPU 计算核心使用率 (%)"
|
# 确保metrics文件目录存在
|
||||||
echo "# TYPE gpu_utilization gauge"
|
METRICS_DIR="/var/lib/node_exporter/textfile_collector"
|
||||||
|
mkdir -p "${METRICS_DIR}"
|
||||||
|
chmod 755 "${METRICS_DIR}"
|
||||||
|
|
||||||
# GPU 在线状态(1=在线,0=离线)
|
# 添加定时任务到crontab
|
||||||
echo "# HELP gpu_online GPU 在线状态(1=在线,0=离线)"
|
echo "正在配置定时任务..."
|
||||||
echo "# TYPE gpu_online gauge"
|
# Install the cron entry idempotently: keep every existing crontab line except
# earlier entries that mention this script, then append the current entry, so
# running the deployment again does not stack up duplicate jobs.
cron_entry="${CRON_SCHEDULE} ${TARGET_SCRIPT} >> ${LOG_FILE} 2>&1"
{
  # `crontab -l` fails when the user has no crontab yet, and grep exits 1 when
  # it filters out every line; neither case is an error here.
  crontab -l 2>/dev/null | grep -vF -- "${TARGET_SCRIPT}" || true
  printf '%s\n' "${cron_entry}"
} | crontab -
|
||||||
|
|
||||||
# 聚合指标:所有 GPU 的平均使用率
|
# 验证定时任务
|
||||||
echo "# HELP gpu_utilization_avg 所有 GPU 的平均使用率 (%)"
|
echo "验证定时任务配置:"
|
||||||
echo "# TYPE gpu_utilization_avg gauge"
|
# Confirm the job is installed by listing the crontab and printing the lines
# that mention the script. -F matches the path literally (as a regex, the "."
# in ".sh" would match any character) and -- stops option parsing before the
# pattern.
if ! crontab -l | grep -F -- "${TARGET_SCRIPT}"; then
  echo "错误: 定时任务添加失败" >&2
  exit 1
fi
|
||||||
|
|
||||||
# 聚合指标:所有 GPU 的总使用率 (%)
|
# 首次执行脚本
|
||||||
echo "# HELP gpu_utilization_total 所有 GPU 的总使用率 (%)"
|
echo "执行脚本进行测试..."
|
||||||
echo "# TYPE gpu_utilization_total gauge"
|
# Run the installed script once as a smoke test. Its output is appended to
# ${LOG_FILE}, the same destination the cron entry uses, so the log that the
# error message points to actually contains the failure details.
if ! "${TARGET_SCRIPT}" >> "${LOG_FILE}" 2>&1; then
  echo "错误: 脚本首次执行失败,请检查 ${LOG_FILE}" >&2
  exit 1
fi
|
||||||
|
|
||||||
# 收集单块 GPU 指标
|
echo "====================================================="
|
||||||
total_util=0 # 初始化总使用率
|
echo "GPU监控脚本已成功部署!"
|
||||||
online_count=0 # 初始化在线 GPU 数量
|
echo "脚本位置: ${TARGET_SCRIPT}"
|
||||||
|
echo "定时任务: ${CRON_SCHEDULE}"
|
||||||
nvidia-smi --query-gpu=index,power.limit,power.draw,temperature.gpu,utilization.gpu \
|
echo "日志文件: ${LOG_FILE}"
|
||||||
--format=csv,noheader,nounits | while IFS=',' read -r idx limit current temp util; do
|
echo "指标文件: ${METRICS_DIR}/gpu_metrics.prom"
|
||||||
idx=$(echo "${idx}" | xargs)
|
echo "====================================================="
|
||||||
limit=$(echo "${limit}" | xargs)
|
|
||||||
current=$(echo "${current}" | xargs)
|
|
||||||
temp=$(echo "${temp}" | xargs)
|
|
||||||
util=$(echo "${util}" | xargs)
|
|
||||||
|
|
||||||
# 输出单块 GPU 指标
|
|
||||||
echo "gpu_temperature{gpu=\"${idx}\"} ${temp}"
|
|
||||||
echo "gpu_power_limit{gpu=\"${idx}\"} ${limit}"
|
|
||||||
echo "gpu_power_current{gpu=\"${idx}\"} ${current}"
|
|
||||||
echo "gpu_utilization{gpu=\"${idx}\"} ${util}"
|
|
||||||
echo "gpu_online{gpu=\"${idx}\"} 1" # 标记为在线
|
|
||||||
|
|
||||||
# 记录在线 GPU 编号和使用率
|
|
||||||
echo "${idx}" >> "${TMP_GPU_IDS}"
|
|
||||||
echo "${util}" >> "${TMP_GPU_UTIL}"
|
|
||||||
|
|
||||||
# 累加总使用率
|
|
||||||
((total_util += util))
|
|
||||||
((online_count++))
|
|
||||||
done
|
|
||||||
|
|
||||||
# 2. Number of GPUs currently online: the line count of the file that holds
# the recorded GPU indexes.
printf '%s\n' \
  "# HELP gpu_total 在线 GPU 总数" \
  "# TYPE gpu_total gauge"
# xargs trims the whitespace that some wc implementations put around the count.
gpu_count=$(wc -l < "${TMP_GPU_IDS}" | xargs)
printf 'gpu_total %s\n' "${gpu_count}"
|
|
||||||
|
|
||||||
# 3. 计算并输出聚合使用率指标
|
|
||||||
#######################################
# Emit the aggregate utilisation metrics from the per-GPU utilisation file.
#
# The per-GPU loop is the right-hand side of a pipeline, so it runs in a
# subshell and its total_util / online_count counters never reach this shell;
# they stay at their initial 0 here, which made this section always report 0.
# The values the loop appends to ${TMP_GPU_UTIL} do persist, so the aggregates
# are derived from that file instead.
#
# Arguments: $1 - file with one utilisation value per line
# Outputs:   one gpu_utilization_avg and one gpu_utilization_total sample on
#            stdout. Both are 0 when the file is missing, empty or contains no
#            numeric line; otherwise the average is printed with two decimals.
# Returns:   0 when the file is missing or empty, otherwise awk's exit status
#######################################
emit_utilization_aggregates() {
  local util_file=${1:-}
  if [ ! -s "${util_file}" ]; then
    echo "gpu_utilization_avg 0"
    echo "gpu_utilization_total 0"
    return 0
  fi
  # Lines that are not plain numbers are skipped instead of being counted as 0.
  # LC_ALL=C keeps "." as the decimal separator regardless of the locale.
  LC_ALL=C awk '
    $1 ~ /^[0-9]+(\.[0-9]+)?$/ { sum += $1; n++ }
    END {
      if (n > 0) {
        printf "gpu_utilization_avg %.2f\n", sum / n
        print "gpu_utilization_total " sum
      } else {
        print "gpu_utilization_avg 0"
        print "gpu_utilization_total 0"
      }
    }
  ' "${util_file}"
}

# 3. Compute and output the aggregate utilisation metrics.
emit_utilization_aggregates "${TMP_GPU_UTIL}"
|
|
||||||
|
|
||||||
# 4. 标记离线的 GPU
|
|
||||||
# Report every expected GPU that has no recorded index as offline: gpu_online
# is 0 and the per-GPU gauges are emitted as 0 for that index.
# ${EXPECTED_GPUS} is deliberately unquoted so the list is split into words.
for expected_gpu in ${EXPECTED_GPUS}; do
  # An index that appears on a line of its own in the ID file was seen online.
  if grep -q -x -e "${expected_gpu}" "${TMP_GPU_IDS}"; then
    continue
  fi
  for offline_metric in gpu_online gpu_temperature gpu_power_limit \
    gpu_power_current gpu_utilization; do
    printf '%s{gpu="%s"} 0\n' "${offline_metric}" "${expected_gpu}"
  done
done
|
|
||||||
} > "${METRICS_FILE}"
|
|
||||||
|
|
||||||
# Remove the two scratch files created for the GPU indexes and utilisation
# values.
for scratch_file in "${TMP_GPU_IDS}" "${TMP_GPU_UTIL}"; do
  rm -f "${scratch_file}"
done
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue