#!/bin/bash
#
# GPU metrics exporter for the node_exporter textfile collector.
#
# Repository path: scripts/deploy_gpu_monitor.sh
# Install path:    /usr/local/bin/gpu_monitor.sh
#
# Queries nvidia-smi once and writes, in the Prometheus text exposition format:
#   - per GPU: temperature, power limit, current power draw, utilization and
#     online state (1 = online, 0 = offline);
#   - the number of online GPUs;
#   - the sum and the average of the GPU utilization readings.
# The host is expected to carry a fixed set of 8 GPUs (indexes 0-7). Every
# expected index that nvidia-smi does not report is exported as offline with
# all of its readings set to 0.
#
# Optional environment overrides:
#   METRICS_FILE   output file (default: the textfile collector path below)
#   EXPECTED_GPUS  space-separated list of expected GPU indexes
#
# `set -e` is deliberately not used: a failing nvidia-smi must still produce a
# metrics file (with every expected GPU marked offline), so each critical step
# checks its own exit status instead.

METRICS_FILE="${METRICS_FILE:-/var/lib/node_exporter/textfile_collector/gpu_metrics.prom}"
EXPECTED_GPUS="${EXPECTED_GPUS:-0 1 2 3 4 5 6 7}"

# Temporary file currently being written by main(); removed on any exit path.
TMP_METRICS_FILE=''

#######################################
# Removes the in-progress temporary file, if any.
# Globals: TMP_METRICS_FILE (read)
#######################################
cleanup() {
  if [[ -n "${TMP_METRICS_FILE}" ]]; then
    rm -f -- "${TMP_METRICS_FILE}"
  fi
}
trap cleanup EXIT

#######################################
# Prints a diagnostic message to stderr.
# Arguments: message words
#######################################
log_err() {
  printf 'gpu_monitor: %s\n' "$*" >&2
}

#######################################
# Runs the nvidia-smi query.
# Outputs: one CSV line per GPU: index,power.limit,power.draw,temperature,util
# Returns: the exit status of nvidia-smi (127 when it is not installed)
#######################################
query_gpus() {
  nvidia-smi \
    --query-gpu=index,power.limit,power.draw,temperature.gpu,utilization.gpu \
    --format=csv,noheader,nounits
}

#######################################
# Normalises one nvidia-smi field into a valid sample value.
# Arguments: $1 - raw field, possibly padded with whitespace
# Outputs:   the number itself, or NaN when the field is not numeric
#            (nvidia-smi prints e.g. "[N/A]" for unsupported readings, and a
#            single non-numeric value would make the whole file unparseable)
#######################################
metric_value() {
  local raw=${1//[[:space:]]/}
  local -r number_re='^-?[0-9]+([.][0-9]+)?$'
  if [[ "${raw}" =~ ${number_re} ]]; then
    printf '%s' "${raw}"
  else
    printf 'NaN'
  fi
}

#######################################
# Prints one gauge metric family: HELP line, TYPE line, then its samples.
# Arguments: $1 - metric name
#            $2 - help text
#            $3 - sample lines, each newline-terminated (may be empty)
#######################################
emit_family() {
  printf '# HELP %s %s\n# TYPE %s gauge\n%s' "$1" "$2" "$1" "$3"
}

#######################################
# Converts nvidia-smi CSV into Prometheus text exposition format.
# Samples are grouped per metric family, as the exposition format requires.
# Globals:   EXPECTED_GPUS (read, only when $1 is not given)
# Arguments: $1 - space-separated expected GPU indexes (optional)
# Inputs:    CSV lines on stdin (index, power limit, power draw, temp, util)
# Outputs:   metrics text on stdout
# Notes:     - lines whose first field is not a plain integer (for instance an
#              nvidia-smi error message) are ignored, as are repeated indexes;
#            - total/average utilization only cover GPUs that reported an
#              integer utilization; both are 0 when there is no such GPU;
#            - the average is truncated to two decimals.
#######################################
render_metrics() {
  local expected_ids=${1-${EXPECTED_GPUS}}
  local -r int_re='^[0-9]+$'
  local idx raw_limit raw_draw raw_temp raw_util _extra
  local limit draw temp util gpu hundredths
  local temp_lines='' limit_lines='' draw_lines='' util_lines='' online_lines=''
  local seen_ids=' '
  local online_count=0 util_total=0 util_samples=0 avg_util=0

  # The loop reads from stdin of this function rather than sitting on the
  # right-hand side of a pipeline inside it, so the counters survive the loop.
  while IFS=',' read -r idx raw_limit raw_draw raw_temp raw_util _extra \
    || [[ -n "${idx}" ]]; do
    idx=${idx//[[:space:]]/}
    [[ "${idx}" =~ ${int_re} ]] || continue
    [[ "${seen_ids}" == *" ${idx} "* ]] && continue
    seen_ids+="${idx} "
    online_count=$((online_count + 1))

    limit=$(metric_value "${raw_limit}")
    draw=$(metric_value "${raw_draw}")
    temp=$(metric_value "${raw_temp}")
    util=$(metric_value "${raw_util}")

    temp_lines+="gpu_temperature{gpu=\"${idx}\"} ${temp}"$'\n'
    limit_lines+="gpu_power_limit{gpu=\"${idx}\"} ${limit}"$'\n'
    draw_lines+="gpu_power_current{gpu=\"${idx}\"} ${draw}"$'\n'
    util_lines+="gpu_utilization{gpu=\"${idx}\"} ${util}"$'\n'
    online_lines+="gpu_online{gpu=\"${idx}\"} 1"$'\n'

    if [[ "${util}" =~ ${int_re} ]]; then
      # 10# forces base 10 so a value such as "08" is not read as octal.
      util_total=$((util_total + 10#${util}))
      util_samples=$((util_samples + 1))
    fi
  done

  # Expected GPUs that nvidia-smi did not report are offline; export zeroed
  # readings for them so every series keeps existing.
  # shellcheck disable=SC2086 # word splitting of the id list is intended
  for gpu in ${expected_ids}; do
    [[ "${seen_ids}" == *" ${gpu} "* ]] && continue
    temp_lines+="gpu_temperature{gpu=\"${gpu}\"} 0"$'\n'
    limit_lines+="gpu_power_limit{gpu=\"${gpu}\"} 0"$'\n'
    draw_lines+="gpu_power_current{gpu=\"${gpu}\"} 0"$'\n'
    util_lines+="gpu_utilization{gpu=\"${gpu}\"} 0"$'\n'
    online_lines+="gpu_online{gpu=\"${gpu}\"} 0"$'\n'
  done

  if ((util_samples > 0)); then
    # Fixed-point division: hundredths of a percent, truncated.
    hundredths=$((util_total * 100 / util_samples))
    printf -v avg_util '%d.%02d' "$((hundredths / 100))" "$((hundredths % 100))"
  fi

  emit_family gpu_temperature "GPU 核心温度 (℃)" "${temp_lines}"
  emit_family gpu_power_limit "GPU 额定功率(最大功率限制,W)" "${limit_lines}"
  emit_family gpu_power_current "GPU 当前功率(实时功耗,W)" "${draw_lines}"
  emit_family gpu_utilization "GPU 计算核心使用率 (%)" "${util_lines}"
  emit_family gpu_online "GPU 在线状态(1=在线,0=离线)" "${online_lines}"
  emit_family gpu_utilization_avg "所有 GPU 的平均使用率 (%)" \
    "gpu_utilization_avg ${avg_util}"$'\n'
  emit_family gpu_utilization_total "所有 GPU 的总使用率 (%)" \
    "gpu_utilization_total ${util_total}"$'\n'
  emit_family gpu_total "在线 GPU 总数" "gpu_total ${online_count}"$'\n'
}

#######################################
# Collects the GPU readings and atomically replaces METRICS_FILE.
# Globals: METRICS_FILE, EXPECTED_GPUS (read), TMP_METRICS_FILE (written)
# Returns: 0 on success, 1 when the metrics file could not be written
#######################################
main() {
  local metrics_dir csv

  metrics_dir=$(dirname -- "${METRICS_FILE}")
  if ! mkdir -p -- "${metrics_dir}"; then
    log_err "cannot create directory ${metrics_dir}"
    return 1
  fi

  # Whatever nvidia-smi printed is still parsed when it fails; invalid lines
  # are filtered out by render_metrics, so a dead driver yields "all offline".
  csv=$(query_gpus) \
    || log_err "nvidia-smi exited with status $?; unreported GPUs are marked offline"

  # Write next to the target and rename, so node_exporter never scrapes a
  # half-written file. The temporary name does not end in .prom, hence the
  # collector ignores it.
  if ! TMP_METRICS_FILE=$(mktemp "${METRICS_FILE}.XXXXXX"); then
    TMP_METRICS_FILE=''
    log_err "cannot create a temporary file next to ${METRICS_FILE}"
    return 1
  fi

  # mktemp creates the file with mode 0600; node_exporter usually runs as a
  # different user and must be able to read the result.
  if render_metrics "${EXPECTED_GPUS}" <<<"${csv}" >"${TMP_METRICS_FILE}" \
    && chmod 0644 -- "${TMP_METRICS_FILE}" \
    && mv -f -- "${TMP_METRICS_FILE}" "${METRICS_FILE}"; then
    TMP_METRICS_FILE=''
    return 0
  fi

  log_err "failed to write ${METRICS_FILE}"
  return 1
}

main "$@"