ansible-devops/scripts/deploy_gpu_monitor.sh

103 lines
3.5 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Purpose: export GPU temperature, power limit, power draw, utilization,
#          online state and GPU count for a fixed set of 8 cards (indexes 0-7)
#          as a Prometheus text file in the node_exporter textfile directory.
# Install path: /usr/local/bin/gpu_monitor.sh
#
# NOTE: `set -e` / `pipefail` are deliberately not enabled. When nvidia-smi
# fails or is missing, the script must still publish a file that marks every
# expected GPU as offline instead of aborting.

METRICS_FILE="/var/lib/node_exporter/textfile_collector/gpu_metrics.prom"
# Indexes of the 8 GPUs that are expected to be present.
EXPECTED_GPUS="0 1 2 3 4 5 6 7"

#######################################
# Tell whether a string is a plain non-negative decimal number.
# Arguments: $1 - candidate value (already whitespace-trimmed)
# Returns:   0 if $1 looks like 12 or 12.34, 1 otherwise
#######################################
is_metric_value() {
  local number_re='^[0-9]+([.][0-9]+)?$'
  [[ "$1" =~ ${number_re} ]]
}

#######################################
# Convert nvidia-smi CSV rows into Prometheus text-format metrics.
# Globals:   EXPECTED_GPUS (read)
# Inputs:    stdin - rows of "index, power.limit, power.draw, temperature.gpu,
#                    utilization.gpu" (csv, no header, no units)
# Outputs:   metrics on stdout, one contiguous group per metric family
# Returns:   0 unless writing to stdout fails
#
# The rows are read from the function's own stdin rather than from a
# `cmd | while` pipeline inside it, so the counters updated in the loop are
# still visible when the aggregate metrics are printed.
#######################################
render_gpu_metrics() {
  local idx limit current temp util gpu avg_scaled
  local total_util=0 online_count=0
  local temp_lines='' limit_lines='' power_lines='' util_lines='' online_lines=''
  local -a seen=() expected=()

  # 1. Collect the metrics of every GPU that nvidia-smi reported.
  while IFS=',' read -r idx limit current temp util || [[ -n "${idx}" ]]; do
    # Strip the padding nvidia-smi puts after each comma.
    idx=${idx//[[:space:]]/}
    limit=${limit//[[:space:]]/}
    current=${current//[[:space:]]/}
    temp=${temp//[[:space:]]/}
    util=${util//[[:space:]]/}

    # Anything without a numeric index is not a data row (for example an
    # error message printed by nvidia-smi); emitting it would corrupt the file.
    [[ "${idx}" =~ ^[0-9]+$ ]] || continue
    idx=$((10#${idx}))
    # Ignore a repeated index so no sample is emitted twice.
    [[ -z "${seen[${idx}]:-}" ]] || continue
    seen[${idx}]=1
    online_count=$((online_count + 1))
    online_lines+="gpu_online{gpu=\"${idx}\"} 1"$'\n'

    # Values such as "[N/A]" are skipped: a non-numeric sample would make the
    # whole metrics file unparsable.
    if is_metric_value "${temp}"; then
      temp_lines+="gpu_temperature{gpu=\"${idx}\"} ${temp}"$'\n'
    fi
    if is_metric_value "${limit}"; then
      limit_lines+="gpu_power_limit{gpu=\"${idx}\"} ${limit}"$'\n'
    fi
    if is_metric_value "${current}"; then
      power_lines+="gpu_power_current{gpu=\"${idx}\"} ${current}"$'\n'
    fi
    if is_metric_value "${util}"; then
      util_lines+="gpu_utilization{gpu=\"${idx}\"} ${util}"$'\n'
      # Shell arithmetic is integer-only; any fractional part is dropped and
      # 10# prevents values like "08" from being parsed as octal.
      total_util=$((total_util + 10#${util%%.*}))
    fi
  done

  # 2. Every expected GPU that did not show up is reported offline, with all
  #    of its per-GPU metrics set to 0.
  read -r -a expected <<< "${EXPECTED_GPUS}"
  for gpu in "${expected[@]}"; do
    [[ -z "${seen[${gpu}]:-}" ]] || continue
    online_lines+="gpu_online{gpu=\"${gpu}\"} 0"$'\n'
    temp_lines+="gpu_temperature{gpu=\"${gpu}\"} 0"$'\n'
    limit_lines+="gpu_power_limit{gpu=\"${gpu}\"} 0"$'\n'
    power_lines+="gpu_power_current{gpu=\"${gpu}\"} 0"$'\n'
    util_lines+="gpu_utilization{gpu=\"${gpu}\"} 0"$'\n'
  done

  # 3. Emit the per-GPU families, each as one group headed by HELP/TYPE.
  printf '%s\n' "# HELP gpu_temperature GPU 核心温度 (℃)" \
    "# TYPE gpu_temperature gauge"
  printf '%s' "${temp_lines}"
  printf '%s\n' "# HELP gpu_power_limit GPU 额定功率最大功率限制W" \
    "# TYPE gpu_power_limit gauge"
  printf '%s' "${limit_lines}"
  printf '%s\n' "# HELP gpu_power_current GPU 当前功率实时功耗W" \
    "# TYPE gpu_power_current gauge"
  printf '%s' "${power_lines}"
  printf '%s\n' "# HELP gpu_utilization GPU 计算核心使用率 (%)" \
    "# TYPE gpu_utilization gauge"
  printf '%s' "${util_lines}"
  printf '%s\n' "# HELP gpu_online GPU 在线状态1=在线0=离线)" \
    "# TYPE gpu_online gauge"
  printf '%s' "${online_lines}"

  # 4. Number of GPUs currently online.
  printf '%s\n' "# HELP gpu_total 在线 GPU 总数" \
    "# TYPE gpu_total gauge" \
    "gpu_total ${online_count}"

  # 5. Aggregates: average = sum of utilization / online GPUs, truncated to
  #    two decimals; both are 0 when no GPU is online.
  printf '%s\n' "# HELP gpu_utilization_avg 所有 GPU 的平均使用率 (%)" \
    "# TYPE gpu_utilization_avg gauge"
  if ((online_count > 0)); then
    avg_scaled=$((total_util * 100 / online_count))
    printf 'gpu_utilization_avg %d.%02d\n' \
      "$((avg_scaled / 100))" "$((avg_scaled % 100))"
  else
    printf '%s\n' "gpu_utilization_avg 0"
  fi
  printf '%s\n' "# HELP gpu_utilization_total 所有 GPU 的总使用率 (%)" \
    "# TYPE gpu_utilization_total gauge" \
    "gpu_utilization_total ${total_util}"
}

#######################################
# Query nvidia-smi and publish the rendered metrics to METRICS_FILE.
# Globals:   METRICS_FILE (read)
# Returns:   0 on success, 1 if the metrics file could not be written
#######################################
main() {
  local out_dir tmp_file

  out_dir=$(dirname "${METRICS_FILE}")
  mkdir -p "${out_dir}" || return 1

  # Stage the output next to the target and rename it into place, so a reader
  # never sees a half-written file. The staging name does not end in ".prom".
  tmp_file=$(mktemp "${METRICS_FILE}.XXXXXX") || return 1

  # The pipeline status is that of render_gpu_metrics: a failing nvidia-smi
  # simply yields no rows, which reports every expected GPU as offline.
  # mktemp creates the file with mode 0600, hence the chmod before publishing.
  if nvidia-smi \
    --query-gpu=index,power.limit,power.draw,temperature.gpu,utilization.gpu \
    --format=csv,noheader,nounits \
    | render_gpu_metrics > "${tmp_file}" \
    && chmod 644 "${tmp_file}" \
    && mv -f "${tmp_file}" "${METRICS_FILE}"; then
    return 0
  fi

  # Keep the previously published file and drop the incomplete staging file.
  rm -f "${tmp_file}"
  return 1
}

main "$@"