更新 scripts/gpu_metrics.sh

This commit is contained in:
joy 2025-09-09 08:47:12 +08:00
parent 56ba418605
commit 0078106d82
1 changed files with 2 additions and 5 deletions

View File

@ -1,7 +1,4 @@
#!/bin/bash
# Cluster health check script: generates metrics for node_exporter to collect.
# Metric convention: 0 = OK, 1 = abnormal.
METRIC_FILE="/var/lib/node_exporter/cluster_metrics.prom"

# Stage the output next to METRIC_FILE so the final `mv` is a rename within one
# filesystem. A temp file in the default temp dir may live on a different
# filesystem, where `mv` degrades to copy + delete and a reader can observe a
# partially written metrics file.
# The random suffix means the staging file does not end in ".prom", so the
# textfile collector (which reads *.prom) should ignore it.
# If the metrics directory is missing or not writable, fall back to the default
# temp dir (the previous behaviour); stderr of the first attempt is silenced
# because that failure is handled here.
TMP_FILE=$(mktemp "${METRIC_FILE}.XXXXXX" 2>/dev/null) || TMP_FILE=$(mktemp) || {
  printf '%s: unable to create a temporary metrics file\n' "${0##*/}" >&2
  exit 1
}

# Remove the staging file on any exit path; after a successful `mv` the file no
# longer exists and `rm -f` is a no-op.
trap 'rm -f -- "$TMP_FILE"' EXIT
@ -25,7 +22,7 @@ cat << EOF > $TMP_FILE
# HELP node_cluster_check_ib_bandwidth IB网卡带宽是否正常0=正常1=异常)
EOF
# 1. 检查计算卡数量（应为8）
# 1. Check the number of compute cards: count the lines of `nvidia-smi -L`
#    that mention "nvidia" (case-insensitive). The check that follows compares
#    this count against 8.
gpu_count=$(nvidia-smi -L | awk 'tolower($0) ~ /nvidia/ { matched++ } END { print matched + 0 }')
if [ "$gpu_count" -ne 8 ]; then
echo "node_cluster_check_gpu_count 1" >> $TMP_FILE
@ -190,5 +187,5 @@ for i in {0..7}; do
echo "node_cluster_check_ib_bandwidth{device=\"$dev\"} $bandwidth_error" >> $TMP_FILE
done
# 替换指标文件（原子操作，避免读取不完整）
# Publish the metrics by moving the staged file over METRIC_FILE.
# The move is an atomic rename only when TMP_FILE is on the same filesystem as
# METRIC_FILE; across filesystems `mv` copies and then deletes, so a reader may
# see an incomplete file.
# NOTE(review): mktemp creates files with mode 0600 and `mv` preserves it. If
# node_exporter runs as a different user than this script, it may be unable to
# read the result; confirm, and add a chmod before the move if so.
# Expansions are quoted and `--` ends option parsing so that an empty or
# unusual path cannot be word-split or mistaken for an option.
mv -- "$TMP_FILE" "$METRIC_FILE"