From 0078106d82f9bb30f006946567aa5dfea3a09144 Mon Sep 17 00:00:00 2001 From: joy Date: Tue, 9 Sep 2025 08:47:12 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20scripts/gpu=5Fmetrics.sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/gpu_metrics.sh | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/scripts/gpu_metrics.sh b/scripts/gpu_metrics.sh index b02d729..cffff78 100644 --- a/scripts/gpu_metrics.sh +++ b/scripts/gpu_metrics.sh @@ -1,7 +1,4 @@ #!/bin/bash -# 集群健康检查脚本 - 生成node_exporter可采集的监控指标 -# 指标规则:0=正常,1=异常 - METRIC_FILE="/var/lib/node_exporter/cluster_metrics.prom" TMP_FILE=$(mktemp) @@ -25,7 +22,7 @@ cat << EOF > $TMP_FILE # HELP node_cluster_check_ib_bandwidth IB网卡带宽是否正常(0=正常,1=异常) EOF -# 1. 检查计算卡数量(应为8) +# 1. 检查计算卡数量 gpu_count=$(nvidia-smi -L | grep -i nvidia | wc -l) if [ "$gpu_count" -ne 8 ]; then echo "node_cluster_check_gpu_count 1" >> $TMP_FILE @@ -190,5 +187,5 @@ for i in {0..7}; do echo "node_cluster_check_ib_bandwidth{device=\"$dev\"} $bandwidth_error" >> $TMP_FILE done -# 替换指标文件(原子操作避免读取不完整) +# 替换指标文件 mv $TMP_FILE $METRIC_FILE \ No newline at end of file