更新 scripts/gpu_metrics.sh
This commit is contained in:
parent
56ba418605
commit
0078106d82
|
|
@@ -1,7 +1,4 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# 集群健康检查脚本 - 生成node_exporter可采集的监控指标
|
|
||||||
# 指标规则:0=正常,1=异常
|
|
||||||
|
|
||||||
METRIC_FILE="/var/lib/node_exporter/cluster_metrics.prom"
|
METRIC_FILE="/var/lib/node_exporter/cluster_metrics.prom"
|
||||||
TMP_FILE=$(mktemp)
|
TMP_FILE=$(mktemp)
|
||||||
|
|
||||||
|
|
@@ -25,7 +22,7 @@ cat << EOF > $TMP_FILE
|
||||||
# HELP node_cluster_check_ib_bandwidth IB网卡带宽是否正常(0=正常,1=异常)
|
# HELP node_cluster_check_ib_bandwidth IB网卡带宽是否正常(0=正常,1=异常)
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
# 1. 检查计算卡数量(应为8)
|
# 1. 检查计算卡数量
|
||||||
gpu_count=$(nvidia-smi -L | grep -i nvidia | wc -l)
|
gpu_count=$(nvidia-smi -L | grep -i nvidia | wc -l)
|
||||||
if [ "$gpu_count" -ne 8 ]; then
|
if [ "$gpu_count" -ne 8 ]; then
|
||||||
echo "node_cluster_check_gpu_count 1" >> $TMP_FILE
|
echo "node_cluster_check_gpu_count 1" >> $TMP_FILE
|
||||||
|
|
@@ -190,5 +187,5 @@ for i in {0..7}; do
|
||||||
echo "node_cluster_check_ib_bandwidth{device=\"$dev\"} $bandwidth_error" >> $TMP_FILE
|
echo "node_cluster_check_ib_bandwidth{device=\"$dev\"} $bandwidth_error" >> $TMP_FILE
|
||||||
done
|
done
|
||||||
|
|
||||||
# 替换指标文件(原子操作避免读取不完整)
|
# 替换指标文件
|
||||||
mv $TMP_FILE $METRIC_FILE
|
mv $TMP_FILE $METRIC_FILE
|
||||||
Loading…
Reference in New Issue