ansible-devops/scripts/gpu_metrics.sh

194 lines
6.5 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Cluster health-check script: produces metrics that node_exporter can collect.
# Metric convention: 0 = healthy, 1 = faulty.
METRIC_FILE="/var/lib/node_exporter/cluster_metrics.prom"
METRIC_DIR=$(dirname -- "$METRIC_FILE")
# Make sure the output directory exists before the staging file is created in it.
mkdir -p -- "$METRIC_DIR"
# Stage the output next to the final file: a rename within one directory stays
# on one filesystem, whereas a file under /tmp may have to be copied across.
# The staging name does not end in ".prom".
TMP_FILE=$(mktemp "${METRIC_FILE}.XXXXXX") || {
  echo "gpu_metrics: cannot create staging file in ${METRIC_DIR}" >&2
  exit 1
}
# Remove the staging file if the script stops before it has been moved away.
trap 'rm -f -- "$TMP_FILE"' EXIT
# Write the metric descriptions; this first write also truncates the staging file.
#######################################
# Prints one "# HELP" line for every metric this script emits.
# The value legend is written as "(0=..., 1=...)" in plain ASCII punctuation
# so it cannot be confused with the expected value (e.g. "8" vs "80").
# Arguments: none
# Outputs:   HELP lines on stdout
#######################################
write_metric_help() {
  # Quoted delimiter: the text is literal, nothing in it is expanded.
  cat << 'EOF'
# HELP node_cluster_check_gpu_count 检查GPU数量是否为8 (0=正常, 1=异常)
# HELP node_cluster_check_ib_status 检查IB网卡状态 (0=正常, 1=异常)
# HELP node_cluster_check_peermem 检查peermem模块加载状态 (0=正常, 1=异常)
# HELP node_cluster_check_fabric 检查nvidia-fabricmanager运行状态 (0=正常, 1=异常)
# HELP node_cluster_check_acs 检查ACS状态是否关闭 (0=正常, 1=异常)
# HELP node_cluster_check_ib_order 检查IB网卡排列顺序 (0=正常, 1=异常)
# HELP node_cluster_check_disk_count 检查硬盘数量 (0=正常, 1=异常)
# HELP node_cluster_check_memory_brand 检查内存品牌是否一致 (0=正常, 1=异常)
# HELP node_cluster_check_gpu_ecc GPU ECC错误状态 (0=正常, 1=异常)
# HELP node_cluster_check_gpu_xid GPU XID错误状态 (0=正常, 1=异常)
# HELP node_cluster_check_fan_mode 检查风扇模式 (0=正常, 1=异常)
# HELP node_cluster_check_fan_speed 检查风扇转速是否为最高 (0=正常, 1=异常)
# HELP node_cluster_check_ib_ber IB网卡Symbol BER值是否正常 (0=正常, 1=异常)
# HELP node_cluster_check_ib_power IB网卡光衰是否正常 (0=正常, 1=异常)
# HELP node_cluster_check_ib_bandwidth IB网卡带宽是否正常 (0=正常, 1=异常)
EOF
}
write_metric_help > "$TMP_FILE"
# 1. GPU inventory: exactly 8 devices are expected.
# Count the `nvidia-smi -L` lines that mention "nvidia" (case-insensitive).
detected_gpus=$(nvidia-smi -L | grep -ci nvidia)
if [ "$detected_gpus" -ne 8 ]; then
  gpu_count_state=1
else
  gpu_count_state=0
fi
printf 'node_cluster_check_gpu_count %s\n' "$gpu_count_state" >> "$TMP_FILE"
# 2. InfiniBand port state.
# Count ibdev2netdev lines that mention "ib" and contain "own"
# (presumably chosen so that both "Down" and "down" match).
down_ports=$(ibdev2netdev | grep ib | grep -c 'own')
if [ "$down_ports" -ne 0 ]; then
  ib_status_state=1
else
  ib_status_state=0
fi
printf 'node_cluster_check_ib_status %s\n' "$ib_status_state" >> "$TMP_FILE"
# 3. Check that the peer-memory kernel module is loaded.
#######################################
# Reports whether a peer-memory module is currently loaded.
# Only the module-name column of lsmod is inspected, so the verdict does not
# depend on how many other modules happen to mention "peer" in their
# "Used by" column or in their own name.
# Arguments: none
# Returns:   0 if nvidia_peermem or nv_peer_mem is loaded, 1 otherwise
#######################################
peermem_module_loaded() {
  lsmod 2>/dev/null \
    | awk '$1 == "nvidia_peermem" || $1 == "nv_peer_mem" { found = 1 }
           END { exit !found }'
}

if peermem_module_loaded; then
  echo "node_cluster_check_peermem 0" >> "$TMP_FILE"
else
  # Best-effort repair: try to load the module, but still report the fault
  # for this run so that the event stays visible in monitoring.
  modprobe nvidia_peermem >/dev/null 2>&1
  echo "node_cluster_check_peermem 1" >> "$TMP_FILE"
fi
# 4. nvidia-fabricmanager must be reported as "active" by systemctl.
if [[ "$(systemctl is-active nvidia-fabricmanager.service 2>/dev/null)" == "active" ]]; then
  fabric_state=0
else
  fabric_state=1
fi
printf 'node_cluster_check_fabric %s\n' "$fabric_state" >> "$TMP_FILE"
# 5. ACS is expected to be disabled.
# Look at the 80 lines following every "PCI bridge" line of `lspci -vvv` and
# count the ACSCtl lines that still show SrcValid+.
acs_enabled_lines=$(lspci -vvv | grep -A 80 'PCI bridge' | grep ACSCtl | grep -c SrcValid+ 2>/dev/null)
if [ "$acs_enabled_lines" -ne 0 ]; then
  acs_state=1
else
  acs_state=0
fi
printf 'node_cluster_check_acs %s\n' "$acs_state" >> "$TMP_FILE"
# 6. Check the InfiniBand device enumeration order.
#######################################
# Prints the mlx* device names from the "hca_id:" lines of ibv_devinfo,
# in the order they are reported, joined by single spaces.
# Only the device name is extracted, so the result does not depend on whether
# "hca_id:" is followed by a tab or by spaces.
# Arguments: none
# Outputs:   one line on stdout (empty when no device is found)
#######################################
ib_device_order() {
  ibv_devinfo \
    | awk '$1 == "hca_id:" && $2 ~ /mlx/ { printf "%s%s", sep, $2; sep = " " }
           END { printf "\n" }'
}

# Expected enumeration: eight mlx5 devices followed by the two bond devices.
expected_ib_order="mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_4 mlx5_5 mlx5_6 mlx5_7 mlx5_bond_0 mlx5_bond_1"
if [[ "$(ib_device_order)" == "$expected_ib_order" ]]; then
  echo "node_cluster_check_ib_order 0" >> "$TMP_FILE"
else
  echo "node_cluster_check_ib_order 1" >> "$TMP_FILE"
fi
# 7. Disk inventory: lsblk must list nvme5n1 and must not list nvme6n1.
if lsblk | grep -q nvme5n1 && ! lsblk | grep -q nvme6n1; then
  disk_state=0
else
  disk_state=1
fi
printf 'node_cluster_check_disk_count %s\n' "$disk_state" >> "$TMP_FILE"
# 8. Memory vendor consistency.
# Collapse adjacent identical "Manufacturer:" lines from dmidecode; exactly one
# remaining line means every reported module names the same manufacturer.
vendor_groups=$(sudo dmidecode -t memory | grep -i "Manufacturer:" | uniq | wc -l)
if [ "$vendor_groups" -ne 1 ]; then
  memory_brand_state=1
else
  memory_brand_state=0
fi
printf 'node_cluster_check_memory_brand %s\n' "$memory_brand_state" >> "$TMP_FILE"
# 9. GPU ECC state.
# Count "Remapping Failure Occurred" lines of `nvidia-smi -q` that say ": Yes".
remap_failures=$(nvidia-smi -q | grep -i 'Remapping Failure Occurred' | grep -c ': Yes' 2>/dev/null)
if [ "$remap_failures" -ne 0 ]; then
  gpu_ecc_state=1
else
  gpu_ecc_state=0
fi
printf 'node_cluster_check_gpu_ecc %s\n' "$gpu_ecc_state" >> "$TMP_FILE"
# 10. GPU XID errors: any "NVRM: Xid" line in the kernel log counts as a fault.
xid_events=$(dmesg | grep -ci 'NVRM: Xid')
if [ "$xid_events" -ne 0 ]; then
  gpu_xid_state=1
else
  gpu_xid_state=0
fi
printf 'node_cluster_check_gpu_xid %s\n' "$gpu_xid_state" >> "$TMP_FILE"
# 11. Fan mode: racadm must report exactly one
# "ThermalProfile=Maximum Performance" line.
max_profile_lines=$(racadm get system.thermalsettings.ThermalProfile 2>/dev/null | grep -c 'ThermalProfile=Maximum Performance')
if [ "$max_profile_lines" -ne 1 ]; then
  fan_mode_state=1
else
  fan_mode_state=0
fi
printf 'node_cluster_check_fan_mode %s\n' "$fan_mode_state" >> "$TMP_FILE"
# 12. Fan speed: racadm must report exactly one "FanSpeedOffset=Max" line.
max_offset_lines=$(racadm get system.thermalsettings.FanSpeedOffset 2>/dev/null | grep -c 'FanSpeedOffset=Max')
if [ "$max_offset_lines" -ne 1 ]; then
  fan_speed_state=1
else
  fan_speed_state=0
fi
printf 'node_cluster_check_fan_speed %s\n' "$fan_speed_state" >> "$TMP_FILE"
# 13. Check the Symbol BER of each InfiniBand link (mlx5_0 .. mlx5_7).
#######################################
# Decides whether a BER exponent indicates a degraded link.
# Arguments: $1 - the magnitude N taken from a reading such as "15E-N"
# Returns:   0 when N is a plain decimal integer below 14;
#            1 otherwise, including empty or non-numeric input
#######################################
ber_exponent_is_bad() {
  local exponent=$1
  # Validate first: an unchecked string would be evaluated as an arithmetic
  # expression (variable name, or a syntax error).
  [[ "$exponent" =~ ^[0-9]+$ ]] || return 1
  # 10# forces base 10 so that a leading zero is not read as octal.
  (( 10#$exponent < 14 ))
}

for i in {0..7}; do
  dev="mlx5_$i"
  ber_error=0
  # Keep the text after the first "-" of every "Symbol BER" line, one per line.
  exponents=$(mlxlink -d "$dev" -c 2>/dev/null \
    | awk -F- '/Symbol BER/ { gsub(/[[:space:]]/, "", $2); print $2 }')
  # No reading at all (tool missing, link absent) is reported as 0.
  while IFS= read -r exponent; do
    if ber_exponent_is_bad "$exponent"; then
      ber_error=1
      break
    fi
  done <<< "$exponents"
  echo "node_cluster_check_ib_ber{device=\"$dev\"} $ber_error" >> "$TMP_FILE"
done
# 14. Check the optical Tx/Rx power of each InfiniBand module (mlx5_0 .. mlx5_7).
#######################################
# Tests whether one "Tx/Rx Power Current" line contains a low reading.
# The readings are the comma-separated values after the first ":"; a trailing
# "[min..max]" range is ignored. Values separated by "," or by ", " are all
# inspected.
# NOTE(review): only the integer part of each value is compared, so -2.9 is
# not treated as below -2 and the effective alarm limit is -3.0 and lower.
# Confirm that this is the intended threshold.
# Arguments: $1 - one output line
# Returns:   0 if any reading has an integer part below -2, 1 otherwise
#######################################
power_line_is_low() {
  local line=$1 readings lane whole
  local -a lanes=()
  [[ "$line" == *:* ]] || return 1
  readings=${line#*:}          # text after the first colon
  readings=${readings%%\[*}    # drop a trailing "[min..max]" range
  IFS=',' read -r -a lanes <<< "$readings"
  for lane in "${lanes[@]}"; do
    lane=${lane//[[:space:]]/}
    whole=${lane%%.*}          # integer part, sign included
    if [[ "$whole" =~ ^-?[0-9]+$ && "$whole" -lt -2 ]]; then
      return 0
    fi
  done
  return 1
}

for i in {0..7}; do
  dev="mlx5_$i"
  power_error=0
  output=$(mlxlink -d "$dev" -m 2>/dev/null | grep -E 'Tx Power Current|Rx Power Current')
  if [[ -n "$output" ]]; then
    while IFS= read -r line; do
      if power_line_is_low "$line"; then
        power_error=1
        break
      fi
    done <<< "$output"
  fi
  echo "node_cluster_check_ib_power{device=\"$dev\"} $power_error" >> "$TMP_FILE"
done
# 15. Check the loopback write bandwidth of each InfiniBand device (mlx5_0 .. mlx5_7).
#######################################
# Reads perftest output on stdin and reports whether any 65536-byte result
# row has a third column (truncated to an integer) below the minimum.
# Rows are selected by their first column being 65536, so unrelated lines that
# merely contain "65536" (addresses, sequence numbers) are ignored.
# Arguments: $1 - minimum acceptable value, default 360
# Outputs:   "1" on stdout if a row is below the minimum, otherwise "0"
#            (also "0" when there is no matching row at all)
#######################################
ib_bandwidth_error() {
  awk -v min="${1:-360}" '
    $1 == 65536 && $3 ~ /^[0-9]+(\.[0-9]+)?$/ { if (int($3) < min) bad = 1 }
    END { print bad + 0 }'
}

for i in {0..7}; do
  dev="mlx5_$i"
  bandwidth_error=$(run_perftest_loopback 0 1 ib_write_bw -d "$dev" --report_gbits 2>/dev/null \
    | ib_bandwidth_error 360)
  echo "node_cluster_check_ib_bandwidth{device=\"$dev\"} ${bandwidth_error:-0}" >> "$TMP_FILE"
done
# Publish the metrics file.
# mktemp creates the staging file with mode 0600 and mv keeps that mode, so
# open it for reading first; otherwise a collector that runs as an
# unprivileged user cannot read the result.
chmod 644 "$TMP_FILE"
# Swap the finished file into place in one step so that a reader does not see
# a half-written file (mv is an atomic rename when the staging file and the
# target are on the same filesystem).
mv "$TMP_FILE" "$METRIC_FILE"