#!/bin/bash
# Cluster health check: writes metrics that node_exporter's textfile
# collector can scrape.
# Metric convention: 0 = healthy, 1 = faulty.
#
# Strict mode (set -e / pipefail) is deliberately not enabled: most probes
# below are pipelines ending in `grep -c`, which exits non-zero whenever the
# count is 0, and a missing vendor tool must yield a "faulty" sample rather
# than abort the whole run.

readonly METRIC_FILE="/var/lib/node_exporter/cluster_metrics.prom"
readonly METRIC_DIR="${METRIC_FILE%/*}"

# IB devices probed by the per-device checks (BER, optical power, bandwidth).
readonly IB_DEVICES=(mlx5_{0..7})

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Append one sample line ("<metric> <value>") to the temp file.
#   $1 - metric name, including any {label="..."} part
#   $2 - sample value
emit() {
  printf '%s %s\n' "$1" "$2" >> "$TMP_FILE"
}

# Make sure the output directory exists.
mkdir -p -- "$METRIC_DIR" || die "cannot create directory: $METRIC_DIR"

# The temp file is created next to the target so that the final mv is a
# rename within one filesystem (atomic). Its name does not end in ".prom".
TMP_FILE=$(mktemp "${METRIC_DIR}/.cluster_metrics.XXXXXX") \
  || die "cannot create a temporary file in $METRIC_DIR"
trap 'rm -f -- "$TMP_FILE"' EXIT

# Metric descriptions.
cat << 'EOF' > "$TMP_FILE"
# HELP node_cluster_check_gpu_count 检查GPU数量是否为8(0=正常,1=异常)
# HELP node_cluster_check_ib_status 检查IB网卡状态(0=正常,1=异常)
# HELP node_cluster_check_peermem 检查peermem模块加载状态(0=正常,1=异常)
# HELP node_cluster_check_fabric 检查nvidia-fabricmanager运行状态(0=正常,1=异常)
# HELP node_cluster_check_acs 检查ACS状态是否关闭(0=正常,1=异常)
# HELP node_cluster_check_ib_order 检查IB网卡排列顺序(0=正常,1=异常)
# HELP node_cluster_check_disk_count 检查硬盘数量是否正常(0=正常,1=异常)
# HELP node_cluster_check_memory_brand 检查内存品牌是否一致(0=正常,1=异常)
# HELP node_cluster_check_gpu_ecc GPU ECC错误状态(0=正常,1=异常)
# HELP node_cluster_check_gpu_xid GPU XID错误状态(0=正常,1=异常)
# HELP node_cluster_check_fan_mode 检查风扇模式是否为最高性能(0=正常,1=异常)
# HELP node_cluster_check_fan_speed 检查风扇转速是否为最高(0=正常,1=异常)
# HELP node_cluster_check_ib_ber IB网卡Symbol BER值是否正常(0=正常,1=异常)
# HELP node_cluster_check_ib_power IB网卡光衰是否正常(0=正常,1=异常)
# HELP node_cluster_check_ib_bandwidth IB网卡带宽是否正常(0=正常,1=异常)
EOF

# 1. GPU count (must be exactly 8).
gpu_count=$(nvidia-smi -L | grep -ci nvidia)
if (( gpu_count != 8 )); then
  emit node_cluster_check_gpu_count 1
else
  emit node_cluster_check_gpu_count 0
fi

# 2. IB port state: any "ib" line of ibdev2netdev containing "own" is a fault.
ib_down=$(ibdev2netdev | grep ib | grep -c 'own')
if (( ib_down != 0 )); then
  emit node_cluster_check_ib_status 1
else
  emit node_cluster_check_ib_status 0
fi

# 3. peermem module. Exactly 3 matching lsmod lines are required; otherwise a
#    module load is attempted and this run still reports a fault.
#    NOTE(review): the expected count of 3 is kept from the original - confirm.
peermem_lines=$(lsmod | grep -cE 'nvidia_peermem|peer_mem|peer' 2>/dev/null)
if (( peermem_lines != 3 )); then
  modprobe nvidia_peermem >/dev/null 2>&1
  emit node_cluster_check_peermem 1
else
  emit node_cluster_check_peermem 0
fi

# 4. nvidia-fabricmanager service state.
fabric_status=$(systemctl is-active nvidia-fabricmanager.service 2>/dev/null)
if [[ "$fabric_status" != "active" ]]; then
  emit node_cluster_check_fabric 1
else
  emit node_cluster_check_fabric 0
fi

# 5. ACS: no ACSCtl line near a PCI bridge may show SrcValid+.
acs_enabled=$(lspci -vvv | grep -A 80 'PCI bridge' | grep ACSCtl \
  | grep -c 'SrcValid+' 2>/dev/null)
if (( acs_enabled != 0 )); then
  emit node_cluster_check_acs 1
else
  emit node_cluster_check_acs 0
fi

# 6. IB device enumeration order. Both sides are reduced to single-space
#    separated fields so the comparison does not depend on whether the tool
#    prints tabs or spaces after "hca_id:".
expected_ib_order=$(printf 'hca_id: %s\n' \
  mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_4 mlx5_5 mlx5_6 mlx5_7 \
  mlx5_bond_0 mlx5_bond_1)
actual_ib_order=$(ibv_devinfo | awk '/mlx/ { $1 = $1; print }')
if [[ "$expected_ib_order" != "$actual_ib_order" ]]; then
  emit node_cluster_check_ib_order 1
else
  emit node_cluster_check_ib_order 0
fi

# 7. Disk count: nvme5n1 must be present and nvme6n1 must be absent.
disk_error=0
block_devices=$(lsblk)
if ! grep -q nvme5n1 <<< "$block_devices"; then
  disk_error=1
elif grep -q nvme6n1 <<< "$block_devices"; then
  disk_error=1
fi
emit node_cluster_check_disk_count "$disk_error"

# 8. Memory brand: after collapsing adjacent duplicates exactly one
#    "Manufacturer:" line must remain.
memory_brands=$(sudo dmidecode -t memory | grep -i "Manufacturer:" | uniq | wc -l)
if (( memory_brands != 1 )); then
  emit node_cluster_check_memory_brand 1
else
  emit node_cluster_check_memory_brand 0
fi

# 9. GPU ECC: any "Remapping Failure Occurred ... : Yes" line is a fault.
ecc_failures=$(nvidia-smi -q | grep -i 'Remapping Failure Occurred' \
  | grep -c ': Yes' 2>/dev/null)
if (( ecc_failures != 0 )); then
  emit node_cluster_check_gpu_ecc 1
else
  emit node_cluster_check_gpu_ecc 0
fi

# 10. GPU XID: any "NVRM: Xid" line in the kernel log is a fault.
xid_events=$(dmesg | grep -ci 'NVRM: Xid')
if (( xid_events != 0 )); then
  emit node_cluster_check_gpu_xid 1
else
  emit node_cluster_check_gpu_xid 0
fi

# 11. Fan mode: the thermal profile must be "Maximum Performance".
fan_mode=$(racadm get system.thermalsettings.ThermalProfile 2>/dev/null \
  | grep -c 'ThermalProfile=Maximum Performance')
if (( fan_mode != 1 )); then
  emit node_cluster_check_fan_mode 1
else
  emit node_cluster_check_fan_mode 0
fi

# 12. Fan speed: the fan speed offset must be "Max".
fan_speed=$(racadm get system.thermalsettings.FanSpeedOffset 2>/dev/null \
  | grep -c 'FanSpeedOffset=Max')
if (( fan_speed != 1 )); then
  emit node_cluster_check_fan_speed 1
else
  emit node_cluster_check_fan_speed 0
fi

# 13. IB link BER: the text after the first '-' on each "Symbol BER" line is
#     taken as the exponent; an exponent below 14 is a fault. Each candidate
#     is validated and forced to base 10, so a value such as "08" is compared
#     as 8 instead of raising an invalid-octal arithmetic error, and several
#     matching lines are checked one by one.
for dev in "${IB_DEVICES[@]}"; do
  ber_error=0
  while read -r ber_exp; do
    if [[ "$ber_exp" =~ ^[0-9]+$ ]] && (( 10#$ber_exp < 14 )); then
      ber_error=1
      break
    fi
  done < <(mlxlink -d "$dev" -c 2>/dev/null \
    | awk -F- '/Symbol BER/ { print $2 }')
  emit "node_cluster_check_ib_ber{device=\"$dev\"}" "$ber_error"
done

# 14. IB optical power: for every Tx/Rx "Power Current" line, the first token
#     after the colon is split on commas and the integer part of each reading
#     must not be below -2.
#     NOTE(review): only the integer part is compared, so a reading such as
#     -2.5 is not flagged; kept as in the original - confirm the threshold.
for dev in "${IB_DEVICES[@]}"; do
  power_error=0
  while IFS=',' read -r -a readings; do
    for reading in "${readings[@]}"; do
      whole=${reading%%.*}
      if [[ "$whole" =~ ^-?[0-9]+$ && "$whole" -lt -2 ]]; then
        power_error=1
        break 2 # leave both the reading loop and the line loop
      fi
    done
  done < <(mlxlink -d "$dev" -m 2>/dev/null \
    | awk -F: '/Tx Power Current|Rx Power Current/ {
        split($2, tok, " ")
        print tok[1]
      }')
  emit "node_cluster_check_ib_power{device=\"$dev\"}" "$power_error"
done

# 15. IB bandwidth: on every loopback ib_write_bw output line containing
#     65536, the integer part of the third column must be at least 360.
for dev in "${IB_DEVICES[@]}"; do
  bandwidth_error=0
  while read -r bw; do
    if (( bw < 360 )); then
      bandwidth_error=1
      break
    fi
  done < <(run_perftest_loopback 0 1 ib_write_bw -d "$dev" --report_gbits \
    2>/dev/null | awk '/65536/ { print int($3) }')
  emit "node_cluster_check_ib_bandwidth{device=\"$dev\"}" "$bandwidth_error"
done

# Publish: mktemp creates the file with mode 0600, so open it up for the
# collector first, then rename it over the target so a reader never sees a
# partially written file.
chmod 644 -- "$TMP_FILE"
mv -f -- "$TMP_FILE" "$METRIC_FILE" || die "cannot replace $METRIC_FILE"