2025-09-09 08:34:41 +08:00
|
|
|
|
#!/bin/bash
# Runs a series of node health checks and writes the results as Prometheus
# metrics to METRIC_FILE. Every check appends its samples to TMP_FILE, which is
# moved over METRIC_FILE at the very end of the script.

source /etc/profile

METRIC_FILE="/var/lib/node_exporter/textfile_collector/cluster_metrics.prom"

# Make sure the output directory exists before anything is staged in it.
mkdir -p -- "$(dirname -- "$METRIC_FILE")"

# Stage the metrics next to the final file so the closing `mv` is a rename
# within one filesystem instead of a copy across filesystems. The random
# suffix means the staging file name does not end in ".prom". If the target
# directory is not writable, fall back to the default temp directory.
TMP_FILE=$(mktemp "${METRIC_FILE}.XXXXXX" 2>/dev/null) || TMP_FILE=$(mktemp)

# Remove the staging file if the script stops before it has been moved.
trap 'rm -f -- "$TMP_FILE"' EXIT
|
|
|
|
|
|
|
|
|
|
|
|
# Write the HELP header lines for the exported metrics.
#
# Truncates $TMP_FILE and writes one "# HELP" line per metric. The delimiter
# is quoted so the text is copied literally, with no shell expansion.
# The fan_speed HELP line is still written although the fan-speed check is
# commented out further down; disk_count and gpu_downgraded are exported by
# the script and therefore get a HELP line here as well.
write_metric_help() {
  cat <<'EOF' > "$TMP_FILE"
# HELP node_cluster_check_gpu_count 检查GPU数量是否为8(0=正常,1=异常)
# HELP node_cluster_check_ib_status 检查IB网卡状态(0=正常,1=异常)
# HELP node_cluster_check_peermem 检查peermem模块加载状态(0=正常,1=异常)
# HELP node_cluster_check_fabric 检查nvidia-fabricmanager运行状态(0=正常,1=异常)
# HELP node_cluster_check_acs 检查ACS状态是否关闭(0=正常,1=异常)
# HELP node_cluster_check_ib_order 检查IB网卡排列顺序(0=正常,1=异常)
# HELP node_cluster_check_memory_brand 检查内存品牌是否一致(0=正常,1=异常)
# HELP node_cluster_check_gpu_ecc GPU ECC错误状态(0=正常,1=异常)
# HELP node_cluster_check_gpu_xid GPU XID错误状态(0=正常,1=异常)
# HELP node_cluster_check_fan_speed 检查风扇转速是否为最高(0=正常,1=异常)
# HELP node_cluster_check_ib_ber IB网卡Symbol BER值是否正常(0=正常,1=异常)
# HELP node_cluster_check_ib_power IB网卡光衰是否正常(0=正常,1=异常)
# HELP node_cluster_check_ib_bandwidth IB网卡带宽是否正常(0=正常,1=异常)
# HELP node_cluster_check_disk_count 检查硬盘数量(0=正常,1=异常)
# HELP node_cluster_check_gpu_downgraded GPU是否降速(1=是,0=否)
EOF
}
write_metric_help
|
|
|
|
|
|
|
2025-09-09 08:47:12 +08:00
|
|
|
|
# 1. Check the number of compute GPUs.
#
# Appends node_cluster_check_gpu_count to $TMP_FILE: 0 when `nvidia-smi -L`
# prints exactly 8 lines mentioning NVIDIA, 1 for any other count. A missing
# or failing nvidia-smi produces a count of 0 and is therefore reported as 1.
check_gpu_count() {
  local gpu_count status=0
  gpu_count=$(nvidia-smi -L | grep -ci nvidia)
  if (( gpu_count != 8 )); then
    status=1
  fi
  echo "node_cluster_check_gpu_count $status" >> "$TMP_FILE"
}
check_gpu_count
|
|
|
|
|
|
|
|
|
|
|
|
# 2. Check InfiniBand port state.
#
# Counts the `ibdev2netdev` lines that contain "ib" and also contain "own"
# (which matches both "Down" and "down"). Any hit sets
# node_cluster_check_ib_status to 1, otherwise 0. No output at all (for
# example when ibdev2netdev is unavailable) is reported as 0.
# NOTE(review): the layout expected by the IB-order check in this script uses
# "mlx5_bond_N ... bondN" names, which do not contain "ib"; confirm that the
# "ib" filter still selects the interfaces that should be monitored.
check_ib_status() {
  local ib_down status=0
  ib_down=$(ibdev2netdev | grep ib | grep -c 'own')
  if (( ib_down != 0 )); then
    status=1
  fi
  echo "node_cluster_check_ib_status $status" >> "$TMP_FILE"
}
check_ib_status
|
|
|
|
|
|
|
|
|
|
|
|
# 3. Check that the peermem kernel module is loaded.
#
# Counts the `lsmod` lines matching the pattern below and expects exactly 3.
# Any other count triggers a `modprobe nvidia_peermem` attempt and reports
# node_cluster_check_peermem 1 for this run; a successful load only shows up
# as 0 on the next run.
# The alternation is kept as written, although "peer" alone already matches
# everything the two longer alternatives match.
check_peermem() {
  local peermem_status status=0
  peermem_status=$(lsmod | grep -cE 'nvidia_peermem|peer_mem|peer' 2>/dev/null)
  if (( peermem_status != 3 )); then
    modprobe nvidia_peermem >/dev/null 2>&1
    status=1
  fi
  echo "node_cluster_check_peermem $status" >> "$TMP_FILE"
}
check_peermem
|
|
|
|
|
|
|
|
|
|
|
|
# 4. Check that nvidia-fabricmanager is running.
#
# Reports node_cluster_check_fabric 0 only when `systemctl is-active` prints
# exactly "active"; any other answer, including no answer, is reported as 1.
check_fabric() {
  local fabric_status status=0
  fabric_status=$(systemctl is-active nvidia-fabricmanager.service 2>/dev/null)
  if [[ "$fabric_status" != "active" ]]; then
    status=1
  fi
  echo "node_cluster_check_fabric $status" >> "$TMP_FILE"
}
check_fabric
|
|
|
|
|
|
|
|
|
|
|
|
# 5. Check that PCIe ACS is disabled.
#
# Looks at the 80 lines following every "PCI bridge" header in `lspci -vvv`,
# keeps the ACSCtl lines and counts those containing "SrcValid+". Any hit
# reports node_cluster_check_acs 1, otherwise 0.
# "SrcValid+" is matched as a fixed string so the "+" is always literal.
check_acs() {
  local acs_status status=0
  acs_status=$(lspci -vvv | grep -A 80 'PCI bridge' | grep ACSCtl | grep -cF 'SrcValid+' 2>/dev/null)
  if (( acs_status != 0 )); then
    status=1
  fi
  echo "node_cluster_check_acs $status" >> "$TMP_FILE"
}
check_acs
|
|
|
|
|
|
|
|
|
|
|
|
# 6. Check the ordering of the IB devices.
#
# Compares the complete `ibdev2netdev` output with the fixed layout below.
# The comparison is an exact string match (trailing newlines are dropped by
# the command substitutions), so a different order, a missing device or a
# port that is not "(Up)" all report node_cluster_check_ib_order 1.
check_ib_order() {
  local expected actual status=0
  # Quoted delimiter: the expected layout is literal text.
  expected=$(cat <<'EOF'
mlx5_bond_0 port 1 ==> bond0 (Up)
mlx5_bond_1 port 1 ==> bond1 (Up)
mlx5_bond_2 port 1 ==> bond2 (Up)
mlx5_bond_3 port 1 ==> bond3 (Up)
mlx5_bond_4 port 1 ==> bond4 (Up)
mlx5_bond_5 port 1 ==> bond5 (Up)
mlx5_bond_6 port 1 ==> bond6 (Up)
mlx5_bond_7 port 1 ==> bond7 (Up)
mlx5_bond_8 port 1 ==> bond8 (Up)
mlx5_bond_9 port 1 ==> bond9 (Up)
EOF
)
  actual=$(ibdev2netdev)
  if [[ "$expected" != "$actual" ]]; then
    status=1
  fi
  echo "node_cluster_check_ib_order $status" >> "$TMP_FILE"
}
check_ib_order
|
|
|
|
|
|
|
|
|
|
|
|
# 7. Check that every expected NVMe disk is present.
#
# Reports node_cluster_check_disk_count 1 as soon as one name from
# TARGET_DISKS does not appear as a whole word in the `lsblk` listing,
# otherwise 0. Whole-word matching keeps a partition such as "nvme0n1p1" from
# standing in for its disk "nvme0n1".
TARGET_DISKS=("nvme0n1" "nvme1n1" "nvme2n1" "nvme3n1")

check_disk_count() {
  local disk block_devices disk_error=0
  # Take one listing and search it for each disk.
  block_devices=$(lsblk)
  for disk in "${TARGET_DISKS[@]}"; do
    if ! grep -qw -- "$disk" <<< "$block_devices"; then
      disk_error=1
      break
    fi
  done
  echo "node_cluster_check_disk_count $disk_error" >> "$TMP_FILE"
}
check_disk_count
|
|
|
|
|
|
|
2025-09-09 08:34:41 +08:00
|
|
|
|
|
|
|
|
|
|
# 8. Check that all memory modules report the same manufacturer.
#
# Counts the distinct "Manufacturer:" lines in `dmidecode -t memory`.
# Exactly one distinct line reports node_cluster_check_memory_brand 0; any
# other count, including 0 when dmidecode cannot be run, reports 1.
# `sudo -n` never prompts, so a missing sudo credential fails immediately
# instead of waiting for a password.
# NOTE(review): every "Manufacturer:" line is counted as-is; if the firmware
# reports a placeholder for empty slots, that placeholder counts as a
# separate brand.
check_memory_brand() {
  local brand_count status=0
  brand_count=$(sudo -n dmidecode -t memory | grep -i "Manufacturer:" | sort -u | wc -l)
  if (( brand_count != 1 )); then
    status=1
  fi
  echo "node_cluster_check_memory_brand $status" >> "$TMP_FILE"
}
check_memory_brand
|
|
|
|
|
|
|
|
|
|
|
|
# 9. Check GPU ECC state via row-remapping failures.
#
# Counts the "Remapping Failure Occurred" lines in `nvidia-smi -q` whose value
# is ": Yes". Any hit reports node_cluster_check_gpu_ecc 1, otherwise 0.
check_gpu_ecc() {
  local remap_failures status=0
  remap_failures=$(nvidia-smi -q | grep -i 'Remapping Failure Occurred' | grep -c ': Yes' 2>/dev/null)
  if (( remap_failures != 0 )); then
    status=1
  fi
  echo "node_cluster_check_gpu_ecc $status" >> "$TMP_FILE"
}
check_gpu_ecc
|
|
|
|
|
|
|
|
|
|
|
|
# 10. Check for GPU Xid errors in the kernel log.
#
# Counts the `dmesg` lines containing "NVRM: Xid" (case-insensitive). Any hit
# reports node_cluster_check_gpu_xid 1, otherwise 0. Only the number of
# matching lines matters, so it is counted directly with grep.
check_gpu_xid() {
  local xid_count status=0
  xid_count=$(dmesg | grep -ci 'NVRM: Xid')
  if (( xid_count != 0 )); then
    status=1
  fi
  echo "node_cluster_check_gpu_xid $status" >> "$TMP_FILE"
}
check_gpu_xid
|
|
|
|
|
|
|
|
|
|
|
|
# 11. Check fan mode (currently disabled: the racadm-based check below is commented out)
|
2025-10-31 12:11:13 +08:00
|
|
|
|
#fan_mode=$(racadm get system.thermalsettings.ThermalProfile 2>/dev/null | grep -c 'ThermalProfile=Maximum Performance')
|
|
|
|
|
|
#if [ "$fan_mode" -ne 1 ]; then
|
|
|
|
|
|
# echo "node_cluster_check_fan_mode 1" >> $TMP_FILE
|
|
|
|
|
|
#else
|
|
|
|
|
|
# echo "node_cluster_check_fan_mode 0" >> $TMP_FILE
|
|
|
|
|
|
#fi
|
2025-09-09 08:34:41 +08:00
|
|
|
|
|
|
|
|
|
|
# 12. Check fan speed (currently disabled: the racadm-based check below is commented out)
|
2025-10-31 12:11:13 +08:00
|
|
|
|
#fan_speed=$(racadm get system.thermalsettings.FanSpeedOffset 2>/dev/null | grep -c 'FanSpeedOffset=Max')
|
|
|
|
|
|
#if [ "$fan_speed" -ne 1 ]; then
|
|
|
|
|
|
# echo "node_cluster_check_fan_speed 1" >> $TMP_FILE
|
|
|
|
|
|
#else
|
|
|
|
|
|
# echo "node_cluster_check_fan_speed 0" >> $TMP_FILE
|
|
|
|
|
|
#fi
|
2025-09-09 08:34:41 +08:00
|
|
|
|
|
|
|
|
|
|
# 13. Check the symbol BER of the IB links.
#
# For mlx5_bond_2 .. mlx5_bond_9, reads the first "Symbol BER" line of
# `mlxlink -d <dev> -c` and takes the text after the "-" as the exponent
# (e.g. 255 for "15E-255"). An exponent below 14 reports
# node_cluster_check_ib_ber{device=...} 1; anything else, including a device
# that returns no BER line, reports 0.
# The exponent is validated as plain digits and forced to base 10, so a
# zero-padded value such as "08" is compared as a number rather than being
# rejected as invalid octal.
check_ib_ber() {
  local i dev ber_value status
  for i in {2..9}; do
    dev="mlx5_bond_$i"
    ber_value=$(mlxlink -d "$dev" -c 2>/dev/null \
      | awk -F- '/Symbol BER/ { gsub(/[ \t\r]+/, "", $2); print $2; exit }')
    status=0
    if [[ "$ber_value" =~ ^[0-9]+$ ]] && (( 10#$ber_value < 14 )); then
      status=1
    fi
    echo "node_cluster_check_ib_ber{device=\"$dev\"} $status" >> "$TMP_FILE"
  done
}
check_ib_ber
|
|
|
|
|
|
|
|
|
|
|
|
# 14. Check the optical power of the IB transceivers.
#
# For mlx5_bond_2 .. mlx5_bond_9, reads the "Tx Power Current" and
# "Rx Power Current" lines of `mlxlink -d <dev> -m`. The first
# whitespace-delimited token after the colon is split on commas into per-lane
# values; if the integer part of any value is below -2 the device reports
# node_cluster_check_ib_power{device=...} 1, otherwise 0. A device without
# any power lines reports 0.
# NOTE(review): only the integer part is compared, so -2.9 is treated as -2
# and is not flagged; confirm that this is the intended threshold.
check_ib_power() {
  local i dev output line values val integer_part power_error
  local -a value_array
  for i in {2..9}; do
    dev="mlx5_bond_$i"
    output=$(mlxlink -d "$dev" -m 2>/dev/null | grep -E 'Tx Power Current|Rx Power Current')
    power_error=0

    if [[ -n "$output" ]]; then
      while IFS= read -r line; do
        # Second ':'-separated field, first token of it.
        values=$(awk -F: '{ split($2, tok, " "); print tok[1] }' <<< "$line")
        IFS=',' read -r -a value_array <<< "$values"

        for val in "${value_array[@]}"; do
          integer_part=${val%%.*}
          if [[ "$integer_part" =~ ^-?[0-9]+$ && "$integer_part" -lt -2 ]]; then
            power_error=1
            break 2 # leave both the value loop and the line loop
          fi
        done
      done <<< "$output"
    fi

    echo "node_cluster_check_ib_power{device=\"$dev\"} $power_error" >> "$TMP_FILE"
  done
}
check_ib_power
|
|
|
|
|
|
|
|
|
|
|
|
# 15. Check the loopback bandwidth of the IB devices.
#
# For mlx5_bond_2 .. mlx5_bond_9, runs
# `run_perftest_loopback 0 1 ib_write_bw -d <dev> --report_gbits`, keeps the
# result lines containing "65536" and truncates their third column to an
# integer. Any value below 80 reports
# node_cluster_check_ib_bandwidth{device=...} 1, otherwise 0. A run that
# produces no matching line reports 0.
check_ib_bandwidth() {
  local i dev output bw bandwidth_error
  for i in {2..9}; do
    dev="mlx5_bond_$i"
    bandwidth_error=0
    output=$(run_perftest_loopback 0 1 ib_write_bw -d "$dev" --report_gbits 2>/dev/null \
      | awk '/65536/ { print int($3) }')

    if [[ -n "$output" ]]; then
      while read -r bw; do
        if (( bw < 80 )); then
          bandwidth_error=1
          break
        fi
      done <<< "$output"
    fi

    echo "node_cluster_check_ib_bandwidth{device=\"$dev\"} $bandwidth_error" >> "$TMP_FILE"
  done
}
check_ib_bandwidth
|
|
|
|
|
|
|
2025-10-31 12:11:13 +08:00
|
|
|
|
# 16. Check for PCIe link downgrades on the NVIDIA GPUs.

# PCIe transfer rate in GT/s -> PCIe generation. Whole-number rates are
# listed both with and without a ".0" so that a rate printed as "16GT/s" and
# one printed as "16.0GT/s" resolve to the same generation.
declare -A PCIE_SPEED=(
  ["2.5"]="1"                 # Gen1
  ["5"]="2"   ["5.0"]="2"     # Gen2
  ["8"]="3"   ["8.0"]="3"     # Gen3
  ["16"]="4"  ["16.0"]="4"    # Gen4
  ["32"]="5"  ["32.0"]="5"    # Gen5
  ["64"]="6"  ["64.0"]="6"    # Gen6
)

# For every NVIDIA "3D controller" listed by lspci, compares the link
# capability (first "LnkCap:" line) with the current link state (first
# "LnkSta:" line) from `lspci -vv -s <device>`, and appends
# node_cluster_check_gpu_downgraded{device=...,index=...} to $TMP_FILE:
# 1 when the current generation is lower than the capable one or the widths
# differ, 0 otherwise. A rate that is missing or not in PCIE_SPEED counts as
# generation 0. index is the 0-based position in the lspci listing.
check_gpu_pcie_links() {
  local speed_re='Speed ([0-9.]+)GT/s'
  local width_re='Width (x[0-9]+)'
  local gpu_devices device pci_info lnk_cap lnk_sta
  local cap_speed cap_width cap_gen sta_speed sta_width sta_gen
  local downgraded index=0

  gpu_devices=$(lspci | grep -i "nvidia" | grep -i "3D controller" | awk '{print $1}')

  while IFS= read -r device; do
    [[ -n "$device" ]] || continue

    pci_info=$(lspci -vv -s "$device")

    # Link capability.
    lnk_cap=$(grep -m 1 "LnkCap:" <<< "$pci_info")
    cap_speed=""
    cap_width=""
    [[ "$lnk_cap" =~ $speed_re ]] && cap_speed=${BASH_REMATCH[1]}
    [[ "$lnk_cap" =~ $width_re ]] && cap_width=${BASH_REMATCH[1]}
    # The placeholder avoids an empty associative-array subscript.
    cap_gen=${PCIE_SPEED[${cap_speed:-unknown}]:-0}

    # Current link state.
    lnk_sta=$(grep -m 1 "LnkSta:" <<< "$pci_info")
    sta_speed=""
    sta_width=""
    [[ "$lnk_sta" =~ $speed_re ]] && sta_speed=${BASH_REMATCH[1]}
    [[ "$lnk_sta" =~ $width_re ]] && sta_width=${BASH_REMATCH[1]}
    sta_gen=${PCIE_SPEED[${sta_speed:-unknown}]:-0}

    downgraded=0
    if (( cap_gen > sta_gen )) || [[ "$cap_width" != "$sta_width" ]]; then
      downgraded=1
    fi

    # Only the downgrade flag is exported; the per-GPU link-capability metric
    # (node_cluster_check_gpu_lnkCap) and per-sample HELP/TYPE lines that were
    # commented out here are not emitted.
    echo "node_cluster_check_gpu_downgraded{device=\"$device\",index=\"$index\"} $downgraded" >> "$TMP_FILE"
    index=$((index + 1))
  done <<< "$gpu_devices"
}
check_gpu_pcie_links
|
|
|
|
|
|
|
|
|
|
|
|
|
2025-09-09 08:47:12 +08:00
|
|
|
|
# Replace the published metrics file with the freshly written one.
#
# The mode is set on the staging file before the move, so the file already
# has its final permissions when it appears under its published name.
# 644 keeps it readable by every user without leaving it world-writable.
publish_metrics() {
  chmod 644 -- "$TMP_FILE"
  mv -- "$TMP_FILE" "$METRIC_FILE"
}
publish_metrics
|