ansible-devops/scripts/gpu_metrics.sh

249 lines
8.7 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Runs a series of node health checks (GPU, IB, PCIe, disks, memory) and
# writes the results as Prometheus text-format samples into METRIC_FILE.
source /etc/profile

METRIC_FILE="/var/lib/node_exporter/textfile_collector/cluster_metrics.prom"
METRIC_DIR=$(dirname "$METRIC_FILE")

# Make sure the output directory exists before anything is created inside it.
mkdir -p "$METRIC_DIR"

# Stage the samples next to the final file: moving a file within the same
# directory is a rename, so a reader never observes a half-written file.
# The staging name deliberately does not end in ".prom".
TMP_FILE=$(mktemp "$METRIC_DIR/.cluster_metrics.prom.XXXXXX") || {
  echo "gpu_metrics: cannot create staging file in $METRIC_DIR" >&2
  exit 1
}

# Remove the staging file if the script stops before publishing it.
# After a successful publish the file no longer exists and rm -f is a no-op.
trap 'rm -f -- "$TMP_FILE"' EXIT
# Write the HELP header for every metric this script can emit.
# The delimiter is quoted so the text is copied literally (no expansion).
# This truncates the staging file; all later sections append to it.
cat << 'EOF' > "$TMP_FILE"
# HELP node_cluster_check_gpu_count 检查GPU数量是否为8 (0=正常, 1=异常)
# HELP node_cluster_check_ib_status 检查IB网卡状态 (0=正常, 1=异常)
# HELP node_cluster_check_peermem 检查peermem模块加载状态 (0=正常, 1=异常)
# HELP node_cluster_check_fabric 检查nvidia-fabricmanager运行状态 (0=正常, 1=异常)
# HELP node_cluster_check_acs 检查ACS状态是否关闭 (0=正常, 1=异常)
# HELP node_cluster_check_ib_order 检查IB网卡排列顺序 (0=正常, 1=异常)
# HELP node_cluster_check_disk_count 检查NVMe硬盘是否齐全 (0=正常, 1=异常)
# HELP node_cluster_check_memory_brand 检查内存品牌是否一致 (0=正常, 1=异常)
# HELP node_cluster_check_gpu_ecc GPU ECC错误状态 (0=正常, 1=异常)
# HELP node_cluster_check_gpu_xid GPU XID错误状态 (0=正常, 1=异常)
# HELP node_cluster_check_fan_speed 检查风扇转速是否为最高 (0=正常, 1=异常)
# HELP node_cluster_check_ib_ber IB网卡Symbol BER值是否正常 (0=正常, 1=异常)
# HELP node_cluster_check_ib_power IB网卡光衰是否正常 (0=正常, 1=异常)
# HELP node_cluster_check_ib_bandwidth IB网卡带宽是否正常 (0=正常, 1=异常)
# HELP node_cluster_check_gpu_downgraded GPU PCIe链路是否降速 (0=正常, 1=降速)
EOF
# 1. GPU count check.
# Every `nvidia-smi -L` line mentioning "nvidia" (case-insensitive) is counted;
# any total other than 8 is reported as a failure (1).
gpu_count=$(nvidia-smi -L | grep -ci nvidia)
gpu_count_flag=0
if [ "$gpu_count" -ne 8 ]; then
  gpu_count_flag=1
fi
printf 'node_cluster_check_gpu_count %s\n' "$gpu_count_flag" >> "$TMP_FILE"
# 2. IB port state check.

# Reads `ibdev2netdev` output on stdin and prints how many IB/bond ports are
# not up. A line is counted when it names an "ib" or "bond" device and
# contains "own" (matches both "Down" and "down").
# The bond names are included because the ordering check in this script
# expects the ports to appear as mlx5_bond_N ==> bondN, which contain no "ib".
count_down_ports() {
  grep -E 'ib|bond' | grep -c 'own'
}

ib_down=$(ibdev2netdev | count_down_ports)
if [ "$ib_down" -ne 0 ]; then
  echo "node_cluster_check_ib_status 1" >> "$TMP_FILE"
else
  echo "node_cluster_check_ib_status 0" >> "$TMP_FILE"
fi
# 3. nvidia_peermem module check.
# Counts the lsmod lines matching the pattern below; exactly 3 matching lines
# are treated as healthy. On any other count a `modprobe nvidia_peermem` is
# attempted (output discarded), and this run still reports the failure.
peermem_status=$(lsmod | grep -cE 'nvidia_peermem|peer_mem|peer' 2>/dev/null)
peermem_flag=0
if [ "$peermem_status" -ne 3 ]; then
  modprobe nvidia_peermem >/dev/null 2>&1
  peermem_flag=1
fi
printf 'node_cluster_check_peermem %s\n' "$peermem_flag" >> "$TMP_FILE"
# 4. nvidia-fabricmanager service check.
# Only the exact state string "active" counts as healthy; anything else
# (including an empty answer) is reported as a failure.
fabric_status=$(systemctl is-active nvidia-fabricmanager.service 2>/dev/null)
case "$fabric_status" in
  active) fabric_flag=0 ;;
  *)      fabric_flag=1 ;;
esac
printf 'node_cluster_check_fabric %s\n' "$fabric_flag" >> "$TMP_FILE"
# 5. ACS check (ACS is expected to be disabled).
# Looks at the 80 lines following each "PCI bridge" line of `lspci -vvv`,
# keeps the ACSCtl lines and counts those showing "SrcValid+".
# Any such line is reported as a failure.
acs_status=$(lspci -vvv | grep -A 80 'PCI bridge' | grep ACSCtl | grep -c SrcValid+ 2>/dev/null)
acs_flag=0
if [ "$acs_status" -ne 0 ]; then
  acs_flag=1
fi
printf 'node_cluster_check_acs %s\n' "$acs_flag" >> "$TMP_FILE"
# 6. IB device ordering check.
# The full `ibdev2netdev` output must equal, byte for byte, ten lines of the
# form "mlx5_bond_N port 1 ==> bondN (Up)" for N = 0..9 in ascending order.
# Command substitution drops the trailing newline on both sides.
value3=$(
  for n in {0..9}; do
    printf 'mlx5_bond_%d port 1 ==> bond%d (Up)\n' "$n" "$n"
  done
)
value4=$(ibdev2netdev)
ib_order_flag=1
if [[ "$value3" == "$value4" ]]; then
  ib_order_flag=0
fi
printf 'node_cluster_check_ib_order %s\n' "$ib_order_flag" >> "$TMP_FILE"
# 7. NVMe disk presence check.
# Despite the metric name, the value is a flag: 1 as soon as one of the
# expected disk names is missing from `lsblk` (whole-word match), else 0.
TARGET_DISKS=("nvme0n1" "nvme1n1" "nvme2n1" "nvme3n1")
disk_error=0
for disk in "${TARGET_DISKS[@]}"; do
  # Disk is listed: move on to the next one.
  lsblk | grep -qw "$disk" && continue
  disk_error=1
  break
done
printf 'node_cluster_check_disk_count %s\n' "$disk_error" >> "$TMP_FILE"
# 8. Memory manufacturer consistency check.
# `uniq` only merges adjacent duplicates, which is enough here: the result is
# exactly 1 only when every "Manufacturer:" line of dmidecode is identical.
memory2=$(sudo dmidecode -t memory | grep -i "Manufacturer:" | uniq | wc -l)
memory_brand_flag=0
if [ "$memory2" -ne 1 ]; then
  memory_brand_flag=1
fi
printf 'node_cluster_check_memory_brand %s\n' "$memory_brand_flag" >> "$TMP_FILE"
# 9. GPU ECC check.
# The signal used is the "Remapping Failure Occurred" field of `nvidia-smi -q`:
# any such line answering ": Yes" is reported as a failure.
ECC=$(nvidia-smi -q | grep -i 'Remapping Failure Occurred' | grep -c ': Yes' 2>/dev/null)
gpu_ecc_flag=0
if [ "$ECC" -ne 0 ]; then
  gpu_ecc_flag=1
fi
printf 'node_cluster_check_gpu_ecc %s\n' "$gpu_ecc_flag" >> "$TMP_FILE"
# 10. GPU Xid check.
# Counts the kernel log lines containing "NVRM: Xid" (case-insensitive);
# one or more such lines is reported as a failure.
XID=$(dmesg | grep -ci 'NVRM: Xid')
gpu_xid_flag=0
if [ "$XID" -ne 0 ]; then
  gpu_xid_flag=1
fi
printf 'node_cluster_check_gpu_xid %s\n' "$gpu_xid_flag" >> "$TMP_FILE"
# 11. Fan mode check (disabled; kept for reference).
# While commented out, no node_cluster_check_fan_mode sample is written.
#fan_mode=$(racadm get system.thermalsettings.ThermalProfile 2>/dev/null | grep -c 'ThermalProfile=Maximum Performance')
#if [ "$fan_mode" -ne 1 ]; then
# echo "node_cluster_check_fan_mode 1" >> $TMP_FILE
#else
# echo "node_cluster_check_fan_mode 0" >> $TMP_FILE
#fi
# 12. Fan speed check (disabled; kept for reference).
# While commented out, no node_cluster_check_fan_speed sample is written.
#fan_speed=$(racadm get system.thermalsettings.FanSpeedOffset 2>/dev/null | grep -c 'FanSpeedOffset=Max')
#if [ "$fan_speed" -ne 1 ]; then
# echo "node_cluster_check_fan_speed 1" >> $TMP_FILE
#else
# echo "node_cluster_check_fan_speed 0" >> $TMP_FILE
#fi
# 13. IB link BER check.

# Returns 0 (true) when at least one argument is a BER exponent below 14.
# Arguments: the digits that follow the "-" in a value such as "15E-255".
# Values are forced to base 10 so that "08" and "09" are not rejected as
# invalid octal numbers; anything that is not plain digits is skipped.
ber_exponents_bad() {
  local exponent
  for exponent in "$@"; do
    [[ "$exponent" =~ ^[0-9]+$ ]] || continue
    if (( 10#$exponent < 14 )); then
      return 0
    fi
  done
  return 1
}

for i in {2..9}; do
  dev="mlx5_bond_$i"
  # Take the text after the first "-" on each "Symbol BER" line; xargs joins
  # the results into one space-separated line.
  ber_value=$(mlxlink -d "$dev" -c 2>/dev/null | grep 'Symbol BER' | awk -F- '{print $2}' | xargs)
  # Split on whitespace so every reported exponent is checked on its own.
  read -r -a ber_exponents <<< "$ber_value"
  ber_error=0
  if ber_exponents_bad "${ber_exponents[@]}"; then
    ber_error=1
  fi
  # No readable value (tool missing, no matching line) is reported as 0.
  echo "node_cluster_check_ib_ber{device=\"$dev\"} $ber_error" >> "$TMP_FILE"
done
# 14. IB optical power check.

# Returns 0 (true) when any reading on one mlxlink power line is too low.
# Arguments: $1 - a "Tx Power Current" / "Rx Power Current" output line.
# The first whitespace-delimited token after the first ":" is taken as a
# comma-separated list of readings. Only the integer part of each reading is
# compared, so a reading is flagged once its integer part is below -2
# (i.e. -3.x and lower; -2.9 is not flagged).
power_line_is_low() {
  local line=$1
  local readings reading whole
  local -a lanes
  readings=$(awk -F: '{ split($2, fields, " "); print fields[1] }' <<< "$line")
  IFS=',' read -r -a lanes <<< "$readings"
  for reading in "${lanes[@]}"; do
    whole=${reading%%.*}
    # Skip anything that is not a signed integer (e.g. "N/A").
    if [[ "$whole" =~ ^-?[0-9]+$ && "$whole" -lt -2 ]]; then
      return 0
    fi
  done
  return 1
}

for i in {2..9}; do
  dev="mlx5_bond_$i"
  output=$(mlxlink -d "$dev" -m 2>/dev/null | grep -E 'Tx Power Current|Rx Power Current')
  power_error=0
  if [[ -n "$output" ]]; then
    while IFS= read -r line; do
      if power_line_is_low "$line"; then
        power_error=1
        break
      fi
    done <<< "$output"
  fi
  # No power lines at all (tool missing, no module data) is reported as 0.
  echo "node_cluster_check_ib_power{device=\"$dev\"} $power_error" >> "$TMP_FILE"
done
# 15. IB bandwidth check.
# run_perftest_loopback is not defined in this script; presumably it comes
# from the environment loaded via /etc/profile (TODO confirm).
# From its output, lines containing "65536" are kept and the integer part of
# their third column is compared against 80; any lower value is a failure.
for i in {2..9}; do
  dev="mlx5_bond_$i"
  output=$(run_perftest_loopback 0 1 ib_write_bw -d "$dev" --report_gbits 2>/dev/null | grep 65536 | awk '{print int($3)}')
  bandwidth_error=0
  if [[ -n "$output" ]]; then
    mapfile -t bw_samples <<< "$output"
    for bw in "${bw_samples[@]}"; do
      if (( bw < 80 )); then
        bandwidth_error=1
        break
      fi
    done
  fi
  printf 'node_cluster_check_ib_bandwidth{device="%s"} %s\n' "$dev" "$bandwidth_error" >> "$TMP_FILE"
done
# 16. PCIe link downgrade check for NVIDIA GPUs.

# Maps an lspci link speed (the number in front of "GT/s") to a PCIe
# generation. lspci prints whole-number speeds without a decimal part
# (e.g. "16GT/s"), so both spellings of every speed are listed.
declare -A PCIE_SPEED=(
  ["2.5"]="1"                 # Gen1
  ["5"]="2"   ["5.0"]="2"     # Gen2
  ["8"]="3"   ["8.0"]="3"     # Gen3
  ["16"]="4"  ["16.0"]="4"    # Gen4
  ["32"]="5"  ["32.0"]="5"    # Gen5
  ["64"]="6"  ["64.0"]="6"    # Gen6
)

# Prints the PCIe generation for a link speed, or 0 when it is unknown.
# Arguments: $1 - speed as extracted from lspci (may be empty).
pcie_gen_of() {
  local speed=$1
  # An empty key is not a valid associative-array subscript; answer directly.
  if [[ -z "$speed" ]]; then
    echo 0
    return
  fi
  echo "${PCIE_SPEED[$speed]:-0}"
}

# PCI addresses of all NVIDIA devices listed as "3D controller".
GPU_DEVICES=$(lspci | grep -i "nvidia" | grep -i "3D controller" | awk '{print $1}')
# Not used by the checks below; kept in case something else reads it.
GPU_COUNT=$(echo "$GPU_DEVICES" | wc -l | tr -d ' ')

INDEX=0
while IFS= read -r DEVICE; do
  [ -z "$DEVICE" ] && continue
  PCI_INFO=$(lspci -vv -s "$DEVICE")

  # Link capability (LnkCap): the best speed/width the device supports.
  LNK_CAP=$(grep "LnkCap:" <<< "$PCI_INFO" | head -n 1)
  LNK_CAP_SPEED=$(grep -oP 'Speed \K[\d.]+(?=GT/s)' <<< "$LNK_CAP")
  LNK_CAP_WIDTH=$(grep -oP 'Width \Kx\d+' <<< "$LNK_CAP")
  CAP_GEN=$(pcie_gen_of "$LNK_CAP_SPEED")

  # Link status (LnkSta): the speed/width currently negotiated.
  LNK_STA=$(grep "LnkSta:" <<< "$PCI_INFO" | head -n 1)
  LNK_STA_SPEED=$(grep -oP 'Speed \K[\d.]+(?=GT/s)' <<< "$LNK_STA")
  LNK_STA_WIDTH=$(grep -oP 'Width \Kx\d+' <<< "$LNK_STA")
  STA_GEN=$(pcie_gen_of "$LNK_STA_SPEED")

  # Downgraded when the current generation is below the capability, or the
  # current width differs from the capable width.
  DOWNGRADED=0
  if [ "$CAP_GEN" -gt "$STA_GEN" ] || [ "$LNK_CAP_WIDTH" != "$LNK_STA_WIDTH" ]; then
    DOWNGRADED=1
  fi

  # Disabled output, kept for reference. Emitting HELP/TYPE lines here would
  # repeat them once per GPU.
  #echo "# HELP node_cluster_check_gpu_lnkCap Maximum PCIe link capability supported by the GPU" >> "$TMP_FILE"
  #echo "# TYPE node_cluster_check_gpu_lnkCap gauge" >> "$TMP_FILE"
  #echo "node_cluster_check_gpu_lnkCap{device=\"$DEVICE\",index=\"$INDEX\",max_speed_gtps=\"$LNK_CAP_SPEED\",max_gen=\"$CAP_GEN\",max_width=\"$LNK_CAP_WIDTH\"} 1" >> "$TMP_FILE"
  #echo "# HELP node_cluster_check_gpu_downgraded Whether the GPU link is downgraded (1=yes, 0=no)" >> "$TMP_FILE"
  #echo "# TYPE node_cluster_check_gpu_downgraded gauge" >> "$TMP_FILE"
  echo "node_cluster_check_gpu_downgraded{device=\"$DEVICE\",index=\"$INDEX\"} $DOWNGRADED" >> "$TMP_FILE"
  INDEX=$((INDEX + 1))
done <<< "$GPU_DEVICES"
# Publish the metrics file.
# Permissions are set while the file is still staged, so the published file
# is readable from the moment it appears. 644 replaces the former 777: the
# file only needs to be read, not written or executed, by other users.
chmod 644 "$TMP_FILE"
if ! mv -f "$TMP_FILE" "$METRIC_FILE"; then
  echo "gpu_metrics: failed to publish $METRIC_FILE" >&2
  exit 1
fi