更新 scripts/gpu_metrics.sh

ddd
This commit is contained in:
joy 2025-10-31 12:11:13 +08:00
parent 1c719a214d
commit a33d7582fb
1 changed files with 92 additions and 37 deletions

View File

@ -66,19 +66,19 @@ fi
# 6. 检查IB网卡排列顺序 # 6. 检查IB网卡排列顺序
value3=$(cat << EOF value3=$(cat << EOF
hca_id: mlx5_0 mlx5_bond_0 port 1 ==> bond0 (Up)
hca_id: mlx5_1 mlx5_bond_1 port 1 ==> bond1 (Up)
hca_id: mlx5_2 mlx5_bond_2 port 1 ==> bond2 (Up)
hca_id: mlx5_3 mlx5_bond_3 port 1 ==> bond3 (Up)
hca_id: mlx5_4 mlx5_bond_4 port 1 ==> bond4 (Up)
hca_id: mlx5_5 mlx5_bond_5 port 1 ==> bond5 (Up)
hca_id: mlx5_6 mlx5_bond_6 port 1 ==> bond6 (Up)
hca_id: mlx5_7 mlx5_bond_7 port 1 ==> bond7 (Up)
hca_id: mlx5_bond_0 mlx5_bond_8 port 1 ==> bond8 (Up)
hca_id: mlx5_bond_1 mlx5_bond_9 port 1 ==> bond9 (Up)
EOF EOF
) )
value4=$(ibv_devinfo | grep mlx) value4=$(ibdev2netdev)
if ! test "$value3" = "$value4"; then if ! test "$value3" = "$value4"; then
echo "node_cluster_check_ib_order 1" >> $TMP_FILE echo "node_cluster_check_ib_order 1" >> $TMP_FILE
else else
@ -87,12 +87,16 @@ fi
# 7. 检查硬盘数量 # 7. 检查硬盘数量
disk_error=0 disk_error=0
if ! lsblk | grep nvme5n1 > /dev/null; then TARGET_DISKS=("nvme0n1" "nvme1n1" "nvme2n1" "nvme3n1")
disk_error=1 for disk in "${TARGET_DISKS[@]}"; do
elif lsblk | grep nvme6n1 > /dev/null; then if ! lsblk | grep -w "$disk" > /dev/null; then
disk_error=1 disk_error=1
break
fi fi
echo "node_cluster_check_disk_count $disk_error" >> $TMP_FILE done
echo "node_cluster_check_disk_count $disk_error" >> "$TMP_FILE"
# 8. 检查内存品牌一致性 # 8. 检查内存品牌一致性
memory2=$(sudo dmidecode -t memory | grep -i "Manufacturer:" | uniq | wc -l) memory2=$(sudo dmidecode -t memory | grep -i "Manufacturer:" | uniq | wc -l)
@ -119,24 +123,24 @@ else
fi fi
# 11. 检查风扇模式 # 11. 检查风扇模式
fan_mode=$(racadm get system.thermalsettings.ThermalProfile 2>/dev/null | grep -c 'ThermalProfile=Maximum Performance') #fan_mode=$(racadm get system.thermalsettings.ThermalProfile 2>/dev/null | grep -c 'ThermalProfile=Maximum Performance')
if [ "$fan_mode" -ne 1 ]; then #if [ "$fan_mode" -ne 1 ]; then
echo "node_cluster_check_fan_mode 1" >> $TMP_FILE # echo "node_cluster_check_fan_mode 1" >> $TMP_FILE
else #else
echo "node_cluster_check_fan_mode 0" >> $TMP_FILE # echo "node_cluster_check_fan_mode 0" >> $TMP_FILE
fi #fi
# 12. 检查风扇转速 # 12. 检查风扇转速
fan_speed=$(racadm get system.thermalsettings.FanSpeedOffset 2>/dev/null | grep -c 'FanSpeedOffset=Max') #fan_speed=$(racadm get system.thermalsettings.FanSpeedOffset 2>/dev/null | grep -c 'FanSpeedOffset=Max')
if [ "$fan_speed" -ne 1 ]; then #if [ "$fan_speed" -ne 1 ]; then
echo "node_cluster_check_fan_speed 1" >> $TMP_FILE # echo "node_cluster_check_fan_speed 1" >> $TMP_FILE
else #else
echo "node_cluster_check_fan_speed 0" >> $TMP_FILE # echo "node_cluster_check_fan_speed 0" >> $TMP_FILE
fi #fi
# 13. 检查IB链路BER值 # 13. 检查IB链路BER值
for i in {0..7}; do for i in {2..9}; do
dev="mlx5_$i" dev="mlx5_bond_$i"
ber_value=$(mlxlink -d $dev -c 2>/dev/null | grep 'Symbol BER' | awk -F- '{print $2}' | xargs) ber_value=$(mlxlink -d $dev -c 2>/dev/null | grep 'Symbol BER' | awk -F- '{print $2}' | xargs)
if [[ -n "$ber_value" && "$ber_value" -lt 14 ]]; then if [[ -n "$ber_value" && "$ber_value" -lt 14 ]]; then
echo "node_cluster_check_ib_ber{device=\"$dev\"} 1" >> $TMP_FILE echo "node_cluster_check_ib_ber{device=\"$dev\"} 1" >> $TMP_FILE
@ -146,8 +150,8 @@ for i in {0..7}; do
done done
# 14. 检查IB网卡光衰 # 14. 检查IB网卡光衰
for i in {0..7}; do for i in {2..9}; do
dev="mlx5_$i" dev="mlx5_bond_$i"
output=$(mlxlink -d "$dev" -m 2>/dev/null | grep -E 'Tx Power Current|Rx Power Current') output=$(mlxlink -d "$dev" -m 2>/dev/null | grep -E 'Tx Power Current|Rx Power Current')
power_error=0 power_error=0
@ -171,14 +175,14 @@ for i in {0..7}; do
done done
# 15. 检查IB网卡带宽 # 15. 检查IB网卡带宽
for i in {0..7}; do for i in {2..9}; do
dev="mlx5_$i" dev="mlx5_bond_$i"
bandwidth_error=0 bandwidth_error=0
output=$(run_perftest_loopback 0 1 ib_write_bw -d "$dev" --report_gbits 2>/dev/null | grep 65536 | awk '{print int($3)}') output=$(run_perftest_loopback 0 1 ib_write_bw -d "$dev" --report_gbits 2>/dev/null | grep 65536 | awk '{print int($3)}')
if [[ -n "$output" ]]; then if [[ -n "$output" ]]; then
while read -r bw; do while read -r bw; do
if (( bw < 360 )); then if (( bw < 80 )); then
bandwidth_error=1 bandwidth_error=1
break break
fi fi
@ -188,6 +192,57 @@ for i in {0..7}; do
echo "node_cluster_check_ib_bandwidth{device=\"$dev\"} $bandwidth_error" >> $TMP_FILE echo "node_cluster_check_ib_bandwidth{device=\"$dev\"} $bandwidth_error" >> $TMP_FILE
done done
# 16. Check whether any NVIDIA GPU's PCIe link is running below its capability
#
# For every NVIDIA "3D controller" listed by lspci, compare the advertised link
# capability (LnkCap) with the negotiated link status (LnkSta) and emit
#   node_cluster_check_gpu_downgraded{device="<bus addr>",index="<n>"} <0|1>
# to $TMP_FILE (1 = speed generation or lane width is below the capability).

# Map from the link speed printed by lspci (in GT/s) to the PCIe generation.
# lspci prints integral speeds without a decimal part ("8GT/s", "16GT/s"), so
# both the integer and the ".0" spellings are listed.
declare -A PCIE_SPEED=(
    ["2.5"]="1"                    # Gen1
    ["5"]="2"   ["5.0"]="2"        # Gen2
    ["8"]="3"   ["8.0"]="3"        # Gen3
    ["16"]="4"  ["16.0"]="4"       # Gen4
    ["32"]="5"  ["32.0"]="5"       # Gen5
    ["64"]="6"  ["64.0"]="6"       # Gen6
)

# Print the PCIe generation for a link speed given in GT/s.
# Arguments: $1 - speed as extracted from lspci output (may be empty)
# Outputs:   generation number on stdout; 0 if the speed is empty or unknown
pcie_gen_from_speed() {
    local speed=$1
    # Unparsable (empty) speed: answer 0 without indexing the table with an
    # empty key.
    if [[ -z "$speed" ]]; then
        echo 0
        return 0
    fi
    echo "${PCIE_SPEED[$speed]:-0}"
}

# Identify NVIDIA 3D controllers (one PCI bus address per line).
GPU_DEVICES=$(lspci | grep -i "nvidia" | grep -i "3D controller" | awk '{print $1}')
# Count non-empty lines only, so that "no GPU found" yields 0 rather than 1.
GPU_COUNT=$(grep -c . <<< "$GPU_DEVICES" || true)

# Walk every GPU device.
INDEX=0
while IFS= read -r DEVICE; do
    [ -z "$DEVICE" ] && continue

    # Full verbose PCI information for this device.
    PCI_INFO=$(lspci -vv -s "$DEVICE")

    # Link capability (LnkCap): the maximum speed/width the device supports.
    LNK_CAP=$(grep "LnkCap:" <<< "$PCI_INFO" | head -n 1)
    LNK_CAP_SPEED=$(grep -oP 'Speed \K[\d.]+(?=GT/s)' <<< "$LNK_CAP")
    LNK_CAP_WIDTH=$(grep -oP 'Width \Kx\d+' <<< "$LNK_CAP")
    CAP_GEN=$(pcie_gen_from_speed "$LNK_CAP_SPEED")

    # Link status (LnkSta): the currently negotiated speed/width. Only used to
    # decide whether the link is downgraded; not exported as its own metric.
    LNK_STA=$(grep "LnkSta:" <<< "$PCI_INFO" | head -n 1)
    LNK_STA_SPEED=$(grep -oP 'Speed \K[\d.]+(?=GT/s)' <<< "$LNK_STA")
    LNK_STA_WIDTH=$(grep -oP 'Width \Kx\d+' <<< "$LNK_STA")
    STA_GEN=$(pcie_gen_from_speed "$LNK_STA_SPEED")

    # Downgraded when the negotiated generation is lower than the capability
    # or the negotiated lane width differs from the capability.
    # NOTE(review): if neither LnkCap nor LnkSta can be parsed, both sides
    # compare equal and the GPU is reported as 0 (not downgraded) — confirm
    # this is the desired outcome.
    DOWNGRADED=0
    if [ "$CAP_GEN" -gt "$STA_GEN" ] || [ "$LNK_CAP_WIDTH" != "$LNK_STA_WIDTH" ]; then
        DOWNGRADED=1
    fi

    # Only the downgrade status is exported; the capability metric and the
    # HELP/TYPE headers below are intentionally left disabled.
    #echo "# HELP node_cluster_check_gpu_lnkCap Maximum PCIe link spec supported by the GPU" >> "$TMP_FILE"
    #echo "# TYPE node_cluster_check_gpu_lnkCap gauge" >> "$TMP_FILE"
    #echo "node_cluster_check_gpu_lnkCap{device=\"$DEVICE\",index=\"$INDEX\",max_speed_gtps=\"$LNK_CAP_SPEED\",max_gen=\"$CAP_GEN\",max_width=\"$LNK_CAP_WIDTH\"} 1" >> "$TMP_FILE"
    #echo "# HELP node_cluster_check_gpu_downgraded Whether the GPU link is downgraded (1=yes, 0=no)" >> "$TMP_FILE"
    #echo "# TYPE node_cluster_check_gpu_downgraded gauge" >> "$TMP_FILE"
    echo "node_cluster_check_gpu_downgraded{device=\"$DEVICE\",index=\"$INDEX\"} $DOWNGRADED" >> "$TMP_FILE"

    INDEX=$((INDEX + 1))
done <<< "$GPU_DEVICES"
# 替换指标文件 # 替换指标文件
mv $TMP_FILE $METRIC_FILE mv $TMP_FILE $METRIC_FILE
chmod 777 $METRIC_FILE chmod 777 $METRIC_FILE