From a33d7582fb3db792937dcb8802600938d7a70825 Mon Sep 17 00:00:00 2001
From: joy
Date: Fri, 31 Oct 2025 12:11:13 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20scripts/gpu=5Fmetrics.sh?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update the node health checks in scripts/gpu_metrics.sh:

- check 6: compare the full `ibdev2netdev` output against the expected
  mlx5_bond_0..9 => bond0..9 (Up) mapping instead of `ibv_devinfo | grep mlx`
- check 7: require nvme0n1..nvme3n1 to be present
- checks 11/12: comment out the racadm fan-mode / fan-speed checks
- checks 13/14/15: probe mlx5_bond_2..9 instead of mlx5_0..7 and lower the
  loopback bandwidth threshold from 360 to 80 (--report_gbits)
- check 16 (new): report node_cluster_check_gpu_downgraded per NVIDIA
  3D controller by comparing LnkCap and LnkSta from `lspci -vv`
- end the file with a trailing newline
---
 scripts/gpu_metrics.sh | 129 +++++++++++++++++++++++++++++------------
 1 file changed, 92 insertions(+), 37 deletions(-)

diff --git a/scripts/gpu_metrics.sh b/scripts/gpu_metrics.sh
index a9347a9..a323489 100644
--- a/scripts/gpu_metrics.sh
+++ b/scripts/gpu_metrics.sh
@@ -66,19 +66,19 @@ fi
 
 # 6. 检查IB网卡排列顺序
 value3=$(cat << EOF
-hca_id: mlx5_0
-hca_id: mlx5_1
-hca_id: mlx5_2
-hca_id: mlx5_3
-hca_id: mlx5_4
-hca_id: mlx5_5
-hca_id: mlx5_6
-hca_id: mlx5_7
-hca_id: mlx5_bond_0
-hca_id: mlx5_bond_1
+mlx5_bond_0 port 1 ==> bond0 (Up)
+mlx5_bond_1 port 1 ==> bond1 (Up)
+mlx5_bond_2 port 1 ==> bond2 (Up)
+mlx5_bond_3 port 1 ==> bond3 (Up)
+mlx5_bond_4 port 1 ==> bond4 (Up)
+mlx5_bond_5 port 1 ==> bond5 (Up)
+mlx5_bond_6 port 1 ==> bond6 (Up)
+mlx5_bond_7 port 1 ==> bond7 (Up)
+mlx5_bond_8 port 1 ==> bond8 (Up)
+mlx5_bond_9 port 1 ==> bond9 (Up)
 EOF
 )
-value4=$(ibv_devinfo | grep mlx)
+value4=$(ibdev2netdev)
 if ! test "$value3" = "$value4"; then
     echo "node_cluster_check_ib_order 1" >> $TMP_FILE
 else
@@ -87,12 +87,16 @@ fi
 
 # 7. 检查硬盘数量
 disk_error=0
-if ! lsblk | grep nvme5n1 > /dev/null; then
-    disk_error=1
-elif lsblk | grep nvme6n1 > /dev/null; then
-    disk_error=1
-fi
-echo "node_cluster_check_disk_count $disk_error" >> $TMP_FILE
+TARGET_DISKS=("nvme0n1" "nvme1n1" "nvme2n1" "nvme3n1")
+for disk in "${TARGET_DISKS[@]}"; do
+    if ! lsblk | grep -w "$disk" > /dev/null; then
+        disk_error=1
+        break
+    fi
+done
+
+echo "node_cluster_check_disk_count $disk_error" >> "$TMP_FILE"
+
 
 # 8. 检查内存品牌一致性
 memory2=$(sudo dmidecode -t memory | grep -i "Manufacturer:" | uniq | wc -l)
@@ -119,24 +123,24 @@ else
 fi
 
 # 11. 检查风扇模式
-fan_mode=$(racadm get system.thermalsettings.ThermalProfile 2>/dev/null | grep -c 'ThermalProfile=Maximum Performance')
-if [ "$fan_mode" -ne 1 ]; then
-    echo "node_cluster_check_fan_mode 1" >> $TMP_FILE
-else
-    echo "node_cluster_check_fan_mode 0" >> $TMP_FILE
-fi
+#fan_mode=$(racadm get system.thermalsettings.ThermalProfile 2>/dev/null | grep -c 'ThermalProfile=Maximum Performance')
+#if [ "$fan_mode" -ne 1 ]; then
+#    echo "node_cluster_check_fan_mode 1" >> $TMP_FILE
+#else
+#    echo "node_cluster_check_fan_mode 0" >> $TMP_FILE
+#fi
 
 # 12. 检查风扇转速
-fan_speed=$(racadm get system.thermalsettings.FanSpeedOffset 2>/dev/null | grep -c 'FanSpeedOffset=Max')
-if [ "$fan_speed" -ne 1 ]; then
-    echo "node_cluster_check_fan_speed 1" >> $TMP_FILE
-else
-    echo "node_cluster_check_fan_speed 0" >> $TMP_FILE
-fi
+#fan_speed=$(racadm get system.thermalsettings.FanSpeedOffset 2>/dev/null | grep -c 'FanSpeedOffset=Max')
+#if [ "$fan_speed" -ne 1 ]; then
+#    echo "node_cluster_check_fan_speed 1" >> $TMP_FILE
+#else
+#    echo "node_cluster_check_fan_speed 0" >> $TMP_FILE
+#fi
 
 # 13. 检查IB链路BER值
-for i in {0..7}; do
-    dev="mlx5_$i"
+for i in {2..9}; do
+    dev="mlx5_bond_$i"
     ber_value=$(mlxlink -d $dev -c 2>/dev/null | grep 'Symbol BER' | awk -F- '{print $2}' | xargs)
     if [[ -n "$ber_value" && "$ber_value" -lt 14 ]]; then
         echo "node_cluster_check_ib_ber{device=\"$dev\"} 1" >> $TMP_FILE
@@ -146,8 +150,8 @@ for i in {0..7}; do
 done
 
 # 14. 检查IB网卡光衰
-for i in {0..7}; do
-    dev="mlx5_$i"
+for i in {2..9}; do
+    dev="mlx5_bond_$i"
     output=$(mlxlink -d "$dev" -m 2>/dev/null | grep -E 'Tx Power Current|Rx Power Current')
     power_error=0
 
@@ -171,14 +175,14 @@ for i in {0..7}; do
 done
 
 # 15. 检查IB网卡带宽
-for i in {0..7}; do
-    dev="mlx5_$i"
+for i in {2..9}; do
+    dev="mlx5_bond_$i"
     bandwidth_error=0
     output=$(run_perftest_loopback 0 1 ib_write_bw -d "$dev" --report_gbits 2>/dev/null | grep 65536 | awk '{print int($3)}')
 
     if [[ -n "$output" ]]; then
         while read -r bw; do
-            if (( bw < 360 )); then
+            if (( bw < 80 )); then
                 bandwidth_error=1
                 break
             fi
@@ -188,6 +192,57 @@ for i in {0..7}; do
     echo "node_cluster_check_ib_bandwidth{device=\"$dev\"} $bandwidth_error" >> $TMP_FILE
 done
 
+# 16. Check whether any GPU PCIe link is running below its capability
+# Link speed (GT/s) -> PCIe generation. lspci prints "8GT/s" / "16GT/s" (no ".0"); both spellings are accepted.
+declare -A PCIE_SPEED=(
+    ["2.5"]="1"                 # Gen1
+    ["5"]="2"  ["5.0"]="2"      # Gen2
+    ["8"]="3"  ["8.0"]="3"      # Gen3
+    ["16"]="4" ["16.0"]="4"     # Gen4
+    ["32"]="5" ["32.0"]="5"     # Gen5
+    ["64"]="6" ["64.0"]="6"     # Gen6
+)
+
+# Find NVIDIA 3D controllers (first lspci column is the PCI address)
+GPU_DEVICES=$(lspci | grep -i "nvidia" | grep -i "3D controller" | awk '{print $1}')
+GPU_COUNT=$(echo "$GPU_DEVICES" | wc -l | tr -d ' ')  # NOTE(review): not referenced by the code visible below
+
+# Walk every GPU; INDEX is the 0-based position in lspci order
+INDEX=0
+while IFS= read -r DEVICE; do
+    [ -z "$DEVICE" ] && continue
+
+    # Verbose PCIe info. NOTE(review): lspci normally needs root to show LnkCap/LnkSta - confirm this runs privileged
+    PCI_INFO=$(lspci -vv -s "$DEVICE")
+
+    # Link capability (LnkCap): maximum speed and width the device supports
+    LNK_CAP=$(echo "$PCI_INFO" | grep "LnkCap:" | head -n 1)
+    LNK_CAP_SPEED=$(echo "$LNK_CAP" | grep -oP 'Speed \K[\d.]+(?=GT/s)')
+    LNK_CAP_WIDTH=$(echo "$LNK_CAP" | grep -oP 'Width \Kx\d+')
+    CAP_GEN=${PCIE_SPEED[${LNK_CAP_SPEED:-unknown}]:-0}  # default key avoids an empty (invalid) subscript
+
+    # Link status (LnkSta): negotiated speed and width; used only for the downgrade decision
+    LNK_STA=$(echo "$PCI_INFO" | grep "LnkSta:" | head -n 1)
+    LNK_STA_SPEED=$(echo "$LNK_STA" | grep -oP 'Speed \K[\d.]+(?=GT/s)')
+    LNK_STA_WIDTH=$(echo "$LNK_STA" | grep -oP 'Width \Kx\d+')
+    STA_GEN=${PCIE_SPEED[${LNK_STA_SPEED:-unknown}]:-0}  # 0 means speed missing or not in the table
+
+    # Downgraded when the negotiated generation is lower than the capability or the widths differ
+    DOWNGRADED=0
+    if [ "$CAP_GEN" -gt "$STA_GEN" ] || [ "$LNK_CAP_WIDTH" != "$LNK_STA_WIDTH" ]; then
+        DOWNGRADED=1
+    fi
+    # Only the downgrade metric is emitted; the link-capability metric and HELP/TYPE lines stay commented out
+    #echo "# HELP node_cluster_check_gpu_lnkCap GPU支持的最大PCIe链路规格" >> "$TMP_FILE"
+    #echo "# TYPE node_cluster_check_gpu_lnkCap gauge" >> "$TMP_FILE"
+    #echo "node_cluster_check_gpu_lnkCap{device=\"$DEVICE\",index=\"$INDEX\",max_speed_gtps=\"$LNK_CAP_SPEED\",max_gen=\"$CAP_GEN\",max_width=\"$LNK_CAP_WIDTH\"} 1" >> "$TMP_FILE"
+    #echo "# HELP node_cluster_check_gpu_downgraded GPU是否降速(1=是,0=否)" >> "$TMP_FILE"
+    #echo "# TYPE node_cluster_check_gpu_downgraded gauge" >> "$TMP_FILE"
+    echo "node_cluster_check_gpu_downgraded{device=\"$DEVICE\",index=\"$INDEX\"} $DOWNGRADED" >> "$TMP_FILE"
+    INDEX=$((INDEX + 1))
+done <<< "$GPU_DEVICES"
+
+
 # 替换指标文件
 mv $TMP_FILE $METRIC_FILE
-chmod 777 $METRIC_FILE
\ No newline at end of file
+chmod 777 $METRIC_FILE