#!/bin/bash
#
# ib.sh - install or revert persistent naming and DHCP configuration for
# 400G InfiniBand adapters.
#
# Recovered from git patch 6a219bf0a095f3ebec32d16311d9414b4402e5d0
# (author "joy", 2025-10-14), which adds scripts/ib.sh.
#
# Usage: ib.sh --install | --uninstall | --help
#
# --install
#   1. Lists adapters with `ibdev2netdev -v` and keeps the first column of
#      every line containing "400G" (assumed to be the PCI address - this is
#      what the udev KERNELS== match below relies on; confirm against your
#      OFED version).
#   2. Writes udev rules that rename the RDMA devices to mlx5_<20+n> and the
#      network interfaces to ib<n>, numbered in ascending PCI-address order.
#   3. Writes the modprobe option NVreg_EnableGpuFirmware=0 for the nvidia
#      module.
#   4. Appends ib<n> DHCP stanzas to the netplan file, then runs
#      `netplan apply`, restarts openibd and rebuilds the initramfs.
# --uninstall
#   Puts back the files recorded by --install and rebuilds the initramfs.
#
# Must be run as root: every file touched lives under /etc.

set -u

# Files managed by this script.
readonly TARGET_IPOIB="/etc/udev/rules.d/70-persistent-ipoib.rules"
readonly TARGET_NET="/etc/udev/rules.d/70-persistent-net.rules"
readonly NVIDIA_CONF="/etc/modprobe.d/nvidia-gsp.conf"
readonly NETPLAN_CONF="/etc/netplan/00-installer-config.yaml"

# "<file><BACKUP_SUFFIX>" is the pre-install copy of <file>;
# "<file><BACKUP_SUFFIX><ABSENT_SUFFIX>" records that <file> did not exist.
readonly BACKUP_SUFFIX=".ibconfig_backup"
readonly ABSENT_SUFFIX=".absent"

# First numeric suffix used for the renamed RDMA devices (mlx5_20, mlx5_21...).
# NOTE(review): presumably chosen to stay clear of kernel-assigned mlx5_N
# names - confirm.
readonly IB_DEV_ID_BASE=20

# Output of update-initramfs is appended to these logs.
readonly INSTALL_LOG="/tmp/ib_config_update.log"
readonly RESTORE_LOG="/tmp/ib_config_restore.log"

# Print an informational message (bold blue) to stdout.
info() {
  printf '\033[1;34m[INFO] %s\033[0m\n' "$*"
}

# Print a success message (bold green) to stdout.
success() {
  printf '\033[1;32m[SUCCESS] %s\033[0m\n' "$*"
}

# Print an error message (bold red) to stderr.
error() {
  printf '\033[1;31m[ERROR] %s\033[0m\n' "$*" >&2
}

# Print an error message and terminate the script with status 1.
die() {
  error "$*"
  exit 1
}

#######################################
# Print the help text and exit.
# Arguments: $1 - exit status (optional, default 1 so that calling usage
#                 because of bad arguments signals an error)
#######################################
usage() {
  cat <<EOF
InfiniBand网络配置脚本
用法: $0 [选项]
选项:
  --install      安装并配置InfiniBand网络
  --uninstall    卸载配置并恢复原始状态
  --help         显示此帮助信息
EOF
  exit "${1:-1}"
}

# Abort unless running as root; avoids failing halfway through /etc writes.
require_root() {
  if ((EUID != 0)); then
    die "请以root权限运行此脚本"
  fi
}

#######################################
# Filter an `ibdev2netdev -v` listing down to 400G adapters.
# Inputs:  stdin - the listing
# Outputs: stdout - first column of each line containing "400G", one per
#          line, de-duplicated and sorted byte-wise. PCI addresses are
#          fixed-width hex, so byte order equals bus order.
#######################################
list_400g_pci_addresses() {
  awk '/400G/ { print $1 }' | LC_ALL=C sort -u
}

# Succeeds if --install recorded a pre-install state for file $1.
has_snapshot() {
  local backup="${1}${BACKUP_SUFFIX}"
  [[ -f "$backup" || -e "${backup}${ABSENT_SUFFIX}" ]]
}

#######################################
# Record the pre-install state of a file exactly once.
# A repeated --install must not replace the pristine copy with a file this
# script has already modified, so an existing snapshot is left untouched.
# Arguments: $1 - path of the file to snapshot
# Returns:   non-zero if the backup or the marker could not be written
#######################################
snapshot_file() {
  local target=$1
  local backup="${target}${BACKUP_SUFFIX}"

  if has_snapshot "$target"; then
    return 0
  fi
  if [[ -f "$target" ]]; then
    cp -p -- "$target" "$backup"
  else
    : > "${backup}${ABSENT_SUFFIX}"
  fi
}

#######################################
# Revert a file to the state recorded by snapshot_file.
# Moves the backup into place, or removes the file if it was recorded as
# absent. Does nothing when no snapshot exists.
# Arguments: $1 - path of the file to restore
# Returns:   non-zero only if the move/removal itself failed
#######################################
restore_file() {
  local target=$1
  local backup="${target}${BACKUP_SUFFIX}"

  if [[ -f "$backup" ]]; then
    mv -f -- "$backup" "$target"
  elif [[ -e "${backup}${ABSENT_SUFFIX}" ]]; then
    rm -f -- "$target" "${backup}${ABSENT_SUFFIX}"
  fi
}

#######################################
# Install the InfiniBand configuration.
# Globals: TARGET_IPOIB, TARGET_NET, NVIDIA_CONF, NETPLAN_CONF (written),
#          BACKUP_SUFFIX, IB_DEV_ID_BASE, INSTALL_LOG (read)
# Returns: 0 on success, 1 if an apply step failed; exits 1 when a
#          precondition fails or a file cannot be written.
#######################################
install_config() {
  local -a pci_devices=()
  local listing file idx
  local failures=0

  require_root
  info "开始安装InfiniBand网络配置..."

  # Validate everything up front so a failed precondition leaves the system
  # untouched.
  command -v ibdev2netdev > /dev/null \
    || die "未找到ibdev2netdev命令,无法继续配置"
  [[ -f "$NETPLAN_CONF" ]] \
    || die "未找到netplan配置文件: $NETPLAN_CONF"

  info "检测400G InfiniBand网卡..."
  listing=$(ibdev2netdev -v) || die "ibdev2netdev执行失败,无法继续配置"
  mapfile -t pci_devices < <(list_400g_pci_addresses <<< "$listing")
  ((${#pci_devices[@]} > 0)) \
    || die "未找到400G InfiniBand网卡,无法继续配置"

  info "备份现有配置文件..."
  for file in "$TARGET_IPOIB" "$TARGET_NET" "$NVIDIA_CONF" "$NETPLAN_CONF"; do
    snapshot_file "$file" || die "备份失败: $file"
  done

  info "初始化UDev规则文件..."
  touch -- "$TARGET_IPOIB" "$TARGET_NET" || die "无法创建UDev规则文件"

  # Drop every existing rule but keep comment lines; the new rules are
  # appended below so those comments survive.
  info "清理现有规则..."
  sed -i '/^\s*#/!d' "$TARGET_IPOIB" "$TARGET_NET" \
    || die "清理UDev规则失败"

  info "生成IPOIB规则..."
  for idx in "${!pci_devices[@]}"; do
    printf 'ACTION=="add", KERNELS=="%s", SUBSYSTEM=="infiniband", PROGRAM="rdma_rename %%k NAME_FIXED mlx5_%d"\n' \
      "${pci_devices[idx]}" "$((IB_DEV_ID_BASE + idx))"
  done >> "$TARGET_IPOIB" || die "写入失败: $TARGET_IPOIB"

  info "生成网络接口规则..."
  for idx in "${!pci_devices[@]}"; do
    printf 'SUBSYSTEM=="net", ACTION=="add", KERNELS=="%s", NAME="ib%d"\n' \
      "${pci_devices[idx]}" "$idx"
  done >> "$TARGET_NET" || die "写入失败: $TARGET_NET"

  info "配置NVIDIA固件选项..."
  printf '%s\n' "options nvidia NVreg_EnableGpuFirmware=0" > "$NVIDIA_CONF" \
    || die "写入失败: $NVIDIA_CONF"

  # Rebuild netplan from the pristine copy so a repeated --install does not
  # append duplicate ibN keys.
  # NOTE(review): this assumes the installer layout, i.e. "ethernets:" is the
  # last section and is followed only by a 2-space-indented "version: 2".
  info "配置网络接口..."
  cp -p -- "${NETPLAN_CONF}${BACKUP_SUFFIX}" "$NETPLAN_CONF" \
    || die "写入失败: $NETPLAN_CONF"
  sed -i '/^[[:space:]]*version:[[:space:]]*2[[:space:]]*$/d' "$NETPLAN_CONF" \
    || die "写入失败: $NETPLAN_CONF"
  {
    # Make sure the appended stanzas start on a fresh line.
    if [[ -s "$NETPLAN_CONF" && -n "$(tail -c 1 -- "$NETPLAN_CONF")" ]]; then
      printf '\n'
    fi
    # One stanza per detected adapter, matching the ibN names assigned above.
    for idx in "${!pci_devices[@]}"; do
      printf '    ib%d:\n      dhcp4: true\n' "$idx"
    done
    printf '  version: 2\n'
  } >> "$NETPLAN_CONF" || die "写入失败: $NETPLAN_CONF"

  # From here on keep going after a failure (the remaining steps are still
  # needed) but remember it, so the script does not report success.
  info "应用网络配置..."
  if ! netplan apply; then
    error "netplan apply失败,请检查 $NETPLAN_CONF 或执行 --uninstall 还原"
    failures=$((failures + 1))
  fi
  if ! systemctl restart openibd; then
    error "重启openibd服务失败"
    failures=$((failures + 1))
  fi

  info "更新initramfs..."
  if ! update-initramfs -u &>> "$INSTALL_LOG"; then
    error "更新initramfs失败,详见 $INSTALL_LOG"
    failures=$((failures + 1))
  fi

  if ((failures > 0)); then
    error "InfiniBand网络配置安装过程中有 ${failures} 个步骤失败"
    return 1
  fi
  success "InfiniBand网络配置安装完成"
}

#######################################
# Revert the changes made by install_config.
# Globals: TARGET_IPOIB, TARGET_NET, NVIDIA_CONF, NETPLAN_CONF (written),
#          RESTORE_LOG (read)
# Returns: 0 on success, 1 if any step failed.
#######################################
uninstall_config() {
  local file
  local failures=0

  require_root
  info "开始卸载InfiniBand网络配置..."

  info "恢复原始UDev规则..."
  for file in "$TARGET_IPOIB" "$TARGET_NET"; do
    if ! restore_file "$file"; then
      error "恢复失败: $file"
      failures=$((failures + 1))
    fi
  done

  info "恢复NVIDIA配置..."
  if has_snapshot "$NVIDIA_CONF"; then
    if ! restore_file "$NVIDIA_CONF"; then
      error "恢复失败: $NVIDIA_CONF"
      failures=$((failures + 1))
    fi
  else
    # No recorded state (e.g. installed by an earlier version of this
    # script): the file is treated as created by --install and removed.
    rm -f -- "$NVIDIA_CONF"
  fi

  info "恢复网络配置..."
  if has_snapshot "$NETPLAN_CONF"; then
    if restore_file "$NETPLAN_CONF"; then
      info "应用原始网络配置..."
      if ! netplan apply; then
        error "netplan apply失败"
        failures=$((failures + 1))
      fi
    else
      error "恢复失败: $NETPLAN_CONF"
      failures=$((failures + 1))
    fi
  fi

  info "更新initramfs..."
  if ! update-initramfs -u &>> "$RESTORE_LOG"; then
    error "更新initramfs失败,详见 $RESTORE_LOG"
    failures=$((failures + 1))
  fi

  if ((failures > 0)); then
    error "InfiniBand网络配置卸载过程中有 ${failures} 个步骤失败"
    return 1
  fi
  success "InfiniBand网络配置已卸载,系统恢复到原始状态"
}

# Entry point: exactly one option is accepted.
main() {
  if (($# != 1)); then
    usage
  fi

  case "$1" in
    --install)
      install_config
      ;;
    --uninstall)
      uninstall_config
      ;;
    --help)
      usage 0
      ;;
    *)
      error "无效选项: $1"
      usage
      ;;
  esac
}

main "$@"