#!/bin/bash
#
# InfiniBand network configuration script.
#
# --install    Detect 400G InfiniBand adapters via `ibdev2netdev -v`, write
#              persistent udev naming rules for them, disable the NVIDIA GSP
#              firmware option, add ib0..ib7 DHCP entries to netplan and
#              refresh the initramfs.
# --uninstall  Restore the files saved by --install.
# --help       Show usage.
#
# Must be run as root: it writes under /etc and calls netplan/systemctl.

set -uo pipefail

# Paths of the configuration files managed by this script.
readonly TARGET_IPOIB="/etc/udev/rules.d/70-persistent-ipoib.rules"
readonly TARGET_NET="/etc/udev/rules.d/70-persistent-net.rules"
readonly NVIDIA_CONF="/etc/modprobe.d/nvidia-gsp.conf"
readonly NETPLAN_CONF="/etc/netplan/00-installer-config.yaml"

# Suffix of the copy taken from a pre-existing file before it is modified.
readonly BACKUP_SUFFIX=".ibconfig_backup"
# Suffix of the empty marker recording that a file did NOT exist before
# install, so that uninstall knows it has to delete the generated file.
readonly ABSENT_SUFFIX="${BACKUP_SUFFIX}.absent"

# First numeric suffix used for the renamed RDMA devices (mlx5_20, mlx5_21...).
readonly MLX_ID_BASE=20
# Number of ibN interfaces declared in the netplan configuration.
readonly NETPLAN_IB_COUNT=8

# Logs receiving the output of update-initramfs.
readonly UPDATE_LOG="/tmp/ib_config_update.log"
readonly RESTORE_LOG="/tmp/ib_config_restore.log"

# Private scratch directory; created on demand, removed by cleanup().
WORK_DIR=""

# Print an informational message ($1) in blue on stdout.
info() {
  printf '\033[1;34m[INFO] %s\033[0m\n' "$1"
}

# Print a success message ($1) in green on stdout.
success() {
  printf '\033[1;32m[SUCCESS] %s\033[0m\n' "$1"
}

# Print an error message ($1) in red on stderr.
error() {
  printf '\033[1;31m[ERROR] %s\033[0m\n' "$1" >&2
}

# Print the help text and exit.
# Arguments: $1 - exit status (optional, defaults to 1 for usage errors).
usage() {
  echo "InfiniBand网络配置脚本"
  echo "用法: $0 [选项]"
  echo "选项:"
  echo " --install 安装并配置InfiniBand网络"
  echo " --uninstall 卸载配置并恢复原始状态"
  echo " --help 显示此帮助信息"
  exit "${1:-1}"
}

# Remove the scratch directory, if one was created. Also runs on every exit.
cleanup() {
  if [[ -n "$WORK_DIR" && -d "$WORK_DIR" ]]; then
    rm -rf -- "$WORK_DIR"
  fi
  WORK_DIR=""
}
trap cleanup EXIT

# Abort unless running as root; every later step needs write access to /etc.
require_root() {
  if [[ ${EUID} -ne 0 ]]; then
    error "请以root权限运行此脚本"
    exit 1
  fi
}

# Record the pre-install state of a file exactly once.
# Arguments: $1 - path of the file about to be modified.
# Returns:   0 on success, non-zero if the copy/marker could not be written.
backup_file() {
  local path=$1
  local backup="${path}${BACKUP_SUFFIX}"
  local marker="${path}${ABSENT_SUFFIX}"

  # A previous install already saved the original state. Saving again would
  # overwrite it with an already-modified file and make uninstall useless.
  if [[ -e "$backup" || -e "$marker" ]]; then
    return 0
  fi

  if [[ -f "$path" ]]; then
    cp -p -- "$path" "$backup"
  else
    : > "$marker"
  fi
}

# Put a file back into the state recorded by backup_file().
# Arguments: $1 - path of the managed file.
# Does nothing when neither a backup nor an "absent" marker exists.
restore_file() {
  local path=$1
  local backup="${path}${BACKUP_SUFFIX}"
  local marker="${path}${ABSENT_SUFFIX}"

  if [[ -f "$backup" ]]; then
    mv -- "$backup" "$path" || return 1
    rm -f -- "$marker"
  elif [[ -e "$marker" ]]; then
    # The file was created by install; remove it together with the marker.
    rm -f -- "$path" "$marker"
  fi
}

# Write the PCI addresses (first column of `ibdev2netdev -v`) of all adapters
# whose line mentions 400G to a file, one per line, in tool output order.
# Arguments: $1 - output file.
# Returns:   0 if at least one adapter was found, 1 otherwise.
detect_pci_devices() {
  local out_file=$1

  if ! command -v ibdev2netdev > /dev/null 2>&1; then
    error "未找到ibdev2netdev命令,无法检测网卡"
    return 1
  fi

  ibdev2netdev -v | awk '/400G/ && $1 != "" { print $1 }' > "$out_file"
  [[ -s "$out_file" ]]
}

# Generate the udev rule files from a list of PCI addresses.
# The n-th address (0-based) is mapped to mlx5_<MLX_ID_BASE+n> and ib<n>.
# Arguments: $1 - PCI address list, $2 - ipoib rules output, $3 - net rules output.
generate_rules() {
  local pci_list=$1 ipoib_rules=$2 net_rules=$3
  local kernel idx=0

  # Start from empty files so nothing stale can leak into the rules.
  : > "$ipoib_rules" || return 1
  : > "$net_rules" || return 1

  while read -r kernel; do
    [[ -n "$kernel" ]] || continue
    printf 'ACTION=="add", KERNELS=="%s", SUBSYSTEM=="infiniband", PROGRAM="rdma_rename %%k NAME_FIXED mlx5_%d"\n' \
      "$kernel" "$((MLX_ID_BASE + idx))" >> "$ipoib_rules"
    printf 'SUBSYSTEM=="net", ACTION=="add", KERNELS=="%s", NAME="ib%d"\n' \
      "$kernel" "$idx" >> "$net_rules"
    idx=$((idx + 1))
  done < "$pci_list"

  # Order the rule lines by PCI address. Splitting on '"' makes the KERNELS
  # value field 4 (ipoib rules) / field 6 (net rules). The addresses are
  # printed by the tool as fixed-width hex, so a byte-wise comparison orders
  # them correctly. Names were assigned above and are not affected.
  LC_ALL=C sort -t '"' -k 4,4 -o "$ipoib_rules" "$ipoib_rules" || return 1
  LC_ALL=C sort -t '"' -k 6,6 -o "$net_rules" "$net_rules" || return 1
}

# Add ib0..ib<NETPLAN_IB_COUNT-1> with DHCPv4 to the netplan configuration.
# NOTE(review): the entries are appended at the end of the file with 4/6-space
# indentation and `version: 2` is re-added at 2 spaces. This assumes the usual
# installer layout where `ethernets:` is the last mapping under `network:`
# and the file uses 2-space indentation — confirm for other layouts.
configure_netplan() {
  local snippet="" i

  # Only add interfaces that are not declared yet, so that running the
  # installation again does not produce duplicate YAML keys.
  for ((i = 0; i < NETPLAN_IB_COUNT; i++)); do
    if ! grep -Eq "^[[:space:]]+ib${i}:" "$NETPLAN_CONF"; then
      snippet+="    ib${i}:"$'\n'"      dhcp4: true"$'\n'
    fi
  done

  # Move the `version: 2` line to the end, after the new entries.
  sed -i '/version: 2/d' "$NETPLAN_CONF" || return 1
  printf '%s  version: 2\n' "$snippet" >> "$NETPLAN_CONF"
}

# Install the InfiniBand configuration.
# Returns 1 without touching any system file when prerequisites are missing
# or no adapter is detected; returns 1 after completing the remaining steps
# when applying the configuration partially failed.
install_config() {
  local rc=0 file pci_list ipoib_rules net_rules

  require_root
  info "开始安装InfiniBand网络配置..."

  if [[ ! -f "$NETPLAN_CONF" ]]; then
    error "未找到netplan配置文件: $NETPLAN_CONF"
    return 1
  fi

  if ! WORK_DIR=$(mktemp -d); then
    error "无法创建临时目录"
    return 1
  fi
  pci_list="$WORK_DIR/pci_devices"
  ipoib_rules="$WORK_DIR/ipoib.rules"
  net_rules="$WORK_DIR/net.rules"

  # Detect the adapters and build the rules BEFORE modifying anything, so a
  # failure here leaves the system exactly as it was.
  info "检测400G InfiniBand网卡..."
  if ! detect_pci_devices "$pci_list"; then
    error "未找到400G InfiniBand网卡,无法继续配置"
    return 1
  fi

  info "生成IPOIB规则..."
  info "生成网络接口规则..."
  info "排序规则文件..."
  if ! generate_rules "$pci_list" "$ipoib_rules" "$net_rules"; then
    error "生成UDev规则失败"
    return 1
  fi

  info "备份现有配置文件..."
  for file in "$TARGET_IPOIB" "$TARGET_NET" "$NVIDIA_CONF" "$NETPLAN_CONF"; do
    if ! backup_file "$file"; then
      error "备份失败: $file"
      return 1
    fi
  done

  # Replace the target rule files with the generated rules.
  info "应用规则文件..."
  if ! cat "$ipoib_rules" > "$TARGET_IPOIB" \
    || ! cat "$net_rules" > "$TARGET_NET"; then
    error "写入UDev规则失败"
    return 1
  fi

  info "配置NVIDIA固件选项..."
  if ! echo "options nvidia NVreg_EnableGpuFirmware=0" > "$NVIDIA_CONF"; then
    error "写入失败: $NVIDIA_CONF"
    rc=1
  fi

  info "配置网络接口..."
  if ! configure_netplan; then
    error "更新netplan配置失败: $NETPLAN_CONF"
    rc=1
  fi

  # The steps below are reported but do not stop the installation, so the
  # initramfs is still refreshed when e.g. the openibd service is missing.
  info "应用网络配置..."
  if ! netplan apply; then
    error "netplan apply 执行失败"
    rc=1
  fi
  if ! systemctl restart openibd; then
    error "重启openibd服务失败"
    rc=1
  fi

  info "更新initramfs..."
  if ! update-initramfs -u &>> "$UPDATE_LOG"; then
    error "update-initramfs 执行失败,详见 $UPDATE_LOG"
    rc=1
  fi

  info "清理临时文件..."
  cleanup

  if ((rc != 0)); then
    return 1
  fi
  success "InfiniBand网络配置安装完成"
}

# Undo install_config using the saved backups and markers.
# Returns 1 when a restore or follow-up command failed.
uninstall_config() {
  local rc=0

  require_root
  info "开始卸载InfiniBand网络配置..."

  info "恢复原始UDev规则..."
  restore_file "$TARGET_IPOIB" || rc=1
  restore_file "$TARGET_NET" || rc=1

  info "恢复NVIDIA配置..."
  if [[ -f "${NVIDIA_CONF}${BACKUP_SUFFIX}" ]]; then
    restore_file "$NVIDIA_CONF" || rc=1
  else
    # No original file was saved: the option file only exists because of
    # this script, so remove it.
    rm -f -- "$NVIDIA_CONF" "${NVIDIA_CONF}${ABSENT_SUFFIX}"
  fi

  info "恢复网络配置..."
  if [[ -f "${NETPLAN_CONF}${BACKUP_SUFFIX}" ]]; then
    if restore_file "$NETPLAN_CONF"; then
      info "应用原始网络配置..."
      if ! netplan apply; then
        error "netplan apply 执行失败"
        rc=1
      fi
    else
      error "恢复失败: $NETPLAN_CONF"
      rc=1
    fi
  fi

  info "更新initramfs..."
  if ! update-initramfs -u &>> "$RESTORE_LOG"; then
    error "update-initramfs 执行失败,详见 $RESTORE_LOG"
    rc=1
  fi

  if ((rc != 0)); then
    return 1
  fi
  success "InfiniBand网络配置已卸载,系统恢复到原始状态"
}

# Entry point: exactly one option is expected.
main() {
  if [[ $# -ne 1 ]]; then
    usage
  fi

  case "$1" in
    --install)
      install_config
      ;;
    --uninstall)
      uninstall_config
      ;;
    --help)
      # Asking for help is not an error.
      usage 0
      ;;
    *)
      error "无效选项: $1"
      usage
      ;;
  esac
}

main "$@"