ansible-devops/scripts/ib.sh

193 lines
5.2 KiB
Bash
Raw Normal View History

2025-10-14 09:45:40 +08:00
#!/bin/bash
# File locations used throughout the script.

# Suffix appended to a file name to form the name of its backup copy.
BACKUP_SUFFIX='.ibconfig_backup'

# System files that the installer writes or edits.
TARGET_NET='/etc/udev/rules.d/70-persistent-net.rules'
TARGET_IPOIB='/etc/udev/rules.d/70-persistent-ipoib.rules'
NETPLAN_CONF='/etc/netplan/00-installer-config.yaml'
NVIDIA_CONF='/etc/modprobe.d/nvidia-gsp.conf'

# Scratch files in which the udev rules are assembled before being applied.
TMP_NET='/tmp/net_temp.rules'
TMP_IPOIB='/tmp/ipoib_temp.rules'
# Print an informational message on stdout, wrapped in the bold-blue ANSI
# colour sequence and prefixed with "[INFO] ".
# Arguments: $1 - message text (backslash escapes in it are interpreted)
info() {
  local message=$1
  printf '%b\n' "\033[1;34m[INFO] ${message}\033[0m"
}
# Print a success message on stdout, wrapped in the bold-green ANSI colour
# sequence and prefixed with "[SUCCESS] ".
# Arguments: $1 - message text (backslash escapes in it are interpreted)
success() {
  local message=$1
  printf '%b\n' "\033[1;32m[SUCCESS] ${message}\033[0m"
}
# Print an error message on stderr, wrapped in the bold-red ANSI colour
# sequence and prefixed with "[ERROR] ". Nothing is written to stdout.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
error() {
  local message=$1
  printf '%b\n' "\033[1;31m[ERROR] ${message}\033[0m" >&2
}
# Print the command-line help text on stdout and terminate the script.
# Always exits with status 1; it never returns to the caller.
usage() {
  printf '%s\n' \
    "InfiniBand网络配置脚本" \
    "用法: $0 [选项]" \
    "选项:" \
    " --install 安装并配置InfiniBand网络" \
    " --uninstall 卸载配置并恢复原始状态" \
    " --help 显示此帮助信息"
  exit 1
}
#######################################
# Install the InfiniBand configuration.
#
# Detects the adapters whose `ibdev2netdev -v` line contains "400G", writes
# udev rules that rename their RDMA devices (mlx5_20, mlx5_21, ...) and
# network interfaces (ib0, ib1, ...) in detection order, writes the NVIDIA
# modprobe option file, adds DHCP stanzas for ib0..ib7 to the netplan file,
# then applies netplan, restarts openibd and rebuilds the initramfs.
#
# Nothing on the system is modified until at least one adapter has been
# detected. Backups are created only when none exists yet, so running the
# installer repeatedly keeps the pristine originals for --uninstall.
#
# Globals:   TARGET_IPOIB, TARGET_NET, NVIDIA_CONF, NETPLAN_CONF,
#            BACKUP_SUFFIX (all read)
# Arguments: none
# Outputs:   progress via info/success, failures via error
# Returns:   exits the script with status 1 when the netplan file is missing,
#            a scratch directory cannot be created, or no adapter is found
#######################################
install_config() {
  local -r rdma_first_id=20 netplan_if_count=8
  local workdir pci_list ipoib_rules net_rules
  local conf kernel stanzas=""
  local rdma_id netdev_id idx

  info "开始安装InfiniBand网络配置..."

  # Appending stanzas to a non-existent file would create a YAML fragment
  # without the "network:" header, so refuse to continue instead.
  if [[ ! -f "$NETPLAN_CONF" ]]; then
    error "未找到netplan配置文件: $NETPLAN_CONF"
    exit 1
  fi

  # Private scratch directory: avoids predictable file names in /tmp and
  # guarantees that no leftovers from an aborted run are picked up.
  if ! workdir=$(mktemp -d); then
    error "无法创建临时目录"
    exit 1
  fi
  pci_list="$workdir/pci_devices"
  ipoib_rules="$workdir/ipoib.rules"
  net_rules="$workdir/net.rules"

  # The first column of `ibdev2netdev -v` is used as the udev KERNELS match.
  info "检测400G InfiniBand网卡..."
  ibdev2netdev -v | grep 400G | awk '{print $1}' > "$pci_list"
  if [[ ! -s "$pci_list" ]]; then
    error "未找到400G InfiniBand网卡无法继续配置"
    rm -rf -- "$workdir"
    exit 1
  fi

  # Keep the first backup ever taken; a later run must not replace it with
  # a file that this script has already modified.
  info "备份现有配置文件..."
  for conf in "$TARGET_IPOIB" "$TARGET_NET" "$NVIDIA_CONF" "$NETPLAN_CONF"; do
    if [[ -f "$conf" && ! -e "$conf$BACKUP_SUFFIX" ]]; then
      cp -p -- "$conf" "$conf$BACKUP_SUFFIX"
    fi
  done

  info "生成IPOIB规则..."
  rdma_id=$rdma_first_id
  while read -r kernel; do
    [[ -n "$kernel" ]] || continue
    printf 'ACTION=="add", KERNELS=="%s", SUBSYSTEM=="infiniband", PROGRAM="rdma_rename %%k NAME_FIXED mlx5_%d"\n' \
      "$kernel" "$rdma_id" >> "$ipoib_rules"
    rdma_id=$((rdma_id + 1))
  done < "$pci_list"

  info "生成网络接口规则..."
  netdev_id=0
  while read -r kernel; do
    [[ -n "$kernel" ]] || continue
    printf 'SUBSYSTEM=="net", ACTION=="add", KERNELS=="%s", NAME="ib%d"\n' \
      "$kernel" "$netdev_id" >> "$net_rules"
    netdev_id=$((netdev_id + 1))
  done < "$pci_list"

  # Order the rule lines by the KERNELS value. Splitting on the double quote
  # makes that value field 4 of an IPoIB rule and field 6 of a net rule.
  # Only the line order changes; the name assigned to each device does not.
  info "排序规则文件..."
  LC_ALL=C sort -t '"' -k 4,4 -o "$ipoib_rules" "$ipoib_rules"
  LC_ALL=C sort -t '"' -k 6,6 -o "$net_rules" "$net_rules"

  # The target files are replaced wholesale by the generated rules.
  info "应用规则文件..."
  cat "$ipoib_rules" > "$TARGET_IPOIB"
  cat "$net_rules" > "$TARGET_NET"

  info "配置NVIDIA固件选项..."
  echo "options nvidia NVreg_EnableGpuFirmware=0" > "$NVIDIA_CONF"

  # The interface stanzas are appended where the "version: 2" line used to
  # be and that line is re-added after them.
  # NOTE(review): this assumes "ethernets:" is the last mapping in the file
  # and that the file uses the installer's 2-space nesting - confirm.
  info "配置网络接口..."
  sed -i '/version: 2/d' "$NETPLAN_CONF"
  for ((idx = 0; idx < netplan_if_count; idx++)); do
    # Skip interfaces that already have a stanza so re-runs add no duplicates.
    if ! grep -Eq "^[[:space:]]+ib${idx}:[[:space:]]*\$" "$NETPLAN_CONF"; then
      stanzas+="    ib${idx}:"$'\n'"      dhcp4: true"$'\n'
    fi
  done
  printf '%s  version: 2\n' "$stanzas" >> "$NETPLAN_CONF"

  info "应用网络配置..."
  netplan apply
  systemctl restart openibd

  info "更新initramfs..."
  update-initramfs -u &>> /tmp/ib_config_update.log

  info "清理临时文件..."
  rm -rf -- "$workdir"
  success "InfiniBand网络配置安装完成"
}
#######################################
# Put one udev rules file back into its pre-install state.
# If a backup exists it is moved back over the target. Otherwise the target
# is deleted, but only when every line in it matches the given pattern
# (i.e. it contains nothing but rules generated by the installer); a file
# with any other content is left alone.
# Globals:   BACKUP_SUFFIX (read)
# Arguments: $1 - path of the rules file
#            $2 - extended regex matching one installer-generated rule line
#######################################
_restore_or_remove_rules() {
  local target=$1 generated_re=$2
  local backup="$target$BACKUP_SUFFIX"
  local rc

  if [[ -f "$backup" ]]; then
    mv -- "$backup" "$target"
    return
  fi
  [[ -f "$target" ]] || return 0

  # grep -v exits 1 only when no line fails to match the pattern; status 0
  # (foreign content present) or 2 (grep error) must not delete anything.
  grep -Evq -- "$generated_re" "$target"
  rc=$?
  if [[ $rc -eq 1 ]]; then
    rm -f -- "$target"
  fi
}

#######################################
# Uninstall the InfiniBand configuration.
#
# Restores the udev rules, the NVIDIA modprobe file and the netplan file
# from their backups, re-applies netplan when its file was restored, and
# rebuilds the initramfs. Files that have no backup were created by the
# installer: the NVIDIA file is deleted, and a udev rules file is deleted
# when it holds only installer-generated rules.
#
# Globals:   TARGET_IPOIB, TARGET_NET, NVIDIA_CONF, NETPLAN_CONF,
#            BACKUP_SUFFIX (all read)
# Arguments: none
# Outputs:   progress via info/success
#######################################
uninstall_config() {
  info "开始卸载InfiniBand网络配置..."

  info "恢复原始UDev规则..."
  _restore_or_remove_rules "$TARGET_IPOIB" \
    '^ACTION=="add", KERNELS=="[^"]*", SUBSYSTEM=="infiniband", PROGRAM="rdma_rename %k NAME_FIXED mlx5_[0-9]+"$'
  _restore_or_remove_rules "$TARGET_NET" \
    '^SUBSYSTEM=="net", ACTION=="add", KERNELS=="[^"]*", NAME="ib[0-9]+"$'

  info "恢复NVIDIA配置..."
  if [[ -f "$NVIDIA_CONF$BACKUP_SUFFIX" ]]; then
    mv -- "$NVIDIA_CONF$BACKUP_SUFFIX" "$NVIDIA_CONF"
  else
    rm -f -- "$NVIDIA_CONF"
  fi

  # Netplan is re-applied only when its file was actually restored.
  info "恢复网络配置..."
  if [[ -f "$NETPLAN_CONF$BACKUP_SUFFIX" ]]; then
    mv -- "$NETPLAN_CONF$BACKUP_SUFFIX" "$NETPLAN_CONF"
    info "应用原始网络配置..."
    netplan apply
  fi

  info "更新initramfs..."
  update-initramfs -u &>> /tmp/ib_config_restore.log
  success "InfiniBand网络配置已卸载系统恢复到原始状态"
}
# Entry point: exactly one option is expected.
#   --install    run install_config
#   --uninstall  run uninstall_config
#   --help       print the help text and exit with status 0
# A missing argument, extra arguments or an unknown option print the help
# text and exit with status 1.
if [[ $# -ne 1 ]]; then
  usage
fi
case "$1" in
  --install)
    install_config
    ;;
  --uninstall)
    uninstall_config
    ;;
  --help)
    # usage always exits 1; run it in a subshell so that an explicit request
    # for help is reported as success rather than as a failure.
    (usage)
    exit 0
    ;;
  *)
    error "无效选项: $1"
    usage
    ;;
esac