ansible-devops/scripts/ib.sh

193 lines
5.2 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Paths of the files this script works with.

# Directory holding the persistent udev rule files.
UDEV_RULES_DIR="/etc/udev/rules.d"

# Scratch file paths under /tmp.
TMP_IPOIB="/tmp/ipoib_temp.rules"
TMP_NET="/tmp/net_temp.rules"

# udev rule files for the InfiniBand devices and the network interfaces.
TARGET_IPOIB="${UDEV_RULES_DIR}/70-persistent-ipoib.rules"
TARGET_NET="${UDEV_RULES_DIR}/70-persistent-net.rules"

# modprobe options file for the nvidia module, and the netplan file to edit.
NVIDIA_CONF="/etc/modprobe.d/nvidia-gsp.conf"
NETPLAN_CONF="/etc/netplan/00-installer-config.yaml"

# Suffix appended to a file's path to name its backup copy.
BACKUP_SUFFIX=".ibconfig_backup"
# Print an informational message to stdout in bold blue.
# Arguments: $1 - message text; printed verbatim (backslash sequences in
#                 the message are NOT interpreted, so paths or user input
#                 containing "\t", "\n", "\c" ... come out unchanged).
info() {
  printf '\033[1;34m[INFO] %s\033[0m\n' "$1"
}
# Print a success message to stdout in bold green.
# Arguments: $1 - message text; printed verbatim (backslash sequences in
#                 the message are NOT interpreted).
success() {
  printf '\033[1;32m[SUCCESS] %s\033[0m\n' "$1"
}
# Print an error message to stderr in bold red.
# Arguments: $1 - message text; printed verbatim. The message may embed
#                 raw user input (e.g. an unrecognised command-line
#                 option), so backslash sequences must not be interpreted.
error() {
  printf '\033[1;31m[ERROR] %s\033[0m\n' "$1" >&2
}
# Print the command-line help text to stdout, then terminate the script
# with exit status 1.
usage() {
  printf '%s\n' \
    "InfiniBand网络配置脚本" \
    "用法: $0 [选项]" \
    "选项:" \
    " --install 安装并配置InfiniBand网络" \
    " --uninstall 卸载配置并恢复原始状态" \
    " --help 显示此帮助信息"
  exit 1
}
#######################################
# Install the InfiniBand configuration for every adapter that
# `ibdev2netdev -v` reports on a line containing "400G".
#
# Steps, in order:
#   1. Verify the netplan file exists and detect the adapters' PCI
#      addresses. Nothing on disk is modified until both checks pass.
#   2. Back up each file about to be changed. An existing backup is never
#      overwritten, so re-running the installer keeps the first saved copy
#      for uninstall_config to restore.
#   3. Replace the two udev rule files with generated rules: RDMA devices
#      become mlx5_20, mlx5_21, ... and network interfaces ib0, ib1, ...
#      numbered in detection order.
#   4. Write the nvidia module option NVreg_EnableGpuFirmware=0.
#   5. Add one "dhcp4: true" entry per detected interface to the netplan
#      file (skipping entries already present), re-append "version: 2",
#      then run `netplan apply`, restart openibd and update the initramfs.
#
# Globals:   TARGET_IPOIB, TARGET_NET, NVIDIA_CONF, NETPLAN_CONF,
#            BACKUP_SUFFIX (all read)
# Arguments: none
# Outputs:   progress through info/success/error; update-initramfs output
#            is appended to /tmp/ib_config_update.log
# Returns:   0 on success; exits the script with status 1 when a
#            prerequisite is missing or any step fails
#######################################
install_config() {
  local -r first_rdma_id=20
  local -r update_log="/tmp/ib_config_update.log"
  local -a pci_addrs=()
  local conf i
  local netplan_entries=""
  local failed=0

  info "开始安装InfiniBand网络配置..."

  # Without an existing netplan file the append further down would create
  # a fragment with no top-level structure, so refuse up front.
  if [[ ! -f "$NETPLAN_CONF" ]]; then
    error "未找到netplan配置文件: $NETPLAN_CONF"
    exit 1
  fi

  # Detect first: a host without matching adapters must be left untouched.
  info "检测400G InfiniBand网卡..."
  mapfile -t pci_addrs < <(ibdev2netdev -v | awk '/400G/ {print $1}')
  if (( ${#pci_addrs[@]} == 0 )); then
    error "未找到400G InfiniBand网卡无法继续配置"
    exit 1
  fi

  # Back up only when no backup exists yet; otherwise a second install
  # would replace the saved original with an already-modified file.
  info "备份现有配置文件..."
  for conf in "$TARGET_IPOIB" "$TARGET_NET" "$NVIDIA_CONF" "$NETPLAN_CONF"; do
    if [[ -f "$conf" && ! -e "${conf}${BACKUP_SUFFIX}" ]]; then
      if ! cp -- "$conf" "${conf}${BACKUP_SUFFIX}"; then
        error "备份失败: $conf"
        exit 1
      fi
    fi
  done

  # The rule files are written directly and replaced wholesale, so no
  # scratch files under /tmp are needed. Rules stay in detection order;
  # each rule matches its own KERNELS value, so line order in the file
  # should not affect which device gets which name.
  info "生成IPOIB规则..."
  for i in "${!pci_addrs[@]}"; do
    printf 'ACTION=="add", KERNELS=="%s", SUBSYSTEM=="infiniband", PROGRAM="rdma_rename %%k NAME_FIXED mlx5_%d"\n' \
      "${pci_addrs[i]}" "$((first_rdma_id + i))"
  done > "$TARGET_IPOIB" || {
    error "写入规则文件失败: $TARGET_IPOIB"
    exit 1
  }

  info "生成网络接口规则..."
  for i in "${!pci_addrs[@]}"; do
    printf 'SUBSYSTEM=="net", ACTION=="add", KERNELS=="%s", NAME="ib%d"\n' \
      "${pci_addrs[i]}" "$i"
  done > "$TARGET_NET" || {
    error "写入规则文件失败: $TARGET_NET"
    exit 1
  }

  info "配置NVIDIA固件选项..."
  printf '%s\n' "options nvidia NVreg_EnableGpuFirmware=0" > "$NVIDIA_CONF" || {
    error "写入配置文件失败: $NVIDIA_CONF"
    exit 1
  }

  # Build the netplan additions before touching the file. One entry per
  # detected adapter; entries already present are skipped so a re-run does
  # not create duplicate keys.
  # NOTE(review): this assumes the layout written by the Ubuntu installer
  # (2-space indentation, "ethernets:" as the last mapping before
  # "version: 2") - confirm for hosts with a customised netplan file.
  info "配置网络接口..."
  for i in "${!pci_addrs[@]}"; do
    if ! grep -Eq "^[[:space:]]+ib${i}:" "$NETPLAN_CONF"; then
      netplan_entries+="    ib${i}:"$'\n'"      dhcp4: true"$'\n'
    fi
  done
  netplan_entries+="  version: 2"$'\n'
  # Drop the trailing "version: 2" so the new interface entries land inside
  # the preceding mapping, then put it back as the last line.
  if ! sed -i '/version: 2/d' "$NETPLAN_CONF" ||
    ! printf '%s' "$netplan_entries" >> "$NETPLAN_CONF"; then
    error "更新netplan配置失败: $NETPLAN_CONF"
    exit 1
  fi

  # From here on every step is attempted even if an earlier one fails, but
  # a failure is reported and reflected in the final status.
  info "应用网络配置..."
  if ! netplan apply; then
    error "netplan apply 执行失败"
    failed=1
  fi
  if ! systemctl restart openibd; then
    error "重启openibd服务失败"
    failed=1
  fi

  info "更新initramfs..."
  if ! update-initramfs -u &>> "$update_log"; then
    error "更新initramfs失败: 详见 $update_log"
    failed=1
  fi

  if (( failed )); then
    error "InfiniBand网络配置未全部成功: 请检查上述错误"
    exit 1
  fi
  success "InfiniBand网络配置安装完成"
}
# Revert the files changed by the installer from their backup copies
# (<file>$BACKUP_SUFFIX), then refresh the initramfs.
#   - udev rule files: restored only when a backup exists.
#   - nvidia modprobe file: restored from backup, or deleted when there is
#     no backup.
#   - netplan file: restored and re-applied only when a backup exists.
# update-initramfs output is appended to /tmp/ib_config_restore.log.
uninstall_config() {
  local rules_file

  info "开始卸载InfiniBand网络配置..."

  # Move each saved udev rule file back over the current one.
  info "恢复原始UDev规则..."
  for rules_file in "$TARGET_IPOIB" "$TARGET_NET"; do
    if [[ -f "${rules_file}${BACKUP_SUFFIX}" ]]; then
      mv "${rules_file}${BACKUP_SUFFIX}" "${rules_file}"
    fi
  done

  # No backup means there is nothing to restore, so the file is removed.
  info "恢复NVIDIA配置..."
  if [[ ! -f "${NVIDIA_CONF}${BACKUP_SUFFIX}" ]]; then
    rm -f "${NVIDIA_CONF}"
  else
    mv "${NVIDIA_CONF}${BACKUP_SUFFIX}" "${NVIDIA_CONF}"
  fi

  # netplan is only re-applied when the saved file was actually restored.
  info "恢复网络配置..."
  if [[ -f "${NETPLAN_CONF}${BACKUP_SUFFIX}" ]]; then
    mv "${NETPLAN_CONF}${BACKUP_SUFFIX}" "${NETPLAN_CONF}"
    info "应用原始网络配置..."
    netplan apply
  fi

  # Rebuild the initramfs, appending stdout and stderr to the log.
  info "更新initramfs..."
  update-initramfs -u >> /tmp/ib_config_restore.log 2>&1

  success "InfiniBand网络配置已卸载系统恢复到原始状态"
}
# Entry point: require exactly one command-line option and dispatch on it.
main() {
  # Any argument count other than one prints the help text (usage exits).
  [[ $# -eq 1 ]] || usage

  case "$1" in
    --install) install_config ;;
    --uninstall) uninstall_config ;;
    --help) usage ;;
    *)
      # Unknown option: report it on stderr, then show the help text.
      error "无效选项: $1"
      usage
      ;;
  esac
}

main "$@"