#!/bin/bash
# Forked from yindun/ansible-devops.

# Paths used by this script.
#   TMP_*        - scratch rule files under /tmp
#   TARGET_*     - persistent udev rule files that receive the generated rules
#   NVIDIA_CONF  - modprobe option file written during installation
#   NETPLAN_CONF - netplan file that receives the ibN interface entries
#   BACKUP_SUFFIX is appended to a file's path to name its backup copy.
readonly TMP_IPOIB="/tmp/ipoib_temp.rules"
readonly TMP_NET="/tmp/net_temp.rules"
readonly TARGET_IPOIB="/etc/udev/rules.d/70-persistent-ipoib.rules"
readonly TARGET_NET="/etc/udev/rules.d/70-persistent-net.rules"
readonly NVIDIA_CONF="/etc/modprobe.d/nvidia-gsp.conf"
readonly NETPLAN_CONF="/etc/netplan/00-installer-config.yaml"
readonly BACKUP_SUFFIX=".ibconfig_backup"
# Print an informational message on stdout, wrapped in the bold-blue
# ANSI colour sequence and prefixed with "[INFO] ".
# Arguments: $1 - message text (backslash escapes in it are interpreted)
info() {
  printf '\033[1;34m[INFO] %b\033[0m\n' "$1"
}
# Print a success message on stdout, wrapped in the bold-green
# ANSI colour sequence and prefixed with "[SUCCESS] ".
# Arguments: $1 - message text (backslash escapes in it are interpreted)
success() {
  printf '\033[1;32m[SUCCESS] %b\033[0m\n' "$1"
}
# Print an error message on stderr, wrapped in the bold-red
# ANSI colour sequence and prefixed with "[ERROR] ".
# Arguments: $1 - message text (backslash escapes in it are interpreted)
error() {
  printf '\033[1;31m[ERROR] %b\033[0m\n' "$1" >&2
}
# Print the help text on stdout and terminate the script.
# Outputs: the script title, the invocation synopsis (using $0) and the
#          list of supported options.
# Returns: never returns; always exits with status 1.
usage() {
  printf '%s\n' \
    "InfiniBand网络配置脚本" \
    "用法: $0 [选项]" \
    "选项:" \
    " --install 安装并配置InfiniBand网络" \
    " --uninstall 卸载配置并恢复原始状态" \
    " --help 显示此帮助信息"
  exit 1
}
# Copy a file to "<file>$BACKUP_SUFFIX" unless that backup already exists.
# Keeping the first backup means a repeated installation cannot replace the
# pristine copy with an already-modified file.
# Arguments: $1 - path of the file to back up
# Returns:   0 if nothing had to be done or the copy succeeded,
#            otherwise the exit status of cp.
_ib_backup_once() {
  local src=$1
  local dst="${src}${BACKUP_SUFFIX}"

  if [ ! -f "$src" ] || [ -e "$dst" ]; then
    return 0
  fi
  cp -- "$src" "$dst"
}

# Install the InfiniBand network configuration.
#
# Steps, in order:
#   1. Verify the netplan file exists and detect devices whose
#      `ibdev2netdev -v` line contains "400G" (first column is used as the
#      udev KERNELS value). Nothing on the system is modified before both
#      checks pass.
#   2. Back up the udev rule files, the NVIDIA modprobe file and the netplan
#      file (first run only).
#   3. Strip non-comment lines from the udev rule files and append one rule
#      per device: RDMA devices are renamed mlx5_20, mlx5_21, ... and net
#      interfaces ib0, ib1, ... in detection order.
#   4. Write the NVIDIA modprobe option file.
#   5. Add a dhcp4 entry per generated ibN interface to the netplan file,
#      then run `netplan apply`, restart openibd and rebuild the initramfs.
#
# Globals:   TARGET_IPOIB, TARGET_NET, NVIDIA_CONF, NETPLAN_CONF,
#            BACKUP_SUFFIX (all read)
# Outputs:   progress via info/success, failures via error;
#            update-initramfs output is appended to /tmp/ib_config_update.log
# Returns:   0 on completion; exits the script with status 1 when a
#            precondition fails.
install_config() {
  local workdir pci_list ipoib_rules net_rules kernel file netplan_add
  local -i idx=0 n
  local -r ipoib_base=20 # numeric suffix of the first renamed mlx5_<N> device

  info "开始安装InfiniBand网络配置..."

  # Appending to a missing netplan file would create a malformed one.
  if [ ! -f "$NETPLAN_CONF" ]; then
    error "未找到netplan配置文件: $NETPLAN_CONF"
    exit 1
  fi

  # Private scratch directory instead of fixed names under /tmp, so stale
  # files from an aborted run can never leak into the generated rules.
  workdir=$(mktemp -d) || {
    error "无法创建临时目录"
    exit 1
  }
  pci_list="$workdir/pci_devices"
  ipoib_rules="$workdir/ipoib.rules"
  net_rules="$workdir/net.rules"

  # Detect devices before touching any system file.
  info "检测400G InfiniBand网卡..."
  ibdev2netdev -v | awk '/400G/ {print $1}' > "$pci_list"
  if [ ! -s "$pci_list" ]; then
    rm -rf -- "$workdir"
    error "未找到400G InfiniBand网卡,无法继续配置"
    exit 1
  fi

  info "备份现有配置文件..."
  for file in "$TARGET_IPOIB" "$TARGET_NET" "$NVIDIA_CONF" "$NETPLAN_CONF"; do
    if ! _ib_backup_once "$file"; then
      rm -rf -- "$workdir"
      error "备份失败: $file"
      exit 1
    fi
  done

  info "初始化UDev规则文件..."
  touch "$TARGET_IPOIB" "$TARGET_NET"

  # Drop every non-comment line; comment lines are kept.
  info "清理现有规则..."
  sed -i '/^\s*#/!d' "$TARGET_IPOIB" "$TARGET_NET"

  info "生成IPOIB规则..."
  info "生成网络接口规则..."
  while IFS= read -r kernel; do
    [ -n "$kernel" ] || continue
    printf 'ACTION=="add", KERNELS=="%s", SUBSYSTEM=="infiniband", PROGRAM="rdma_rename %%k NAME_FIXED mlx5_%d"\n' \
      "$kernel" "$((ipoib_base + idx))" >> "$ipoib_rules"
    printf 'SUBSYSTEM=="net", ACTION=="add", KERNELS=="%s", NAME="ib%d"\n' \
      "$kernel" "$idx" >> "$net_rules"
    idx=$((idx + 1))
  done < "$pci_list"

  # Order the rule lines by their KERNELS value. Splitting on '"' puts that
  # value in field 4 of the infiniband rules and field 6 of the net rules.
  # Only line order changes; the names assigned above are not affected.
  info "排序规则文件..."
  LC_ALL=C sort -t '"' -k 4,4 -o "$ipoib_rules" "$ipoib_rules"
  LC_ALL=C sort -t '"' -k 6,6 -o "$net_rules" "$net_rules"

  # Append, so the comment lines kept by the cleanup step survive.
  info "应用规则文件..."
  cat "$ipoib_rules" >> "$TARGET_IPOIB"
  cat "$net_rules" >> "$TARGET_NET"

  info "配置NVIDIA固件选项..."
  echo "options nvidia NVreg_EnableGpuFirmware=0" > "$NVIDIA_CONF"

  # One entry per generated ibN name; names already present in the file are
  # skipped so repeated runs do not duplicate keys.
  # NOTE(review): assumes the interface mapping is the last section of the
  # file and that entries sit at 4 spaces with "version" at 2 spaces -
  # confirm against the target hosts' netplan layout.
  info "配置网络接口..."
  netplan_add=""
  for ((n = 0; n < idx; n++)); do
    if ! grep -Eq "^[[:space:]]+ib${n}:" "$NETPLAN_CONF"; then
      netplan_add+="    ib${n}:"$'\n'"      dhcp4: true"$'\n'
    fi
  done
  # Remove the trailing "version: 2" line and re-add it after the new entries.
  sed -i '/^[[:space:]]*version:[[:space:]]*2[[:space:]]*$/d' "$NETPLAN_CONF"
  printf '%s  version: 2\n' "$netplan_add" >> "$NETPLAN_CONF"

  # The remaining steps are best effort: report a failure but keep going.
  info "应用网络配置..."
  netplan apply || error "netplan apply 执行失败"
  systemctl restart openibd || error "openibd 重启失败"

  info "更新initramfs..."
  update-initramfs -u &>> /tmp/ib_config_update.log \
    || error "update-initramfs 执行失败"

  info "清理临时文件..."
  rm -rf -- "$workdir"

  success "InfiniBand网络配置安装完成"
}
# Remove the InfiniBand network configuration by restoring backups.
#
# For each udev rule file, the "<file>$BACKUP_SUFFIX" copy is moved back
# when it exists; without a backup the file is left as it is. The NVIDIA
# modprobe file is restored from its backup or, when there is none, deleted.
# The netplan file is restored and `netplan apply` is run only when a
# netplan backup exists. Finally the initramfs is rebuilt.
#
# Globals:   TARGET_IPOIB, TARGET_NET, NVIDIA_CONF, NETPLAN_CONF,
#            BACKUP_SUFFIX (all read)
# Outputs:   progress via info/success, failures via error;
#            update-initramfs output is appended to /tmp/ib_config_restore.log
# Returns:   0 on completion (individual failures are reported, not fatal).
uninstall_config() {
  local target backup

  info "开始卸载InfiniBand网络配置..."

  info "恢复原始UDev规则..."
  for target in "$TARGET_IPOIB" "$TARGET_NET"; do
    backup="${target}${BACKUP_SUFFIX}"
    if [ -f "$backup" ]; then
      mv -- "$backup" "$target" || error "恢复失败: $target"
    fi
  done

  info "恢复NVIDIA配置..."
  backup="${NVIDIA_CONF}${BACKUP_SUFFIX}"
  if [ -f "$backup" ]; then
    mv -- "$backup" "$NVIDIA_CONF" || error "恢复失败: $NVIDIA_CONF"
  else
    # No backup exists, so the option file is simply removed.
    rm -f -- "$NVIDIA_CONF"
  fi

  info "恢复网络配置..."
  backup="${NETPLAN_CONF}${BACKUP_SUFFIX}"
  if [ -f "$backup" ]; then
    if mv -- "$backup" "$NETPLAN_CONF"; then
      info "应用原始网络配置..."
      netplan apply || error "netplan apply 执行失败"
    else
      error "恢复失败: $NETPLAN_CONF"
    fi
  fi

  info "更新initramfs..."
  update-initramfs -u &>> /tmp/ib_config_restore.log \
    || error "update-initramfs 执行失败"

  success "InfiniBand网络配置已卸载,系统恢复到原始状态"
}
# Entry point: the script takes exactly one option.
# Any other argument count prints the help text; usage() exits with 1.
if [ "$#" -ne 1 ]; then
  usage
fi

case "$1" in
  --install)
    install_config
    ;;
  --uninstall)
    uninstall_config
    ;;
  --help)
    # usage() always exits with status 1. Run it in a subshell so that an
    # explicit help request still terminates the script successfully.
    (usage)
    exit 0
    ;;
  *)
    error "无效选项: $1"
    usage
    ;;
esac