#!/bin/bash
#
# Install or uninstall the Mellanox MLNX_OFED driver stack and maintain the
# udev naming rules for the IB adapters.
#
# Usage:
#   <script> --install   [--version <driver-version>] [--force]
#   <script> --uninstall [--version <driver-version>]
#
# Environment:
#   MLNX_INSTALL_LOG  file that receives the output of the external tools
#                     (default: /tmp/mlnx_install.log)
#
# The script modifies /opt, /etc/udev/rules.d and /etc/modprobe.d and calls
# apt-get, systemctl and update-initramfs.

set -e

# Terminal colours.
readonly GREEN='\033[1;32m'
readonly RED='\033[1;31m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # reset colour

# Files managed by this script.
readonly IPOIB_RULES="/etc/udev/rules.d/70-persistent-ipoib.rules"
readonly NET_RULES="/etc/udev/rules.d/70-persistent-net.rules"
readonly NVIDIA_GSP_CONF="/etc/modprobe.d/nvidia-gsp.conf"

# First numeric suffix used for the renamed IB devices (mlx5_20, mlx5_21, ...).
readonly IB_ID_BASE=20

# Defaults; ACTION, DRIVER_VERSION and FORCE can be changed by parse_args.
ACTION=""
DRIVER_VERSION="5.8-6.0.4.2"
DISTRO="ubuntu22.04"
ARCH="x86_64"
FORCE=0 # parsed from --force; currently unused because the check in install_driver is disabled
LOG_FILE="${MLNX_INSTALL_LOG:-/tmp/mlnx_install.log}"

#######################################
# Coloured log helpers. $1 is the message; backslash escapes in it are
# interpreted. log_error terminates the script with status 1.
#######################################
log_info() {
  printf '%b\n' "${GREEN}[INFO] $1${NC}"
}

log_error() {
  printf '%b\n' "${RED}[ERROR] $1${NC}"
  exit 1
}

log_warning() {
  printf '%b\n' "${YELLOW}[WARNING] $1${NC}"
}

#######################################
# Derive the package name, local paths and download URLs.
# Globals read:    DRIVER_VERSION, DISTRO, ARCH
# Globals written: DRIVER_PACKAGE, PACKAGE_PATH, DRIVER_DIR,
#                  INTERNAL_URL, OFFICIAL_URL
#######################################
generate_package_info() {
  DRIVER_PACKAGE="MLNX_OFED_LINUX-${DRIVER_VERSION}-${DISTRO}-${ARCH}.tgz"
  PACKAGE_PATH="/opt/${DRIVER_PACKAGE}"
  DRIVER_DIR="/opt/MLNX_OFED_LINUX-${DRIVER_VERSION}-${DISTRO}-${ARCH}"
  INTERNAL_URL="http://10.101.0.51:5588/mlnx-ofed/${DRIVER_PACKAGE}"
  OFFICIAL_URL="https://example.com/${DRIVER_PACKAGE}" # placeholder: replace with the real vendor URL
}

#######################################
# Parse the command line.
# Arguments: the script's arguments
#   --install | --uninstall   select the action (required)
#   --version <ver>           override DRIVER_VERSION
#   --force                   set FORCE=1
# Globals written: ACTION, DRIVER_VERSION, FORCE (and the package info
#                  variables when --version is given)
# Exits with status 1 on an unknown option, a missing version value or a
# missing action.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        ACTION="install"
        shift
        ;;
      --uninstall)
        ACTION="uninstall"
        shift
        ;;
      --version)
        # Reject a missing value and also a following option, so that
        # "--version --install" is not taken as version "--install".
        if [[ -z "${2:-}" || "$2" == --* ]]; then
          log_error "请指定版本号,如: --version 5.8-6.0.4.2"
        fi
        DRIVER_VERSION="$2"
        generate_package_info
        shift 2
        ;;
      --force)
        FORCE=1
        shift
        ;;
      *)
        log_error "未知参数: $1"
        ;;
    esac
  done

  if [[ -z "$ACTION" ]]; then
    log_error "请指定操作: --install 或 --uninstall"
  fi
}

#######################################
# Make sure the driver tarball exists at PACKAGE_PATH, downloading it from
# INTERNAL_URL and then OFFICIAL_URL when no non-empty local copy is present.
# Exits with status 1 when both downloads fail.
#######################################
download_driver() {
  log_info "开始下载驱动包: $DRIVER_PACKAGE"

  # -s instead of -f: a zero-byte file is not a usable package.
  if [[ -s "$PACKAGE_PATH" ]]; then
    log_info "使用本地驱动包: $PACKAGE_PATH"
    return 0
  fi

  log_info "本地包不存在,尝试从内网下载"

  # wget -O creates the output file even when the transfer fails, so download
  # to a side file and only move it into place after a successful transfer.
  local partial="${PACKAGE_PATH}.part"
  if wget -q -O "$partial" "$INTERNAL_URL"; then
    log_info "内网下载成功"
  else
    log_warning "内网下载失败,尝试从官网下载"
    if wget -q -O "$partial" "$OFFICIAL_URL"; then
      log_info "官网下载成功"
    else
      rm -f -- "$partial"
      log_error "驱动包下载失败,请手动放置到 /opt/"
    fi
  fi

  mv -f -- "$partial" "$PACKAGE_PATH"
}

#######################################
# Print one line "<col1> <col2> <netdev>" per adapter reported by
# `ibdev2netdev -v`, skipping lines that contain "200G" or "25G".
# <netdev> is the word following "==>" or "-" when there is none.
# NOTE(review): assumes the verbose format
#   "<pci-address> <ib-device> (...) ... ==> <netdev> (<state>)" -- confirm
#   against the installed MLNX_OFED version.
#######################################
list_ib_devices() {
  ibdev2netdev -v | awk '
    /200G/ || /25G/ || !NF { next }
    {
      netdev = "-"
      for (i = 1; i < NF; i++) {
        if ($i == "==>") {
          netdev = $(i + 1)
        }
      }
      print $1, $2, netdev
    }'
}

#######################################
# Delete every non-comment line from a udev rules file.
# Arguments: $1 - path of the rules file
# A missing file is not an error; a failing sed only produces a warning.
#######################################
strip_udev_rules() {
  local rules_file=$1

  [[ -f "$rules_file" ]] || return 0
  sed -i '/^\s*#/!d' "$rules_file" &>> "$LOG_FILE" \
    || log_warning "无法清除规则文件: $rules_file"
}

#######################################
# Install build dependencies, unpack the tarball and run mlnxofedinstall for
# the running kernel. Tool output goes to LOG_FILE.
# Exits with status 1 when any step fails.
#######################################
install_driver() {
  log_info "开始安装驱动: $DRIVER_VERSION"

  # The "already installed" check is disabled, so --force has no effect here.
  # if command -v ibv_devinfo &> /dev/null && [[ $FORCE -eq 0 ]]; then
  #   log_warning "检测到驱动已安装,使用 --force 覆盖安装"
  #   exit 0
  # fi

  local kernel_version
  kernel_version=$(uname -r)
  log_info "当前内核版本: $kernel_version"

  log_info "安装依赖包"
  # apt-get rather than apt: apt warns that its CLI is not stable for scripts.
  apt-get update &>> "$LOG_FILE" \
    || log_error "apt-get update 失败,详见 $LOG_FILE"
  apt-get install -y net-tools bzip2 &>> "$LOG_FILE" \
    || log_error "依赖包安装失败,详见 $LOG_FILE"

  log_info "解压驱动包"
  tar -zxf "$PACKAGE_PATH" -C /opt/ || log_error "解压失败: $PACKAGE_PATH"

  log_info "执行驱动安装"
  # Subshell keeps the script's working directory unchanged. "&&" is needed
  # because set -e is inactive on the left-hand side of "||".
  (
    cd "$DRIVER_DIR" \
      && ./mlnxofedinstall --without-dkms --add-kernel-support \
        --kernel "$kernel_version" --with-fw-update --force
  ) &>> "$LOG_FILE" || log_error "驱动安装失败,详见 $LOG_FILE"

  sleep 10
}

#######################################
# Run the bundled uninstall.sh (re-downloading and unpacking the package if
# DRIVER_DIR is missing), remove the unpacked directory and tarball, stop and
# disable openibd, and strip the generated udev rules.
#######################################
uninstall_driver() {
  log_info "开始卸载驱动: $DRIVER_VERSION"

  # uninstall.sh lives inside the unpacked package, so fetch it again if needed.
  if [[ ! -d "$DRIVER_DIR" ]]; then
    log_warning "驱动目录不存在,尝试重新下载和解压"
    download_driver
    log_info "解压驱动包"
    tar -zxf "$PACKAGE_PATH" -C /opt/ \
      || log_error "解压失败,无法找到驱动目录: $DRIVER_DIR"
    if [[ ! -d "$DRIVER_DIR" ]]; then
      log_error "解压失败,无法找到驱动目录: $DRIVER_DIR"
    fi
    log_info "成功解压驱动包到: $DRIVER_DIR"
  else
    log_info "找到驱动目录: $DRIVER_DIR"
  fi

  log_info "执行卸载脚本"
  # Subshell: the directory is deleted below and must not be our cwd.
  (cd "$DRIVER_DIR" && ./uninstall.sh -q -y) &>> "$LOG_FILE" \
    || log_warning "卸载脚本执行失败,尝试手动清理"

  log_info "清理残留文件"
  # ":?" aborts instead of running rm -rf with an empty path.
  rm -rf -- "${DRIVER_DIR:?}" "${PACKAGE_PATH:?}"

  log_info "停止并禁用openibd服务"
  # Best effort: the service may already be gone after uninstall.sh.
  systemctl stop openibd.service &>> "$LOG_FILE" || true
  systemctl disable openibd.service &>> "$LOG_FILE" || true

  log_info "恢复网卡命名规则"
  strip_udev_rules "$IPOIB_RULES"
  strip_udev_rules "$NET_RULES"
  rm -f -- "$NVIDIA_GSP_CONF"
  update-initramfs -u &>> "$LOG_FILE" \
    || log_error "update-initramfs 执行失败,详见 $LOG_FILE"
}

#######################################
# Back up and regenerate the udev rules that rename the selected adapters to
# mlx5_<IB_ID_BASE+n> (infiniband subsystem) and ib<n> (net subsystem), write
# the nvidia module option file, rebuild the initramfs and restart openibd.
#######################################
configure_naming_rules() {
  log_info "配置IB网卡命名规则"

  log_info "备份原有规则"
  # Best effort: the rule files may not exist yet.
  cp "$IPOIB_RULES" "${IPOIB_RULES}.bak" &>> "$LOG_FILE" || true
  cp "$NET_RULES" "${NET_RULES}.bak" &>> "$LOG_FILE" || true

  log_info "清除原有规则"
  strip_udev_rules "$IPOIB_RULES"
  strip_udev_rules "$NET_RULES"

  # Query the adapters once and use the same list for both rule files.
  local devices
  devices=$(list_ib_devices) || true
  if [[ -z "$devices" ]]; then
    log_warning "未检测到需要配置的IB设备"
  fi

  local kernel_id index

  log_info "生成IB设备命名规则"
  index=0
  while read -r kernel_id _; do
    [[ -n "$kernel_id" ]] || continue
    printf 'ACTION=="add", KERNELS=="%s", SUBSYSTEM=="infiniband",PROGRAM="rdma_rename %%k NAME_FIXED mlx5_%d"\n' \
      "$kernel_id" "$((IB_ID_BASE + index))" >> "$IPOIB_RULES"
    index=$((index + 1))
  done <<< "$devices"

  log_info "生成网络设备命名规则"
  index=0
  while read -r kernel_id _; do
    [[ -n "$kernel_id" ]] || continue
    printf 'SUBSYSTEM=="net", ACTION=="add", KERNELS=="%s", NAME="ib%d"\n' \
      "$kernel_id" "$index" >> "$NET_RULES"
    index=$((index + 1))
  done <<< "$devices"

  log_info "配置nvidia选项"
  echo "options nvidia NVreg_EnableGpuFirmware=0" > "$NVIDIA_GSP_CONF"
  update-initramfs -u &>> "$LOG_FILE" \
    || log_error "update-initramfs 执行失败,详见 $LOG_FILE"

  log_info "重启openibd服务"
  systemctl restart openibd.service || log_error "openibd服务重启失败"
  sleep 15
}

#######################################
# Verify that ibv_devinfo is available (exits with status 1 otherwise) and
# report how many of the selected adapters carry a name assigned by the
# generated rules: IB device mlx5_<n> with n >= IB_ID_BASE, or netdev ib<n>.
#######################################
check_installation() {
  log_info "检查驱动安装结果"
  if command -v ibv_devinfo &> /dev/null; then
    log_info "驱动安装成功"
  else
    log_error "驱动安装失败"
  fi

  log_info "检查网卡命名规则"
  local valid_count=0
  local kernel_id ib_name net_name renamed

  # The first column is what the udev rules match with KERNELS==, so it is
  # not an interface name; compare the names ibdev2netdev itself reports.
  while read -r kernel_id ib_name net_name; do
    [[ -n "$kernel_id" ]] || continue

    renamed=0
    if [[ "$ib_name" =~ ^mlx5_([0-9]+)$ ]] \
      && ((10#${BASH_REMATCH[1]} >= IB_ID_BASE)); then
      renamed=1
    elif [[ "$net_name" =~ ^ib[0-9]+$ ]]; then
      renamed=1
    fi

    if ((renamed)); then
      valid_count=$((valid_count + 1))
    else
      log_warning "网卡 $kernel_id 命名规则未生效"
    fi
  done < <(list_ib_devices)

  if ((valid_count > 0)); then
    log_info "网卡命名规则生效,成功配置 $valid_count 个网卡"
  else
    log_warning "所有网卡命名规则均未生效,建议重启系统手工配置!"
  fi
}

#######################################
# Report whether ibv_devinfo and DRIVER_DIR are gone after an uninstall.
# Only logs; never fails.
#######################################
check_uninstallation() {
  log_info "检查卸载结果"

  if command -v ibv_devinfo &> /dev/null; then
    log_warning "驱动命令仍存在,可能需要手动清理"
  else
    log_info "驱动已成功卸载"
  fi

  if [[ -d "$DRIVER_DIR" ]]; then
    log_warning "驱动目录未完全删除: $DRIVER_DIR"
  else
    log_info "驱动目录已删除"
  fi
}

#######################################
# Entry point: compute the package info, parse the arguments and run the
# install or uninstall sequence.
# Arguments: the script's arguments
#######################################
main() {
  generate_package_info
  parse_args "$@"

  log_info "开始执行 ${ACTION^^} 操作,版本: $DRIVER_VERSION"

  case "$ACTION" in
    install)
      download_driver
      install_driver
      configure_naming_rules
      check_installation
      ;;
    uninstall)
      uninstall_driver
      check_uninstallation
      ;;
  esac

  log_info "操作完成!"
}

main "$@"