#!/bin/bash
#
# MLNX_OFED driver management script.
# Locates (or downloads) the MLNX_OFED tarball for a given driver version and
# Ubuntu release, installs or uninstalls it, and configures IB device naming.
# Must be run as root; see show_help for usage.
set -euo pipefail
IFS=$'\n\t'

#================================
# Global configuration
#================================
LOG_FILE="/var/log/mlnx_driver_install_$(date +%Y%m%d%H%M%S).log"
DRIVER_NAME="MLNX_OFED"
APP_VERSION=""   # no default: must be supplied with --version
ARCH="x86_64"
DISTRO=""        # no default: must be supplied with --distro
APP_DIR="/opt"
FORCE=0

# Distributions this script knows how to handle
SUPPORTED_DISTROS=("ubuntu22.04" "ubuntu24.04")

# Color escape sequences (interpreted by `echo -e`)
GREEN='\033[1;32m'    # success
RED='\033[1;31m'      # failure / error
BLUE='\033[1;34m'     # titles / info
YELLOW='\033[1;33m'   # warnings / progress
NC='\033[0m'          # reset

#================================
# Logging
#================================
#######################################
# Print a colorized message to stdout and append a plain copy to LOG_FILE.
# Globals:   GREEN, RED, BLUE, YELLOW, NC, LOG_FILE (read)
# Arguments: $1 - level (SUCCESS, ERROR, WARN; any other level prints blue)
#            $2 - message text (may embed the color variables above)
# Returns:   always 0
#######################################
log() {
  local level="$1"
  local message="${2:-}"
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  # Colorized console output
  case "$level" in
    "SUCCESS") echo -e "${GREEN}[${level}]${NC} $message" ;;
    "ERROR")   echo -e "${RED}[${level}]${NC} $message" ;;
    "WARN")    echo -e "${YELLOW}[${level}]${NC} $message" ;;
    *)         echo -e "${BLUE}[${level}]${NC} $message" ;;
  esac

  # Callers embed color variables in messages; strip them so the log file
  # does not fill up with literal "\033[...m" sequences.
  local plain="$message"
  local code
  for code in "$GREEN" "$RED" "$BLUE" "$YELLOW" "$NC"; do
    plain=${plain//"$code"/}
  done

  # Intentionally best-effort: an unwritable log file (e.g. when started
  # without root) must not abort the script under `set -e`.
  { echo "[$timestamp] [$level] $plain" >> "$LOG_FILE"; } 2>/dev/null || true
}

#================================
# Error handling
#================================
#######################################
# Log an error plus a pointer to the log file, then exit the script.
# Arguments: $1 - error message
# Returns:   never returns; exits with status 1
#######################################
error() {
  local message="$1"
  log "ERROR" "$message"
  log "ERROR" "详细日志请查看: $LOG_FILE"
  exit 1
}

#================================
# Command execution
#================================
#######################################
# Run a shell command string, capturing stdout and stderr.
# NOTE: the string is passed to `eval`; callers must only pass trusted,
# properly quoted command text.
# Arguments: $1 - command string
#            $2 - description logged before running (optional)
# Returns:   0 on success; 1 on failure (command and its output are logged)
#######################################
run_cmd() {
  local command="$1"
  local description="${2:-执行命令}"
  local output

  log "INFO" "$description"

  if ! output=$(eval "$command" 2>&1); then
    log "ERROR" "命令执行失败: $command"
    log "ERROR" "错误详情: $output"
    return 1
  fi

  # Keep the captured output of successful commands in the log file too.
  if [[ -n "$output" ]]; then
    { printf '%s\n' "$output" >> "$LOG_FILE"; } 2>/dev/null || true
  fi
  return 0
}

#================================
# Network connectivity test
#================================
#######################################
# Check that a URL is reachable with an HTTP HEAD request.
# Arguments: $1 - URL to probe
# Returns:   0 if curl succeeds, 1 otherwise (a warning is logged)
#######################################
test_network() {
  local url="$1"
  log "INFO" "测试网络连接: $url"

  if ! curl -fsSLI --connect-timeout 5 "$url" &>> "$LOG_FILE"; then
    log "WARN" "网络源不可用: $url"
    return 1
  fi
  return 0
}

#================================
# File download with progress display (curl)
#================================
#######################################
# Print the last non-empty progress update from a curl progress capture.
# curl separates updates with carriage returns, so they are split first.
# Arguments: $1 - file holding curl's stderr
#######################################
_last_progress_line() {
  tr '\r' '\n' < "$1" | awk 'NF { last = $0 } END { print last }'
}

#######################################
# Download a URL to a destination path, showing curl's progress bar.
# The data is written to "<dest>.part" and only moved to <dest> after the
# transfer succeeded and the size check passed, so a broken download can
# never be mistaken for a valid local package on the next run.
# Arguments: $1 - source URL
#            $2 - destination file path
# Returns:   0 on success; exits via error() on failure
#######################################
download_file() {
  local url="$1"
  local dest="$2"
  local partial="${dest}.part"
  local -r min_bytes=10485760   # 10 MiB; anything smaller is not a real package
  local progress_file curl_pid last_line file_size
  local exit_status=0

  log "INFO" "开始下载: $url"

  # Temporary file that receives curl's progress output
  progress_file=$(mktemp) || error "无法创建临时文件"

  echo -e "${YELLOW}[DOWNLOAD]${NC} 下载进度:"

  # Run the download in the background so progress can be polled.
  # -f makes HTTP errors (404, 500, ...) fail instead of saving an error page.
  curl -f -# -L -o "$partial" "$url" 2> "$progress_file" &
  curl_pid=$!

  while kill -0 "$curl_pid" 2>/dev/null; do
    if [[ -s "$progress_file" ]]; then
      last_line=$(_last_progress_line "$progress_file") || true
      echo -ne "\r${YELLOW}[DOWNLOAD]${NC} $last_line"
    fi
    sleep 2
  done

  # `wait` returns curl's exit status; the `||` keeps `set -e` from killing
  # the script before the failure can be reported and cleaned up.
  wait "$curl_pid" || exit_status=$?

  # Show the final progress line (or curl's error message)
  if [[ -s "$progress_file" ]]; then
    last_line=$(_last_progress_line "$progress_file") || true
    echo -e "\r${YELLOW}[DOWNLOAD]${NC} $last_line"
  fi
  rm -f -- "$progress_file"

  if (( exit_status != 0 )); then
    rm -f -- "$partial"
    error "下载失败: $url (错误码: $exit_status)"
  fi

  echo -e "\r${YELLOW}[DOWNLOAD]${NC} 下载完成! "

  # Sanity check: reject files smaller than min_bytes
  file_size=$(stat -c%s "$partial")
  if (( file_size < min_bytes )); then
    rm -f -- "$partial"
    log "ERROR" "下载的文件大小异常: $file_size 字节"
    log "ERROR" "请检查网络连接或下载源的可用性"
    error "建议手动下载后放置到${APP_DIR}目录"
  fi

  mv -f -- "$partial" "$dest"
  log "INFO" "下载完成: $dest"
}

#================================
# Package information
#================================
#######################################
# Validate APP_VERSION / DISTRO and derive the package names and paths.
# Globals:   APP_VERSION, DISTRO, SUPPORTED_DISTROS, DRIVER_NAME, ARCH,
#            APP_DIR (read); DRIVER_PACKAGE, PACKAGE_PATH, DRIVER_DIR (written)
# Returns:   0 on success; exits via error() on missing/unsupported input
#######################################
generate_package_info() {
  local supported supported_list
  local found=false

  # Both parameters are mandatory
  if [[ -z "$APP_VERSION" ]]; then
    error "必须指定驱动版本,请使用 --version 参数"
  fi
  if [[ -z "$DISTRO" ]]; then
    error "必须指定系统版本,请使用 --distro 参数"
  fi

  for supported in "${SUPPORTED_DISTROS[@]}"; do
    if [[ "$supported" == "$DISTRO" ]]; then
      found=true
      break
    fi
  done

  if [[ "$found" == "false" ]]; then
    # The global IFS starts with a newline, so "${arr[*]}" would put every
    # entry on its own line; join with spaces explicitly.
    supported_list=$(IFS=' '; echo "${SUPPORTED_DISTROS[*]}")
    error "不支持的系统版本: $DISTRO。支持的系统: $supported_list"
  fi

  DRIVER_PACKAGE="${DRIVER_NAME}_LINUX-${APP_VERSION}-${DISTRO}-${ARCH}.tgz"
  PACKAGE_PATH="${APP_DIR}/${DRIVER_PACKAGE}"
  DRIVER_DIR="${APP_DIR}/${DRIVER_NAME}_LINUX-${APP_VERSION}-${DISTRO}-${ARCH}"

  log "INFO" "生成包信息: $DRIVER_PACKAGE"
}

#================================
# Download source list
#================================
#######################################
# Fill DOWNLOAD_SOURCES with candidate package locations in priority order.
# Entries starting with "/" are local paths; the rest are URLs.
# Globals:   PACKAGE_PATH, DRIVER_PACKAGE, APP_VERSION (read);
#            DOWNLOAD_SOURCES (written)
#######################################
generate_download_urls() {
  DOWNLOAD_SOURCES=(
    "$PACKAGE_PATH"                                                                  # local file first
    "http://10.101.0.51:5588/mlnx-ofed/${DRIVER_PACKAGE}"                            # intranet mirror 1
    "http://10.102.32.207:5588/mlnx-ofed/${DRIVER_PACKAGE}"                          # intranet mirror 2
    "https://content.mellanox.com/ofed/MLNX_OFED-${APP_VERSION}/${DRIVER_PACKAGE}"   # vendor site
  )
}

#================================
# Driver package acquisition
#================================
#######################################
# Pick the first usable source and make the package available locally.
# Globals:   DOWNLOAD_SOURCES, PACKAGE_PATH, APP_DIR (read);
#            DOWNLOAD_URL, TEMP_FILE (written)
# Returns:   0 on success; exits via error() if no source is usable
#######################################
download_driver() {
  local candidate

  log "TITLE" "${BLUE}开始获取驱动安装包${NC}"

  generate_download_urls

  DOWNLOAD_URL=""
  log "INFO" "开始查找可用的下载源..."

  for candidate in "${DOWNLOAD_SOURCES[@]}"; do
    if [[ "$candidate" == /* ]]; then
      # Local file
      log "INFO" "检查本地文件: $candidate"
      if [[ -f "$candidate" ]]; then
        DOWNLOAD_URL="$candidate"
        log "INFO" "${BLUE}找到本地文件: $DOWNLOAD_URL${NC}"
        break
      fi
      log "WARN" "本地文件不存在: $candidate"
    else
      # Network URL; test_network logs its own warning when unreachable
      log "INFO" "检查网络源: $candidate"
      if test_network "$candidate"; then
        DOWNLOAD_URL="$candidate"
        log "INFO" "${BLUE}找到可用下载源: $DOWNLOAD_URL${NC}"
        break
      fi
    fi
  done

  if [[ -z "$DOWNLOAD_URL" ]]; then
    log "ERROR" "无法找到可用的下载源"
    log "ERROR" "请检查网络连接或手动下载安装包到${APP_DIR}目录"
    error "官网下载地址: ${DOWNLOAD_SOURCES[-1]}"
  fi

  # The package always ends up at PACKAGE_PATH, whether found or downloaded
  TEMP_FILE="$PACKAGE_PATH"
  log "INFO" "准备获取驱动包..."

  if [[ "$DOWNLOAD_URL" == /* ]]; then
    log "INFO" "使用本地文件: $DOWNLOAD_URL"
  else
    log "INFO" "从网络下载: $DOWNLOAD_URL"
    download_file "$DOWNLOAD_URL" "$TEMP_FILE"
  fi
}

#================================
# Driver installation
#================================
#######################################
# Install distro dependencies, unpack the package and run mlnxofedinstall
# against the running kernel.
# Globals:   APP_VERSION, DISTRO, TEMP_FILE, APP_DIR, DRIVER_DIR (read)
# Returns:   0 on success; a failing step aborts the script (set -e / error)
#######################################
install_driver() {
  local kernel_version
  local q_pkg q_app_dir q_driver_dir q_kernel

  log "TITLE" "${BLUE}开始安装驱动: $APP_VERSION${NC}"

  kernel_version=$(uname -r)
  log "INFO" "当前内核版本: $kernel_version"

  # run_cmd evals its argument, so shell-quote every interpolated value
  printf -v q_pkg '%q' "$TEMP_FILE"
  printf -v q_app_dir '%q' "$APP_DIR"
  printf -v q_driver_dir '%q' "$DRIVER_DIR"
  printf -v q_kernel '%q' "$kernel_version"

  # Distro-specific dependencies. apt-get is used instead of apt because
  # apt's command-line interface is not meant for scripts.
  case "$DISTRO" in
    "ubuntu22.04")
      log "INFO" "安装Ubuntu 22.04依赖包"
      run_cmd "apt-get update" "更新软件包索引"
      run_cmd "DEBIAN_FRONTEND=noninteractive apt-get install -y net-tools bzip2" \
        "安装依赖包"
      ;;
    "ubuntu24.04")
      log "INFO" "安装Ubuntu 24.04依赖包"
      run_cmd "apt-get update" "更新软件包索引"
      run_cmd "DEBIAN_FRONTEND=noninteractive apt-get install -y net-tools bzip2 dkms" \
        "安装依赖包(包含dkms)"
      ;;
    *)
      error "不支持的系统版本: $DISTRO"
      ;;
  esac

  log "INFO" "解压驱动包"
  run_cmd "tar -zxf $q_pkg -C $q_app_dir" "解压驱动包"

  log "INFO" "执行驱动安装"
  case "$DISTRO" in
    "ubuntu22.04")
      run_cmd "cd $q_driver_dir && ./mlnxofedinstall --without-dkms --add-kernel-support --kernel $q_kernel --with-fw-update --force" \
        "执行驱动安装"
      ;;
    "ubuntu24.04")
      # Ubuntu 24.04 is installed without --without-dkms
      run_cmd "cd $q_driver_dir && ./mlnxofedinstall --add-kernel-support --kernel $q_kernel --with-fw-update --force" \
        "执行驱动安装(Ubuntu 24.04)"
      ;;
    *)
      error "不支持的系统版本: $DISTRO"
      ;;
  esac

  # NOTE(review): fixed pause kept from the original; presumably gives the
  # freshly installed stack time to settle — confirm whether it is needed.
  sleep 10
}

#================================
# Driver uninstallation
#================================
#######################################
# Uninstall the driver and undo the configuration written at install time.
# If the unpacked driver directory is missing, the package is fetched and
# unpacked again so that its uninstall.sh can be run.
# Globals:   APP_VERSION, DRIVER_DIR, APP_DIR (read); TEMP_FILE (read after
#            download_driver has set it)
# Returns:   0 on success; a failing mandatory step aborts the script
#######################################
uninstall_driver() {
  local q_pkg q_app_dir q_driver_dir

  log "TITLE" "${BLUE}开始卸载驱动: $APP_VERSION${NC}"

  # run_cmd evals its argument, so shell-quote interpolated paths
  printf -v q_app_dir '%q' "$APP_DIR"
  printf -v q_driver_dir '%q' "$DRIVER_DIR"

  if [[ ! -d "$DRIVER_DIR" ]]; then
    log "WARN" "驱动目录不存在,尝试重新下载和解压"
    download_driver   # reuse the install-time acquisition logic
    printf -v q_pkg '%q' "$TEMP_FILE"
    run_cmd "tar -zxf $q_pkg -C $q_app_dir" "解压驱动包"

    if [[ ! -d "$DRIVER_DIR" ]]; then
      error "解压失败,无法找到驱动目录: $DRIVER_DIR"
    fi
    log "INFO" "成功解压驱动包到: $DRIVER_DIR"
  else
    log "INFO" "找到驱动目录: $DRIVER_DIR"
  fi

  # DEBIAN_FRONTEND is passed as a command prefix so that it actually reaches
  # uninstall.sh's environment. stderr is left alone so run_cmd can log it.
  run_cmd "cd $q_driver_dir && DEBIAN_FRONTEND=noninteractive ./uninstall.sh -q --force" \
    "执行卸载脚本" \
    || log "WARN" "卸载脚本执行失败,尝试手动清理"

  log "INFO" "清理残留文件"
  log "INFO" "停止并禁用openibd服务"
  run_cmd "systemctl stop openibd.service || true" "停止openibd服务"
  run_cmd "systemctl disable openibd.service || true" "禁用openibd服务"

  log "INFO" "恢复网卡命名规则"
  # Truncate the rule files; best-effort, hence `|| true`
  run_cmd ": > /etc/udev/rules.d/70-persistent-ipoib.rules || true" "清理IPOIB规则"
  run_cmd ": > /etc/udev/rules.d/70-persistent-net.rules || true" "清理网络规则"
  run_cmd "rm -f /etc/modprobe.d/nvidia-gsp.conf" "删除nvidia配置"
  run_cmd "update-initramfs -u" "更新initramfs"
}

#================================
# NIC naming rules
#================================
#######################################
# Write udev rules that give the 400G devices reported by `ibdev2netdev -v`
# fixed names (mlx5_20, mlx5_21, ... and ib0, ib1, ...), write the nvidia
# modprobe option, rebuild the initramfs and restart openibd.
# Returns:   0 on success; a failing mandatory step aborts the script
#######################################
configure_naming_rules() {
  local ipoib_rules="/etc/udev/rules.d/70-persistent-ipoib.rules"
  local net_rules="/etc/udev/rules.d/70-persistent-net.rules"
  local -a devices=()
  local dev
  local rdma_index=20   # RDMA device names start at mlx5_20
  local net_index=0     # network interface names start at ib0

  log "TITLE" "${BLUE}配置IB网卡命名规则${NC}"

  log "INFO" "备份原有规则"
  run_cmd "cp $ipoib_rules ${ipoib_rules}.bak || true" "备份IPOIB规则"
  run_cmd "cp $net_rules ${net_rules}.bak || true" "备份网络规则"

  log "INFO" "清除原有规则"
  # Delete every line that is not a comment
  run_cmd "sed -i '/^\s*#/!d' $ipoib_rules || true" "清除IPOIB规则"
  run_cmd "sed -i '/^\s*#/!d' $net_rules || true" "清除网络规则"

  # Query the device list once; first column of each line mentioning 400G
  mapfile -t devices < <(ibdev2netdev -v 2>/dev/null | awk '/400G/ { print $1 }')
  if (( ${#devices[@]} == 0 )); then
    log "WARN" "未检测到400G设备,不会生成命名规则"
  fi

  log "INFO" "生成IB设备命名规则"
  for dev in "${devices[@]}"; do
    [[ -n "$dev" ]] || continue
    printf 'ACTION=="add", KERNELS=="%s", SUBSYSTEM=="infiniband",PROGRAM="rdma_rename %%k NAME_FIXED mlx5_%s"\n' \
      "$dev" "$rdma_index" >> "$ipoib_rules"
    rdma_index=$((rdma_index + 1))
  done

  log "INFO" "生成网络设备命名规则"
  for dev in "${devices[@]}"; do
    [[ -n "$dev" ]] || continue
    printf 'SUBSYSTEM=="net", ACTION=="add", KERNELS=="%s", NAME="ib%s"\n' \
      "$dev" "$net_index" >> "$net_rules"
    net_index=$((net_index + 1))
  done

  log "INFO" "配置nvidia选项"
  run_cmd "echo \"options nvidia NVreg_EnableGpuFirmware=0\" > /etc/modprobe.d/nvidia-gsp.conf" \
    "写入nvidia配置"
  run_cmd "update-initramfs -u" "更新initramfs"

  log "INFO" "重启openibd服务"
  run_cmd "systemctl restart openibd.service" "重启openibd服务"
  # NOTE(review): fixed pause kept from the original; presumably waits for
  # the service to come up — confirm whether it is needed.
  sleep 15
}

#================================
# Installation check
#================================
#######################################
# Treat the presence of the ibv_devinfo command as proof of a successful
# install; exit via error() when it is missing.
#######################################
check_installation() {
  log "TITLE" "${BLUE}检查驱动安装结果${NC}"

  if ! command -v ibv_devinfo &> /dev/null; then
    error "驱动安装失败"
  fi
  log "SUCCESS" "驱动安装成功"
}

#================================
# Uninstallation check
#================================
#######################################
# Report whether ibv_devinfo is gone; a leftover command only warns.
#######################################
check_uninstallation() {
  log "TITLE" "${BLUE}检查卸载结果${NC}"

  if command -v ibv_devinfo &> /dev/null; then
    log "WARN" "驱动命令仍存在,可能需要手动清理"
  else
    log "SUCCESS" "驱动已成功卸载"
  fi
}

#================================
# Help text
#================================
#######################################
# Print usage information to stdout.
# Globals:   SUPPORTED_DISTROS (read)
#######################################
show_help() {
  local distro_list
  # The global IFS starts with a newline; join the list with spaces instead
  distro_list=$(IFS=' '; echo "${SUPPORTED_DISTROS[*]}")

  cat <<EOF
MLNX_OFED驱动管理脚本
用法: $0 [选项] [命令]

命令:
  --install            安装驱动
  --uninstall          卸载驱动

选项:
  --version <版本号>   指定驱动版本 (必需)
  --distro <系统>      指定系统版本 (支持: ${distro_list}, 必需)
  --force              强制操作
  --help               显示此帮助信息

示例:
  $0 --install --distro ubuntu24.04 --version 5.9-1.0.8.0
  $0 --uninstall --distro ubuntu22.04 --version 5.8-3.0.7.0
EOF
}

#================================
# Entry point
#================================
#######################################
# Parse the command line and run the requested install or uninstall flow.
# Globals:   COMMAND, APP_VERSION, DISTRO, FORCE (written)
# Arguments: the script's command-line arguments
#######################################
main() {
  local arg

  # Help must work without root and before anything touches the log file
  for arg in "$@"; do
    if [[ "$arg" == "--help" ]]; then
      show_help
      exit 0
    fi
  done

  # Root check comes before the first log call: the log file lives in
  # /var/log, so logging as a non-root user could abort under `set -e`
  # before this message was ever shown.
  if [[ $EUID -ne 0 ]]; then
    echo -e "${RED}[ERROR]${NC} 此脚本需要root权限运行" >&2
    exit 1
  fi

  log "TITLE" "${BLUE}MLNX驱动管理脚本启动========================================================${NC}"

  # Parse arguments
  COMMAND=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      "--install")
        COMMAND="install"
        ;;
      "--uninstall")
        COMMAND="uninstall"
        ;;
      "--version")
        # ${2:-} keeps `set -u` from aborting when the value is missing;
        # a following option is not accepted as the value either.
        if [[ -z "${2:-}" || "${2:-}" == --* ]]; then
          error "请指定版本号,如: --version 5.8-3.0.7.0"
        fi
        APP_VERSION="$2"
        log "INFO" "设置驱动版本: $APP_VERSION"
        shift 1
        ;;
      "--distro")
        if [[ -z "${2:-}" || "${2:-}" == --* ]]; then
          error "请指定系统版本,如: --distro ubuntu22.04"
        fi
        DISTRO="$2"
        log "INFO" "设置系统版本: $DISTRO"
        shift 1
        ;;
      "--force")
        FORCE=1
        ;;
      *)
        error "未知参数: $1"
        ;;
    esac
    shift 1
  done

  if [[ -z "$COMMAND" ]]; then
    error "请指定操作: --install 或 --uninstall"
  fi

  # Validates --version / --distro and derives package paths
  generate_package_info

  case "$COMMAND" in
    "install")
      download_driver
      install_driver
      configure_naming_rules
      check_installation
      ;;
    "uninstall")
      uninstall_driver
      check_uninstallation
      ;;
    *)
      error "未知命令: $COMMAND"
      ;;
  esac

  log "TITLE" "========================================================================${NC}"
}

# Run the entry point
main "$@"