From 402cf5312b1ca4c1e07bb379a67a9481edbcc249 Mon Sep 17 00:00:00 2001 From: joy Date: Sat, 19 Jul 2025 23:10:24 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20scripts/ib-drive.sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/ib-drive.sh | 338 ++++++++++++++++++++++++++++++-------------- 1 file changed, 232 insertions(+), 106 deletions(-) diff --git a/scripts/ib-drive.sh b/scripts/ib-drive.sh index 5dc83e8..69fb0e6 100644 --- a/scripts/ib-drive.sh +++ b/scripts/ib-drive.sh @@ -7,16 +7,20 @@ IFS=$'\n\t' #================================ LOG_FILE="/var/log/mlnx_driver_install_$(date +%Y%m%d%H%M%S).log" DRIVER_NAME="MLNX_OFED" -APP_VERSION="5.8-6.0.4.2" +APP_VERSION="" # 不再设置默认值,必须手动指定 ARCH="x86_64" -DISTRO="ubuntu22.04" +DISTRO="" # 不再设置默认值,必须手动指定 APP_DIR="/opt" FORCE=0 +# 支持的系统版本列表 +SUPPORTED_DISTROS=("ubuntu22.04" "ubuntu24.04") + # 颜色定义 GREEN='\033[1;32m' # 绿色 - 成功 RED='\033[1;31m' # 红色 - 失败/错误 BLUE='\033[1;34m' # 蓝色 - 标题/信息 +YELLOW='\033[1;33m' # 黄色 - 警告/进度 NC='\033[0m' # 重置颜色 #================================ @@ -27,14 +31,15 @@ log() { local message="$2" local timestamp=$(date '+%Y-%m-%d %H:%M:%S') - # 颜色输出 - 仅SUCCESS和ERROR使用特殊颜色 + # 颜色输出 case "$level" in "SUCCESS") echo -e "${GREEN}[${level}]${NC} $message" ;; "ERROR") echo -e "${RED}[${level}]${NC} $message" ;; + "WARN") echo -e "${YELLOW}[${level}]${NC} $message" ;; *) echo -e "${BLUE}[${level}]${NC} $message" ;; esac - # 记录到日志文件(不包含颜色) + # 记录到日志文件 echo "[$timestamp] [$level] $message" >> "$LOG_FILE" } @@ -54,7 +59,8 @@ error() { run_cmd() { local command="$1" local description="${2:-"执行命令"}" - log "INFO" "$description: $command" + log "INFO" "$description" + #log "INFO" "$description: $command" # 执行命令并捕获输出 local output @@ -74,7 +80,7 @@ test_network() { local url="$1" log "INFO" "测试网络连接: $url" - # 直接测试URL连通性,设置5秒超时 + # 直接测试URL连通性 if ! curl -fsSLI --connect-timeout 5 "$url" &>> "$LOG_FILE"; then log "WARN" "网络源不可用: $url" return 1 @@ -84,25 +90,89 @@ test_network() { } #================================ -# 下载文件 +# 下载文件 - 带进度条(使用curl) #================================ download_file() { local url="$1" local dest="$2" log "INFO" "开始下载: $url" - # 使用wget下载,显示进度条 - if ! wget -q --show-progress -O "$dest" "$url" &>> "$LOG_FILE"; then - error "下载失败: $url" + # 创建一个临时文件来存储进度信息 + local progress_file=$(mktemp) + + # 使用curl下载,显示进度条 + echo -e "${YELLOW}[DOWNLOAD]${NC} 下载进度:" + + # 启动后台进程来执行下载 + (curl -# -L -o "$dest" "$url" 2>"$progress_file") & + local curl_pid=$! + + # 显示进度信息 + while kill -0 $curl_pid 2>/dev/null; do + if [ -s "$progress_file" ]; then + # 获取最后一行进度信息并显示 + local last_line=$(tail -n 1 "$progress_file") + echo -ne "\r${YELLOW}[DOWNLOAD]${NC} $last_line" + fi + sleep 2 + done + + # 等待下载完成并获取退出状态 + wait $curl_pid + local exit_status=$? + + # 确保捕获最后一行进度信息 + if [ -s "$progress_file" ]; then + local last_line=$(tail -n 1 "$progress_file") + echo -e "\r${YELLOW}[DOWNLOAD]${NC} $last_line" fi + # 清理临时文件 + rm -f "$progress_file" + + # 检查下载是否成功 + if [ $exit_status -ne 0 ]; then + error "下载失败: $url (错误码: $exit_status)" + fi + + echo -e "\r${YELLOW}[DOWNLOAD]${NC} 下载完成! " log "INFO" "下载完成: $dest" + + # 验证文件完整性(检查文件大小是否大于10MB) + local file_size=$(stat -c%s "$dest") + if [ $file_size -lt 10485760 ]; then + log "ERROR" "下载的文件大小异常: $file_size 字节" + log "ERROR" "请检查网络连接或下载源的可用性" + error "建议手动下载后放置到/opt目录" + fi } #================================ # 生成包信息 #================================ generate_package_info() { + # 检查是否设置了必需的参数 + if [[ -z "$APP_VERSION" ]]; then + error "必须指定驱动版本,请使用 --version 参数" + fi + + if [[ -z "$DISTRO" ]]; then + error "必须指定系统版本,请使用 --distro 参数" + fi + + # 使用改进的系统版本检查方法 + local found=false + for supported in "${SUPPORTED_DISTROS[@]}"; do + if [[ "$supported" == "$DISTRO" ]]; then + found=true + break + fi + done + + if [[ "$found" == "false" ]]; then + error "不支持的系统版本: $DISTRO。支持的系统: ${SUPPORTED_DISTROS[*]}" + fi + DRIVER_PACKAGE="${DRIVER_NAME}_LINUX-${APP_VERSION}-${DISTRO}-${ARCH}.tgz" PACKAGE_PATH="${APP_DIR}/${DRIVER_PACKAGE}" DRIVER_DIR="${APP_DIR}/${DRIVER_NAME}_LINUX-${APP_VERSION}-${DISTRO}-${ARCH}" @@ -110,18 +180,34 @@ generate_package_info() { } #================================ -# 下载驱动包(优化版) +# 生成下载URL +#================================ +generate_download_urls() { + # 清空下载源数组 + DOWNLOAD_SOURCES=() + + # 定义下载源模板 - 按优先级排序 + local SOURCE_TEMPLATES=( + "$PACKAGE_PATH" # 本地文件优先 + "http://10.101.0.51:5588/mlnx-ofed/${DRIVER_PACKAGE}" # 内网1 + "http://10.102.32.207:5588/mlnx-ofed/${DRIVER_PACKAGE}" # 内网2 + "https://content.mellanox.com/ofed/MLNX_OFED-${APP_VERSION}/${DRIVER_PACKAGE}" # 官网 + ) + + # 生成最终下载源列表 + for template in "${SOURCE_TEMPLATES[@]}"; do + DOWNLOAD_SOURCES+=("$template") + done +} + +#================================ +# 下载驱动包 #================================ download_driver() { log "TITLE" "${BLUE}开始获取驱动安装包${NC}" - # 定义下载源列表 (按优先级排序) - DOWNLOAD_SOURCES=( - "${PACKAGE_PATH}" # 本地文件 - "http://10.101.0.51:5588/mlnx-ofed/${DRIVER_PACKAGE}" # 内部源1 - "http://10.102.32.207:5588/mlnx-ofed/${DRIVER_PACKAGE}" # 内部源2 - "https://www.mellanox.com/downloads/ofed/${DRIVER_PACKAGE}" # 公共源 - ) + # 生成下载URL - 按优先级排序 + generate_download_urls # 查找可用的下载源 DOWNLOAD_URL="" @@ -140,13 +226,14 @@ download_driver() { fi else # 网络URL检查 - if ! test_network "$source"; then - continue # 跳过不可用源 + log "INFO" "检查网络源: $source" + if test_network "$source"; then + DOWNLOAD_URL="$source" + log "INFO" "${BLUE}找到可用下载源: $DOWNLOAD_URL${NC}" + break + else + log "WARN" "网络源不可用: $source" fi - - DOWNLOAD_URL="$source" - log "INFO" "${BLUE}找到可用下载源: $DOWNLOAD_URL${NC}" - break fi done @@ -154,60 +241,72 @@ download_driver() { if [[ -z "$DOWNLOAD_URL" ]]; then log "ERROR" "无法找到可用的下载源" log "ERROR" "请检查网络连接或手动下载安装包到/opt目录" - error "下载地址: https://www.mellanox.com/downloads/ofed" + error "官网下载地址: ${DOWNLOAD_SOURCES[-1]}" fi # 下载文件 - TEMP_FILE="/tmp/${DRIVER_PACKAGE}" + TEMP_FILE="/opt/${DRIVER_PACKAGE}" log "INFO" "准备获取驱动包..." if [[ "$DOWNLOAD_URL" == /* ]]; then # 使用本地文件 log "INFO" "使用本地文件: $DOWNLOAD_URL" - run_cmd "cp $DOWNLOAD_URL $TEMP_FILE" "复制本地文件到临时目录" + #run_cmd "cp $DOWNLOAD_URL $TEMP_FILE" "复制本地文件到临时目录" else # 从网络下载 log "INFO" "从网络下载: $DOWNLOAD_URL" download_file "$DOWNLOAD_URL" "$TEMP_FILE" fi - - # 验证文件完整性 - log "INFO" "验证下载文件的完整性..." - file_size=$(stat -c%s "$TEMP_FILE") - - if [[ $file_size -lt 10485760 ]]; then # 检查文件大小是否小于10MB - log "ERROR" "下载的文件大小异常: $file_size 字节" - log "ERROR" "请检查网络连接或下载源的可用性" - error "建议手动下载后放置到/opt目录" - fi - - log "SUCCESS" "文件完整性验证通过: $file_size 字节" - return 0 } -#================================ -# 其他函数保持不变... #================================ # 安装驱动 +#================================ install_driver() { log "TITLE" "${BLUE}开始安装驱动: $APP_VERSION${NC}" kernel_version=$(uname -r) log "INFO" "当前内核版本: $kernel_version" - log "INFO" "安装依赖包" - run_cmd "apt update" "更新软件包索引" - run_cmd "apt install -y net-tools bzip2" "安装依赖包" + # 根据系统版本安装不同的依赖 + case "$DISTRO" in + "ubuntu22.04") + log "INFO" "安装Ubuntu 22.04依赖包" + run_cmd "apt update" "更新软件包索引" + run_cmd "apt install -y net-tools bzip2" "安装依赖包" + ;; + "ubuntu24.04") + log "INFO" "安装Ubuntu 24.04依赖包" + run_cmd "apt update" "更新软件包索引" + run_cmd "apt install -y net-tools bzip2 dkms" "安装依赖包(包含dkms)" + ;; + *) + error "不支持的系统版本: $DISTRO" + ;; + esac log "INFO" "解压驱动包" run_cmd "tar -zxf $TEMP_FILE -C $APP_DIR" "解压驱动包" log "INFO" "执行驱动安装" - run_cmd "cd $DRIVER_DIR && ./mlnxofedinstall --without-dkms --add-kernel-support --kernel $kernel_version --with-fw-update --force" "执行驱动安装" + case "$DISTRO" in + "ubuntu22.04") + run_cmd "cd $DRIVER_DIR && ./mlnxofedinstall --without-dkms --add-kernel-support --kernel $kernel_version --with-fw-update --force" "执行驱动安装" + ;; + "ubuntu24.04") + # Ubuntu 24.04可能需要不同的安装选项 + run_cmd "cd $DRIVER_DIR && ./mlnxofedinstall --add-kernel-support --kernel $kernel_version --with-fw-update --force" "执行驱动安装(Ubuntu 24.04)" + ;; + *) + error "不支持的系统版本: $DISTRO" + ;; + esac sleep 10 } +#================================ # 卸载驱动 +#================================ uninstall_driver() { log "TITLE" "${BLUE}开始卸载驱动: $APP_VERSION${NC}" @@ -226,23 +325,23 @@ uninstall_driver() { fi # 执行卸载 - run_cmd "cd $DRIVER_DIR && ./uninstall.sh -q -y" "执行卸载脚本" || log "WARN" "卸载脚本执行失败,尝试手动清理" + run_cmd "cd $DRIVER_DIR && DEBIAN_FRONTEND=noninteractive && ./uninstall.sh -q --force 2>/dev/null" "执行卸载脚本" || log "WARN" "卸载脚本执行失败,尝试手动清理" log "INFO" "清理残留文件" - run_cmd "rm -rf $DRIVER_DIR $TEMP_FILE" "删除驱动目录和临时文件" - log "INFO" "停止并禁用openibd服务" run_cmd "systemctl stop openibd.service || true" "停止openibd服务" run_cmd "systemctl disable openibd.service || true" "禁用openibd服务" log "INFO" "恢复网卡命名规则" - run_cmd "sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-ipoib.rules &>> /tmp/mlnx_install.log||true" "清理IPOIB规则" - run_cmd "sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-net.rules &>> /tmp/mlnx_install.log|true" "清理网络规则" + run_cmd ">/etc/udev/rules.d/70-persistent-ipoib.rules &>> /tmp/mlnx_install.log||true" "清理IPOIB规则" + run_cmd ">/etc/udev/rules.d/70-persistent-net.rules &>> /tmp/mlnx_install.log|true" "清理网络规则" run_cmd "rm -f /etc/modprobe.d/nvidia-gsp.conf" "删除nvidia配置" run_cmd "update-initramfs -u" "更新initramfs" } +#================================ # 配置网卡命名规则 +#================================ configure_naming_rules() { log "TITLE" "${BLUE}配置IB网卡命名规则${NC}" @@ -281,7 +380,9 @@ configure_naming_rules() { sleep 15 } +#================================ # 检查驱动安装结果 +#================================ check_installation() { log "TITLE" "${BLUE}检查驱动安装结果${NC}" if command -v ibv_devinfo &> /dev/null; then @@ -289,30 +390,11 @@ check_installation() { else error "驱动安装失败" fi - - log "INFO" "检查网卡命名规则" - valid_count=0 - for dev in $(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}'); do - if [ -n "$dev" ]; then - mlx_name=$(udevadm info -q name -n "$dev" 2>/dev/null | grep "mlx5_" || true) - net_name=$(ip link show "$dev" | grep "ib[0-9]" || true) - - if [ -n "$mlx_name" ] || [ -n "$net_name" ]; then - valid_count=$((valid_count+1)) - else - log "WARN" "网卡 $dev 命名规则未生效" - fi - fi - done - - if [ $valid_count -gt 0 ]; then - log "SUCCESS" "网卡命名规则生效,成功配置 $valid_count 个网卡" - else - log "WARN" "所有网卡命名规则均未生效,建议重启系统手工配置!" - fi } +#================================ # 检查卸载结果 +#================================ check_uninstallation() { log "TITLE" "${BLUE}检查卸载结果${NC}" if ! command -v ibv_devinfo &> /dev/null; then @@ -321,66 +403,110 @@ check_uninstallation() { log "WARN" "驱动命令仍存在,可能需要手动清理" fi - if [ ! -d "$DRIVER_DIR" ]; then - log "SUCCESS" "驱动目录已删除" - else - log "WARN" "驱动目录未完全删除: $DRIVER_DIR" - fi } +#================================ +# 显示帮助信息 +#================================ +show_help() { + echo "MLNX_OFED驱动管理脚本" + echo "用法: $0 [选项] [命令]" + echo "" + echo "命令:" + echo " --install 安装驱动" + echo " --uninstall 卸载驱动" + echo "" + echo "选项:" + echo " --version <版本号> 指定驱动版本 (必需)" + echo " --distro <系统> 指定系统版本 (支持: ${SUPPORTED_DISTROS[*]}, 必需)" + echo " --force 强制操作" + echo " --help 显示此帮助信息" + echo "" + echo "示例:" + echo " $0 --install --distro ubuntu24.04 --version 5.9-1.0.8.0" + echo " $0 --uninstall --distro ubuntu22.04 --version 5.8-3.0.7.0" +} + +#================================ # 主函数 +#================================ main() { - log "TITLE" "${BLUE}MLNX驱动管理脚本启动${NC}" + log "TITLE" "${BLUE}MLNX驱动管理脚本启动========================================================${NC}" # 检查root权限 if [[ $EUID -ne 0 ]]; then error "此脚本需要root权限运行" fi - # 生成包信息 - generate_package_info - # 解析参数 - if [[ $# -lt 1 ]]; then - log "ERROR" "请指定操作: --install 或 --uninstall" - exit 1 + COMMAND="" + while [[ $# -gt 0 ]]; do + case "$1" in + "--install") + COMMAND="install" + ;; + "--uninstall") + COMMAND="uninstall" + ;; + "--version") + if [[ -n "$2" ]]; then + APP_VERSION="$2" + log "INFO" "设置驱动版本: $APP_VERSION" + shift 1 + else + error "请指定版本号,如: --version 5.8-3.0.7.0" + fi + ;; + "--distro") + if [[ -n "$2" ]]; then + DISTRO="$2" + log "INFO" "设置系统版本: $DISTRO" + shift 1 + else + error "请指定系统版本,如: --distro ubuntu22.04" + fi + ;; + "--force") + FORCE=1 + ;; + "--help") + show_help + exit 0 + ;; + *) + error "未知参数: $1" + ;; + esac + shift 1 + done + + # 检查是否指定了命令 + if [[ -z "$COMMAND" ]]; then + error "请指定操作: --install 或 --uninstall" fi - case "$1" in - "--install") + # 生成包信息(会检查必需参数) + generate_package_info + + # 执行命令 + case "$COMMAND" in + "install") download_driver install_driver configure_naming_rules check_installation ;; - "--uninstall") + "uninstall") uninstall_driver check_uninstallation ;; - "--version") - if [[ -n "$2" ]]; then - APP_VERSION="$2" - generate_package_info - log "INFO" "设置驱动版本: $APP_VERSION" - shift 2 - main "$@" - else - error "请指定版本号,如: --version 5.8-6.0.4.2" - fi - ;; - "--force") - FORCE=1 - shift 1 - main "$@" - ;; *) - error "未知参数: $1" + error "未知命令: $COMMAND" ;; esac - log "TITLE" "${BLUE}操作完成!${NC}" + log "TITLE" "========================================================================${NC}" } # 执行主函数 main "$@" -