diff --git a/scripts/ib-drive.sh b/scripts/ib-drive.sh index c9b95df..08f948f 100644 --- a/scripts/ib-drive.sh +++ b/scripts/ib-drive.sh @@ -1,165 +1,260 @@ #!/bin/bash -set -e +set -euo pipefail +IFS=$'\n\t' -# 颜色定义 -GREEN='\033[1;32m' -RED='\033[1;31m' -YELLOW='\033[1;33m' -NC='\033[0m' # 重置颜色 - -# 输出带颜色的信息 -log_info() { echo -e "${GREEN}[INFO] $1${NC}"; } -log_error() { echo -e "${RED}[ERROR] $1${NC}"; exit 1; } -log_warning() { echo -e "${YELLOW}[WARNING] $1${NC}"; } - -# 默认变量 -ACTION="" -DRIVER_VERSION="5.8-6.0.4.2" -DISTRO="ubuntu22.04" +#================================ +# 全局配置 +#================================ +LOG_FILE="/var/log/mlnx_driver_install_$(date +%Y%m%d%H%M%S).log" +DRIVER_NAME="MLNX_OFED" +APP_VERSION="5.8-6.0.4.2" ARCH="x86_64" +DISTRO="ubuntu22.04" +APP_DIR="/opt" FORCE=0 -# 生成包名和路径 -generate_package_info() { - DRIVER_PACKAGE="MLNX_OFED_LINUX-${DRIVER_VERSION}-${DISTRO}-${ARCH}.tgz" - PACKAGE_PATH="/opt/${DRIVER_PACKAGE}" - DRIVER_DIR="/opt/MLNX_OFED_LINUX-${DRIVER_VERSION}-${DISTRO}-${ARCH}" - INTERNAL_URL="http://10.101.0.51:5588/mlnx-ofed/${DRIVER_PACKAGE}" - OFFICIAL_URL="https://example.com/${DRIVER_PACKAGE}" # 请替换为实际官网地址 +# 颜色定义 +GREEN='\033[1;32m' # 绿色 - 成功 +RED='\033[1;31m' # 红色 - 失败/错误 +BLUE='\033[1;34m' # 蓝色 - 标题/信息 +NC='\033[0m' # 重置颜色 + +#================================ +# 日志函数 +#================================ +log() { + local level="$1" + local message="$2" + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + + # 颜色输出 - 仅SUCCESS和ERROR使用特殊颜色 + case "$level" in + "SUCCESS") echo -e "${GREEN}[${level}]${NC} $message" ;; + "ERROR") echo -e "${RED}[${level}]${NC} $message" ;; + *) echo -e "${BLUE}[${level}]${NC} $message" ;; + esac + + # 记录到日志文件(不包含颜色) + echo "[$timestamp] [$level] $message" >> "$LOG_FILE" } -# 解析命令行参数 -parse_args() { - while [[ $# -gt 0 ]]; do - case "$1" in - --install) - ACTION="install" - shift - ;; - --uninstall) - ACTION="uninstall" - shift - ;; - --version) - if [[ -z "$2" ]]; then - log_error "请指定版本号,如: --version 5.8-6.0.4.2" - fi - DRIVER_VERSION="$2" - generate_package_info - shift 2 - ;; - --force) - FORCE=1 - shift - ;; - *) - log_error "未知参数: $1" - ;; - esac +#================================ +# 错误处理 +#================================ +error() { + local message="$1" + log "ERROR" "$message" + log "ERROR" "详细日志请查看: $LOG_FILE" + exit 1 +} + +#================================ +# 执行命令 +#================================ +run_cmd() { + local command="$1" + local description="${2:-"执行命令"}" + log "INFO" "$description: $command" + + # 执行命令并捕获输出 + local output + output=$(eval "$command" 2>&1) || { + log "ERROR" "命令执行失败: $command" + log "ERROR" "错误详情: $output" + return 1 + } + + return 0 +} + +#================================ +# 测试网络连接 +#================================ +test_network() { + local url="$1" + log "INFO" "测试网络连接: $url" + + # 直接测试URL连通性,设置5秒超时 + if ! curl -fsSLI --connect-timeout 5 "$url" &>> "$LOG_FILE"; then + log "WARN" "网络源不可用: $url" + return 1 + fi + + return 0 +} + +#================================ +# 下载文件 +#================================ +download_file() { + local url="$1" + local dest="$2" + log "INFO" "开始下载: $url" + + # 使用wget下载,显示进度条 + if ! wget -q --show-progress -O "$dest" "$url" &>> "$LOG_FILE"; then + error "下载失败: $url" + fi + + log "INFO" "下载完成: $dest" +} + +#================================ +# 生成包信息 +#================================ +generate_package_info() { + DRIVER_PACKAGE="${DRIVER_NAME}_LINUX-${APP_VERSION}-${DISTRO}-${ARCH}.tgz" + PACKAGE_PATH="${APP_DIR}/${DRIVER_PACKAGE}" + DRIVER_DIR="${APP_DIR}/${DRIVER_NAME}_LINUX-${APP_VERSION}-${DISTRO}-${ARCH}" + log "INFO" "生成包信息: $DRIVER_PACKAGE" +} + +#================================ +# 下载驱动包(优化版) +#================================ +download_driver() { + log "TITLE" "${BLUE}开始获取驱动安装包${NC}" + + # 定义下载源列表 (按优先级排序) + DOWNLOAD_SOURCES=( + "${PACKAGE_PATH}" # 本地文件 + "http://10.101.0.51:5588/mlnx-ofed/${DRIVER_PACKAGE}" # 内部源1 + "http://10.102.32.207:5588/mlnx-ofed/${DRIVER_PACKAGE}" # 内部源2 + "https://www.mellanox.com/downloads/ofed/${DRIVER_PACKAGE}" # 公共源 + ) + + # 查找可用的下载源 + DOWNLOAD_URL="" + log "INFO" "开始查找可用的下载源..." + + for source in "${DOWNLOAD_SOURCES[@]}"; do + if [[ "$source" == /* ]]; then + # 本地文件检查 + log "INFO" "检查本地文件: $source" + if [[ -f "$source" ]]; then + DOWNLOAD_URL="$source" + log "INFO" "${BLUE}找到本地文件: $DOWNLOAD_URL${NC}" + break + else + log "WARN" "本地文件不存在: $source" + fi + else + # 网络URL检查 + if ! test_network "$source"; then + continue # 跳过不可用源 + fi + + DOWNLOAD_URL="$source" + log "INFO" "${BLUE}找到可用下载源: $DOWNLOAD_URL${NC}" + break + fi done - if [[ -z "$ACTION" ]]; then - log_error "请指定操作: --install 或 --uninstall" + # 检查是否找到可用源 + if [[ -z "$DOWNLOAD_URL" ]]; then + log "ERROR" "无法找到可用的下载源" + log "ERROR" "请检查网络连接或手动下载安装包到/opt目录" + error "下载地址: https://www.mellanox.com/downloads/ofed" fi -} - -# 下载驱动包 -download_driver() { - log_info "开始下载驱动包: $DRIVER_PACKAGE" - if [ -f "$PACKAGE_PATH" ]; then - log_info "使用本地驱动包: $PACKAGE_PATH" + + # 下载文件 + TEMP_FILE="/tmp/${DRIVER_PACKAGE}" + log "INFO" "准备获取驱动包..." + + if [[ "$DOWNLOAD_URL" == /* ]]; then + # 使用本地文件 + log "INFO" "使用本地文件: $DOWNLOAD_URL" + run_cmd "cp $DOWNLOAD_URL $TEMP_FILE" "复制本地文件到临时目录" else - log_info "本地包不存在,尝试从内网下载" - if wget -q -O "$PACKAGE_PATH" "$INTERNAL_URL"; then - log_info "内网下载成功" - else - log_warning "内网下载失败,尝试从官网下载" - if wget -q -O "$PACKAGE_PATH" "$OFFICIAL_URL"; then - log_info "官网下载成功" - else - log_error "驱动包下载失败,请手动放置到 /opt/" - fi - fi + # 从网络下载 + log "INFO" "从网络下载: $DOWNLOAD_URL" + download_file "$DOWNLOAD_URL" "$TEMP_FILE" fi + + # 验证文件完整性 + log "INFO" "验证下载文件的完整性..." + file_size=$(stat -c%s "$TEMP_FILE") + + if [[ $file_size -lt 10485760 ]]; then # 检查文件大小是否小于10MB + log "ERROR" "下载的文件大小异常: $file_size 字节" + log "ERROR" "请检查网络连接或下载源的可用性" + error "建议手动下载后放置到/opt目录" + fi + + log "SUCCESS" "文件完整性验证通过: $file_size 字节" + return 0 } +#================================ +# 其他函数保持不变... +#================================ # 安装驱动 install_driver() { - log_info "开始安装驱动: $DRIVER_VERSION" - - # 检查是否已安装 - #if command -v ibv_devinfo &> /dev/null && [[ $FORCE -eq 0 ]]; then - # log_warning "检测到驱动已安装,使用 --force 覆盖安装" - # exit 0 - #fi + log "TITLE" "${BLUE}开始安装驱动: $APP_VERSION${NC}" kernel_version=$(uname -r) - log_info "当前内核版本: $kernel_version" + log "INFO" "当前内核版本: $kernel_version" - log_info "安装依赖包" - apt update &>> /tmp/mlnx_install.log - apt install -y net-tools bzip2 &>> /tmp/mlnx_install.log + log "INFO" "安装依赖包" + run_cmd "apt update" "更新软件包索引" + run_cmd "apt install -y net-tools bzip2" "安装依赖包" - log_info "解压驱动包" - tar -zxf "$PACKAGE_PATH" -C /opt/ + log "INFO" "解压驱动包" + run_cmd "tar -zxf $TEMP_FILE -C $APP_DIR" "解压驱动包" - log_info "执行驱动安装" - cd "$DRIVER_DIR" - ./mlnxofedinstall --without-dkms --add-kernel-support --kernel "$kernel_version" --with-fw-update --force &>> /tmp/mlnx_install.log + log "INFO" "执行驱动安装" + run_cmd "cd $DRIVER_DIR && ./mlnxofedinstall --without-dkms --add-kernel-support --kernel $kernel_version --with-fw-update --force" "执行驱动安装" sleep 10 } -# 卸载驱动(修改后版本) +# 卸载驱动 uninstall_driver() { - log_info "开始卸载驱动: $DRIVER_VERSION" + log "TITLE" "${BLUE}开始卸载驱动: $APP_VERSION${NC}" # 检查驱动目录,不存在则重新下载解压 if [ ! -d "$DRIVER_DIR" ]; then - log_warning "驱动目录不存在,尝试重新下载和解压" + log "WARN" "驱动目录不存在,尝试重新下载和解压" download_driver # 复用安装的下载逻辑 - log_info "解压驱动包" - tar -zxf "$PACKAGE_PATH" -C /opt/ + run_cmd "tar -zxf $TEMP_FILE -C $APP_DIR" "解压驱动包" if [ ! -d "$DRIVER_DIR" ]; then - log_error "解压失败,无法找到驱动目录: $DRIVER_DIR" + error "解压失败,无法找到驱动目录: $DRIVER_DIR" else - log_info "成功解压驱动包到: $DRIVER_DIR" + log "INFO" "成功解压驱动包到: $DRIVER_DIR" fi else - log_info "找到驱动目录: $DRIVER_DIR" + log "INFO" "找到驱动目录: $DRIVER_DIR" fi # 执行卸载 - cd "$DRIVER_DIR" - log_info "执行卸载脚本" - ./uninstall.sh -q -y &>> /tmp/mlnx_install.log || log_warning "卸载脚本执行失败,尝试手动清理" + run_cmd "cd $DRIVER_DIR && ./uninstall.sh -q -y" "执行卸载脚本" || log "WARN" "卸载脚本执行失败,尝试手动清理" - log_info "清理残留文件" - rm -rf "$DRIVER_DIR" "$PACKAGE_PATH" + log "INFO" "清理残留文件" + run_cmd "rm -rf $DRIVER_DIR $TEMP_FILE" "删除驱动目录和临时文件" - log_info "停止并禁用openibd服务" - systemctl stop openibd.service &>> /tmp/mlnx_install.log || true - systemctl disable openibd.service &>> /tmp/mlnx_install.log || true + log "INFO" "停止并禁用openibd服务" + run_cmd "systemctl stop openibd.service || true" "停止openibd服务" + run_cmd "systemctl disable openibd.service || true" "禁用openibd服务" - log_info "恢复网卡命名规则" - sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-ipoib.rules - sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-net.rules - rm -f /etc/modprobe.d/nvidia-gsp.conf - update-initramfs -u &>> /tmp/mlnx_install.log + log "INFO" "恢复网卡命名规则" + run_cmd "sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-ipoib.rules" "清理IPOIB规则" + run_cmd "sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-net.rules" "清理网络规则" + run_cmd "rm -f /etc/modprobe.d/nvidia-gsp.conf" "删除nvidia配置" + run_cmd "update-initramfs -u" "更新initramfs" } # 配置网卡命名规则 configure_naming_rules() { - log_info "配置IB网卡命名规则" + log "TITLE" "${BLUE}配置IB网卡命名规则${NC}" - log_info "备份原有规则" - cp /etc/udev/rules.d/70-persistent-ipoib.rules /etc/udev/rules.d/70-persistent-ipoib.rules.bak &>> /tmp/mlnx_install.log || true - cp /etc/udev/rules.d/70-persistent-net.rules /etc/udev/rules.d/70-persistent-net.rules.bak &>> /tmp/mlnx_install.log || true + log "INFO" "备份原有规则" + run_cmd "cp /etc/udev/rules.d/70-persistent-ipoib.rules /etc/udev/rules.d/70-persistent-ipoib.rules.bak || true" "备份IPOIB规则" + run_cmd "cp /etc/udev/rules.d/70-persistent-net.rules /etc/udev/rules.d/70-persistent-net.rules.bak || true" "备份网络规则" - log_info "清除原有规则" - sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-ipoib.rules &>> /tmp/mlnx_install.log || true - sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-net.rules &>> /tmp/mlnx_install.log || true + log "INFO" "清除原有规则" + run_cmd "sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-ipoib.rules || true" "清除IPOIB规则" + run_cmd "sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-net.rules || true" "清除网络规则" - log_info "生成IB设备命名规则" + log "INFO" "生成IB设备命名规则" ID=20 for i in $(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}'); do if [ -n "$i" ]; then @@ -168,7 +263,7 @@ configure_naming_rules() { fi done - log_info "生成网络设备命名规则" + log "INFO" "生成网络设备命名规则" IDS=0 for j in $(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}'); do if [ -n "$j" ]; then @@ -177,25 +272,25 @@ configure_naming_rules() { fi done - log_info "配置nvidia选项" - echo "options nvidia NVreg_EnableGpuFirmware=0" > /etc/modprobe.d/nvidia-gsp.conf - update-initramfs -u &>> /tmp/mlnx_install.log + log "INFO" "配置nvidia选项" + run_cmd "echo \"options nvidia NVreg_EnableGpuFirmware=0\" > /etc/modprobe.d/nvidia-gsp.conf" "写入nvidia配置" + run_cmd "update-initramfs -u" "更新initramfs" - log_info "重启openibd服务" - systemctl restart openibd.service + log "INFO" "重启openibd服务" + run_cmd "systemctl restart openibd.service" "重启openibd服务" sleep 15 } # 检查驱动安装结果 check_installation() { - log_info "检查驱动安装结果" + log "TITLE" "${BLUE}检查驱动安装结果${NC}" if command -v ibv_devinfo &> /dev/null; then - log_info "驱动安装成功" + log "SUCCESS" "驱动安装成功" else - log_error "驱动安装失败" + error "驱动安装失败" fi - log_info "检查网卡命名规则" + log "INFO" "检查网卡命名规则" valid_count=0 for dev in $(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}'); do if [ -n "$dev" ]; then @@ -205,56 +300,87 @@ check_installation() { if [ -n "$mlx_name" ] || [ -n "$net_name" ]; then valid_count=$((valid_count+1)) else - log_warning "网卡 $dev 命名规则未生效" + log "WARN" "网卡 $dev 命名规则未生效" fi fi done if [ $valid_count -gt 0 ]; then - log_info "网卡命名规则生效,成功配置 $valid_count 个网卡" + log "SUCCESS" "网卡命名规则生效,成功配置 $valid_count 个网卡" else - log_warning "所有网卡命名规则均未生效,建议重启系统手工配置!" + log "WARN" "所有网卡命名规则均未生效,建议重启系统手工配置!" fi } # 检查卸载结果 check_uninstallation() { - log_info "检查卸载结果" + log "TITLE" "${BLUE}检查卸载结果${NC}" if ! command -v ibv_devinfo &> /dev/null; then - log_info "驱动已成功卸载" + log "SUCCESS" "驱动已成功卸载" else - log_warning "驱动命令仍存在,可能需要手动清理" + log "WARN" "驱动命令仍存在,可能需要手动清理" fi if [ ! -d "$DRIVER_DIR" ]; then - log_info "驱动目录已删除" + log "SUCCESS" "驱动目录已删除" else - log_warning "驱动目录未完全删除: $DRIVER_DIR" + log "WARN" "驱动目录未完全删除: $DRIVER_DIR" fi } # 主函数 main() { + log "TITLE" "${BLUE}MLNX驱动管理脚本启动${NC}" + + # 检查root权限 + if [[ $EUID -ne 0 ]]; then + error "此脚本需要root权限运行" + fi + + # 生成包信息 generate_package_info - parse_args "$@" - log_info "开始执行 ${ACTION^^} 操作,版本: $DRIVER_VERSION" + # 解析参数 + if [[ $# -lt 1 ]]; then + log "ERROR" "请指定操作: --install 或 --uninstall" + exit 1 + fi - case "$ACTION" in - install) + case "$1" in + "--install") download_driver install_driver configure_naming_rules check_installation ;; - uninstall) + "--uninstall") uninstall_driver check_uninstallation ;; + "--version") + if [[ -n "$2" ]]; then + APP_VERSION="$2" + generate_package_info + log "INFO" "设置驱动版本: $APP_VERSION" + shift 2 + main "$@" + else + error "请指定版本号,如: --version 5.8-6.0.4.2" + fi + ;; + "--force") + FORCE=1 + shift 1 + main "$@" + ;; + *) + error "未知参数: $1" + ;; esac - log_info "操作完成!" + log "TITLE" "${BLUE}操作完成!${NC}" } # 执行主函数 main "$@" +