This commit is contained in:
joy 2025-07-07 12:04:49 +08:00
parent 4b6db85dc2
commit 626b40d2b5
1 changed files with 266 additions and 140 deletions

View File

@ -1,165 +1,260 @@
#!/bin/bash
set -e
set -euo pipefail
IFS=$'\n\t'
# 颜色定义
GREEN='\033[1;32m'
RED='\033[1;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # 重置颜色
# Colorized message helpers.
# Each prints "[LEVEL] <message>" wrapped in the level's color and then
# resets the color. Globals read: GREEN, RED, YELLOW, NC.

# Print an informational message ($1) in green.
log_info() {
  local text="$1"
  echo -e "${GREEN}[INFO] ${text}${NC}"
}

# Print an error message ($1) in red, then terminate the script with status 1.
log_error() {
  local text="$1"
  echo -e "${RED}[ERROR] ${text}${NC}"
  exit 1
}

# Print a warning message ($1) in yellow.
log_warning() {
  local text="$1"
  echo -e "${YELLOW}[WARNING] ${text}${NC}"
}
# 默认变量
ACTION=""
DRIVER_VERSION="5.8-6.0.4.2"
DISTRO="ubuntu22.04"
#================================
# 全局配置
#================================
LOG_FILE="/var/log/mlnx_driver_install_$(date +%Y%m%d%H%M%S).log"
DRIVER_NAME="MLNX_OFED"
APP_VERSION="5.8-6.0.4.2"
ARCH="x86_64"
DISTRO="ubuntu22.04"
APP_DIR="/opt"
FORCE=0
# 生成包名和路径
generate_package_info() {
DRIVER_PACKAGE="MLNX_OFED_LINUX-${DRIVER_VERSION}-${DISTRO}-${ARCH}.tgz"
PACKAGE_PATH="/opt/${DRIVER_PACKAGE}"
DRIVER_DIR="/opt/MLNX_OFED_LINUX-${DRIVER_VERSION}-${DISTRO}-${ARCH}"
INTERNAL_URL="http://10.101.0.51:5588/mlnx-ofed/${DRIVER_PACKAGE}"
OFFICIAL_URL="https://example.com/${DRIVER_PACKAGE}" # 请替换为实际官网地址
# 颜色定义
GREEN='\033[1;32m' # 绿色 - 成功
RED='\033[1;31m' # 红色 - 失败/错误
BLUE='\033[1;34m' # 蓝色 - 标题/信息
NC='\033[0m' # 重置颜色
#================================
# Logging
#================================
# Print a colorized "[LEVEL] message" line to stdout and append a
# timestamped, color-free copy to $LOG_FILE.
# Globals:   GREEN, RED, BLUE, NC (read); LOG_FILE (appended to)
# Arguments: $1 - level label; SUCCESS is green, ERROR is red, any other
#                 label is printed in blue
#            $2 - message text; may contain the color variables above
# Returns:   0, even when the log file cannot be written
log() {
  local level="$1"
  local message="${2:-}"
  local timestamp color plain code

  # Assigned separately from `local` so a failing `date` is not masked.
  timestamp=$(date '+%Y-%m-%d %H:%M:%S') || timestamp="unknown-time"

  case "$level" in
    "SUCCESS") color="$GREEN" ;;
    "ERROR") color="$RED" ;;
    *) color="$BLUE" ;;
  esac
  echo -e "${color}[${level}]${NC} $message"

  # Callers embed the color variables in the message itself; remove those
  # literal escape strings so the file copy really is color-free.
  plain="$message"
  for code in "$GREEN" "$RED" "$BLUE" "$NC"; do
    if [[ -n "$code" ]]; then
      plain="${plain//"$code"/}"
    fi
  done

  # File logging is best effort on purpose: an unwritable log file (for
  # example when started without root) must not abort the script under
  # `set -e` before the caller can report the real problem.
  { printf '[%s] [%s] %s\n' "$timestamp" "$level" "$plain" >> "$LOG_FILE"; } 2> /dev/null || true
}
# 解析命令行参数
parse_args() {
while [[ $# -gt 0 ]]; do
case "$1" in
--install)
ACTION="install"
shift
;;
--uninstall)
ACTION="uninstall"
shift
;;
--version)
if [[ -z "$2" ]]; then
log_error "请指定版本号,如: --version 5.8-6.0.4.2"
#================================
# Error handling
#================================
# Report a fatal error and terminate the script.
# Logs the given message at ERROR level, follows it with a pointer to the
# log file, then exits with status 1.
# Globals:   LOG_FILE (read)
# Arguments: $1 - error message
error() {
  local reason="$1"
  local line
  for line in "$reason" "详细日志请查看: $LOG_FILE"; do
    log "ERROR" "$line"
  done
  exit 1
}
#================================
# Command execution
#================================
# Run a shell command string, logging what is run and what it printed.
# Globals:   LOG_FILE (appended to)
# Arguments: $1 - command line; it is passed to `eval`, so the caller is
#                 responsible for quoting any paths or values inside it
#            $2 - optional human-readable description for the log
# Returns:   0 on success, otherwise the command's own exit status
# NOTE(review): `eval` on a caller-built string is only safe while every
# caller passes trusted, correctly quoted text.
run_cmd() {
  local command="$1"
  local description="${2:-执行命令}"
  local output status=0

  log "INFO" "$description: $command"

  # Capture stdout and stderr together; remember the real exit status
  # instead of collapsing every failure to 1.
  output=$(eval "$command" 2>&1) || status=$?

  if (( status != 0 )); then
    log "ERROR" "命令执行失败: $command"
    log "ERROR" "错误详情: $output"
    return "$status"
  fi

  # Keep the output of successful commands too, so the log file the user
  # is pointed at actually contains the details. Best effort only.
  if [[ -n "$output" ]]; then
    { printf '%s\n' "$output" >> "$LOG_FILE"; } 2> /dev/null || true
  fi
  return 0
}
#================================
# Network connectivity test
#================================
# Check whether a URL answers a HEAD request.
# curl's output (including its error text) is appended to $LOG_FILE.
# Globals:   LOG_FILE (appended to)
# Arguments: $1 - URL to probe
# Returns:   0 if curl succeeds, 1 otherwise (a WARN line is logged)
test_network() {
  local url="$1"
  log "INFO" "测试网络连接: $url"

  # --connect-timeout only bounds the TCP/TLS handshake; --max-time also
  # bounds the whole request so a server that accepts and then stalls
  # cannot hang the probe.
  if curl -fsSLI --connect-timeout 5 --max-time 15 "$url" &>> "$LOG_FILE"; then
    return 0
  fi

  log "WARN" "网络源不可用: $url"
  return 1
}
#================================
# File download
#================================
# Download a URL to a local path with wget.
# wget's output is appended to $LOG_FILE, so no progress bar is requested:
# it would never reach the terminal and would only add noise to the log.
# Globals:   LOG_FILE (appended to)
# Arguments: $1 - source URL
#            $2 - destination file path
# Returns:   0 on success; on failure calls error(), which ends the script
download_file() {
  local url="$1"
  local dest="$2"
  log "INFO" "开始下载: $url"

  if ! wget -q -O "$dest" "$url" &>> "$LOG_FILE"; then
    # `wget -O` creates the destination even when the transfer fails;
    # remove the empty or partial file so it cannot be mistaken for a
    # package on a later run.
    rm -f -- "$dest" || true
    error "下载失败: $url"
  fi

  log "INFO" "下载完成: $dest"
}
#================================
# Package information
#================================
# Derive the driver package file name, its full path and the directory it
# unpacks to from the current version settings.
# Globals:   DRIVER_NAME, APP_VERSION, DISTRO, ARCH, APP_DIR (read)
#            DRIVER_PACKAGE, PACKAGE_PATH, DRIVER_DIR (written)
generate_package_info() {
  # Name shared by the archive and the directory it extracts to.
  local base="${DRIVER_NAME}_LINUX-${APP_VERSION}-${DISTRO}-${ARCH}"

  DRIVER_DIR="${APP_DIR}/${base}"
  DRIVER_PACKAGE="${base}.tgz"
  PACKAGE_PATH="${APP_DIR}/${DRIVER_PACKAGE}"

  log "INFO" "生成包信息: $DRIVER_PACKAGE"
}
#================================
# 下载驱动包(优化版)
#================================
download_driver() {
log "TITLE" "${BLUE}开始获取驱动安装包${NC}"
# 定义下载源列表 (按优先级排序)
DOWNLOAD_SOURCES=(
"${PACKAGE_PATH}" # 本地文件
"http://10.101.0.51:5588/mlnx-ofed/${DRIVER_PACKAGE}" # 内部源1
"http://10.102.32.207:5588/mlnx-ofed/${DRIVER_PACKAGE}" # 内部源2
"https://www.mellanox.com/downloads/ofed/${DRIVER_PACKAGE}" # 公共源
)
# 查找可用的下载源
DOWNLOAD_URL=""
log "INFO" "开始查找可用的下载源..."
for source in "${DOWNLOAD_SOURCES[@]}"; do
if [[ "$source" == /* ]]; then
# 本地文件检查
log "INFO" "检查本地文件: $source"
if [[ -f "$source" ]]; then
DOWNLOAD_URL="$source"
log "INFO" "${BLUE}找到本地文件: $DOWNLOAD_URL${NC}"
break
else
log "WARN" "本地文件不存在: $source"
fi
else
# 网络URL检查
if ! test_network "$source"; then
continue # 跳过不可用源
fi
DOWNLOAD_URL="$source"
log "INFO" "${BLUE}找到可用下载源: $DOWNLOAD_URL${NC}"
break
fi
DRIVER_VERSION="$2"
generate_package_info
shift 2
;;
--force)
FORCE=1
shift
;;
*)
log_error "未知参数: $1"
;;
esac
done
if [[ -z "$ACTION" ]]; then
log_error "请指定操作: --install 或 --uninstall"
fi
}
# 下载驱动包
download_driver() {
log_info "开始下载驱动包: $DRIVER_PACKAGE"
if [ -f "$PACKAGE_PATH" ]; then
log_info "使用本地驱动包: $PACKAGE_PATH"
else
log_info "本地包不存在,尝试从内网下载"
if wget -q -O "$PACKAGE_PATH" "$INTERNAL_URL"; then
log_info "内网下载成功"
else
log_warning "内网下载失败,尝试从官网下载"
if wget -q -O "$PACKAGE_PATH" "$OFFICIAL_URL"; then
log_info "官网下载成功"
else
log_error "驱动包下载失败,请手动放置到 /opt/"
fi
fi
# 检查是否找到可用源
if [[ -z "$DOWNLOAD_URL" ]]; then
log "ERROR" "无法找到可用的下载源"
log "ERROR" "请检查网络连接或手动下载安装包到/opt目录"
error "下载地址: https://www.mellanox.com/downloads/ofed"
fi
# 下载文件
TEMP_FILE="/tmp/${DRIVER_PACKAGE}"
log "INFO" "准备获取驱动包..."
if [[ "$DOWNLOAD_URL" == /* ]]; then
# 使用本地文件
log "INFO" "使用本地文件: $DOWNLOAD_URL"
run_cmd "cp $DOWNLOAD_URL $TEMP_FILE" "复制本地文件到临时目录"
else
# 从网络下载
log "INFO" "从网络下载: $DOWNLOAD_URL"
download_file "$DOWNLOAD_URL" "$TEMP_FILE"
fi
# 验证文件完整性
log "INFO" "验证下载文件的完整性..."
file_size=$(stat -c%s "$TEMP_FILE")
if [[ $file_size -lt 10485760 ]]; then # 检查文件大小是否小于10MB
log "ERROR" "下载的文件大小异常: $file_size 字节"
log "ERROR" "请检查网络连接或下载源的可用性"
error "建议手动下载后放置到/opt目录"
fi
log "SUCCESS" "文件完整性验证通过: $file_size 字节"
return 0
}
#================================
# 其他函数保持不变...
#================================
# 安装驱动
install_driver() {
log_info "开始安装驱动: $DRIVER_VERSION"
# 检查是否已安装
#if command -v ibv_devinfo &> /dev/null && [[ $FORCE -eq 0 ]]; then
# log_warning "检测到驱动已安装,使用 --force 覆盖安装"
# exit 0
#fi
log "TITLE" "${BLUE}开始安装驱动: $APP_VERSION${NC}"
kernel_version=$(uname -r)
log_info "当前内核版本: $kernel_version"
log "INFO" "当前内核版本: $kernel_version"
log_info "安装依赖包"
apt update &>> /tmp/mlnx_install.log
apt install -y net-tools bzip2 &>> /tmp/mlnx_install.log
log "INFO" "安装依赖包"
run_cmd "apt update" "更新软件包索引"
run_cmd "apt install -y net-tools bzip2" "安装依赖包"
log_info "解压驱动包"
tar -zxf "$PACKAGE_PATH" -C /opt/
log "INFO" "解压驱动包"
run_cmd "tar -zxf $TEMP_FILE -C $APP_DIR" "解压驱动包"
log_info "执行驱动安装"
cd "$DRIVER_DIR"
./mlnxofedinstall --without-dkms --add-kernel-support --kernel "$kernel_version" --with-fw-update --force &>> /tmp/mlnx_install.log
log "INFO" "执行驱动安装"
run_cmd "cd $DRIVER_DIR && ./mlnxofedinstall --without-dkms --add-kernel-support --kernel $kernel_version --with-fw-update --force" "执行驱动安装"
sleep 10
}
# 卸载驱动(修改后版本)
# 卸载驱动
uninstall_driver() {
log_info "开始卸载驱动: $DRIVER_VERSION"
log "TITLE" "${BLUE}开始卸载驱动: $APP_VERSION${NC}"
# 检查驱动目录,不存在则重新下载解压
if [ ! -d "$DRIVER_DIR" ]; then
log_warning "驱动目录不存在,尝试重新下载和解压"
log "WARN" "驱动目录不存在,尝试重新下载和解压"
download_driver # 复用安装的下载逻辑
log_info "解压驱动包"
tar -zxf "$PACKAGE_PATH" -C /opt/
run_cmd "tar -zxf $TEMP_FILE -C $APP_DIR" "解压驱动包"
if [ ! -d "$DRIVER_DIR" ]; then
log_error "解压失败,无法找到驱动目录: $DRIVER_DIR"
error "解压失败,无法找到驱动目录: $DRIVER_DIR"
else
log_info "成功解压驱动包到: $DRIVER_DIR"
log "INFO" "成功解压驱动包到: $DRIVER_DIR"
fi
else
log_info "找到驱动目录: $DRIVER_DIR"
log "INFO" "找到驱动目录: $DRIVER_DIR"
fi
# 执行卸载
cd "$DRIVER_DIR"
log_info "执行卸载脚本"
./uninstall.sh -q -y &>> /tmp/mlnx_install.log || log_warning "卸载脚本执行失败,尝试手动清理"
run_cmd "cd $DRIVER_DIR && ./uninstall.sh -q -y" "执行卸载脚本" || log "WARN" "卸载脚本执行失败,尝试手动清理"
log_info "清理残留文件"
rm -rf "$DRIVER_DIR" "$PACKAGE_PATH"
log "INFO" "清理残留文件"
run_cmd "rm -rf $DRIVER_DIR $TEMP_FILE" "删除驱动目录和临时文件"
log_info "停止并禁用openibd服务"
systemctl stop openibd.service &>> /tmp/mlnx_install.log || true
systemctl disable openibd.service &>> /tmp/mlnx_install.log || true
log "INFO" "停止并禁用openibd服务"
run_cmd "systemctl stop openibd.service || true" "停止openibd服务"
run_cmd "systemctl disable openibd.service || true" "禁用openibd服务"
log_info "恢复网卡命名规则"
sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-ipoib.rules
sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-net.rules
rm -f /etc/modprobe.d/nvidia-gsp.conf
update-initramfs -u &>> /tmp/mlnx_install.log
log "INFO" "恢复网卡命名规则"
run_cmd "sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-ipoib.rules" "清理IPOIB规则"
run_cmd "sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-net.rules" "清理网络规则"
run_cmd "rm -f /etc/modprobe.d/nvidia-gsp.conf" "删除nvidia配置"
run_cmd "update-initramfs -u" "更新initramfs"
}
# 配置网卡命名规则
configure_naming_rules() {
log_info "配置IB网卡命名规则"
log "TITLE" "${BLUE}配置IB网卡命名规则${NC}"
log_info "备份原有规则"
cp /etc/udev/rules.d/70-persistent-ipoib.rules /etc/udev/rules.d/70-persistent-ipoib.rules.bak &>> /tmp/mlnx_install.log || true
cp /etc/udev/rules.d/70-persistent-net.rules /etc/udev/rules.d/70-persistent-net.rules.bak &>> /tmp/mlnx_install.log || true
log "INFO" "备份原有规则"
run_cmd "cp /etc/udev/rules.d/70-persistent-ipoib.rules /etc/udev/rules.d/70-persistent-ipoib.rules.bak || true" "备份IPOIB规则"
run_cmd "cp /etc/udev/rules.d/70-persistent-net.rules /etc/udev/rules.d/70-persistent-net.rules.bak || true" "备份网络规则"
log_info "清除原有规则"
sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-ipoib.rules &>> /tmp/mlnx_install.log || true
sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-net.rules &>> /tmp/mlnx_install.log || true
log "INFO" "清除原有规则"
run_cmd "sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-ipoib.rules || true" "清除IPOIB规则"
run_cmd "sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-net.rules || true" "清除网络规则"
log_info "生成IB设备命名规则"
log "INFO" "生成IB设备命名规则"
ID=20
for i in $(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}'); do
if [ -n "$i" ]; then
@ -168,7 +263,7 @@ configure_naming_rules() {
fi
done
log_info "生成网络设备命名规则"
log "INFO" "生成网络设备命名规则"
IDS=0
for j in $(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}'); do
if [ -n "$j" ]; then
@ -177,25 +272,25 @@ configure_naming_rules() {
fi
done
log_info "配置nvidia选项"
echo "options nvidia NVreg_EnableGpuFirmware=0" > /etc/modprobe.d/nvidia-gsp.conf
update-initramfs -u &>> /tmp/mlnx_install.log
log "INFO" "配置nvidia选项"
run_cmd "echo \"options nvidia NVreg_EnableGpuFirmware=0\" > /etc/modprobe.d/nvidia-gsp.conf" "写入nvidia配置"
run_cmd "update-initramfs -u" "更新initramfs"
log_info "重启openibd服务"
systemctl restart openibd.service
log "INFO" "重启openibd服务"
run_cmd "systemctl restart openibd.service" "重启openibd服务"
sleep 15
}
# 检查驱动安装结果
check_installation() {
log_info "检查驱动安装结果"
log "TITLE" "${BLUE}检查驱动安装结果${NC}"
if command -v ibv_devinfo &> /dev/null; then
log_info "驱动安装成功"
log "SUCCESS" "驱动安装成功"
else
log_error "驱动安装失败"
error "驱动安装失败"
fi
log_info "检查网卡命名规则"
log "INFO" "检查网卡命名规则"
valid_count=0
for dev in $(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}'); do
if [ -n "$dev" ]; then
@ -205,56 +300,87 @@ check_installation() {
if [ -n "$mlx_name" ] || [ -n "$net_name" ]; then
valid_count=$((valid_count+1))
else
log_warning "网卡 $dev 命名规则未生效"
log "WARN" "网卡 $dev 命名规则未生效"
fi
fi
done
if [ $valid_count -gt 0 ]; then
log_info "网卡命名规则生效,成功配置 $valid_count 个网卡"
log "SUCCESS" "网卡命名规则生效,成功配置 $valid_count 个网卡"
else
log_warning "所有网卡命名规则均未生效,建议重启系统手工配置!"
log "WARN" "所有网卡命名规则均未生效,建议重启系统手工配置!"
fi
}
# 检查卸载结果
check_uninstallation() {
log_info "检查卸载结果"
log "TITLE" "${BLUE}检查卸载结果${NC}"
if ! command -v ibv_devinfo &> /dev/null; then
log_info "驱动已成功卸载"
log "SUCCESS" "驱动已成功卸载"
else
log_warning "驱动命令仍存在,可能需要手动清理"
log "WARN" "驱动命令仍存在,可能需要手动清理"
fi
if [ ! -d "$DRIVER_DIR" ]; then
log_info "驱动目录已删除"
log "SUCCESS" "驱动目录已删除"
else
log_warning "驱动目录未完全删除: $DRIVER_DIR"
log "WARN" "驱动目录未完全删除: $DRIVER_DIR"
fi
}
# 主函数
main() {
log "TITLE" "${BLUE}MLNX驱动管理脚本启动${NC}"
# 检查root权限
if [[ $EUID -ne 0 ]]; then
error "此脚本需要root权限运行"
fi
# 生成包信息
generate_package_info
parse_args "$@"
log_info "开始执行 ${ACTION^^} 操作,版本: $DRIVER_VERSION"
# 解析参数
if [[ $# -lt 1 ]]; then
log "ERROR" "请指定操作: --install 或 --uninstall"
exit 1
fi
case "$ACTION" in
install)
case "$1" in
"--install")
download_driver
install_driver
configure_naming_rules
check_installation
;;
uninstall)
"--uninstall")
uninstall_driver
check_uninstallation
;;
"--version")
if [[ -n "$2" ]]; then
APP_VERSION="$2"
generate_package_info
log "INFO" "设置驱动版本: $APP_VERSION"
shift 2
main "$@"
else
error "请指定版本号,如: --version 5.8-6.0.4.2"
fi
;;
"--force")
FORCE=1
shift 1
main "$@"
;;
*)
error "未知参数: $1"
;;
esac
log_info "操作完成!"
log "TITLE" "${BLUE}操作完成!${NC}"
}
# 执行主函数
main "$@"