ansible-devops/scripts/ib-drive.sh

513 lines
16 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
set -euo pipefail
IFS=$'\n\t'
#================================
# Global configuration
#================================
# Log file path; the timestamp is fixed once, when the script starts.
LOG_FILE="/var/log/mlnx_driver_install_$(date +%Y%m%d%H%M%S).log"
DRIVER_NAME="MLNX_OFED"
APP_VERSION="" # No default any more; must be specified explicitly
ARCH="x86_64"
DISTRO="" # No default any more; must be specified explicitly
APP_DIR="/opt"
FORCE=0
# List of supported distro releases
SUPPORTED_DISTROS=("ubuntu22.04" "ubuntu24.04")
# Color definitions (escape sequences, interpreted by `echo -e`)
GREEN='\033[1;32m' # Green - success
RED='\033[1;31m' # Red - failure/error
BLUE='\033[1;34m' # Blue - title/info
YELLOW='\033[1;33m' # Yellow - warning/progress
NC='\033[0m' # Reset color
#================================
# Logging
#================================
# Print a colored, level-tagged message to stdout and append a timestamped
# copy of it to the log file.
# Globals:   GREEN, RED, YELLOW, BLUE, NC (read); LOG_FILE (appended to)
# Arguments: $1 - level tag; SUCCESS, ERROR and WARN have their own color,
#                 every other tag is printed in blue
#            $2 - message text
# Returns:   0, even when the log file cannot be written
log() {
  local level="$1"
  local message="$2"
  # Declared separately from the assignment so `local` does not mask the
  # exit status of the command substitution.
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # Pick the color for the console line.
  local color
  case "$level" in
    "SUCCESS") color="$GREEN" ;;
    "ERROR") color="$RED" ;;
    "WARN") color="$YELLOW" ;;
    *) color="$BLUE" ;;
  esac
  echo -e "${color}[${level}]${NC} $message"
  # Writing the log file is best effort: if it is not writable (for example
  # when the script is started without root), the script must keep running
  # so the caller can still report the real problem instead of dying here
  # under `set -e`. The group redirect hides the shell's own redirection
  # error message.
  { echo "[$timestamp] [$level] $message" >> "$LOG_FILE"; } 2> /dev/null || true
}
#================================
# Fatal error handling
#================================
# Log the given message and a pointer to the log file at ERROR level, then
# terminate the script.
# Globals:   LOG_FILE (read)
# Arguments: $1 - error message
# Returns:   never returns; exits with status 1
error() {
  local reason="$1"
  local line
  for line in "$reason" "详细日志请查看: $LOG_FILE"; do
    log "ERROR" "$line"
  done
  exit 1
}
#================================
# Command execution
#================================
# Run a shell command line, logging a description first and the captured
# output only when the command fails.
# Arguments: $1 - command line, evaluated with `eval`
#            $2 - description to log (optional, defaults to "执行命令")
# Returns:   0 on success, otherwise the command's own exit status
# NOTE(review): `eval` runs whatever string it is given. Callers build these
# strings by interpolating variables, so those values must never come from
# untrusted input and must not contain whitespace or shell metacharacters.
run_cmd() {
  local command="$1"
  local description="${2:-"执行命令"}"
  log "INFO" "$description"
  # Capture stdout and stderr together; remember the real exit status
  # instead of collapsing every failure to 1.
  local output
  local status=0
  output=$(eval "$command" 2>&1) || status=$?
  if (( status != 0 )); then
    log "ERROR" "命令执行失败: $command"
    log "ERROR" "错误详情: $output"
    return "$status"
  fi
  return 0
}
#================================
# Network connectivity probe
#================================
# Check whether a URL answers a HEAD request (5 second connect timeout).
# curl's output is appended to the log file.
# Globals:   LOG_FILE (appended to)
# Arguments: $1 - URL to probe
# Returns:   0 if curl succeeds, 1 otherwise (a WARN line is logged)
test_network() {
  local target="$1"
  log "INFO" "测试网络连接: $target"
  # Probe the URL directly; success short-circuits out of the function.
  curl -fsSLI --connect-timeout 5 "$target" >> "$LOG_FILE" 2>&1 && return 0
  log "WARN" "网络源不可用: $target"
  return 1
}
#================================
# File download with curl progress display
#================================
# Download a URL to a destination path, echoing curl's progress output
# while the transfer runs in the background.
# The data is written to "<dest>.part" and only renamed to <dest> once the
# transfer succeeded and the size check passed, so a failed or truncated
# download never leaves a file at <dest>.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - source URL
#            $2 - destination file path
# Returns:   0 on success; calls error() (which exits) when curl fails or
#            the downloaded file is smaller than 10 MiB
download_file() {
  local url="$1"
  local dest="$2"
  local partial="${dest}.part"
  log "INFO" "开始下载: $url"
  # Temporary file that receives curl's progress output (its stderr).
  local progress_file
  progress_file=$(mktemp)
  echo -e "${YELLOW}[DOWNLOAD]${NC} 下载进度:"
  # Run the transfer in the background. --fail makes HTTP errors (404, 500,
  # ...) a curl failure instead of saving the error page as the package.
  (curl -# -f -L -o "$partial" "$url" 2> "$progress_file") &
  local curl_pid=$!
  # Poll while curl is alive and show the latest progress line.
  local last_line
  while kill -0 "$curl_pid" 2> /dev/null; do
    if [[ -s "$progress_file" ]]; then
      last_line=$(tail -n 1 "$progress_file")
      echo -ne "\r${YELLOW}[DOWNLOAD]${NC} $last_line"
    fi
    sleep 2
  done
  # Collect curl's exit status. The `||` keeps `set -e` from killing the
  # script right here, before the failure can be reported and cleaned up.
  local exit_status=0
  wait "$curl_pid" || exit_status=$?
  # Make sure the final progress line is shown.
  if [[ -s "$progress_file" ]]; then
    last_line=$(tail -n 1 "$progress_file")
    echo -e "\r${YELLOW}[DOWNLOAD]${NC} $last_line"
  fi
  rm -f -- "$progress_file"
  if (( exit_status != 0 )); then
    rm -f -- "$partial"
    error "下载失败: $url (错误码: $exit_status)"
  fi
  echo -e "\r${YELLOW}[DOWNLOAD]${NC} 下载完成! "
  # Sanity check: anything below 10 MiB is treated as a broken download.
  local file_size
  file_size=$(stat -c%s "$partial" 2> /dev/null) || file_size=0
  if (( file_size < 10485760 )); then
    rm -f -- "$partial"
    log "ERROR" "下载的文件大小异常: $file_size 字节"
    log "ERROR" "请检查网络连接或下载源的可用性"
    error "建议手动下载后放置到/opt目录"
  fi
  mv -f -- "$partial" "$dest"
  log "INFO" "下载完成: $dest"
}
#================================
# Package name and path derivation
#================================
# Validate the requested version/distro and derive the package file name,
# its path and the extraction directory.
# Globals:   APP_VERSION, DISTRO, SUPPORTED_DISTROS, DRIVER_NAME, ARCH,
#            APP_DIR (read); DRIVER_PACKAGE, PACKAGE_PATH, DRIVER_DIR (set)
# Returns:   0 on success; calls error() (which exits) when the version or
#            distro is missing or the distro is not in SUPPORTED_DISTROS
generate_package_info() {
  # Both required parameters must be present.
  [[ -n "$APP_VERSION" ]] || error "必须指定驱动版本,请使用 --version 参数"
  [[ -n "$DISTRO" ]] || error "必须指定系统版本,请使用 --distro 参数"
  # Look the requested distro up in the supported list.
  local is_supported=0
  for supported in "${SUPPORTED_DISTROS[@]}"; do
    if [[ "$DISTRO" == "$supported" ]]; then
      is_supported=1
      break
    fi
  done
  (( is_supported )) || error "不支持的系统版本: $DISTRO。支持的系统: ${SUPPORTED_DISTROS[*]}"
  # The archive and the directory it unpacks to share the same stem.
  local stem="${DRIVER_NAME}_LINUX-${APP_VERSION}-${DISTRO}-${ARCH}"
  DRIVER_PACKAGE="${stem}.tgz"
  PACKAGE_PATH="${APP_DIR}/${DRIVER_PACKAGE}"
  DRIVER_DIR="${APP_DIR}/${stem}"
  log "INFO" "生成包信息: $DRIVER_PACKAGE"
}
#================================
# Download source list
#================================
# Rebuild the list of candidate package sources, highest priority first.
# Globals:   PACKAGE_PATH, DRIVER_PACKAGE, APP_VERSION (read);
#            DOWNLOAD_SOURCES (reset and filled)
generate_download_urls() {
  # Assigning the array replaces any previous content in one step.
  DOWNLOAD_SOURCES=(
    # Local file comes first
    "$PACKAGE_PATH"
    # Intranet mirror 1
    "http://10.101.0.51:5588/mlnx-ofed/${DRIVER_PACKAGE}"
    # Intranet mirror 2
    "http://10.102.32.207:5588/mlnx-ofed/${DRIVER_PACKAGE}"
    # Official site
    "https://content.mellanox.com/ofed/MLNX_OFED-${APP_VERSION}/${DRIVER_PACKAGE}"
  )
}
#================================
# Driver package acquisition
#================================
# Pick the first usable package source (local file or reachable URL) and
# make sure the package is available on disk.
# Globals:   APP_DIR, DRIVER_PACKAGE, BLUE, NC (read);
#            DOWNLOAD_SOURCES (filled via generate_download_urls);
#            DOWNLOAD_URL (set to the chosen source);
#            TEMP_FILE (set to the package path on disk)
# Returns:   0 on success; calls error() (which exits) when no source works
download_driver() {
  log "TITLE" "${BLUE}开始获取驱动安装包${NC}"
  # Build the candidate list, ordered by priority.
  generate_download_urls
  DOWNLOAD_URL=""
  log "INFO" "开始查找可用的下载源..."
  local source
  for source in "${DOWNLOAD_SOURCES[@]}"; do
    if [[ "$source" == /* ]]; then
      # An absolute path is a local file candidate.
      log "INFO" "检查本地文件: $source"
      if [[ -f "$source" ]]; then
        DOWNLOAD_URL="$source"
        log "INFO" "${BLUE}找到本地文件: $DOWNLOAD_URL${NC}"
        break
      fi
      log "WARN" "本地文件不存在: $source"
    else
      # Anything else is treated as a URL. test_network logs its own WARN
      # when the probe fails, so no second warning is emitted here.
      log "INFO" "检查网络源: $source"
      if test_network "$source"; then
        DOWNLOAD_URL="$source"
        log "INFO" "${BLUE}找到可用下载源: $DOWNLOAD_URL${NC}"
        break
      fi
    fi
  done
  if [[ -z "$DOWNLOAD_URL" ]]; then
    log "ERROR" "无法找到可用的下载源"
    log "ERROR" "请检查网络连接或手动下载安装包到${APP_DIR}目录"
    # The last candidate is the official download location.
    error "官网下载地址: ${DOWNLOAD_SOURCES[-1]}"
  fi
  log "INFO" "准备获取驱动包..."
  if [[ "$DOWNLOAD_URL" == /* ]]; then
    # The local file is used in place; no copy is made.
    TEMP_FILE="$DOWNLOAD_URL"
    log "INFO" "使用本地文件: $DOWNLOAD_URL"
  else
    # Download next to the other driver files, following APP_DIR instead
    # of a hard-coded /opt.
    TEMP_FILE="${APP_DIR}/${DRIVER_PACKAGE}"
    log "INFO" "从网络下载: $DOWNLOAD_URL"
    download_file "$DOWNLOAD_URL" "$TEMP_FILE"
  fi
}
#================================
# Driver installation
#================================
# Install distro-specific dependencies, unpack the driver package and run
# the vendor installer for the running kernel.
# Globals:   APP_VERSION, DISTRO, TEMP_FILE, APP_DIR, DRIVER_DIR, BLUE, NC
#            (read); kernel_version (set)
# Returns:   0 on success; calls error() (which exits) for a distro other
#            than ubuntu22.04 / ubuntu24.04
install_driver() {
  log "TITLE" "${BLUE}开始安装驱动: $APP_VERSION${NC}"
  kernel_version=$(uname -r)
  log "INFO" "当前内核版本: $kernel_version"
  # Collect everything that differs between the distros in one place.
  local deps_banner deps_packages deps_label installer_flags installer_label
  case "$DISTRO" in
    "ubuntu22.04")
      deps_banner="安装Ubuntu 22.04依赖包"
      deps_packages="net-tools bzip2"
      deps_label="安装依赖包"
      installer_flags="--without-dkms --add-kernel-support"
      installer_label="执行驱动安装"
      ;;
    "ubuntu24.04")
      # The 24.04 variant additionally installs dkms and does not pass
      # --without-dkms to the installer.
      deps_banner="安装Ubuntu 24.04依赖包"
      deps_packages="net-tools bzip2 dkms"
      deps_label="安装依赖包包含dkms"
      installer_flags="--add-kernel-support"
      installer_label="执行驱动安装Ubuntu 24.04"
      ;;
    *)
      error "不支持的系统版本: $DISTRO"
      ;;
  esac
  # Dependencies
  log "INFO" "$deps_banner"
  run_cmd "apt update" "更新软件包索引"
  run_cmd "apt install -y $deps_packages" "$deps_label"
  # Unpack
  log "INFO" "解压驱动包"
  run_cmd "tar -zxf $TEMP_FILE -C $APP_DIR" "解压驱动包"
  # Vendor installer
  log "INFO" "执行驱动安装"
  run_cmd "cd $DRIVER_DIR && ./mlnxofedinstall $installer_flags --kernel $kernel_version --with-fw-update --force" "$installer_label"
  sleep 10
}
#================================
# Driver removal
#================================
# Run the vendor uninstall script (fetching and unpacking the package first
# if its directory is missing), then stop openibd, empty the persistent
# udev naming rules, remove the nvidia modprobe option file and rebuild the
# initramfs.
# Globals:   APP_VERSION, DRIVER_DIR, TEMP_FILE, APP_DIR, BLUE, NC (read)
# Returns:   0 on success; calls error() (which exits) when the driver
#            directory is still missing after unpacking
uninstall_driver() {
  log "TITLE" "${BLUE}开始卸载驱动: $APP_VERSION${NC}"
  # The uninstall script lives inside the unpacked package; fetch it again
  # if the directory is gone.
  if [[ ! -d "$DRIVER_DIR" ]]; then
    log "WARN" "驱动目录不存在,尝试重新下载和解压"
    download_driver # Reuses the download logic of the install path
    run_cmd "tar -zxf $TEMP_FILE -C $APP_DIR" "解压驱动包"
    if [[ ! -d "$DRIVER_DIR" ]]; then
      error "解压失败,无法找到驱动目录: $DRIVER_DIR"
    fi
    log "INFO" "成功解压驱动包到: $DRIVER_DIR"
  else
    log "INFO" "找到驱动目录: $DRIVER_DIR"
  fi
  # DEBIAN_FRONTEND is given as a command prefix so it is actually placed
  # in uninstall.sh's environment; `VAR=x && cmd` would only set an
  # unexported shell variable that the child never sees.
  # A failing uninstall script is tolerated; the manual cleanup below still
  # runs.
  run_cmd "cd $DRIVER_DIR && DEBIAN_FRONTEND=noninteractive ./uninstall.sh -q --force 2>/dev/null" "执行卸载脚本" \
    || log "WARN" "卸载脚本执行失败,尝试手动清理"
  log "INFO" "清理残留文件"
  log "INFO" "停止并禁用openibd服务"
  run_cmd "systemctl stop openibd.service || true" "停止openibd服务"
  run_cmd "systemctl disable openibd.service || true" "禁用openibd服务"
  log "INFO" "恢复网卡命名规则"
  # Truncate both rules files; failure is deliberately ignored. Truncation
  # produces no output, so nothing is redirected to a log here.
  run_cmd ": > /etc/udev/rules.d/70-persistent-ipoib.rules || true" "清理IPOIB规则"
  run_cmd ": > /etc/udev/rules.d/70-persistent-net.rules || true" "清理网络规则"
  run_cmd "rm -f /etc/modprobe.d/nvidia-gsp.conf" "删除nvidia配置"
  run_cmd "update-initramfs -u" "更新initramfs"
}
#================================
# Persistent IB interface naming rules
#================================
# Back up and clear the persistent udev naming rules, write fresh rules for
# every device that `ibdev2netdev -v` reports on a line containing "400G",
# write the nvidia modprobe option, rebuild the initramfs and restart
# openibd.
# Each matching device (first column of the tool's output) gets an
# infiniband rename rule mlx5_<n> with n counting up from 20, and a net
# rule ib<m> with m counting up from 0.
# Globals:   LOG_FILE, BLUE, NC (read)
# Arguments: $1 - udev rules directory (optional, defaults to
#                 /etc/udev/rules.d)
configure_naming_rules() {
  local rules_dir="${1:-/etc/udev/rules.d}"
  local ipoib_rules="${rules_dir}/70-persistent-ipoib.rules"
  local net_rules="${rules_dir}/70-persistent-net.rules"
  log "TITLE" "${BLUE}配置IB网卡命名规则${NC}"
  # Backups and cleanups are best effort (the files may not exist yet);
  # their output goes to the script's own log file.
  log "INFO" "备份原有规则"
  run_cmd "cp $ipoib_rules ${ipoib_rules}.bak &>> \"$LOG_FILE\" || true" "备份IPOIB规则"
  run_cmd "cp $net_rules ${net_rules}.bak &>> \"$LOG_FILE\" || true" "备份网络规则"
  log "INFO" "清除原有规则"
  # Delete every line that is not a comment.
  run_cmd "sed -i '/^\s*#/!d' $ipoib_rules &>> \"$LOG_FILE\" || true" "清除IPOIB规则"
  run_cmd "sed -i '/^\s*#/!d' $net_rules &>> \"$LOG_FILE\" || true" "清除网络规则"
  # Query the device list once so both rule files are generated from the
  # same snapshot.
  local -a devices=()
  mapfile -t devices < <(ibdev2netdev -v | awk '/400G/ {print $1}')
  if (( ${#devices[@]} == 0 )); then
    log "WARN" "未发现400G IB设备,未生成命名规则"
  fi
  log "INFO" "生成IB设备命名规则"
  log "INFO" "生成网络设备命名规则"
  local dev
  local ib_id=20
  local net_id=0
  for dev in "${devices[@]}"; do
    if [[ -z "$dev" ]]; then
      continue
    fi
    # %%k emits a literal %k in the rule text.
    printf 'ACTION=="add", KERNELS=="%s", SUBSYSTEM=="infiniband",PROGRAM="rdma_rename %%k NAME_FIXED mlx5_%s"\n' \
      "$dev" "$ib_id" >> "$ipoib_rules"
    printf 'SUBSYSTEM=="net", ACTION=="add", KERNELS=="%s", NAME="ib%s"\n' \
      "$dev" "$net_id" >> "$net_rules"
    ib_id=$((ib_id + 1))
    net_id=$((net_id + 1))
  done
  log "INFO" "配置nvidia选项"
  run_cmd "echo \"options nvidia NVreg_EnableGpuFirmware=0\" > /etc/modprobe.d/nvidia-gsp.conf" "写入nvidia配置"
  run_cmd "update-initramfs -u" "更新initramfs"
  log "INFO" "重启openibd服务"
  run_cmd "systemctl restart openibd.service" "重启openibd服务"
  sleep 15
}
#================================
# Installation check
#================================
# Report whether the installation worked, judged by the ibv_devinfo command
# being resolvable.
# Globals:   BLUE, NC (read)
# Returns:   0 when ibv_devinfo is found; otherwise calls error() (exits)
check_installation() {
  log "TITLE" "${BLUE}检查驱动安装结果${NC}"
  # Guard clause: bail out through error() when the command is missing.
  command -v ibv_devinfo > /dev/null 2>&1 || error "驱动安装失败"
  log "SUCCESS" "驱动安装成功"
}
#================================
# Uninstallation check
#================================
# Report whether the removal worked, judged by the ibv_devinfo command no
# longer being resolvable. A leftover command only produces a warning.
# Globals:   BLUE, NC (read)
check_uninstallation() {
  log "TITLE" "${BLUE}检查卸载结果${NC}"
  if command -v ibv_devinfo > /dev/null 2>&1; then
    log "WARN" "驱动命令仍存在,可能需要手动清理"
  else
    log "SUCCESS" "驱动已成功卸载"
  fi
}
#================================
# Usage text
#================================
# Print the usage summary (commands, options, examples) to stdout.
# Globals:   SUPPORTED_DISTROS (read)
show_help() {
  # One printf call; each argument becomes one output line.
  printf '%s\n' \
    "MLNX_OFED驱动管理脚本" \
    "用法: $0 [选项] [命令]" \
    "" \
    "命令:" \
    " --install 安装驱动" \
    " --uninstall 卸载驱动" \
    "" \
    "选项:" \
    " --version <版本号> 指定驱动版本 (必需)" \
    " --distro <系统> 指定系统版本 (支持: ${SUPPORTED_DISTROS[*]}, 必需)" \
    " --force 强制操作" \
    " --help 显示此帮助信息" \
    "" \
    "示例:" \
    " $0 --install --distro ubuntu24.04 --version 5.9-1.0.8.0" \
    " $0 --uninstall --distro ubuntu22.04 --version 5.8-3.0.7.0"
}
#================================
# Entry point
#================================
# Parse the command line, verify root privileges and dispatch to the
# install or uninstall workflow.
# Arguments are parsed before anything is logged or checked, so `--help`
# works for any user and without touching the log file.
# Globals:   COMMAND, APP_VERSION, DISTRO, FORCE (set); BLUE, NC (read)
# Arguments: the script's command-line arguments
#            (--install | --uninstall, --version <v>, --distro <d>,
#             --force, --help)
# Returns:   0 on success; exits 0 after --help; calls error() (which
#            exits) on bad arguments, missing root privileges or a missing
#            command
main() {
  COMMAND=""
  while (( $# > 0 )); do
    case "$1" in
      "--install")
        COMMAND="install"
        ;;
      "--uninstall")
        COMMAND="uninstall"
        ;;
      "--version")
        # ${2:-} keeps `set -u` from aborting with "unbound variable" when
        # the option is the last argument.
        if [[ -z "${2:-}" ]]; then
          error "请指定版本号,如: --version 5.8-3.0.7.0"
        fi
        APP_VERSION="$2"
        shift 1
        ;;
      "--distro")
        if [[ -z "${2:-}" ]]; then
          error "请指定系统版本,如: --distro ubuntu22.04"
        fi
        DISTRO="$2"
        shift 1
        ;;
      "--force")
        FORCE=1
        ;;
      "--help")
        show_help
        exit 0
        ;;
      *)
        error "未知参数: $1"
        ;;
    esac
    shift 1
  done
  log "TITLE" "${BLUE}MLNX驱动管理脚本启动========================================================${NC}"
  # Everything past this point needs root.
  if (( EUID != 0 )); then
    error "此脚本需要root权限运行"
  fi
  if [[ -n "$APP_VERSION" ]]; then
    log "INFO" "设置驱动版本: $APP_VERSION"
  fi
  if [[ -n "$DISTRO" ]]; then
    log "INFO" "设置系统版本: $DISTRO"
  fi
  if [[ -z "$COMMAND" ]]; then
    error "请指定操作: --install 或 --uninstall"
  fi
  # Derives package names/paths; also rejects a missing version or distro.
  generate_package_info
  case "$COMMAND" in
    "install")
      download_driver
      install_driver
      configure_naming_rules
      check_installation
      ;;
    "uninstall")
      uninstall_driver
      check_uninstallation
      ;;
    *)
      error "未知命令: $COMMAND"
      ;;
  esac
  log "TITLE" "========================================================================${NC}"
}
# Run the main function with the script's command-line arguments
main "$@"