ansible-devops/scripts/ib-drive.sh

387 lines
12 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
# Word-split on newline and tab only, never on spaces.
IFS=$'\n\t'
#================================
# Global configuration
#================================
# Pieces of the driver package name (assembled by generate_package_info).
DRIVER_NAME='MLNX_OFED'
APP_VERSION='5.8-6.0.4.2'
DISTRO='ubuntu22.04'
ARCH='x86_64'
# Directory that holds the package archive and its extracted tree.
APP_DIR='/opt'
# Set to 1 by the --force command-line option.
FORCE=0
# Per-run log file; the name carries the script start time.
LOG_FILE="/var/log/mlnx_driver_install_$(date +%Y%m%d%H%M%S).log"
# Color codes, kept as literal backslash text and rendered by `echo -e`.
GREEN='\033[1;32m' # green - success
RED='\033[1;31m' # red - failure/error
BLUE='\033[1;34m' # blue - titles/info
NC='\033[0m' # reset color
#================================
# Logging
#================================
#######################################
# Print a colorized message to stdout and append a plain-text copy to the
# log file.
# Globals:   GREEN, RED, BLUE, NC (read); LOG_FILE (appended to)
# Arguments: $1 - level tag; SUCCESS is tagged green, ERROR red, and every
#                 other level blue
#            $2 - message text; may embed the color variables listed above
# Outputs:   colorized line on stdout;
#            "[timestamp] [level] message" line in LOG_FILE
#######################################
log() {
  local level="$1"
  local message="$2"
  # Declared separately so a failing `date` is not masked by `local`.
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  # Only SUCCESS and ERROR get their own tag color.
  local tag_color
  case "$level" in
    "SUCCESS") tag_color="$GREEN" ;;
    "ERROR") tag_color="$RED" ;;
    *) tag_color="$BLUE" ;;
  esac
  echo -e "${tag_color}[${level}]${NC} $message"

  # Callers embed the color variables in the message itself. They hold
  # literal "\033[...m" text, which would otherwise be written to the log
  # file verbatim, so remove them before logging.
  local plain="$message"
  local code
  for code in "$GREEN" "$RED" "$BLUE" "$NC"; do
    [[ -n "$code" ]] || continue
    # The quoted pattern is matched literally (no glob interpretation of '[').
    plain=${plain//"$code"/}
  done
  echo "[$timestamp] [$level] $plain" >> "$LOG_FILE"
}
#================================
# Error handling
#================================
#######################################
# Log a fatal error plus a pointer to the log file, then exit the script.
# Globals:   LOG_FILE (read)
# Arguments: $1 - error description
# Returns:   never returns; exits with status 1
#######################################
error() {
  local reason="$1"
  local line
  for line in "$reason" "详细日志请查看: $LOG_FILE"; do
    log "ERROR" "$line"
  done
  exit 1
}
#================================
# Command execution
#================================
#######################################
# Log and run a shell command given as a single string.
# The string goes through `eval`, so it may contain redirections, `&&` and
# `||`. Combined stdout/stderr is captured and only reported on failure.
# Arguments: $1 - command string
#            $2 - description used in the log line (optional)
# Returns:   0 if the command succeeded, 1 otherwise
#######################################
run_cmd() {
  local cmd_str="$1"
  local label="${2:-"执行命令"}"
  local captured

  log "INFO" "$label: $cmd_str"
  if captured=$(eval "$cmd_str" 2>&1); then
    return 0
  fi

  log "ERROR" "命令执行失败: $cmd_str"
  log "ERROR" "错误详情: $captured"
  return 1
}
#================================
# Network connectivity probe
#================================
#######################################
# Check whether a URL answers a HEAD request.
# Globals:   LOG_FILE (curl output is appended to it)
# Arguments: $1 - URL to probe
# Returns:   0 if curl succeeded, 1 otherwise
#######################################
test_network() {
  local target="$1"
  log "INFO" "测试网络连接: $target"
  # HEAD request, following redirects, with a 5 second connect timeout.
  if curl -fsSLI --connect-timeout 5 "$target" &>> "$LOG_FILE"; then
    return 0
  fi
  log "WARN" "网络源不可用: $target"
  return 1
}
#================================
# File download
#================================
#######################################
# Download a URL to a local path with wget; abort the script on failure.
# Globals:   LOG_FILE (wget output, including its progress display, is
#            appended to it)
# Arguments: $1 - source URL
#            $2 - destination file path
#######################################
download_file() {
  local src_url="$1"
  local target="$2"
  log "INFO" "开始下载: $src_url"
  wget -q --show-progress -O "$target" "$src_url" &>> "$LOG_FILE" \
    || error "下载失败: $src_url"
  log "INFO" "下载完成: $target"
}
#================================
# Package naming
#================================
#######################################
# Derive the package file name and related paths from the current settings.
# Globals:   DRIVER_NAME, APP_VERSION, DISTRO, ARCH, APP_DIR (read)
#            DRIVER_PACKAGE, PACKAGE_PATH, DRIVER_DIR (written)
#######################################
generate_package_info() {
  # Shared stem of the archive name and of the directory it extracts to.
  local stem="${DRIVER_NAME}_LINUX-${APP_VERSION}-${DISTRO}-${ARCH}"
  DRIVER_PACKAGE="${stem}.tgz"
  DRIVER_DIR="${APP_DIR}/${stem}"
  PACKAGE_PATH="${APP_DIR}/${DRIVER_PACKAGE}"
  log "INFO" "生成包信息: $DRIVER_PACKAGE"
}
#================================
# Driver package retrieval
#================================
#######################################
# Locate the driver package (local copy first, then the mirrors in priority
# order), place it at /tmp/<package>, and sanity-check its size.
# Globals:   PACKAGE_PATH, DRIVER_PACKAGE, BLUE, NC (read)
#            DOWNLOAD_SOURCES, DOWNLOAD_URL, TEMP_FILE (written)
# Returns:   0 on success; exits through error() when no source is usable,
#            the download fails, or the file is smaller than 10 MiB
#######################################
download_driver() {
  log "TITLE" "${BLUE}开始获取驱动安装包${NC}"

  # Candidate sources, highest priority first.
  DOWNLOAD_SOURCES=(
    "${PACKAGE_PATH}" # local file
    "http://10.101.0.51:5588/mlnx-ofed/${DRIVER_PACKAGE}" # internal mirror 1
    "http://10.102.32.207:5588/mlnx-ofed/${DRIVER_PACKAGE}" # internal mirror 2
    "https://www.mellanox.com/downloads/ofed/${DRIVER_PACKAGE}" # public source
  )

  # Pick the first usable source.
  DOWNLOAD_URL=""
  log "INFO" "开始查找可用的下载源..."
  local candidate
  for candidate in "${DOWNLOAD_SOURCES[@]}"; do
    if [[ "$candidate" == /* ]]; then
      # Absolute path: treat as a local file.
      log "INFO" "检查本地文件: $candidate"
      if [[ -f "$candidate" ]]; then
        DOWNLOAD_URL="$candidate"
        log "INFO" "${BLUE}找到本地文件: $DOWNLOAD_URL${NC}"
        break
      fi
      log "WARN" "本地文件不存在: $candidate"
    elif test_network "$candidate"; then
      # Reachable URL; unreachable ones are skipped.
      DOWNLOAD_URL="$candidate"
      log "INFO" "${BLUE}找到可用下载源: $DOWNLOAD_URL${NC}"
      break
    fi
  done

  if [[ -z "$DOWNLOAD_URL" ]]; then
    log "ERROR" "无法找到可用的下载源"
    log "ERROR" "请检查网络连接或手动下载安装包到/opt目录"
    error "下载地址: https://www.mellanox.com/downloads/ofed"
  fi

  # Fetch the package into the temporary location.
  TEMP_FILE="/tmp/${DRIVER_PACKAGE}"
  log "INFO" "准备获取驱动包..."
  if [[ "$DOWNLOAD_URL" == /* ]]; then
    log "INFO" "使用本地文件: $DOWNLOAD_URL"
    # run_cmd re-parses its argument with eval, so shell-quote both paths;
    # otherwise a path containing whitespace is split into several words.
    local cp_cmd
    printf -v cp_cmd 'cp %q %q' "$DOWNLOAD_URL" "$TEMP_FILE"
    run_cmd "$cp_cmd" "复制本地文件到临时目录"
  else
    log "INFO" "从网络下载: $DOWNLOAD_URL"
    download_file "$DOWNLOAD_URL" "$TEMP_FILE"
  fi

  # Size check: anything under 10 MiB is treated as a broken package.
  log "INFO" "验证下载文件的完整性..."
  local file_size
  file_size=$(stat -c%s "$TEMP_FILE") || error "无法读取文件大小: $TEMP_FILE"
  if ((file_size < 10485760)); then
    log "ERROR" "下载的文件大小异常: $file_size 字节"
    log "ERROR" "请检查网络连接或下载源的可用性"
    error "建议手动下载后放置到/opt目录"
  fi
  log "SUCCESS" "文件完整性验证通过: $file_size 字节"
  return 0
}
#================================
# Remaining functions
#================================
#######################################
# Install the driver: install dependencies, unpack the package, and run the
# vendor installer against the running kernel.
# Globals:   APP_VERSION, TEMP_FILE, APP_DIR, DRIVER_DIR, BLUE, NC (read)
#            kernel_version (written)
#######################################
install_driver() {
  log "TITLE" "${BLUE}开始安装驱动: $APP_VERSION${NC}"

  kernel_version=$(uname -r)
  log "INFO" "当前内核版本: $kernel_version"

  log "INFO" "安装依赖包"
  run_cmd "apt update" "更新软件包索引"
  run_cmd "apt install -y net-tools bzip2" "安装依赖包"

  log "INFO" "解压驱动包"
  run_cmd "tar -zxf $TEMP_FILE -C $APP_DIR" "解压驱动包"

  # Assemble the installer invocation piece by piece for readability.
  local installer_cmd="cd $DRIVER_DIR && ./mlnxofedinstall"
  installer_cmd+=" --without-dkms --add-kernel-support"
  installer_cmd+=" --kernel $kernel_version --with-fw-update --force"

  log "INFO" "执行驱动安装"
  run_cmd "$installer_cmd" "执行驱动安装"

  # Fixed pause after the installer finishes.
  sleep 10
}
#######################################
# Uninstall the driver: run the vendor uninstall script (re-fetching the
# package first if its directory is gone), then remove leftover files,
# services, udev naming rules and the nvidia modprobe option.
# Globals:   APP_VERSION, DRIVER_DIR, APP_DIR, BLUE, NC (read)
#            TEMP_FILE, DRIVER_PACKAGE (read; both may be unset)
#######################################
uninstall_driver() {
  log "TITLE" "${BLUE}开始卸载驱动: $APP_VERSION${NC}"

  # The uninstall script lives in the extracted package directory; fetch
  # and unpack the package again if that directory is missing.
  if [[ -d "$DRIVER_DIR" ]]; then
    log "INFO" "找到驱动目录: $DRIVER_DIR"
  else
    log "WARN" "驱动目录不存在,尝试重新下载和解压"
    download_driver # reuse the install-time download logic
    run_cmd "tar -zxf $TEMP_FILE -C $APP_DIR" "解压驱动包"
    if [[ ! -d "$DRIVER_DIR" ]]; then
      error "解压失败,无法找到驱动目录: $DRIVER_DIR"
    fi
    log "INFO" "成功解压驱动包到: $DRIVER_DIR"
  fi

  # Best effort: a failing uninstall script must not stop the cleanup below.
  run_cmd "cd $DRIVER_DIR && ./uninstall.sh -q -y" "执行卸载脚本" \
    || log "WARN" "卸载脚本执行失败,尝试手动清理"

  log "INFO" "清理残留文件"
  # TEMP_FILE is only set by download_driver, which is skipped when the
  # driver directory already exists. Referencing it unguarded would abort
  # under `set -u`, so fall back to the path download_driver would have used.
  local temp_file="${TEMP_FILE:-}"
  if [[ -z "$temp_file" && -n "${DRIVER_PACKAGE:-}" ]]; then
    temp_file="/tmp/${DRIVER_PACKAGE}"
  fi
  local cleanup_targets="$DRIVER_DIR"
  if [[ -n "$temp_file" ]]; then
    cleanup_targets+=" $temp_file"
  fi
  run_cmd "rm -rf $cleanup_targets" "删除驱动目录和临时文件"

  log "INFO" "停止并禁用openibd服务"
  run_cmd "systemctl stop openibd.service || true" "停止openibd服务"
  run_cmd "systemctl disable openibd.service || true" "禁用openibd服务"

  log "INFO" "恢复网卡命名规则"
  # Keep only comment lines in the rules files. `||true` (a logical OR, not
  # a pipe into `true`) makes a missing rules file non-fatal under pipefail.
  run_cmd "sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-ipoib.rules &>> /tmp/mlnx_install.log||true" "清理IPOIB规则"
  run_cmd "sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-net.rules &>> /tmp/mlnx_install.log||true" "清理网络规则"
  run_cmd "rm -f /etc/modprobe.d/nvidia-gsp.conf" "删除nvidia配置"
  run_cmd "update-initramfs -u" "更新initramfs"
}
#######################################
# Rewrite the udev naming rules for the IB devices, write the nvidia
# modprobe option, rebuild the initramfs and restart openibd.
# Devices are the first column of `ibdev2netdev -v`, skipping lines that
# mention 200G or 25G.
# Globals:   BLUE, NC (read); ID, IDS, i, j (written)
#######################################
configure_naming_rules() {
  local ipoib_rules="/etc/udev/rules.d/70-persistent-ipoib.rules"
  local net_rules="/etc/udev/rules.d/70-persistent-net.rules"
  local redirect="&>> /tmp/mlnx_install.log|| true"

  log "TITLE" "${BLUE}配置IB网卡命名规则${NC}"

  log "INFO" "备份原有规则"
  run_cmd "cp ${ipoib_rules} ${ipoib_rules}.bak ${redirect}" "备份IPOIB规则"
  run_cmd "cp ${net_rules} ${net_rules}.bak ${redirect}" "备份网络规则"

  # Drop every non-comment line from both rules files.
  log "INFO" "清除原有规则"
  run_cmd "sed -i '/^\s*#/!d' ${ipoib_rules} ${redirect}" "清除IPOIB规则"
  run_cmd "sed -i '/^\s*#/!d' ${net_rules} ${redirect}" "清除网络规则"

  # InfiniBand device names: mlx5_20, mlx5_21, ...
  log "INFO" "生成IB设备命名规则"
  ID=20
  for i in $(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}'); do
    [[ -n "$i" ]] || continue
    printf '%s\n' "ACTION==\"add\", KERNELS==\"$i\", SUBSYSTEM==\"infiniband\",PROGRAM=\"rdma_rename %k NAME_FIXED mlx5_$ID\"" >> "$ipoib_rules"
    ID=$((ID+1))
  done

  # Network interface names: ib0, ib1, ...
  log "INFO" "生成网络设备命名规则"
  IDS=0
  for j in $(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}'); do
    [[ -n "$j" ]] || continue
    printf '%s\n' "SUBSYSTEM==\"net\", ACTION==\"add\", KERNELS==\"$j\", NAME=\"ib$IDS\"" >> "$net_rules"
    IDS=$((IDS+1))
  done

  log "INFO" "配置nvidia选项"
  run_cmd 'echo "options nvidia NVreg_EnableGpuFirmware=0" > /etc/modprobe.d/nvidia-gsp.conf' "写入nvidia配置"
  run_cmd "update-initramfs -u" "更新initramfs"

  log "INFO" "重启openibd服务"
  run_cmd "systemctl restart openibd.service" "重启openibd服务"
  # Fixed pause after the service restart.
  sleep 15
}
#######################################
# Verify the installation: ibv_devinfo must be on PATH, then count the
# devices whose udev name contains "mlx5_" or whose `ip link` output
# matches "ib[0-9]".
# Globals:   BLUE, NC (read)
#            valid_count, dev, mlx_name, net_name (written)
# Returns:   exits through error() when ibv_devinfo is not found
#######################################
check_installation() {
  log "TITLE" "${BLUE}检查驱动安装结果${NC}"
  if ! command -v ibv_devinfo &> /dev/null; then
    error "驱动安装失败"
  fi
  log "SUCCESS" "驱动安装成功"

  log "INFO" "检查网卡命名规则"
  valid_count=0
  for dev in $(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}'); do
    [[ -n "$dev" ]] || continue
    # `|| true` keeps a non-matching grep from tripping errexit/pipefail.
    mlx_name=$(udevadm info -q name -n "$dev" 2>/dev/null | grep "mlx5_" || true)
    net_name=$(ip link show "$dev" | grep "ib[0-9]" || true)
    if [[ -z "$mlx_name" && -z "$net_name" ]]; then
      log "WARN" "网卡 $dev 命名规则未生效"
      continue
    fi
    valid_count=$((valid_count+1))
  done

  if [[ "$valid_count" -gt 0 ]]; then
    log "SUCCESS" "网卡命名规则生效,成功配置 $valid_count 个网卡"
  else
    log "WARN" "所有网卡命名规则均未生效,建议重启系统手工配置!"
  fi
}
#######################################
# Report whether the uninstall left anything behind: the ibv_devinfo
# command and the extracted driver directory.
# Globals:   DRIVER_DIR, BLUE, NC (read)
#######################################
check_uninstallation() {
  log "TITLE" "${BLUE}检查卸载结果${NC}"

  if command -v ibv_devinfo &> /dev/null; then
    log "WARN" "驱动命令仍存在,可能需要手动清理"
  else
    log "SUCCESS" "驱动已成功卸载"
  fi

  if [[ -d "$DRIVER_DIR" ]]; then
    log "WARN" "驱动目录未完全删除: $DRIVER_DIR"
  else
    log "SUCCESS" "驱动目录已删除"
  fi
}
#######################################
# Script driver: check for root, parse the command line, run the requested
# action.
# Usage:     [--force] [--version <ver>] (--install | --uninstall)
#            Options are read left to right up to the first action;
#            anything after the action is ignored.
# Globals:   BLUE, NC (read); APP_VERSION, FORCE (written)
# Returns:   exits with status 1 on a usage error or when not run as root
#######################################
main() {
  log "TITLE" "${BLUE}MLNX驱动管理脚本启动${NC}"

  # Root is required.
  if [[ $EUID -ne 0 ]]; then
    error "此脚本需要root权限运行"
  fi

  # Package names for the default version; regenerated if --version is given.
  generate_package_info

  # Consume options in a loop rather than by recursion, so the start banner,
  # the root check and the completion message each happen exactly once.
  local action=""
  while [[ $# -gt 0 && -z "$action" ]]; do
    case "$1" in
      "--install" | "--uninstall")
        action="$1"
        ;;
      "--version")
        # ${2:-} keeps a missing value from aborting with "unbound variable"
        # under `set -u` before the usage error can be shown.
        if [[ -z "${2:-}" ]]; then
          error "请指定版本号,如: --version 5.8-6.0.4.2"
        fi
        APP_VERSION="$2"
        generate_package_info
        log "INFO" "设置驱动版本: $APP_VERSION"
        shift
        ;;
      "--force")
        FORCE=1
        ;;
      *)
        error "未知参数: $1"
        ;;
    esac
    shift
  done

  if [[ -z "$action" ]]; then
    log "ERROR" "请指定操作: --install 或 --uninstall"
    exit 1
  fi

  case "$action" in
    "--install")
      download_driver
      install_driver
      configure_naming_rules
      check_installation
      ;;
    "--uninstall")
      uninstall_driver
      check_uninstallation
      ;;
  esac

  log "TITLE" "${BLUE}操作完成!${NC}"
}
# Script entry point: pass all command-line arguments through to main.
main "$@"