ansible-devops/scripts/ib-drive.sh

261 lines
8.0 KiB
Bash
Raw Normal View History

2025-07-05 15:49:53 +08:00
#!/bin/bash
# Abort as soon as any unguarded command returns a non-zero status.
set -o errexit
# Colour codes for terminal output. They are stored as literal backslash
# sequences (not real escape bytes), so whatever prints them must interpret
# backslash escapes.
NC="\033[0m" # reset colour
YELLOW="\033[1;33m"
RED="\033[1;31m"
GREEN="\033[1;32m"
# Coloured logging helpers. Each one takes the message as $1 and writes a
# single line to stdout: colour code, level tag, message, colour reset.
# Backslash escapes in the whole line (including the message) are interpreted.

# log_info MESSAGE -- print "[INFO] MESSAGE" using the GREEN colour code.
log_info() {
  printf '%b\n' "${GREEN}[INFO] $1${NC}"
}

# log_error MESSAGE -- print "[ERROR] MESSAGE" using the RED colour code,
# then terminate the current shell with exit status 1.
log_error() {
  printf '%b\n' "${RED}[ERROR] $1${NC}"
  exit 1
}

# log_warning MESSAGE -- print "[WARNING] MESSAGE" using the YELLOW colour code.
log_warning() {
  printf '%b\n' "${YELLOW}[WARNING] $1${NC}"
}
# Default settings. parse_args overrides ACTION, DRIVER_VERSION and FORCE
# from the command line; DISTRO and ARCH are only ever set here.
ARCH='x86_64'
DISTRO='ubuntu22.04'
DRIVER_VERSION='5.8-6.0.4.2'
# Requested operation ("install" or "uninstall"); empty until parsed.
ACTION=''
# Set to 1 by --force. The only reader in this file is a commented-out
# check inside install_driver, so the flag currently has no effect.
FORCE=0
# Derive the package name, local paths and download URLs from the current
# DRIVER_VERSION, DISTRO and ARCH.
# Globals read:    DRIVER_VERSION, DISTRO, ARCH
# Globals written: DRIVER_PACKAGE, PACKAGE_PATH, DRIVER_DIR,
#                  INTERNAL_URL, OFFICIAL_URL
generate_package_info() {
  # Every derived value shares this "<product>-<version>-<distro>-<arch>" stem.
  local bundle_name="MLNX_OFED_LINUX-${DRIVER_VERSION}-${DISTRO}-${ARCH}"

  DRIVER_DIR="/opt/${bundle_name}"
  DRIVER_PACKAGE="${bundle_name}.tgz"
  PACKAGE_PATH="/opt/${DRIVER_PACKAGE}"
  INTERNAL_URL="http://10.101.0.51:5588/mlnx-ofed/${DRIVER_PACKAGE}"
  # Placeholder host: replace with the real vendor download address.
  OFFICIAL_URL="https://example.com/${DRIVER_PACKAGE}"
}
# Parse the command-line options.
# Arguments: the script's arguments; recognised options are
#   --install            select the install operation
#   --uninstall          select the uninstall operation
#   --version VERSION    override DRIVER_VERSION and regenerate derived paths
#   --force              set FORCE=1
# Globals written: ACTION, DRIVER_VERSION, FORCE (plus everything
#                  generate_package_info writes when --version is given)
# Exits (through log_error) on an unknown option, on a missing --version
# value, or when neither --install nor --uninstall was given.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --install)
        ACTION="install"
        shift
        ;;
      --uninstall)
        ACTION="uninstall"
        shift
        ;;
      --version)
        # A value that is absent, empty, or itself looks like an option
        # (e.g. "--version --force") means the version was left out; without
        # this check the next option would be swallowed as the version.
        if [[ -z "${2-}" || "${2-}" == -* ]]; then
          log_error "请指定版本号,如: --version 5.8-6.0.4.2"
        fi
        DRIVER_VERSION="$2"
        # Paths and URLs embed the version, so they must be rebuilt.
        generate_package_info
        shift 2
        ;;
      --force)
        FORCE=1
        shift
        ;;
      *)
        log_error "未知参数: $1"
        ;;
    esac
  done

  if [[ -z "$ACTION" ]]; then
    log_error "请指定操作: --install 或 --uninstall"
  fi
}
# Make sure the driver package is present at PACKAGE_PATH.
# A non-empty local file is reused as is. Otherwise the package is fetched
# from INTERNAL_URL, falling back to OFFICIAL_URL.
# Globals read: DRIVER_PACKAGE, PACKAGE_PATH, INTERNAL_URL, OFFICIAL_URL
# Exits (through log_error) when both downloads fail.
download_driver() {
  # wget -O creates its output file before any data arrives, so a failed
  # transfer leaves an empty or truncated file behind. Downloading to a
  # side file and renaming only on success keeps such a leftover from being
  # mistaken for a valid local package on the next run.
  local partial_path="${PACKAGE_PATH}.part"

  log_info "开始下载驱动包: $DRIVER_PACKAGE"

  # -s rather than -f: a zero-byte leftover is not a usable package.
  if [[ -s "$PACKAGE_PATH" ]]; then
    log_info "使用本地驱动包: $PACKAGE_PATH"
    return 0
  fi

  log_info "本地包不存在,尝试从内网下载"
  if wget -q -O "$partial_path" "$INTERNAL_URL"; then
    mv -f -- "$partial_path" "$PACKAGE_PATH"
    log_info "内网下载成功"
    return 0
  fi

  log_warning "内网下载失败,尝试从官网下载"
  if wget -q -O "$partial_path" "$OFFICIAL_URL"; then
    mv -f -- "$partial_path" "$PACKAGE_PATH"
    log_info "官网下载成功"
    return 0
  fi

  rm -f -- "$partial_path"
  log_error "驱动包下载失败,请手动放置到 /opt/"
}
# Install the driver from the package at PACKAGE_PATH.
# Steps: install apt prerequisites, unpack the package under /opt/, then run
# the bundled mlnxofedinstall for the running kernel.
# Globals read: DRIVER_VERSION, PACKAGE_PATH, DRIVER_DIR
# Side effects: changes the working directory to DRIVER_DIR; appends command
#               output to /tmp/mlnx_install.log.
# Exits (through log_error) when any step fails.
install_driver() {
  local log_file="/tmp/mlnx_install.log"
  local kernel_version

  log_info "开始安装驱动: $DRIVER_VERSION"
  # Disabled "already installed" short-circuit, kept for reference. While it
  # stays commented out, the FORCE flag is not consulted here.
  #if command -v ibv_devinfo &> /dev/null && [[ $FORCE -eq 0 ]]; then
  # log_warning "检测到驱动已安装,使用 --force 覆盖安装"
  # exit 0
  #fi
  kernel_version=$(uname -r)
  log_info "当前内核版本: $kernel_version"

  # The output of the commands below goes to the log file only, so each
  # failure is reported explicitly; otherwise the script would stop without
  # printing anything.
  log_info "安装依赖包"
  apt update &>> "$log_file" \
    || log_error "apt update 失败,详见 $log_file"
  apt install -y net-tools bzip2 &>> "$log_file" \
    || log_error "依赖包安装失败,详见 $log_file"

  log_info "解压驱动包"
  tar -zxf "$PACKAGE_PATH" -C /opt/ \
    || log_error "驱动包解压失败: $PACKAGE_PATH"

  log_info "执行驱动安装"
  # The installer is invoked by relative path, hence the cd.
  cd "$DRIVER_DIR" || log_error "无法进入驱动目录: $DRIVER_DIR"
  ./mlnxofedinstall --without-dkms --add-kernel-support --kernel "$kernel_version" --with-fw-update --force &>> "$log_file" \
    || log_error "驱动安装脚本执行失败,详见 $log_file"

  # Fixed pause after the installer returns.
  # NOTE(review): the reason is not stated in the source -- presumably to let
  # the freshly installed modules settle; confirm before changing.
  sleep 10
}
# Uninstall the driver and undo the configuration written at install time.
# If the unpacked driver directory is gone, the package is downloaded and
# unpacked again because the uninstall script lives inside it.
# Globals read: DRIVER_VERSION, DRIVER_DIR, PACKAGE_PATH
# Side effects: removes DRIVER_DIR and PACKAGE_PATH, stops and disables
#               openibd, strips every non-comment line from the two udev
#               rule files, removes the nvidia modprobe option file, and
#               regenerates the initramfs.
# Exits (through log_error) when the driver directory cannot be obtained.
uninstall_driver() {
  local log_file="/tmp/mlnx_install.log"
  local rules_file

  log_info "开始卸载驱动: $DRIVER_VERSION"

  if [[ -d "$DRIVER_DIR" ]]; then
    log_info "找到驱动目录: $DRIVER_DIR"
  else
    log_warning "驱动目录不存在,尝试重新下载和解压"
    download_driver # same download logic as the install path
    log_info "解压驱动包"
    tar -zxf "$PACKAGE_PATH" -C /opt/ \
      || log_error "解压失败,无法找到驱动目录: $DRIVER_DIR"
    if [[ ! -d "$DRIVER_DIR" ]]; then
      log_error "解压失败,无法找到驱动目录: $DRIVER_DIR"
    fi
    log_info "成功解压驱动包到: $DRIVER_DIR"
  fi

  log_info "执行卸载脚本"
  # Run from inside the driver directory, but in a subshell: the directory is
  # deleted just below and the rest of this function should not be left
  # running in a removed working directory. A failing uninstall script is a
  # warning only; the manual cleanup that follows still runs.
  ( cd "$DRIVER_DIR" && ./uninstall.sh -q -y ) &>> "$log_file" \
    || log_warning "卸载脚本执行失败,尝试手动清理"

  log_info "清理残留文件"
  # ${var:?} aborts instead of handing an empty path to rm -rf.
  rm -rf -- "${DRIVER_DIR:?}" "${PACKAGE_PATH:?}"

  log_info "停止并禁用openibd服务"
  systemctl stop openibd.service &>> "$log_file" || true
  systemctl disable openibd.service &>> "$log_file" || true

  log_info "恢复网卡命名规则"
  # Keep only comment lines in each rules file. The files may not exist
  # (e.g. install never got as far as writing them); sed -i on a missing
  # file fails and would abort the uninstall before the initramfs refresh.
  for rules_file in \
    /etc/udev/rules.d/70-persistent-ipoib.rules \
    /etc/udev/rules.d/70-persistent-net.rules; do
    if [[ -f "$rules_file" ]]; then
      sed -i '/^\s*#/!d' "$rules_file"
    fi
  done
  rm -f /etc/modprobe.d/nvidia-gsp.conf
  update-initramfs -u &>> "$log_file"
}
# Write persistent udev naming rules for the detected IB devices, set the
# nvidia module option, refresh the initramfs and restart openibd.
# Arguments (both optional, for callers that need another location):
#   $1 - udev rules directory   (default: /etc/udev/rules.d)
#   $2 - modprobe.d directory   (default: /etc/modprobe.d)
# Devices are taken from the first column of `ibdev2netdev -v`, skipping any
# line that contains "200G" or "25G". The n-th remaining device (n from 0)
# gets the RDMA name mlx5_<20+n> and the network name ib<n>.
# Side effects: backs up each rules file to <file>.bak, strips its
#               non-comment lines, appends the new rules, overwrites
#               nvidia-gsp.conf, appends to /tmp/mlnx_install.log.
configure_naming_rules() {
  local rules_dir="${1:-/etc/udev/rules.d}"
  local modprobe_dir="${2:-/etc/modprobe.d}"
  local ipoib_rules="${rules_dir}/70-persistent-ipoib.rules"
  local net_rules="${rules_dir}/70-persistent-net.rules"
  local log_file="/tmp/mlnx_install.log"
  local -a devices=()
  local device rdma_index net_index

  log_info "配置IB网卡命名规则"

  # Best effort: either rules file may not exist yet.
  log_info "备份原有规则"
  cp "$ipoib_rules" "${ipoib_rules}.bak" &>> "$log_file" || true
  cp "$net_rules" "${net_rules}.bak" &>> "$log_file" || true

  log_info "清除原有规则"
  sed -i '/^\s*#/!d' "$ipoib_rules" &>> "$log_file" || true
  sed -i '/^\s*#/!d' "$net_rules" &>> "$log_file" || true

  # Query the device list once and reuse it for both rule files, so the two
  # files are always numbered from the same snapshot.
  mapfile -t devices < <(
    ibdev2netdev -v | grep -Ev '200G|25G' | awk '{print $1}'
  )
  if (( ${#devices[@]} == 0 )); then
    log_warning "未检测到可配置的IB设备,未生成任何命名规则"
  fi

  log_info "生成IB设备命名规则"
  rdma_index=20
  for device in "${devices[@]}"; do
    [[ -n "$device" ]] || continue
    # %%k is a literal "%k" for udev, not a printf conversion.
    printf 'ACTION=="add", KERNELS=="%s", SUBSYSTEM=="infiniband",PROGRAM="rdma_rename %%k NAME_FIXED mlx5_%s"\n' \
      "$device" "$rdma_index" >> "$ipoib_rules"
    rdma_index=$((rdma_index + 1))
  done

  log_info "生成网络设备命名规则"
  net_index=0
  for device in "${devices[@]}"; do
    [[ -n "$device" ]] || continue
    printf 'SUBSYSTEM=="net", ACTION=="add", KERNELS=="%s", NAME="ib%s"\n' \
      "$device" "$net_index" >> "$net_rules"
    net_index=$((net_index + 1))
  done

  log_info "配置nvidia选项"
  echo "options nvidia NVreg_EnableGpuFirmware=0" > "${modprobe_dir}/nvidia-gsp.conf"
  # Output is only in the log file, so report a failure explicitly.
  update-initramfs -u &>> "$log_file" \
    || log_error "update-initramfs 执行失败,详见 $log_file"

  log_info "重启openibd服务"
  systemctl restart openibd.service
  # Fixed pause after the restart.
  # NOTE(review): presumably gives the devices time to reappear under their
  # new names before check_installation runs -- confirm.
  sleep 15
}
# Verify the installation: the ibv_devinfo command must be available, and the
# device names reported by `ibdev2netdev -v` are checked against the naming
# rules. Lines containing "200G" or "25G" are skipped, matching the filter
# used when the rules are generated.
# Exits (through log_error) when ibv_devinfo is not found; naming problems
# only produce warnings.
check_installation() {
  local pci_addr rdma_name net_name
  local valid_count=0

  log_info "检查驱动安装结果"
  if command -v ibv_devinfo &> /dev/null; then
    log_info "驱动安装成功"
  else
    log_error "驱动安装失败"
  fi

  log_info "检查网卡命名规则"
  # Column 1 of `ibdev2netdev -v` is the value the rules match with
  # KERNELS== (a bus address), which `ip link show` and `udevadm info -n`
  # cannot look up. The current names are therefore read from the same line:
  # column 2 is the RDMA device name and the token after "==>" is the
  # network interface name.
  while read -r pci_addr rdma_name net_name; do
    [[ -n "$pci_addr" ]] || continue
    # NOTE(review): "mlx5_" also matches the default, un-renamed device names
    # (mlx5_0, mlx5_1, ...); tighten this if only rule-assigned names should
    # count as "in effect".
    if [[ "$rdma_name" == *mlx5_* || "$net_name" =~ ib[0-9] ]]; then
      valid_count=$((valid_count + 1))
    else
      log_warning "网卡 $pci_addr 命名规则未生效"
    fi
  done < <(
    ibdev2netdev -v \
      | grep -Ev '200G|25G' \
      | awk '{
          net = ""
          for (i = 1; i < NF; i++) if ($i == "==>") net = $(i + 1)
          print $1, $2, net
        }'
  )

  if (( valid_count > 0 )); then
    log_info "网卡命名规则生效,成功配置 $valid_count 个网卡"
  else
    log_warning "所有网卡命名规则均未生效,建议重启系统手工配置!"
  fi
}
# Report whether the uninstall left anything behind. Two independent checks
# are made, and each one logs either a confirmation or a warning:
#   1. is the ibv_devinfo command still resolvable?
#   2. does the DRIVER_DIR directory still exist?
# Globals read: DRIVER_DIR
check_uninstallation() {
  log_info "检查卸载结果"

  if command -v ibv_devinfo > /dev/null 2>&1; then
    log_warning "驱动命令仍存在,可能需要手动清理"
  else
    log_info "驱动已成功卸载"
  fi

  if [[ -d "$DRIVER_DIR" ]]; then
    log_warning "驱动目录未完全删除: $DRIVER_DIR"
  else
    log_info "驱动目录已删除"
  fi
}
# Entry point: build the default package info, parse the command line, then
# run the steps that belong to the selected action in order.
# Arguments: the script's command-line arguments, passed on to parse_args.
main() {
  local -a pipeline=()
  local stage action_label

  generate_package_info
  parse_args "$@"

  action_label="${ACTION^^}"
  log_info "开始执行 ${action_label} 操作,版本: $DRIVER_VERSION"

  # Choose the ordered list of steps; any other ACTION value runs none.
  if [[ "$ACTION" == "install" ]]; then
    pipeline=(
      download_driver
      install_driver
      configure_naming_rules
      check_installation
    )
  elif [[ "$ACTION" == "uninstall" ]]; then
    pipeline=(
      uninstall_driver
      check_uninstallation
    )
  fi

  for stage in "${pipeline[@]}"; do
    "$stage"
  done

  log_info "操作完成!"
}
# Script entry: run main with all command-line arguments passed through.
main "$@"