#!/bin/bash
#
# Install or uninstall the NVIDIA Fabric Manager .deb package.
#
# Usage:
#   <script> --install   [--version <major>_<full>]
#   <script> --uninstall [--version <major>_<full>] [--force]
#
# Options:
#   --install     Install the package found at PACKAGE_PATH, downloading it
#                 first when it is not already there.
#   --uninstall   Stop and disable the service, then remove the package.
#   --version V   Select another package version, written as <major>_<full>
#                 (for example 565_565.57.01-1).
#   --force       On uninstall, keep going when the package is not installed.
#
# Progress messages go to the terminal and are appended to LOG_FILE together
# with the captured output of every command started through run_cmd.

set -euo pipefail

# Package selection. The values set here are the defaults; --version replaces
# them through set_package_version.
FABRICMANAGER_MAJOR_VERSION=""
FABRICMANAGER_FULL_VERSION=""
PACKAGE_NAME=""
PACKAGE_PATH=""

# Download sources, tried in this order.
readonly INTERNAL_BASE_URL="http://10.101.0.51:5588/nvidia-fabricmanager"
readonly OFFICIAL_BASE_URL="https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64"

LOG_FILE="/var/log/fabricmanager_$(date +%Y%m%d%H%M%S).log"
readonly LOG_FILE

# Terminal colours (expanded with printf %b).
readonly GREEN='\033[1;32m'
readonly RED='\033[1;31m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # reset

# Command-line state filled in by parse_args.
ACTION=""
FORCE=0

# Path of an in-progress download; removed by cleanup if the script exits
# before the file has been renamed to PACKAGE_PATH.
DOWNLOAD_TMP=""

#######################################
# Derive the package name and path from a version pair.
# Globals:   FABRICMANAGER_MAJOR_VERSION, FABRICMANAGER_FULL_VERSION,
#            PACKAGE_NAME, PACKAGE_PATH (all written)
# Arguments: $1 - major version, e.g. 565
#            $2 - full version, e.g. 565.57.01-1
#######################################
set_package_version() {
  FABRICMANAGER_MAJOR_VERSION=$1
  FABRICMANAGER_FULL_VERSION=$2
  PACKAGE_NAME="nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}_${FABRICMANAGER_FULL_VERSION}_amd64.deb"
  PACKAGE_PATH="/opt/${PACKAGE_NAME}"
}
set_package_version "565" "565.57.01-1"

#######################################
# Remove a partially downloaded file, if any. Installed as the EXIT trap.
# Globals:   DOWNLOAD_TMP (read)
#######################################
cleanup() {
  if [[ -n "$DOWNLOAD_TMP" ]]; then
    rm -f -- "$DOWNLOAD_TMP"
  fi
}

#######################################
# Append a timestamped line to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $* - message text
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >>"$LOG_FILE"
}

#######################################
# Print a progress message in green on stdout and log it.
# Arguments: $1 - message text
#######################################
step() {
  local msg="==> $1"
  # %b expands the colour escapes only; %s keeps the message text literal.
  printf '%b%s%b\n' "$GREEN" "$msg" "$NC"
  log "$msg"
}

#######################################
# Print a warning in yellow on stderr and log it.
# Arguments: $1 - message text
#######################################
warning() {
  local msg="警告: $1"
  printf '%b%s%b\n' "$YELLOW" "$msg" "$NC" >&2
  log "$msg"
}

#######################################
# Print an error in red on stderr, log it and exit with status 1.
# Arguments: $1 - message text
#######################################
error() {
  local msg="错误: $1"
  printf '%b%s%b\n' "$RED" "$msg" "$NC" >&2
  log "$msg"
  exit 1
}

#######################################
# Run a command, sending its stdout and stderr to the log file.
# Exits through error() when the command fails.
# Arguments: $@ - command and its arguments, one word each
#######################################
run_cmd() {
  step "执行: $*"
  # The words are executed as given (no eval), so arguments are never
  # re-split or re-interpreted by the shell.
  if ! "$@" &>>"$LOG_FILE"; then
    error "命令执行失败: $*"
  fi
}

#######################################
# Exit through error() unless the named command is available.
# Arguments: $1 - command name
#######################################
check_cmd() {
  command -v "$1" &>/dev/null || error "未找到命令: $1"
}

#######################################
# Parse command-line options.
# Globals:   ACTION, FORCE (written); package globals via set_package_version
# Arguments: $@ - the script's arguments
#######################################
parse_args() {
  local version_re='^([0-9]+)_([0-9]+\.[0-9]+\.[0-9]+-[0-9]+)$'

  while (( $# > 0 )); do
    case "$1" in
      --install)
        ACTION="install"
        shift
        ;;
      --uninstall)
        ACTION="uninstall"
        shift
        ;;
      --version)
        # Check the count before reading $2: with `set -u`, a trailing
        # --version would otherwise abort with a bare "unbound variable".
        (( $# >= 2 )) || error "参数 --version 缺少版本号,格式应为 xxxx_xxxx.xx.xx-x"
        if [[ $2 =~ $version_re ]]; then
          set_package_version "${BASH_REMATCH[1]}" "${BASH_REMATCH[2]}"
        else
          error "版本格式错误,应为 xxxx_xxxx.xx.xx-x"
        fi
        shift 2
        ;;
      --force)
        FORCE=1
        shift
        ;;
      *)
        error "未知参数: $1"
        ;;
    esac
  done

  [[ -n "$ACTION" ]] || error "必须指定 --install 或 --uninstall"
}

#######################################
# Make sure the package file exists at PACKAGE_PATH, downloading it when
# needed. Exits through error() when every source fails.
# Globals:   PACKAGE_PATH, PACKAGE_NAME, INTERNAL_BASE_URL, OFFICIAL_BASE_URL
#            (read); DOWNLOAD_TMP (written)
#######################################
download_package() {
  step "检查安装包: $PACKAGE_PATH"
  # -s rather than -f: an empty file is the residue of a failed download,
  # not a usable package.
  if [[ -s "$PACKAGE_PATH" ]]; then
    step "使用本地安装包"
    return 0
  fi

  step "本地包不存在,开始下载"
  check_cmd wget
  mkdir -p -- "$(dirname -- "$PACKAGE_PATH")"

  local -a urls=(
    "${INTERNAL_BASE_URL}/${PACKAGE_NAME}"
    "${OFFICIAL_BASE_URL}/${PACKAGE_NAME}"
  )
  local url size

  # Download next to the final path so the rename below stays on one
  # filesystem. wget -O creates/truncates its target up front, so writing
  # straight to PACKAGE_PATH would leave a broken file behind on failure.
  DOWNLOAD_TMP=$(mktemp "${PACKAGE_PATH}.XXXXXX") || error "无法创建临时文件"

  for url in "${urls[@]}"; do
    step "尝试从 $url 下载"
    if wget -q -O "$DOWNLOAD_TMP" -- "$url" && [[ -s "$DOWNLOAD_TMP" ]]; then
      # mktemp creates the file with mode 0600; restore an ordinary file mode.
      chmod 0644 -- "$DOWNLOAD_TMP"
      mv -f -- "$DOWNLOAD_TMP" "$PACKAGE_PATH"
      DOWNLOAD_TMP=""
      size=$(du -h -- "$PACKAGE_PATH" | cut -f1)
      step "下载成功,文件大小: $size"
      return 0
    fi
    warning "从 $url 下载失败"
  done

  # The EXIT trap removes the temporary file.
  error "无法下载安装包,请检查网络或手动放置到 $PACKAGE_PATH"
}

#######################################
# Install the package, enable and start its service, then verify that the
# service is active.
# Globals:   FABRICMANAGER_FULL_VERSION, PACKAGE_PATH (read)
#######################################
install_fabricmanager() {
  local -r unit="nvidia-fabricmanager.service"

  step "开始安装 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION}"

  # Check the tools first so a missing dpkg/systemctl is reported before any
  # download is attempted.
  step "检查系统依赖"
  check_cmd dpkg
  check_cmd systemctl

  download_package

  step "安装 NVIDIA Fabric Manager"
  run_cmd dpkg -i "$PACKAGE_PATH"

  step "启动并启用 NVIDIA Fabric Manager 服务"
  run_cmd systemctl enable "$unit" --now

  step "验证 NVIDIA Fabric Manager 服务状态"
  if systemctl is-active --quiet "$unit"; then
    step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} 安装成功并运行中"
  else
    error "NVIDIA Fabric Manager 服务未运行"
  fi
}

#######################################
# Stop and disable the service, remove the package and delete the cached
# package file.
# Globals:   FABRICMANAGER_MAJOR_VERSION, FABRICMANAGER_FULL_VERSION,
#            PACKAGE_PATH, FORCE (read)
#######################################
uninstall_fabricmanager() {
  local -r unit="nvidia-fabricmanager.service"
  local -r pkg="nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}"

  step "开始卸载 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION}"
  check_cmd dpkg
  check_cmd systemctl

  if systemctl is-active --quiet "$unit"; then
    step "停止 NVIDIA Fabric Manager 服务"
    run_cmd systemctl stop "$unit"
  else
    step "NVIDIA Fabric Manager 服务未运行"
  fi

  step "禁用 NVIDIA Fabric Manager 服务"
  # Disabling a unit that has no unit file fails; treating that as fatal
  # would stop the script before the --force handling below is reached.
  if systemctl cat "$unit" &>/dev/null; then
    run_cmd systemctl disable "$unit"
  else
    warning "未找到 ${unit} 服务单元,跳过禁用"
  fi

  step "卸载 NVIDIA Fabric Manager 软件包"
  if dpkg -s "$pkg" &>/dev/null; then
    run_cmd dpkg -r "$pkg"
  else
    warning "未找到 ${pkg} 软件包"
    if (( FORCE == 0 )); then
      error "请使用 --force 参数强制卸载"
    fi
  fi

  step "清理残留文件"
  rm -f -- "$PACKAGE_PATH"

  step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} 卸载完成"
}

#######################################
# Entry point: check privileges, parse options and run the chosen action.
# Arguments: $@ - the script's arguments
#######################################
main() {
  # The log file lives under /var/log and dpkg needs root. Report that
  # directly instead of failing on the first log write. This message is not
  # logged because the log file may not be writable.
  if (( EUID != 0 )); then
    printf '%b%s%b\n' "$RED" "错误: 请以 root 权限运行此脚本" "$NC" >&2
    exit 1
  fi

  trap cleanup EXIT
  parse_args "$@"

  step "开始 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} ${ACTION} 流程"

  case "$ACTION" in
    install)
      install_fabricmanager
      ;;
    uninstall)
      uninstall_fabricmanager
      ;;
    *)
      error "未知操作: $ACTION"
      ;;
  esac

  step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} ${ACTION} 流程完成"
}

main "$@"