#!/bin/bash
#
# Install or uninstall the NVIDIA Fabric Manager .deb package on Ubuntu.
#
# Usage:
#   <script> --install   --distro <ubuntu22.04|ubuntu24.04> --version <ver>
#   <script> --uninstall --distro <ubuntu22.04|ubuntu24.04> --version <ver>
#
# <ver> is the version part of the package file name, e.g. 570_570.124.06-1
# (the text before the first "_" is used as the package-name suffix on
# uninstall). The .deb is downloaded into the current working directory.
set -euo pipefail

# ANSI color sequences (interpreted by `echo -e`).
readonly GREEN='\033[1;32m'
readonly RED='\033[1;31m'
readonly BLUE='\033[1;34m'
readonly NC='\033[0m'

# Settings filled in by parse_args / validate_and_resolve.
ACTION=""
DISTRO=""
VERSION=""
PKG_NAME=""
DOWNLOAD_URL=""
MAJOR_VER=""

# Logging helpers. log_error terminates the script with status 1.
log_info() { echo -e "${BLUE}[INFO]${NC} $*"; }
log_success() { echo -e "${GREEN}[SUCCESS]${NC} $*"; }
log_error() {
  echo -e "${RED}[ERROR]${NC} $*"
  exit 1
}

#######################################
# Print an error plus an optional troubleshooting command, then exit 1.
# The hint must be printed BEFORE exiting; calling log_error first would
# terminate the script and the hint would never be shown.
# Arguments:
#   $1 - error message
#   $2 - troubleshooting command to show the user (optional, may be empty)
#######################################
fail_with_hint() {
  local error_msg="$1"
  local debug_cmd="${2:-}"

  echo -e "${RED}[ERROR]${NC} ${error_msg}"
  if [[ -n "$debug_cmd" ]]; then
    echo -e "${RED}排查命令:${NC} $debug_cmd"
  fi
  exit 1
}

#######################################
# Parse command-line options into ACTION, DISTRO and VERSION.
# Globals:   ACTION, DISTRO, VERSION (written)
# Arguments: the script's positional parameters
#######################################
parse_args() {
  while (($# > 0)); do
    case "$1" in
      --install)
        ACTION="install"
        shift
        ;;
      --uninstall)
        ACTION="uninstall"
        shift
        ;;
      --distro | --version)
        # Without this check a trailing option with no value aborts under
        # `set -u` with a bare "unbound variable" error.
        if (($# < 2)); then
          log_error "参数 $1 缺少取值"
        fi
        if [[ "$1" == "--distro" ]]; then
          DISTRO="$2"
        else
          VERSION="$2"
        fi
        shift 2
        ;;
      *)
        log_error "未知参数: $1,仅支持 --install --uninstall --distro --version"
        ;;
    esac
  done
}

#######################################
# Check that all required options are present and valid, then derive the
# package file name, download URL and major version.
# Globals:   ACTION, DISTRO, VERSION (read)
#            PKG_NAME, DOWNLOAD_URL, MAJOR_VER (written)
#######################################
validate_and_resolve() {
  # VERSION ends up in a file name and a URL, so restrict it to characters
  # that can appear in a Debian package version/file name. This also rules
  # out "/" (path traversal in the download target) and shell metacharacters.
  local -r version_re='^[A-Za-z0-9][A-Za-z0-9._+~-]*$'
  local repo_dir

  if [[ -z "$ACTION" ]]; then
    log_error "必须指定 --install 或 --uninstall"
  fi
  if [[ -z "$DISTRO" ]]; then
    log_error "必须指定 --distro(如 ubuntu22.04)"
  fi
  if [[ -z "$VERSION" ]]; then
    log_error "必须指定 --version(如 570_570.124.06-1)"
  fi

  if [[ ! "$DISTRO" =~ ^ubuntu(22\.04|24\.04)$ ]]; then
    log_error "仅支持 ubuntu22.04 和 ubuntu24.04"
  fi
  if [[ ! "$VERSION" =~ $version_re ]]; then
    log_error "--version 含有非法字符: ${VERSION}(仅允许字母、数字及 . _ + ~ -)"
  fi

  repo_dir="${DISTRO//./}" # ubuntu22.04 -> ubuntu2204
  PKG_NAME="nvidia-fabricmanager-${VERSION}_amd64.deb"
  DOWNLOAD_URL="https://developer.download.nvidia.cn/compute/cuda/repos/${repo_dir}/x86_64/${PKG_NAME}"
  MAJOR_VER="${VERSION%%_*}" # text before the first "_", e.g. 570
}

#######################################
# Run a command with all of its output discarded and report the outcome.
# The command is executed directly from its argument vector (no eval), so
# values containing spaces or shell metacharacters are passed through
# verbatim instead of being re-parsed by the shell.
# Arguments:
#   $1   - message logged on success
#   $2   - message logged on failure
#   $3   - troubleshooting command shown on failure (may be empty)
#   $4.. - command to run, followed by its arguments
# Returns: 0 on success; exits the script with status 1 on failure.
#######################################
run_cmd() {
  local success_msg="$1"
  local error_msg="$2"
  local debug_cmd="$3"
  shift 3

  if "$@" &>/dev/null; then
    log_success "$success_msg"
  else
    fail_with_hint "$error_msg" "$debug_cmd"
  fi
}

#######################################
# Verify that a systemd unit is active.
# Arguments:
#   $1 - unit name
#   $2 - message logged when the unit is active
#   $3 - message logged when it is not
# Returns: 0 when active; exits the script with status 1 otherwise.
#######################################
check_service_status() {
  local service_name="$1"
  local success_msg="$2"
  local error_msg="$3"

  log_info "检查 $service_name 服务状态"
  if systemctl is-active --quiet "$service_name"; then
    log_success "$success_msg"
  else
    fail_with_hint "$error_msg" \
      "systemctl status $service_name && journalctl -xeu $service_name"
  fi
}

# Download the package, install it, enable/start the service and verify it.
do_install() {
  log_info "开始安装 NVIDIA Fabric Manager ${VERSION} (${DISTRO})"

  # Download
  log_info "开始下载安装包"
  run_cmd "下载安装包完成" \
    "下载安装包失败" \
    "wget --spider $DOWNLOAD_URL" \
    wget -q -O "$PKG_NAME" "$DOWNLOAD_URL"

  # Install
  log_info "开始安装服务"
  run_cmd "服务已安装完成" \
    "服务安装失败请检查!" \
    "dpkg -i $PKG_NAME 2>&1 | tail -n 10" \
    dpkg -i "$PKG_NAME"

  # Enable at boot and start now
  run_cmd "服务已设置为开机启动并尝试启动" \
    "服务启动命令执行失败" \
    "systemctl enable --now nvidia-fabricmanager" \
    systemctl enable --now nvidia-fabricmanager

  # Confirm the service is actually running
  log_info "检查服务启动状态"
  check_service_status "nvidia-fabricmanager" \
    "NVIDIA Fabric Manager 服务运行正常" \
    "NVIDIA Fabric Manager 服务启动失败"

  log_success "NVIDIA Fabric Manager ${VERSION} 安装成功"
}

# Stop the service, remove the package if installed, delete the local .deb.
do_uninstall() {
  local -r pkg="nvidia-fabricmanager-${MAJOR_VER}"

  log_info "开始卸载 NVIDIA Fabric Manager ${VERSION}"

  # Best effort: the service may not exist or may already be stopped.
  systemctl stop nvidia-fabricmanager &>/dev/null || true

  log_info "开始卸载服务"
  if dpkg -s "$pkg" &>/dev/null; then
    run_cmd "服务卸载完成" \
      "服务卸载失败" \
      "dpkg -r $pkg 2>&1 | tail -n 10" \
      dpkg -r "$pkg"
  else
    log_success "包未安装,跳过卸载"
  fi

  # Best-effort cleanup of the downloaded package; a failure here must not
  # silently abort the script under `set -e` after a successful uninstall.
  rm -f -- "$PKG_NAME" &>/dev/null || true

  log_success "NVIDIA Fabric Manager ${VERSION} 卸载成功"
}

main() {
  parse_args "$@"
  validate_and_resolve

  case "$ACTION" in
    install) do_install ;;
    uninstall) do_uninstall ;;
  esac
}

main "$@"