#!/bin/bash
#
# Install or uninstall the NVIDIA Fabric Manager .deb package on Ubuntu.
#
# Usage:
#   nvidia-fabricmanager.sh --install   --distro ubuntu22.04 --version 570_570.124.06-1
#   nvidia-fabricmanager.sh --uninstall --distro ubuntu22.04 --version 570_570.124.06-1
#
# Options:
#   --install | --uninstall   action to perform (required)
#   --distro  <name>          ubuntu22.04 or ubuntu24.04 (required)
#   --version <ver>           "<major>_<full version>", e.g. 570_570.124.06-1 (required)
#
# The package is downloaded into the current working directory; --uninstall
# removes a package file of the same name from the current working directory.
# Exits 0 on success and 1 on any error.

set -euo pipefail

# ANSI colour sequences. $'...' stores the real escape byte, so messages can
# be printed with printf '%s' and are never re-interpreted for backslashes.
readonly GREEN=$'\033[1;32m'
readonly RED=$'\033[1;31m'
readonly BLUE=$'\033[1;34m'
readonly NC=$'\033[0m'

readonly SERVICE_NAME="nvidia-fabricmanager"
readonly REPO_BASE_URL="https://developer.download.nvidia.cn/compute/cuda/repos"

# Populated by parse_args / resolve_package.
ACTION=""
DISTRO=""
VERSION=""
PKG_NAME=""
DOWNLOAD_URL=""
MAJOR_VER=""

# Logging helpers. Informational output goes to stdout, errors to stderr.
log_info()    { printf '%s[INFO]%s %s\n' "$BLUE" "$NC" "$*"; }
log_success() { printf '%s[SUCCESS]%s %s\n' "$GREEN" "$NC" "$*"; }
log_error()   { printf '%s[ERROR]%s %s\n' "$RED" "$NC" "$*" >&2; }

#######################################
# Report a fatal error and terminate the script with status 1.
# Arguments:
#   $1 - error message
#   $2 - optional troubleshooting command shown to the operator
#######################################
die() {
  local message="$1"
  local hint="${2:-}"

  log_error "$message"
  # The hint has to be printed before exiting, otherwise it is never shown.
  if [[ -n "$hint" ]]; then
    printf '%s排查命令:%s %s\n' "$RED" "$NC" "$hint" >&2
  fi
  exit 1
}

#######################################
# Abort unless every named tool is available on PATH.
# Arguments: tool names
#######################################
require_cmd() {
  local tool
  for tool in "$@"; do
    command -v "$tool" &>/dev/null || die "未找到命令: $tool"
  done
}

#######################################
# Parse command-line options into ACTION, DISTRO and VERSION and make sure
# all three were supplied.
# Arguments: the script's positional parameters
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        ACTION="install"
        shift
        ;;
      --uninstall)
        ACTION="uninstall"
        shift
        ;;
      --distro | --version)
        # Without this guard a trailing option trips `set -u` on "$2" and
        # the user only sees an "unbound variable" message.
        [[ $# -ge 2 ]] || die "参数 $1 缺少取值"
        if [[ "$1" == "--distro" ]]; then
          DISTRO="$2"
        else
          VERSION="$2"
        fi
        shift 2
        ;;
      *)
        die "未知参数: $1,仅支持 --install --uninstall --distro --version"
        ;;
    esac
  done

  if [[ -z "$ACTION" ]]; then
    die "必须指定 --install 或 --uninstall"
  fi
  if [[ -z "$DISTRO" ]]; then
    die "必须指定 --distro(如 ubuntu22.04)"
  fi
  if [[ -z "$VERSION" ]]; then
    die "必须指定 --version(如 570_570.124.06-1)"
  fi
}

#######################################
# Validate DISTRO and VERSION, then derive PKG_NAME, DOWNLOAD_URL and
# MAJOR_VER from them.
#######################################
resolve_package() {
  if [[ ! "$DISTRO" =~ ^ubuntu(22\.04|24\.04)$ ]]; then
    die "仅支持 ubuntu22.04 和 ubuntu24.04"
  fi

  # VERSION ends up in a file name, a URL and a package name. Restrict it to
  # the characters Debian allows in package names and versions so it cannot
  # carry path separators, whitespace or shell metacharacters.
  if [[ ! "$VERSION" =~ ^[0-9A-Za-z._+~-]+$ ]]; then
    die "--version 包含非法字符: ${VERSION}(如 570_570.124.06-1)"
  fi

  local repo_dir="${DISTRO//./}" # ubuntu22.04 -> ubuntu2204
  PKG_NAME="nvidia-fabricmanager-${VERSION}_amd64.deb"
  DOWNLOAD_URL="${REPO_BASE_URL}/${repo_dir}/x86_64/${PKG_NAME}"
  MAJOR_VER="${VERSION%%_*}" # text before the first underscore, e.g. 570
}

#######################################
# Run a command with its output hidden and report the outcome.
# On failure the error message and the troubleshooting command are printed
# and the script exits with status 1.
# Arguments:
#   $1  - message logged on success
#   $2  - message logged on failure
#   $3  - troubleshooting command to show on failure (may be empty)
#   $4+ - the command and its arguments, executed as-is (no eval)
#######################################
run_cmd() {
  local success_msg="$1"
  local error_msg="$2"
  local debug_hint="$3"
  shift 3

  if "$@" &>/dev/null; then
    log_success "$success_msg"
  else
    die "$error_msg" "$debug_hint"
  fi
}

#######################################
# Verify that a systemd unit is active; exit with a hint otherwise.
# Arguments:
#   $1 - unit name
#   $2 - message logged when the unit is active
#   $3 - message logged when it is not
#######################################
check_service_status() {
  local service_name="$1"
  local success_msg="$2"
  local error_msg="$3"

  log_info "检查 $service_name 服务状态"
  if systemctl is-active --quiet "$service_name"; then
    log_success "$success_msg"
  else
    die "$error_msg" \
      "systemctl status $service_name && journalctl -xeu $service_name"
  fi
}

#######################################
# Download the package into the current directory.
# A failed `wget -O` leaves a truncated file behind, so it is removed before
# the script exits.
#######################################
download_package() {
  log_info "开始下载安装包"
  if wget -q -O "$PKG_NAME" "$DOWNLOAD_URL" &>/dev/null; then
    log_success "下载安装包完成"
  else
    rm -f -- "$PKG_NAME"
    die "下载安装包失败" "wget --spider $DOWNLOAD_URL"
  fi
}

#######################################
# Download and install the package, enable and start the service, then
# verify that it is running.
#######################################
install_fabricmanager() {
  log_info "开始安装 NVIDIA Fabric Manager ${VERSION} (${DISTRO})"
  require_cmd wget dpkg systemctl

  download_package

  log_info "开始安装服务"
  run_cmd "服务已安装完成" \
    "服务安装失败请检查!" \
    "dpkg -i $PKG_NAME 2>&1 | tail -n 10" \
    dpkg -i "$PKG_NAME"

  run_cmd "服务已设置为开机启动并尝试启动" \
    "服务启动命令执行失败" \
    "systemctl enable --now $SERVICE_NAME" \
    systemctl enable --now "$SERVICE_NAME"

  log_info "检查服务启动状态"
  check_service_status "$SERVICE_NAME" \
    "NVIDIA Fabric Manager 服务运行正常" \
    "NVIDIA Fabric Manager 服务启动失败"

  log_success "NVIDIA Fabric Manager ${VERSION} 安装成功"
}

#######################################
# Stop the service, remove the package if it is installed and delete the
# downloaded package file from the current directory.
#######################################
uninstall_fabricmanager() {
  local pkg="nvidia-fabricmanager-${MAJOR_VER}"

  log_info "开始卸载 NVIDIA Fabric Manager ${VERSION}"
  require_cmd dpkg

  # Best effort: the unit may not exist or may already be stopped.
  systemctl stop "$SERVICE_NAME" &>/dev/null || true

  log_info "开始卸载服务"
  if dpkg -s "$pkg" &>/dev/null; then
    run_cmd "服务卸载完成" \
      "服务卸载失败" \
      "dpkg -r $pkg 2>&1 | tail -n 10" \
      dpkg -r "$pkg"
  else
    log_success "包未安装,跳过卸载"
  fi

  # Remove the package file left behind by a previous --install run.
  rm -f -- "$PKG_NAME" &>/dev/null

  log_success "NVIDIA Fabric Manager ${VERSION} 卸载成功"
}

main() {
  parse_args "$@"
  resolve_package

  case "$ACTION" in
    install) install_fabricmanager ;;
    uninstall) uninstall_fabricmanager ;;
  esac
}

main "$@"