ansible-devops/scripts/nvidia-fabricmanager.sh

140 lines
4.3 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
set -euo pipefail
# Color definitions (ANSI escape sequences, expanded when printed).
GREEN='\033[1;32m'
RED='\033[1;31m'
BLUE='\033[1;34m'
NC='\033[0m'

# Logging helpers. Each one prints a colored tag followed by all of its
# arguments (joined via "$*"), with backslash escapes expanded.
log_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $*"
}

log_success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $*"
}

# Prints the message and then terminates the script with exit status 1.
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $*"
  exit 1
}
# Initialize variables. All of them are globals filled in by the argument
# parser and by the validation step that follows it.
ACTION=""
DISTRO=""
VERSION=""
PKG_NAME=""
DOWNLOAD_URL=""
MAJOR_VER=""

#######################################
# Parse the command-line arguments.
# Globals:   ACTION, DISTRO, VERSION (written)
# Arguments: the script's arguments:
#              --install | --uninstall   select the action
#              --distro <name>           e.g. ubuntu22.04
#              --version <ver>           e.g. 570_570.124.06-1
# Returns:   0 on success; calls log_error (which exits 1) on an unknown
#            option or when an option that needs a value is given without one.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        ACTION="install"
        shift
        ;;
      --uninstall)
        ACTION="uninstall"
        shift
        ;;
      --distro)
        # Without this guard a trailing "--distro" reads an unset "$2" and
        # the script dies with a bare "unbound variable" under `set -u`.
        [[ $# -ge 2 ]] || log_error "参数 $1 缺少取值"
        DISTRO="$2"
        shift 2
        ;;
      --version)
        [[ $# -ge 2 ]] || log_error "参数 $1 缺少取值"
        VERSION="$2"
        shift 2
        ;;
      *)
        log_error "未知参数: $1,仅支持 --install --uninstall --distro --version"
        ;;
    esac
  done
}

parse_args "$@"
# Mandatory-argument checks. log_error exits the script, so each line is a
# guard: execution only continues when the value is present.
[[ -n "$ACTION" ]] || log_error "必须指定 --install 或 --uninstall"
[[ -n "$DISTRO" ]] || log_error "必须指定 --distro如 ubuntu22.04"
[[ -n "$VERSION" ]] || log_error "必须指定 --version如 570_570.124.06-1"

# VERSION is spliced into the package file name, the download URL and command
# strings that are later evaluated by the shell, so only letters, digits and
# the characters . _ + ~ - are accepted; anything else (spaces, quotes, ';',
# '$', ...) is rejected before it can be interpreted as shell syntax.
# The pattern lives in a variable so the shell does not re-interpret it.
version_chars_re='^[A-Za-z0-9._+~-]+$'
[[ "$VERSION" =~ $version_chars_re ]] || log_error "--version 含有非法字符: $VERSION"

# Validate the distro and derive the package name / download URL from it.
[[ "$DISTRO" =~ ^ubuntu(22\.04|24\.04)$ ]] || log_error "仅支持 ubuntu22.04 和 ubuntu24.04"

local_ver="${DISTRO//./}" # ubuntu22.04 -> ubuntu2204
PKG_NAME="nvidia-fabricmanager-${VERSION}_amd64.deb"
DOWNLOAD_URL="https://developer.download.nvidia.cn/compute/cuda/repos/${local_ver}/x86_64/${PKG_NAME}"
MAJOR_VER="${VERSION%%_*}" # everything before the first '_', e.g. 570
#######################################
# Run a command string with its output discarded and report the outcome.
# Globals:   RED, NC (read)
# Arguments:
#   $1 - command line; executed through `eval`
#   $2 - message passed to log_success when the command succeeds
#   $3 - message printed with an [ERROR] tag when the command fails
#   $4 - optional troubleshooting command shown to the user on failure
# Outputs:   status messages on stdout
# Returns:   0 on success; exits the script with status 1 on failure
#######################################
run_cmd() {
  local cmd="$1"
  local success_msg="$2"
  local error_msg="$3"
  # Optional: default to empty so a 3-argument call survives `set -u`.
  local debug_cmd="${4:-}"

  # log_info "$cmd"   # uncomment to trace the command being run
  # NOTE(review): $cmd is evaluated by the shell; callers must only pass
  # trusted, properly quoted text.
  if eval "$cmd &>/dev/null"; then
    log_success "$success_msg"
    return 0
  fi

  # Print the error inline rather than through log_error: log_error exits
  # immediately, which would make the troubleshooting hint below unreachable.
  echo -e "${RED}[ERROR]${NC} $error_msg"
  if [[ -n "$debug_cmd" ]]; then
    echo -e "${RED}排查命令:${NC} $debug_cmd"
  fi
  exit 1
}
#######################################
# Check whether a systemd service is active and report the result.
# Globals:   RED, NC (read)
# Arguments:
#   $1 - service name handed to `systemctl is-active`
#   $2 - message passed to log_success when the service is active
#   $3 - message printed with an [ERROR] tag when it is not
# Outputs:   status messages on stdout
# Returns:   0 when active; exits the script with status 1 otherwise
#######################################
check_service_status() {
  local service_name="$1"
  local success_msg="$2"
  local error_msg="$3"

  log_info "检查 $service_name 服务状态"
  if systemctl is-active --quiet "$service_name"; then
    log_success "$success_msg"
    return 0
  fi

  # Print the error inline rather than through log_error: log_error exits
  # immediately, so the troubleshooting hint below would never be shown.
  echo -e "${RED}[ERROR]${NC} $error_msg"
  echo -e "${RED}排查命令:${NC} systemctl status $service_name && journalctl -xeu $service_name"
  exit 1
}
# Main logic.
#
# run_cmd executes its first argument through `eval`, so every value that is
# interpolated into a command string is shell-quoted first (printf %q). For
# ordinary package names and URLs the quoted form is identical to the raw one.
printf -v pkg_file_q '%q' "$PKG_NAME"
printf -v pkg_url_q '%q' "$DOWNLOAD_URL"
printf -v pkg_id_q '%q' "nvidia-fabricmanager-${MAJOR_VER}"

case "$ACTION" in
  install)
    log_info "开始安装 NVIDIA Fabric Manager ${VERSION} (${DISTRO})"

    # Download the .deb into the current working directory.
    log_info "开始下载安装包"
    run_cmd "wget -q -O ${pkg_file_q} ${pkg_url_q}" \
      "下载安装包完成" \
      "下载安装包失败" \
      "wget --spider ${pkg_url_q}"

    # Install the package.
    log_info "开始安装服务"
    run_cmd "dpkg -i ${pkg_file_q}" \
      "服务已安装完成" \
      "服务安装失败请检查!" \
      "dpkg -i ${pkg_file_q} 2>&1 | tail -n 10"

    # Enable the service at boot and start it now.
    run_cmd "systemctl enable --now nvidia-fabricmanager" \
      "服务已设置为开机启动并尝试启动" \
      "服务启动命令执行失败" \
      "systemctl enable --now nvidia-fabricmanager"

    # Verify that the service actually came up.
    log_info "检查服务启动状态"
    check_service_status "nvidia-fabricmanager" \
      "NVIDIA Fabric Manager 服务运行正常" \
      "NVIDIA Fabric Manager 服务启动失败"

    log_success "NVIDIA Fabric Manager ${VERSION} 安装成功"
    ;;
  uninstall)
    log_info "开始卸载 NVIDIA Fabric Manager ${VERSION}"

    # Stop the service; errors are deliberately ignored (it may not exist).
    systemctl stop nvidia-fabricmanager &>/dev/null || true

    # Remove the package only when dpkg reports it as installed.
    log_info "开始卸载服务"
    if dpkg -s "nvidia-fabricmanager-${MAJOR_VER}" &>/dev/null; then
      run_cmd "dpkg -r ${pkg_id_q}" \
        "服务卸载完成" \
        "服务卸载失败" \
        "dpkg -r ${pkg_id_q} 2>&1 | tail -n 10"
    else
      log_success "包未安装,跳过卸载"
    fi

    # Delete the downloaded package file from the current directory, if any.
    rm -f -- "$PKG_NAME" &>/dev/null
    log_success "NVIDIA Fabric Manager ${VERSION} 卸载成功"
    ;;
esac