ansible-devops/scripts/nvidia-fabricmanager.sh

140 lines
4.3 KiB
Bash
Raw Normal View History

2025-07-05 15:49:53 +08:00
#!/bin/bash
set -euo pipefail

# ANSI colour escape sequences used by the log helpers. They are kept as
# literal backslash sequences and only rendered when printed with `echo -e`.
# Marked readonly so later code cannot overwrite them by accident.
readonly GREEN='\033[1;32m'
readonly RED='\033[1;31m'
readonly BLUE='\033[1;34m'
readonly NC='\033[0m' # reset attributes
2025-07-05 15:49:53 +08:00
# 日志函数
2025-07-20 00:07:40 +08:00
# Print an informational message on stdout, prefixed with a blue [INFO] tag.
# Arguments: $* - message text
log_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $*"
}
# Print a success message on stdout, prefixed with a green [SUCCESS] tag.
# Arguments: $* - message text
log_success() {
  local text="$*"
  printf '%b\n' "${GREEN}[SUCCESS]${NC} ${text}"
}
# Print an error message prefixed with a red [ERROR] tag and terminate the
# script. The message is written to stderr so that it is not mixed into
# captured stdout.
# Arguments: $* - message text
# Exits:     always, with status 1
log_error() {
  echo -e "${RED}[ERROR]${NC} $*" >&2
  exit 1
}
2025-07-05 15:49:53 +08:00
2025-07-20 00:07:40 +08:00
# Initialise every option and derived variable to the empty string so that
# `set -u` does not trip on them before parsing and validation fill them in.
ACTION=""       # "install" or "uninstall"
DISTRO=""       # value given to --distro
VERSION=""      # value given to --version
PKG_NAME=""     # .deb file name derived from VERSION
DOWNLOAD_URL="" # URL derived from DISTRO and PKG_NAME
MAJOR_VER=""    # part of VERSION before the first underscore
2025-07-05 15:49:53 +08:00
2025-07-20 00:07:40 +08:00
#######################################
# Parse the command-line options into the global option variables.
# Globals:   ACTION, DISTRO, VERSION (written)
# Arguments: the script's command-line arguments
# Exits:     via log_error on an unknown option or an option without a value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        ACTION="install"
        shift
        ;;
      --uninstall)
        ACTION="uninstall"
        shift
        ;;
      --distro | --version)
        # Without this guard a trailing "--distro"/"--version" dies with
        # bash's "unbound variable" under `set -u` instead of a usable message.
        [[ $# -ge 2 ]] || log_error "参数 $1 需要一个值"
        if [[ "$1" == "--distro" ]]; then
          DISTRO="$2"
        else
          VERSION="$2"
        fi
        shift 2
        ;;
      *)
        log_error "未知参数: $1,仅支持 --install --uninstall --distro --version"
        ;;
    esac
  done
}

parse_args "$@"
2025-07-20 00:07:40 +08:00
#######################################
# Validate the parsed options and derive the package file name, download URL
# and major version from them.
# Globals:   ACTION, DISTRO, VERSION (read)
#            PKG_NAME, DOWNLOAD_URL, MAJOR_VER (written)
# Exits:     via log_error when an option is missing or malformed
#######################################
resolve_package() {
  # All three options are mandatory.
  [[ -n "$ACTION" ]] || log_error "必须指定 --install 或 --uninstall"
  [[ -n "$DISTRO" ]] || log_error "必须指定 --distro如 ubuntu22.04"
  [[ -n "$VERSION" ]] || log_error "必须指定 --version如 570_570.124.06-1"

  [[ "$DISTRO" =~ ^ubuntu(22\.04|24\.04)$ ]] \
    || log_error "仅支持 ubuntu22.04 和 ubuntu24.04"

  # VERSION is spliced into a file name, a URL and command strings that
  # run_cmd hands to eval, so reject anything containing whitespace or shell
  # metacharacters before it gets that far.
  local version_re='^[0-9A-Za-z][0-9A-Za-z._+~-]*$'
  [[ "$VERSION" =~ $version_re ]] \
    || log_error "--version 含有非法字符: ${VERSION}"

  local repo_dir="${DISTRO//./}" # ubuntu22.04 -> ubuntu2204
  PKG_NAME="nvidia-fabricmanager-${VERSION}_amd64.deb"
  DOWNLOAD_URL="https://developer.download.nvidia.cn/compute/cuda/repos/${repo_dir}/x86_64/${PKG_NAME}"
  MAJOR_VER="${VERSION%%_*}" # text before the first underscore, e.g. 570
}

resolve_package
#######################################
# Run a command string with its output hidden and report the outcome.
# Arguments:
#   $1 - command string, executed with eval
#   $2 - message logged on success
#   $3 - message printed on failure
#   $4 - optional troubleshooting command shown to the user on failure
# Globals:   RED, NC (read)
# Outputs:   success message via log_success; failure details on stderr
# Returns:   0 on success; exits the script with status 1 on failure
#######################################
run_cmd() {
  local cmd="$1"
  local success_msg="$2"
  local error_msg="$3"
  local debug_cmd="${4:-}"

  # NOTE(review): eval executes $cmd as shell code, so callers must never
  # build it from unvalidated input. The redirection is applied outside the
  # eval string so that it covers the whole command, pipelines included.
  if eval "$cmd" &>/dev/null; then
    log_success "$success_msg"
    return 0
  fi

  # log_error exits immediately, so calling it here would make the
  # troubleshooting hint unreachable; print both lines, then exit.
  printf '%b %s\n' "${RED}[ERROR]${NC}" "$error_msg" >&2
  if [[ -n "$debug_cmd" ]]; then
    printf '%b %s\n' "${RED}排查命令:${NC}" "$debug_cmd" >&2
  fi
  exit 1
}
2025-07-20 00:07:40 +08:00
#######################################
# Verify that a systemd unit is active and report the result.
# Arguments:
#   $1 - unit name passed to `systemctl is-active`
#   $2 - message logged when the unit is active
#   $3 - message printed when it is not
# Globals:   RED, NC (read)
# Outputs:   progress/success via log_info and log_success; failure details
#            and a troubleshooting command on stderr
# Returns:   0 when active; otherwise exits the script with status 1
#######################################
check_service_status() {
  local service_name="$1"
  local success_msg="$2"
  local error_msg="$3"

  log_info "检查 $service_name 服务状态"
  if systemctl is-active --quiet "$service_name"; then
    log_success "$success_msg"
    return 0
  fi

  # log_error exits immediately, so calling it here would make the
  # troubleshooting hint unreachable; print both lines, then exit.
  printf '%b %s\n' "${RED}[ERROR]${NC}" "$error_msg" >&2
  printf '%b %s\n' "${RED}排查命令:${NC}" \
    "systemctl status $service_name && journalctl -xeu $service_name" >&2
  exit 1
}
2025-07-20 00:07:40 +08:00
#######################################
# Install or uninstall the Fabric Manager package according to ACTION.
# Globals:   ACTION, DISTRO, VERSION, PKG_NAME, DOWNLOAD_URL, MAJOR_VER (read)
# Outputs:   progress messages via log_info / log_success
# Exits:     through run_cmd or check_service_status when a step fails
#######################################
main() {
  # run_cmd evaluates its first argument as shell code, so every value that
  # is spliced into a command string is shell-quoted first.
  local pkg_q url_q deb_q
  printf -v pkg_q '%q' "$PKG_NAME"
  printf -v url_q '%q' "$DOWNLOAD_URL"
  printf -v deb_q '%q' "nvidia-fabricmanager-${MAJOR_VER}"

  case "$ACTION" in
    install)
      log_info "开始安装 NVIDIA Fabric Manager ${VERSION} (${DISTRO})"

      # Download the .deb into the current directory.
      log_info "开始下载安装包"
      run_cmd "wget -q -O ${pkg_q} ${url_q}" \
        "下载安装包完成" \
        "下载安装包失败" \
        "wget --spider ${url_q}"

      # Install the downloaded package.
      log_info "开始安装服务"
      run_cmd "dpkg -i ${pkg_q}" \
        "服务已安装完成" \
        "服务安装失败请检查!" \
        "dpkg -i ${pkg_q} 2>&1 | tail -n 10"

      # Enable the unit at boot and start it now.
      run_cmd "systemctl enable --now nvidia-fabricmanager" \
        "服务已设置为开机启动并尝试启动" \
        "服务启动命令执行失败" \
        "systemctl enable --now nvidia-fabricmanager"

      # `enable --now` succeeding does not prove the unit stayed up.
      log_info "检查服务启动状态"
      check_service_status "nvidia-fabricmanager" \
        "NVIDIA Fabric Manager 服务运行正常" \
        "NVIDIA Fabric Manager 服务启动失败"

      log_success "NVIDIA Fabric Manager ${VERSION} 安装成功"
      ;;
    uninstall)
      log_info "开始卸载 NVIDIA Fabric Manager ${VERSION}"

      # Best effort: a failure to stop the unit is deliberately ignored.
      systemctl stop nvidia-fabricmanager &>/dev/null || true

      # Remove the package only when dpkg reports it as installed.
      log_info "开始卸载服务"
      if dpkg -s "nvidia-fabricmanager-${MAJOR_VER}" &>/dev/null; then
        run_cmd "dpkg -r ${deb_q}" \
          "服务卸载完成" \
          "服务卸载失败" \
          "dpkg -r ${deb_q} 2>&1 | tail -n 10"
      else
        log_success "包未安装,跳过卸载"
      fi

      # Remove a previously downloaded .deb from the current directory.
      rm -f -- "$PKG_NAME" &>/dev/null
      log_success "NVIDIA Fabric Manager ${VERSION} 卸载成功"
      ;;
  esac
}

main