#!/bin/bash
#
# Install or uninstall the NVIDIA Fabric Manager .deb package on Ubuntu.
#
# Usage:
#   <script> --install   --distro <ubuntu22.04|ubuntu24.04> --version <version>
#   <script> --uninstall --distro <ubuntu22.04|ubuntu24.04> --version <version>
#
# Example version string: 570_570.124.06-1

# Abort on a failing command, on use of an unset variable, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# Color definitions. The values are single-quoted, i.e. literal backslash
# sequences, so whatever prints them has to interpret the escapes
# (echo -e or printf %b). NC is the "reset attributes" sequence.
readonly GREEN='\033[1;32m'
readonly RED='\033[1;31m'
readonly BLUE='\033[1;34m'
readonly NC='\033[0m'

# Logging helpers.
# Each one prints a colored tag followed by all of its arguments joined into
# a single string. Only the color variables are escape-interpreted (%b); the
# message itself is printed literally (%s).

# Informational message on stdout.
log_info() {
  printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "$*"
}

# Success message on stdout.
log_success() {
  printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "$*"
}

# Error message on stderr, then terminate the script with status 1.
log_error() {
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$*" >&2
  exit 1
}

# Initialize variables; parse_args fills them in.
ACTION=""
DISTRO=""
VERSION=""
PKG_NAME=""
DOWNLOAD_URL=""
MAJOR_VER=""

#######################################
# Parse the command-line options, validate them, and derive the package
# file name, the download URL and the major version from them.
# Globals:
#   ACTION, DISTRO, VERSION, PKG_NAME, DOWNLOAD_URL, MAJOR_VER (written)
# Arguments:
#   The script's command-line arguments.
# Returns:
#   0 when every option is valid. Problems are reported through log_error,
#   which terminates the script.
#######################################
parse_args() {
  # --version ends up in a file name, a URL and in command strings, so only
  # characters that are safe in all of those are accepted (no whitespace, no
  # shell metacharacters, no leading dash).
  local -r version_re='^[A-Za-z0-9][A-Za-z0-9_.+~-]*$'
  local repo_dir

  # Parse options
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        ACTION="install"
        shift
        ;;
      --uninstall)
        ACTION="uninstall"
        shift
        ;;
      --distro | --version)
        # Without this guard a trailing option with no value would abort
        # with an "unbound variable" message under set -u.
        if [[ $# -lt 2 ]]; then
          log_error "参数 $1 缺少取值"
        fi
        if [[ "$1" == "--distro" ]]; then
          DISTRO="$2"
        else
          VERSION="$2"
        fi
        shift 2
        ;;
      *)
        log_error "未知参数: $1,仅支持 --install --uninstall --distro --version"
        ;;
    esac
  done

  # Mandatory options
  if [[ -z "$ACTION" ]]; then
    log_error "必须指定 --install 或 --uninstall"
  fi
  if [[ -z "$DISTRO" ]]; then
    log_error "必须指定 --distro(如 ubuntu22.04)"
  fi
  if [[ -z "$VERSION" ]]; then
    log_error "必须指定 --version(如 570_570.124.06-1)"
  fi

  # Only these two distributions are accepted
  if [[ ! "$DISTRO" =~ ^ubuntu(22\.04|24\.04)$ ]]; then
    log_error "仅支持 ubuntu22.04 和 ubuntu24.04"
  fi

  if [[ ! "$VERSION" =~ $version_re ]]; then
    log_error "--version 含有非法字符: $VERSION"
  fi

  # Build the package name and URL
  repo_dir="${DISTRO//./}" # ubuntu22.04 -> ubuntu2204
  PKG_NAME="nvidia-fabricmanager-${VERSION}_amd64.deb"
  DOWNLOAD_URL="https://developer.download.nvidia.cn/compute/cuda/repos/${repo_dir}/x86_64/${PKG_NAME}"
  # Everything before the first underscore (e.g. 570); the whole string
  # when VERSION contains no underscore.
  MAJOR_VER="${VERSION%%_*}"
}

parse_args "$@"

#######################################
# Run a command with all of its output hidden and report the outcome.
# Globals:
#   RED, NC (read)
# Arguments:
#   $1 - command line, evaluated as shell text (eval). Callers must build
#        it from trusted, properly quoted values only.
#   $2 - message passed to log_success when the command succeeds
#   $3 - error message printed when the command fails
#   $4 - optional troubleshooting command shown to the user on failure
# Outputs:
#   Success message via log_success; error and hint on stderr.
# Returns:
#   0 on success. On failure the script is terminated with status 1.
#######################################
run_cmd() {
  local cmd="$1"
  local success_msg="$2"
  local error_msg="$3"
  local debug_cmd="${4:-}"

  # Redirect the eval itself so every command in the string is silenced,
  # not just the last one.
  if eval "$cmd" &>/dev/null; then
    log_success "$success_msg"
  else
    # Printed here instead of through log_error: log_error exits
    # immediately, which would skip the troubleshooting hint below.
    printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$error_msg" >&2
    if [[ -n "$debug_cmd" ]]; then
      printf '%b排查命令:%b %s\n' "${RED}" "${NC}" "$debug_cmd" >&2
    fi
    exit 1
  fi
}

#######################################
# Check whether a systemd service is active and report the result.
# Globals:
#   RED, NC (read)
# Arguments:
#   $1 - service name, passed to `systemctl is-active --quiet`
#   $2 - message passed to log_success when the service is active
#   $3 - error message printed when it is not
# Outputs:
#   Progress/success via log_info and log_success; error and
#   troubleshooting hint on stderr.
# Returns:
#   0 when the service is active. Otherwise the script is terminated with
#   status 1.
#######################################
check_service_status() {
  local service_name="$1"
  local success_msg="$2"
  local error_msg="$3"

  log_info "检查 $service_name 服务状态"
  if systemctl is-active --quiet "$service_name"; then
    log_success "$success_msg"
  else
    # Printed here instead of through log_error: log_error exits
    # immediately, which would skip the troubleshooting hint below.
    printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$error_msg" >&2
    printf '%b排查命令:%b %s\n' "${RED}" "${NC}" \
      "systemctl status $service_name && journalctl -xeu $service_name" >&2
    exit 1
  fi
}

#######################################
# Main logic: perform the action selected on the command line.
# Globals:
#   ACTION, DISTRO, VERSION, PKG_NAME, DOWNLOAD_URL, MAJOR_VER (read)
# Arguments:
#   None.
# Outputs:
#   Progress messages via log_info / log_success; failures are reported by
#   run_cmd and check_service_status.
#######################################
main() {
  # run_cmd evaluates its first argument as shell text, so every value that
  # is interpolated into a command string is shell-quoted first. This keeps
  # each value a single word whatever characters it contains.
  local pkg_q url_q deb_name_q
  printf -v pkg_q '%q' "$PKG_NAME"
  printf -v url_q '%q' "$DOWNLOAD_URL"
  printf -v deb_name_q '%q' "nvidia-fabricmanager-${MAJOR_VER}"

  case "$ACTION" in
    install)
      log_info "开始安装 NVIDIA Fabric Manager ${VERSION} (${DISTRO})"

      # Download the package into the current directory
      log_info "开始下载安装包"
      run_cmd "wget -q -O ${pkg_q} ${url_q}" \
        "下载安装包完成" \
        "下载安装包失败" \
        "wget --spider ${url_q}"

      # Install the package
      log_info "开始安装服务"
      run_cmd "dpkg -i ${pkg_q}" \
        "服务已安装完成" \
        "服务安装失败请检查!" \
        "dpkg -i ${pkg_q} 2>&1 | tail -n 10"

      # Enable the service at boot and start it now
      run_cmd "systemctl enable --now nvidia-fabricmanager" \
        "服务已设置为开机启动并尝试启动" \
        "服务启动命令执行失败" \
        "systemctl enable --now nvidia-fabricmanager"

      # Verify the service is actually running
      log_info "检查服务启动状态"
      check_service_status "nvidia-fabricmanager" \
        "NVIDIA Fabric Manager 服务运行正常" \
        "NVIDIA Fabric Manager 服务启动失败"

      log_success "NVIDIA Fabric Manager ${VERSION} 安装成功"
      ;;

    uninstall)
      log_info "开始卸载 NVIDIA Fabric Manager ${VERSION}"

      # Stop the service; errors are deliberately ignored
      systemctl stop nvidia-fabricmanager &>/dev/null || true

      # Remove the package only when dpkg reports it as installed
      log_info "开始卸载服务"
      if dpkg -s "nvidia-fabricmanager-${MAJOR_VER}" &>/dev/null; then
        run_cmd "dpkg -r ${deb_name_q}" \
          "服务卸载完成" \
          "服务卸载失败" \
          "dpkg -r ${deb_name_q} 2>&1 | tail -n 10"
      else
        log_success "包未安装,跳过卸载"
      fi

      # Delete the downloaded package file from the current directory
      rm -f -- "$PKG_NAME" &>/dev/null

      log_success "NVIDIA Fabric Manager ${VERSION} 卸载成功"
      ;;
  esac
}

main