ansible-devops/scripts/nvidia-fabricmanager.sh

190 lines
5.2 KiB
Bash

#!/bin/bash
# Strict mode: stop on a failing command, on an unset variable, and on a
# failing stage inside a pipeline.
set -o errexit -o nounset -o pipefail

# Colour escape sequences. They are stored as literal backslash sequences and
# only turned into real escapes when printed.
NC='\033[0m' # reset colour
RED='\033[1;31m'
YELLOW='\033[1;33m'
GREEN='\033[1;32m'

# Download locations, tried in this order: internal mirror, then official repo.
INTERNAL_BASE_URL=http://10.101.0.51:5588/nvidia-fabricmanager
OFFICIAL_BASE_URL=https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64

# Default package version; the derived name and path follow from it.
# These are intentionally not readonly: --version reassigns them.
FABRICMANAGER_MAJOR_VERSION=565
FABRICMANAGER_FULL_VERSION=565.57.01-1
PACKAGE_NAME=nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}_${FABRICMANAGER_FULL_VERSION}_amd64.deb
PACKAGE_PATH=/opt/${PACKAGE_NAME}

# Scratch directory and per-run log file (timestamped at script start).
TEMP_DIR=/tmp/fabricmanager_temp
LOG_FILE=/var/log/fabricmanager_$(date +%Y%m%d%H%M%S).log
# Append one timestamped line to the log file.
# Globals:   LOG_FILE (read) - file the line is appended to
# Arguments: message words; all of them are joined via "$*"
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%s\n' "[${stamp}] $*" >> "$LOG_FILE"
}
# Announce a progress step: green on the terminal, plain text in the log.
# Globals:   GREEN, NC (read)
# Arguments: $1 - step description
step() {
  local line="==> $1"
  # %b expands backslash escapes the same way `echo -e` does.
  printf '%b\n' "${GREEN}${line}${NC}"
  log "$line"
}
# Emit a non-fatal warning: yellow on the terminal, plain text in the log.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - warning text
warning() {
  local line="警告: $1"
  # %b expands backslash escapes the same way `echo -e` does.
  printf '%b\n' "${YELLOW}${line}${NC}"
  log "$line"
}
# Report a fatal error (red on the terminal, plain text in the log) and
# terminate the script with exit status 1.
# Globals:   RED, NC (read)
# Arguments: $1 - error text
error() {
  local line="错误: $1"
  # %b expands backslash escapes the same way `echo -e` does.
  printf '%b\n' "${RED}${line}${NC}"
  log "$line"
  exit 1
}
# Run a shell command given as a single string, sending its stdout and stderr
# to the log file. A non-zero status is reported through error().
# Globals:   LOG_FILE (read)
# Arguments: $1 - command line, evaluated with eval
# NOTE(review): eval executes $1 verbatim, so callers must only pass trusted,
# already-validated strings.
run_cmd() {
  step "执行: $1"
  if eval "$1" >> "$LOG_FILE" 2>&1; then
    return 0
  fi
  error "命令执行失败: $1"
}
# Verify that a command can be resolved by the shell; otherwise report it
# through error().
# Arguments: $1 - command name to look up
check_cmd() {
  if ! command -v "$1" > /dev/null 2>&1; then
    error "未找到命令: $1"
  fi
}
# Command-line parsing.
#   --install            select the install action
#   --uninstall          select the uninstall action
#   --version MAJOR_FULL override the package version, e.g. 565_565.57.01-1
#   --force              set FORCE=1 (read by the uninstall path)
# Exactly one of --install / --uninstall is required; any unknown argument is
# a fatal error.
ACTION=""
FORCE=0
while [[ $# -gt 0 ]]; do
  case "$1" in
    --install)
      ACTION="install"
      shift
      ;;
    --uninstall)
      ACTION="uninstall"
      shift
      ;;
    --version)
      # With `set -u` active, reading a missing "$2" would kill the script
      # with a bare "unbound variable" message, so check the arity first.
      if [[ $# -lt 2 ]]; then
        error "--version 缺少版本参数,应为 xxxx_xxxx.xx.xx-x"
      fi
      CUSTOM_VERSION="$2"
      # Group 1 is the major version, group 2 the full version; the
      # underscore separating them is kept outside both groups.
      if [[ $CUSTOM_VERSION =~ ^([0-9]+)_([0-9]+\.[0-9]+\.[0-9]+-[0-9]+)$ ]]; then
        FABRICMANAGER_MAJOR_VERSION="${BASH_REMATCH[1]}"
        FABRICMANAGER_FULL_VERSION="${BASH_REMATCH[2]}"
        # Re-derive the package file name and path for the chosen version.
        PACKAGE_NAME="nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}_${FABRICMANAGER_FULL_VERSION}_amd64.deb"
        PACKAGE_PATH="/opt/${PACKAGE_NAME}"
      else
        error "版本格式错误,应为 xxxx_xxxx.xx.xx-x"
      fi
      shift 2
      ;;
    --force)
      FORCE=1
      shift
      ;;
    *)
      error "未知参数: $1"
      ;;
  esac
done
if [[ -z "$ACTION" ]]; then
  error "必须指定 --install 或 --uninstall"
fi
# Make sure the package file exists at PACKAGE_PATH, downloading it if needed.
# Globals:   PACKAGE_PATH, PACKAGE_NAME, INTERNAL_BASE_URL, OFFICIAL_BASE_URL (read)
# Returns:   0 when a non-empty package file is in place; otherwise error()
#            is called once every source has failed.
#
# Downloads go to "<PACKAGE_PATH>.part" and are moved into place only after
# wget succeeds. `wget -O` creates its output file even when the transfer
# fails, so writing straight to PACKAGE_PATH would leave an empty or partial
# file that a later run would mistake for a valid local package.
download_package() {
  step "检查安装包: $PACKAGE_PATH"
  # A zero-byte file is treated as missing so it gets downloaded again.
  if [[ -f "$PACKAGE_PATH" && -s "$PACKAGE_PATH" ]]; then
    step "使用本地安装包"
    return 0
  fi
  step "本地包不存在,开始下载"
  mkdir -p "$(dirname "$PACKAGE_PATH")"

  local partial="${PACKAGE_PATH}.part"
  local url
  # Sources in priority order: internal mirror first, official repo second.
  local urls=(
    "${INTERNAL_BASE_URL}/${PACKAGE_NAME}"
    "${OFFICIAL_BASE_URL}/${PACKAGE_NAME}"
  )
  for url in "${urls[@]}"; do
    step "尝试从 $url 下载"
    if wget -q -O "$partial" "$url" && [[ -s "$partial" ]]; then
      mv -f -- "$partial" "$PACKAGE_PATH"
      step "下载成功,文件大小: $(du -h "$PACKAGE_PATH" | cut -f1)"
      return 0
    fi
    # Drop whatever the failed attempt left behind before trying the next URL.
    rm -f -- "$partial"
    warning "$url 下载失败"
  done
  error "无法下载安装包,请检查网络或手动放置到 $PACKAGE_PATH"
}
# Install NVIDIA Fabric Manager: obtain the package, install it with dpkg,
# enable and start the service, then confirm the service is active.
# Globals:   FABRICMANAGER_FULL_VERSION, PACKAGE_PATH (read)
install_fabricmanager() {
  local unit="nvidia-fabricmanager.service"
  local tool

  step "开始安装 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION}"

  # Fetch the package (or reuse the local copy).
  download_package

  # Pre-install check of the tools used below.
  step "检查系统依赖"
  for tool in dpkg systemctl; do
    check_cmd "$tool"
  done

  # Install the package.
  step "安装 NVIDIA Fabric Manager"
  run_cmd "dpkg -i $PACKAGE_PATH"

  # Enable the service and start it immediately.
  step "启动并启用 NVIDIA Fabric Manager 服务"
  run_cmd "systemctl enable ${unit} --now"

  # Verify the result: anything other than "active" is fatal.
  step "验证 NVIDIA Fabric Manager 服务状态"
  if ! systemctl is-active --quiet "$unit"; then
    error "NVIDIA Fabric Manager 服务未运行"
  fi
  step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} 安装成功并运行中"
}
# Uninstall NVIDIA Fabric Manager: stop and disable the service, remove the
# package with dpkg, then delete the downloaded package file.
# Globals:   FABRICMANAGER_MAJOR_VERSION, FABRICMANAGER_FULL_VERSION,
#            PACKAGE_PATH, FORCE (read)
# When the package is not installed the function stops via error() unless
# FORCE is non-zero, in which case only the cleanup is performed.
uninstall_fabricmanager() {
  local unit="nvidia-fabricmanager.service"
  local pkg="nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}"

  step "开始卸载 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION}"

  # Stop the service only if it is currently running.
  if systemctl is-active --quiet "$unit"; then
    step "停止 NVIDIA Fabric Manager 服务"
    run_cmd "systemctl stop ${unit}"
  else
    step "NVIDIA Fabric Manager 服务未运行"
  fi

  # Disable the service only when its unit file exists. `systemctl disable`
  # fails for an unknown unit, and run_cmd turns any failure into a fatal
  # error, which would abort here before the package check and make --force
  # unreachable in exactly the "package already gone" case.
  if systemctl cat "$unit" &>/dev/null; then
    step "禁用 NVIDIA Fabric Manager 服务"
    run_cmd "systemctl disable ${unit}"
  else
    warning "未找到 ${unit} 服务单元,跳过禁用"
  fi

  # Remove the package; a missing package is fatal unless --force was given.
  step "卸载 NVIDIA Fabric Manager 软件包"
  if dpkg -s "$pkg" &>/dev/null; then
    run_cmd "dpkg -r ${pkg}"
  else
    warning "未找到 ${pkg} 软件包"
    if [[ "$FORCE" -eq 0 ]]; then
      error "请使用 --force 参数强制卸载"
    fi
  fi

  # Clean up the downloaded package file.
  step "清理残留文件"
  rm -f -- "$PACKAGE_PATH"
  step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} 卸载完成"
}
# Entry point: announce the selected action, dispatch to its handler, and
# announce completion. Any other ACTION value is a fatal error.
step "开始 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} ${ACTION} 流程"
if [[ "$ACTION" == "install" ]]; then
  install_fabricmanager
elif [[ "$ACTION" == "uninstall" ]]; then
  uninstall_fabricmanager
else
  error "未知操作: $ACTION"
fi
step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} ${ACTION} 流程完成"