#!/bin/bash
#
# Installs or uninstalls NVIDIA DCGM (Data Center GPU Manager) on Ubuntu.
# Usage: pass exactly one argument, either --install or --uninstall.
# Provenance: forked from yindun/ansible-devops.

set -euo pipefail

# Global variables.
# Each run writes to its own log file, named with the start timestamp.
LOG_FILE="/var/log/nvidia-dcgm_$(date +%Y%m%d%H%M%S).log"

# Colour definitions. These are stored as literal backslash sequences and are
# only turned into real escape characters by the helpers that print them.
GREEN='\033[1;32m'
RED='\033[1;31m'
NC='\033[0m' # reset colour

#######################################
# Log helper: append one timestamped line to the log file.
# Globals:   LOG_FILE (read) - path of the file appended to
# Arguments: message words; "$*" joins them with the first character of IFS
# Outputs:   nothing on the console; one "[YYYY-MM-DD HH:MM:SS] msg" line
#            appended to LOG_FILE
#######################################
log() {
  # printf keeps the message literal even if it starts with '-' or contains
  # backslashes.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}

#######################################
# Step banner: print a progress message in green and record it in the log.
# Globals:   GREEN, NC (read) - colour escape sequences
# Arguments: $1 - message text
# Outputs:   "==> <message>" wrapped in colour codes on stdout; the same
#            text (without colour codes) is passed to log
#######################################
step() {
  local msg="==> ${1-}"
  # %b expands the escapes in the colour variables only; %s prints the
  # message verbatim so backslashes inside it (e.g. in a command line being
  # echoed) are not interpreted.
  printf '%b%s%b\n' "$GREEN" "$msg" "$NC"
  log "$msg"
}

#######################################
# Fatal error: print a message in red, record it in the log, and terminate.
# Globals:   RED, NC (read) - colour escape sequences
# Arguments: $1 - description of the failure
# Outputs:   "错误: <message>" wrapped in colour codes on stderr; the same
#            text (without colour codes) is passed to log
# Returns:   never returns; exits the shell with status 1
#######################################
error() {
  local msg="错误: ${1-}"
  # Diagnostics go to stderr; %s keeps the message text literal.
  printf '%b%s%b\n' "$RED" "$msg" "$NC" >&2
  # A logging failure must not pre-empt the intended exit status below.
  log "$msg" || true
  exit 1
}

#######################################
# Run a shell command line with its output hidden from the console.
# Globals:   LOG_FILE (read) - receives the command's stdout and stderr
# Arguments: $1 - command line as a single string; it is evaluated by the
#                 shell, so it may contain operators such as "|| true"
# Outputs:   a step banner naming the command; command output goes to the log
# Returns:   0 on success; on failure logs the exit status and aborts the
#            script through error
#######################################
run_cmd() {
  # Long local name: the evaluated command runs in this function's scope, so
  # a short name could shadow a variable the command itself refers to.
  local _run_cmd_status=0

  step "执行: $1"

  # NOTE(review): eval executes arbitrary shell text. Every caller visible in
  # this file passes a fixed string; never pass untrusted input here.
  eval "$1" &>> "$LOG_FILE" || _run_cmd_status=$?

  if (( _run_cmd_status != 0 )); then
    # Keep the real exit status in the log for troubleshooting.
    log "退出码: ${_run_cmd_status}"
    error "命令执行失败: $1"
  fi
}

#######################################
# Detect the distribution id and version from an os-release file.
# Globals:   OS_ID (written)      - value of ID, empty if the file has none
#            OS_VERSION (written) - value of VERSION_ID, empty if absent
# Arguments: $1 - optional path of the os-release file
#                 (default: /etc/os-release)
# Outputs:   a step banner naming the detected system
# Returns:   0 on success; aborts through error when the file is missing or
#            cannot be read
#######################################
detect_os_version() {
  local os_release_file="${1:-/etc/os-release}"

  if [[ ! -f "$os_release_file" ]]; then
    error "无法检测到操作系统版本"
  fi

  # The file is sourced inside command substitutions so that its other
  # assignments (NAME, VERSION, ...) do not leak into this script, and the
  # "${VAR-}" form keeps a missing key from aborting the script under set -u.
  # shellcheck source=/dev/null
  OS_ID=$(unset ID VERSION_ID; . "$os_release_file" && printf '%s' "${ID-}") \
    || error "无法检测到操作系统版本"
  # shellcheck source=/dev/null
  OS_VERSION=$(unset ID VERSION_ID; . "$os_release_file" && printf '%s' "${VERSION_ID-}") \
    || error "无法检测到操作系统版本"

  step "检测到系统: ${OS_ID} ${OS_VERSION}"
}

#######################################
# Remove any existing DCGM packages and the CUDA apt source list so that a
# following installation starts from a clean state.
# Arguments: none
# Outputs:   step banners; command output goes to the log via run_cmd
# Returns:   0; package-removal failures are deliberately ignored
#######################################
cleanup_dcgm_config() {
  step "清理现有的 DCGM 配置"

  run_cmd "rm -f /etc/apt/sources.list.d/cuda-ubuntu2004-x86_64.list"

  # The installer in this file installs "datacenter-gpu-manager-4-cuda<N>",
  # which the bare name "datacenter-gpu-manager" does not cover, so match the
  # whole package family. The pattern is single-quoted so that the shell
  # evaluating the command hands it to apt-get unexpanded.
  # "|| true": nothing installed is not an error for a cleanup step.
  run_cmd "apt-get remove -y 'datacenter-gpu-manager*' || true"
  run_cmd "apt-get autoremove -y || true"
  run_cmd "apt-get autoclean -y || true"
}

#######################################
# Download and install the cuda-keyring package (CUDA repository key).
# Arguments: none
# Outputs:   step banners; command output goes to the log via run_cmd
# Returns:   0 on success; exits with status 1 if the download or the
#            installation fails
#######################################
add_cuda_keyring() {
  step "添加 CUDA 仓库密钥"

  # NOTE(review): the URL targets the ubuntu2004 repository although the
  # callers in this file run on Ubuntu 22.04/24.04 - confirm this is intended.
  local deb_name="cuda-keyring_1.0-1_all.deb"
  local deb_url="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/${deb_name}"
  local work_dir deb_path

  # Download into a private temporary directory instead of the current one:
  # a leftover file of the same name would otherwise make wget save the new
  # download under a different name and dpkg would install the stale copy.
  work_dir=$(mktemp -d) || error "无法创建临时目录"
  deb_path="${work_dir}/${deb_name}"

  # Subshell + EXIT trap: the temporary directory is removed even when
  # run_cmd aborts through error (which exits the subshell); the failure is
  # then propagated to the main shell.
  (
    trap 'rm -rf -- "$work_dir"' EXIT
    run_cmd "wget -O $(printf '%q' "$deb_path") $(printf '%q' "$deb_url")" \
      && run_cmd "dpkg -i $(printf '%q' "$deb_path")"
  ) || exit 1
}

#######################################
# Refresh the apt package index.
# Arguments: none
# Outputs:   step banners; command output goes to the log via run_cmd
# Returns:   0 on success; run_cmd aborts the script if apt-get update fails
#######################################
update_package_list() {
  step "更新包列表"
  run_cmd "apt-get update"
}

#######################################
# Install DCGM: clean up, register the CUDA repository, install the DCGM
# package matching the CUDA major version, then start and verify the service.
# Globals:   CUDA_VERSION (written) - CUDA major version parsed from the
#            output of nvidia-smi
# Arguments: none
# Outputs:   step banners; command output goes to the log via run_cmd
# Returns:   0 on success; aborts through error when the CUDA version cannot
#            be determined or any step run through run_cmd fails
#######################################
install_dcgm_for_ubuntu_22() {
  step "开始安装 DCGM for Ubuntu 22"

  # Remove any existing DCGM configuration.
  cleanup_dcgm_config

  # Install the CUDA repository key.
  add_cuda_keyring

  # Register the CUDA repository.
  # NOTE(review): this is the ubuntu2004 repository although the function is
  # used for Ubuntu 22.04/24.04 - confirm this is intended.
  run_cmd "add-apt-repository \"deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /\" -y"

  # Refresh the package index.
  update_package_list

  # Determine the CUDA major version. With "set -euo pipefail" a missing or
  # failing nvidia-smi would otherwise terminate the script right here,
  # before the explicit error below can report what went wrong.
  CUDA_VERSION=$(nvidia-smi | sed -E -n 's/.*CUDA Version: ([0-9]+)[.].*/\1/p') \
    || CUDA_VERSION=""
  if [[ -z "$CUDA_VERSION" ]]; then
    error "无法检测到 CUDA 版本"
  fi

  # Install DCGM. apt-get is used rather than apt, whose command-line
  # interface is not intended for scripts.
  run_cmd "apt-get install --install-recommends -y datacenter-gpu-manager-4-cuda${CUDA_VERSION}"

  # Enable and start the DCGM service.
  run_cmd "systemctl --now enable nvidia-dcgm"
  run_cmd "systemctl restart nvidia-dcgm"
  run_cmd "systemctl --no-pager status nvidia-dcgm"

  # Verify the installation.
  run_cmd "dcgmi discovery -l"

  step "DCGM 安装成功"
}

#######################################
# Uninstall DCGM: stop the service, remove the packages, and remove the CUDA
# repository entry, key and source list.
# Arguments: none
# Outputs:   step banners; command output goes to the log via run_cmd
# Returns:   0 on success; steps suffixed with "|| true" are best-effort,
#            any other failing step aborts the script through run_cmd
#######################################
uninstall_dcgm() {
  step "开始卸载 DCGM"

  # Stop and disable the DCGM service (best-effort: it may not exist).
  run_cmd "systemctl stop nvidia-dcgm || true"
  run_cmd "systemctl disable nvidia-dcgm || true"

  # Remove the DCGM packages. The installer in this file installs
  # "datacenter-gpu-manager-4-cuda<N>", which the bare name
  # "datacenter-gpu-manager" does not cover, so match the whole family.
  run_cmd "apt-get remove -y 'datacenter-gpu-manager*' || true"
  run_cmd "apt-get autoremove -y || true"
  run_cmd "apt-get autoclean -y || true"

  # Remove the repository entry that the installer registers with
  # add-apt-repository; otherwise it stays configured after its key is
  # deleted below and is still consulted by the final package-list update.
  run_cmd "add-apt-repository --remove \"deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /\" -y || true"

  # Remove the CUDA repository key.
  run_cmd "rm -f /usr/share/keyrings/cuda-archive-keyring.gpg"

  # Remove the CUDA repository source list.
  run_cmd "rm -f /etc/apt/sources.list.d/cuda-ubuntu2004-x86_64.list"

  # Refresh the package index.
  update_package_list

  step "DCGM 卸载完成"
}

#######################################
# Main flow: detect the operating system, validate the single command-line
# argument, and run the requested action.
# Globals:   LOG_FILE, RED, NC (read); OS_ID, OS_VERSION (read after
#            detect_os_version has set them)
# Arguments: exactly one of --install or --uninstall
# Outputs:   step banners on the console and in the log
# Returns:   0 on success; exits with status 1 on a wrong argument, an
#            unsupported system, an unwritable log file or a failed step
#######################################
main() {
  # Every helper below appends to LOG_FILE. Check up front that this works so
  # an unprivileged run gets a clear message instead of failing on the first
  # log write. The message is printed directly because error itself logs.
  if ! { : >> "$LOG_FILE"; } 2> /dev/null; then
    printf '%b%s%b\n' "$RED" "错误: 无法写入日志文件 ${LOG_FILE},请以 root 权限运行" "$NC" >&2
    exit 1
  fi

  step "NVIDIA DCGM 安装脚本启动"
  detect_os_version

  if (( $# != 1 )); then
    error "请使用 --install 或 --uninstall"
  fi

  local action="$1"
  case "$action" in
    --install)
      case "${OS_ID}-${OS_VERSION}" in
        # Both supported releases share the same installation routine.
        ubuntu-22.04 | ubuntu-24.04)
          install_dcgm_for_ubuntu_22
          ;;
        *)
          error "不支持的操作系统版本: ${OS_ID} ${OS_VERSION}"
          ;;
      esac
      ;;
    --uninstall)
      uninstall_dcgm
      ;;
    *)
      error "无效的参数,请使用 --install 或 --uninstall"
      ;;
  esac

  step "操作完成,日志路径: $LOG_FILE"
}

main "$@"