#!/bin/bash
#
# Install or uninstall NVIDIA DCGM (datacenter-gpu-manager) on Ubuntu.
#
# Usage: <script> --install | --uninstall
#
# Must be run as root. The console only shows coloured step/error messages;
# the output of every executed command is appended to a timestamped log file
# under /var/log.

set -euo pipefail

#######################################
# Globals
#######################################
LOG_FILE="/var/log/nvidia-dcgm_$(date +%Y%m%d%H%M%S).log"
readonly LOG_FILE

# ANSI colour sequences (real escape bytes, so plain printf '%s' can emit them).
readonly GREEN=$'\033[1;32m'
readonly RED=$'\033[1;31m'
readonly NC=$'\033[0m' # reset colour

# NOTE(review): the ubuntu2004 repository path is used even though the main
# flow only accepts Ubuntu 22.04 and 24.04 -- confirm this is intentional.
readonly CUDA_REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64"
readonly CUDA_REPO_LIST="/etc/apt/sources.list.d/cuda-ubuntu2004-x86_64.list"
readonly CUDA_KEYRING_DEB="cuda-keyring_1.0-1_all.deb"
readonly CUDA_KEYRING_FILE="/usr/share/keyrings/cuda-archive-keyring.gpg"

# Filled in by detect_os_version.
OS_ID=""
OS_VERSION=""

# Temporary download directory; removed by cleanup_tmp on any exit path.
TMP_DIR=""

#######################################
# Append a timestamped line to the log file.
# Globals:   LOG_FILE (read)
# Arguments: message words, joined with spaces
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >>"$LOG_FILE"
}

#######################################
# Print a step message in green on stdout and record it in the log.
# Arguments: $1 - message
#######################################
step() {
  local msg="==> $1"
  printf '%s%s%s\n' "$GREEN" "$msg" "$NC"
  log "$msg"
}

#######################################
# Print an error message in red on stderr, record it in the log and exit 1.
# Arguments: $1 - message
#######################################
error() {
  local msg="错误: $1"
  printf '%s%s%s\n' "$RED" "$msg" "$NC" >&2
  log "$msg"
  exit 1
}

#######################################
# Run a shell command line with all of its output sent to the log file.
# The command is passed as ONE string and evaluated, so it may contain shell
# syntax such as `|| true`. Only call this with trusted, script-internal
# strings. Aborts the script through error() if the command fails.
# Arguments: $1 - command line
#######################################
run_cmd() {
  step "执行: $1"
  if ! eval "$1" &>>"$LOG_FILE"; then
    error "命令执行失败: $1"
  fi
}

#######################################
# Abort early with a readable message when not running as root.
# Writes directly to stderr because the log file under /var/log may not be
# writable for this user.
#######################################
require_root() {
  if ((EUID != 0)); then
    printf '%s错误: 请以 root 权限运行此脚本%s\n' "$RED" "$NC" >&2
    exit 1
  fi
}

#######################################
# EXIT trap handler: remove the temporary download directory, if any.
# Globals: TMP_DIR (read)
#######################################
cleanup_tmp() {
  if [[ -n "$TMP_DIR" ]]; then
    rm -rf -- "$TMP_DIR"
  fi
}

#######################################
# Read the distribution id and version from /etc/os-release.
# Globals: OS_ID, OS_VERSION (written)
#######################################
detect_os_version() {
  if [[ ! -f /etc/os-release ]]; then
    error "无法检测到操作系统版本"
  fi

  # shellcheck source=/dev/null
  . /etc/os-release

  # ID / VERSION_ID may be absent; default them so `set -u` does not abort
  # before the explicit error below can be reported.
  OS_ID="${ID:-}"
  OS_VERSION="${VERSION_ID:-}"
  if [[ -z "$OS_ID" || -z "$OS_VERSION" ]]; then
    error "无法检测到操作系统版本"
  fi

  step "检测到系统: ${OS_ID} ${OS_VERSION}"
}

#######################################
# Remove an existing CUDA repository list file and the datacenter-gpu-manager
# package before a fresh installation. Package removal is best-effort.
#######################################
cleanup_dcgm_config() {
  step "清理现有的 DCGM 配置"
  run_cmd "rm -f ${CUDA_REPO_LIST}"
  run_cmd "apt-get remove -y datacenter-gpu-manager || true"
  run_cmd "apt-get autoremove -y || true"
  run_cmd "apt-get autoclean -y || true"
}

#######################################
# Download and install the cuda-keyring package.
# The .deb is fetched into a private temporary directory so that a stale file
# of the same name in the current directory can never be installed instead.
# Globals: TMP_DIR (written)
#######################################
add_cuda_keyring() {
  step "添加 CUDA 仓库密钥"

  TMP_DIR=$(mktemp -d) || error "无法创建临时目录"
  local deb_path="${TMP_DIR}/${CUDA_KEYRING_DEB}"

  run_cmd "wget -O '${deb_path}' '${CUDA_REPO_URL}/${CUDA_KEYRING_DEB}'"
  run_cmd "dpkg -i '${deb_path}'"
  run_cmd "rm -rf '${TMP_DIR}'"
  TMP_DIR=""
}

#######################################
# Refresh the apt package index.
#######################################
update_package_list() {
  step "更新包列表"
  run_cmd "apt-get update"
}

#######################################
# Install DCGM: clean up, register the CUDA repository, install the DCGM
# package matching the CUDA major version reported by nvidia-smi, then start
# the nvidia-dcgm service and run `dcgmi discovery -l` as a check.
# Called by the main flow for both Ubuntu 22.04 and 24.04.
#######################################
install_dcgm_for_ubuntu_22() {
  step "开始安装 DCGM for Ubuntu 22"

  cleanup_dcgm_config
  add_cuda_keyring

  # NOTE(review): cleanup_dcgm_config deletes a dedicated cuda-*.list file,
  # which suggests the repository may already be registered elsewhere --
  # confirm this additional entry is required.
  run_cmd "add-apt-repository \"deb ${CUDA_REPO_URL}/ /\" -y"

  update_package_list

  # Extract the CUDA major version from the nvidia-smi banner. A missing or
  # failing nvidia-smi must reach the explicit error below instead of
  # silently killing the script via `set -e` / `pipefail`.
  local cuda_major
  cuda_major=$(nvidia-smi 2>>"$LOG_FILE" \
    | sed -E -n 's/.*CUDA Version: ([0-9]+)[.].*/\1/p') || true
  if [[ -z "$cuda_major" ]]; then
    error "无法检测到 CUDA 版本"
  fi

  # apt-get rather than apt: consistent with every other package command in
  # this script.
  run_cmd "apt-get install --install-recommends datacenter-gpu-manager-4-cuda${cuda_major} -y"

  # Enable and start the service, then record its status in the log.
  run_cmd "systemctl --now enable nvidia-dcgm"
  run_cmd "systemctl restart nvidia-dcgm"
  run_cmd "systemctl status nvidia-dcgm"

  # Verify the installation.
  run_cmd "dcgmi discovery -l"

  step "DCGM 安装成功"
}

#######################################
# Uninstall DCGM: stop the service, remove the packages, and remove the CUDA
# repository entry, list file and keyring file. Service and package steps are
# best-effort so that a partially installed system can still be cleaned up.
#######################################
uninstall_dcgm() {
  step "开始卸载 DCGM"

  # Stop and disable the service.
  run_cmd "systemctl stop nvidia-dcgm || true"
  run_cmd "systemctl disable nvidia-dcgm || true"

  # Remove the plain package and the datacenter-gpu-manager-4-cuda<N> package
  # that the install path actually installs. Kept as two commands: apt-get
  # rejects the whole invocation when any one name/pattern matches nothing.
  run_cmd "apt-get remove -y datacenter-gpu-manager || true"
  run_cmd "apt-get remove -y 'datacenter-gpu-manager-4-*' || true"
  run_cmd "apt-get autoremove -y || true"
  run_cmd "apt-get autoclean -y || true"

  # Drop the repository entry that the install path added.
  run_cmd "add-apt-repository --remove \"deb ${CUDA_REPO_URL}/ /\" -y || true"

  # Remove the CUDA repository key and list file.
  run_cmd "rm -f ${CUDA_KEYRING_FILE}"
  run_cmd "rm -f ${CUDA_REPO_LIST}"

  update_package_list

  step "DCGM 卸载完成"
}

#######################################
# Entry point: validate the single --install / --uninstall argument and
# dispatch to the matching routine.
# Arguments: "$@" - command-line arguments of the script
#######################################
main() {
  require_root
  trap cleanup_tmp EXIT

  step "NVIDIA DCGM 安装脚本启动"
  detect_os_version

  if [[ $# -ne 1 ]]; then
    error "请使用 --install 或 --uninstall"
  fi

  local action=$1
  case "$action" in
    --install)
      case "${OS_ID}-${OS_VERSION}" in
        ubuntu-22.04 | ubuntu-24.04)
          install_dcgm_for_ubuntu_22
          ;;
        *)
          error "不支持的操作系统版本: ${OS_ID} ${OS_VERSION}"
          ;;
      esac
      ;;
    --uninstall)
      uninstall_dcgm
      ;;
    *)
      error "无效的参数,请使用 --install 或 --uninstall"
      ;;
  esac

  step "操作完成,日志路径: $LOG_FILE"
}

main "$@"