#!/bin/bash
#
# nvidia-container-toolkit.sh - install or uninstall a pinned version of the
# NVIDIA Container Toolkit on an apt-based host that already has Docker.
#
# Usage (must be run as root):
#   bash nvidia-container-toolkit.sh --install   --version 1.17.6-1
#   bash nvidia-container-toolkit.sh --uninstall --version 1.17.6-1
#
# Provenance: payload of the patch that creates
# scripts/nvidia-container-toolkit.sh (commit 3bd6d94).
#
# Strict mode: abort on unhandled errors, on unset variables, and when any
# stage of a pipeline fails (so a failed download is never mistaken for
# success just because the last stage of the pipe succeeded).
set -euo pipefail

# --- State filled in by parse_args -----------------------------------------
OPERATION=""        # "install" or "uninstall"
VERSION=""          # full package version as given on the CLI, e.g. 1.17.6-1
EXTRACT_VERSION=""  # text before the first '-', e.g. 1.17.6 (display only)

# --- Constants ---------------------------------------------------------------
# Toolkit packages; install and uninstall both act on exactly this set.
readonly -a NVIDIA_PACKAGES=(
  nvidia-container-toolkit
  nvidia-container-toolkit-base
  libnvidia-container-tools
  libnvidia-container1
)
# Build dependencies; installed by --install, never touched by --uninstall.
readonly -a BUILD_DEPENDS=(
  build-essential
  devscripts
  debhelper
  fakeroot
  zlib1g-dev
  libnuma-dev
  libhwloc-dev
  libevent-dev
)
readonly NVIDIA_KEYRING_DIR="/usr/share/keyrings"
readonly NVIDIA_KEYRING="${NVIDIA_KEYRING_DIR}/nvidia-container-toolkit-keyring.gpg"
readonly NVIDIA_APT_LIST="/etc/apt/sources.list.d/nvidia-container-toolkit.list"
readonly NVIDIA_GPGKEY_URL="https://nvidia.github.io/libnvidia-container/gpgkey"
readonly NVIDIA_LIST_URL="https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list"

#######################################
# Logging helpers. The message is passed through '%s', so backslashes or
# '%' in user-supplied text are printed literally instead of being
# interpreted as escape sequences.
# Arguments: $1 - message
# Outputs:   print_step/print_success -> stdout; print_warn/print_error -> stderr
#######################################
print_step() {
  printf '\033[1;34m[*] %s\033[0m\n' "$1"
}

print_success() {
  printf '\033[1;32m[+] %s\033[0m\n' "$1"
}

print_warn() {
  printf '\033[1;33m[!] 警告:%s\033[0m\n' "$1" >&2
}

print_error() {
  printf '\033[1;31m[-] 错误:%s\033[0m\n' "$1" >&2
}

#######################################
# Parse --install / --uninstall / --version <ver>.
# Globals:   OPERATION, VERSION, EXTRACT_VERSION (written)
# Arguments: the script's command-line arguments
# Returns:   exits 1 on an unknown flag, a missing version value, or when
#            either the operation or the version was not supplied
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        OPERATION="install"
        shift
        ;;
      --uninstall)
        OPERATION="uninstall"
        shift
        ;;
      --version)
        # ${2:-} keeps 'set -u' from aborting when --version is the last word.
        if [[ -n "${2:-}" && "${2:-}" != --* ]]; then
          VERSION="$2"
          # Keep only the part before the first '-' (1.17.6-1 -> 1.17.6).
          EXTRACT_VERSION="${VERSION%%-*}"
          shift 2
        else
          print_error "缺少版本号,请用 --version 指定(如 1.17.6-1)"
          exit 1
        fi
        ;;
      *)
        print_error "未知参数:$1"
        exit 1
        ;;
    esac
  done

  if [[ -z "$OPERATION" ]]; then
    print_error "请指定操作:--install(安装) 或 --uninstall(卸载)"
    exit 1
  fi
  if [[ -z "$VERSION" ]]; then
    print_error "请用 --version 指定NVIDIA Container Toolkit版本"
    exit 1
  fi
}

#######################################
# Abort unless 'docker --version' runs successfully.
# Returns: exits 1 when the docker CLI is missing or fails
#######################################
check_docker_exist() {
  if ! docker --version >/dev/null 2>&1; then
    print_error "未检测到Docker环境!NVIDIA Container Toolkit依赖Docker,请先安装Docker"
    exit 1
  fi
  print_success "Docker环境检测正常"
}

#######################################
# Install NVIDIA's signing key and apt source list, then refresh the index.
# Each download is captured and checked before its target file is written,
# so a network failure or HTTP error cannot replace an existing keyring or
# source list with empty/garbage content.
# Globals: NVIDIA_* constants (read)
# Returns: exits 1 on any failure
#######################################
add_nvidia_repo() {
  print_step "正在配置NVIDIA官方软件源"

  if ! mkdir -p "$NVIDIA_KEYRING_DIR" >/dev/null 2>&1; then
    print_error "无法创建密钥目录:${NVIDIA_KEYRING_DIR}"
    exit 1
  fi

  # 1. Signing key: fetch the armored key, then convert it for apt
  #    (--yes overwrites a keyring left by an earlier run).
  local armored_key=""
  if ! armored_key=$(curl -fsSL "$NVIDIA_GPGKEY_URL" 2>/dev/null); then
    armored_key=""
  fi
  if [[ -z "$armored_key" ]] \
    || ! printf '%s\n' "$armored_key" \
      | gpg --dearmor --yes -o "$NVIDIA_KEYRING" >/dev/null 2>&1; then
    print_error "GPG密钥下载失败(网络异常或密钥地址失效)"
    exit 1
  fi

  # 2. Source list: bind every 'deb https://' entry to the keyring above.
  #    -f makes curl fail on HTTP errors instead of emitting the error page.
  local repo_entries=""
  if ! repo_entries=$(
    curl -fsSL "$NVIDIA_LIST_URL" 2>/dev/null \
      | sed "s#deb https://#deb [signed-by=${NVIDIA_KEYRING}] https://#g"
  ); then
    repo_entries=""
  fi
  if [[ -z "$repo_entries" ]] \
    || ! printf '%s\n' "$repo_entries" >"$NVIDIA_APT_LIST"; then
    print_error "NVIDIA软件源配置失败"
    exit 1
  fi

  # 3. Refresh the package index so the pinned versions become resolvable.
  if ! apt-get update -qq >/dev/null 2>&1; then
    print_error "更新软件源索引失败"
    exit 1
  fi
  print_success "NVIDIA软件源配置完成"
}

#######################################
# Install every toolkit package pinned to $VERSION, install the build
# dependencies, register the NVIDIA runtime with Docker and restart Docker.
# Globals: VERSION, EXTRACT_VERSION, NVIDIA_PACKAGES, BUILD_DEPENDS (read)
# Returns: exits 1 on any failure
#######################################
install_nvidia_toolkit() {
  print_step "正在安装NVIDIA Container Toolkit ${EXTRACT_VERSION}(完整版本:${VERSION})"

  # Build "pkg=version" arguments as an array: the user-supplied version is
  # handed to apt-get as data and is never re-parsed by the shell.
  local pkg
  local -a pinned=()
  for pkg in "${NVIDIA_PACKAGES[@]}"; do
    pinned+=("${pkg}=${VERSION}")
  done

  if ! apt-get install -qq -y "${pinned[@]}" >/dev/null 2>&1; then
    print_error "Toolkit安装失败(可能版本不兼容或依赖冲突)"
    exit 1
  fi

  print_step "正在安装编译构建依赖库"
  if ! apt-get install -qq -y "${BUILD_DEPENDS[@]}" >/dev/null 2>&1; then
    print_error "编译依赖库(如build-essential)安装失败"
    exit 1
  fi

  print_step "正在配置Docker支持NVIDIA Runtime"
  if ! nvidia-ctk runtime configure --runtime=docker >/dev/null 2>&1; then
    print_error "Docker NVIDIA Runtime配置失败"
    exit 1
  fi
  if ! systemctl restart docker >/dev/null 2>&1; then
    print_error "重启Docker服务失败(需手动执行:systemctl restart docker)"
    exit 1
  fi

  # Post-install check: the runtime binary must be callable.
  if ! nvidia-container-runtime --version >/dev/null 2>&1; then
    print_error "Toolkit安装后验证失败(nvidia-container-runtime命令不可用)"
    exit 1
  fi
  print_success "NVIDIA Container Toolkit ${EXTRACT_VERSION} 安装成功!"
  printf '\033[1;33m[提示] 可执行 '\''nvidia-container-runtime --version'\'' 查看详细版本信息\033[0m\n'
}

#######################################
# Purge the toolkit packages pinned to $VERSION, remove the apt source and
# keyring added by add_nvidia_repo, and start Docker again.
# Docker is stopped for the duration of the purge; if the purge fails the
# service is started again before exiting so the host is not left without
# Docker.
# Globals: VERSION, EXTRACT_VERSION, NVIDIA_PACKAGES, NVIDIA_* paths (read)
# Returns: exits 1 on failure
#######################################
uninstall_nvidia_toolkit() {
  print_step "正在卸载NVIDIA Container Toolkit ${EXTRACT_VERSION}(完整版本:${VERSION})"

  local docker_stopped=0
  if systemctl is-active --quiet docker; then
    print_step "停止Docker服务(避免文件占用)"
    if ! systemctl stop docker >/dev/null 2>&1; then
      print_error "Docker服务停止失败(需手动停止后重试)"
      exit 1
    fi
    docker_stopped=1
  fi

  # Same array-based argument construction as the install path (no eval).
  local pkg
  local -a pinned=()
  for pkg in "${NVIDIA_PACKAGES[@]}"; do
    pinned+=("${pkg}=${VERSION}")
  done

  if ! apt-get purge -qq -y "${pinned[@]}" >/dev/null 2>&1; then
    print_error "Toolkit卸载失败(可能包已被删除或依赖冲突)"
    # Undo our own 'systemctl stop' so a failed purge does not leave the
    # service down.
    if ((docker_stopped)); then
      if ! systemctl start docker >/dev/null 2>&1; then
        print_error "卸载后重启Docker失败(需手动执行:systemctl start docker)"
      fi
    fi
    exit 1
  fi

  print_step "清理NVIDIA残留文件"
  rm -f -- "$NVIDIA_APT_LIST" "$NVIDIA_KEYRING" >/dev/null 2>&1 || true
  # Cache cleanup is housekeeping only: warn instead of aborting, because
  # aborting here would skip the Docker start below.
  if ! apt-get autoremove -qq -y >/dev/null 2>&1; then
    print_warn "apt-get autoremove 执行失败,可稍后手动执行"
  fi
  if ! apt-get clean -qq >/dev/null 2>&1; then
    print_warn "apt-get clean 执行失败,可稍后手动执行"
  fi

  # Start Docker again (done unconditionally, also when it was not running
  # before the uninstall began).
  if ! systemctl start docker >/dev/null 2>&1; then
    print_error "卸载后重启Docker失败(需手动执行:systemctl start docker)"
    exit 1
  fi

  print_success "NVIDIA Container Toolkit ${EXTRACT_VERSION} 卸载成功!"
}

#######################################
# Entry point: require root, parse arguments, dispatch on the operation.
# Arguments: the script's command-line arguments
# Returns:   exits 0 on success, 1 on any error
#######################################
main() {
  if [[ $EUID -ne 0 ]]; then
    # Show the name this script was actually invoked as.
    print_error "请用root用户运行脚本(sudo bash ${0##*/} ...)"
    exit 1
  fi

  parse_args "$@"

  case "$OPERATION" in
    install)
      check_docker_exist
      add_nvidia_repo
      install_nvidia_toolkit
      ;;
    uninstall)
      uninstall_nvidia_toolkit
      ;;
    *)
      print_error "未知操作:$OPERATION"
      exit 1
      ;;
  esac

  exit 0
}

main "$@"