#!/bin/bash
#
# Install or uninstall a pinned version of the NVIDIA Container Toolkit on an
# apt-based system and register the NVIDIA runtime with Docker.
#
# Usage (must be run as root):
#   sudo bash nvidia-container-runtime.sh --install   --version 1.17.6-1
#   sudo bash nvidia-container-runtime.sh --uninstall --version 1.17.6-1

# -e: abort on unhandled failures; -u: reject unset variables;
# pipefail: a failing curl inside "curl | sed | tee" must fail the pipeline.
set -euo pipefail

# Core state filled in by parse_args.
OPERATION=""
VERSION=""
# Upstream part of the version (e.g. "1.17.6" from "1.17.6-1"); used only for
# friendlier log output.
EXTRACT_VERSION=""

# Toolkit packages that are installed/purged together, all pinned to VERSION.
readonly -a NVIDIA_PACKAGES=(
  nvidia-container-toolkit
  nvidia-container-toolkit-base
  libnvidia-container-tools
  libnvidia-container1
)

# Build dependencies; installed by --install and left untouched by --uninstall.
readonly -a BUILD_DEPENDS=(
  build-essential
  devscripts
  debhelper
  fakeroot
  zlib1g-dev
  libnuma-dev
  libhwloc-dev
  libevent-dev
)

#######################################
# Logging helpers (uniform coloured prefixes).
# Arguments: $1 - message text (printed literally)
# Outputs:   print_step/print_success write to stdout;
#            print_warning/print_error write to stderr.
#######################################
print_step() {
  printf '\033[1;34m[*] %s\033[0m\n' "$1"
}

print_success() {
  printf '\033[1;32m[+] %s\033[0m\n' "$1"
}

print_warning() {
  printf '\033[1;33m[!] 警告:%s\033[0m\n' "$1" >&2
}

print_error() {
  printf '\033[1;31m[-] 错误:%s\033[0m\n' "$1" >&2
}

#######################################
# Parse command-line arguments: --install | --uninstall, plus --version <ver>.
# Globals:   OPERATION, VERSION, EXTRACT_VERSION (written)
# Arguments: the script's positional parameters
# Returns:   exits 1 on unknown/missing arguments
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        OPERATION="install"
        shift
        ;;
      --uninstall)
        OPERATION="uninstall"
        shift
        ;;
      --version)
        # ${2:-} keeps "set -u" happy when --version is the last argument.
        if [[ -n "${2:-}" && "${2:-}" != --* ]]; then
          VERSION="$2"
          # Keep everything before the first '-' (1.17.6-1 -> 1.17.6).
          EXTRACT_VERSION="${VERSION%%-*}"
          shift 2
        else
          print_error "缺少版本号,请用 --version 指定(如 1.17.6-1)"
          exit 1
        fi
        ;;
      *)
        print_error "未知参数:$1"
        exit 1
        ;;
    esac
  done

  # Both an operation and a version are mandatory.
  if [[ -z "$OPERATION" ]]; then
    print_error "请指定操作:--install(安装) 或 --uninstall(卸载)"
    exit 1
  fi
  if [[ -z "$VERSION" ]]; then
    print_error "请用 --version 指定NVIDIA Container Toolkit版本"
    exit 1
  fi
}

#######################################
# Verify that the docker CLI is usable; the toolkit is configured against it.
# Returns: exits 1 when "docker --version" fails
#######################################
check_docker_exist() {
  if ! docker --version >/dev/null 2>&1; then
    print_error "未检测到Docker环境!NVIDIA Container Toolkit依赖Docker,请先安装Docker"
    exit 1
  fi
  print_success "Docker环境检测正常"
}

#######################################
# Install NVIDIA's GPG key and apt source list, then refresh the apt index.
# Returns: exits 1 if any step fails
#######################################
add_nvidia_repo() {
  local -r keyring="/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"
  local -r source_list="/etc/apt/sources.list.d/nvidia-container-toolkit.list"

  print_step "正在配置NVIDIA官方软件源"

  # stderr is left visible so a "set -e" abort here is not silent.
  mkdir -p /usr/share/keyrings >/dev/null

  # 1. Import the GPG key (--yes overwrites an existing keyring).
  if ! curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
    | gpg --dearmor --yes -o "$keyring" >/dev/null 2>&1; then
    print_error "GPG密钥下载失败(网络异常或密钥地址失效)"
    exit 1
  fi

  # 2. Write the source list with every entry bound to the keyring.
  #    -f makes curl fail on HTTP errors instead of piping an error page into
  #    the list file; pipefail propagates that failure to this check.
  if ! curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
    | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
    | tee "$source_list" >/dev/null 2>&1; then
    print_error "NVIDIA软件源配置失败"
    exit 1
  fi

  # 3. Refresh the package index.
  if ! apt-get update -qq >/dev/null 2>&1; then
    print_error "更新软件源索引失败"
    exit 1
  fi

  print_success "NVIDIA软件源配置完成"
}

#######################################
# Install the toolkit packages pinned to VERSION, install the build
# dependencies, register the NVIDIA runtime with Docker and restart Docker.
# Globals: VERSION, EXTRACT_VERSION, NVIDIA_PACKAGES, BUILD_DEPENDS (read)
# Returns: exits 1 if any step fails
#######################################
install_nvidia_toolkit() {
  local pkg
  local -a pinned_packages=()

  print_step "正在安装NVIDIA Container Toolkit ${EXTRACT_VERSION}(完整版本:${VERSION})"

  # Pin every core package to the requested version. Arguments are passed as
  # an array (no eval) so the user-supplied version is never re-parsed by the
  # shell.
  for pkg in "${NVIDIA_PACKAGES[@]}"; do
    pinned_packages+=("${pkg}=${VERSION}")
  done

  if ! apt-get install -qq -y "${pinned_packages[@]}" >/dev/null 2>&1; then
    print_error "Toolkit安装失败(可能版本不兼容或依赖冲突)"
    exit 1
  fi

  print_step "正在安装编译构建依赖库"
  if ! apt-get install -qq -y "${BUILD_DEPENDS[@]}" >/dev/null 2>&1; then
    print_error "编译依赖库(如build-essential)安装失败"
    exit 1
  fi

  # Register the NVIDIA runtime with Docker, then restart Docker to apply it.
  print_step "正在配置Docker支持NVIDIA Runtime"
  if ! nvidia-ctk runtime configure --runtime=docker >/dev/null 2>&1; then
    print_error "Docker NVIDIA Runtime配置失败"
    exit 1
  fi
  if ! systemctl restart docker >/dev/null 2>&1; then
    print_error "重启Docker服务失败(需手动执行:systemctl restart docker)"
    exit 1
  fi

  # Sanity check: the runtime binary must now be callable.
  if ! nvidia-container-runtime --version >/dev/null 2>&1; then
    print_error "Toolkit安装后验证失败(nvidia-container-runtime命令不可用)"
    exit 1
  fi

  print_success "NVIDIA Container Toolkit ${EXTRACT_VERSION} 安装成功!"
  printf "\033[1;33m[提示] 可执行 'nvidia-container-runtime --version' 查看详细版本信息\033[0m\n"
}

#######################################
# Purge the toolkit packages pinned to VERSION, remove the NVIDIA apt source
# and keyring, and start Docker again.
# Globals: VERSION, EXTRACT_VERSION, NVIDIA_PACKAGES (read)
# Returns: exits 1 if stopping Docker, purging, or starting Docker fails
#######################################
uninstall_nvidia_toolkit() {
  local pkg
  local docker_stopped=0
  local -a pinned_packages=()

  print_step "正在卸载NVIDIA Container Toolkit ${EXTRACT_VERSION}(完整版本:${VERSION})"

  # Stop Docker first so files are not held open during removal.
  if systemctl is-active --quiet docker; then
    print_step "停止Docker服务(避免文件占用)"
    if ! systemctl stop docker >/dev/null 2>&1; then
      print_error "Docker服务停止失败(需手动停止后重试)"
      exit 1
    fi
    docker_stopped=1
  fi

  # Same pinned package specs as the installer; array form, no eval.
  for pkg in "${NVIDIA_PACKAGES[@]}"; do
    pinned_packages+=("${pkg}=${VERSION}")
  done

  if ! apt-get purge -qq -y "${pinned_packages[@]}" >/dev/null 2>&1; then
    print_error "Toolkit卸载失败(可能包已被删除或依赖冲突)"
    # Do not leave the host without Docker just because the purge failed.
    if ((docker_stopped)); then
      if ! systemctl start docker >/dev/null 2>&1; then
        print_error "恢复Docker服务失败(需手动执行:systemctl start docker)"
      fi
    fi
    exit 1
  fi

  # Remove the apt source and keyring; stderr stays visible so a failure that
  # aborts under "set -e" is not silent.
  print_step "清理NVIDIA残留文件"
  rm -f -- /etc/apt/sources.list.d/nvidia-container-toolkit.list >/dev/null
  rm -f -- /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg >/dev/null

  # Cache/dependency cleanup is best-effort: the toolkit is already gone, so
  # warn instead of aborting with Docker still stopped.
  if ! apt-get autoremove -qq -y >/dev/null 2>&1; then
    print_warning "清理无用依赖失败(可手动执行:apt-get autoremove)"
  fi
  if ! apt-get clean -qq >/dev/null 2>&1; then
    print_warning "清理apt缓存失败(可手动执行:apt-get clean)"
  fi

  # Bring Docker back up.
  if ! systemctl start docker >/dev/null 2>&1; then
    print_error "卸载后重启Docker失败(需手动执行:systemctl start docker)"
    exit 1
  fi

  print_success "NVIDIA Container Toolkit ${EXTRACT_VERSION} 卸载成功!"
}

#######################################
# Entry point: require root, parse arguments, dispatch to install/uninstall.
# Arguments: the script's positional parameters
# Returns:   exits 0 on success, 1 on any failure
#######################################
main() {
  # 1. Root is required for apt, systemctl and writes under /etc and /usr.
  if [[ $EUID -ne 0 ]]; then
    print_error "请用root用户运行脚本(sudo bash nvidia-container-runtime.sh ...)"
    exit 1
  fi

  # 2. Parse arguments.
  parse_args "$@"

  # 3. Dispatch.
  case "$OPERATION" in
    install)
      # Check Docker -> configure NVIDIA repo -> install and configure toolkit.
      check_docker_exist
      add_nvidia_repo
      install_nvidia_toolkit
      ;;
    uninstall)
      # Purge toolkit -> clean leftovers -> start Docker.
      uninstall_nvidia_toolkit
      ;;
    *)
      print_error "未知操作:$OPERATION"
      exit 1
      ;;
  esac

  exit 0
}

main "$@"