添加 scripts/nvidia-container-toolkit.sh

This commit is contained in:
joy 2025-09-24 10:03:14 +08:00
parent 4d61430c67
commit 3bd6d94486
1 changed files with 222 additions and 0 deletions

View File

@ -0,0 +1,222 @@
#!/bin/bash
set -o errexit
# Requested operation ("install" / "uninstall") and the full package version;
# both start empty and are filled in while parsing the command line.
OPERATION=''
VERSION=''
# Core NVIDIA Container Toolkit packages; install and uninstall iterate over
# the same list so they are always handled together.
NVIDIA_PACKAGES='nvidia-container-toolkit nvidia-container-toolkit-base libnvidia-container-tools libnvidia-container1'
# Build-time dependency packages (installed only; uninstall leaves them alone).
BUILD_DEPENDS='build-essential devscripts debhelper fakeroot zlib1g-dev libnuma-dev libhwloc-dev libevent-dev'
# Upstream part of the version (e.g. "1.17.6" from "1.17.6-1"), used only for
# friendlier log output.
EXTRACT_VERSION=''
# Logging helpers (shared visual style).
#
# print_step: write a progress message to stdout in bold blue, prefixed "[*]".
# Arguments: $1 - message text, printed literally.
print_step() {
  # The message is passed through %s so that backslashes in interpolated
  # values (for example a user-supplied version string) are shown verbatim
  # instead of being expanded as escape sequences, which `echo -e` would do.
  printf '\033[1;34m[*] %s\033[0m\n' "$1"
}
# print_success: write a success message to stdout in bold green, prefixed
# "[+]".
# Arguments: $1 - message text, printed literally.
print_success() {
  # %s keeps backslashes in the message verbatim; `echo -e` would expand them
  # as escape sequences and garble values interpolated by the caller.
  printf '\033[1;32m[+] %s\033[0m\n' "$1"
}
# print_error: write an error message to stderr in bold red, prefixed with
# "[-]" and the script's error label.
# Arguments: $1 - message text, printed literally.
print_error() {
  # Callers interpolate raw command-line arguments into the message (e.g. an
  # unknown option), so it must not be run through `echo -e`: backslash
  # sequences in user input would be expanded and corrupt the output.
  printf '\033[1;31m[-] 错误:%s\033[0m\n' "$1" >&2
}
# Parse command-line arguments: --install | --uninstall, and --version <ver>.
# Globals written:
#   OPERATION        "install" or "uninstall"
#   VERSION          full package version as given (e.g. "1.17.6-1")
#   EXTRACT_VERSION  VERSION up to the first "-" (e.g. "1.17.6")
# Arguments: the script's positional parameters ("$@").
# Exits with status 1 (after printing an error) on an unknown option, a
# missing operation, a missing version, or a malformed version string.
parse_args() {
  # Characters permitted in a Debian package version. Anything else (spaces,
  # ';', '$', quotes, ...) cannot be a real package version and must never
  # reach the apt command line assembled later in the script.
  local version_re='^[0-9A-Za-z.+~:-]+$'

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        OPERATION="install"
        shift
        ;;
      --uninstall)
        OPERATION="uninstall"
        shift
        ;;
      --version)
        # The value must be present and must not look like another option.
        if [[ -n "$2" && "$2" != --* ]]; then
          VERSION="$2"
          # Upstream version: everything before the first "-"
          # (1.17.6-1 -> 1.17.6).
          EXTRACT_VERSION="${VERSION%%-*}"
          shift 2
        else
          print_error "缺少版本号,请用 --version 指定(如 1.17.6-1"
          exit 1
        fi
        ;;
      *)
        print_error "未知参数:$1"
        exit 1
        ;;
    esac
  done

  # Validate required arguments.
  if [[ -z "$OPERATION" ]]; then
    print_error "请指定操作:--install安装 或 --uninstall卸载"
    exit 1
  fi
  if [[ -z "$VERSION" ]]; then
    print_error "请用 --version 指定NVIDIA Container Toolkit版本"
    exit 1
  fi
  if [[ ! "$VERSION" =~ $version_re ]]; then
    print_error "版本号格式无效:$VERSION"
    exit 1
  fi
}
# Verify that Docker is available (the toolkit depends on it).
# Runs `docker --version` with all output discarded; if that command fails,
# prints an error and exits with status 1, otherwise reports success.
check_docker_exist() {
  docker --version >/dev/null 2>&1 || {
    print_error "未检测到Docker环境NVIDIA Container Toolkit依赖Docker请先安装Docker"
    exit 1
  }
  print_success "Docker环境检测正常"
}
# Add NVIDIA's GPG key and apt repository, then refresh the package index.
# All tool output is suppressed; only this script's own messages are shown.
# Files written:
#   /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
#   /etc/apt/sources.list.d/nvidia-container-toolkit.list
# Exits with status 1 (after printing an error) if any step fails.
add_nvidia_repo() {
  local key_url="https://nvidia.github.io/libnvidia-container/gpgkey"
  local list_url="https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list"
  local keyring="/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"
  local list_file="/etc/apt/sources.list.d/nvidia-container-toolkit.list"
  local repo_list

  print_step "正在配置NVIDIA官方软件源"

  # Make sure the keyring directory exists.
  mkdir -p /usr/share/keyrings >/dev/null 2>&1

  # 1. Import the GPG key (overwrites an existing keyring).
  #    pipefail is enabled inside a subshell so a failed download is detected
  #    even though gpg is the last stage of the pipeline; the option does not
  #    leak into the rest of the script.
  if ! (
    set -o pipefail
    curl -fsSL "$key_url" | gpg --dearmor --yes -o "$keyring"
  ) >/dev/null 2>&1; then
    print_error "GPG密钥下载失败网络异常或密钥地址失效"
    exit 1
  fi

  # 2. Download the repository definition first and verify it, so that a
  #    network/HTTP failure (-f) or an empty response never ends up being
  #    written into sources.list.d.
  if ! repo_list=$(curl -fsSL "$list_url" 2>/dev/null) || [[ -z "$repo_list" ]]; then
    print_error "NVIDIA软件源配置失败"
    exit 1
  fi
  #    Bind every "deb https://" entry to the keyring imported above.
  if ! printf '%s\n' "$repo_list" \
    | sed "s#deb https://#deb [signed-by=${keyring}] https://#g" \
      2>/dev/null >"$list_file"; then
    print_error "NVIDIA软件源配置失败"
    exit 1
  fi

  # 3. Refresh the package index.
  if ! apt-get update -qq >/dev/null 2>&1; then
    print_error "更新软件源索引失败"
    exit 1
  fi
  print_success "NVIDIA软件源配置完成"
}
# Install NVIDIA Container Toolkit at the requested version, install the
# build dependencies, configure Docker for the NVIDIA runtime and restart it.
# Globals read: NVIDIA_PACKAGES, BUILD_DEPENDS, VERSION, EXTRACT_VERSION
# All tool output is suppressed; only this script's own messages are shown.
# Exits with status 1 (after printing an error) if any step fails.
install_nvidia_toolkit() {
  local -a versioned_pkgs=()
  local pkg

  print_step "正在安装NVIDIA Container Toolkit $EXTRACT_VERSION(完整版本:$VERSION"

  # Pin every core package to the requested version. The arguments are kept
  # in an array and passed to apt-get directly (no string building + eval),
  # so the version value can never be re-parsed as shell syntax.
  # shellcheck disable=SC2086 # NVIDIA_PACKAGES is a space-separated list
  for pkg in $NVIDIA_PACKAGES; do
    versioned_pkgs+=("${pkg}=${VERSION}")
  done

  # Run the installation (silent; only failure is reported).
  if ! apt-get install -qq -y "${versioned_pkgs[@]}" >/dev/null 2>&1; then
    print_error "Toolkit安装失败可能版本不兼容或依赖冲突"
    exit 1
  fi

  # Install the build dependency packages.
  print_step "正在安装编译构建依赖库"
  # shellcheck disable=SC2086 # BUILD_DEPENDS is a space-separated list
  if ! apt-get install -qq -y $BUILD_DEPENDS >/dev/null 2>&1; then
    print_error "编译依赖库如build-essential安装失败"
    exit 1
  fi

  # Register the NVIDIA runtime with Docker, then restart Docker.
  print_step "正在配置Docker支持NVIDIA Runtime"
  if ! nvidia-ctk runtime configure --runtime=docker >/dev/null 2>&1; then
    print_error "Docker NVIDIA Runtime配置失败"
    exit 1
  fi
  if ! systemctl restart docker >/dev/null 2>&1; then
    print_error "重启Docker服务失败需手动执行systemctl restart docker"
    exit 1
  fi

  # Verify the result: the runtime binary must be runnable.
  if ! nvidia-container-runtime --version >/dev/null 2>&1; then
    print_error "Toolkit安装后验证失败nvidia-container-runtime命令不可用"
    exit 1
  fi
  print_success "NVIDIA Container Toolkit $EXTRACT_VERSION 安装成功!"
  echo -e "\033[1;33m[提示] 可执行 'nvidia-container-runtime --version' 查看详细版本信息\033[0m"
}
# Purge NVIDIA Container Toolkit at the requested version, remove the apt
# source and keyring added by this script, and start Docker again.
# Globals read: NVIDIA_PACKAGES, VERSION, EXTRACT_VERSION
# All tool output is suppressed; only this script's own messages are shown.
# Exits with status 1 (after printing an error) if stopping Docker, purging
# the packages, or starting Docker fails.
uninstall_nvidia_toolkit() {
  local -a purge_targets=()
  local pkg
  local docker_stopped=false

  print_step "正在卸载NVIDIA Container Toolkit $EXTRACT_VERSION(完整版本:$VERSION"

  # Stop Docker first so files are not in use while packages are removed.
  if systemctl is-active --quiet docker; then
    print_step "停止Docker服务避免文件占用"
    if ! systemctl stop docker >/dev/null 2>&1; then
      print_error "Docker服务停止失败需手动停止后重试"
      exit 1
    fi
    docker_stopped=true
  fi

  # Pin every core package to the requested version. The arguments are kept
  # in an array and passed to apt-get directly (no string building + eval),
  # so the version value can never be re-parsed as shell syntax.
  # shellcheck disable=SC2086 # NVIDIA_PACKAGES is a space-separated list
  for pkg in $NVIDIA_PACKAGES; do
    purge_targets+=("${pkg}=${VERSION}")
  done

  # Run the purge (silent; only failure is reported).
  if ! apt-get purge -qq -y "${purge_targets[@]}" >/dev/null 2>&1; then
    print_error "Toolkit卸载失败可能包已被删除或依赖冲突"
    # Do not leave Docker down just because the purge failed: bring it back
    # if this function was the one that stopped it.
    if [[ "$docker_stopped" == true ]] \
      && ! systemctl start docker >/dev/null 2>&1; then
      print_error "卸载后重启Docker失败需手动执行systemctl start docker"
    fi
    exit 1
  fi

  # Clean up leftovers (apt source, keyring, package cache). These steps are
  # best-effort: the packages are already gone, and aborting here under
  # `set -e` would exit silently with Docker still stopped.
  print_step "清理NVIDIA残留文件"
  rm -f /etc/apt/sources.list.d/nvidia-container-toolkit.list >/dev/null 2>&1 || true
  rm -f /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg >/dev/null 2>&1 || true
  apt-get autoremove -qq -y >/dev/null 2>&1 || true
  apt-get clean -qq >/dev/null 2>&1 || true

  # Start Docker again.
  if ! systemctl start docker >/dev/null 2>&1; then
    print_error "卸载后重启Docker失败需手动执行systemctl start docker"
    exit 1
  fi
  print_success "NVIDIA Container Toolkit $EXTRACT_VERSION 卸载成功!"
}
# Main entry point (flow control).
# Arguments: the script's command-line arguments ("$@").
# Steps: require root, parse arguments, then dispatch on OPERATION.
# Exits 0 on success; exits 1 if not run as root or OPERATION is unknown
# (the called steps exit with 1 themselves on failure).
main() {
  # 1. Require root privileges.
  if [[ $EUID -ne 0 ]]; then
    # Show the name the script was actually invoked with; the previously
    # hard-coded "nvidia-container-runtime.sh" did not match this file.
    print_error "请用root用户运行脚本sudo bash ${0##*/} ..."
    exit 1
  fi

  # 2. Parse arguments (sets OPERATION / VERSION / EXTRACT_VERSION).
  parse_args "$@"

  # 3. Dispatch to the install or uninstall flow.
  case "$OPERATION" in
    install)
      # Check Docker first, then: add NVIDIA repo -> install toolkit ->
      # configure Docker.
      check_docker_exist
      add_nvidia_repo
      install_nvidia_toolkit
      ;;
    uninstall)
      # Purge toolkit -> clean leftovers -> start Docker.
      uninstall_nvidia_toolkit
      ;;
    *)
      print_error "未知操作:$OPERATION"
      exit 1
      ;;
  esac

  # Normal exit (status 0).
  exit 0
}
# Start the script: hand all command-line arguments to main
main "$@"