ansible-devops/scripts/nvidia-container-toolkit.sh

223 lines
7.4 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
set -o errexit

# --- State filled in by command-line parsing -------------------------------
# Requested action; empty until an operation flag has been parsed.
OPERATION=''
# Full package version given on the command line (e.g. "1.17.6-1").
VERSION=''
# Leading part of VERSION before the first "-" (e.g. "1.17.6" from "1.17.6-1");
# used for friendlier output.
EXTRACT_VERSION=''

# --- Package sets ------------------------------------------------------------
# Core NVIDIA Container Toolkit packages; install and uninstall treat them as one set.
NVIDIA_PACKAGES='nvidia-container-toolkit'
NVIDIA_PACKAGES+=' nvidia-container-toolkit-base'
NVIDIA_PACKAGES+=' libnvidia-container-tools'
NVIDIA_PACKAGES+=' libnvidia-container1'

# Build dependencies; only needed when installing, never touched on uninstall.
BUILD_DEPENDS='build-essential devscripts debhelper fakeroot'
BUILD_DEPENDS+=' zlib1g-dev libnuma-dev libhwloc-dev libevent-dev'
# Logging helpers (shared look: colored tag followed by the message).
#
# print_step: write "[*] <message>" in bold blue to stdout.
#   $1 - message text; backslash escapes inside it are expanded.
print_step() {
  local msg=$1
  printf '%b\n' "\033[1;34m[*] ${msg}\033[0m"
}
# print_success: write "[+] <message>" in bold green to stdout.
#   $1 - message text; backslash escapes inside it are expanded.
print_success() {
  local msg=$1
  printf '%b\n' "\033[1;32m[+] ${msg}\033[0m"
}
# print_error: write "[-] <error label><message>" in bold red to stderr.
#   $1 - message text; backslash escapes inside it are expanded.
print_error() {
  local msg=$1
  printf '%b\n' "\033[1;31m[-] 错误:${msg}\033[0m" >&2
}
# Parse the command line into the globals OPERATION, VERSION and EXTRACT_VERSION.
#
# Usage:     parse_args (--install | --uninstall) --version <package-version>
# Globals:   OPERATION       (written) "install" or "uninstall"; the last flag wins
#            VERSION         (written) full version string, e.g. "1.17.6-1"
#            EXTRACT_VERSION (written) VERSION up to the first "-", e.g. "1.17.6"
# Exits:     1 after print_error when a flag is unknown, the version is missing
#            or malformed, or no operation / no version was supplied.
parse_args() {
  # VERSION is later spliced into apt-get command lines, so only the Debian
  # version character set (digits, letters, ". + ~ : -") is accepted and the
  # value must start with a digit. This rejects shell metacharacters up front.
  local -r version_pattern='^[0-9][A-Za-z0-9.+~:-]*$'

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        OPERATION="install"
        shift
        ;;
      --uninstall)
        OPERATION="uninstall"
        shift
        ;;
      --version)
        # A following token that looks like another flag means the value is missing.
        if [[ -z "${2:-}" || "$2" == --* ]]; then
          print_error "缺少版本号,请用 --version 指定(如 1.17.6-1"
          exit 1
        fi
        if [[ ! "$2" =~ $version_pattern ]]; then
          print_error "版本号格式非法:$2"
          exit 1
        fi
        VERSION="$2"
        # Keep everything before the first "-" (1.17.6-1 -> 1.17.6).
        EXTRACT_VERSION="${VERSION%%-*}"
        shift 2
        ;;
      *)
        print_error "未知参数:$1"
        exit 1
        ;;
    esac
  done

  # Both an operation and a version are mandatory.
  if [[ -z "$OPERATION" ]]; then
    print_error "请指定操作:--install安装 或 --uninstall卸载"
    exit 1
  fi
  if [[ -z "$VERSION" ]]; then
    print_error "请用 --version 指定NVIDIA Container Toolkit版本"
    exit 1
  fi
}
# Verify that Docker is usable, since the toolkit depends on it.
# Runs `docker --version` with all output discarded; on success reports it via
# print_success, otherwise reports via print_error and exits with status 1.
check_docker_exist() {
  if docker --version &>/dev/null; then
    print_success "Docker环境检测正常"
    return
  fi

  print_error "未检测到Docker环境NVIDIA Container Toolkit依赖Docker请先安装Docker"
  exit 1
}
# Register the NVIDIA apt repository: import its GPG key, write the source list
# (bound to that key through "signed-by") and refresh the package index.
# All tool output is suppressed; only the print_* status lines are shown.
#
# Files written: /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
#                /etc/apt/sources.list.d/nvidia-container-toolkit.list
# Exits:         1 after print_error if the key download/import, the list
#                download/write, or `apt-get update` fails.
add_nvidia_repo() {
  local -r keyring_path="/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"
  local -r list_path="/etc/apt/sources.list.d/nvidia-container-toolkit.list"
  local -r repo_url="https://nvidia.github.io/libnvidia-container"
  local repo_entries

  print_step "正在配置NVIDIA官方软件源"

  # Make sure the keyring directory exists.
  mkdir -p "${keyring_path%/*}" >/dev/null 2>&1

  # 1. Import the GPG key (--yes overwrites an existing keyring).
  #    pipefail is enabled in a subshell so a failed download is not masked by
  #    the exit status of gpg, the last stage of the pipeline.
  if ! (
    set -o pipefail
    curl -fsSL "${repo_url}/gpgkey" | gpg --dearmor --yes -o "$keyring_path"
  ) >/dev/null 2>&1; then
    print_error "GPG密钥下载失败网络异常或密钥地址失效"
    exit 1
  fi

  # 2. Fetch the source list BEFORE touching the target file, so a failed or
  #    empty download (-f turns HTTP errors into failures) can never leave an
  #    empty or HTML-filled list in sources.list.d.
  if ! repo_entries=$(curl -fsSL "${repo_url}/stable/deb/nvidia-container-toolkit.list" 2>/dev/null) \
    || [[ -z "$repo_entries" ]]; then
    print_error "NVIDIA软件源配置失败"
    exit 1
  fi
  # Bind every "deb https://" entry to the imported keyring and install the list.
  if ! printf '%s\n' "$repo_entries" \
    | sed "s#deb https://#deb [signed-by=${keyring_path}] https://#g" \
    | tee "$list_path" >/dev/null 2>&1; then
    print_error "NVIDIA软件源配置失败"
    exit 1
  fi

  # 3. Refresh the package index so the new repository becomes visible.
  if ! apt-get update -qq >/dev/null 2>&1; then
    print_error "更新软件源索引失败"
    exit 1
  fi

  print_success "NVIDIA软件源配置完成"
}
# Install the NVIDIA Container Toolkit at a pinned version, install the build
# dependencies, enable the NVIDIA runtime in Docker and restart Docker.
# All tool output is suppressed; only the print_* status lines are shown.
#
# Globals read: NVIDIA_PACKAGES (space-separated package names),
#               BUILD_DEPENDS   (space-separated package names),
#               VERSION         (version pinned on every NVIDIA package),
#               EXTRACT_VERSION (short version, used in messages only)
# Exits:        1 after print_error as soon as any step fails.
install_nvidia_toolkit() {
  local pkg
  local -a pkg_names=() pinned_pkgs=() build_deps=()

  print_step "正在安装NVIDIA Container Toolkit ${EXTRACT_VERSION}(完整版本:${VERSION}"

  # Pin every core package to the requested version ("name=version").
  # The arguments are kept in an array and passed to apt-get directly; VERSION
  # is never re-parsed by the shell, so it cannot inject commands.
  read -r -a pkg_names <<<"$NVIDIA_PACKAGES"
  for pkg in "${pkg_names[@]}"; do
    pinned_pkgs+=("${pkg}=${VERSION}")
  done

  if ! apt-get install -qq -y "${pinned_pkgs[@]}" >/dev/null 2>&1; then
    print_error "Toolkit安装失败可能版本不兼容或依赖冲突"
    exit 1
  fi

  # Build dependencies (unpinned).
  print_step "正在安装编译构建依赖库"
  read -r -a build_deps <<<"$BUILD_DEPENDS"
  if ! apt-get install -qq -y "${build_deps[@]}" >/dev/null 2>&1; then
    print_error "编译依赖库如build-essential安装失败"
    exit 1
  fi

  # Register the NVIDIA runtime with Docker, then restart Docker to apply it.
  print_step "正在配置Docker支持NVIDIA Runtime"
  if ! nvidia-ctk runtime configure --runtime=docker >/dev/null 2>&1; then
    print_error "Docker NVIDIA Runtime配置失败"
    exit 1
  fi
  if ! systemctl restart docker >/dev/null 2>&1; then
    print_error "重启Docker服务失败需手动执行systemctl restart docker"
    exit 1
  fi

  # Post-install check: the runtime binary must be callable.
  if ! nvidia-container-runtime --version >/dev/null 2>&1; then
    print_error "Toolkit安装后验证失败nvidia-container-runtime命令不可用"
    exit 1
  fi

  print_success "NVIDIA Container Toolkit ${EXTRACT_VERSION} 安装成功!"
  echo -e "\033[1;33m[提示] 可执行 'nvidia-container-runtime --version' 查看详细版本信息\033[0m"
}
# Purge the NVIDIA Container Toolkit packages at the given version, remove the
# repository list and keyring, clean apt leftovers and start Docker again.
# All tool output is suppressed; only the print_* status lines are shown.
#
# Globals read: NVIDIA_PACKAGES (space-separated package names),
#               VERSION         (version pinned on every package),
#               EXTRACT_VERSION (short version, used in messages only)
# Exits:        1 after print_error if Docker cannot be stopped, the purge
#               fails, or Docker cannot be started afterwards.
uninstall_nvidia_toolkit() {
  local pkg
  local docker_stopped_here=false
  local -a pkg_names=() pinned_pkgs=()

  print_step "正在卸载NVIDIA Container Toolkit ${EXTRACT_VERSION}(完整版本:${VERSION}"

  # Stop Docker first so no toolkit file is in use while it is being removed.
  if systemctl is-active --quiet docker; then
    print_step "停止Docker服务避免文件占用"
    if ! systemctl stop docker >/dev/null 2>&1; then
      print_error "Docker服务停止失败需手动停止后重试"
      exit 1
    fi
    docker_stopped_here=true
  fi

  # Pin every core package to the requested version ("name=version").
  # The arguments are kept in an array and passed to apt-get directly; VERSION
  # is never re-parsed by the shell, so it cannot inject commands.
  read -r -a pkg_names <<<"$NVIDIA_PACKAGES"
  for pkg in "${pkg_names[@]}"; do
    pinned_pkgs+=("${pkg}=${VERSION}")
  done

  if ! apt-get purge -qq -y "${pinned_pkgs[@]}" >/dev/null 2>&1; then
    # Nothing was removed, so do not leave the host with Docker down just
    # because this function stopped it. Best effort: the purge error below is
    # the one worth reporting, hence the restart result is ignored.
    if [[ "$docker_stopped_here" == true ]]; then
      systemctl start docker >/dev/null 2>&1 || true
    fi
    print_error "Toolkit卸载失败可能包已被删除或依赖冲突"
    exit 1
  fi

  # Remove leftovers: repository list, keyring, orphaned packages, apt cache.
  print_step "清理NVIDIA残留文件"
  rm -f /etc/apt/sources.list.d/nvidia-container-toolkit.list >/dev/null 2>&1
  rm -f /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg >/dev/null 2>&1
  apt-get autoremove -qq -y >/dev/null 2>&1
  apt-get clean -qq >/dev/null 2>&1

  # Bring Docker back up.
  if ! systemctl start docker >/dev/null 2>&1; then
    print_error "卸载后重启Docker失败需手动执行systemctl start docker"
    exit 1
  fi

  print_success "NVIDIA Container Toolkit ${EXTRACT_VERSION} 卸载成功!"
}
# Entry-point logic: require root, parse the arguments, then run the install
# or uninstall flow selected by OPERATION.
#
# Arguments: "$@" - the script's command-line arguments, forwarded to parse_args
# Exits:     0 when the selected flow completes; 1 when not run as root or
#            when OPERATION holds an unknown value (the called steps exit on
#            their own failures).
main() {
  # 1. Root is required.
  if [[ $EUID -ne 0 ]]; then
    # The hint names this script (nvidia-container-toolkit.sh).
    print_error "请用root用户运行脚本sudo bash nvidia-container-toolkit.sh ..."
    exit 1
  fi

  # 2. Fill OPERATION / VERSION / EXTRACT_VERSION from the command line.
  parse_args "$@"

  # 3. Dispatch on the requested operation.
  case "$OPERATION" in
    install)
      # Docker must be present before anything is installed.
      check_docker_exist
      # Configure the NVIDIA repository, then install the toolkit and wire it into Docker.
      add_nvidia_repo
      install_nvidia_toolkit
      ;;
    uninstall)
      # Purge the toolkit, clean leftovers and start Docker again.
      uninstall_nvidia_toolkit
      ;;
    *)
      print_error "未知操作:$OPERATION"
      exit 1
      ;;
  esac

  # Explicit success status.
  exit 0
}
# Script entry point: forward all command-line arguments to main.
main "$@"