forked from yindun/ansible-devops
添加 scripts/nvidia-container-toolkit.sh
This commit is contained in:
parent
4d61430c67
commit
3bd6d94486
|
|
@ -0,0 +1,222 @@
|
|||
#!/bin/bash
#
# Install or uninstall a pinned version of the NVIDIA Container Toolkit
# through apt.
#
# Usage: <script> (--install | --uninstall) --version <package-version>

# Abort on any unhandled command failure.
set -e
# Let a pipeline fail when ANY stage fails. Without this, a failed download
# piped into another tool is hidden behind the exit status of the last stage.
set -o pipefail

# Requested operation ("install" or "uninstall"); filled in by parse_args.
OPERATION=""
# Full package version requested on the command line (e.g. "1.17.6-1").
VERSION=""
# Core NVIDIA Toolkit packages; install and uninstall handle them as one set.
readonly NVIDIA_PACKAGES="nvidia-container-toolkit nvidia-container-toolkit-base libnvidia-container-tools libnvidia-container1"
# Build dependencies; only the install path uses them, uninstall leaves them.
readonly BUILD_DEPENDS="build-essential devscripts debhelper fakeroot zlib1g-dev libnuma-dev libhwloc-dev libevent-dev"
# Version without the revision suffix (e.g. "1.17.6" taken from "1.17.6-1");
# used only for friendlier log output.
EXTRACT_VERSION=""
|
||||
|
||||
# Logging helpers (shared visual style).

# Print a progress message in bold blue, prefixed with "[*]", to stdout.
# Arguments: $1 - message text (printed verbatim)
print_step() {
  # printf with %s keeps backslashes in the message literal; `echo -e` would
  # have expanded sequences such as "\n" or "\t" found in the message.
  printf '\033[1;34m[*] %s\033[0m\n' "$1"
}
|
||||
# Print a success message in bold green, prefixed with "[+]", to stdout.
# Arguments: $1 - message text (printed verbatim)
print_success() {
  # printf with %s keeps backslashes in the message literal; `echo -e` would
  # have expanded sequences such as "\n" or "\t" found in the message.
  printf '\033[1;32m[+] %s\033[0m\n' "$1"
}
|
||||
# Print an error message in bold red, prefixed with "[-] 错误:", to stderr.
# Arguments: $1 - message text (printed verbatim)
print_error() {
  # The message may contain raw user input (e.g. an unknown CLI argument), so
  # it is passed as a %s argument instead of being interpreted by `echo -e`.
  printf '\033[1;31m[-] 错误:%s\033[0m\n' "$1" >&2
}
|
||||
|
||||
# Parse the command line (--install / --uninstall / --version <ver>).
#
# Globals written: OPERATION, VERSION, EXTRACT_VERSION
# Arguments:       the script's command-line arguments
# Exits:           1 (after print_error) on an unknown argument, a missing or
#                  malformed version, or when no operation / version is given.
parse_args() {
  # Character set of a Debian package version, starting with a digit. The
  # version is later embedded into apt package specs ("pkg=version"), so
  # anything outside this set (spaces, ';', '$', quotes, ...) is rejected here.
  local version_pattern='^[0-9][0-9A-Za-z.+~:-]*$'

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        OPERATION="install"
        shift
        ;;
      --uninstall)
        OPERATION="uninstall"
        shift
        ;;
      --version)
        # ${2:-} tolerates "--version" being the last argument.
        if [[ -z "${2:-}" || "${2:-}" == --* ]]; then
          print_error "缺少版本号,请用 --version 指定(如 1.17.6-1)"
          exit 1
        fi
        if [[ ! "$2" =~ $version_pattern ]]; then
          print_error "版本号格式无效:$2(需以数字开头,仅可包含字母、数字及 . + ~ : -,如 1.17.6-1)"
          exit 1
        fi
        VERSION="$2"
        # Drop everything from the first '-' on (1.17.6-1 -> 1.17.6).
        EXTRACT_VERSION="${VERSION%%-*}"
        shift 2
        ;;
      *)
        print_error "未知参数:$1"
        exit 1
        ;;
    esac
  done

  # Both an operation and a version are mandatory.
  if [[ -z "$OPERATION" ]]; then
    print_error "请指定操作:--install(安装) 或 --uninstall(卸载)"
    exit 1
  fi
  if [[ -z "$VERSION" ]]; then
    print_error "请用 --version 指定NVIDIA Container Toolkit版本"
    exit 1
  fi
}
|
||||
|
||||
# Ensure the `docker` command runs (the Toolkit depends on Docker).
# Reports success via print_success; otherwise reports the problem via
# print_error and terminates the script with status 1.
check_docker_exist() {
  if docker --version &>/dev/null; then
    print_success "Docker环境检测正常"
    return
  fi

  print_error "未检测到Docker环境!NVIDIA Container Toolkit依赖Docker,请先安装Docker"
  exit 1
}
|
||||
|
||||
# Register NVIDIA's GPG key and apt repository, then refresh the apt index.
# All tool output is suppressed; only print_step/print_success/print_error
# messages are shown.
#
# Files written: /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
#                /etc/apt/sources.list.d/nvidia-container-toolkit.list
# Exits:         1 (after print_error) when a download, the key import, the
#                list write or `apt-get update` fails.
add_nvidia_repo() {
  local repo_base_url="https://nvidia.github.io/libnvidia-container"
  local keyring_path="/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"
  local list_path="/etc/apt/sources.list.d/nvidia-container-toolkit.list"
  local key_data list_data

  print_step "正在配置NVIDIA官方软件源"

  # Make sure the keyring directory exists.
  mkdir -p /usr/share/keyrings >/dev/null 2>&1

  # 1. Import the GPG key (overwrites an existing keyring).
  #    The key is downloaded BEFORE gpg runs so that a failed or empty
  #    download is detected on its own instead of being hidden behind gpg's
  #    exit status. -f makes curl fail on HTTP errors.
  if ! key_data=$(curl -fsSL "${repo_base_url}/gpgkey" 2>/dev/null) \
    || [[ -z "$key_data" ]]; then
    print_error "GPG密钥下载失败(网络异常或密钥地址失效)"
    exit 1
  fi
  if ! gpg --dearmor --yes -o "$keyring_path" <<<"$key_data" >/dev/null 2>&1; then
    print_error "GPG密钥导入失败(gpg --dearmor 执行出错)"
    exit 1
  fi

  # 2. Add the apt source and bind it to the key.
  #    The list is fetched first and only written once it is known to be
  #    non-empty, so an error page or empty response never lands in
  #    sources.list.d.
  if ! list_data=$(curl -fsSL \
    "${repo_base_url}/stable/deb/nvidia-container-toolkit.list" 2>/dev/null) \
    || [[ -z "$list_data" ]]; then
    print_error "NVIDIA软件源配置失败"
    exit 1
  fi
  if ! {
    sed "s#deb https://#deb [signed-by=${keyring_path}] https://#g" \
      <<<"$list_data" >"$list_path"
  } 2>/dev/null; then
    print_error "NVIDIA软件源配置失败"
    exit 1
  fi

  # 3. Refresh the package index.
  if ! apt-get update -qq >/dev/null 2>&1; then
    print_error "更新软件源索引失败"
    exit 1
  fi

  print_success "NVIDIA软件源配置完成"
}
|
||||
|
||||
# Install the NVIDIA Container Toolkit at the requested version, install the
# build dependencies, configure Docker's NVIDIA runtime and restart Docker.
# All tool output is suppressed; only print_* messages are shown.
#
# Globals read: NVIDIA_PACKAGES, BUILD_DEPENDS, VERSION, EXTRACT_VERSION
# Exits:        1 (after print_error) as soon as any step fails.
install_nvidia_toolkit() {
  local pkg
  local -a toolkit_specs=()

  print_step "正在安装NVIDIA Container Toolkit ${EXTRACT_VERSION}(完整版本:${VERSION})"

  # Pin every core package to the requested version. The specs are collected
  # in an array and passed as separate arguments, so the version string is
  # never re-parsed by the shell (the previous string + eval form was).
  # NVIDIA_PACKAGES is a space-separated list; word splitting is intended.
  for pkg in $NVIDIA_PACKAGES; do
    toolkit_specs+=("${pkg}=${VERSION}")
  done

  # Run the install quietly; only the failure is reported.
  if ! apt-get install -qq -y "${toolkit_specs[@]}" >/dev/null 2>&1; then
    print_error "Toolkit安装失败(可能版本不兼容或依赖冲突)"
    exit 1
  fi

  # Install the build dependency libraries.
  print_step "正在安装编译构建依赖库"
  # BUILD_DEPENDS is a space-separated list; word splitting is intended.
  # shellcheck disable=SC2086
  if ! apt-get install -qq -y $BUILD_DEPENDS >/dev/null 2>&1; then
    print_error "编译依赖库(如build-essential)安装失败"
    exit 1
  fi

  # Enable the NVIDIA runtime in Docker, then restart Docker.
  print_step "正在配置Docker支持NVIDIA Runtime"
  if ! nvidia-ctk runtime configure --runtime=docker >/dev/null 2>&1; then
    print_error "Docker NVIDIA Runtime配置失败"
    exit 1
  fi
  if ! systemctl restart docker >/dev/null 2>&1; then
    print_error "重启Docker服务失败(需手动执行:systemctl restart docker)"
    exit 1
  fi

  # Verify the result: the runtime binary must be runnable.
  if ! nvidia-container-runtime --version >/dev/null 2>&1; then
    print_error "Toolkit安装后验证失败(nvidia-container-runtime命令不可用)"
    exit 1
  fi

  print_success "NVIDIA Container Toolkit ${EXTRACT_VERSION} 安装成功!"
  echo -e "\033[1;33m[提示] 可执行 'nvidia-container-runtime --version' 查看详细版本信息\033[0m"
}
|
||||
|
||||
# Best-effort helper for the uninstall failure paths: start Docker again when
# this script was the one that stopped it, so a failed uninstall does not
# leave the service down.
# Arguments: $1 - "1" when Docker was stopped by the uninstall, else "0"
_uninstall_restore_docker() {
  [[ "$1" == "1" ]] || return 0
  if ! systemctl start docker >/dev/null 2>&1; then
    print_error "卸载后重启Docker失败(需手动执行:systemctl start docker)"
  fi
}

# Purge the NVIDIA Container Toolkit packages at the requested version, remove
# the apt source and keyring added for them, and start Docker again.
# All tool output is suppressed; only print_* messages are shown.
#
# Globals read: NVIDIA_PACKAGES, VERSION, EXTRACT_VERSION
# Exits:        1 (after print_error) when a step fails. If Docker had been
#               stopped by this function, it is started again first.
#
# NOTE(review): this function does not touch Docker's daemon configuration;
# whether entries written by `nvidia-ctk runtime configure` must be reverted
# for Docker to start cleanly should be confirmed.
uninstall_nvidia_toolkit() {
  local pkg
  local docker_stopped=0
  local -a purge_specs=()

  print_step "正在卸载NVIDIA Container Toolkit ${EXTRACT_VERSION}(完整版本:${VERSION})"

  # Stop Docker first so no files are held open during removal.
  if systemctl is-active --quiet docker; then
    print_step "停止Docker服务(避免文件占用)"
    if ! systemctl stop docker >/dev/null 2>&1; then
      print_error "Docker服务停止失败(需手动停止后重试)"
      exit 1
    fi
    docker_stopped=1
  fi

  # Pin every core package to the requested version. The specs are collected
  # in an array and passed as separate arguments, so the version string is
  # never re-parsed by the shell (the previous string + eval form was).
  # NVIDIA_PACKAGES is a space-separated list; word splitting is intended.
  for pkg in $NVIDIA_PACKAGES; do
    purge_specs+=("${pkg}=${VERSION}")
  done

  # Run the purge quietly; only the failure is reported.
  if ! apt-get purge -qq -y "${purge_specs[@]}" >/dev/null 2>&1; then
    print_error "Toolkit卸载失败(可能包已被删除或依赖冲突)"
    _uninstall_restore_docker "$docker_stopped"
    exit 1
  fi

  # Remove leftovers: apt source, keyring, orphaned packages, apt cache.
  # A failure here used to end the script silently (via `set -e`) with Docker
  # still stopped; it is now reported and Docker is brought back up.
  print_step "清理NVIDIA残留文件"
  if ! {
    rm -f -- /etc/apt/sources.list.d/nvidia-container-toolkit.list \
      && rm -f -- /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
      && apt-get autoremove -qq -y \
      && apt-get clean -qq
  } >/dev/null 2>&1; then
    print_error "NVIDIA残留文件清理失败"
    _uninstall_restore_docker "$docker_stopped"
    exit 1
  fi

  # Start Docker again now that the toolkit is gone.
  if ! systemctl start docker >/dev/null 2>&1; then
    print_error "卸载后重启Docker失败(需手动执行:systemctl start docker)"
    exit 1
  fi

  print_success "NVIDIA Container Toolkit ${EXTRACT_VERSION} 卸载成功!"
}
|
||||
|
||||
# Entry point: enforce root, parse the arguments, then run the requested
# install or uninstall flow.
#
# Arguments: the script's command-line arguments (forwarded to parse_args)
# Exits:     0 when the selected flow completes; 1 when not run as root or on
#            an unknown operation. The called steps exit on their own errors.
main() {
  # Name shown in the usage hint. It is taken from the running file so the
  # hint cannot drift from the real file name; the literal is only a fallback
  # for when no source file name is available.
  local script_name="${BASH_SOURCE[0]##*/}"
  script_name="${script_name:-nvidia-container-toolkit.sh}"

  # 1. Require root.
  if [[ $EUID -ne 0 ]]; then
    print_error "请用root用户运行脚本(sudo bash ${script_name} ...)"
    exit 1
  fi

  # 2. Parse the arguments (sets OPERATION / VERSION / EXTRACT_VERSION).
  parse_args "$@"

  # 3. Dispatch on the requested operation.
  case "$OPERATION" in
    install)
      # Docker must be present before anything is installed.
      check_docker_exist
      # Configure the NVIDIA repo -> install the Toolkit -> configure Docker.
      add_nvidia_repo
      install_nvidia_toolkit
      ;;
    uninstall)
      # Purge the Toolkit -> clean leftovers -> start Docker again.
      uninstall_nvidia_toolkit
      ;;
    *)
      print_error "未知操作:$OPERATION"
      exit 1
      ;;
  esac

  # Normal termination.
  exit 0
}
|
||||
|
||||
# 启动脚本
|
||||
# Run main with the script's original arguments, each forwarded as a separate
# word; main ends the process itself through `exit`.
main "$@"
|
||||
Loading…
Reference in New Issue