ansible-devops/scripts/nvidia-driver.sh

235 lines
6.6 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
set -euo pipefail

# ---------------------------------------------------------------------------
# Global settings
# ---------------------------------------------------------------------------

# Driver version used when --version is not passed on the command line.
DEFAULT_VERSION='565.57.01'

# Internal mirror holding the installer packages.
# OFFICIAL_BASE_URL is assigned after argument parsing because it embeds the
# selected VERSION.
INTERNAL_BASE_URL='http://10.101.0.51:5588/nvidia-linux'

# printf template for the installer file name; %s is replaced by the version.
PACKAGE_TEMPLATE='NVIDIA-Linux-x86_64-%s.run'

# Directory where the installer package is stored and executed from.
INSTALL_DIR='/opt'

# Per-run log file; the timestamp is taken once, when the script starts.
LOG_FILE="/var/log/nvidia-driver_$(date '+%Y%m%d%H%M%S').log"

# systemd unit files managed by this script.
SERVICE_FILE='/etc/systemd/system/nvidia_peermem.service'
PERSISTENCE_SERVICE='/etc/systemd/system/nvidia-persistenced.service'

# ANSI colour escape sequences used for console output.
GREEN='\033[1;32m'
RED='\033[1;31m'
NC='\033[0m' # reset colour
# Append one timestamped line to the log file (nothing is printed to the
# console).
# Globals:   LOG_FILE (read) - file the line is appended to
# Arguments: $* - message words, joined into a single line
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}
# Print a progress message in green on the console and record it in the log.
# Globals:   GREEN, NC (read) - colour escape sequences
# Arguments: $1 - message text
# Outputs:   "==> <message>" on stdout, wrapped in colour codes
step() {
  local msg="==> $1"
  # %b expands the escapes in the colour variables only; the message itself
  # is printed verbatim with %s so backslashes in it (for example in a
  # command line being reported) are not turned into tabs or newlines.
  printf '%b%s%b\n' "$GREEN" "$msg" "$NC"
  log "$msg"
}
# Print an error message in red, record it in the log, and terminate the
# script with exit status 1.
# Globals:   RED, NC (read) - colour escape sequences
# Arguments: $1 - message text
# Outputs:   "错误: <message>" on stderr, wrapped in colour codes
error() {
  local msg="错误: $1"
  # Diagnostics go to stderr. Only the colour variables are escape-expanded
  # (%b); the message is printed verbatim (%s) so backslashes in it survive.
  printf '%b%s%b\n' "$RED" "$msg" "$NC" >&2
  log "$msg"
  exit 1
}
# Run a command line with its stdout and stderr appended to the log file, so
# only the step banner reaches the console. Aborts the script through error()
# when the command fails.
# Globals:   LOG_FILE (read)
# Arguments: $1 - command line, given as a single string
# NOTE: the string is passed to eval, so pipelines work, but callers must only
# pass trusted text.
run_cmd() {
  local command_line="$1"
  step "执行: $command_line"
  if ! eval "$command_line" >> "$LOG_FILE" 2>&1; then
    error "命令执行失败: $command_line"
  fi
}
# Parse the command-line options.
# Globals:   DEFAULT_VERSION (read)
#            ACTION            (written) - "install" or "uninstall"
#            VERSION           (written) - driver version to operate on
#            OFFICIAL_BASE_URL (written) - official download URL for VERSION
# Arguments: the script's command-line arguments
# Exits through error() on an unknown option, a --version without a value, or
# when neither --install nor --uninstall was given.
parse_args() {
  ACTION=""
  VERSION="$DEFAULT_VERSION"
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        ACTION="install"
        shift
        ;;
      --uninstall)
        ACTION="uninstall"
        shift
        ;;
      --version)
        # With `set -u`, reading a missing "$2" would abort with a bare
        # "unbound variable" message; report a proper error instead.
        if [[ -z "${2:-}" ]]; then
          error "--version 需要指定版本号"
        fi
        VERSION="$2"
        shift 2
        ;;
      *)
        error "未知参数 $1"
        ;;
    esac
  done
  # Derived here because it depends on the version chosen above.
  OFFICIAL_BASE_URL="https://cn.download.nvidia.com/tesla/${VERSION}"
  if [[ -z "$ACTION" ]]; then
    error "必须指定 --install 或 --uninstall"
  fi
}
parse_args "$@"
# Install and enable a systemd unit that loads the nvidia_peermem kernel
# module at boot.
# Globals: SERVICE_FILE (read) - path the unit file is written to
install_peermem_service() {
  step "开始配置 nvidia_peermem 开机启动"
  # Write the unit file. modprobe is a run-once command, so the unit is
  # declared Type=oneshot: systemd then waits for modprobe to finish and
  # `systemctl enable --now` fails visibly if the module cannot be loaded.
  # RemainAfterExit keeps the unit reported as active after modprobe exits.
  cat > "$SERVICE_FILE" <<'EOF'
[Unit]
Description=Load the nvidia_peermem kernel module
After=network.target
[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/usr/sbin/modprobe nvidia_peermem
[Install]
WantedBy=default.target
EOF
  # Set permissions on the unit file.
  run_cmd "chmod 644 $SERVICE_FILE"
  step "服务文件已创建:$SERVICE_FILE"
  # Reload systemd so it sees the new unit, then enable and start it.
  run_cmd "systemctl daemon-reload"
  run_cmd "systemctl enable --now nvidia_peermem.service"
  step "服务已启用并开机自启"
}
# Stop, disable and remove the nvidia_peermem systemd unit, if its unit file
# exists.
# Globals: SERVICE_FILE (read) - path of the unit file
uninstall_peermem_service() {
  step "开始移除 nvidia_peermem 开机启动配置"
  if [[ -f "$SERVICE_FILE" ]]; then
    run_cmd "systemctl stop nvidia_peermem.service"
    run_cmd "systemctl disable nvidia_peermem.service"
    run_cmd "rm -f $SERVICE_FILE"
    # Make systemd drop the unit whose file was just deleted; without a
    # reload it keeps the stale definition loaded.
    run_cmd "systemctl daemon-reload"
    step "服务文件已删除:$SERVICE_FILE"
  else
    step "警告nvidia_peermem 服务文件不存在"
  fi
}
# Install and enable the nvidia-persistenced systemd unit (GPU persistence
# mode).
# Globals: PERSISTENCE_SERVICE (read) - path the unit file is written to
install_persistence_service() {
  step "开始配置 nvidia-persistenced 开机启动"
  # Unit file contents, one array element per line.
  local -a unit_lines=(
    '[Unit]'
    'Description=NVIDIA Persistence Daemon'
    'After=syslog.target network.target'
    'Wants=nvidia-modules.service'
    '[Service]'
    'Type=forking'
    'PIDFile=/var/run/nvidia-persistenced/nvidia-persistenced.pid'
    'ExecStart=/usr/bin/nvidia-persistenced --verbose'
    'ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced'
    '[Install]'
    'WantedBy=multi-user.target'
  )
  printf '%s\n' "${unit_lines[@]}" > "$PERSISTENCE_SERVICE"
  # Set permissions on the unit file.
  run_cmd "chmod 644 $PERSISTENCE_SERVICE"
  step "服务文件已创建:$PERSISTENCE_SERVICE"
  # Reload systemd so it sees the new unit, then enable and start it.
  run_cmd "systemctl daemon-reload"
  run_cmd "systemctl enable --now nvidia-persistenced.service"
  step "nvidia-persistenced 服务已启用并开机自启"
}
# Stop, disable and remove the nvidia-persistenced systemd unit, if its unit
# file exists.
# Globals: PERSISTENCE_SERVICE (read) - path of the unit file
uninstall_persistence_service() {
  step "开始移除 nvidia-persistenced 开机启动配置"
  if [[ -f "$PERSISTENCE_SERVICE" ]]; then
    run_cmd "systemctl stop nvidia-persistenced.service"
    run_cmd "systemctl disable nvidia-persistenced.service"
    run_cmd "rm -f $PERSISTENCE_SERVICE"
    # Make systemd drop the unit whose file was just deleted; without a
    # reload it keeps the stale definition loaded.
    run_cmd "systemctl daemon-reload"
    step "服务文件已删除:$PERSISTENCE_SERVICE"
  else
    step "警告nvidia-persistenced 服务文件不存在"
  fi
}
# Install the NVIDIA driver for the selected version, verify it, and set up
# the boot-time services.
# Globals: VERSION, PACKAGE_TEMPLATE, INSTALL_DIR, LOG_FILE (read)
# Exits through error() when a step fails or the version check does not match.
install_driver() {
  step "开始安装显卡驱动,版本:$VERSION"
  local package_name package_path
  # PACKAGE_TEMPLATE is a printf format by design (it carries the %s slot).
  # shellcheck disable=SC2059
  package_name=$(printf "$PACKAGE_TEMPLATE" "$VERSION")
  package_path="${INSTALL_DIR}/${package_name}"
  # Use the local package if present, otherwise download it.
  if [[ ! -f "$package_path" ]]; then
    step "未找到本地包,开始下载"
    download_package "$package_name" "$package_path"
  else
    step "使用本地包:$package_path"
  fi
  # Run the installer from its own directory.
  cd "$INSTALL_DIR" || error "无法进入目录 $INSTALL_DIR"
  run_cmd "chmod +x $package_name"
  run_cmd "./$package_name -q -s"
  # Load the peer-memory module and turn on persistence mode.
  run_cmd "modprobe nvidia_peermem"
  run_cmd "nvidia-smi -pm 1"
  # Verify the running driver version. grep -F matches the version as a
  # literal string, so the dots in it are not treated as regex wildcards.
  step "执行: nvidia-smi | grep $VERSION"
  if ! nvidia-smi | grep -F -- "$VERSION" >> "$LOG_FILE" 2>&1; then
    error "版本验证失败"
  fi
  install_peermem_service     # load nvidia_peermem at boot
  install_persistence_service # GPU persistence mode at boot
  step "安装完成"
}
# Uninstall the NVIDIA driver using the installer package for the selected
# version (downloading it first if it is not present locally), then remove
# the boot-time services.
# Globals: VERSION, PACKAGE_TEMPLATE, INSTALL_DIR, LOG_FILE (read)
# Returns: 0 on success; 1 when the package is unavailable or the uninstaller
#          fails, so the script no longer reports success after a failed
#          uninstall.
uninstall_driver() {
  step "开始卸载显卡驱动,版本:$VERSION"
  local package_name package_path
  # PACKAGE_TEMPLATE is a printf format by design (it carries the %s slot).
  # shellcheck disable=SC2059
  package_name=$(printf "$PACKAGE_TEMPLATE" "$VERSION")
  package_path="${INSTALL_DIR}/${package_name}"
  # Locate the installer package, downloading it when it is not available.
  if [[ -f "$package_path" ]]; then
    step "找到安装包,使用安装包卸载"
  else
    step "未找到本地安装包,尝试下载卸载包"
    download_package "$package_name" "$package_path"
    if [[ ! -f "$package_path" ]]; then
      step "无法找到或下载卸载包,请手动卸载驱动程序"
      return 1
    fi
  fi
  cd "$INSTALL_DIR" || error "无法进入目录 $INSTALL_DIR"
  run_cmd "chmod +x $package_name"
  # Keep the uninstaller's output in the log so a failure can be diagnosed.
  if ! ./"$package_name" --uninstall -q -s >> "$LOG_FILE" 2>&1; then
    step "卸载失败,请手动卸载驱动程序"
    return 1
  fi
  uninstall_peermem_service     # remove the peermem service
  uninstall_persistence_service # remove the persistenced service
  step "卸载完成"
}
# Download an installer package, trying the internal mirror first and the
# official site second.
# Globals:   INTERNAL_BASE_URL, OFFICIAL_BASE_URL (read)
# Arguments: $1 - package file name
#            $2 - destination path for the downloaded file
# Returns:   0 once a download succeeds; exits through error() if every URL
#            fails.
download_package() {
  local package_name="$1"
  local package_path="$2"
  # wget -O creates the output file even when the transfer fails. Download to
  # a temporary name and move it into place only on success, so a failed run
  # never leaves an empty or truncated file that a later run would mistake
  # for a valid local package.
  local tmp_path="${package_path}.part"
  local url
  local -a download_urls=(
    "${INTERNAL_BASE_URL}/${package_name}"
    "${OFFICIAL_BASE_URL}/${package_name}"
  )
  for url in "${download_urls[@]}"; do
    step "尝试从 $url 下载"
    if wget -qO "$tmp_path" "$url" \
      && [[ -s "$tmp_path" ]] \
      && mv -f -- "$tmp_path" "$package_path"; then
      return 0
    fi
    rm -f -- "$tmp_path"
    step "下载失败,尝试下一个 URL"
  done
  error "无法从任何来源下载 $package_name"
}
# Run the handler that matches the requested action.
if [[ "$ACTION" == "install" ]]; then
  install_driver
elif [[ "$ACTION" == "uninstall" ]]; then
  uninstall_driver
fi