ansible-devops/scripts/nvidia-driver.sh

235 lines
6.6 KiB
Bash
Raw Normal View History

2025-07-05 15:49:53 +08:00
#!/bin/bash
set -euo pipefail

# ---- Global configuration ---------------------------------------------------
# Driver release used when --version is not given on the command line.
DEFAULT_VERSION='565.57.01'
# Internal mirror hosting the .run installers.
INTERNAL_BASE_URL='http://10.101.0.51:5588/nvidia-linux'
# OFFICIAL_BASE_URL (public NVIDIA mirror) depends on the requested version,
# so it is assigned once the command line has been parsed, not here.
# printf template for the installer file name; %s is the driver version.
PACKAGE_TEMPLATE='NVIDIA-Linux-x86_64-%s.run'
# Directory where the installer is looked up, downloaded to and run from.
INSTALL_DIR='/opt'
# systemd unit files created and removed by this script.
SERVICE_FILE='/etc/systemd/system/nvidia_peermem.service'
PERSISTENCE_SERVICE='/etc/systemd/system/nvidia-persistenced.service'
# One log file per invocation, named after the start time.
LOG_FILE="/var/log/nvidia-driver_$(date +%Y%m%d%H%M%S).log"

# ---- Console colours (escape sequences are interpreted when printed) --------
GREEN='\033[1;32m'
RED='\033[1;31m'
NC='\033[0m' # reset colour
# Logging helper: appends one timestamped line to $LOG_FILE.
# Arguments: all arguments are joined into the message text.
# Nothing is printed on the console.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}
# Progress message: shown in green on the console and recorded in the log.
# Arguments: $1 - text of the step being started or reported
step() {
  local banner
  banner="==> $1"
  # %b interprets the colour escape sequences the same way echo -e does.
  printf '%b\n' "${GREEN}${banner}${NC}"
  log "${banner}"
}
# Fatal error: prints the message in red, records it in the log and
# terminates the script with exit status 1.
# Arguments: $1 - description of the failure
error() {
  local msg="错误: $1"
  # Diagnostics belong on stderr so they never mix with regular stdout output.
  echo -e "${RED}$msg${NC}" >&2
  log "$msg"
  exit 1
}
# Runs a shell command line quietly: the command text is announced through
# step(), while everything the command writes to stdout/stderr is appended to
# the log file. A failing command aborts the script through error().
# Arguments: $1 - command line, evaluated by the shell (may contain pipes)
run_cmd() {
  step "执行: $1"
  if ! eval "$1" &>> "$LOG_FILE"; then
    error "命令执行失败: $1"
  fi
}
# ---- Command-line parsing ---------------------------------------------------
# Usage: <script> (--install | --uninstall) [--version <driver version>]
ACTION=""
VERSION="$DEFAULT_VERSION"

# Parses the command line into the globals ACTION, VERSION and
# OFFICIAL_BASE_URL.
# Arguments: the script's own arguments ("$@")
# Exits through error() on an unknown option, on --version without a value,
# on a malformed version string, or when no action was requested.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        ACTION="install"
        shift
        ;;
      --uninstall)
        ACTION="uninstall"
        shift
        ;;
      --version)
        # Without this check "set -u" kills the script with a bare
        # "unbound variable" message when --version is the last argument.
        [[ $# -ge 2 ]] || error "--version 需要指定版本号"
        VERSION="$2"
        shift 2
        ;;
      *)
        error "未知参数 $1"
        ;;
    esac
  done

  # VERSION ends up inside eval'd command lines and download URLs, so only a
  # conservative character set is accepted (no spaces or shell metacharacters,
  # no leading dash).
  if [[ ! "$VERSION" =~ ^[0-9A-Za-z][0-9A-Za-z._-]*$ ]]; then
    error "版本号格式无效: $VERSION"
  fi

  # Public NVIDIA mirror directory for the selected version.
  OFFICIAL_BASE_URL="https://cn.download.nvidia.com/tesla/${VERSION}"

  if [[ -z "$ACTION" ]]; then
    error "必须指定 --install 或 --uninstall"
  fi
}

parse_args "$@"
# Installs and enables a systemd unit that loads the nvidia_peermem kernel
# module at boot.
# Globals: SERVICE_FILE (read) - path of the unit file to create
# Aborts through run_cmd/error() when chmod or systemctl fails.
install_peermem_service() {
  step "开始配置 nvidia_peermem 开机启动"

  # modprobe exits as soon as the module is loaded, so the unit is declared as
  # a oneshot: systemd then waits for modprobe and reports a failed load as a
  # failed start. RemainAfterExit keeps the unit "active" once it succeeded.
  # The quoted delimiter writes the text verbatim, without shell expansion.
  cat > "$SERVICE_FILE" <<'EOF'
[Unit]
Description=Load the nvidia_peermem kernel module
After=network.target
[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/usr/sbin/modprobe nvidia_peermem
[Install]
WantedBy=default.target
EOF

  # Unit files should be world-readable but not executable.
  run_cmd "chmod 644 $SERVICE_FILE"
  step "服务文件已创建:$SERVICE_FILE"

  # Make systemd pick up the new unit, then enable it and start it right away.
  run_cmd "systemctl daemon-reload"
  run_cmd "systemctl enable --now nvidia_peermem.service"
  step "服务已启用并开机自启"
}
# Stops, disables and deletes the nvidia_peermem systemd unit.
# Globals: SERVICE_FILE (read) - path of the unit file to remove
# Does nothing except report a warning when the unit file does not exist.
uninstall_peermem_service() {
  step "开始移除 nvidia_peermem 开机启动配置"

  if [[ ! -f "$SERVICE_FILE" ]]; then
    step "警告nvidia_peermem 服务文件不存在"
    return
  fi

  run_cmd "systemctl stop nvidia_peermem.service"
  run_cmd "systemctl disable nvidia_peermem.service"
  run_cmd "rm -f $SERVICE_FILE"
  step "服务文件已删除:$SERVICE_FILE"
  # systemd keeps the deleted unit loaded until its configuration is re-read.
  run_cmd "systemctl daemon-reload"
}
# Installs and enables the nvidia-persistenced systemd unit (GPU persistence
# mode).
# Globals: PERSISTENCE_SERVICE (read) - path of the unit file to create
install_persistence_service() {
  step "开始配置 nvidia-persistenced 开机启动"

  # Unit definition, one array element per line of the file.
  local -a unit_lines=(
    '[Unit]'
    'Description=NVIDIA Persistence Daemon'
    'After=syslog.target network.target'
    'Wants=nvidia-modules.service'
    '[Service]'
    'Type=forking'
    'PIDFile=/var/run/nvidia-persistenced/nvidia-persistenced.pid'
    'ExecStart=/usr/bin/nvidia-persistenced --verbose'
    'ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced'
    '[Install]'
    'WantedBy=multi-user.target'
  )
  printf '%s\n' "${unit_lines[@]}" > "$PERSISTENCE_SERVICE"

  # Fix the file mode.
  run_cmd "chmod 644 $PERSISTENCE_SERVICE"
  step "服务文件已创建:$PERSISTENCE_SERVICE"

  # Reload systemd so it sees the unit, then enable it and start it now.
  run_cmd "systemctl daemon-reload"
  run_cmd "systemctl enable --now nvidia-persistenced.service"
  step "nvidia-persistenced 服务已启用并开机自启"
}
# Stops, disables and deletes the nvidia-persistenced systemd unit.
# Globals: PERSISTENCE_SERVICE (read) - path of the unit file to remove
# Does nothing except report a warning when the unit file does not exist.
uninstall_persistence_service() {
  step "开始移除 nvidia-persistenced 开机启动配置"

  if [[ ! -f "$PERSISTENCE_SERVICE" ]]; then
    step "警告nvidia-persistenced 服务文件不存在"
    return
  fi

  run_cmd "systemctl stop nvidia-persistenced.service"
  run_cmd "systemctl disable nvidia-persistenced.service"
  run_cmd "rm -f $PERSISTENCE_SERVICE"
  step "服务文件已删除:$PERSISTENCE_SERVICE"
  # systemd keeps the deleted unit loaded until its configuration is re-read.
  run_cmd "systemctl daemon-reload"
}
# Installs the NVIDIA driver release $VERSION and sets up the boot-time units.
# Globals: VERSION, PACKAGE_TEMPLATE, INSTALL_DIR (read);
#          PACKAGE_NAME, PACKAGE_PATH (written)
# Side effect: changes the working directory to $INSTALL_DIR.
# Any failing step aborts the script through run_cmd/error().
install_driver() {
  step "开始安装显卡驱动,版本:$VERSION"

  # shellcheck disable=SC2059 -- PACKAGE_TEMPLATE is a printf format by design
  PACKAGE_NAME=$(printf "$PACKAGE_TEMPLATE" "$VERSION")
  PACKAGE_PATH="${INSTALL_DIR}/${PACKAGE_NAME}"

  # Use the installer already present in INSTALL_DIR, otherwise download it.
  if [[ ! -f "$PACKAGE_PATH" ]]; then
    step "未找到本地包,开始下载"
    download_package "$PACKAGE_NAME" "$PACKAGE_PATH"
  else
    step "使用本地包:$PACKAGE_PATH"
  fi

  # Run the installer from its own directory (-q -s as in the original call).
  cd "$INSTALL_DIR" || error "无法进入目录 $INSTALL_DIR"
  run_cmd "chmod +x $PACKAGE_NAME"
  run_cmd "./$PACKAGE_NAME -q -s"

  # Load the peer-memory module and switch the GPUs to persistence mode.
  run_cmd "modprobe nvidia_peermem"
  run_cmd "nvidia-smi -pm 1"

  # Verify that nvidia-smi mentions the requested version. -F matches the
  # version literally (its dots are not regex wildcards) and %q keeps the
  # value a single word inside the eval'd command line. run_cmd aborts the
  # script on a mismatch, so no second check is needed.
  run_cmd "nvidia-smi | grep -F -- $(printf '%q' "$VERSION")"

  install_peermem_service     # load nvidia_peermem at boot
  install_persistence_service # GPU persistence mode at boot
  step "安装完成"
}
# Uninstalls the NVIDIA driver release $VERSION with the installer's own
# --uninstall mode, then removes the boot-time units.
# Globals: VERSION, PACKAGE_TEMPLATE, INSTALL_DIR, LOG_FILE (read);
#          PACKAGE_NAME, PACKAGE_PATH (written)
# Side effect: changes the working directory to $INSTALL_DIR.
# Returns 0 even when the installer reports a failure: the problem is only
# announced and the units are left in place.
# NOTE(review): the zero status on failure is kept from the original
# behaviour -- confirm whether callers should see a non-zero status instead.
uninstall_driver() {
  step "开始卸载显卡驱动,版本:$VERSION"

  # shellcheck disable=SC2059 -- PACKAGE_TEMPLATE is a printf format by design
  PACKAGE_NAME=$(printf "$PACKAGE_TEMPLATE" "$VERSION")
  PACKAGE_PATH="${INSTALL_DIR}/${PACKAGE_NAME}"

  # The installer doubles as the uninstaller; fetch it when it is not local.
  if [[ -f "$PACKAGE_PATH" ]]; then
    step "找到安装包,使用安装包卸载"
  else
    step "未找到本地安装包,尝试下载卸载包"
    download_package "$PACKAGE_NAME" "$PACKAGE_PATH"
    if [[ ! -f "$PACKAGE_PATH" ]]; then
      step "无法找到或下载卸载包,请手动卸载驱动程序"
      return 0
    fi
  fi

  cd "$INSTALL_DIR"
  run_cmd "chmod +x $PACKAGE_NAME"
  # Keep the installer's output in the log instead of discarding it, so a
  # failed uninstall can be diagnosed afterwards.
  if ! ./"$PACKAGE_NAME" --uninstall -q -s &>> "$LOG_FILE"; then
    step "卸载失败,请手动卸载驱动程序"
    return 0
  fi

  uninstall_peermem_service     # remove the peermem unit
  uninstall_persistence_service # remove the persistenced unit
  step "卸载完成"
}
# Downloads an installer, trying the internal mirror first and the official
# NVIDIA mirror second.
# Globals:   INTERNAL_BASE_URL, OFFICIAL_BASE_URL (read)
# Arguments: $1 - installer file name
#            $2 - destination path
# Returns 0 after the first successful download; exits through error() when
# every source fails.
download_package() {
  local package_name="$1"
  local package_path="$2"
  # wget -O creates its output file even when the transfer fails, so the
  # download goes to a side file first. The destination only ever appears as
  # a complete file and a failed run cannot leave a truncated "package" that
  # a later run would mistake for a valid local installer.
  local partial_path="${package_path}.part"
  local url
  local download_urls=(
    "${INTERNAL_BASE_URL}/${package_name}"
    "${OFFICIAL_BASE_URL}/${package_name}"
  )

  for url in "${download_urls[@]}"; do
    step "尝试从 $url 下载"
    if wget -qO "$partial_path" "$url"; then
      mv -f -- "$partial_path" "$package_path"
      return 0
    fi
    rm -f -- "$partial_path"
    step "下载失败,尝试下一个 URL"
  done

  error "无法从任何来源下载 $package_name"
}
# Entry point: run the handler that matches the requested action.
if [[ "$ACTION" == "install" ]]; then
  install_driver
elif [[ "$ACTION" == "uninstall" ]]; then
  uninstall_driver
fi