#!/bin/bash
#
# Installs or uninstalls an NVIDIA data-center driver from a .run package and
# manages the systemd units for nvidia_peermem and nvidia-persistenced.
#
# Usage: <script> --install | --uninstall [--version <driver-version>]
set -euo pipefail

# Global configuration.
DEFAULT_VERSION="565.57.01"
# NOTE(review): the internal mirror is plain HTTP and the downloaded installer
# is executed afterwards; consider verifying a checksum before running it.
INTERNAL_BASE_URL="http://10.101.0.51:5588/nvidia-linux"
# OFFICIAL_BASE_URL embeds the selected driver version, so it is assigned
# after the command-line arguments have been parsed rather than here.
PACKAGE_TEMPLATE="NVIDIA-Linux-x86_64-%s.run"   # printf template; %s = version
INSTALL_DIR="/opt"
LOG_FILE="/var/log/nvidia-driver_$(date +%Y%m%d%H%M%S).log"
SERVICE_FILE="/etc/systemd/system/nvidia_peermem.service"
PERSISTENCE_SERVICE="/etc/systemd/system/nvidia-persistenced.service"

# ANSI colour escape sequences (kept as literal backslash sequences; they are
# expanded at print time).
GREEN='\033[1;32m'
RED='\033[1;31m'
NC='\033[0m' # reset colour

# None of the values above is reassigned later; make that explicit.
readonly DEFAULT_VERSION INTERNAL_BASE_URL PACKAGE_TEMPLATE INSTALL_DIR \
  LOG_FILE SERVICE_FILE PERSISTENCE_SERVICE GREEN RED NC
# Logging helper: appends one timestamped line to the log file.
# Globals:   LOG_FILE (read) - file the entry is appended to
# Arguments: $@ - message words; joined into a single string via "$*"
# Outputs:   nothing on stdout; writes "[YYYY-MM-DD HH:MM:SS] message" to LOG_FILE
log() {
  # printf writes the message verbatim regardless of its content.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}
# Progress message: prints the step in green on the console and logs it.
# Globals:   GREEN, NC (read) - colour escape sequences
# Arguments: $1 - step description
# Outputs:   "==> <description>" on stdout (coloured) and via log()
step() {
  local msg="==> $1"
  # Only the colour codes are escape-expanded (%b); the message is printed
  # literally (%s) so backslashes in commands or paths are shown unchanged.
  printf '%b%s%b\n' "$GREEN" "$msg" "$NC"
  log "$msg"
}
# Fatal error: prints the message in red, logs it, and terminates the script.
# Globals:   RED, NC (read) - colour escape sequences
# Arguments: $1 - error description
# Outputs:   "错误: <description>" on stderr (coloured) and via log()
# Returns:   never; exits the shell with status 1
error() {
  local msg="错误: $1"
  # Diagnostics belong on stderr; the message itself is printed literally (%s)
  # so only the colour codes are escape-expanded.
  printf '%b%s%b\n' "$RED" "$msg" "$NC" >&2
  log "$msg"
  exit 1
}
# Runs a command with its stdout/stderr appended to the log file instead of
# the console; aborts the script through error() if the command fails.
# Globals:   LOG_FILE (read)
# Arguments: either
#              $1      - a shell command line (may contain pipes etc.); it is
#                        evaluated with eval, so only pass trusted text, or
#              $1 $2.. - a command and its arguments, executed directly without
#                        eval (safe for values containing spaces or
#                        shell metacharacters)
# Outputs:   the "执行: ..." step line; command output goes to LOG_FILE
run_cmd() {
  (( $# > 0 )) || error "run_cmd 需要至少一个参数"

  local display="$*"
  step "执行: ${display}"

  if (( $# == 1 )); then
    # Single string: keep the original eval semantics so callers can pass
    # pipelines such as "a | b".
    eval "$1" &>> "$LOG_FILE" || error "命令执行失败: ${display}"
  else
    "$@" &>> "$LOG_FILE" || error "命令执行失败: ${display}"
  fi
}
# Command-line parsing.
ACTION=""
VERSION="$DEFAULT_VERSION"

# Parses the script arguments into the globals below.
# Globals:   ACTION (written)  - "install" or "uninstall"
#            VERSION (written) - driver version; keeps its current value
#                                unless --version is given
#            OFFICIAL_BASE_URL (written) - official download URL for VERSION
# Arguments: the script's command line (--install | --uninstall,
#            optional --version <driver-version>)
# Returns:   0 on success; calls error() (which exits) on invalid input
parse_args() {
  # Versions are dot-separated numbers, e.g. 565.57.01. VERSION is later
  # interpolated into eval'd command strings and a URL, so reject anything else.
  local version_re='^[0-9]+(\.[0-9]+)+$'

  while (( $# > 0 )); do
    case "$1" in
      --install)
        ACTION="install"
        shift
        ;;
      --uninstall)
        ACTION="uninstall"
        shift
        ;;
      --version)
        # Without this check a missing value dies with a cryptic
        # "unbound variable" under set -u.
        (( $# >= 2 )) || error "--version 需要指定版本号"
        VERSION="$2"
        shift 2
        ;;
      *)
        error "未知参数 $1"
        ;;
    esac
  done

  [[ -n "$ACTION" ]] || error "必须指定 --install 或 --uninstall"
  [[ "$VERSION" =~ $version_re ]] || error "版本号格式无效: $VERSION"

  OFFICIAL_BASE_URL="https://cn.download.nvidia.com/tesla/${VERSION}"
}

parse_args "$@"
# Creates and enables a systemd unit that loads the nvidia_peermem kernel
# module at boot.
# Globals:   SERVICE_FILE (read) - path of the unit file to write
# Outputs:   progress via step(); command output is logged by run_cmd()
install_peermem_service() {
  step "开始配置 nvidia_peermem 开机启动"

  # Write the unit file. The delimiter is quoted so the body is taken
  # literally. modprobe is a run-once command, so the unit is declared
  # Type=oneshot (start-up fails if modprobe fails) with RemainAfterExit=yes
  # (the unit stays "active" after modprobe has exited).
  cat > "$SERVICE_FILE" <<'EOF'
[Unit]
Description=Load the nvidia_peermem kernel module
After=network.target

[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/usr/sbin/modprobe nvidia_peermem

[Install]
WantedBy=default.target
EOF

  # Set permissions.
  run_cmd "chmod 644 $SERVICE_FILE"
  step "服务文件已创建:$SERVICE_FILE"

  # Reload systemd so it sees the new unit, then enable and start it.
  run_cmd "systemctl daemon-reload"
  run_cmd "systemctl enable --now nvidia_peermem.service"
  step "服务已启用并开机自启"
}
# Stops, disables and deletes the nvidia_peermem systemd unit, if present.
# Globals:   SERVICE_FILE (read) - path of the unit file to remove
# Outputs:   progress via step(); a warning step when the unit file is absent
uninstall_peermem_service() {
  step "开始移除 nvidia_peermem 开机启动配置"

  if [[ ! -f "$SERVICE_FILE" ]]; then
    step "警告:nvidia_peermem 服务文件不存在"
    return 0
  fi

  run_cmd "systemctl stop nvidia_peermem.service"
  run_cmd "systemctl disable nvidia_peermem.service"
  run_cmd "rm -f $SERVICE_FILE"
  step "服务文件已删除:$SERVICE_FILE"

  # Make systemd forget the deleted unit; without a reload it keeps the
  # stale definition loaded.
  run_cmd "systemctl daemon-reload"
}
# Creates and enables the nvidia-persistenced systemd unit (GPU persistence
# mode daemon).
# Globals:   PERSISTENCE_SERVICE (read) - path of the unit file to write
# Outputs:   progress via step(); command output is logged by run_cmd()
install_persistence_service() {
  step "开始配置 nvidia-persistenced 开机启动"

  # Write the unit file. The delimiter is quoted so the body is taken
  # literally. Runtime paths use /run rather than the legacy /var/run alias,
  # which systemd warns about in PIDFile=.
  cat > "$PERSISTENCE_SERVICE" <<'EOF'
[Unit]
Description=NVIDIA Persistence Daemon
After=syslog.target network.target
Wants=nvidia-modules.service

[Service]
Type=forking
PIDFile=/run/nvidia-persistenced/nvidia-persistenced.pid
ExecStart=/usr/bin/nvidia-persistenced --verbose
ExecStopPost=/bin/rm -rf /run/nvidia-persistenced

[Install]
WantedBy=multi-user.target
EOF

  # Set permissions.
  run_cmd "chmod 644 $PERSISTENCE_SERVICE"
  step "服务文件已创建:$PERSISTENCE_SERVICE"

  # Reload systemd so it sees the new unit, then enable and start it.
  run_cmd "systemctl daemon-reload"
  run_cmd "systemctl enable --now nvidia-persistenced.service"
  step "nvidia-persistenced 服务已启用并开机自启"
}
# Stops, disables and deletes the nvidia-persistenced systemd unit, if present.
# Globals:   PERSISTENCE_SERVICE (read) - path of the unit file to remove
# Outputs:   progress via step(); a warning step when the unit file is absent
uninstall_persistence_service() {
  step "开始移除 nvidia-persistenced 开机启动配置"

  if [[ ! -f "$PERSISTENCE_SERVICE" ]]; then
    step "警告:nvidia-persistenced 服务文件不存在"
    return 0
  fi

  run_cmd "systemctl stop nvidia-persistenced.service"
  run_cmd "systemctl disable nvidia-persistenced.service"
  run_cmd "rm -f $PERSISTENCE_SERVICE"
  step "服务文件已删除:$PERSISTENCE_SERVICE"

  # Make systemd forget the deleted unit; without a reload it keeps the
  # stale definition loaded.
  run_cmd "systemctl daemon-reload"
}
# Installs the NVIDIA driver for $VERSION and sets up the boot-time services.
# Globals:   VERSION, PACKAGE_TEMPLATE, INSTALL_DIR, LOG_FILE (read)
# Outputs:   progress via step(); command output goes to LOG_FILE
# Returns:   0 on success; any failing step aborts the script through error()
# Side effects: changes the working directory to INSTALL_DIR.
install_driver() {
  step "开始安装显卡驱动,版本:$VERSION"

  local package_name package_path
  # shellcheck disable=SC2059 -- PACKAGE_TEMPLATE is a trusted printf format
  printf -v package_name "$PACKAGE_TEMPLATE" "$VERSION"
  package_path="${INSTALL_DIR}/${package_name}"

  # Use the local package when present, otherwise download it.
  if [[ -f "$package_path" ]]; then
    step "使用本地包:$package_path"
  else
    step "未找到本地包,开始下载"
    download_package "$package_name" "$package_path"
  fi

  # Run the installer from its own directory.
  cd "$INSTALL_DIR" || error "无法进入目录 $INSTALL_DIR"
  run_cmd "chmod +x $package_name"
  run_cmd "./$package_name -q -s"

  # Load the peer-memory module and switch persistence mode on.
  run_cmd "modprobe nvidia_peermem"
  run_cmd "nvidia-smi -pm 1"

  # Verify that nvidia-smi reports the requested version. grep -F matches the
  # version as a fixed string, so "." cannot match an arbitrary character.
  step "执行: nvidia-smi | grep $VERSION"
  if ! { nvidia-smi | grep -F -- "$VERSION"; } >> "$LOG_FILE" 2>&1; then
    error "版本验证失败"
  fi

  install_peermem_service      # load nvidia_peermem at boot
  install_persistence_service  # enable GPU persistence mode at boot
  step "安装完成"
}
# Uninstalls the NVIDIA driver for $VERSION using the matching .run package,
# then removes the boot-time services.
# Globals:   VERSION, PACKAGE_TEMPLATE, INSTALL_DIR, LOG_FILE (read)
# Outputs:   progress via step(); uninstaller output goes to LOG_FILE
# Returns:   0 on success; 1 when the package is unavailable or the
#            uninstaller fails (the services are left untouched in that case)
# Side effects: changes the working directory to INSTALL_DIR.
uninstall_driver() {
  step "开始卸载显卡驱动,版本:$VERSION"

  local package_name package_path
  # shellcheck disable=SC2059 -- PACKAGE_TEMPLATE is a trusted printf format
  printf -v package_name "$PACKAGE_TEMPLATE" "$VERSION"
  package_path="${INSTALL_DIR}/${package_name}"

  # The uninstaller is the installer package itself; fetch it if missing.
  if [[ -f "$package_path" ]]; then
    step "找到安装包,使用安装包卸载"
  else
    step "未找到本地安装包,尝试下载卸载包"
    download_package "$package_name" "$package_path"
    if [[ ! -f "$package_path" ]]; then
      step "无法找到或下载卸载包,请手动卸载驱动程序"
      return 1
    fi
  fi

  cd "$INSTALL_DIR" || error "无法进入目录 $INSTALL_DIR"
  run_cmd "chmod +x $package_name"

  # Keep the uninstaller's output in the log so failures can be diagnosed,
  # and report failure to the caller with a non-zero status.
  if ! ./"$package_name" --uninstall -q -s &>> "$LOG_FILE"; then
    step "卸载失败,请手动卸载驱动程序"
    return 1
  fi

  uninstall_peermem_service      # remove the peermem service
  uninstall_persistence_service  # remove the persistenced service
  step "卸载完成"
}
# Downloads a driver package, trying the internal mirror first and the
# official site second.
# Globals:   INTERNAL_BASE_URL, OFFICIAL_BASE_URL (read)
# Arguments: $1 - package file name
#            $2 - destination path
# Returns:   0 once a download succeeds; calls error() (which exits) when
#            every source fails
download_package() {
  local package_name="$1"
  local package_path="$2"
  # Download to a side file and move it into place only on success, so a
  # failed or interrupted transfer never leaves a truncated file at
  # $package_path that a later "-f" check would accept as a valid package.
  local partial_path="${package_path}.part"
  local -a download_urls=(
    "${INTERNAL_BASE_URL}/${package_name}"
    "${OFFICIAL_BASE_URL}/${package_name}"
  )
  local url

  for url in "${download_urls[@]}"; do
    step "尝试从 $url 下载"
    if wget -qO "$partial_path" "$url" \
      && mv -f -- "$partial_path" "$package_path"; then
      return 0
    fi
    rm -f -- "$partial_path"
    step "下载失败,尝试下一个 URL"
  done

  error "无法从任何来源下载 $package_name"
}
# Calls the handler that matches the requested action.
# Globals:   ACTION (read) - "install" or "uninstall"
# Returns:   the handler's status; any other ACTION value is reported
#            through error(), which exits
run_action() {
  case "${ACTION:-}" in
    install) install_driver ;;
    uninstall) uninstall_driver ;;
    # Defensive: fail loudly on any other value instead of silently doing
    # nothing.
    *) error "未知动作: ${ACTION:-}" ;;
  esac
}

run_action