#!/bin/bash
#
# Installs or uninstalls the NVIDIA Linux driver and manages the
# nvidia_peermem / nvidia-persistenced systemd units.
# Forked from yindun/ansible-devops.

set -euo pipefail

# Global settings
readonly DEFAULT_VERSION="565.57.01"
readonly INTERNAL_BASE_URL="http://10.101.0.51:5588/nvidia-linux"
# OFFICIAL_BASE_URL embeds the selected driver version, so it is assigned
# further down, after the command line has been parsed:
#OFFICIAL_BASE_URL="https://cn.download.nvidia.com/tesla/${VERSION}"
# printf-style template; %s is replaced by the driver version.
readonly PACKAGE_TEMPLATE="NVIDIA-Linux-x86_64-%s.run"
readonly INSTALL_DIR="/opt"
# Assigned separately from `readonly` so a failing `date` is not masked.
LOG_FILE="/var/log/nvidia-driver_$(date +%Y%m%d%H%M%S).log"
readonly LOG_FILE
readonly SERVICE_FILE="/etc/systemd/system/nvidia_peermem.service"
readonly PERSISTENCE_SERVICE="/etc/systemd/system/nvidia-persistenced.service"

# Console color escape sequences (interpreted when printed)
readonly GREEN='\033[1;32m'
readonly RED='\033[1;31m'
readonly NC='\033[0m' # reset color

#######################################
# Appends a timestamped line to the log file.
# Globals:   LOG_FILE (read) - path of the file appended to
# Arguments: $@ - message words, joined into a single line
# Outputs:   nothing on stdout; one line appended to LOG_FILE
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}

#######################################
# Announces a progress step: prints it in green on the console and records
# it in the log.
# Globals:   GREEN, NC (read) - color escape sequences
# Arguments: $1 - step description
# Outputs:   "==> <description>" on stdout, same text passed to log
#######################################
step() {
  local msg="==> $1"
  # %b expands the escapes in the color codes only; the message itself is
  # printed verbatim so backslashes in it (e.g. in commands) are not mangled.
  printf '%b%s%b\n' "$GREEN" "$msg" "$NC"
  log "$msg"
}

#######################################
# Reports a fatal error: prints it in red on the console, records it in the
# log, then terminates the script.
# Globals:   RED, NC (read) - color escape sequences
# Arguments: $1 - error description
# Outputs:   "错误: <description>" on stdout, same text passed to log
# Returns:   never returns; exits the shell with status 1
#######################################
error() {
  local msg="错误: $1"
  # Only the color codes go through %b; the message is printed verbatim.
  printf '%b%s%b\n' "$RED" "$msg" "$NC"
  log "$msg"
  exit 1
}

#######################################
# Runs a shell command line with its output hidden from the console; stdout
# and stderr are appended to the log file instead.
# Globals:   LOG_FILE (read) - file receiving the command's output
# Arguments: $1 - command line as a single string. It is evaluated with
#                 `eval` (so pipelines work) and must therefore come from
#                 trusted, properly quoted input only.
# Outputs:   a step line announcing the command
# Returns:   0 on success; on failure calls error, which exits the script
#######################################
run_cmd() {
  step "执行: $1"
  if ! eval "$1" &>> "$LOG_FILE"; then
    error "命令执行失败: $1"
  fi
}

# Argument parsing
ACTION=""
VERSION="$DEFAULT_VERSION"

#######################################
# Parses the command line into ACTION and VERSION.
# Globals:   ACTION (written)  - "install" or "uninstall"
#            VERSION (written) - driver version, when --version is given
# Arguments: the script's command-line arguments:
#            --install | --uninstall | --version <driver-version>
# Returns:   0 on success; calls error (which exits) on an unknown option,
#            a --version without a value, a missing action, or a version
#            containing characters outside [A-Za-z0-9._-]
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        ACTION="install"
        shift
        ;;
      --uninstall)
        ACTION="uninstall"
        shift
        ;;
      --version)
        # Without this guard `set -u` aborts on "$2" with an unhelpful
        # "unbound variable" message when --version is the last argument.
        if [[ $# -lt 2 ]]; then
          error "--version 需要指定版本号"
        fi
        VERSION="$2"
        shift 2
        ;;
      *)
        error "未知参数 $1"
        ;;
    esac
  done

  if [[ -z "$ACTION" ]]; then
    error "必须指定 --install 或 --uninstall"
  fi

  # VERSION ends up inside command strings that are passed through eval, so
  # anything that is not a plain version token is rejected up front.
  if [[ ! "$VERSION" =~ ^[A-Za-z0-9._-]+$ ]]; then
    error "无效的版本号: $VERSION"
  fi
}

parse_args "$@"
OFFICIAL_BASE_URL="https://cn.download.nvidia.com/tesla/${VERSION}"

#######################################
# Creates and enables a systemd unit that loads the nvidia_peermem kernel
# module at boot.
# Globals:   SERVICE_FILE (read) - path of the unit file to write
# Arguments: none
# Outputs:   progress lines via step
# Returns:   0 on success; run_cmd aborts the script if a command fails
#######################################
install_peermem_service() {
  step "开始配置 nvidia_peermem 开机启动"

  # Write the unit file. The delimiter is quoted because the content is
  # literal text that needs no shell expansion.
  # NOTE(review): the unit declares no Type=, so systemd's default applies;
  # Type=oneshot may suit a one-off modprobe better - confirm before changing.
  cat > "$SERVICE_FILE" <<'EOF'
[Unit]
After=network.target
[Service]
ExecStart=/usr/sbin/modprobe nvidia_peermem
[Install]
WantedBy=default.target
EOF

  # Set permissions
  run_cmd "chmod 644 $SERVICE_FILE"
  step "服务文件已创建:$SERVICE_FILE"

  # Reload systemd so it sees the new unit, then enable and start it
  run_cmd "systemctl daemon-reload"
  run_cmd "systemctl enable --now nvidia_peermem.service"
  step "服务已启用并开机自启"
}

#######################################
# Stops, disables and deletes the nvidia_peermem systemd unit, if present.
# Globals:   SERVICE_FILE (read) - path of the unit file to remove
# Arguments: none
# Outputs:   progress lines via step; a warning step when the unit file
#            does not exist
# Returns:   0 on success; run_cmd aborts the script if a command fails
#######################################
uninstall_peermem_service() {
  step "开始移除 nvidia_peermem 开机启动配置"

  if [[ ! -f "$SERVICE_FILE" ]]; then
    step "警告:nvidia_peermem 服务文件不存在"
    return 0
  fi

  run_cmd "systemctl stop nvidia_peermem.service"
  run_cmd "systemctl disable nvidia_peermem.service"
  run_cmd "rm -f $SERVICE_FILE"
  # Make systemd drop its cached copy of the unit that was just deleted.
  run_cmd "systemctl daemon-reload"
  step "服务文件已删除:$SERVICE_FILE"
}

#######################################
# Creates and enables a systemd unit that runs nvidia-persistenced
# (GPU persistence mode) at boot.
# Globals:   PERSISTENCE_SERVICE (read) - path of the unit file to write
# Arguments: none
# Outputs:   progress lines via step
# Returns:   0 on success; run_cmd aborts the script if a command fails
#######################################
install_persistence_service() {
  step "开始配置 nvidia-persistenced 开机启动"

  # Write the unit file. The delimiter is quoted because the content is
  # literal text that needs no shell expansion.
  cat > "$PERSISTENCE_SERVICE" <<'EOF'
[Unit]
Description=NVIDIA Persistence Daemon
After=syslog.target network.target
Wants=nvidia-modules.service

[Service]
Type=forking
PIDFile=/var/run/nvidia-persistenced/nvidia-persistenced.pid
ExecStart=/usr/bin/nvidia-persistenced --verbose
ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced

[Install]
WantedBy=multi-user.target
EOF

  # Set permissions
  run_cmd "chmod 644 $PERSISTENCE_SERVICE"
  step "服务文件已创建:$PERSISTENCE_SERVICE"

  # Reload systemd so it sees the new unit, then enable and start it
  run_cmd "systemctl daemon-reload"
  run_cmd "systemctl enable --now nvidia-persistenced.service"
  step "nvidia-persistenced 服务已启用并开机自启"
}

#######################################
# Stops, disables and deletes the nvidia-persistenced systemd unit, if
# present.
# Globals:   PERSISTENCE_SERVICE (read) - path of the unit file to remove
# Arguments: none
# Outputs:   progress lines via step; a warning step when the unit file
#            does not exist
# Returns:   0 on success; run_cmd aborts the script if a command fails
#######################################
uninstall_persistence_service() {
  step "开始移除 nvidia-persistenced 开机启动配置"

  if [[ ! -f "$PERSISTENCE_SERVICE" ]]; then
    step "警告:nvidia-persistenced 服务文件不存在"
    return 0
  fi

  run_cmd "systemctl stop nvidia-persistenced.service"
  run_cmd "systemctl disable nvidia-persistenced.service"
  run_cmd "rm -f $PERSISTENCE_SERVICE"
  # Make systemd drop its cached copy of the unit that was just deleted.
  run_cmd "systemctl daemon-reload"
  step "服务文件已删除:$PERSISTENCE_SERVICE"
}

#######################################
# Installs the NVIDIA driver: obtains the .run package (local copy or
# download), runs it, loads nvidia_peermem, enables persistence mode,
# verifies the reported version and installs both systemd units.
# Globals:   VERSION, PACKAGE_TEMPLATE, INSTALL_DIR, LOG_FILE (read)
# Arguments: none
# Outputs:   progress lines via step; command output goes to LOG_FILE
# Returns:   0 on success; exits the script via error on any failure
# Side effects: changes the working directory to INSTALL_DIR
#######################################
install_driver() {
  step "开始安装显卡驱动,版本:$VERSION"

  local package_name package_path quoted_name
  # PACKAGE_TEMPLATE is a trusted printf format defined by this script.
  # shellcheck disable=SC2059
  package_name=$(printf "$PACKAGE_TEMPLATE" "$VERSION")
  package_path="${INSTALL_DIR}/${package_name}"
  # run_cmd evaluates its argument with eval, so the name is shell-quoted
  # first; for ordinary version strings this leaves it unchanged.
  quoted_name=$(printf '%q' "$package_name")

  # Locate or download the package
  if [[ -f "$package_path" ]]; then
    step "使用本地包:$package_path"
  else
    step "未找到本地包,开始下载"
    download_package "$package_name" "$package_path"
  fi

  # Install the driver
  cd "$INSTALL_DIR"
  run_cmd "chmod +x $quoted_name"
  run_cmd "./$quoted_name -q -s"

  # Load the peer-memory module and switch on persistence mode
  run_cmd "modprobe nvidia_peermem"
  run_cmd "nvidia-smi -pm 1"

  # Verify the installed version. -F matches the version literally, so the
  # dots in it are not treated as regex wildcards.
  step "执行: nvidia-smi | grep -F -- $VERSION"
  if ! { nvidia-smi | grep -F -- "$VERSION"; } >> "$LOG_FILE" 2>&1; then
    error "版本验证失败"
  fi

  install_peermem_service     # load nvidia_peermem at boot
  install_persistence_service # GPU persistence mode at boot
  step "安装完成"
}

#######################################
# Uninstalls the NVIDIA driver with the matching .run package (local copy
# or download) and then removes both systemd units.
# Globals:   VERSION, PACKAGE_TEMPLATE, INSTALL_DIR, LOG_FILE (read)
# Arguments: none
# Outputs:   progress lines via step; uninstaller output goes to LOG_FILE
# Returns:   0 when finished, and also when the uninstaller fails or no
#            package is available (a "please uninstall manually" step is
#            printed and the systemd units are left in place)
# Side effects: changes the working directory to INSTALL_DIR
#######################################
uninstall_driver() {
  step "开始卸载显卡驱动,版本:$VERSION"

  local package_name package_path quoted_name
  # PACKAGE_TEMPLATE is a trusted printf format defined by this script.
  # shellcheck disable=SC2059
  package_name=$(printf "$PACKAGE_TEMPLATE" "$VERSION")
  package_path="${INSTALL_DIR}/${package_name}"
  # run_cmd evaluates its argument with eval, so the name is shell-quoted
  # first; for ordinary version strings this leaves it unchanged.
  quoted_name=$(printf '%q' "$package_name")

  # Make sure the package that carries the uninstaller is available
  if [[ -f "$package_path" ]]; then
    step "找到安装包,使用安装包卸载"
  else
    step "未找到本地安装包,尝试下载卸载包"
    download_package "$package_name" "$package_path"
    if [[ ! -f "$package_path" ]]; then
      step "无法找到或下载卸载包,请手动卸载驱动程序"
      return
    fi
  fi

  cd "$INSTALL_DIR"
  run_cmd "chmod +x $quoted_name"
  # Failure here is reported but deliberately not fatal; the uninstaller's
  # output is kept in the log so the cause can be diagnosed afterwards.
  if ! ./"$package_name" --uninstall -q -s &>> "$LOG_FILE"; then
    step "卸载失败,请手动卸载驱动程序"
    return
  fi

  uninstall_peermem_service     # remove the peermem unit
  uninstall_persistence_service # remove the persistenced unit
  step "卸载完成"
}

#######################################
# Downloads a driver package, trying the internal mirror first and the
# official site second.
# Globals:   INTERNAL_BASE_URL, OFFICIAL_BASE_URL (read)
# Arguments: $1 - package file name
#            $2 - destination path for the downloaded file
# Outputs:   progress lines via step
# Returns:   0 once a download succeeds; calls error (which exits) when
#            every source fails
#######################################
download_package() {
  local package_name="$1"
  local package_path="$2"
  # Download to a side file first: wget -O creates its output file even when
  # the transfer fails, and an empty or partial file at the final path would
  # be mistaken for a usable local package by the callers' -f checks.
  local partial_path="${package_path}.part"
  local url
  local download_urls=(
    "${INTERNAL_BASE_URL}/${package_name}"
    "${OFFICIAL_BASE_URL}/${package_name}"
  )

  for url in "${download_urls[@]}"; do
    step "尝试从 $url 下载"
    if wget -qO "$partial_path" "$url"; then
      mv -f -- "$partial_path" "$package_path"
      return 0
    fi
    rm -f -- "$partial_path"
    step "下载失败,尝试下一个 URL"
  done

  error "无法从任何来源下载 $package_name"
}

# Call the handler that matches the requested action
case "$ACTION" in
  install) install_driver ;;
  uninstall) uninstall_driver ;;
esac