ansible-devops/roles/gpu_drive/files/install.sh

51 lines
1.7 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Stop at the first failing command, and treat a pipeline as failed when any
# of its stages fails.
set -o errexit -o pipefail

# Directory that receives this script's logs. The {{ ... }} placeholders are
# meant to be substituted from role variables.
# NOTE(review): substitution only happens if the file is rendered as a
# template before it runs - confirm how the role deploys it.
LOG_DIR="{{ log_base_dir }}/{{ driver.name }}"

# One log file per calendar day: install-YYYYMMDD.log.
LOG_FILE="${LOG_DIR}/install-$(date +%Y%m%d).log"

mkdir -p "${LOG_DIR}"

# From here on stdout and stderr are merged and piped through tee, which
# appends them to the log file and echoes them to the original stdout.
exec 1> >(tee -a "${LOG_FILE}") 2>&1
# Command-line parsing. Only the options below are accepted:
#   --install          select the install operation
#   --uninstall        select the uninstall operation
#   --version VALUE    driver version to use
# Anything else is rejected with an error.
OPERATION=""
VERSION=""

#######################################
# Parse the script's command-line arguments.
# Globals:   OPERATION (written), VERSION (written)
# Arguments: the script's positional parameters
# Outputs:   an error message on stderr for a bad argument
# Returns:   0 on success; exits 1 on an unknown argument or when
#            --version is not followed by a value
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --install) OPERATION="install" ;;
      --uninstall) OPERATION="uninstall" ;;
      --version)
        # Without this guard a trailing "--version" made the extra shift fail
        # and errexit ended the script with no message, while
        # "--version --install" silently consumed the next option as the value.
        if (( $# < 2 )) || [[ -z "$2" || "$2" == -* ]]; then
          echo "错误:--version 需要指定版本号" >&2
          exit 1
        fi
        VERSION="$2"
        shift
        ;;
      *)
        echo "错误:未知参数 $1" >&2
        exit 1
        ;;
    esac
    shift
  done
}
parse_args "$@"
#######################################
# Download the NVIDIA driver installer, run it silently, then verify that
# nvidia-smi reports a GPU whose name matches GPU_MODEL.
# Globals:   VERSION (read; falls back to DEFAULT_VERSION when empty),
#            DEFAULT_VERSION (read), DOWNLOAD_URL (read), GPU_MODEL (read,
#            used as a grep pattern)
# Arguments: none
# Outputs:   progress on stdout, errors on stderr; the installer is written
#            to /tmp/nvidia-driver.run
# Returns:   0 on success; exits 1 on any failure
#######################################
install() {
  local installer="/tmp/nvidia-driver.run"
  local driver_version="${VERSION:-${DEFAULT_VERSION:-}}"
  local gpu_model="${GPU_MODEL:-}"
  local detected

  # Reject missing inputs up front. An empty GPU_MODEL is the dangerous one:
  # an empty grep pattern matches every line, so verification would pass for
  # any GPU.
  if [[ -z "${driver_version}" ]]; then
    echo "错误:未指定驱动版本" >&2
    exit 1
  fi
  if [[ -z "${DOWNLOAD_URL:-}" ]]; then
    echo "错误:未配置驱动下载地址" >&2
    exit 1
  fi
  if [[ -z "${gpu_model}" ]]; then
    echo "错误:未指定目标GPU型号" >&2
    exit 1
  fi

  echo "[$(date)] 开始安装NVIDIA驱动版本${driver_version},型号:${gpu_model}..."

  # Every step is checked explicitly: errexit is ignored inside a function
  # that is called on the left-hand side of `||` or inside an `if`.
  # Download from the configured mirror.
  if ! wget -q "${DOWNLOAD_URL}/${driver_version}/NVIDIA-Linux-x86_64-${driver_version}.run" \
    -O "${installer}"; then
    echo "错误:驱动安装包下载失败" >&2
    rm -f "${installer}" # wget -O leaves an empty or partial file behind
    exit 1
  fi

  # Non-interactive installation.
  if ! sh "${installer}" --silent --no-x-check --no-nouveau-check; then
    echo "错误:驱动安装程序执行失败" >&2
    exit 1
  fi

  # Capture the output first rather than piping into `grep -q`: under
  # pipefail, grep exiting on its first match can SIGPIPE nvidia-smi and turn
  # a successful match into a reported failure.
  if ! detected="$(nvidia-smi --query-gpu=name --format=csv,noheader)"; then
    echo "错误:nvidia-smi 执行失败" >&2
    exit 1
  fi
  if ! grep -q -e "${gpu_model}" <<<"${detected}"; then
    echo "错误驱动安装后未识别到目标GPU型号" >&2
    exit 1
  fi
}
#######################################
# Remove the NVIDIA driver with the vendor's uninstaller and delete the
# downloaded installer file.
# Safe to run repeatedly: when /usr/bin/nvidia-uninstall is not present the
# uninstall step is skipped instead of failing.
# Globals:   none
# Arguments: none
# Outputs:   progress on stdout
# Returns:   0 on success or when there is nothing to uninstall; the
#            uninstaller's status if it fails
#######################################
uninstall() {
  local uninstaller="/usr/bin/nvidia-uninstall"

  echo "[$(date)] 开始卸载NVIDIA驱动..."
  if [[ -x "${uninstaller}" ]]; then
    # Vendor-provided silent uninstaller.
    "${uninstaller}" --silent || return
  else
    echo "[$(date)] 未找到 ${uninstaller},跳过卸载"
  fi
  # Remove the leftover installer download.
  rm -f "/tmp/nvidia-driver.run"
}
# Main flow. The defaults below are meant to be injected from role variables.
# NOTE(review): GPU_MODEL, which install() reads, is not assigned anywhere in
# this file - presumably it comes from the environment; confirm with the role.
DEFAULT_VERSION="{{ driver.default_version }}"
DOWNLOAD_URL="{{ driver.download_url }}"

# Dispatch on the operation chosen on the command line. The previous
# `install || uninstall` ignored OPERATION and could never reach uninstall,
# because install() exits rather than returning non-zero.
# No operation given falls back to install, which is what the script always
# did before.
case "${OPERATION:-install}" in
  install) install ;;
  uninstall) uninstall ;;
  *)
    echo "错误:未知操作 ${OPERATION}" >&2
    exit 1
    ;;
esac
exit 0