#!/bin/bash
#
# CUDA toolkit manager: installs or uninstalls a CUDA runfile package.
#
# Usage:
#   <script> --install   [--version <cuda>_<driver>] [--force]
#   <script> --uninstall [--version <cuda>_<driver>] [--force]
#
#   --version  e.g. 12.6.0_560.28.03 or 12.6_560.35.05
#   --force    on uninstall, skip the runfile-based uninstall when the
#              package is not available locally and go straight to cleanup
#
# The script writes to /var/log, /opt, /etc/profile and /usr/local, so it is
# expected to run with sufficient privileges.
set -euo pipefail

# ---------------------------------------------------------------------------
# Global settings
# ---------------------------------------------------------------------------
CUDA_VERSION="12.6"
DRIVER_VERSION="560.35.05"
INTERNAL_BASE_URL="http://10.101.0.51:5588/cuda-linux"
ENV_PROFILE="/etc/profile"
LOG_FILE="/var/log/cuda_manager_$(date +%Y%m%d%H%M%S).log"

# Scratch directory; created on demand by create_temp_dir.
TEMP_DIR=""

# Values derived from CUDA_VERSION / DRIVER_VERSION (see derive_version_settings).
PACKAGE_NAME=""
PACKAGE_PATH=""
OFFICIAL_BASE_URL=""
CUDA_MAJOR_MINOR=""
CUDA_INSTALL_DIR=""

# Command-line state.
ACTION=""
FORCE=0
CUSTOM_VERSION=""

# Terminal colors.
GREEN='\033[1;32m'
RED='\033[1;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset

#######################################
# Recompute every setting that depends on the selected versions.
# Globals: CUDA_VERSION, DRIVER_VERSION (read); PACKAGE_NAME, PACKAGE_PATH,
#          OFFICIAL_BASE_URL, CUDA_MAJOR_MINOR, CUDA_INSTALL_DIR (written)
#######################################
derive_version_settings() {
  PACKAGE_NAME="cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run"
  PACKAGE_PATH="/opt/${PACKAGE_NAME}"
  OFFICIAL_BASE_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers"
  CUDA_MAJOR_MINOR=$(cut -d. -f1-2 <<<"$CUDA_VERSION")
  CUDA_INSTALL_DIR="/usr/local/cuda-${CUDA_MAJOR_MINOR}"
}

#######################################
# Append a timestamped line to LOG_FILE.
# Arguments: message words
#######################################
log() {
  local msg
  msg="[$(date '+%Y-%m-%d %H:%M:%S')] $*"
  printf '%s\n' "$msg" >>"$LOG_FILE"
}

#######################################
# Print a progress message (green) and log it.
# Arguments: $1 - message
#######################################
step() {
  local msg="==> $1"
  # %b expands the escape sequences of the color codes only; the message is
  # printed verbatim so backslashes in logged commands are not mangled.
  printf '%b%s%b\n' "$GREEN" "$msg" "$NC"
  log "$msg"
}

#######################################
# Print a warning (yellow) to stderr and log it.
# Arguments: $1 - message
#######################################
warning() {
  local msg="警告: $1"
  printf '%b%s%b\n' "$YELLOW" "$msg" "$NC" >&2
  log "$msg"
}

#######################################
# Print an error (red) to stderr, log it and terminate with status 1.
# Arguments: $1 - message
#######################################
error() {
  local msg="错误: $1"
  printf '%b%s%b\n' "$RED" "$msg" "$NC" >&2
  log "$msg"
  exit 1
}

#######################################
# Run a command with its output appended to LOG_FILE; abort on failure.
# Arguments:
#   Either one string, which is evaluated by the shell (legacy form), or a
#   command followed by its arguments, which is executed as-is so that
#   arguments containing spaces or glob characters stay intact.
#######################################
run_cmd() {
  step "执行: $*"
  if (($# == 1)); then
    # Legacy single-string form. Only pass trusted text here: it is eval'd.
    eval "$1" &>>"$LOG_FILE" || error "命令执行失败: $1"
  else
    "$@" &>>"$LOG_FILE" || error "命令执行失败: $*"
  fi
}

#######################################
# Abort unless the given command is available on PATH.
# Arguments: $1 - command name
#######################################
check_cmd() {
  command -v "$1" &>/dev/null || error "未找到命令: $1"
}

#######################################
# Create a private scratch directory and remove it when the script exits.
# Globals: TEMP_DIR (written)
#######################################
create_temp_dir() {
  # mktemp avoids a predictable path under /tmp, which another local user
  # could pre-create or symlink before this (typically root) script runs.
  TEMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/cuda_temp.XXXXXX") \
    || error "无法创建临时目录"
  trap 'rm -rf -- "$TEMP_DIR"' EXIT
}

#######################################
# Store the runfile's --help output in $TEMP_DIR/help.txt.
# Globals: PACKAGE_PATH, TEMP_DIR (read)
#######################################
capture_package_help() {
  # Only the text matters for the option probes that follow; a non-zero exit
  # status from --help must not abort the whole script under `set -e`.
  sh "$PACKAGE_PATH" --help >"$TEMP_DIR/help.txt" 2>&1 || true
}

#######################################
# Parse command-line options into ACTION, FORCE and the version settings.
# Arguments: the script's positional parameters
#######################################
parse_args() {
  # <cuda x.y[.z]>_<driver a.b.c>; group 1 is the CUDA version, group 4 the
  # driver version.
  local version_re='^([0-9]+\.[0-9]+(\.[0-9]+)?)(_([0-9]+\.[0-9]+\.[0-9]+))$'

  while (($# > 0)); do
    case "$1" in
      --install)
        ACTION="install"
        shift
        ;;
      --uninstall)
        ACTION="uninstall"
        shift
        ;;
      --version)
        # Without this check a missing value trips `set -u` with a raw
        # "unbound variable" message instead of a usable error.
        (($# >= 2)) || error "版本格式错误,应为 x.x.x_y.y.y 或 x.x_y.y.y"
        CUSTOM_VERSION="$2"
        if [[ "$CUSTOM_VERSION" =~ $version_re ]]; then
          CUDA_VERSION="${BASH_REMATCH[1]}"
          DRIVER_VERSION="${BASH_REMATCH[4]}"
          derive_version_settings
        else
          error "版本格式错误,应为 x.x.x_y.y.y 或 x.x_y.y.y"
        fi
        shift 2
        ;;
      --force)
        FORCE=1
        shift
        ;;
      *)
        error "未知参数: $1"
        ;;
    esac
  done
}

#######################################
# Make sure the runfile exists at PACKAGE_PATH, downloading it if needed.
# The internal mirror is tried first, then the official NVIDIA server.
# Globals: PACKAGE_PATH, PACKAGE_NAME, INTERNAL_BASE_URL, OFFICIAL_BASE_URL
#######################################
download_package() {
  step "检查安装包: $PACKAGE_PATH"
  if [[ -f "$PACKAGE_PATH" ]]; then
    step "使用本地安装包"
    return 0
  fi

  step "本地包不存在,开始下载"
  check_cmd wget
  mkdir -p -- "$(dirname -- "$PACKAGE_PATH")"

  local -a urls=(
    "${INTERNAL_BASE_URL}/${PACKAGE_NAME}"
    "${OFFICIAL_BASE_URL}/${PACKAGE_NAME}"
  )
  # Download to a side file and rename only on success. `wget -O` creates the
  # output file even when the transfer fails, and a leftover empty/partial
  # file at PACKAGE_PATH would be accepted as the "local package" next run.
  local partial="${PACKAGE_PATH}.part"
  local url
  for url in "${urls[@]}"; do
    step "尝试从 $url 下载"
    if wget -q -O "$partial" "$url" && [[ -s "$partial" ]]; then
      mv -f -- "$partial" "$PACKAGE_PATH"
      step "下载成功,文件大小: $(du -h "$PACKAGE_PATH" | cut -f1)"
      return 0
    fi
    rm -f -- "$partial"
    warning "从 $url 下载失败"
  done

  error "无法下载安装包,请检查网络或手动放置到 $PACKAGE_PATH"
}

#######################################
# Install the CUDA toolkit, register it in ENV_PROFILE and verify nvcc.
#######################################
install_cuda() {
  step "开始安装 CUDA ${CUDA_VERSION}"

  download_package
  create_temp_dir

  step "检查安装包支持的参数"
  capture_package_help

  # Prefer a toolkit-only install when the runfile offers it.
  if grep -q -- '--toolkit' "$TEMP_DIR/help.txt"; then
    step "安装包支持 --toolkit 参数"
    run_cmd sh "$PACKAGE_PATH" --silent --toolkit
  else
    warning "安装包不支持 --toolkit 参数,尝试完整安装"
    run_cmd sh "$PACKAGE_PATH" --silent
  fi

  step "配置 CUDA 环境变量"
  if ! grep -qF -- "cuda-${CUDA_MAJOR_MINOR}" "$ENV_PROFILE"; then
    # \$ keeps the variable references literal in the profile. The :+ form
    # avoids a trailing ':' (an empty entry) when LD_LIBRARY_PATH is unset.
    cat >>"$ENV_PROFILE" <<EOF
# CUDA ${CUDA_VERSION}
export PATH=${CUDA_INSTALL_DIR}/bin:\$PATH
export LD_LIBRARY_PATH=${CUDA_INSTALL_DIR}/lib64\${LD_LIBRARY_PATH:+:\$LD_LIBRARY_PATH}
EOF
    step "已添加环境变量到 $ENV_PROFILE"
  else
    step "环境变量已存在,跳过添加"
  fi

  # Apply the variables to this process. They must be expanded here: writing
  # \$PATH outside a heredoc stores the literal text "$PATH" and drops every
  # system directory from the search path.
  export PATH="${CUDA_INSTALL_DIR}/bin:${PATH}"
  # NOTE(review): kept from the original "temporary workaround" next to a
  # commented-out `source /etc/profile`; confirm it is still needed.
  export LC_BYOBU=0
  export LD_LIBRARY_PATH="${CUDA_INSTALL_DIR}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

  step "验证 CUDA 安装"
  if ! command -v nvcc &>/dev/null; then
    error "nvcc 命令未找到,安装失败"
  fi
  # Take the 5th field of the line containing "release" and drop the comma.
  local nvcc_version
  nvcc_version=$(nvcc -V | awk '/release/ { gsub(",", "", $5); print $5 }')
  if [[ "$nvcc_version" == *"${CUDA_MAJOR_MINOR}"* ]]; then
    step "CUDA ${CUDA_VERSION} 安装成功"
  else
    error "CUDA 版本不匹配,期望 ${CUDA_VERSION},实际 $nvcc_version"
  fi
}

#######################################
# Uninstall through the runfile: use its --uninstall option when advertised,
# otherwise extract it and run a bundled uninstall_cuda* script if present.
# With --force and no local package this step is skipped.
#######################################
uninstall_via_package() {
  if [[ -f "$PACKAGE_PATH" ]]; then
    step "找到安装包: $PACKAGE_PATH"
  elif ((FORCE == 1)); then
    # Nothing to run without the package; the caller continues with the
    # manual cleanup instead of failing on a missing file.
    warning "未找到安装包,继续强制卸载"
    return 0
  else
    step "未找到安装包,开始下载"
    download_package
  fi

  step "检查安装包是否支持 --uninstall 参数"
  capture_package_help

  if grep -q -- '--uninstall' "$TEMP_DIR/help.txt"; then
    step "安装包支持 --uninstall 参数"
    run_cmd sh "$PACKAGE_PATH" --silent --uninstall
    return 0
  fi

  step "安装包不支持 --uninstall 参数,尝试解压查找卸载脚本"
  step "解压安装包到 $TEMP_DIR"
  run_cmd sh "$PACKAGE_PATH" "--extract=$TEMP_DIR"

  # -quit stops at the first hit; unlike `| head -n1` it cannot fail the
  # pipeline with SIGPIPE under `pipefail`.
  local uninstall_script
  uninstall_script=$(find "$TEMP_DIR" -name "uninstall_cuda*" -type f -print -quit)
  if [[ -n "$uninstall_script" ]]; then
    step "找到卸载脚本: $uninstall_script"
    run_cmd sh "$uninstall_script"
  else
    warning "未找到卸载脚本,继续手动清理"
  fi
}

#######################################
# Uninstall CUDA: run an uninstaller when one is available, then remove the
# profile entries, the install directory and leftover files.
#######################################
uninstall_cuda() {
  step "开始卸载 CUDA ${CUDA_VERSION}"

  create_temp_dir

  local official_uninstaller="${CUDA_INSTALL_DIR}/bin/cuda-uninstaller"
  if [[ -x "$official_uninstaller" ]]; then
    step "找到官方卸载脚本: $official_uninstaller"
    step "执行官方卸载程序"
    run_cmd "$official_uninstaller" --silent
  else
    warning "未找到官方卸载脚本,尝试其他方法"
    uninstall_via_package
  fi

  step "清理环境变量"
  if grep -qF -- "cuda-${CUDA_MAJOR_MINOR}" "$ENV_PROFILE"; then
    # Escape the dots so "12.6" cannot match e.g. "12x6"; also drop the
    # "# CUDA <version>" heading line that install_cuda writes.
    local version_pattern="${CUDA_MAJOR_MINOR//./\\.}"
    run_cmd sed -i -E \
      -e "/cuda-${version_pattern}/d" \
      -e "/^# CUDA ${version_pattern}(\\.[0-9]+)?\$/d" \
      "$ENV_PROFILE"
    step "已从 $ENV_PROFILE 移除 CUDA 环境变量"
  else
    step "环境变量已清理"
  fi

  if [[ -d "$CUDA_INSTALL_DIR" ]]; then
    step "删除安装目录: $CUDA_INSTALL_DIR"
    # :? aborts instead of expanding to an empty path.
    run_cmd rm -rf -- "${CUDA_INSTALL_DIR:?}"
  else
    step "安装目录不存在,跳过删除"
  fi

  step "清理残留文件"
  # WARNING: this removes every path on the system whose name contains
  # "cuda-<major.minor>". :? guarantees the pattern never degrades to
  # "*cuda-*". Matches are pruned so find does not descend into directories
  # that are about to be deleted, and find runs in a process substitution so
  # its (expected) non-zero status from unreadable paths cannot abort the
  # script through `pipefail`.
  local leftover
  while IFS= read -r -d '' leftover; do
    [[ -e "$leftover" ]] || continue
    step "删除残留文件: $leftover"
    rm -rf -- "$leftover" 2>/dev/null || warning "无法删除: $leftover"
  done < <(
    find / \( -path /proc -o -path /sys \) -prune -o \
      -name "*cuda-${CUDA_MAJOR_MINOR:?}*" -prune -print0 2>/dev/null
  )

  step "CUDA ${CUDA_VERSION} 卸载完成"
}

#######################################
# Entry point: parse options and dispatch to install or uninstall.
# Arguments: the script's positional parameters
#######################################
main() {
  derive_version_settings
  parse_args "$@"

  if [[ -z "$ACTION" ]]; then
    error "必须指定 --install 或 --uninstall"
  fi

  step "开始 CUDA ${CUDA_VERSION} ${ACTION} 流程"
  case "$ACTION" in
    install) install_cuda ;;
    uninstall) uninstall_cuda ;;
    *) error "未知操作: $ACTION" ;;
  esac
  step "CUDA ${CUDA_VERSION} ${ACTION} 流程完成"
}

main "$@"