2025-07-05 15:49:53 +08:00
|
|
|
#!/bin/bash
# Strict mode: abort on command failure, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# ---------------------------------------------------------------------------
# Global configuration
# ---------------------------------------------------------------------------

# Toolkit and driver versions that make up the runfile name.
CUDA_VERSION="12.6"
DRIVER_VERSION="560.35.05"

# Runfile name and the local path where it is expected / stored.
PACKAGE_NAME="cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run"
PACKAGE_PATH="/opt/${PACKAGE_NAME}"

# Download sources: the internal mirror and the vendor download site.
INTERNAL_BASE_URL="http://10.101.0.51:5588/cuda-linux"
OFFICIAL_BASE_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers"

# Scratch directory used while probing / extracting the runfile.
TEMP_DIR="/tmp/cuda_temp"

# First two dot-separated components of the version, used for the install dir.
CUDA_MAJOR_MINOR=$(cut -d. -f1-2 <<< "$CUDA_VERSION")
CUDA_INSTALL_DIR="/usr/local/cuda-${CUDA_MAJOR_MINOR}"

# Profile file that receives the CUDA environment exports.
ENV_PROFILE="/etc/profile"

# One log file per invocation, named after the start time.
LOG_FILE="/var/log/cuda_manager_$(date '+%Y%m%d%H%M%S').log"
|
|
|
|
|
|
|
|
|
|
# ANSI colour escape sequences. They are stored as literal backslash text and
# are only turned into real escapes by `echo -e` at print time.
NC="\033[0m"          # reset colour
RED="\033[1;31m"
GREEN="\033[1;32m"
YELLOW="\033[1;33m"
|
|
|
|
|
|
|
|
|
|
#######################################
# Append one timestamped line to the log file.
# Globals:   LOG_FILE (read)
# Arguments: message words; they are joined into a single line
# Outputs:   "[YYYY-mm-dd HH:MM:SS] <message>" appended to LOG_FILE
#######################################
log() {
  printf '%s\n' "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$LOG_FILE"
}
|
|
|
|
|
|
|
|
|
|
#######################################
# Announce a progress step: green on the terminal, plain text in the log.
# Globals:   GREEN, NC (read)
# Arguments: $1 - step description
# Outputs:   coloured "==> <description>" line on stdout; same text via log
#######################################
step() {
  local line
  printf -v line '==> %s' "$1"
  echo -e "${GREEN}${line}${NC}"
  log "$line"
}
|
|
|
|
|
|
|
|
|
|
#######################################
# Report a non-fatal problem: yellow on the terminal, plain text in the log.
# The message goes to stderr so that it is not mixed into stdout when the
# script's regular output is piped or captured.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - warning text
# Outputs:   coloured "警告: <text>" line on stderr; same text via log
#######################################
warning() {
  local msg="警告: $1"
  echo -e "${YELLOW}${msg}${NC}" >&2
  log "$msg"
}
|
|
|
|
|
|
|
|
|
|
#######################################
# Report a fatal problem and terminate the script.
# The message goes to stderr so that it stays visible when stdout is
# redirected or captured.
# Globals:   RED, NC (read)
# Arguments: $1 - error text
# Outputs:   coloured "错误: <text>" line on stderr; same text via log
# Returns:   never returns; exits the shell with status 1
#######################################
error() {
  local msg="错误: $1"
  echo -e "${RED}${msg}${NC}" >&2
  log "$msg"
  exit 1
}
|
|
|
|
|
|
|
|
|
|
#######################################
# Announce a command, run it, and abort via error() if it fails.
# The command's stdout and stderr are both appended to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - command line as a single string (evaluated with eval,
#                 so callers are responsible for quoting inside it)
#######################################
run_cmd() {
  local command_line=$1

  step "执行: ${command_line}"
  if ! eval "$command_line" >> "$LOG_FILE" 2>&1; then
    error "命令执行失败: ${command_line}"
  fi
}
|
|
|
|
|
|
|
|
|
|
#######################################
# Ensure a command is available; call error() when it is not.
# Arguments: $1 - command name to look up with `command -v`
#######################################
check_cmd() {
  if ! command -v "$1" > /dev/null 2>&1; then
    error "未找到命令: $1"
  fi
}
|
|
|
|
|
|
|
|
|
|
# Argument parsing
ACTION=""
FORCE=0

#######################################
# Parse the command-line options into global settings.
# Recognised options:
#   --install              select the install action
#   --uninstall            select the uninstall action
#   --version CUDA_DRIVER  override versions, e.g. 12.4.1_550.54.15 or
#                          12.4_550.54.15; recomputes every derived setting
#   --force                set FORCE=1
# Globals written: ACTION, FORCE, CUSTOM_VERSION, CUDA_VERSION,
#   DRIVER_VERSION, PACKAGE_NAME, PACKAGE_PATH, CUDA_MAJOR_MINOR,
#   CUDA_INSTALL_DIR, OFFICIAL_BASE_URL
# Arguments: the script's positional parameters
# Exits (via error) on an unknown option, a missing or malformed --version
# value, or when neither --install nor --uninstall was given.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        ACTION="install"
        shift
        ;;
      --uninstall)
        ACTION="uninstall"
        shift
        ;;
      --version)
        # Without this guard "$2" is unset and `set -u` kills the script with
        # an "unbound variable" message instead of a usable diagnostic.
        if [[ $# -lt 2 ]]; then
          error "--version 需要参数,格式为 x.x.x_y.y.y 或 x.x_y.y.y"
        fi
        CUSTOM_VERSION="$2"
        # Group 1 is the CUDA version (two or three components), group 4 is
        # the three-component driver version after the underscore.
        if [[ $CUSTOM_VERSION =~ ^([0-9]+\.[0-9]+(\.[0-9]+)?)(_([0-9]+\.[0-9]+\.[0-9]+))$ ]]; then
          CUDA_VERSION="${BASH_REMATCH[1]}"
          DRIVER_VERSION="${BASH_REMATCH[4]}"
          PACKAGE_NAME="cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run"
          PACKAGE_PATH="/opt/${PACKAGE_NAME}"
          CUDA_MAJOR_MINOR=$(echo "$CUDA_VERSION" | cut -d. -f1-2)
          CUDA_INSTALL_DIR="/usr/local/cuda-${CUDA_MAJOR_MINOR}"
          OFFICIAL_BASE_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers"
        else
          error "版本格式错误,应为 x.x.x_y.y.y 或 x.x_y.y.y"
        fi
        shift 2
        ;;
      --force)
        FORCE=1
        shift
        ;;
      *)
        error "未知参数: $1"
        ;;
    esac
  done

  if [[ -z "$ACTION" ]]; then
    error "必须指定 --install 或 --uninstall"
  fi
}

parse_args "$@"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#######################################
# Make sure the installer runfile exists at PACKAGE_PATH, downloading it when
# necessary. The internal mirror is tried first, then the vendor site.
#
# Each attempt is written to "<PACKAGE_PATH>.part" and only moved into place
# after wget succeeded and produced a non-empty file. `wget -O` creates the
# output file even when the transfer fails, so downloading straight to
# PACKAGE_PATH would leave an empty or truncated file that a later run would
# mistake for a valid local package.
#
# Globals:   PACKAGE_PATH, PACKAGE_NAME, INTERNAL_BASE_URL,
#            OFFICIAL_BASE_URL (read)
# Returns:   0 when the package is present; otherwise calls error()
#######################################
download_package() {
  local partial="${PACKAGE_PATH}.part"
  local url
  local -a urls=(
    "${INTERNAL_BASE_URL}/${PACKAGE_NAME}"
    "${OFFICIAL_BASE_URL}/${PACKAGE_NAME}"
  )

  step "检查安装包: $PACKAGE_PATH"
  # A zero-byte file is treated as missing: it can only be a leftover of a
  # failed transfer.
  if [[ -f "$PACKAGE_PATH" && -s "$PACKAGE_PATH" ]]; then
    step "使用本地安装包"
    return 0
  fi

  step "本地包不存在,开始下载"
  check_cmd wget
  mkdir -p "$(dirname "$PACKAGE_PATH")"

  for url in "${urls[@]}"; do
    step "尝试从 $url 下载"
    if wget -q -O "$partial" "$url" && [[ -s "$partial" ]]; then
      mv -f -- "$partial" "$PACKAGE_PATH"
      step "下载成功,文件大小: $(du -h "$PACKAGE_PATH" | cut -f1)"
      return 0
    fi
    # Drop whatever the failed attempt left behind before trying the next URL.
    rm -f -- "$partial"
    warning "从 $url 下载失败"
  done

  error "无法下载安装包,请检查网络或手动放置到 $PACKAGE_PATH"
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#######################################
# Install CUDA from the runfile, register it in the profile file, and verify
# the result with nvcc.
#
# Steps: fetch the runfile, probe its --help output to decide between a
# toolkit-only and a full silent install, append PATH / LD_LIBRARY_PATH
# exports to ENV_PROFILE (once), apply the same settings to the current
# process, then compare the nvcc release against CUDA_MAJOR_MINOR.
#
# Globals:   CUDA_VERSION, CUDA_MAJOR_MINOR, CUDA_INSTALL_DIR, PACKAGE_PATH,
#            ENV_PROFILE (read); TEMP_DIR, PATH, LD_LIBRARY_PATH,
#            LC_BYOBU (written)
# Returns:   0 on success; calls error() when nvcc is missing or reports a
#            different release
#######################################
install_cuda() {
  local nvcc_version

  step "开始安装 CUDA ${CUDA_VERSION}"

  # Fetch the installer unless it is already on disk.
  download_package

  # Private scratch directory with an unpredictable name; removed on exit.
  TEMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/cuda_temp.XXXXXX")
  trap 'rm -rf -- "$TEMP_DIR"' EXIT

  # Capture the installer's help text to see which options it supports.
  step "检查安装包支持的参数"
  sh "$PACKAGE_PATH" --help > "$TEMP_DIR/help.txt" 2>&1

  # Prefer a toolkit-only install when the runfile offers --toolkit.
  if grep -q -- '--toolkit' "$TEMP_DIR/help.txt"; then
    step "安装包支持 --toolkit 参数"
    run_cmd "sh $PACKAGE_PATH --silent --toolkit"
  else
    warning "安装包不支持 --toolkit 参数,尝试完整安装"
    run_cmd "sh $PACKAGE_PATH --silent"
  fi

  # Persist the environment settings, skipping the append when the profile
  # already mentions this CUDA directory.
  step "配置 CUDA 环境变量"
  if ! grep -qF -- "cuda-${CUDA_MAJOR_MINOR}" "$ENV_PROFILE"; then
    # \$PATH / \$LD_LIBRARY_PATH are escaped on purpose: the profile must
    # expand them at login time, not now.
    cat >> "$ENV_PROFILE" << EOF

# CUDA ${CUDA_VERSION}
export PATH=${CUDA_INSTALL_DIR}/bin:\$PATH
export LD_LIBRARY_PATH=${CUDA_INSTALL_DIR}/lib64:\$LD_LIBRARY_PATH
EOF
    step "已添加环境变量到 $ENV_PROFILE"
  else
    step "环境变量已存在,跳过添加"
  fi

  # Apply the settings to the running process. Here the current values must
  # be expanded: an escaped \$PATH would store the literal text "$PATH" and
  # make every external tool used below unreachable.
  export PATH="${CUDA_INSTALL_DIR}/bin:${PATH}"
  # Kept from the original script, which set this instead of sourcing
  # /etc/profile and marked it as a temporary workaround.
  export LC_BYOBU=0
  # ${VAR:+...} is safe under `set -u` and avoids a trailing ':' when the
  # variable was previously unset or empty.
  export LD_LIBRARY_PATH="${CUDA_INSTALL_DIR}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

  # Verify the installation through the release string printed by nvcc.
  step "验证 CUDA 安装"
  if command -v nvcc &> /dev/null; then
    nvcc_version=$(nvcc -V | grep release | awk '{print $5}' | tr -d ',')
    if [[ "$nvcc_version" == *"${CUDA_MAJOR_MINOR}"* ]]; then
      step "CUDA ${CUDA_VERSION} 安装成功"
    else
      error "CUDA 版本不匹配,期望 ${CUDA_VERSION},实际 $nvcc_version"
    fi
  else
    error "nvcc 命令未找到,安装失败"
  fi
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#######################################
# Uninstall CUDA and clean up what the installation left behind.
#
# Order of attempts: the uninstaller shipped in the install directory; else
# the runfile's own --uninstall option; else an uninstall script found by
# extracting the runfile. Afterwards the profile exports, the install
# directory, and any remaining "*cuda-<major.minor>*" paths are removed.
#
# Globals:   CUDA_VERSION, CUDA_MAJOR_MINOR, CUDA_INSTALL_DIR, PACKAGE_PATH,
#            ENV_PROFILE, FORCE (read); TEMP_DIR (written)
# Returns:   0 on completion; helper failures abort via error()
#######################################
uninstall_cuda() {
  local official_uninstaller="${CUDA_INSTALL_DIR}/bin/cuda-uninstaller"
  local uninstall_script=""
  local version_regex
  local leftover

  step "开始卸载 CUDA ${CUDA_VERSION}"

  # Private scratch directory with an unpredictable name; removed on exit.
  TEMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/cuda_temp.XXXXXX")
  trap 'rm -rf -- "$TEMP_DIR"' EXIT

  if [[ -x "$official_uninstaller" ]]; then
    # Preferred path: the uninstaller installed alongside the toolkit.
    step "找到官方卸载脚本: $official_uninstaller"
    step "执行官方卸载程序"
    run_cmd "$official_uninstaller --silent"
  else
    warning "未找到官方卸载脚本,尝试其他方法"

    # Locate the runfile; download it unless --force asked to carry on.
    if [[ -f "$PACKAGE_PATH" ]]; then
      step "找到安装包: $PACKAGE_PATH"
    elif [[ $FORCE -eq 1 ]]; then
      warning "未找到安装包,继续强制卸载"
    else
      step "未找到安装包,开始下载"
      download_package
    fi

    # Only probe the runfile when it really exists. In forced mode it may be
    # absent, and running `sh` on a missing file would abort the script under
    # `set -e` before the manual cleanup below gets a chance to run.
    if [[ -f "$PACKAGE_PATH" ]]; then
      step "检查安装包是否支持 --uninstall 参数"
      sh "$PACKAGE_PATH" --help > "$TEMP_DIR/help.txt" 2>&1

      if grep -q -- '--uninstall' "$TEMP_DIR/help.txt"; then
        step "安装包支持 --uninstall 参数"
        run_cmd "sh $PACKAGE_PATH --silent --uninstall"
      else
        step "安装包不支持 --uninstall 参数,尝试解压查找卸载脚本"

        # Extract the runfile and look for a bundled uninstall script.
        step "解压安装包到 $TEMP_DIR"
        run_cmd "sh $PACKAGE_PATH --extract=$TEMP_DIR"

        uninstall_script=$(find "$TEMP_DIR" -name "uninstall_cuda*" -type f | head -n1)

        if [[ -n "$uninstall_script" ]]; then
          step "找到卸载脚本: $uninstall_script"
          run_cmd "sh $uninstall_script"
        else
          warning "未找到卸载脚本,继续手动清理"
        fi
      fi
    fi
  fi

  # Remove the exports from the profile. The dots of the version are escaped
  # so that sed matches them literally instead of as "any character".
  step "清理环境变量"
  if grep -qF -- "cuda-${CUDA_MAJOR_MINOR}" "$ENV_PROFILE"; then
    version_regex=$(printf '%s' "$CUDA_MAJOR_MINOR" | sed 's/\./\\./g')
    run_cmd "sed -i '/cuda-${version_regex}/d' '$ENV_PROFILE'"
    step "已从 $ENV_PROFILE 移除 CUDA 环境变量"
  else
    step "环境变量已清理"
  fi

  # Remove the install directory. ${VAR:?} aborts instead of running
  # `rm -rf` with an empty path.
  if [[ -d "$CUDA_INSTALL_DIR" ]]; then
    step "删除安装目录: $CUDA_INSTALL_DIR"
    run_cmd "rm -rf -- '${CUDA_INSTALL_DIR:?}'"
  else
    step "安装目录不存在,跳过删除"
  fi

  # Remove remaining paths named after this CUDA version.
  #  - /proc, /sys and /dev are pruned: nothing there can be deleted and
  #    scanning them makes find report errors.
  #  - Matches are pruned after printing, so find does not descend into a
  #    directory that is about to be deleted as a whole.
  #  - find runs in a process substitution, so a non-zero exit status
  #    (unreadable or vanished entries) cannot abort the script through
  #    `set -e` / `pipefail`.
  #  - NUL-delimited output keeps unusual file names intact.
  step "清理残留文件"
  while IFS= read -r -d '' leftover; do
    # Also accept dangling symlinks, which fail the plain -e test.
    if [[ -e "$leftover" || -L "$leftover" ]]; then
      step "删除残留文件: $leftover"
      rm -rf -- "$leftover" 2> /dev/null || warning "无法删除: $leftover"
    fi
  done < <(
    find / \( -path /proc -o -path /sys -o -path /dev \) -prune \
      -o -name "*cuda-${CUDA_MAJOR_MINOR}*" -print0 -prune 2> /dev/null
  )

  step "CUDA ${CUDA_VERSION} 卸载完成"
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Entry point: announce the run, dispatch on the requested action, and
# announce completion.
step "开始 CUDA ${CUDA_VERSION} ${ACTION} 流程"

if [[ "$ACTION" == "install" ]]; then
  install_cuda
elif [[ "$ACTION" == "uninstall" ]]; then
  uninstall_cuda
else
  # Any other value is rejected here.
  error "未知操作: $ACTION"
fi

step "CUDA ${CUDA_VERSION} ${ACTION} 流程完成"
|