#!/bin/bash
#
# Installs or uninstalls a fixed, versioned combination of the IB driver,
# NVIDIA driver, CUDA toolkit and NVIDIA FabricManager, plus an optional set
# of exporter components, by running per-component scripts fetched from
# SCRIPT_REPO.
#
# Usage:
#   <this script> --install|--uninstall --version 1|2 [--include=exporter]
#
# Must be run as root.
#
# SECURITY NOTE(review): the component scripts are downloaded over plain HTTP
# and executed as root without any integrity check. Consider HTTPS and/or a
# checksum/signature verification step.

# Colour definitions (escape sequences are interpreted by echo -e / printf %b).
GREEN='\033[1;32m'
RED='\033[1;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset colour

# Logging helpers.
# The message is printed with %s so backslashes in it (for example in a
# user-supplied --version value) are never interpreted as escapes.

# Print an informational message to stdout.
log_info() { printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "$1"; }

# Print an error message to stderr and terminate the script with status 1.
log_error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2
  exit 1
}

# Print a warning message to stderr.
log_warning() { printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "$1" >&2; }

# Default parameters.
ACTION=""
VERSION=""
SCRIPT_REPO="http://116.205.97.109/scripts"
#SCRIPT_REPO="http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts"
INCLUDE_EXPORTER="no" # exporter components are not handled by default

#######################################
# Map the selected combination number to concrete component versions.
# Globals:   VERSION (read); IB_VERSION, NVIDIA_VERSION, CUDA_VERSION,
#            FABRICMANAGER_VERSION, EXPORTER_VERSION (written)
# Exits with status 1 (via log_error) for any value other than 1 or 2.
#######################################
define_versions() {
  case "$VERSION" in
    1)
      # Combination 1: CUDA 12.6.3 + NVIDIA 565.57.01
      IB_VERSION="23.10-1.1.9.0"
      NVIDIA_VERSION="565.57.01"
      CUDA_VERSION="12.6.3_560.35.05"
      FABRICMANAGER_VERSION="565_565.57.01-1"
      # EXPORTER_VERSION is set but not referenced anywhere else in this script.
      EXPORTER_VERSION="1.0.0"
      ;;
    2)
      # Combination 2: CUDA 12.8.1 + NVIDIA 570.124.06
      IB_VERSION="23.10-1.1.9.0"
      NVIDIA_VERSION="570.124.06"
      CUDA_VERSION="12.8.1_570.124.06"
      FABRICMANAGER_VERSION="570_570.124.06-1"
      EXPORTER_VERSION="1.0.0"
      ;;
    *)
      log_error "不支持的版本组合: $VERSION。请选择 1 或 2"
      ;;
  esac
}

#######################################
# Print the component versions of the selected combination.
# Globals: IB_VERSION, NVIDIA_VERSION, CUDA_VERSION, FABRICMANAGER_VERSION,
#          INCLUDE_EXPORTER (read)
#######################################
show_version_info() {
  echo -e "\n${YELLOW}您当前选择的组合版本如下:${NC}"
  echo -e "${GREEN}========================================${NC}"
  echo -e "${YELLOW}组件1: IB驱动${NC} ${GREEN}版本: ${IB_VERSION}${NC}"
  echo -e "${YELLOW}组件2: NVIDIA驱动${NC} ${GREEN}版本: ${NVIDIA_VERSION}${NC}"
  echo -e "${YELLOW}组件3: CUDA工具包${NC} ${GREEN}版本: ${CUDA_VERSION}${NC}"
  echo -e "${YELLOW}组件4: FabricManager${NC} ${GREEN}版本: ${FABRICMANAGER_VERSION}${NC}"
  echo -e "${YELLOW}Exporter组件:${NC} ${GREEN}状态: ${INCLUDE_EXPORTER}${NC}"
  echo -e "${GREEN}========================================${NC}\n"
}

#######################################
# Download one script from SCRIPT_REPO and run it with bash from /opt/.
#
# The script is downloaded completely into a temporary file before it is
# executed, so a failed or interrupted download can never run a truncated
# script. It is then fed to `bash -s --` on stdin, i.e. it is executed the
# same way as with a `wget -qO- ... | bash -s -- ...` pipeline.
#
# Globals:   SCRIPT_REPO (read)
# Arguments: $1   - script file name below SCRIPT_REPO
#            $2.. - arguments passed through to the script
# Returns:   0 on success; non-zero if /opt/ is not accessible, the download
#            failed or was empty, or the script itself exited non-zero.
#######################################
run_remote_script() {
  local script_name="$1"
  shift
  local url="${SCRIPT_REPO}/${script_name}"
  local tmp_script
  local rc

  if ! cd /opt/; then
    log_warning "无法进入目录 /opt/"
    return 1
  fi

  if ! tmp_script="$(mktemp)"; then
    log_warning "无法创建临时文件"
    return 1
  fi

  # -s: wget can exit 0 and still deliver an empty body; treat that as failure.
  if ! wget -qO "$tmp_script" "$url" || [[ ! -s "$tmp_script" ]]; then
    rm -f -- "$tmp_script"
    log_warning "下载脚本失败: ${url}"
    return 1
  fi

  bash -s -- "$@" <"$tmp_script"
  rc=$?
  rm -f -- "$tmp_script"

  if ((rc != 0)); then
    log_warning "脚本执行失败: ${script_name} (退出码: ${rc})"
  fi
  return "$rc"
}

#######################################
# Install one component and abort the whole run if it fails.
# Arguments: $1   - label used in log messages
#            $2   - script file name below SCRIPT_REPO
#            $3.. - extra arguments appended after --install
#######################################
install_component() {
  local label="$1"
  local script_name="$2"
  shift 2

  log_info "安装${label}..."
  # Later components build on earlier ones, so stop at the first failure.
  run_remote_script "$script_name" --install "$@" \
    || log_error "${label}安装失败,安装流程已中止"
}

#######################################
# Uninstall one component; failures are reported to the caller, not fatal.
# Arguments: $1   - label used in log messages
#            $2   - script file name below SCRIPT_REPO
#            $3.. - extra arguments appended after --uninstall
# Returns:   exit status of run_remote_script
#######################################
uninstall_component() {
  local label="$1"
  local script_name="$2"
  shift 2

  log_info "卸载${label}..."
  run_remote_script "$script_name" --uninstall "$@"
}

#######################################
# Run the install flow for the selected combination.
# Globals: VERSION, IB_VERSION, NVIDIA_VERSION, CUDA_VERSION,
#          FABRICMANAGER_VERSION, INCLUDE_EXPORTER (read)
# Exits with status 1 (via log_error) as soon as any step fails.
#######################################
run_install() {
  log_info "开始执行组合$VERSION的安装流程..."

  # System optimisation.
  log_info "执行系统优化..."
  run_remote_script "system_optimize.sh" \
    || log_error "系统优化失败,安装流程已中止"

  install_component "IB驱动" "ib-drive.sh" --version "$IB_VERSION"
  install_component "NVIDIA驱动" "nvidia-driver.sh" --version "$NVIDIA_VERSION"
  install_component "CUDA工具包" "cuda.sh" --version "$CUDA_VERSION"
  install_component "NVIDIA FabricManager" "nvidia-fabricmanager.sh" \
    --version "$FABRICMANAGER_VERSION"

  # Exporter components (only when requested).
  if [[ "$INCLUDE_EXPORTER" == "yes" ]]; then
    log_info "安装Exporter组件..."
    install_component "nvidia-dcgm" "nvidia-dcgm.sh"
    install_component "dcgm-exporter" "dcgm-exporter.sh"
    install_component "node-exporter" "node-exporter.sh"
  else
    log_info "跳过Exporter组件的安装"
  fi

  log_info "组合$VERSION的安装已完成!"
}

#######################################
# Run the uninstall flow for the selected combination.
#
# Components are removed in the reverse of the install order, so the exporter
# components (installed last) go first. NOTE(review): presumably the DCGM
# services keep the GPU driver in use, which is why they should be removed
# before the driver - confirm against the component scripts.
#
# Uninstall is best-effort: a failing step does not stop the remaining steps,
# but the script exits with status 1 at the end if any step failed.
#
# Globals: VERSION, IB_VERSION, NVIDIA_VERSION, CUDA_VERSION,
#          FABRICMANAGER_VERSION, INCLUDE_EXPORTER (read)
#######################################
run_uninstall() {
  local failed=0

  log_info "开始执行组合$VERSION的卸载流程..."

  # Exporter components (only when requested), reverse of install order.
  if [[ "$INCLUDE_EXPORTER" == "yes" ]]; then
    log_info "卸载Exporter组件..."
    uninstall_component "node-exporter" "node-exporter.sh" \
      || failed=$((failed + 1))
    uninstall_component "dcgm-exporter" "dcgm-exporter.sh" \
      || failed=$((failed + 1))
    uninstall_component "nvidia-dcgm" "nvidia-dcgm.sh" \
      || failed=$((failed + 1))
  else
    log_info "跳过Exporter组件的卸载"
  fi

  uninstall_component "NVIDIA FabricManager" "nvidia-fabricmanager.sh" \
    --version "$FABRICMANAGER_VERSION" || failed=$((failed + 1))
  uninstall_component "CUDA工具包" "cuda.sh" \
    --version "$CUDA_VERSION" || failed=$((failed + 1))
  uninstall_component "NVIDIA驱动" "nvidia-driver.sh" \
    --version "$NVIDIA_VERSION" || failed=$((failed + 1))
  uninstall_component "IB驱动" "ib-drive.sh" \
    --version "$IB_VERSION" || failed=$((failed + 1))

  if ((failed > 0)); then
    log_error "组合$VERSION的卸载未全部成功: ${failed} 个步骤失败"
  fi

  log_info "组合$VERSION的卸载已完成!"
}

#######################################
# Parse command-line arguments.
# Globals:   ACTION, VERSION, INCLUDE_EXPORTER (written)
# Arguments: the script's command-line arguments
# Exits with status 1 (via log_error) on an unknown argument or when the
# action or the version combination is missing. If both --install and
# --uninstall are given, the last one wins.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        ACTION="install"
        ;;
      --uninstall)
        ACTION="uninstall"
        ;;
      --version)
        # ${2-} keeps VERSION empty when the value is missing; the check
        # after the loop then reports it.
        VERSION="${2-}"
        if [[ $# -ge 2 ]]; then
          shift
        fi
        ;;
      --include=exporter)
        INCLUDE_EXPORTER="yes"
        ;;
      *)
        log_error "未知参数: $1"
        ;;
    esac
    shift
  done

  if [[ -z "$ACTION" ]]; then
    log_error "请指定操作: --install 或 --uninstall"
  fi

  if [[ -z "$VERSION" ]]; then
    log_error "请指定版本组合: --version 1 或 --version 2"
  fi
}

#######################################
# Entry point: check privileges, reset the log file, parse arguments and
# dispatch to the install or uninstall flow.
# Arguments: the script's command-line arguments
#######################################
main() {
  # Root is required.
  if [[ $EUID -ne 0 ]]; then
    log_error "此脚本需要root权限运行,请使用sudo执行"
  fi

  # Create/truncate the log file. Nothing in this script writes to it;
  # presumably the downloaded component scripts append to it - TODO confirm.
  : >/opt/gpu-manager.log

  parse_args "$@"
  define_versions
  show_version_info

  if [[ "$ACTION" == "install" ]]; then
    run_install
  else
    run_uninstall
  fi
}

# Run the entry point.
main "$@"