ansible-devops/scripts/gpu-manager.sh

190 lines
5.9 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# ANSI colour escape sequences used by the logging helpers and the version
# banner. They are stored as literal backslash sequences and only become real
# escape characters when printed with escape interpretation enabled.
GREEN="\033[1;32m"
RED="\033[1;31m"
YELLOW="\033[1;33m"
NC="\033[0m" # resets the colour
# Logging helpers.
#
# Each helper prints a coloured level tag followed by the message in $1.
# Only the colour variables are expanded with escape interpretation (%b);
# the message itself is printed verbatim (%s), so backslashes coming from
# user input (for example an unknown command-line argument) are not
# interpreted as escape sequences.
#
# Globals:   GREEN, RED, YELLOW, NC (read)
# Arguments: $1 - message text
# Outputs:   log_info writes to stdout; log_warning and log_error write to
#            stderr so diagnostics stay separate from regular output
# Returns:   log_error never returns: it terminates the script with status 1
log_info() {
  printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "$1"
}

log_error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2
  exit 1
}

log_warning() {
  printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "$1" >&2
}
# Default parameters (ACTION, VERSION and INCLUDE_EXPORTER are overwritten by
# parse_args).
ACTION=''
VERSION=''
# Base URL the per-component helper scripts are downloaded from.
# NOTE(review): this is plain HTTP and the downloaded scripts are executed by
# a script that insists on running as root -- consider an HTTPS mirror.
SCRIPT_REPO='http://116.205.97.109/scripts'
#SCRIPT_REPO="http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts"
INCLUDE_EXPORTER='no' # exporter components are not installed by default
# Resolve the selected version combination into per-component versions.
#
# Globals:   VERSION (read); IB_VERSION, NVIDIA_VERSION, CUDA_VERSION,
#            FABRICMANAGER_VERSION, EXPORTER_VERSION (written)
# Arguments: none
# Returns:   calls log_error when VERSION is neither "1" nor "2"
define_versions() {
  case "$VERSION" in
    1)
      # Combination 1: CUDA 12.6.3 + NVIDIA 565.57.01
      IB_VERSION="23.10-1.1.9.0"
      NVIDIA_VERSION="565.57.01"
      CUDA_VERSION="12.6.3_560.35.05"
      FABRICMANAGER_VERSION="565_565.57.01-1"
      EXPORTER_VERSION="1.0.0"
      ;;
    2)
      # Combination 2: CUDA 12.8.1 + NVIDIA 570.124.06
      IB_VERSION="23.10-1.1.9.0"
      NVIDIA_VERSION="570.124.06"
      CUDA_VERSION="12.8.1_570.124.06"
      FABRICMANAGER_VERSION="570_570.124.06-1"
      EXPORTER_VERSION="1.0.0"
      ;;
    *)
      log_error "不支持的版本组合: $VERSION。请选择 1 或 2"
      ;;
  esac
}
# Print a coloured summary of the selected component versions.
#
# Globals:   GREEN, YELLOW, NC, IB_VERSION, NVIDIA_VERSION, CUDA_VERSION,
#            FABRICMANAGER_VERSION, INCLUDE_EXPORTER (read)
# Arguments: none
# Outputs:   a blank line, a heading, a ruler, one row per component, a
#            closing ruler and a trailing blank line on stdout
show_version_info() {
  local rule="${GREEN}========================================${NC}"
  # Parallel arrays: labels[i] is shown in yellow, details[i] in green.
  local -a labels=(
    "组件1: IB驱动"
    "组件2: NVIDIA驱动"
    "组件3: CUDA工具包"
    "组件4: FabricManager"
    "Exporter组件:"
  )
  local -a details=(
    "版本: ${IB_VERSION}"
    "版本: ${NVIDIA_VERSION}"
    "版本: ${CUDA_VERSION}"
    "版本: ${FABRICMANAGER_VERSION}"
    "状态: ${INCLUDE_EXPORTER}"
  )
  local idx

  echo -e "\n${YELLOW}您当前选择的组合版本如下:${NC}"
  echo -e "$rule"
  for idx in "${!labels[@]}"; do
    echo -e "${YELLOW}${labels[idx]}${NC} ${GREEN}${details[idx]}${NC}"
  done
  echo -e "${rule}\n"
}
#######################################
# Download one helper script from SCRIPT_REPO and run it with bash.
#
# The script is downloaded completely into a temporary file before it is
# executed, so a failed or interrupted download can never result in an empty
# or truncated script being run. The file is then fed to bash on stdin
# (bash -s), which keeps $0 and positional-argument handling the same as for
# a script piped into bash.
#
# Globals:   SCRIPT_REPO (read)
# Arguments: $1 - script file name below SCRIPT_REPO
#            remaining - arguments passed on to the downloaded script
# Returns:   0 on success; 1 if /opt is not accessible, the temp file cannot
#            be created or the download fails; otherwise the exit status of
#            the downloaded script
# Side effects: changes the working directory of the calling shell to /opt/
#######################################
_run_remote_script() {
  local script_name=$1
  shift
  local url="${SCRIPT_REPO}/${script_name}"
  local tmp_script
  local rc=0

  cd /opt/ || return 1
  tmp_script=$(mktemp) || return 1
  if ! wget -qO "$tmp_script" "$url"; then
    rm -f -- "$tmp_script"
    log_warning "下载脚本失败: ${url}"
    return 1
  fi
  bash -s -- "$@" <"$tmp_script" || rc=$?
  rm -f -- "$tmp_script"
  return "$rc"
}

# Run one installation step; abort the whole script if it fails.
# Arguments: $1 - step label (logged as "<label>..."), $2 - script file name,
#            remaining - arguments for the script
_install_step() {
  local label=$1 script_name=$2
  shift 2
  log_info "${label}..."
  _run_remote_script "$script_name" "$@" \
    || log_error "${label}失败 (${script_name}), 安装流程已中止"
}

# Run one uninstallation step; warn and report failure, but do not abort.
# Arguments: same as _install_step
# Returns:   0 on success, 1 if the step failed
_uninstall_step() {
  local label=$1 script_name=$2
  shift 2
  log_info "${label}..."
  if ! _run_remote_script "$script_name" "$@"; then
    log_warning "${label}失败 (${script_name}), 继续执行后续卸载步骤"
    return 1
  fi
}

#######################################
# Install the selected version combination.
#
# Steps run in order: system optimisation, IB driver, NVIDIA driver, CUDA
# toolkit, FabricManager, then (only when INCLUDE_EXPORTER is "yes")
# nvidia-dcgm, dcgm-exporter and node-exporter. The first failing step aborts
# the script through log_error, so the completion message is only printed
# when every step succeeded.
#
# Globals:   VERSION, IB_VERSION, NVIDIA_VERSION, CUDA_VERSION,
#            FABRICMANAGER_VERSION, INCLUDE_EXPORTER (read)
# Arguments: none
#######################################
run_install() {
  log_info "开始执行组合${VERSION}的安装流程..."

  _install_step "执行系统优化" system_optimize.sh
  _install_step "安装IB驱动" ib-drive.sh --install --version "$IB_VERSION"
  _install_step "安装NVIDIA驱动" nvidia-driver.sh \
    --install --version "$NVIDIA_VERSION"
  _install_step "安装CUDA工具包" cuda.sh --install --version "$CUDA_VERSION"
  _install_step "安装NVIDIA FabricManager" nvidia-fabricmanager.sh \
    --install --version "$FABRICMANAGER_VERSION"

  # Optional monitoring components
  if [[ "$INCLUDE_EXPORTER" == "yes" ]]; then
    log_info "安装Exporter组件..."
    _install_step "安装nvidia-dcgm" nvidia-dcgm.sh --install
    _install_step "安装dcgm-exporter" dcgm-exporter.sh --install
    _install_step "安装node-exporter" node-exporter.sh --install
  else
    log_info "跳过Exporter组件的安装"
  fi

  log_info "组合${VERSION}的安装已完成!"
}

#######################################
# Uninstall the selected version combination.
#
# The core components are removed in the reverse of their installation
# order (FabricManager, CUDA, NVIDIA driver, IB driver). Uninstallation is
# best-effort: a failing step is logged as a warning and the remaining steps
# still run, but the function then returns non-zero instead of reporting
# success.
# NOTE(review): the exporter components are removed after the drivers, which
# is not the strict reverse of the install order -- confirm this is intended.
#
# Globals:   VERSION, IB_VERSION, NVIDIA_VERSION, CUDA_VERSION,
#            FABRICMANAGER_VERSION, INCLUDE_EXPORTER (read)
# Arguments: none
# Returns:   0 if every step succeeded, 1 otherwise
#######################################
run_uninstall() {
  local failed=0

  log_info "开始执行组合${VERSION}的卸载流程..."

  _uninstall_step "卸载NVIDIA FabricManager" nvidia-fabricmanager.sh \
    --uninstall --version "$FABRICMANAGER_VERSION" || failed=1
  _uninstall_step "卸载CUDA工具包" cuda.sh \
    --uninstall --version "$CUDA_VERSION" || failed=1
  _uninstall_step "卸载NVIDIA驱动" nvidia-driver.sh \
    --uninstall --version "$NVIDIA_VERSION" || failed=1
  _uninstall_step "卸载IB驱动" ib-drive.sh \
    --uninstall --version "$IB_VERSION" || failed=1

  # Optional monitoring components
  if [[ "$INCLUDE_EXPORTER" == "yes" ]]; then
    log_info "卸载Exporter组件..."
    _uninstall_step "卸载nvidia-dcgm" nvidia-dcgm.sh --uninstall || failed=1
    _uninstall_step "卸载dcgm-exporter" dcgm-exporter.sh --uninstall || failed=1
    _uninstall_step "卸载node-exporter" node-exporter.sh --uninstall || failed=1
  else
    log_info "跳过Exporter组件的卸载"
  fi

  if (( failed )); then
    log_warning "组合${VERSION}的卸载未全部成功, 请检查上面的警告信息"
    return 1
  fi
  log_info "组合${VERSION}的卸载已完成!"
}
# Parse the command-line arguments into the global settings.
#
# Recognised arguments: --install, --uninstall, --version <value>,
# --include=exporter. Anything else is reported through log_error.
#
# Globals:   ACTION, VERSION, INCLUDE_EXPORTER (written)
# Arguments: the script's command-line arguments
# Returns:   calls log_error when an argument is unknown or when no action
#            or no version was given
parse_args() {
  local arg

  while (( $# > 0 )); do
    # Take the current argument off the list first; an option value, if any,
    # is then the new $1.
    arg=$1
    shift
    case "$arg" in
      --install) ACTION="install" ;;
      --uninstall) ACTION="uninstall" ;;
      --version)
        VERSION="$1"
        shift
        ;;
      --include=exporter) INCLUDE_EXPORTER="yes" ;;
      *) log_error "未知参数: $arg" ;;
    esac
  done

  [[ -n "$ACTION" ]] || log_error "请指定操作: --install 或 --uninstall"
  [[ -n "$VERSION" ]] || log_error "请指定版本组合: --version 1 或 --version 2"
}
# Script entry point.
#
# Requires root, truncates (or creates) /opt/gpu-manager.log, parses the
# arguments, resolves and prints the component versions, then runs the
# install flow when ACTION is "install" and the uninstall flow otherwise.
#
# Globals:   EUID, ACTION (read)
# Arguments: the script's command-line arguments
main() {
  # Refuse to run without root privileges
  (( EUID == 0 )) || log_error "此脚本需要root权限运行请使用sudo执行"

  # Start with an empty log file
  : >/opt/gpu-manager.log

  parse_args "$@"
  define_versions
  show_version_info

  case "$ACTION" in
    install) run_install ;;
    *) run_uninstall ;;
  esac
}
# Run the main function with the script's command-line arguments
main "$@"