# ansible-devops/scripts/gpu-manager.sh
#
# 190 lines
# 5.9 KiB
# Bash
# Raw Permalink Normal View History
#
# 2025-07-05 15:49:53 +08:00
#!/bin/bash
# Colour definitions. The values are kept as literal backslash sequences
# (not real ESC bytes) so that callers may expand them with `echo -e` or
# with printf's %b.
GREEN='\033[1;32m'
RED='\033[1;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset colour

# Logging helpers.
# Arguments: $1 - message text. It is printed verbatim: only the colour
#            codes are escape-expanded (%b), the message itself uses %s so
#            that backslashes in user-supplied text are not re-interpreted.

# Print an informational message to stdout.
log_info() {
  printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "$1"
}

# Print an error message to stderr and terminate the script with status 1.
log_error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2
  exit 1
}

# Print a warning message to stderr.
log_warning() {
  printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "$1" >&2
}
# Default parameters (overwritten by the command-line parser).
ACTION=""
VERSION=""
# Base URL from which the per-component scripts are downloaded.
# NOTE(review): the downloaded scripts are piped into bash by a script that
# insists on running as root, and this URL is plain HTTP, which offers no
# integrity protection - consider HTTPS and/or checksum verification.
SCRIPT_REPO="http://116.205.97.109/scripts"
#SCRIPT_REPO="http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts"
INCLUDE_EXPORTER="no" # exporter components are not installed by default
# Version-combination table.
# Globals read:    VERSION (combination number chosen by the user)
# Globals written: IB_VERSION, NVIDIA_VERSION, CUDA_VERSION,
#                  FABRICMANAGER_VERSION, EXPORTER_VERSION
# Any VERSION other than "1" or "2" is reported through log_error.
define_versions() {
  case "$VERSION" in
    1)
      # Combination 1: CUDA 12.6.3 + NVIDIA 565.57.01
      IB_VERSION="23.10-1.1.9.0"
      NVIDIA_VERSION="565.57.01"
      CUDA_VERSION="12.6.3_560.35.05"
      FABRICMANAGER_VERSION="565_565.57.01-1"
      EXPORTER_VERSION="1.0.0"
      ;;
    2)
      # Combination 2: CUDA 12.8.1 + NVIDIA 570.124.06
      IB_VERSION="23.10-1.1.9.0"
      NVIDIA_VERSION="570.124.06"
      CUDA_VERSION="12.8.1_570.124.06"
      FABRICMANAGER_VERSION="570_570.124.06-1"
      EXPORTER_VERSION="1.0.0"
      ;;
    *)
      log_error "不支持的版本组合: $VERSION。请选择 1 或 2"
      ;;
  esac
}
# Print a summary of the selected version combination to stdout.
# Globals read: GREEN, YELLOW, NC, IB_VERSION, NVIDIA_VERSION, CUDA_VERSION,
#               FABRICMANAGER_VERSION, INCLUDE_EXPORTER
show_version_info() {
  # Horizontal rule drawn above and below the component list.
  local rule="${GREEN}========================================${NC}"

  # %b expands backslash escapes the same way `echo -e` does; the format is
  # re-applied to every argument, so each argument becomes one output line.
  printf '%b\n' \
    "\n${YELLOW}您当前选择的组合版本如下:${NC}" \
    "${rule}" \
    "${YELLOW}组件1: IB驱动${NC} ${GREEN}版本: ${IB_VERSION}${NC}" \
    "${YELLOW}组件2: NVIDIA驱动${NC} ${GREEN}版本: ${NVIDIA_VERSION}${NC}" \
    "${YELLOW}组件3: CUDA工具包${NC} ${GREEN}版本: ${CUDA_VERSION}${NC}" \
    "${YELLOW}组件4: FabricManager${NC} ${GREEN}版本: ${FABRICMANAGER_VERSION}${NC}" \
    "${YELLOW}Exporter组件:${NC} ${GREEN}状态: ${INCLUDE_EXPORTER}${NC}" \
    "${rule}\n"
}
# Download one component script from SCRIPT_REPO and execute it from /opt.
# The script is fetched completely into a temporary file before it is run,
# so a failed or truncated download is never executed. Any failure (cd,
# mktemp, download, non-zero exit of the script) is reported via log_error.
# Arguments: $1 - script file name below SCRIPT_REPO
#            $2.. - arguments handed to the downloaded script
_run_install_step() {
  local script_name=$1
  shift
  local url="${SCRIPT_REPO}/${script_name}"
  local tmp_script
  local status=0

  cd /opt/ || log_error "无法进入目录 /opt/"
  tmp_script=$(mktemp) || log_error "无法创建临时文件"
  if ! wget -q -O "$tmp_script" "$url"; then
    rm -f -- "$tmp_script"
    log_error "下载脚本失败: ${url}"
  fi
  # Feed the script on stdin (bash -s), as the former `wget | bash` pipeline
  # did, so the downloaded script sees the same $0 and stdin.
  bash -s -- "$@" <"$tmp_script" || status=$?
  rm -f -- "$tmp_script"
  if ((status != 0)); then
    log_error "脚本 ${script_name} 执行失败 (退出码: ${status})"
  fi
}

# Run the installation flow for the selected combination.
# Order: system optimisation, IB driver, NVIDIA driver, CUDA toolkit,
# FabricManager, then (only when INCLUDE_EXPORTER is "yes") nvidia-dcgm,
# dcgm-exporter and node-exporter. The flow stops at the first failing step.
# Globals read: VERSION, SCRIPT_REPO, IB_VERSION, NVIDIA_VERSION,
#               CUDA_VERSION, FABRICMANAGER_VERSION, INCLUDE_EXPORTER
run_install() {
  log_info "开始执行组合$VERSION的安装流程..."

  # System optimisation
  log_info "执行系统优化..."
  _run_install_step system_optimize.sh

  # IB driver
  log_info "安装IB驱动..."
  _run_install_step ib-drive.sh --install --version "$IB_VERSION"

  # NVIDIA driver
  log_info "安装NVIDIA驱动..."
  _run_install_step nvidia-driver.sh --install --version "$NVIDIA_VERSION"

  # CUDA
  log_info "安装CUDA工具包..."
  _run_install_step cuda.sh --install --version "$CUDA_VERSION"

  # FabricManager
  log_info "安装NVIDIA FabricManager..."
  _run_install_step nvidia-fabricmanager.sh --install --version "$FABRICMANAGER_VERSION"

  # Exporter components, only when requested
  if [[ "$INCLUDE_EXPORTER" == "yes" ]]; then
    log_info "安装Exporter组件..."
    log_info "安装nvidia-dcgm..."
    _run_install_step nvidia-dcgm.sh --install
    log_info "安装dcgm-exporter..."
    _run_install_step dcgm-exporter.sh --install
    log_info "安装node-exporter..."
    _run_install_step node-exporter.sh --install
  else
    log_info "跳过Exporter组件的安装"
  fi

  log_info "组合$VERSION的安装已完成!"
}
# Download one component script from SCRIPT_REPO and execute it from /opt.
# The script is fetched completely into a temporary file before it is run,
# so a failed or truncated download is never executed. Failures are reported
# with log_warning and signalled through the return status; the caller
# decides whether to continue.
# Arguments: $1 - script file name below SCRIPT_REPO
#            $2.. - arguments handed to the downloaded script
# Returns:   0 on success, non-zero if any stage failed
_run_uninstall_step() {
  local script_name=$1
  shift
  local url="${SCRIPT_REPO}/${script_name}"
  local tmp_script
  local status=0

  if ! cd /opt/; then
    log_warning "无法进入目录 /opt/ 跳过 ${script_name}"
    return 1
  fi
  if ! tmp_script=$(mktemp); then
    log_warning "无法创建临时文件 跳过 ${script_name}"
    return 1
  fi
  if ! wget -q -O "$tmp_script" "$url"; then
    rm -f -- "$tmp_script"
    log_warning "下载脚本失败: ${url}"
    return 1
  fi
  # Feed the script on stdin (bash -s), as the former `wget | bash` pipeline
  # did, so the downloaded script sees the same $0 and stdin.
  bash -s -- "$@" <"$tmp_script" || status=$?
  rm -f -- "$tmp_script"
  if ((status != 0)); then
    log_warning "脚本 ${script_name} 执行失败 (退出码: ${status})"
  fi
  return "$status"
}

# Run the uninstall flow for the selected combination.
# The four driver/toolkit components are removed in the reverse of their
# installation order (FabricManager, CUDA, NVIDIA driver, IB driver), then
# the exporter components when INCLUDE_EXPORTER is "yes".
# Every step is attempted even if an earlier one failed; if any step failed
# the function ends with log_error instead of the success message.
# Globals read: VERSION, SCRIPT_REPO, IB_VERSION, NVIDIA_VERSION,
#               CUDA_VERSION, FABRICMANAGER_VERSION, INCLUDE_EXPORTER
run_uninstall() {
  local failed=0

  log_info "开始执行组合$VERSION的卸载流程..."

  # FabricManager
  log_info "卸载NVIDIA FabricManager..."
  _run_uninstall_step nvidia-fabricmanager.sh --uninstall --version "$FABRICMANAGER_VERSION" ||
    failed=$((failed + 1))

  # CUDA
  log_info "卸载CUDA工具包..."
  _run_uninstall_step cuda.sh --uninstall --version "$CUDA_VERSION" ||
    failed=$((failed + 1))

  # NVIDIA driver
  log_info "卸载NVIDIA驱动..."
  _run_uninstall_step nvidia-driver.sh --uninstall --version "$NVIDIA_VERSION" ||
    failed=$((failed + 1))

  # IB driver
  log_info "卸载IB驱动..."
  _run_uninstall_step ib-drive.sh --uninstall --version "$IB_VERSION" ||
    failed=$((failed + 1))

  # Exporter components, only when requested.
  # NOTE(review): these run after the drivers and in the same relative order
  # as during installation, i.e. not reversed - confirm this is intended.
  if [[ "$INCLUDE_EXPORTER" == "yes" ]]; then
    log_info "卸载Exporter组件..."
    log_info "卸载nvidia-dcgm..."
    _run_uninstall_step nvidia-dcgm.sh --uninstall || failed=$((failed + 1))
    log_info "卸载dcgm-exporter..."
    _run_uninstall_step dcgm-exporter.sh --uninstall || failed=$((failed + 1))
    log_info "卸载node-exporter..."
    _run_uninstall_step node-exporter.sh --uninstall || failed=$((failed + 1))
  else
    log_info "跳过Exporter组件的卸载"
  fi

  if ((failed > 0)); then
    log_error "组合$VERSION的卸载未完全成功: ${failed} 个步骤失败"
  fi
  log_info "组合$VERSION的卸载已完成!"
}
# Parse the command-line arguments.
# Recognised options:
#   --install / --uninstall   select the action (the last one given wins)
#   --version N | --version=N select the version combination
#   --include=exporter        also handle the exporter components
# Globals written: ACTION, VERSION, INCLUDE_EXPORTER
# Any unknown option, a --version without a value, or a missing action or
# version is reported through log_error.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        ACTION="install"
        ;;
      --uninstall)
        ACTION="uninstall"
        ;;
      --version)
        # Refuse to swallow the next option (or nothing at all) as the value.
        if [[ $# -lt 2 || "$2" == -* ]]; then
          log_error "参数 --version 需要指定版本组合: 1 或 2"
        fi
        VERSION="$2"
        shift
        ;;
      --version=*)
        VERSION="${1#--version=}"
        ;;
      --include=exporter)
        INCLUDE_EXPORTER="yes"
        ;;
      *)
        log_error "未知参数: $1"
        ;;
    esac
    shift
  done

  if [[ -z "$ACTION" ]]; then
    log_error "请指定操作: --install 或 --uninstall"
  fi
  if [[ -z "$VERSION" ]]; then
    log_error "请指定版本组合: --version 1 或 --version 2"
  fi
}
# Entry point: verify root privileges, reset the log file, parse the
# arguments, resolve and display the version combination, then dispatch to
# the install or uninstall flow.
# Arguments: the script's command-line arguments
main() {
  # Root is required; log_error reports the problem otherwise.
  ((EUID == 0)) || log_error "此脚本需要root权限运行请使用sudo执行"

  # Truncate (or create) the log file.
  # NOTE(review): nothing in this script writes to it - presumably the
  # downloaded component scripts do; confirm.
  >/opt/gpu-manager.log

  parse_args "$@"
  define_versions
  show_version_info

  # Every action other than "install" takes the uninstall path.
  case "$ACTION" in
    install) run_install ;;
    *) run_uninstall ;;
  esac
}
# Run the main function, forwarding all command-line arguments.
main "$@"