189 lines
5.9 KiB
Bash
189 lines
5.9 KiB
Bash
|
|
#!/bin/bash
|
|||
|
|
|
|||
|
|
# ANSI color escape sequences. They are stored as literal backslash
# sequences and only become real escape codes when a caller prints them
# with escape interpretation enabled (e.g. `echo -e` or `printf '%b'`).
NC="\033[0m"          # reset all attributes
YELLOW="\033[1;33m"   # bold yellow
RED="\033[1;31m"      # bold red
GREEN="\033[1;32m"    # bold green
|
|||
|
|
|
|||
|
|
# Logging helpers.

# Print an informational message on stdout with a green "[INFO]" tag.
# Arguments: $1 - message text (backslash escapes in it are expanded)
log_info() {
  printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
|
|||
|
|
# Print a fatal error message with a red "[ERROR]" tag and terminate the
# script with exit status 1.
# Arguments: $1 - message text (backslash escapes in it are expanded)
# Outputs:   the message is written to stderr so that it is not mixed into
#            captured or redirected stdout
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1" >&2
  exit 1
}
|
|||
|
|
# Print a warning message on stdout with a yellow "[WARNING]" tag.
# Arguments: $1 - message text (backslash escapes in it are expanded)
log_warning() {
  printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}
|
|||
|
|
|
|||
|
|
# Default settings. ACTION, VERSION and INCLUDE_EXPORTER are filled in
# from the command line by parse_args.
INCLUDE_EXPORTER=no   # exporter components are not handled unless requested
# Base URL the per-component helper scripts are downloaded from.
SCRIPT_REPO='http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts'
VERSION=
ACTION=
|
|||
|
|
|
|||
|
|
# Map the selected bundle number to the concrete component versions.
# Globals:  VERSION (read); IB_VERSION, NVIDIA_VERSION, CUDA_VERSION,
#           FABRICMANAGER_VERSION, EXPORTER_VERSION (written)
# Exits through log_error when VERSION is neither "1" nor "2".
define_versions() {
  case "$VERSION" in
    1)
      # Bundle 1: CUDA 12.6.3 + NVIDIA 565.57.01
      IB_VERSION="23.10-1.1.9.0"
      NVIDIA_VERSION="565.57.01"
      CUDA_VERSION="12.6.3_560.35.05"
      FABRICMANAGER_VERSION="565_565.57.01-1"
      EXPORTER_VERSION="1.0.0"
      ;;
    2)
      # Bundle 2: CUDA 12.8.1 + NVIDIA 570.124.06
      IB_VERSION="23.10-1.1.9.0"
      NVIDIA_VERSION="570.124.06"
      CUDA_VERSION="12.8.1_570.124.06"
      FABRICMANAGER_VERSION="570_570.124.06-1"
      EXPORTER_VERSION="1.0.0"
      ;;
    *)
      log_error "不支持的版本组合: ${VERSION}。请选择 1 或 2"
      ;;
  esac
}
|
|||
|
|
|
|||
|
|
# Print a colored summary of the selected component versions to stdout.
# Globals: IB_VERSION, NVIDIA_VERSION, CUDA_VERSION, FABRICMANAGER_VERSION,
#          INCLUDE_EXPORTER and the color variables (all read)
show_version_info() {
  local rule="${GREEN}========================================${NC}"
  local -a rows=(
    "\n${YELLOW}您当前选择的组合版本如下:${NC}"
    "${rule}"
    "${YELLOW}组件1: IB驱动${NC} ${GREEN}版本: ${IB_VERSION}${NC}"
    "${YELLOW}组件2: NVIDIA驱动${NC} ${GREEN}版本: ${NVIDIA_VERSION}${NC}"
    "${YELLOW}组件3: CUDA工具包${NC} ${GREEN}版本: ${CUDA_VERSION}${NC}"
    "${YELLOW}组件4: FabricManager${NC} ${GREEN}版本: ${FABRICMANAGER_VERSION}${NC}"
    "${YELLOW}Exporter组件:${NC} ${GREEN}状态: ${INCLUDE_EXPORTER}${NC}"
    "${rule}\n"
  )
  local row

  # One printf per row keeps escape handling scoped to a single line.
  for row in "${rows[@]}"; do
    printf '%b\n' "$row"
  done
}
|
|||
|
|
|
|||
|
|
# Download one helper script from SCRIPT_REPO and run it as an install step.
# Arguments: $1 - script file name below SCRIPT_REPO
#            $2.. - arguments handed to the downloaded script
# The script is fetched completely before it is executed, so a failed or
# truncated download is never run. Any failure ends the whole run through
# log_error, because later components build on earlier ones.
# NOTE(review): the scripts are fetched over plain HTTP and executed by
# this (root-only) script without any integrity check - consider HTTPS
# and/or checksums.
_install_step() {
  local script_name=$1
  shift
  local url="${SCRIPT_REPO}/${script_name}"
  local script_file status

  script_file=$(mktemp) || log_error "无法创建临时文件"
  if ! wget -qO "$script_file" "$url" || [[ ! -s "$script_file" ]]; then
    rm -f -- "$script_file"
    log_error "下载脚本失败: ${url}"
  fi

  # Feed the script on stdin, as the original `wget | bash -s` pipeline did.
  bash -s -- "$@" <"$script_file"
  status=$?
  rm -f -- "$script_file"
  if [[ $status -ne 0 ]]; then
    log_error "脚本 ${script_name} 执行失败 (退出码: ${status})"
  fi
}

# Install the selected bundle: system tuning, IB driver, NVIDIA driver,
# CUDA toolkit, FabricManager and, when INCLUDE_EXPORTER is "yes", the
# monitoring exporters.
# Globals: VERSION, SCRIPT_REPO, IB_VERSION, NVIDIA_VERSION, CUDA_VERSION,
#          FABRICMANAGER_VERSION, INCLUDE_EXPORTER (all read)
# Exits with status 1 (via log_error) as soon as one step fails, so the
# completion message is only printed after every step succeeded.
run_install() {
  log_info "开始执行组合${VERSION}的安装流程..."

  # All helper scripts are run with /opt/ as the working directory.
  cd /opt/ || log_error "无法进入目录 /opt/"

  log_info "执行系统优化..."
  _install_step system_optimize.sh

  log_info "安装IB驱动..."
  _install_step ib-drive.sh --install --version "$IB_VERSION"

  log_info "安装NVIDIA驱动..."
  _install_step nvidia-driver.sh --install --version "$NVIDIA_VERSION"

  log_info "安装CUDA工具包..."
  _install_step cuda.sh --install --version "$CUDA_VERSION"

  log_info "安装NVIDIA FabricManager..."
  _install_step nvidia-fabricmanager.sh --install --version "$FABRICMANAGER_VERSION"

  # Exporter components are optional.
  if [[ "$INCLUDE_EXPORTER" == "yes" ]]; then
    log_info "安装Exporter组件..."

    log_info "安装nvidia-dcgm..."
    _install_step nvidia-dcgm.sh --install

    log_info "安装dcgm-exporter..."
    _install_step dcgm-exporter.sh --install

    log_info "安装node-exporter..."
    _install_step node-exporter.sh --install
  else
    log_info "跳过Exporter组件的安装"
  fi

  log_info "组合${VERSION}的安装已完成!"
}
|
|||
|
|
|
|||
|
|
# Download one helper script from SCRIPT_REPO and run it as an uninstall step.
# Arguments: $1 - script file name below SCRIPT_REPO
#            $2.. - arguments handed to the downloaded script
# Returns:   0 on success; 1 (after a log_warning) when the download or the
#            script failed, so the caller can carry on with the next component
# The script is fetched completely before it is executed, so a failed or
# truncated download is never run.
_uninstall_step() {
  local script_name=$1
  shift
  local url="${SCRIPT_REPO}/${script_name}"
  local script_file status

  if ! script_file=$(mktemp); then
    log_warning "无法创建临时文件, 跳过 ${script_name}"
    return 1
  fi
  if ! wget -qO "$script_file" "$url" || [[ ! -s "$script_file" ]]; then
    rm -f -- "$script_file"
    log_warning "下载脚本失败: ${url}"
    return 1
  fi

  # Feed the script on stdin, as the original `wget | bash -s` pipeline did.
  bash -s -- "$@" <"$script_file"
  status=$?
  rm -f -- "$script_file"
  if [[ $status -ne 0 ]]; then
    log_warning "脚本 ${script_name} 执行失败 (退出码: ${status})"
    return 1
  fi
}

# Uninstall the selected bundle in the reverse of the installation order:
# exporters first (only when INCLUDE_EXPORTER is "yes"), then FabricManager,
# CUDA toolkit, NVIDIA driver and IB driver.
# Globals: VERSION, SCRIPT_REPO, IB_VERSION, NVIDIA_VERSION, CUDA_VERSION,
#          FABRICMANAGER_VERSION, INCLUDE_EXPORTER (all read)
# Returns: 0 when every step succeeded, 1 when at least one step failed.
#          Uninstalling stays best-effort: a failing step is reported and
#          the remaining components are still processed.
run_uninstall() {
  local failures=0

  log_info "开始执行组合${VERSION}的卸载流程..."

  # All helper scripts are run with /opt/ as the working directory.
  cd /opt/ || log_error "无法进入目录 /opt/"

  # Exporters were installed last (nvidia-dcgm, dcgm-exporter,
  # node-exporter), so they are removed first and in reverse order.
  if [[ "$INCLUDE_EXPORTER" == "yes" ]]; then
    log_info "卸载Exporter组件..."

    log_info "卸载node-exporter..."
    _uninstall_step node-exporter.sh --uninstall || failures=$((failures + 1))

    log_info "卸载dcgm-exporter..."
    _uninstall_step dcgm-exporter.sh --uninstall || failures=$((failures + 1))

    log_info "卸载nvidia-dcgm..."
    _uninstall_step nvidia-dcgm.sh --uninstall || failures=$((failures + 1))
  else
    log_info "跳过Exporter组件的卸载"
  fi

  log_info "卸载NVIDIA FabricManager..."
  _uninstall_step nvidia-fabricmanager.sh --uninstall --version "$FABRICMANAGER_VERSION" \
    || failures=$((failures + 1))

  log_info "卸载CUDA工具包..."
  _uninstall_step cuda.sh --uninstall --version "$CUDA_VERSION" \
    || failures=$((failures + 1))

  log_info "卸载NVIDIA驱动..."
  _uninstall_step nvidia-driver.sh --uninstall --version "$NVIDIA_VERSION" \
    || failures=$((failures + 1))

  log_info "卸载IB驱动..."
  _uninstall_step ib-drive.sh --uninstall --version "$IB_VERSION" \
    || failures=$((failures + 1))

  if ((failures > 0)); then
    log_warning "组合${VERSION}的卸载已结束, 但有 ${failures} 个步骤失败"
    return 1
  fi

  log_info "组合${VERSION}的卸载已完成!"
}
|
|||
|
|
|
|||
|
|
# Parse the command-line options into global settings.
# Arguments: the script's arguments ("$@")
#   --install | --uninstall      select the action
#   --version N | --version=N    select the version bundle
#   --include=exporter           also handle the exporter components
# Globals:  ACTION, VERSION, INCLUDE_EXPORTER (written)
# Exits through log_error on an unknown option, on --version without a
# value, or when no action / no version bundle was given.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        ACTION="install"
        ;;
      --uninstall)
        ACTION="uninstall"
        ;;
      --version)
        # Refuse to shift past the end or to swallow the next option as
        # the value.
        if [[ $# -lt 2 || "$2" == --* ]]; then
          log_error "参数 --version 缺少取值, 请使用 --version 1 或 --version 2"
        fi
        VERSION="$2"
        shift
        ;;
      --version=*)
        # "=" form, matching the syntax of --include=exporter.
        VERSION="${1#--version=}"
        ;;
      --include=exporter)
        INCLUDE_EXPORTER="yes"
        ;;
      *)
        log_error "未知参数: $1"
        ;;
    esac
    shift
  done

  if [[ -z "$ACTION" ]]; then
    log_error "请指定操作: --install 或 --uninstall"
  fi

  if [[ -z "$VERSION" ]]; then
    log_error "请指定版本组合: --version 1 或 --version 2"
  fi
}
|
|||
|
|
|
|||
|
|
# Entry point: check for root, reset the log file, evaluate the command
# line, show the selected versions and run the requested action.
# Arguments: the script's command-line arguments
main() {
  # Only root may run this script.
  ((EUID == 0)) || log_error "此脚本需要root权限运行,请使用sudo执行"

  # Start with an empty log file. Nothing in this file writes to it;
  # presumably the downloaded helper scripts do - TODO confirm.
  : >/opt/gpu-manager.log

  parse_args "$@"
  define_versions
  show_version_info

  # Anything other than "install" is treated as an uninstall request.
  case "$ACTION" in
    install) run_install ;;
    *) run_uninstall ;;
  esac
}
|
|||
|
|
|
|||
|
|
# 执行主函数
|
|||
|
|
# Run the entry point, forwarding every command-line argument unchanged.
main "$@"
|