ansible-devops/scripts/all-install.sh

94 lines
3.9 KiB
Bash
Raw Permalink Normal View History

2025-12-02 15:21:29 +08:00
#!/bin/bash
# Automated deployment script: syncs files, then runs the individual install scripts.
#
# Strict mode:
#   -e           abort on the first unhandled command failure
#   -u           treat expansion of an unset variable as an error
#   -o pipefail  a pipeline fails when ANY stage fails. Without it,
#                `wget ... | bash` reports success when the download fails,
#                because bash happily "runs" the empty input.
# NOTE(review): with pipefail, a producer killed by SIGPIPE (consumer exits
# before reading all of its input) also fails the pipeline.
set -euo pipefail
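# ========== Logging configuration ==========
# Log file path. The name is fixed (it carries no timestamp); output is appended to it, so earlier runs are not overwritten.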
2025-12-02 15:35:09 +08:00
# Can be overridden from the environment; defaults to /opt/deploy.log.
LOG_FILE="${LOG_FILE:-/opt/deploy.log}"
2025-12-02 15:21:29 +08:00
# Send stdout through tee (echoed to the console and appended to the log
# file), then point stderr at that same stream.
exec 1> >(tee -a "${LOG_FILE}")
exec 2>&1
# ========== Colored output helpers ==========
#######################################
# Print an informational message in green, prefixed with [INFO] and the
# current local time (YYYY-MM-DD HH:MM:SS).
# Arguments: $1 - message text (printed verbatim)
# Outputs:   one line on stdout
#######################################
green_echo() {
  local timestamp
  timestamp=$(date +'%Y-%m-%d %H:%M:%S')
  # The message goes through %s so backslashes in it are not interpreted
  # as escape sequences (echo -e would rewrite e.g. "\n" or "\t").
  printf '\033[32m[INFO] %s %s\033[0m\n' "$timestamp" "${1-}"
}
#######################################
# Print an error message in red, prefixed with [ERROR] and the current
# local time (YYYY-MM-DD HH:MM:SS).
# Arguments: $1 - message text (printed verbatim)
# Outputs:   one line on stdout (kept on stdout, as before, so existing
#            callers that capture it keep working)
#######################################
red_echo() {
  local timestamp
  timestamp=$(date +'%Y-%m-%d %H:%M:%S')
  # The message goes through %s so backslashes in it are not interpreted
  # as escape sequences (echo -e would rewrite e.g. "\n" or "\t").
  printf '\033[31m[ERROR] %s %s\033[0m\n' "$timestamp" "${1-}"
}
# ========== Deployment steps ==========
# 1. Sync the remote /opt directory to the local /opt.
#
# The source host and the SSH password can be supplied through the
# REMOTE_HOST and SSHPASS environment variables; the fallbacks are the values
# this script has always used.
REMOTE_HOST="${REMOTE_HOST:-172.51.4.158}"
# NOTE(review): a root password is still hard-coded as the fallback. Rotate it
# and provide SSHPASS from the environment or a secrets store instead.
sync_password="${SSHPASS:-Zp5#tr6#xm9}"

green_echo "开始同步远程服务器${REMOTE_HOST}的/opt目录..."

# sshpass is required for the non-interactive password login below.
if command -v sshpass >/dev/null 2>&1; then
  green_echo "sshpass已安装,跳过安装"
else
  green_echo "开始安装sshpass..."
  # apt-get rather than apt: apt's command-line interface is not meant for scripts.
  if DEBIAN_FRONTEND=noninteractive apt-get install -y sshpass; then
    green_echo "sshpass安装成功"
  else
    red_echo "sshpass安装失败请检查网络或软件源配置"
    exit 1
  fi
fi

# The password is handed to sshpass through its environment (-e) for this one
# command only, so it does not show up in the process list.
# The remote spec is quoted so the local shell never tries to expand the
# wildcard; it is expanded on the remote side (dotfiles directly under /opt
# are therefore not matched, as before).
# NOTE(review): StrictHostKeyChecking=no disables host key verification.
if SSHPASS="$sync_password" sshpass -e \
  rsync -avzP -e "ssh -o StrictHostKeyChecking=no" \
  "root@${REMOTE_HOST}:/opt/*" /opt/; then
  green_echo "目录同步完成!"
else
  red_echo "目录同步失败,请检查网络、密码或远程主机状态"
  exit 1
fi
# Base URL the install scripts are fetched from; overridable from the environment.
# NOTE(review): the scripts are fetched over plain HTTP and executed without
# any integrity check — consider HTTPS and/or checksum verification.
SCRIPT_BASE_URL="${SCRIPT_BASE_URL:-http://116.205.97.109/scripts}"

#######################################
# Download one install script and run it with bash from /opt/.
# The script is downloaded completely before anything is executed, so a
# failed or truncated download is detected instead of being run (or silently
# skipped). It is still fed to bash on stdin, exactly as with `wget | bash`.
# Globals:   SCRIPT_BASE_URL (read)
# Arguments: $1 - script file name under SCRIPT_BASE_URL
#            $2.. - arguments passed through to the script
# Outputs:   the script's own output; an error line via red_echo on failure
# Returns:   0 on success; 1 if setup or download failed; otherwise the
#            script's exit status
#######################################
run_remote_script() {
  local script_name=${1:?script name required}
  shift
  local url="${SCRIPT_BASE_URL}/${script_name}"
  local tmp_script
  local rc=0

  cd /opt/ || {
    red_echo "无法进入目录:/opt/"
    return 1
  }
  tmp_script=$(mktemp) || {
    red_echo "无法创建临时文件"
    return 1
  }
  # An empty file means the download produced nothing usable.
  if ! wget -qO "$tmp_script" "$url" || [[ ! -s "$tmp_script" ]]; then
    red_echo "脚本下载失败:${url}"
    rm -f -- "$tmp_script"
    return 1
  fi

  bash -s -- "$@" <"$tmp_script" || rc=$?
  rm -f -- "$tmp_script"
  if ((rc != 0)); then
    red_echo "脚本执行失败:${script_name}(退出码 ${rc})"
  fi
  return "$rc"
}

# 2. System optimization
green_echo "执行系统优化脚本..."
run_remote_script system_optimize.sh || exit 1
green_echo "系统优化完成!"

# 3. InfiniBand driver
green_echo "安装IB驱动版本24.10-2.1.8.0Ubuntu22.04..."
run_remote_script ib-drive.sh --install --version "24.10-2.1.8.0" --distro "ubuntu22.04" || exit 1
green_echo "IB驱动安装完成"

# 4. InfiniBand configuration
green_echo "执行IB配置脚本..."
run_remote_script ib.sh --install || exit 1
green_echo "IB配置完成"

# 5. NVIDIA driver
green_echo "安装NVIDIA驱动版本570.124.06..."
run_remote_script nvidia-driver.sh --install --version '570.124.06' || exit 1
green_echo "NVIDIA驱动安装完成"

# 6. NVIDIA Fabric Manager
green_echo "安装NVIDIA Fabric Manager版本570.124.06-1Ubuntu22.04..."
run_remote_script nvidia-fabricmanager.sh --install --distro ubuntu22.04 --version 570_570.124.06-1 || exit 1
green_echo "Fabric Manager安装完成"

# 7. CUDA
green_echo "安装CUDA版本12.8.1_570.124.06..."
run_remote_script cuda.sh --install --version '12.8.1_570.124.06' || exit 1
green_echo "CUDA安装完成"

# 8. NVIDIA DCGM
green_echo "安装NVIDIA DCGM..."
run_remote_script nvidia-dcgm.sh --install || exit 1
green_echo "DCGM安装完成"

# 9. DCGM-Exporter
green_echo "安装DCGM-Exporter..."
run_remote_script dcgm-exporter.sh --install || exit 1
green_echo "DCGM-Exporter安装完成"

# 10. Node-Exporter
green_echo "安装Node-Exporter..."
run_remote_script node-exporter.sh --install || exit 1
green_echo "Node-Exporter安装完成"

# 11. GPU monitoring
green_echo "部署GPU监控..."
run_remote_script deploy_gpu_monitor.sh --install || exit 1
green_echo "GPU监控部署完成"

# Optional: Docker and the NVIDIA container toolkit (uncomment to enable).
# green_echo "安装Docker版本5:20.10.13~3-0~ubuntu-jammy..."
# run_remote_script docker.sh --install --version '5:20.10.13~3-0~ubuntu-jammy' || exit 1
# green_echo "Docker安装完成"
# green_echo "安装NVIDIA容器工具包版本1.17.8-1..."
# run_remote_script nvidia-container-toolkit.sh --install --version '1.17.8-1' || exit 1
# green_echo "NVIDIA容器工具包安装完成"

green_echo "所有任务执行完毕!日志文件已保存至:$LOG_FILE"