diff --git a/scripts/README.md b/scripts/README.md index 9ea311d..0a10fc7 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -182,3 +182,29 @@ tail -f /opt/gpu-manager.log (2)超威B200:再生龙镜像:10.102.35.99:/nfs/clone.iso 备份路径: /nfs/chaowei-B200-1.7T-img #注意超威机型对再生龙引导镜像对版本有要求,最新版本无法引导。 (3) 技嘉A100:再生龙镜像:10.101.0.86:/nfs/ 备份路径: /nfs/2025-07-15-03-Jijia-A100-960G-img #技嘉A100-磁盘960G-CX7 ``` + + **ubuntu2404:** + ```bash + cd /opt/ + wget https://content.mellanox.com/ofed/MLNX_OFED-24.10-2.1.8.0/MLNX_OFED_LINUX-24.10-2.1.8.0-ubuntu24.04-x86_64.tgz #[ubuntu24.04] + wget https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2404/x86_64/nvidia-fabricmanager-570_570.124.06-1_amd64.deb #[ubuntu24.04] + wget https://cn.download.nvidia.com/tesla/570.124.06/NVIDIA-Linux-x86_64-570.124.06.run #[无版本要求] + wget https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda_12.8.1_570.124.06_linux.run #[无版本要求] + cd /opt/ && git clone http://116.205.97.109:3000/yindun/ansible-devops.git + cd /opt/ansible-devops/scripts/ + + #-----临时替换脚本中的版本号以适配ubuntu24.04 + sed -i -e 's/5.8-6.0.4.2/24.10-2.1.8.0/g' -e 's/22.04/24.04/g' ib-drive.sh && sed -i 's/2204/2404/g' nvidia-fabricmanager.sh + bash system_optimize.sh --install + bash ib-drive.sh --install --version "24.10-2.1.8.0" + bash nvidia-driver.sh --install --version '570.124.06' + bash nvidia-fabricmanager.sh --install --version "570_570.124.06-1" + bash cuda.sh --install --version "12.8.1_570.124.06" + + #安装exporter + cd /opt/ && wget -qO- http://116.205.97.109/scripts/nvidia-dcgm.sh | bash -s -- --install + cd /opt/ && wget -qO- http://116.205.97.109/scripts/dcgm-exporter.sh | bash -s -- --install + cd /opt/ && wget -qO- http://116.205.97.109/scripts/node-exporter.sh | bash -s -- --install + + #修改主机名、内核版本锁定、根分区扩容已集成在初始化脚本中,无须重复执行。 + ```