This commit is contained in:
joy 2025-07-05 15:49:53 +08:00
parent 5fb837a692
commit 6535633321
19 changed files with 2369 additions and 0 deletions

8
group_vars/all.yaml Normal file
View File

@ -0,0 +1,8 @@
---
# Base path configuration
script_dest: "/opt/ansible-scripts"      # directory the role scripts are copied to
log_base_dir: "/var/log/ansible-deploy"  # root directory for deployment logs

# Ansible connection tuning
# Reuse an already-established SSH connection (kept alive for 60s).
ansible_ssh_common_args: "-o ControlMaster=auto -o ControlPersist=60s"
# Canonical boolean: `yes` is only a boolean under YAML 1.1 implicit typing.
ansible_pipelining: true
ansible_ssh_timeout: 120

12
inventory/prod/prod.ini Normal file
View File

@ -0,0 +1,12 @@
# Production inventory: GPU compute nodes and connection defaults.
[compute_nodes]
gpu-node-01 ansible_host=10.0.0.101 gpu_model="NVIDIA A100"
gpu-node-02 ansible_host=10.0.0.102 gpu_model="NVIDIA H100"

[all_nodes:children]
compute_nodes

[all:vars]
ansible_user=root
# ansible_port / ansible_password are the current names of the legacy
# ansible_ssh_port / ansible_ssh_pass variables.
ansible_port=22
# NOTE(review): never commit a real password here; keep it in an
# ansible-vault encrypted vars file or use SSH keys instead.
ansible_password=xxx

View File

@ -0,0 +1,9 @@
# Deploys every component role to all node groups.
- name: 全量组件部署
  hosts: all_nodes  # group containing every node
  vars:
    operation: "install"  # operation type: install / uninstall
  roles:
    - role: system_init    # base system initialisation
    - role: gpu_driver     # GPU driver (GPU nodes)
    - role: node_exporter  # system monitoring
    - role: dcgm_exporter  # GPU monitoring

0
playbooks/deploy_gpu.yml Normal file
View File

View File

View File

@ -0,0 +1,50 @@
#!/bin/bash
# NVIDIA driver install/uninstall script.
# The {{ ... }} placeholders are template expressions filled from role variables
# and must be rendered before this script is executed.
set -eo pipefail

LOG_DIR="{{ log_base_dir }}/{{ driver.name }}"  # path injected from role variables
LOG_FILE="${LOG_DIR}/install-$(date +%Y%m%d).log"
mkdir -p "$LOG_DIR"
# Mirror everything written to stdout/stderr into the daily log file.
exec > >(tee -a "$LOG_FILE") 2>&1

# Argument parsing (only the operations defined by the role are accepted)
OPERATION=""
VERSION=""
while [[ $# -gt 0 ]]; do
    case "$1" in
        --install)   OPERATION="install" ;;
        --uninstall) OPERATION="uninstall" ;;
        --version)
            # Without this guard the trailing `shift` below fails under
            # `set -e` and the script dies without any message.
            if [[ $# -lt 2 ]]; then
                echo "错误:--version 需要指定版本号" >&2; exit 1
            fi
            VERSION="$2"; shift ;;
        *) echo "错误:未知参数 $1" >&2; exit 1 ;;
    esac
    shift
done
# Install logic (uses the role-specific variables).
# Reads: VERSION, DEFAULT_VERSION, DOWNLOAD_URL and GPU_MODEL (environment).
# Exits 1 when GPU_MODEL is missing or the installed driver does not report it.
install() {
    local DRIVER_VERSION="${VERSION:-$DEFAULT_VERSION}"
    local gpu_names

    # An empty pattern would make the model check below match any output,
    # silently turning the verification into a no-op.
    if [[ -z "${GPU_MODEL:-}" ]]; then
        echo "错误:未设置GPU_MODEL,无法校验GPU型号" >&2; exit 1
    fi

    echo "[$(date)] 开始安装NVIDIA驱动版本${DRIVER_VERSION},型号:${GPU_MODEL}..."
    # Download from the internal mirror
    wget -q "${DOWNLOAD_URL}/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" \
        -O "/tmp/nvidia-driver.run"
    # Silent, non-interactive install
    sh "/tmp/nvidia-driver.run" --silent --no-x-check --no-nouveau-check

    # Strict verification: the expected model must be reported by the driver.
    # Output is captured first so `grep -q` closing the pipe early cannot make
    # the pipeline fail under `pipefail`; -F matches the model literally.
    if ! gpu_names="$(nvidia-smi --query-gpu=name --format=csv,noheader)" \
        || ! grep -qF -- "$GPU_MODEL" <<< "$gpu_names"; then
        echo "错误驱动安装后未识别到目标GPU型号" >&2; exit 1
    fi
}
# Uninstall logic (idempotent: succeeds when the driver is already gone).
uninstall() {
    echo "[$(date)] 开始卸载NVIDIA驱动..."
    # The uninstaller disappears together with the driver; calling it
    # unconditionally would abort under `set -e` on a second run.
    if [[ -x /usr/bin/nvidia-uninstall ]]; then
        /usr/bin/nvidia-uninstall --silent   # official silent uninstaller
    else
        echo "[$(date)] 未找到 /usr/bin/nvidia-uninstall,驱动可能已卸载,跳过"
    fi
    rm -f "/tmp/nvidia-driver.run"           # remove the leftover installer
}
# Main flow (defaults are injected from role variables)
DEFAULT_VERSION="{{ driver.default_version }}"
DOWNLOAD_URL="{{ driver.download_url }}"

# Dispatch on the parsed OPERATION. The previous `install || uninstall`
# ignored it, so --uninstall still ran an install.
case "$OPERATION" in
    install)   install ;;
    uninstall) uninstall ;;
    *) echo "错误:必须指定 --install 或 --uninstall" >&2; exit 1 ;;
esac
exit 0

View File

@ -0,0 +1,42 @@
# Ensure the per-role log directory exists before the script writes to it.
- name: 创建角色专属日志目录
  ansible.builtin.file:
    state: directory
    path: "{{ log_base_dir }}/{{ driver.name }}"
    mode: "0750"
# The driver script contains {{ ... }} placeholders (log_base_dir, driver.*).
# `copy` would ship them verbatim; `template` renders them first.
# NOTE(review): `template` resolves src in the role's templates/ directory;
# confirm the script lives there.
- name: 同步驱动脚本到目标服务器
  template:
    src: "{{ driver.install_script }}"
    dest: "{{ script_dest }}/{{ driver.install_script }}"
    mode: "0755"
    force: true  # always overwrite with the latest script
# Runs the driver script with the flag mapped from `operation`, plus an
# optional --version when target_version is set.
- name: 执行驱动操作(安装/卸载)
  shell: |
    {{ script_dest }}/{{ driver.install_script }} \
    {{ operations[operation] }} \
    {% if target_version is defined and target_version != "" %}--version {{ target_version }}{% endif %}
  register: script_result
  environment:
    GPU_MODEL: "{{ hostvars[inventory_hostname]['gpu_model'] }}"  # host hardware info
  # `retries`/`delay` only take effect together with `until` on ansible-core
  # releases before 2.16; state the success condition explicitly.
  until: script_result is succeeded
  retries: 3   # give up after 3 attempts
  delay: 30    # seconds between attempts
  become: true # run with privilege escalation
# Post-install check. The expression is written as a block scalar because a
# value that starts with a quoted string and continues after the closing quote
# is not valid YAML. The check command's own return code is evaluated too;
# previously only the earlier script's stderr was inspected.
- name: 验证操作结果(安装时)
  when: operation == "install"
  shell: "{{ driver.service_check }}"
  register: driver_check
  changed_when: false
  failed_when: >-
    driver_check.rc != 0
    or 'GPU count: 0' in (script_result.stderr | default(''))
# Posts a JSON record of the operation to the internal logging endpoint.
- name: 记录操作日志(企业级可观测性)
  ansible.builtin.uri:
    url: "http://logging.internal.com/api/ansible"
    method: POST
    body_format: json
    body:
      host: "{{ inventory_hostname }}"
      component: "{{ driver.name }}_driver"
      operation: "{{ operation }}"
      version: "{{ target_version | default(driver.default_version) }}"
      # success when the driver script returned 0, failed otherwise
      status: "{{ (script_result.rc == 0) | ternary('success', 'failed') }}"

View File

@ -0,0 +1,12 @@
# GPU driver role variables (parameters consumed by the driver script)
driver:
  name: 'nvidia'
  default_version: '545.29.06'                             # driver version
  download_url: 'http://repo.internal.com/drivers/nvidia'  # installer download location
  install_script: 'nvidia-install.sh'                      # script file name
  service_check: 'nvidia-smi --list-gpus'                  # post-install verification command

# Mapping from the `operation` variable to the script flag
operations:
  install: '--install'
  uninstall: '--uninstall'

155
scripts/README.md Normal file
View File

@ -0,0 +1,155 @@
<h2 align="center">GPU 环境标准化部署脚本使用说明:</h2>
<p align="center">
<img src="https://img.shields.io/github/languages/code-size/nanchengcyu/TechMindWave-frontend" alt="code size"/>
<img src="https://img.shields.io/badge/ofed-17.0.2-blue" alt="ofed"/>
<img src="https://img.shields.io/badge/NVIDIA-565.57.01-brightgreen" alt="NVIDIA"/>
<img src="https://img.shields.io/badge/fabricmanager-565.57.01-blue" alt="fabricmanager"/>
<img src="https://img.shields.io/badge/CUDA-12.6.3-brightgreen" alt="CUDA"/>
<br>
<img src="https://img.shields.io/badge/Author-王云龙-orange" alt="Author" />
</p>
<hr>
### 一、脚本概述
该脚本旨在简化 GPU 相关应用的安装流程,适用于需要快速部署 GPU 环境的场景。
- **核心功能**
```bash
脚本可批量完成网卡驱动、显卡驱动、fabricmanager 互联管理器、CUDA 工具包、Nvidia-dcgm、DCGM-Exporter、Node-Exporter 等核心组件的安装与卸载操作。
```
- **配置说明**
```bash
用户管理:若需删除 ubuntu 用户,需手动执行相关用户删除命令,并妥善处理该用户关联的数据与权限。
磁盘管理:磁盘分区扩容需通过磁盘管理工具,根据实际需求对磁盘进行分区调整与扩容操作,以满足应用存储需求。
网络配置:网卡重命名需手动修改网络配置文件,根据实际网络环境对网卡名称进行重新定义,确保网络连接正常。
```
- **使用建议**
```bash
新系统推荐使用一键自动安装脚本,可快速、全面地完成 GPU 相关应用的部署,具体使用方法详见文章末尾说明。若系统之前已存在相关安装内容,或需要对各组件进行独立、定制化部署,建议使用单独部署脚本安装。
```
### 二、使用说明
#### 1. 系统初始化
```bash
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/system_optimize.sh|bash
```
#### 2. MLNX_OFED 网络套件安装/卸载
```bash
#支持版本[23.10-1.1.9.0]
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --install --version '23.10-1.1.9.0'
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --uninstall --version '23.10-1.1.9.0'
```
#### 3. Nvidia 显卡驱动安装/卸载
```bash
#支持版本[565.57.01] [570.124.06]
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --install --version '565.57.01'
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --uninstall --version '565.57.01'
```
#### 4. GPU 互联管理器安装/卸载
```bash
#支持版本[565_565.57.01-1] [570_570.124.06-1]
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --install --version '565_565.57.01-1'
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --uninstall --version '565_565.57.01-1'
```
#### 5. NVIDIA CUDA 工具包部署/卸载
```bash
#支持版本[12.6.3_560.35.05] [12.8.1_570.124.06]
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --install --version '12.6.3_560.35.05'
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --uninstall --version '12.6.3_560.35.05'
```
#### 6. dcgm/node exporter 部署/卸载
```bash
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-dcgm.sh | bash -s -- --install
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/dcgm-exporter.sh | bash -s -- --install
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/node-exporter.sh | bash -s -- --install
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-dcgm.sh | bash -s -- --uninstall
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/dcgm-exporter.sh | bash -s -- --uninstall
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/node-exporter.sh | bash -s -- --uninstall
```
#### 7. 批量组件安装/卸载
![Static Badge](https://img.shields.io/badge/组件[1]-orange?style=flat-square)
![Static Badge](https://img.shields.io/badge/mlnx_ofed-23.10.1.1.9.0-brightgreen?style=plastic)
![Static Badge](https://img.shields.io/badge/nvidia_drive-565.57.01-brightgreen?style=plastic)
![Static Badge](https://img.shields.io/badge/cuda-12.6.3.560.35.05-brightgreen?style=plastic)
![Static Badge](https://img.shields.io/badge/fabricmanager-565_565.57.01.1-brightgreen?style=plastic)
```bash
安装:---------------------------------------------------------------------------------------------------------------------------------------------
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/system_optimize.sh|bash
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --install --version '23.10-1.1.9.0'
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --install --version '565.57.01'
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --install --version '12.6.3_560.35.05'
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --install --version '565_565.57.01-1'
卸载:---------------------------------------------------------------------------------------------------------------------------------------------
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh |bash -s -- --uninstall --version '23.10-1.1.9.0'
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --uninstall --version '565.57.01'
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --uninstall --version '12.6.3_560.35.05'
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --uninstall --version '565_565.57.01-1'
```
![Static Badge](https://img.shields.io/badge/组件[2]-orange?style=flat-square)
![Static Badge](https://img.shields.io/badge/mlnx_ofed-23.10.1.1.9.0-brightgreen?style=plastic)
![Static Badge](https://img.shields.io/badge/nvidia_drive-570.124.06-brightgreen?style=plastic)
![Static Badge](https://img.shields.io/badge/cuda-12.8.1.570.124.06-brightgreen?style=plastic)
![Static Badge](https://img.shields.io/badge/fabricmanager-570.124.06.1-brightgreen?style=plastic)
```bash
安装:---------------------------------------------------------------------------------------------------------------------------------------------
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/system_optimize.sh|bash
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --install --version '23.10-1.1.9.0'
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --install --version '570.124.06'
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --install --version '12.8.1_570.124.06'
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --install --version '570_570.124.06-1'
卸载:--------------------------------------------------------------------------------------------------------------------------------------------
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --uninstall --version '23.10-1.1.9.0'
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --uninstall --version '570.124.06'
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --uninstall --version '12.8.1_570.124.06'
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --uninstall --version '570_570.124.06-1'
```
![Static Badge](https://img.shields.io/badge/推荐一键安装脚本-orange?style=flat-square)
```bash
#安装/卸载服务(安装或卸载时间较长,建议放后台执行。)
#组合[1]-----------------------------------------------------------------------------------------------------------------------------------
screen -dmS install_script bash -c "cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/gpu-manager.sh|bash -s -- --install --version 1 --include=exporter > /opt/gpu-manager.log 2>&1";
tail -f /opt/gpu-manager.log
screen -dmS uninstall_script bash -c "cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/gpu-manager.sh|bash -s -- --uninstall --version 1 --include=exporter > /opt/gpu-manager.log 2>&1";
tail -f /opt/gpu-manager.log
#组合[2]-----------------------------------------------------------------------------------------------------------------------------------
screen -dmS install_script bash -c "cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/gpu-manager.sh|bash -s -- --install --version 2 --include=exporter > /opt/gpu-manager.log 2>&1";
tail -f /opt/gpu-manager.log
screen -dmS uninstall_script bash -c "cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/gpu-manager.sh|bash -s -- --uninstall --version 2 --include=exporter > /opt/gpu-manager.log 2>&1";
tail -f /opt/gpu-manager.log
#说明
#version 1 表示安装/卸载七.[1]组件版本mlnx_ofed-23.10.1.1.9.0+nvidia_drive-565.57.01 +cuda-12.6.3.560.35.05 +fabricmanager-565_565.57.01.1
#version 2 表示安装/卸载七.[2]组件版本mlnx_ofed-23.10.1.1.9.0+nvidia_drive-570.124.06+cuda-12.8.1.570.124.06+fabricmanager-570.124.06.1
#--include=exporter 指定该参数,脚本将安装/卸载exporter组件中的相关服务[dcgm-exporter,node-exporter,nvidia-dcgm],默认不安装/卸载。
```

275
scripts/cuda.sh Normal file
View File

@ -0,0 +1,275 @@
#!/bin/bash
# CUDA toolkit install/uninstall script.
set -euo pipefail

# ---- Global defaults (the version-derived ones are recomputed by --version) ----
CUDA_VERSION="12.6"
DRIVER_VERSION="560.35.05"
CUDA_MAJOR_MINOR=$(cut -d. -f1-2 <<< "$CUDA_VERSION")

PACKAGE_NAME="cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run"
PACKAGE_PATH="/opt/${PACKAGE_NAME}"
CUDA_INSTALL_DIR="/usr/local/cuda-${CUDA_MAJOR_MINOR}"

# Download sources: internal mirror first, official site as fallback
INTERNAL_BASE_URL="http://10.101.0.51:5588/cuda-linux"
OFFICIAL_BASE_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers"

TEMP_DIR="/tmp/cuda_temp"
ENV_PROFILE="/etc/profile"
LOG_FILE="/var/log/cuda_manager_$(date +%Y%m%d%H%M%S).log"

# ---- Console colours ----
RED='\033[1;31m'
GREEN='\033[1;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset
# Append a timestamped line to $LOG_FILE (nothing is printed to the console).
log() {
    printf '%s\n' "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$LOG_FILE"
}
# Print a green "==> ..." progress line and record it in the log.
step() {
    local text="==> $1"
    printf '%b\n' "${GREEN}${text}${NC}"
    log "$text"
}
# Print a yellow warning line and record it in the log; does not stop the script.
warning() {
    local text="警告: $1"
    printf '%b\n' "${YELLOW}${text}${NC}"
    log "$text"
}
# Print a red error line, record it in the log, then terminate with status 1.
error() {
    local text="错误: $1"
    printf '%b\n' "${RED}${text}${NC}"
    log "$text"
    exit 1
}
# Announce a command string, evaluate it with all output appended to
# $LOG_FILE, and abort through error() when it fails.
run_cmd() {
    local command_line="$1"
    step "执行: $command_line"
    if ! eval "$command_line" &>> "$LOG_FILE"; then
        error "命令执行失败: $command_line"
    fi
}
# Abort through error() unless the named command can be resolved by the shell.
check_cmd() {
    if ! command -v "$1" &>/dev/null; then
        error "未找到命令: $1"
    fi
}
# Argument parsing
#   --install | --uninstall   required action
#   --version <cuda>_<driver> e.g. 12.6.3_560.35.05; recomputes every
#                             version-derived global
#   --force                   continue an uninstall without the installer package
ACTION=""
FORCE=0
while [[ $# -gt 0 ]]; do
    case "$1" in
        --install)   ACTION="install"; shift ;;
        --uninstall) ACTION="uninstall"; shift ;;
        --version)
            # Under `set -u` a missing value would abort on "$2" with an
            # "unbound variable" message; report it properly instead.
            if [[ $# -lt 2 ]]; then
                error "--version 需要指定版本号,例如 12.6.3_560.35.05"
            fi
            CUSTOM_VERSION="$2"
            if [[ $CUSTOM_VERSION =~ ^([0-9]+\.[0-9]+(\.[0-9]+)?)(_([0-9]+\.[0-9]+\.[0-9]+))$ ]]; then
                CUDA_VERSION="${BASH_REMATCH[1]}"
                DRIVER_VERSION="${BASH_REMATCH[4]}"
                PACKAGE_NAME="cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run"
                PACKAGE_PATH="/opt/${PACKAGE_NAME}"
                CUDA_MAJOR_MINOR=$(echo "$CUDA_VERSION" | cut -d. -f1-2)
                CUDA_INSTALL_DIR="/usr/local/cuda-${CUDA_MAJOR_MINOR}"
                OFFICIAL_BASE_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers"
            else
                error "版本格式错误,应为 x.x.x_y.y.y 或 x.x_y.y.y"
            fi
            shift 2 ;;
        --force) FORCE=1; shift ;;
        *) error "未知参数: $1" ;;
    esac
done
if [[ -z "$ACTION" ]]; then
    error "必须指定 --install 或 --uninstall"
fi
# Make sure the installer exists at $PACKAGE_PATH, downloading it from the
# internal mirror or the official site when necessary. Aborts via error()
# when every source fails.
download_package() {
    step "检查安装包: $PACKAGE_PATH"
    # -s instead of -f: a zero-byte file left by an interrupted download is
    # not a usable package.
    if [[ -s "$PACKAGE_PATH" ]]; then
        step "使用本地安装包"
        return 0
    fi
    step "本地包不存在,开始下载"
    mkdir -p "$(dirname "$PACKAGE_PATH")"

    local urls=(
        "${INTERNAL_BASE_URL}/${PACKAGE_NAME}"
        "${OFFICIAL_BASE_URL}/${PACKAGE_NAME}"
    )
    # `wget -O` creates the target even when the transfer fails, so download
    # to a scratch name and only move a complete file into place.
    local part_file="${PACKAGE_PATH}.part"
    local url
    for url in "${urls[@]}"; do
        step "尝试从 $url 下载"
        if wget -q -O "$part_file" "$url"; then
            mv -f "$part_file" "$PACKAGE_PATH"
            step "下载成功,文件大小: $(du -h "$PACKAGE_PATH" | cut -f1)"
            return 0
        fi
        rm -f "$part_file"
        warning "$url 下载失败"
    done
    error "无法下载安装包,请检查网络或手动放置到 $PACKAGE_PATH"
}
# Install the CUDA toolkit from the runfile, register PATH/LD_LIBRARY_PATH in
# $ENV_PROFILE and verify the result with nvcc. Aborts via error() on failure.
install_cuda() {
    step "开始安装 CUDA ${CUDA_VERSION}"
    download_package

    # Scratch directory, removed when the script exits
    mkdir -p "$TEMP_DIR"
    trap 'rm -rf "$TEMP_DIR"' EXIT

    # Inspect the options supported by the installer. Only the help text
    # matters, so a non-zero exit from --help must not abort the script.
    step "检查安装包支持的参数"
    sh "$PACKAGE_PATH" --help > "$TEMP_DIR/help.txt" 2>&1 || true
    if grep -q -- '--toolkit' "$TEMP_DIR/help.txt"; then
        step "安装包支持 --toolkit 参数"
        run_cmd "sh $PACKAGE_PATH --silent --toolkit"
    else
        warning "安装包不支持 --toolkit 参数,尝试完整安装"
        run_cmd "sh $PACKAGE_PATH --silent"
    fi

    # Register environment variables
    step "配置 CUDA 环境变量"
    if ! grep -q "cuda-${CUDA_MAJOR_MINOR}" "$ENV_PROFILE"; then
        # ${LD_LIBRARY_PATH:-} keeps the profile usable by shells running with
        # `set -u` (including this one) when the variable is not yet defined.
        cat >> "$ENV_PROFILE" << EOF
# CUDA ${CUDA_VERSION}
export PATH=${CUDA_INSTALL_DIR}/bin:\$PATH
export LD_LIBRARY_PATH=${CUDA_INSTALL_DIR}/lib64:\${LD_LIBRARY_PATH:-}
EOF
        step "已添加环境变量到 $ENV_PROFILE"
    else
        step "环境变量已存在,跳过添加"
    fi

    # Load the profile into this shell. /etc/profile and the snippets it pulls
    # in are not written for strict mode, so relax -e/-u while sourcing.
    export LC_BYOBU=0
    set +eu
    # shellcheck disable=SC1091
    source "/etc/profile"
    set -eu

    # Verify the installation
    step "验证 CUDA 安装"
    if command -v nvcc &>/dev/null; then
        nvcc_version=$(nvcc -V | grep release | awk '{print $5}' | tr -d ',')
        if [[ "$nvcc_version" == *"${CUDA_MAJOR_MINOR}"* ]]; then
            step "CUDA ${CUDA_VERSION} 安装成功"
        else
            error "CUDA 版本不匹配,期望 ${CUDA_VERSION},实际 $nvcc_version"
        fi
    else
        error "nvcc 命令未找到,安装失败"
    fi
}
# Uninstall CUDA: prefer the official uninstaller, fall back to the runfile
# (its --uninstall option or an extracted uninstall script), then clean the
# profile entries, the install directory and leftover files.
uninstall_cuda() {
    step "开始卸载 CUDA ${CUDA_VERSION}"
    # Scratch directory, removed when the script exits
    mkdir -p "$TEMP_DIR"
    trap 'rm -rf "$TEMP_DIR"' EXIT

    OFFICIAL_UNINSTALLER="${CUDA_INSTALL_DIR}/bin/cuda-uninstaller"
    if [[ -x "$OFFICIAL_UNINSTALLER" ]]; then
        step "找到官方卸载脚本: $OFFICIAL_UNINSTALLER"
        step "执行官方卸载程序"
        run_cmd "$OFFICIAL_UNINSTALLER --silent"
    else
        warning "未找到官方卸载脚本,尝试其他方法"
        # With --force and no package there is nothing to run; previously the
        # script still executed `sh $PACKAGE_PATH --help` on the missing file
        # and aborted under `set -e`.
        local have_package=1
        if [[ -f "$PACKAGE_PATH" ]]; then
            step "找到安装包: $PACKAGE_PATH"
        elif [[ $FORCE -eq 1 ]]; then
            warning "未找到安装包,继续强制卸载"
            have_package=0
        else
            step "未找到安装包,开始下载"
            download_package
        fi

        if [[ $have_package -eq 1 ]]; then
            step "检查安装包是否支持 --uninstall 参数"
            sh "$PACKAGE_PATH" --help > "$TEMP_DIR/help.txt" 2>&1 || true
            if grep -q -- '--uninstall' "$TEMP_DIR/help.txt"; then
                step "安装包支持 --uninstall 参数"
                run_cmd "sh $PACKAGE_PATH --silent --uninstall"
            else
                step "安装包不支持 --uninstall 参数,尝试解压查找卸载脚本"
                step "解压安装包到 $TEMP_DIR"
                run_cmd "sh $PACKAGE_PATH --extract=$TEMP_DIR"
                UNINSTALL_SCRIPT=$(find "$TEMP_DIR" -name "uninstall_cuda*" -type f | head -n1)
                if [[ -n "$UNINSTALL_SCRIPT" ]]; then
                    step "找到卸载脚本: $UNINSTALL_SCRIPT"
                    run_cmd "sh $UNINSTALL_SCRIPT"
                else
                    warning "未找到卸载脚本,继续手动清理"
                fi
            fi
        fi
    fi

    # Remove the profile entries
    step "清理环境变量"
    if grep -q "cuda-${CUDA_MAJOR_MINOR}" "$ENV_PROFILE"; then
        run_cmd "sed -i '/cuda-${CUDA_MAJOR_MINOR}/d' $ENV_PROFILE"
        step "已从 $ENV_PROFILE 移除 CUDA 环境变量"
    else
        step "环境变量已清理"
    fi

    # Remove the install directory
    if [[ -d "$CUDA_INSTALL_DIR" ]]; then
        step "删除安装目录: $CUDA_INSTALL_DIR"
        run_cmd "rm -rf $CUDA_INSTALL_DIR"
    else
        step "安装目录不存在,跳过删除"
    fi

    # Remove leftover files. find is fed through process substitution so its
    # exit status (non-zero on vanished /proc entries or unreadable paths)
    # cannot abort the script via pipefail; pseudo filesystems are pruned.
    step "清理残留文件"
    local file
    while IFS= read -r file; do
        if [[ -e "$file" ]]; then
            step "删除残留文件: $file"
            rm -rf "$file" 2>/dev/null || warning "无法删除: $file"
        fi
    done < <(find / \( -path /proc -o -path /sys -o -path /dev -o -path /run \) -prune \
                 -o -name "*cuda-${CUDA_MAJOR_MINOR}*" -print 2>/dev/null || true)

    step "CUDA ${CUDA_VERSION} 卸载完成"
}
# Entry point: run the requested action between start/finish banners.
step "开始 CUDA ${CUDA_VERSION} ${ACTION} 流程"
if [[ "$ACTION" == "install" ]]; then
    install_cuda
elif [[ "$ACTION" == "uninstall" ]]; then
    uninstall_cuda
else
    error "未知操作: $ACTION"
fi
step "CUDA ${CUDA_VERSION} ${ACTION} 流程完成"

288
scripts/dcgm-exporter.sh Normal file
View File

@ -0,0 +1,288 @@
#!/bin/bash
# DCGM Exporter install/uninstall script.
set -euo pipefail

# ---- Versions ----
GO_VERSION="1.21.1"
DCGM_EXPORTER_VERSION="4.2.0-4.1.0"

# ---- Paths ----
DCGM_EXPORTER_DIR="/opt/dcgm-exporter"
SERVICE_FILE="/etc/systemd/system/dcgm-exporter.service"
LOG_FILE="/var/log/dcgm-exporter-install_$(date +%Y%m%d%H%M%S).log"

# ---- Console colours ----
RED='\033[1;31m'
GREEN='\033[1;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
# Write a timestamped message to both the console and $LOG_FILE.
log() {
    local stamp
    stamp="[$(date '+%Y-%m-%d %H:%M:%S')]"
    printf '%b\n' "$stamp $*" | tee -a "$LOG_FILE"
}
# Log a message in green.
success() {
    local text="$*"
    log "${GREEN}${text}${NC}"
}
# Log a message in yellow.
warning() {
    local text="$*"
    log "${YELLOW}${text}${NC}"
}
# Log a red error message and terminate the script with status 1.
error() {
    local text="$*"
    log "${RED}✖ 错误: ${text}${NC}"
    exit 1
}
# Run a command string under `timeout`, appending its output to $LOG_FILE.
#   $1  command string (executed with `bash -c`)
#   $2  error message used when the command fails (default: 命令执行失败)
#   $3  timeout accepted by timeout(1), in seconds (default: 30)
# Aborts via error() on timeout or on any non-zero exit status.
run() {
    local cmd="$1"
    local error_msg="${2:-命令执行失败}"
    local timeout="${3:-30}"
    local exit_code=0

    log "→ 执行: $cmd"
    # Capture the status with `|| exit_code=$?`. Reading $? inside
    # `if ! cmd; then` yields the status of the negation (always 0), which
    # made the timeout branch unreachable and reported "退出码: 0".
    timeout "$timeout" bash -c "$cmd" &>> "$LOG_FILE" || exit_code=$?

    if [[ $exit_code -eq 124 ]]; then
        # 124 is the status timeout(1) returns when the time limit was hit
        error "命令超时 (${timeout}秒): $cmd"
    elif [[ $exit_code -ne 0 ]]; then
        error "$error_msg (退出码: $exit_code)"
    fi
}
# Verify that the nvidia-dcgm service is installed and active; aborts via
# error() otherwise. Sets the global DCGM_VERSION on success.
check_dcgm() {
    log "检测DCGM服务状态..."
    # systemd is required
    if ! command -v systemctl &> /dev/null; then
        error "未找到systemctl命令请确保系统支持systemd"
    fi
    # The unit file must exist
    if [[ ! -f "/lib/systemd/system/nvidia-dcgm.service" ]]; then
        error "未找到nvidia-dcgm服务文件请确认DCGM已正确安装"
    fi

    # `systemctl is-active` exits non-zero for every state except "active";
    # the printed state is what matters, so the status is discarded. The old
    # `local status=$(...)` masked the status anyway, which left its
    # follow-up exit-code branch unreachable.
    local status
    status=$(systemctl is-active nvidia-dcgm 2>&1) || true

    if [[ "$status" == "active" ]]; then
        DCGM_VERSION=$(dcgmi version 2>&1 | grep -oP 'Version \K[^\s]+' || echo "未知")
        success "DCGM服务运行中 (版本: $DCGM_VERSION)"
    else
        error "DCGM服务状态异常 ($status),请先执行: systemctl start nvidia-dcgm"
    fi
}
# Install the Go toolchain ($GO_VERSION) under /usr/local/go and write its
# environment to /etc/profile.d/go.sh. Aborts via error() on any failure.
install_go() {
    log "安装Go环境 (版本: $GO_VERSION)..."
    GO_PACKAGE="go${GO_VERSION}.linux-amd64.tar.gz"
    DL_URL="https://golang.google.cn/dl/$GO_PACKAGE"
    TMP_PACKAGE="/tmp/$GO_PACKAGE"

    # Download to a scratch name and rename on success, so an interrupted
    # transfer is never mistaken for a cached package on the next run. The
    # default 30s limit of run() is too short for the tarball; allow 10 minutes.
    if [[ ! -f "$TMP_PACKAGE" ]]; then
        run "wget -qO $TMP_PACKAGE.part $DL_URL && mv -f $TMP_PACKAGE.part $TMP_PACKAGE" "下载Go安装包失败" 600
    fi

    # Extracting over an existing tree mixes files of different releases;
    # remove any previous /usr/local/go first.
    run "rm -rf /usr/local/go" "清理旧的Go安装目录失败"
    run "tar -xzf $TMP_PACKAGE -C /usr/local" "解压Go安装包失败" 300

    # Environment for later shells
    GO_ENV="/etc/profile.d/go.sh"
    cat > "$GO_ENV" <<'EOF'
export GOROOT=/usr/local/go
export GOPATH=/usr/local/gopath
export PATH=$PATH:$GOROOT/bin
export GO111MODULE=on
export GOPROXY=https://goproxy.cn,direct
EOF
    log "→ 配置Go环境变量"
    if ! chmod +x "$GO_ENV" &>> "$LOG_FILE"; then
        error "设置Go环境变量文件权限失败"
    fi

    # Load the environment into the current shell
    log "→ 加载Go环境变量"
    if ! source "$GO_ENV"; then
        error "加载Go环境变量失败"
    fi

    # Verify the toolchain. A failing `go version` must reach the error()
    # below rather than abort silently under `set -e`.
    local go_version
    go_version=$(go version 2>&1) || true
    if [[ "$go_version" == *"go$GO_VERSION"* ]]; then
        success "Go环境安装完成: $go_version"
    else
        error "Go环境验证失败: $go_version"
    fi
}
# Remove an existing DCGM Exporter installation (service, unit file and
# install directory) when one is present.
uninstall_existing() {
    log "检查是否存在旧版本DCGM Exporter..."

    # Nothing installed: report and leave early
    if [[ ! -d "$DCGM_EXPORTER_DIR" && ! -f "$SERVICE_FILE" ]]; then
        success "未发现旧版本,继续安装..."
        return
    fi

    log "发现旧版本,开始卸载..."
    # Stop the service when it is running
    if systemctl is-active --quiet dcgm-exporter; then
        run "systemctl stop dcgm-exporter" "停止现有服务失败"
    fi
    # Disable it when it is enabled
    if systemctl is-enabled --quiet dcgm-exporter; then
        run "systemctl disable dcgm-exporter" "禁用现有服务失败"
    fi
    # Delete the files and let systemd forget the unit
    run "rm -rf $DCGM_EXPORTER_DIR" "删除旧安装目录失败"
    run "rm -f $SERVICE_FILE" "删除服务文件失败"
    run "systemctl daemon-reload" "重新加载systemd失败"
    success "旧版本卸载完成"
}
# Build DCGM Exporter from source, install it under $DCGM_EXPORTER_DIR,
# register a systemd service and wait up to 30 seconds for the metrics
# endpoint. Aborts via error() on any failure.
install_exporter() {
    log "安装DCGM Exporter (版本: $DCGM_EXPORTER_VERSION)..."

    # Load the Go environment
    log "→ 加载Go环境变量"
    if [[ -f "/etc/profile.d/go.sh" ]]; then
        if ! source "/etc/profile.d/go.sh"; then
            error "加载Go环境变量失败"
        fi
    else
        error "未找到Go环境变量配置文件"
    fi
    run "mkdir -p $DCGM_EXPORTER_DIR" "创建工作目录失败"

    # Source tarball, derived from the version variable. The local file name
    # carries a dcgm-exporter- prefix so the /tmp/dcgm-exporter*.tar.gz
    # cleanup glob used by this script matches it.
    DCGM_PACKAGE="${DCGM_EXPORTER_VERSION}.tar.gz"
    DL_URL="https://github.com/NVIDIA/dcgm-exporter/archive/refs/tags/$DCGM_PACKAGE"
    TMP_PACKAGE="/tmp/dcgm-exporter-$DCGM_PACKAGE"
    if [[ ! -f "$TMP_PACKAGE" ]]; then
        # Scratch name + rename: a partial download is never reused. The
        # default 30s limit of run() is too tight for a download; allow 5 minutes.
        run "wget -qO $TMP_PACKAGE.part $DL_URL && mv -f $TMP_PACKAGE.part $TMP_PACKAGE" "下载DCGM Exporter源码失败" 300
    fi

    # Unpack
    run "tar xf $TMP_PACKAGE -C /tmp" "解压DCGM Exporter源码失败" 120
    SOURCE_DIR="/tmp/dcgm-exporter-$DCGM_EXPORTER_VERSION"

    # Build and install (separate make steps keep failures easy to locate)
    log "→ 编译DCGM Exporter"
    if ! cd "$SOURCE_DIR" &>> "$LOG_FILE"; then
        error "进入源码目录失败"
    fi
    if ! make binary &>> "$LOG_FILE"; then
        error "编译DCGM Exporter失败"
    fi
    if ! make install &>> "$LOG_FILE"; then
        error "安装DCGM Exporter失败"
    fi

    # Copy the artefacts into the service directory
    run "cp -a /usr/bin/dcgm-exporter $DCGM_EXPORTER_DIR/" "复制二进制文件失败"
    run "cp -a /etc/dcgm-exporter/default-counters.csv $DCGM_EXPORTER_DIR/" "复制配置文件失败"

    # Generate the systemd unit
    cat > "$SERVICE_FILE" <<EOF
[Unit]
Description=DCGM Exporter
After=network.target nvidia-dcgm.service
[Service]
Type=simple
User=root
ExecStart=${DCGM_EXPORTER_DIR}/dcgm-exporter -f ${DCGM_EXPORTER_DIR}/default-counters.csv -a 0.0.0.0:9411
Restart=always
StandardOutput=file:/var/log/dcgm-exporter.log
StandardError=file:/var/log/dcgm-exporter-error.log
[Install]
WantedBy=multi-user.target
EOF
    run "chmod 644 $SERVICE_FILE" "设置服务文件权限失败"

    # Start the service
    run "systemctl daemon-reload && systemctl enable --now dcgm-exporter.service" "启动DCGM Exporter服务失败"

    # Wait for the metrics endpoint. An explicit flag replaces the old
    # `[[ $i -eq 31 ]] && error`: the loop variable ends at 30, so the timeout
    # never fired, and the failing test made this function return 1 even on
    # success, which terminated the script under `set -e`.
    log "等待服务启动..."
    local started=0
    local attempt
    for attempt in {1..30}; do
        if curl -s -o /dev/null -w "%{http_code}" http://127.0.0.1:9411/metrics | grep -q "200"; then
            success "DCGM Exporter服务启动成功 (http://127.0.0.1:9411/metrics)"
            started=1
            break
        fi
        sleep 1
    done
    if [[ $started -eq 0 ]]; then
        error "服务启动超时,请检查日志"
    fi
}
# Remove the Go toolchain, its profile snippet and the downloaded archives.
clean_go() {
    log "清理Go环境..."
    run "rm -rf /usr/local/go" \
        "删除Go安装目录失败"
    run "rm -f /etc/profile.d/go.sh" \
        "删除Go环境变量配置失败"
    run "rm -rf /tmp/go*.tar.gz /tmp/dcgm-exporter*.tar.gz" \
        "删除临时安装包失败"
    success "Go环境清理完成"
}
# Main flow
log "================= DCGM Exporter安装 =================="
log "开始时间: $(date '+%Y-%m-%d %H:%M:%S')"

# Parse the command line. ${1:-} keeps a missing argument from tripping
# `set -u` ("unbound variable") before the usage message can be printed.
case "${1:-}" in
    "--install")
        ACTION="install"
        ;;
    "--uninstall")
        ACTION="uninstall"
        ;;
    *)
        error "未知参数: ${1:-}\n用法: $0 [--install|--uninstall]"
        ;;
esac

# Run the requested action
case "$ACTION" in
    "install")
        check_dcgm
        install_go
        uninstall_existing
        install_exporter
        clean_go
        ;;
    "uninstall")
        uninstall_existing
        success "卸载完成"
        ;;
esac

# Done
log "================= 操作完成 =================="
log "日志文件: $LOG_FILE"

188
scripts/gpu-manager.sh Normal file
View File

@ -0,0 +1,188 @@
#!/bin/bash
# One-shot manager that installs or uninstalls a fixed combination of GPU
# components by piping the individual component scripts into bash.

# Console colours
RED='\033[1;31m'
GREEN='\033[1;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset

# Logging helpers; log_error also terminates the script with status 1.
log_info() {
    echo -e "${GREEN}[INFO]${NC} $1"
}
log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
    exit 1
}
log_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}

# Defaults
ACTION=""
VERSION=""
INCLUDE_EXPORTER="no" # exporter components are skipped unless requested
SCRIPT_REPO="http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts"
# Map the selected combination ($VERSION) to concrete component versions.
# Sets IB_VERSION, NVIDIA_VERSION, CUDA_VERSION, FABRICMANAGER_VERSION and
# EXPORTER_VERSION; any other selection ends in log_error.
define_versions() {
    case "$VERSION" in
        1)
            # Combination 1: CUDA 12.6.3 + NVIDIA 565.57.01
            IB_VERSION="23.10-1.1.9.0"
            NVIDIA_VERSION="565.57.01"
            CUDA_VERSION="12.6.3_560.35.05"
            FABRICMANAGER_VERSION="565_565.57.01-1"
            EXPORTER_VERSION="1.0.0"
            ;;
        2)
            # Combination 2: CUDA 12.8.1 + NVIDIA 570.124.06
            IB_VERSION="23.10-1.1.9.0"
            NVIDIA_VERSION="570.124.06"
            CUDA_VERSION="12.8.1_570.124.06"
            FABRICMANAGER_VERSION="570_570.124.06-1"
            EXPORTER_VERSION="1.0.0"
            ;;
        *)
            log_error "不支持的版本组合: $VERSION。请选择 1 或 2"
            ;;
    esac
}
# Print the component versions of the selected combination.
show_version_info() {
    local rule="${GREEN}========================================${NC}"
    printf '%b\n' "\n${YELLOW}您当前选择的组合版本如下:${NC}"
    printf '%b\n' "$rule"
    printf '%b\n' "${YELLOW}组件1: IB驱动${NC} ${GREEN}版本: ${IB_VERSION}${NC}"
    printf '%b\n' "${YELLOW}组件2: NVIDIA驱动${NC} ${GREEN}版本: ${NVIDIA_VERSION}${NC}"
    printf '%b\n' "${YELLOW}组件3: CUDA工具包${NC} ${GREEN}版本: ${CUDA_VERSION}${NC}"
    printf '%b\n' "${YELLOW}组件4: FabricManager${NC} ${GREEN}版本: ${FABRICMANAGER_VERSION}${NC}"
    printf '%b\n' "${YELLOW}Exporter组件:${NC} ${GREEN}状态: ${INCLUDE_EXPORTER}${NC}"
    printf '%b\n' "${rule}\n"
}
# Fetch one component script from $SCRIPT_REPO and pipe it into bash.
#   $1   script file name; remaining arguments are passed to the script.
# Returns non-zero when the download or the script itself fails.
_install_component() {
    local script="$1"
    shift
    cd /opt/ || return 1
    wget -qO- "${SCRIPT_REPO}/${script}" | bash -s -- "$@"
    # Check both sides of the pipe: a failed download makes bash read an
    # empty script and exit 0.
    local -a rc=("${PIPESTATUS[@]}")
    [[ ${rc[0]} -eq 0 && ${rc[1]} -eq 0 ]]
}

# Install the selected combination in dependency order. A failed core
# component aborts the run (log_error exits) instead of installing later
# components on top of it and reporting success; the optional exporter
# components only produce a warning.
run_install() {
    log_info "开始执行组合$VERSION的安装流程..."
    # System tuning
    log_info "执行系统优化..."
    _install_component "system_optimize.sh" || log_error "系统优化执行失败"
    # IB driver
    log_info "安装IB驱动..."
    _install_component "ib-drive.sh" --install --version "$IB_VERSION" || log_error "IB驱动安装失败"
    # NVIDIA driver
    log_info "安装NVIDIA驱动..."
    _install_component "nvidia-driver.sh" --install --version "$NVIDIA_VERSION" || log_error "NVIDIA驱动安装失败"
    # CUDA
    log_info "安装CUDA工具包..."
    _install_component "cuda.sh" --install --version "$CUDA_VERSION" || log_error "CUDA工具包安装失败"
    # FabricManager
    log_info "安装NVIDIA FabricManager..."
    _install_component "nvidia-fabricmanager.sh" --install --version "$FABRICMANAGER_VERSION" || log_error "NVIDIA FabricManager安装失败"
    # Exporter components (only when requested)
    if [ "$INCLUDE_EXPORTER" = "yes" ]; then
        log_info "安装Exporter组件..."
        log_info "安装nvidia-dcgm..."
        _install_component "nvidia-dcgm.sh" --install || log_warning "nvidia-dcgm安装失败"
        log_info "安装dcgm-exporter..."
        _install_component "dcgm-exporter.sh" --install || log_warning "dcgm-exporter安装失败"
        log_info "安装node-exporter..."
        _install_component "node-exporter.sh" --install || log_warning "node-exporter安装失败"
    else
        log_info "跳过Exporter组件的安装"
    fi
    log_info "组合$VERSION的安装已完成!"
}
# Fetch one component script from $SCRIPT_REPO and run its uninstall.
#   $1   script file name; remaining arguments are passed to the script.
# Best effort: a failure is reported as a warning and the caller continues.
_uninstall_component() {
    local script="$1"
    shift
    cd /opt/ || { log_warning "无法进入 /opt 目录,跳过 ${script}"; return 1; }
    wget -qO- "${SCRIPT_REPO}/${script}" | bash -s -- "$@"
    local -a rc=("${PIPESTATUS[@]}")
    if [[ ${rc[0]} -ne 0 || ${rc[1]} -ne 0 ]]; then
        log_warning "${script} 卸载失败 (wget: ${rc[0]}, 脚本: ${rc[1]})"
        return 1
    fi
}

# Uninstall the selected combination in the reverse of the install order.
# The exporters are installed last, so they are removed first.
run_uninstall() {
    log_info "开始执行组合$VERSION的卸载流程..."
    local failed=0
    # Exporter components (only when requested)
    if [ "$INCLUDE_EXPORTER" = "yes" ]; then
        log_info "卸载Exporter组件..."
        log_info "卸载node-exporter..."
        _uninstall_component "node-exporter.sh" --uninstall || failed=1
        log_info "卸载dcgm-exporter..."
        _uninstall_component "dcgm-exporter.sh" --uninstall || failed=1
        log_info "卸载nvidia-dcgm..."
        _uninstall_component "nvidia-dcgm.sh" --uninstall || failed=1
    else
        log_info "跳过Exporter组件的卸载"
    fi
    # FabricManager
    log_info "卸载NVIDIA FabricManager..."
    _uninstall_component "nvidia-fabricmanager.sh" --uninstall --version "$FABRICMANAGER_VERSION" || failed=1
    # CUDA
    log_info "卸载CUDA工具包..."
    _uninstall_component "cuda.sh" --uninstall --version "$CUDA_VERSION" || failed=1
    # NVIDIA driver
    log_info "卸载NVIDIA驱动..."
    _uninstall_component "nvidia-driver.sh" --uninstall --version "$NVIDIA_VERSION" || failed=1
    # IB driver
    log_info "卸载IB驱动..."
    _uninstall_component "ib-drive.sh" --uninstall --version "$IB_VERSION" || failed=1

    if [ "$failed" -ne 0 ]; then
        log_warning "组合$VERSION的卸载已结束,但部分组件卸载失败,请检查上面的警告"
    else
        log_info "组合$VERSION的卸载已完成!"
    fi
}
# Parse the command line into ACTION, VERSION and INCLUDE_EXPORTER.
# Ends in log_error on an unknown flag or when the action or version is missing.
parse_args() {
    local opt
    while (( $# )); do
        opt="$1"
        shift
        case "$opt" in
            --install)          ACTION="install" ;;
            --uninstall)        ACTION="uninstall" ;;
            --include=exporter) INCLUDE_EXPORTER="yes" ;;
            --version)
                # The value is the next word; it is empty when absent, which
                # the VERSION check below reports.
                VERSION="${1:-}"
                shift
                ;;
            *)
                log_error "未知参数: $opt"
                ;;
        esac
    done

    [[ -n "$ACTION" ]] || log_error "请指定操作: --install 或 --uninstall"
    [[ -n "$VERSION" ]] || log_error "请指定版本组合: --version 1 或 --version 2"
}
# Entry point: require root, reset the log file, parse the arguments and
# run the requested action.
main() {
    # Root is required
    if (( EUID != 0 )); then
        log_error "此脚本需要root权限运行请使用sudo执行"
    fi
    # Truncate the log file
    : > /opt/gpu-manager.log

    parse_args "$@"
    define_versions
    show_version_info

    case "$ACTION" in
        install) run_install ;;
        *)       run_uninstall ;;
    esac
}

# Run
main "$@"

260
scripts/ib-drive.sh Normal file
View File

@ -0,0 +1,260 @@
#!/bin/bash
# MLNX_OFED (InfiniBand) driver install/uninstall script.
set -e

# Console colours
RED='\033[1;31m'
GREEN='\033[1;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset

# Coloured log helpers; log_error also terminates the script with status 1.
log_info() {
    echo -e "${GREEN}[INFO] $1${NC}"
}
log_error() {
    echo -e "${RED}[ERROR] $1${NC}"
    exit 1
}
log_warning() {
    echo -e "${YELLOW}[WARNING] $1${NC}"
}

# Defaults
ACTION=""
FORCE=0
DRIVER_VERSION="5.8-6.0.4.2"
DISTRO="ubuntu22.04"
ARCH="x86_64"
# Derive the package name, local paths and download URLs from
# DRIVER_VERSION, DISTRO and ARCH.
generate_package_info() {
    local base_name="MLNX_OFED_LINUX-${DRIVER_VERSION}-${DISTRO}-${ARCH}"
    DRIVER_PACKAGE="${base_name}.tgz"
    DRIVER_DIR="/opt/${base_name}"
    PACKAGE_PATH="/opt/${DRIVER_PACKAGE}"
    INTERNAL_URL="http://10.101.0.51:5588/mlnx-ofed/${DRIVER_PACKAGE}"
    OFFICIAL_URL="https://example.com/${DRIVER_PACKAGE}" # placeholder: replace with the real vendor URL
}
# Parse the command line.
#   --install | --uninstall   required action
#   --version <ver>           driver version (default: $DRIVER_VERSION)
#   --force                   sets FORCE=1
# Ends in log_error on an unknown flag or a missing action.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --install)
                ACTION="install"
                shift
                ;;
            --uninstall)
                ACTION="uninstall"
                shift
                ;;
            --version)
                if [[ -z "${2:-}" ]]; then
                    log_error "请指定版本号,如: --version 5.8-6.0.4.2"
                fi
                DRIVER_VERSION="$2"
                shift 2
                ;;
            --force)
                FORCE=1
                shift
                ;;
            *)
                log_error "未知参数: $1"
                ;;
        esac
    done
    if [[ -z "$ACTION" ]]; then
        log_error "请指定操作: --install 或 --uninstall"
    fi
    # Derive the package name and paths once, after all options are known.
    # Previously this only happened inside --version, so running with the
    # default version left DRIVER_PACKAGE, PACKAGE_PATH and DRIVER_DIR unset.
    generate_package_info
}
# Make sure the driver package exists at $PACKAGE_PATH, trying the internal
# mirror first and the official URL second. Ends in log_error when both fail.
download_driver() {
    log_info "开始下载驱动包: $DRIVER_PACKAGE"
    # -s instead of -f: an empty file left by a failed download is not usable
    if [ -s "$PACKAGE_PATH" ]; then
        log_info "使用本地驱动包: $PACKAGE_PATH"
        return 0
    fi

    # `wget -O` creates the target even when the transfer fails; download to
    # a scratch name and move it into place only after success.
    local part_file="${PACKAGE_PATH}.part"
    rm -f "$PACKAGE_PATH"

    log_info "本地包不存在,尝试从内网下载"
    if wget -q -O "$part_file" "$INTERNAL_URL"; then
        mv -f "$part_file" "$PACKAGE_PATH"
        log_info "内网下载成功"
        return 0
    fi
    rm -f "$part_file"

    log_warning "内网下载失败,尝试从官网下载"
    if wget -q -O "$part_file" "$OFFICIAL_URL"; then
        mv -f "$part_file" "$PACKAGE_PATH"
        log_info "官网下载成功"
        return 0
    fi
    rm -f "$part_file"
    log_error "驱动包下载失败,请手动放置到 /opt/"
}
# Install the MLNX_OFED driver from the package at $PACKAGE_PATH, building
# kernel support for the running kernel. Details go to /tmp/mlnx_install.log.
install_driver() {
    log_info "开始安装驱动: $DRIVER_VERSION"
    # Already-installed check (currently disabled)
    #if command -v ibv_devinfo &> /dev/null && [[ $FORCE -eq 0 ]]; then
    # log_warning "检测到驱动已安装,使用 --force 覆盖安装"
    # exit 0
    #fi
    kernel_version=$(uname -r)
    log_info "当前内核版本: $kernel_version"

    log_info "安装依赖包"
    {
        apt update
        apt install -y net-tools bzip2
    } &>> /tmp/mlnx_install.log

    log_info "解压驱动包"
    tar -zxf "$PACKAGE_PATH" -C /opt/

    log_info "执行驱动安装"
    cd "$DRIVER_DIR"
    ./mlnxofedinstall \
        --without-dkms \
        --add-kernel-support \
        --kernel "$kernel_version" \
        --with-fw-update \
        --force &>> /tmp/mlnx_install.log
    sleep 10
}
# Uninstall the MLNX_OFED driver and undo the naming/modprobe changes.
#
# If the unpacked bundle is missing it is downloaded and unpacked again so
# that its uninstall.sh can be run. Afterwards the bundle and archive are
# deleted, openibd is stopped and disabled, the non-comment lines are
# stripped from the two persistent udev rule files, the nvidia-gsp modprobe
# option file is removed and the initramfs is rebuilt.
#
# Fixes over the previous version:
#   * uninstall.sh runs in a subshell, so the script no longer keeps the
#     about-to-be-deleted DRIVER_DIR as its working directory;
#   * udev rule files are only edited when they exist. `sed -i` on a missing
#     file fails, and under `set -e` that aborted the uninstall before the
#     modprobe file was removed and the initramfs was rebuilt.
uninstall_driver() {
    local install_log="/tmp/mlnx_install.log"
    local rules_file

    log_info "开始卸载驱动: $DRIVER_VERSION"
    if [ -d "$DRIVER_DIR" ]; then
        log_info "找到驱动目录: $DRIVER_DIR"
    else
        log_warning "驱动目录不存在,尝试重新下载和解压"
        download_driver # same download logic as the install path
        log_info "解压驱动包"
        tar -zxf "$PACKAGE_PATH" -C /opt/
        if [ ! -d "$DRIVER_DIR" ]; then
            log_error "解压失败,无法找到驱动目录: $DRIVER_DIR"
        fi
        log_info "成功解压驱动包到: $DRIVER_DIR"
    fi

    log_info "执行卸载脚本"
    ( cd "$DRIVER_DIR" && ./uninstall.sh -q -y ) &>> "$install_log" \
        || log_warning "卸载脚本执行失败,尝试手动清理"

    log_info "清理残留文件"
    rm -rf "$DRIVER_DIR" "$PACKAGE_PATH"

    log_info "停止并禁用openibd服务"
    systemctl stop openibd.service &>> "$install_log" || true
    systemctl disable openibd.service &>> "$install_log" || true

    log_info "恢复网卡命名规则"
    for rules_file in \
        /etc/udev/rules.d/70-persistent-ipoib.rules \
        /etc/udev/rules.d/70-persistent-net.rules; do
        if [ -f "$rules_file" ]; then
            # Keep only comment lines, i.e. drop every generated rule.
            sed -i '/^\s*#/!d' "$rules_file"
        fi
    done
    rm -f /etc/modprobe.d/nvidia-gsp.conf
    update-initramfs -u &>> "$install_log"
}
# Generate persistent udev naming rules for the IB devices and restart openibd.
#
# Steps: back up and clear the two rule files, append one infiniband rename
# rule (mlx5_20, mlx5_21, ...) and one net rule (ib0, ib1, ...) per device,
# write the nvidia NVreg_EnableGpuFirmware=0 modprobe option, rebuild the
# initramfs and restart openibd.
#
# The device list is the first column of `ibdev2netdev -v`, excluding lines
# that mention 200G or 25G. It is captured once so that both rule files are
# built from the same snapshot in the same order, and a warning is logged
# when it is empty (previously this produced empty rule files silently).
configure_naming_rules() {
    local install_log="/tmp/mlnx_install.log"
    local ipoib_rules="/etc/udev/rules.d/70-persistent-ipoib.rules"
    local net_rules="/etc/udev/rules.d/70-persistent-net.rules"
    local devices dev
    local mlx_index=20 # first mlx5_<n> suffix handed out
    local ib_index=0   # first ib<n> suffix handed out

    log_info "配置IB网卡命名规则"
    log_info "备份原有规则"
    cp "$ipoib_rules" "${ipoib_rules}.bak" &>> "$install_log" || true
    cp "$net_rules" "${net_rules}.bak" &>> "$install_log" || true

    log_info "清除原有规则"
    sed -i '/^\s*#/!d' "$ipoib_rules" &>> "$install_log" || true
    sed -i '/^\s*#/!d' "$net_rules" &>> "$install_log" || true

    devices=$(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}') || true
    if [ -z "$devices" ]; then
        log_warning "未发现需要配置命名规则的IB设备"
    fi

    log_info "生成IB设备命名规则"
    for dev in $devices; do
        echo "ACTION==\"add\", KERNELS==\"$dev\", SUBSYSTEM==\"infiniband\",PROGRAM=\"rdma_rename %k NAME_FIXED mlx5_$mlx_index\"" >> "$ipoib_rules"
        mlx_index=$((mlx_index + 1))
    done

    log_info "生成网络设备命名规则"
    for dev in $devices; do
        echo "SUBSYSTEM==\"net\", ACTION==\"add\", KERNELS==\"$dev\", NAME=\"ib$ib_index\"" >> "$net_rules"
        ib_index=$((ib_index + 1))
    done

    log_info "配置nvidia选项"
    echo "options nvidia NVreg_EnableGpuFirmware=0" > /etc/modprobe.d/nvidia-gsp.conf
    update-initramfs -u &>> "$install_log"

    log_info "重启openibd服务"
    systemctl restart openibd.service
    sleep 15
}
# Verify the installation: ibv_devinfo must be on PATH (otherwise log_error
# exits), then count the devices whose naming rule appears to be in effect.
check_installation() {
    local applied=0
    local dev mlx_match net_match

    log_info "检查驱动安装结果"
    if ! command -v ibv_devinfo &> /dev/null; then
        log_error "驱动安装失败"
    fi
    log_info "驱动安装成功"

    log_info "检查网卡命名规则"
    # NOTE(review): $dev is column 1 of `ibdev2netdev -v`. If that column is a
    # PCI address rather than an interface name, neither lookup below can
    # match -- confirm against real output.
    for dev in $(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}'); do
        [ -n "$dev" ] || continue
        mlx_match=$(udevadm info -q name -n "$dev" 2>/dev/null | grep "mlx5_" || true)
        net_match=$(ip link show "$dev" | grep "ib[0-9]" || true)
        if [ -z "$mlx_match" ] && [ -z "$net_match" ]; then
            log_warning "网卡 $dev 命名规则未生效"
        else
            applied=$((applied + 1))
        fi
    done

    if [ "$applied" -le 0 ]; then
        log_warning "所有网卡命名规则均未生效,建议重启系统手工配置!"
    else
        log_info "网卡命名规则生效,成功配置 $applied 个网卡"
    fi
}
# Report whether the uninstall left anything behind: the ibv_devinfo command
# and the unpacked driver directory. Only logs; never aborts.
check_uninstallation() {
    log_info "检查卸载结果"

    if command -v ibv_devinfo &> /dev/null; then
        log_warning "驱动命令仍存在,可能需要手动清理"
    else
        log_info "驱动已成功卸载"
    fi

    if [ -d "$DRIVER_DIR" ]; then
        log_warning "驱动目录未完全删除: $DRIVER_DIR"
    else
        log_info "驱动目录已删除"
    fi
}
# Entry point: compute the default package info, parse the arguments and run
# the install or uninstall sequence selected by ACTION.
main() {
    generate_package_info
    parse_args "$@"
    log_info "开始执行 ${ACTION^^} 操作,版本: $DRIVER_VERSION"

    if [[ "$ACTION" == "install" ]]; then
        download_driver
        install_driver
        configure_naming_rules
        check_installation
    elif [[ "$ACTION" == "uninstall" ]]; then
        uninstall_driver
        check_uninstallation
    fi

    log_info "操作完成!"
}
# 执行主函数
main "$@"

View File

@ -0,0 +1,180 @@
#!/bin/bash
##############################################################################################################################
#脚本功能:
#1.口令定期更换策略设置个90天最小密码长度为8位密码过期警告提前7天。
#2.口令复杂度设置密码长度至少为12位包含至少四种字符类型大写字母、小写字母、数字、特殊字符
#3.登录失败处理策略设置登录失败次数为5次锁定时间为10分钟。
#4.登录连接超时默认配置设置登录连接超时时间为10分钟。
#5.日志本地保存时间设置为6个月。
#6.禁止root ssh远程登录
#7.启动日志与审计服务rsyslog和auditd
#8.sshd开启PAM认证
#9.安装系统工具
##############################################################################################################################
# Password-aging policy lines for /etc/login.defs.
LOGIN_DEFS_POLICY_MAX_DAYS="PASS_MAX_DAYS 90"
LOGIN_DEFS_POLICY_MIN_DAYS="PASS_MIN_DAYS 0"
LOGIN_DEFS_POLICY_MIN_LEN="PASS_MIN_LEN 8"
LOGIN_DEFS_POLICY_WARN_AGE="PASS_WARN_AGE 7"

# Rewrite each setting that is already active in /etc/login.defs.
# A key with no active (uncommented) line is left untouched.
echo "正在编辑 /etc/login.defs 文件..."
for login_defs_line in \
    "$LOGIN_DEFS_POLICY_MAX_DAYS" \
    "$LOGIN_DEFS_POLICY_MIN_DAYS" \
    "$LOGIN_DEFS_POLICY_MIN_LEN" \
    "$LOGIN_DEFS_POLICY_WARN_AGE"; do
    login_defs_key="${login_defs_line%% *}" # first word, e.g. PASS_MAX_DAYS
    if grep -q "^${login_defs_key}" /etc/login.defs; then
        sed -i "s/^${login_defs_key}.*/${login_defs_line}/" /etc/login.defs
    fi
done
# Password-complexity policy lines for /etc/security/pwquality.conf.
PWQUALITY_POLICY_MINLEN="minlen = 12"
PWQUALITY_POLICY_MINCLASS="minclass = 4"
PWQUALITY_POLICY_DCREDIT="dcredit = -1"
PWQUALITY_POLICY_UCREDIT="ucredit = -1"
PWQUALITY_POLICY_LCREDIT="lcredit = -1"
PWQUALITY_POLICY_OCREDIT="ocredit = -1"
PWQUALITY_POLICY_FOR_ROOT="enforce_for_root"
PWQUALITY_POLICY_DIFOK="difok = 5"
echo "正在编辑 /etc/security/pwquality.conf 文件配置文件口令复杂度"

# Replace the active "<key> ..." line with the given setting; if there is no
# active line, replace the "# <key> ..." commented default instead. Does
# nothing when neither form is present.
apply_pwquality_line() {
    local wanted="$1"
    local key="${wanted%% *}"
    if grep -q "^${key}" /etc/security/pwquality.conf; then
        sed -i "s/^${key}.*/${wanted}/" /etc/security/pwquality.conf
    elif grep -q "^# ${key}" /etc/security/pwquality.conf; then
        sed -i "s/^# ${key}.*/${wanted}/" /etc/security/pwquality.conf
    fi
}

for pwquality_line in \
    "$PWQUALITY_POLICY_MINLEN" \
    "$PWQUALITY_POLICY_MINCLASS" \
    "$PWQUALITY_POLICY_DCREDIT" \
    "$PWQUALITY_POLICY_UCREDIT" \
    "$PWQUALITY_POLICY_LCREDIT" \
    "$PWQUALITY_POLICY_OCREDIT"; do
    apply_pwquality_line "$pwquality_line"
done

# enforce_for_root is a bare flag: only uncomment it, and only when it is not
# already active.
if ! grep -q "^enforce_for_root" /etc/security/pwquality.conf \
    && grep -q "^# enforce_for_root" /etc/security/pwquality.conf; then
    sed -i "s/^# enforce_for_root/${PWQUALITY_POLICY_FOR_ROOT}/" /etc/security/pwquality.conf
fi

apply_pwquality_line "$PWQUALITY_POLICY_DIFOK"
# Make sure the "password requisite pam_pwquality.so" line of a PAM file
# carries try_first_pass and retry=5.
#
#   $1  path of the PAM file to edit
#
# Nothing is changed when the file has no such line. The three sed passes,
# each run only if the previous one succeeded:
#   1. add try_first_pass if it is missing,
#   2. add retry=5 if no retry=<n> is present,
#   3. force any existing retry=<n> to retry=5.
# `grep -E` replaces the obsolescent `egrep`, which prints a warning on
# recent GNU grep releases.
ensure_pwquality_pam_options() {
    local pam_file="$1"

    grep -Eq "^\s*password\s+requisite\s+pam_pwquality.so\s+" "$pam_file" || return 0
    sed -ri '/^\s*password\s+requisite\s+pam_pwquality.so\s+/ { /^\s*password\s+requisite\s+pam_pwquality.so(\s+\S+)*(\s+try_first_pass)(\s+.*)?$/! s/^(\s*password\s+requisite\s+pam_pwquality.so\s+)(.*)$/\1try_first_pass \2/ }' "$pam_file" \
        && sed -ri '/^\s*password\s+requisite\s+pam_pwquality.so\s+/ { /^\s*password\s+requisite\s+pam_pwquality.so(\s+\S+)*(\s+retry=[0-9]+)(\s+.*)?$/! s/^(\s*password\s+requisite\s+pam_pwquality.so\s+)(.*)$/\1retry=5 \2/ }' "$pam_file" \
        && sed -ri "s/(^\s*password\s+requisite\s+pam_pwquality.so(\s+\S+)*\s+)retry=[0-9]+(\s+.*)?$/\1retry=5\3/" "$pam_file"
}

# Update both `system-auth` and `password-auth`.
ensure_pwquality_pam_options /etc/pam.d/system-auth
sleep 2s
ensure_pwquality_pam_options /etc/pam.d/password-auth
# Login-failure lockout: insert the pam_faillock preauth/authfail pair
# (deny=5, unlock_time=600, even_deny_root) before the pam_env.so auth line
# of system-auth and password-auth, unless the authfail line already exists.
echo "正在编辑 /etc/pam.d/password-auth 文件配置密码验证失败处理策略"
for faillock_pam_file in /etc/pam.d/system-auth /etc/pam.d/password-auth; do
    if ! grep -Pq "^auth\s*required\s*pam_faillock.so\s*authfail\s*even_deny_root\s*deny=5\s*unlock_time=600\s*$" "$faillock_pam_file"; then
        sed -ri "/^auth.*pam_env.so$/i auth required pam_faillock.so preauth silent even_deny_root deny=5 unlock_time=600\nauth required pam_faillock.so authfail even_deny_root deny=5 unlock_time=600" "$faillock_pam_file"
    fi
done
# Idle-shell timeout: set `export TMOUT=600` in /etc/profile, replacing an
# existing export or appending one when there is none.
echo "正在编辑 /etc/profile 文件配置终端超时自动登出设置要求针对所有用户自动登退时间为600s"
if ! grep -q "^export TMOUT" /etc/profile; then
    echo "export TMOUT=600" >> /etc/profile
else
    sed -i "s/^export TMOUT.*/export TMOUT=600/" /etc/profile
fi
# Local log retention of six months: set the top-level `rotate` directive in
# /etc/logrotate.conf to 26.
# The substitution is anchored to the start of the line, matching the grep
# guard. The previous unanchored `s/rotate.*/.../` also rewrote every other
# line containing "rotate": comments mentioning logrotate and indented
# per-file `rotate N` settings.
echo "正在编辑 /etc/logrotate.conf 文件设置日志本地保存时间6个月"
if grep -q "^rotate" /etc/logrotate.conf; then
    sed -i "s/^rotate.*/rotate 26/" /etc/logrotate.conf
fi
# Set "<key> <value>" in /etc/ssh/sshd_config.
#
#   $1  option name, e.g. PermitRootLogin
#   $2  option value
#
# Order of preference: rewrite the active line, else rewrite the commented
# default ("#<key> ..."), else append a new line. The append fallback was
# previously missing for PubkeyAuthentication, UseDNS and UsePAM, so those
# options stayed unset when the file had no line for them at all -- and since
# the restart below is gated on "PubkeyAuthentication yes", sshd was then
# never restarted either.
set_sshd_option() {
    local key="$1"
    local value="$2"
    local sshd_config="/etc/ssh/sshd_config"

    if grep -q "^${key}" "$sshd_config"; then
        sed -i "s/^${key}.*/${key} ${value}/" "$sshd_config"
    elif grep -q "^#${key}" "$sshd_config"; then
        sed -i "s/^#${key}.*/${key} ${value}/" "$sshd_config"
    else
        echo "${key} ${value}" >> "$sshd_config"
    fi
}

# Forbid root SSH logins.
# NOTE(review): the inventory added in the same commit connects as root over
# SSH -- confirm another login account exists before this runs.
echo "正在编辑 /etc/ssh/sshd_config 文件禁止root ssh远程登录"
set_sshd_option PermitRootLogin no
set_sshd_option PubkeyAuthentication yes
# Disable reverse DNS lookups in sshd.
set_sshd_option UseDNS no
# Enable PAM authentication in sshd.
set_sshd_option UsePAM yes

# Restart sshd to apply the configuration, but only when public-key
# authentication is enabled and the resulting configuration passes `sshd -t`;
# restarting with a broken config could cut off remote access.
echo "正在重启sshd服务....."
if grep -Pq '^PubkeyAuthentication yes' /etc/ssh/sshd_config; then
    if command -v sshd > /dev/null 2>&1 && ! sshd -t; then
        echo "sshd 配置校验失败,已跳过重启,请检查 /etc/ssh/sshd_config" >&2
    else
        systemctl restart sshd
    fi
fi
sleep 2s
# Bring up the logging and audit services: rsyslog is restarted, then each
# service is started and, only if the start succeeded, enabled at boot.
echo "正在启动rsyslog和auditd服务"
systemctl restart rsyslog.service
if systemctl start rsyslog.service; then
    systemctl enable rsyslog.service
fi
sleep 2s
if systemctl start auditd.service; then
    systemctl enable auditd.service
fi
# Closing reminders for the operator.
echo "请自行修改操作系统默认密码。并做好密码保存。"
echo "已禁止root ssh远程登录请使用scloudadmin账号登录如无法登录请通过ipmi远程控制登录"
# 9. Install the system tools, one dnf transaction per package so that a
#    failing package does not prevent the others from being installed.
echo "安装sysstat ipmitool vim pciutils net-tools工具包"
for tool_pkg in sysstat.x86_64 ipmitool.x86_64 vim pciutils.x86_64 net-tools.x86_64; do
    dnf -y install "$tool_pkg"
done
echo "所有操作已完成。"

177
scripts/node-exporter.sh Normal file
View File

@ -0,0 +1,177 @@
#!/bin/bash
set -euo pipefail
# Global settings
LOG_FILE="/var/log/node_exporter_$(date +%Y%m%d%H%M%S).log"
NODE_EXPORTER_VERSION="1.8.2"
PRIMARY_DOWNLOAD_URL="http://10.101.0.51:5588/node-exporter/node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64.tar.gz"
BACKUP_DOWNLOAD_URL="https://github.com/prometheus/node_exporter/releases/download/v${NODE_EXPORTER_VERSION}/node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64.tar.gz"
LOCAL_PACKAGE_PATH="/opt/node_exporter.tar.gz"

# ANSI colors
GREEN='\033[1;32m'
RED='\033[1;31m'
NC='\033[0m' # reset

# Append a timestamped line to LOG_FILE.
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}

# Announce a step: green on the console, plain in the log.
step() {
    local banner="==> $1"
    printf '%b\n' "${GREEN}${banner}${NC}"
    log "$banner"
}

# Report an error (red on the console, plain in the log) and exit with 1.
error() {
    local banner="错误: $1"
    printf '%b\n' "${RED}${banner}${NC}"
    log "$banner"
    exit 1
}

# Evaluate the command string $1 in the current shell with all of its output
# appended to LOG_FILE; abort through error() when it fails.
run_cmd() {
    step "执行: $1"
    if ! eval "$1" &>> "$LOG_FILE"; then
        error "命令执行失败: $1"
    fi
}
# Probe a URL with a HEAD request (10 s connect timeout).
#   $1  URL to probe
# Returns 0 when curl succeeds and 1 otherwise; curl output goes to LOG_FILE.
test_network_connectivity() {
    local target=$1
    step "测试网络连通性: $target"
    curl -fsSLI --connect-timeout 10 "$target" &>> "$LOG_FILE" || return 1
    return 0
}
# Download a URL to a local file.
#   $1  source URL
#   $2  destination path
# Returns 0 on success and 1 on failure. On failure the destination is
# removed: `wget -O` creates (or truncates) the file even when the transfer
# fails, and the installer treats an existing archive as a valid local
# package on the next run.
download_file() {
    local url=$1
    local dest=$2
    step "下载文件: $url$dest"
    if wget -qO "$dest" "$url" &>> "$LOG_FILE"; then
        return 0
    fi
    rm -f "$dest"
    return 1
}
# Install node_exporter under /opt/node_exporter and run it as a systemd
# service listening on port 10086.
#
# The archive is taken from LOCAL_PACKAGE_PATH when present, otherwise it is
# downloaded from the first reachable of PRIMARY_DOWNLOAD_URL and
# BACKUP_DOWNLOAD_URL.
#
# Fixes over the previous version:
#   * a failed download is reported through error() instead of aborting
#     silently via `set -e`;
#   * leftovers of an earlier install are removed first, so re-running the
#     installer no longer moves the new tree inside the old one and then
#     fails on `mkdir bin`;
#   * the final probe uses 127.0.0.1 (was the typo "127.000.1").
install_node_exporter() {
    step "开始安装 node_exporter ${NODE_EXPORTER_VERSION}"
    # All relative paths below are resolved inside /opt.
    run_cmd "cd /opt"

    if [[ -f "$LOCAL_PACKAGE_PATH" ]]; then
        step "发现本地安装包: $LOCAL_PACKAGE_PATH"
        DOWNLOAD_URL="$LOCAL_PACKAGE_PATH"
    else
        # Pick the first download source that answers.
        if test_network_connectivity "$PRIMARY_DOWNLOAD_URL"; then
            DOWNLOAD_URL="$PRIMARY_DOWNLOAD_URL"
        elif test_network_connectivity "$BACKUP_DOWNLOAD_URL"; then
            DOWNLOAD_URL="$BACKUP_DOWNLOAD_URL"
        else
            error "无法连接到任何下载地址"
        fi
        download_file "$DOWNLOAD_URL" "node_exporter.tar.gz" \
            || error "下载失败: $DOWNLOAD_URL"
    fi

    # Clear the previous install and any half-extracted tree.
    run_cmd "rm -rf /opt/node_exporter /opt/node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64"
    # Unpack, rename the versioned directory and move the binary into bin/.
    run_cmd "tar -zxvf node_exporter.tar.gz"
    run_cmd "mv node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64/ ./node_exporter"
    run_cmd "cd node_exporter/"
    run_cmd "mkdir bin"
    run_cmd "mv node_exporter bin/"

    # systemd unit
    cat > /lib/systemd/system/node_exporter.service <<EOF
[Unit]
Description=node_exporter
Documentation=https://prometheus.io/docs/guides/node-exporter/
After=network.target
[Service]
User=root
Group=root
ExecStart=/opt/node_exporter/bin/node_exporter --web.listen-address=:10086
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF

    run_cmd "systemctl daemon-reload"
    run_cmd "systemctl enable node_exporter"
    run_cmd "systemctl restart node_exporter"
    run_cmd "systemctl status node_exporter"
    # Smoke test against the metrics endpoint.
    run_cmd "curl -I http://127.0.0.1:10086/metrics"
    step "node_exporter 安装成功"
}
# Remove the node_exporter service, its unit file, install directory and
# archive.
# Stopping and disabling are best-effort (`|| true`): when the service is
# already gone those commands fail, and run_cmd would otherwise abort the
# uninstall before any file was removed.
uninstall_node_exporter() {
    step "开始卸载 node_exporter"
    run_cmd "systemctl stop node_exporter || true"
    run_cmd "systemctl disable node_exporter || true"
    run_cmd "rm -f /lib/systemd/system/node_exporter.service"
    # Make systemd forget the removed unit.
    run_cmd "systemctl daemon-reload"
    run_cmd "rm -rf /opt/node_exporter"
    run_cmd "rm -f /opt/node_exporter.tar.gz"
    step "node_exporter 卸载完成"
}
# Command-line handling: exactly one argument, --install or --uninstall.
if [[ $# -ne 1 ]]; then
    error "请使用 --install 或 --uninstall"
fi
ACTION=$1

if [[ "$ACTION" == "--install" ]]; then
    install_node_exporter
elif [[ "$ACTION" == "--uninstall" ]]; then
    uninstall_node_exporter
else
    error "无效的参数,请使用 --install 或 --uninstall"
fi
step "操作完成,日志路径: $LOG_FILE"

164
scripts/nvidia-dcgm.sh Normal file
View File

@ -0,0 +1,164 @@
#!/bin/bash
set -euo pipefail
# Global settings
LOG_FILE="/var/log/nvidia-dcgm_$(date +%Y%m%d%H%M%S).log"

# ANSI colors
GREEN='\033[1;32m'
RED='\033[1;31m'
NC='\033[0m' # reset

# Append a timestamped line to LOG_FILE.
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}

# Announce a step: green on the console, plain in the log.
step() {
    local text="==> $1"
    printf '%b\n' "${GREEN}${text}${NC}"
    log "$text"
}

# Report an error (red on the console, plain in the log) and exit with 1.
error() {
    local text="错误: $1"
    printf '%b\n' "${RED}${text}${NC}"
    log "$text"
    exit 1
}

# Evaluate the command string $1 with its output appended to LOG_FILE;
# abort through error() when it fails.
run_cmd() {
    step "执行: $1"
    if ! eval "$1" &>> "$LOG_FILE"; then
        error "命令执行失败: $1"
    fi
}
# Read the distribution id and version from /etc/os-release into the globals
# OS_ID and OS_VERSION; abort through error() when the file is missing.
# The values fall back to "unknown": under `set -u` a release file without
# ID or VERSION_ID would otherwise kill the script with an "unbound
# variable" message instead of reaching the caller's unsupported-OS error.
detect_os_version() {
    if [[ ! -f /etc/os-release ]]; then
        error "无法检测到操作系统版本"
    fi
    # Sourcing defines ID, VERSION_ID and the other os-release keys as shell variables.
    . /etc/os-release
    OS_ID=${ID:-unknown}
    OS_VERSION=${VERSION_ID:-unknown}
    step "检测到系统: ${OS_ID} ${OS_VERSION}"
}
# Remove any existing DCGM installation and the CUDA apt source file before
# a fresh install.
# Besides the legacy `datacenter-gpu-manager` package this also removes the
# `datacenter-gpu-manager-4-*` packages, which is the family this script's
# installer installs. Each name is removed in its own apt-get call, because
# one unknown name makes the whole call fail without removing anything.
cleanup_dcgm_config() {
    step "清理现有的 DCGM 配置"
    run_cmd "rm -f /etc/apt/sources.list.d/cuda-ubuntu2004-x86_64.list"
    run_cmd "apt-get remove -y datacenter-gpu-manager || true"
    # The pattern is quoted so that apt-get, not the shell, expands it.
    run_cmd "apt-get remove -y 'datacenter-gpu-manager-4-*' || true"
    run_cmd "apt-get autoremove -y || true"
    run_cmd "apt-get autoclean -y || true"
}
# Download and install the cuda-keyring package.
# The .deb is fetched with an explicit `-O` into a fresh temporary directory.
# Without `-O`, wget saves to "<name>.1" when a file of that name already
# exists in the working directory, and dpkg would then install the stale
# file; the working directory might also not be writable.
# NOTE(review): the URL points at the ubuntu2004 repository although the only
# caller is the Ubuntu 22.04 installer -- confirm this is intended.
add_cuda_keyring() {
    local keyring_deb="cuda-keyring_1.0-1_all.deb"
    local keyring_url="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/${keyring_deb}"
    local work_dir

    step "添加 CUDA 仓库密钥"
    work_dir="$(mktemp -d)"
    run_cmd "wget -O '${work_dir}/${keyring_deb}' '${keyring_url}'"
    run_cmd "dpkg -i '${work_dir}/${keyring_deb}'"
    run_cmd "rm -rf '${work_dir}'"
}
# Refresh the apt package index (aborts through run_cmd on failure).
update_package_list() { step "更新包列表"; run_cmd "apt-get update"; }
# Install DCGM on Ubuntu 22.04 and start the nvidia-dcgm service.
#
# Sequence: clean up an earlier install, install the CUDA keyring, add the
# CUDA apt repository, refresh the index, detect the CUDA major version from
# `nvidia-smi`, install datacenter-gpu-manager-4-cuda<major>, enable and
# restart the service, and finally list the GPUs with dcgmi.
#
# Sets the global CUDA_VERSION.
install_dcgm_for_ubuntu_22() {
    step "开始安装 DCGM for Ubuntu 22"
    cleanup_dcgm_config
    add_cuda_keyring
    run_cmd "add-apt-repository \"deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /\" -y"
    update_package_list

    # `|| true` keeps a failing nvidia-smi from aborting the script through
    # `set -e`/`pipefail` right here, which made the explicit error message
    # below unreachable in exactly the case it was written for.
    CUDA_VERSION=$(nvidia-smi 2>> "$LOG_FILE" | sed -E -n 's/.*CUDA Version: ([0-9]+)[.].*/\1/p') || true
    if [[ -z "$CUDA_VERSION" ]]; then
        error "无法检测到 CUDA 版本"
    fi

    # apt-get rather than apt: apt's command line is not meant for scripts.
    run_cmd "apt-get install --install-recommends datacenter-gpu-manager-4-cuda${CUDA_VERSION} -y"

    run_cmd "systemctl --now enable nvidia-dcgm"
    run_cmd "systemctl restart nvidia-dcgm"
    run_cmd "systemctl status nvidia-dcgm"
    # Smoke test: list the GPUs DCGM can see.
    run_cmd "dcgmi discovery -l"
    step "DCGM 安装成功"
}
# Uninstall DCGM: stop and disable the service, remove the packages, delete
# the CUDA keyring and apt source file, and refresh the package index.
# The installer in this script installs `datacenter-gpu-manager-4-cuda<N>`,
# so removing only the legacy `datacenter-gpu-manager` name left DCGM
# installed. Both the legacy name and the `-4-*` family are removed now, each
# in its own best-effort apt-get call.
uninstall_dcgm() {
    step "开始卸载 DCGM"
    run_cmd "systemctl stop nvidia-dcgm || true"
    run_cmd "systemctl disable nvidia-dcgm || true"

    run_cmd "apt-get remove -y datacenter-gpu-manager || true"
    # The pattern is quoted so that apt-get, not the shell, expands it.
    run_cmd "apt-get remove -y 'datacenter-gpu-manager-4-*' || true"
    run_cmd "apt-get autoremove -y || true"
    run_cmd "apt-get autoclean -y || true"

    run_cmd "rm -f /usr/share/keyrings/cuda-archive-keyring.gpg"
    run_cmd "rm -f /etc/apt/sources.list.d/cuda-ubuntu2004-x86_64.list"
    update_package_list
    step "DCGM 卸载完成"
}
# Main flow: detect the OS, then run the single requested action.
# Installation is only supported on Ubuntu 22.04.
step "NVIDIA DCGM 安装脚本启动"
detect_os_version
if [[ $# -ne 1 ]]; then
    error "请使用 --install 或 --uninstall"
fi
ACTION=$1

if [[ "$ACTION" == "--install" ]]; then
    if [[ "$OS_ID-$OS_VERSION" == "ubuntu-22.04" ]]; then
        install_dcgm_for_ubuntu_22
    else
        error "不支持的操作系统版本: ${OS_ID} ${OS_VERSION}"
    fi
elif [[ "$ACTION" == "--uninstall" ]]; then
    uninstall_dcgm
else
    error "无效的参数,请使用 --install 或 --uninstall"
fi
step "操作完成,日志路径: $LOG_FILE"

235
scripts/nvidia-driver.sh Normal file
View File

@ -0,0 +1,235 @@
#!/bin/bash
set -euo pipefail
# Global settings
DEFAULT_VERSION="565.57.01"
INTERNAL_BASE_URL="http://10.101.0.51:5588/nvidia-linux"
# OFFICIAL_BASE_URL depends on the selected version and is therefore
# assigned after argument parsing.
PACKAGE_TEMPLATE="NVIDIA-Linux-x86_64-%s.run"
INSTALL_DIR="/opt"
LOG_FILE="/var/log/nvidia-driver_$(date +%Y%m%d%H%M%S).log"
SERVICE_FILE="/etc/systemd/system/nvidia_peermem.service"
PERSISTENCE_SERVICE="/etc/systemd/system/nvidia-persistenced.service"

# ANSI colors
GREEN='\033[1;32m'
RED='\033[1;31m'
NC='\033[0m' # reset

# Append a timestamped line to LOG_FILE.
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}

# Announce a step: green on the console, plain in the log.
step() {
    local line="==> $1"
    printf '%b\n' "${GREEN}${line}${NC}"
    log "$line"
}

# Report an error (red on the console, plain in the log) and exit with 1.
error() {
    local line="错误: $1"
    printf '%b\n' "${RED}${line}${NC}"
    log "$line"
    exit 1
}

# Evaluate the command string $1 with its output appended to LOG_FILE;
# abort through error() when it fails.
run_cmd() {
    step "执行: $1"
    if ! eval "$1" &>> "$LOG_FILE"; then
        error "命令执行失败: $1"
    fi
}
# Argument parsing: --install | --uninstall select ACTION, --version <ver>
# overrides the default driver version.
ACTION=""
VERSION="$DEFAULT_VERSION"
while [[ $# -gt 0 ]]; do
    case "$1" in
        --install) ACTION="install"; shift ;;
        --uninstall) ACTION="uninstall"; shift ;;
        --version)
            # Under `set -u` a bare "--version" used to die with
            # "$2: unbound variable"; report it properly, and do not accept
            # a following option as the version value.
            if [[ -z "${2:-}" || "$2" == --* ]]; then
                error "--version 需要指定版本号"
            fi
            VERSION="$2"
            shift 2
            ;;
        *) error "未知参数 $1" ;;
    esac
done
# Vendor download location for the selected version.
OFFICIAL_BASE_URL="https://cn.download.nvidia.com/tesla/${VERSION}"
if [[ -z "$ACTION" ]]; then
    error "必须指定 --install 或 --uninstall"
fi
# Create and enable a systemd unit that loads the nvidia_peermem kernel
# module at boot.
#
# The unit is declared Type=oneshot with RemainAfterExit=yes. modprobe exits
# as soon as the module is loaded; with the default service type systemd
# treats that exit as the service stopping and runs ExecStop, i.e. it
# unloads the module again right after loading it. With RemainAfterExit the
# unit stays active after modprobe returns and ExecStop only runs on an
# explicit stop.
install_peermem_service() {
    step "开始配置 nvidia_peermem 开机启动"
    cat > "$SERVICE_FILE" <<EOF
[Unit]
Description=Load nvidia_peermem kernel module
After=network.target
[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/usr/sbin/modprobe nvidia_peermem
ExecStop=/usr/sbin/rmmod -f nvidia_peermem
[Install]
WantedBy=default.target
EOF
    run_cmd "chmod 644 $SERVICE_FILE"
    step "服务文件已创建:$SERVICE_FILE"
    # Pick up the new unit, then enable and start it.
    run_cmd "systemctl daemon-reload"
    run_cmd "systemctl enable --now nvidia_peermem.service"
    step "服务已启用并开机自启"
}
# Remove the nvidia_peermem boot-time unit, if its file exists.
# Stop and disable are best-effort so that a failing stop (for example when
# the module is already gone) cannot abort the uninstall, and systemd is
# reloaded after the unit file is deleted so it does not keep the removed
# unit loaded.
uninstall_peermem_service() {
    step "开始移除 nvidia_peermem 开机启动配置"
    if [[ ! -f "$SERVICE_FILE" ]]; then
        step "警告nvidia_peermem 服务文件不存在"
        return 0
    fi
    run_cmd "systemctl stop nvidia_peermem.service || true"
    run_cmd "systemctl disable nvidia_peermem.service || true"
    run_cmd "rm -f $SERVICE_FILE"
    run_cmd "systemctl daemon-reload"
    step "服务文件已删除:$SERVICE_FILE"
}
# Create and enable the nvidia-persistenced systemd unit (GPU persistence
# mode across reboots): write the unit file, set its mode to 644, reload
# systemd, then enable and start the service.
install_persistence_service() {
    step "开始配置 nvidia-persistenced 开机启动"

    # Quoted delimiter: the unit text is written literally.
    cat > "$PERSISTENCE_SERVICE" <<'UNIT'
[Unit]
Description=NVIDIA Persistence Daemon
After=syslog.target network.target
Wants=nvidia-modules.service
[Service]
Type=forking
PIDFile=/var/run/nvidia-persistenced/nvidia-persistenced.pid
ExecStart=/usr/bin/nvidia-persistenced --verbose
ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced
[Install]
WantedBy=multi-user.target
UNIT

    run_cmd "chmod 644 $PERSISTENCE_SERVICE"
    step "服务文件已创建:$PERSISTENCE_SERVICE"

    run_cmd "systemctl daemon-reload"
    run_cmd "systemctl enable --now nvidia-persistenced.service"
    step "nvidia-persistenced 服务已启用并开机自启"
}
# Remove the nvidia-persistenced boot-time unit, if its file exists.
# Stop and disable are best-effort so that a failure there cannot abort the
# uninstall, and systemd is reloaded after the unit file is deleted so it
# does not keep the removed unit loaded.
uninstall_persistence_service() {
    step "开始移除 nvidia-persistenced 开机启动配置"
    if [[ ! -f "$PERSISTENCE_SERVICE" ]]; then
        step "警告nvidia-persistenced 服务文件不存在"
        return 0
    fi
    run_cmd "systemctl stop nvidia-persistenced.service || true"
    run_cmd "systemctl disable nvidia-persistenced.service || true"
    run_cmd "rm -f $PERSISTENCE_SERVICE"
    run_cmd "systemctl daemon-reload"
    step "服务文件已删除:$PERSISTENCE_SERVICE"
}
# Install the NVIDIA driver for VERSION.
#
# Uses the .run package under INSTALL_DIR, downloading it first when absent,
# runs it in quiet/silent mode, loads nvidia_peermem, switches on persistence
# mode, verifies that nvidia-smi reports VERSION and finally installs the two
# boot-time services.
#
# The version check is a single fixed-string match (`grep -F`): the dots of
# the version are no longer regex wildcards, and the second, unreachable
# copy of the check was dropped (run_cmd already aborts on a mismatch).
#
# Sets the globals PACKAGE_NAME and PACKAGE_PATH and changes the working
# directory to INSTALL_DIR.
install_driver() {
    step "开始安装显卡驱动,版本:$VERSION"
    PACKAGE_NAME=$(printf "$PACKAGE_TEMPLATE" "$VERSION")
    PACKAGE_PATH="${INSTALL_DIR}/${PACKAGE_NAME}"

    if [[ -f "$PACKAGE_PATH" ]]; then
        step "使用本地包:$PACKAGE_PATH"
    else
        step "未找到本地包,开始下载"
        download_package "$PACKAGE_NAME" "$PACKAGE_PATH"
    fi

    cd "$INSTALL_DIR"
    run_cmd "chmod +x $PACKAGE_NAME"
    run_cmd "./$PACKAGE_NAME -q -s"

    run_cmd "modprobe nvidia_peermem"
    run_cmd "nvidia-smi -pm 1"

    # Verify that the running driver reports the requested version.
    run_cmd "nvidia-smi | grep -F '$VERSION'"

    install_peermem_service     # load nvidia_peermem at boot
    install_persistence_service # GPU persistence mode at boot
    step "安装完成"
}
# Uninstall the NVIDIA driver for VERSION using its .run package.
#
# The package under INSTALL_DIR is used when present, otherwise it is
# downloaded first. When the package cannot be obtained or its uninstaller
# fails, a message is printed and the function returns without touching the
# boot-time services; on success both services are removed.
#
# The two previously duplicated uninstall branches are merged into one, and
# the uninstaller's output now goes to LOG_FILE instead of /dev/null so that
# a failure can be diagnosed.
#
# Sets the globals PACKAGE_NAME and PACKAGE_PATH and changes the working
# directory to INSTALL_DIR.
uninstall_driver() {
    step "开始卸载显卡驱动,版本:$VERSION"
    PACKAGE_NAME=$(printf "$PACKAGE_TEMPLATE" "$VERSION")
    PACKAGE_PATH="${INSTALL_DIR}/${PACKAGE_NAME}"

    if [[ -f "$PACKAGE_PATH" ]]; then
        step "找到安装包,使用安装包卸载"
    else
        step "未找到本地安装包,尝试下载卸载包"
        download_package "$PACKAGE_NAME" "$PACKAGE_PATH"
        if [[ ! -f "$PACKAGE_PATH" ]]; then
            step "无法找到或下载卸载包,请手动卸载驱动程序"
            return
        fi
    fi

    cd "$INSTALL_DIR"
    run_cmd "chmod +x $PACKAGE_NAME"
    if ! ./"$PACKAGE_NAME" --uninstall -q -s &>> "$LOG_FILE"; then
        step "卸载失败,请手动卸载驱动程序"
        return
    fi

    uninstall_peermem_service     # remove the peermem unit
    uninstall_persistence_service # remove the persistenced unit
    step "卸载完成"
}
# Download a driver package, trying the internal mirror first and the
# official site second.
#   $1  package file name
#   $2  destination path
# Returns 0 after the first successful download; calls error() (which exits)
# when every source fails.
# Each attempt writes to "<dest>.part" and is renamed only on success:
# `wget -O` creates its output file even when the transfer fails, and the
# callers treat any existing file at the destination as a usable local
# package.
download_package() {
    local package_name="$1"
    local package_path="$2"
    local partial_path="${package_path}.part"
    local download_urls=(
        "${INTERNAL_BASE_URL}/${package_name}"
        "${OFFICIAL_BASE_URL}/${package_name}"
    )
    local url

    for url in "${download_urls[@]}"; do
        step "尝试从 $url 下载"
        if wget -qO "$partial_path" "$url"; then
            mv -f "$partial_path" "$package_path"
            return 0
        fi
        rm -f "$partial_path"
        step "下载失败,尝试下一个 URL"
    done
    error "无法从任何来源下载 $package_name"
}
# Run the function that matches the parsed ACTION.
if [[ "$ACTION" == "install" ]]; then
    install_driver
elif [[ "$ACTION" == "uninstall" ]]; then
    uninstall_driver
fi

View File

@ -0,0 +1,190 @@
#!/bin/bash
set -euo pipefail
# Global settings
FABRICMANAGER_MAJOR_VERSION="565"
FABRICMANAGER_FULL_VERSION="565.57.01-1"
PACKAGE_NAME="nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}_${FABRICMANAGER_FULL_VERSION}_amd64.deb"
PACKAGE_PATH="/opt/${PACKAGE_NAME}"
INTERNAL_BASE_URL="http://10.101.0.51:5588/nvidia-fabricmanager"
OFFICIAL_BASE_URL="https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64"
TEMP_DIR="/tmp/fabricmanager_temp"
LOG_FILE="/var/log/fabricmanager_$(date +%Y%m%d%H%M%S).log"

# ANSI colors
GREEN='\033[1;32m'
RED='\033[1;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset

# Append a timestamped line to LOG_FILE.
log() {
    local stamped="[$(date '+%Y-%m-%d %H:%M:%S')] $*"
    printf '%s\n' "$stamped" >> "$LOG_FILE"
}

# Announce a step: green on the console, plain in the log.
step() {
    local text="==> $1"
    printf '%b\n' "${GREEN}${text}${NC}"
    log "$text"
}

# Print a warning: yellow on the console, plain in the log.
warning() {
    local text="警告: $1"
    printf '%b\n' "${YELLOW}${text}${NC}"
    log "$text"
}

# Report an error (red on the console, plain in the log) and exit with 1.
error() {
    local text="错误: $1"
    printf '%b\n' "${RED}${text}${NC}"
    log "$text"
    exit 1
}

# Evaluate the command string $1 with its output appended to LOG_FILE;
# abort through error() when it fails.
run_cmd() {
    step "执行: $1"
    if ! eval "$1" &>> "$LOG_FILE"; then
        error "命令执行失败: $1"
    fi
}

# Abort through error() unless the command $1 can be found.
check_cmd() {
    if ! command -v "$1" &>/dev/null; then
        error "未找到命令: $1"
    fi
}
# Argument parsing:
#   --install | --uninstall   select ACTION
#   --version <major>_<full>  e.g. 565_565.57.01-1; recomputes PACKAGE_NAME
#                             and PACKAGE_PATH
#   --force                   set FORCE=1
ACTION=""
FORCE=0
while [[ $# -gt 0 ]]; do
    case "$1" in
        --install) ACTION="install"; shift ;;
        --uninstall) ACTION="uninstall"; shift ;;
        --version)
            # ${2:-} keeps a bare "--version" from dying with "unbound
            # variable" under `set -u`; an empty value fails the format check
            # below and is reported there.
            CUSTOM_VERSION="${2:-}"
            if [[ $CUSTOM_VERSION =~ ^([0-9]+)(_[0-9]+\.[0-9]+\.[0-9]+-[0-9]+)$ ]]; then
                FABRICMANAGER_MAJOR_VERSION="${BASH_REMATCH[1]}"
                # Strip the leading underscore captured by the second group.
                FABRICMANAGER_FULL_VERSION="${BASH_REMATCH[2]#_}"
                PACKAGE_NAME="nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}_${FABRICMANAGER_FULL_VERSION}_amd64.deb"
                PACKAGE_PATH="/opt/${PACKAGE_NAME}"
            else
                error "版本格式错误,应为 xxxx_xxxx.xx.xx-x"
            fi
            shift 2
            ;;
        --force) FORCE=1; shift ;;
        *) error "未知参数: $1" ;;
    esac
done
if [[ -z "$ACTION" ]]; then
    error "必须指定 --install 或 --uninstall"
fi
# Make sure the Fabric Manager .deb exists at PACKAGE_PATH.
# An existing local file is reused; otherwise the internal mirror and then
# the official repository are tried. Calls error() (which exits) when both
# fail.
# Each attempt writes to "<PACKAGE_PATH>.part" and is renamed only on
# success: `wget -O` creates its output file even when the transfer fails,
# and a leftover empty file would be picked up as the "local package" on the
# next run.
download_package() {
    local partial_path="${PACKAGE_PATH}.part"
    local urls=(
        "${INTERNAL_BASE_URL}/${PACKAGE_NAME}"
        "${OFFICIAL_BASE_URL}/${PACKAGE_NAME}"
    )
    local url

    step "检查安装包: $PACKAGE_PATH"
    if [[ -f "$PACKAGE_PATH" ]]; then
        step "使用本地安装包"
        return 0
    fi

    step "本地包不存在,开始下载"
    mkdir -p "$(dirname "$PACKAGE_PATH")"
    for url in "${urls[@]}"; do
        step "尝试从 $url 下载"
        if wget -q -O "$partial_path" "$url"; then
            mv -f "$partial_path" "$PACKAGE_PATH"
            step "下载成功,文件大小: $(du -h "$PACKAGE_PATH" | cut -f1)"
            return 0
        fi
        rm -f "$partial_path"
        warning "$url 下载失败"
    done
    error "无法下载安装包,请检查网络或手动放置到 $PACKAGE_PATH"
}
# Install NVIDIA Fabric Manager from the .deb at PACKAGE_PATH: fetch the
# package, check that dpkg and systemctl exist, install it, enable and start
# the service, and confirm that the service is active.
install_fabricmanager() {
    local unit="nvidia-fabricmanager.service"

    step "开始安装 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION}"
    download_package

    step "检查系统依赖"
    check_cmd dpkg
    check_cmd systemctl

    step "安装 NVIDIA Fabric Manager"
    run_cmd "dpkg -i $PACKAGE_PATH"

    step "启动并启用 NVIDIA Fabric Manager 服务"
    run_cmd "systemctl enable ${unit} --now"

    step "验证 NVIDIA Fabric Manager 服务状态"
    if ! systemctl is-active --quiet "$unit"; then
        error "NVIDIA Fabric Manager 服务未运行"
    else
        step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} 安装成功并运行中"
    fi
}
# Uninstall NVIDIA Fabric Manager: stop the service if it is running,
# disable it, remove the nvidia-fabricmanager-<major> package and delete the
# downloaded .deb.
# When the package is not installed, error() is raised unless --force was
# given (FORCE=1). Disabling the service is best-effort: with the package
# absent the unit does not exist, `systemctl disable` fails, and run_cmd used
# to abort here, so the --force path could never be reached.
uninstall_fabricmanager() {
    step "开始卸载 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION}"

    if systemctl is-active --quiet nvidia-fabricmanager.service; then
        step "停止 NVIDIA Fabric Manager 服务"
        run_cmd "systemctl stop nvidia-fabricmanager.service"
    else
        step "NVIDIA Fabric Manager 服务未运行"
    fi

    step "禁用 NVIDIA Fabric Manager 服务"
    run_cmd "systemctl disable nvidia-fabricmanager.service || true"

    step "卸载 NVIDIA Fabric Manager 软件包"
    if dpkg -s "nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}" &>/dev/null; then
        run_cmd "dpkg -r nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}"
    else
        warning "未找到 nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION} 软件包"
        if [[ $FORCE -eq 0 ]]; then
            error "请使用 --force 参数强制卸载"
        fi
    fi

    step "清理残留文件"
    rm -f "$PACKAGE_PATH"
    step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} 卸载完成"
}
# Main flow: announce, run the function matching ACTION, announce completion.
step "开始 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} ${ACTION} 流程"
if [[ "$ACTION" == "install" ]]; then
    install_fabricmanager
elif [[ "$ACTION" == "uninstall" ]]; then
    uninstall_fabricmanager
else
    error "未知操作: $ACTION"
fi
step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} ${ACTION} 流程完成"

124
scripts/system_optimize.sh Normal file
View File

@ -0,0 +1,124 @@
#!/bin/bash
# ANSI color escape sequences used by the log helpers.
GREEN='\033[1;32m'
RED='\033[1;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset to the terminal default

# Informational message, printed in green.
log_info() { printf '%b\n' "${GREEN}[INFO] $1${NC}"; }

# Error message, printed in red. Does not exit; callers decide.
log_error() { printf '%b\n' "${RED}[ERROR] $1${NC}"; }

# Warning message, printed in yellow.
log_warning() { printf '%b\n' "${YELLOW}[WARNING] $1${NC}"; }
# Disable apt's periodic update and upgrade jobs.
#
#   $@  optional list of apt configuration files to edit; defaults to
#       /etc/apt/apt.conf.d/10periodic and /etc/apt/apt.conf.d/20auto-upgrades
#
# Only option values that are exactly "1" are switched to "0". The previous
# `s/1/0/g` rewrote every digit 1 in the file, including digits inside other
# values (for example "14" became "04") and inside comments. Missing files
# are skipped with a warning instead of producing a sed error.
disable_apt_periodic_updates() {
    local -a conf_files=("$@")
    local conf

    if [ "${#conf_files[@]}" -eq 0 ]; then
        conf_files=(
            /etc/apt/apt.conf.d/10periodic
            /etc/apt/apt.conf.d/20auto-upgrades
        )
    fi

    log_info "禁用apt的定期更新..."
    for conf in "${conf_files[@]}"; do
        if [ -f "$conf" ]; then
            sed -i 's/"1"/"0"/g' "$conf"
        else
            log_warning "配置文件不存在,已跳过: $conf"
        fi
    done
    log_info "apt定期更新已禁用"
}
# Switch the system time zone to Asia/Shanghai via timedatectl.
set_timezone_to_shanghai() {
    local zone="Asia/Shanghai"
    log_info "设置系统时区为上海..."
    timedatectl set-timezone "$zone"
    log_info "系统时区已设置为上海"
}

# Copy the current system time into the hardware clock (hwclock --systohc).
synchronize_hardware_clock() {
    log_info "同步硬件时钟和系统时钟..."
    hwclock --systohc
    log_info "硬件时钟和系统时钟已同步"
}
# 删除"ubuntu"用户
#remove_ubuntu_user() {
# log_info "删除'ubuntu'用户..."
# # 使用userdel命令删除ubuntu用户并递归删除其主目录
# # &> /dev/null用于忽略可能的错误输出例如用户不存在的情况
# userdel -r ubuntu &> /dev/null
# log_info "已尝试删除'ubuntu'用户(如果存在)"
#}
# Blacklist the nouveau graphics driver: write the blacklist file and the
# KMS-off option file under /etc/modprobe.d, then rebuild the initramfs so
# the change takes effect.
disable_nouveau_driver() {
    log_info "禁止nouveau显卡驱动..."

    # Blacklist entries for nouveau and lbm-nouveau.
    printf '%s\n' \
        'blacklist nouveau' \
        'blacklist lbm-nouveau' \
        'options nouveau modeset=0' \
        'alias nouveau off' \
        'alias lbm-nouveau off' \
        > /etc/modprobe.d/blacklist-nouveau.conf

    # Turn off nouveau kernel mode setting.
    printf '%s\n' 'options nouveau modeset=0' > /etc/modprobe.d/nouveau-kms.conf

    update-initramfs -u &> /dev/null
    log_info "nouveau显卡驱动已被禁止"
}
# Set GRUB_CMDLINE_LINUX_DEFAULT="net.ifnames=1 biosdevname=0" in the GRUB
# defaults file and regenerate the boot configuration.
#
#   $1  optional path of the GRUB defaults file (default: /etc/default/grub)
#
# Exits with 1 when the file is missing or the setting could not be written;
# returns 1 (after logging an error) when update-grub fails.
#
# Fixes over the previous version:
#   * "already configured" is decided by an exact whole-line match, so a
#     commented-out copy of the setting no longer counts as success;
#   * only active GRUB_CMDLINE_LINUX_DEFAULT lines are commented out, and the
#     new line is appended once. The old `sed ... a\` added one copy after
#     every line mentioning the key and added nothing when the key was
#     absent;
#   * a failing update-grub is reported instead of being logged as success.
#
# NOTE(review): the log text says "traditional interface naming" while
# net.ifnames=1 normally selects predictable names -- confirm the intended
# value.
update_grub_configuration() {
    local grub_file="${1:-/etc/default/grub}"
    local desired='GRUB_CMDLINE_LINUX_DEFAULT="net.ifnames=1 biosdevname=0"'

    log_info "更改GRUB配置启用传统网络接口命名..."
    if [ ! -f "$grub_file" ]; then
        log_error "文件 $grub_file 修改失败!"
        exit 1
    fi

    if ! grep -qxF "$desired" "$grub_file"; then
        # Comment out every active assignment of the key.
        sed -i -E 's/^([[:space:]]*GRUB_CMDLINE_LINUX_DEFAULT=)/#\1/' "$grub_file"
        # Make sure the file ends with a newline before appending.
        if [ -s "$grub_file" ] && [ -n "$(tail -c 1 "$grub_file")" ]; then
            echo >> "$grub_file"
        fi
        printf '%s\n' "$desired" >> "$grub_file"
    fi

    # Verify that the setting is now present as an active line.
    if grep -qxF "$desired" "$grub_file"; then
        log_info "文件 $grub_file 修改成功!"
    else
        log_error "文件 $grub_file 修改失败!"
        exit 1
    fi

    # Regenerate the bootloader configuration.
    if update-grub &> /dev/null; then
        log_info "GRUB配置已更新"
    else
        log_error "update-grub 执行失败"
        return 1
    fi
}
# Entry point: run every configuration step in its fixed order.
main() {
    local task

    log_info "开始系统配置..."
    for task in \
        disable_apt_periodic_updates \
        set_timezone_to_shanghai \
        synchronize_hardware_clock \
        disable_nouveau_driver \
        update_grub_configuration; do
        "$task"
    done
    log_info "系统配置完成!"
}
# 执行主函数
main