adsf
This commit is contained in:
parent
5fb837a692
commit
6535633321
|
|
@ -0,0 +1,8 @@
|
|||
# 基础路径配置
|
||||
script_dest: "/opt/ansible-scripts" # 脚本存储目录
|
||||
log_base_dir: "/var/log/ansible-deploy" # 日志根目录
|
||||
|
||||
# ansible优化
|
||||
ansible_ssh_common_args: "-o ControlMaster=auto -o ControlPersist=60s" #自动复用已建立的 SSH 连接
|
||||
ansible_pipelining: yes
|
||||
ansible_ssh_timeout: 120
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
[compute_nodes]
|
||||
gpu-node-01 ansible_host=10.0.0.101 gpu_model="NVIDIA A100"
|
||||
gpu-node-02 ansible_host=10.0.0.102 gpu_model="NVIDIA H100"
|
||||
|
||||
[all_nodes:children]
|
||||
compute_nodes
|
||||
|
||||
[all:vars]
|
||||
ansible_user=root
|
||||
ansible_ssh_port=22
|
||||
ansible_ssh_pass=xxx
|
||||
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
- name: 全量组件部署
|
||||
hosts: all_nodes # 所有节点分组
|
||||
roles:
|
||||
- role: system_init # 基础初始化
|
||||
- role: gpu_driver # GPU节点专用
|
||||
- role: node_exporter # 系统监控
|
||||
- role: dcgm_exporter # GPU监控
|
||||
vars:
|
||||
operation: "install" # 操作类型install/uninstall
|
||||
|
|
@ -0,0 +1,50 @@
|
|||
#!/bin/bash
|
||||
set -eo pipefail
|
||||
LOG_DIR="{{ log_base_dir }}/{{ driver.name }}" # 从角色变量注入路径
|
||||
LOG_FILE="${LOG_DIR}/install-$(date +%Y%m%d).log"
|
||||
mkdir -p "$LOG_DIR"
|
||||
exec > >(tee -a "$LOG_FILE") 2>&1
|
||||
|
||||
# 参数解析(严格匹配角色定义的操作)
|
||||
# Command-line parsing. Only the flags the role passes are accepted:
#   --install | --uninstall   select the operation
#   --version <ver>           override the driver version
# The result is published in the globals OPERATION and VERSION.
OPERATION=""
VERSION=""

#######################################
# Parse the script arguments into OPERATION / VERSION.
# Globals:   OPERATION (written), VERSION (written)
# Arguments: the script's positional parameters
# Returns:   exits 1 on an unknown flag or when --version has no value
#######################################
parse_args() {
  OPERATION=""
  VERSION=""
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        OPERATION="install"
        ;;
      --uninstall)
        OPERATION="uninstall"
        ;;
      --version)
        # A trailing "--version" used to make the second `shift` fail, which
        # under `set -e` terminated the script without any message.
        if [[ $# -lt 2 ]]; then
          echo "错误:--version 需要指定版本号" >&2
          exit 1
        fi
        VERSION="$2"
        shift
        ;;
      *)
        echo "错误:未知参数 $1" >&2
        exit 1
        ;;
    esac
    shift
  done
}

parse_args "$@"
|
||||
|
||||
# 安装逻辑(使用角色专属变量)
|
||||
#######################################
# Download the NVIDIA driver installer, run it silently and verify that the
# expected GPU model is reported afterwards.
# Globals:   VERSION, DEFAULT_VERSION, DOWNLOAD_URL, GPU_MODEL (all read)
# Outputs:   progress on stdout, errors on stderr
# Returns:   exits 1 when no version or GPU model is available, or when
#            nvidia-smi does not report the expected model after installation
#######################################
install() {
  local driver_version="${VERSION:-${DEFAULT_VERSION:-}}"
  local installer="/tmp/nvidia-driver.run"
  local detected

  if [[ -z "$driver_version" ]]; then
    echo "错误:未指定驱动版本" >&2
    exit 1
  fi
  # An empty model would make the verification below match any output.
  if [[ -z "${GPU_MODEL:-}" ]]; then
    echo "错误:未设置GPU_MODEL,无法验证硬件型号" >&2
    exit 1
  fi

  echo "[$(date)] 开始安装NVIDIA驱动(版本:${driver_version},型号:${GPU_MODEL})..."

  # Fetch the installer from the internal mirror.
  wget -q "${DOWNLOAD_URL}/${driver_version}/NVIDIA-Linux-x86_64-${driver_version}.run" \
    -O "$installer"

  # Non-interactive installation.
  sh "$installer" --silent --no-x-check --no-nouveau-check

  # Capture the output first: piping straight into `grep -q` can fail under
  # `pipefail` when grep exits early. -F compares the model name literally
  # instead of treating it as a regular expression.
  detected="$(nvidia-smi --query-gpu=name --format=csv,noheader)" || detected=""
  if ! grep -qF -- "$GPU_MODEL" <<<"$detected"; then
    echo "错误:驱动安装后未识别到目标GPU型号" >&2
    exit 1
  fi
}
|
||||
|
||||
# 卸载逻辑(幂等性设计)
|
||||
#######################################
# Remove the NVIDIA driver and the cached installer.
# Safe to run repeatedly: when the vendor uninstaller is absent the driver is
# treated as already removed instead of aborting the script under `set -e`.
# Outputs:   progress on stdout
# Returns:   status of the last command executed
#######################################
uninstall() {
  local uninstaller="/usr/bin/nvidia-uninstall"

  echo "[$(date)] 开始卸载NVIDIA驱动..."
  if [[ -x "$uninstaller" ]]; then
    "$uninstaller" --silent
  else
    echo "[$(date)] 未找到 ${uninstaller},驱动可能已卸载,跳过"
  fi
  # Drop the installer left behind by a previous install run.
  rm -f "/tmp/nvidia-driver.run"
}
|
||||
|
||||
# 主流程(依赖角色变量注入的默认值)
|
||||
# Main flow. The two defaults are filled in by the role's template variables.
DEFAULT_VERSION="{{ driver.default_version }}"
DOWNLOAD_URL="{{ driver.download_url }}"

# Dispatch on the parsed operation. The previous `install || uninstall`
# ignored OPERATION entirely: it always installed, and ran the uninstaller
# whenever the installation failed.
case "${OPERATION:-}" in
  install)
    install
    ;;
  uninstall)
    uninstall
    ;;
  *)
    echo "错误:必须指定 --install 或 --uninstall" >&2
    exit 1
    ;;
esac
exit 0
|
||||
|
|
@ -0,0 +1,42 @@
|
|||
- name: 创建角色专属日志目录
|
||||
file:
|
||||
path: "{{ log_base_dir }}/{{ driver.name }}"
|
||||
state: directory
|
||||
mode: "0750"
|
||||
|
||||
- name: 同步驱动脚本到目标服务器
|
||||
copy:
|
||||
src: "{{ driver.install_script }}"
|
||||
dest: "{{ script_dest }}/{{ driver.install_script }}"
|
||||
mode: "0755"
|
||||
force: yes # 确保使用最新脚本
|
||||
|
||||
- name: 执行驱动操作(安装/卸载)
|
||||
shell: |
|
||||
{{ script_dest }}/{{ driver.install_script }} \
|
||||
{{ operations[operation] }} \
|
||||
{% if target_version is defined and target_version != "" %}--version {{ target_version }}{% endif %}
|
||||
register: script_result
|
||||
environment:
|
||||
GPU_MODEL: "{{ hostvars[inventory_hostname]['gpu_model'] }}" # 注入主机硬件信息
|
||||
retries: 3 # 企业级重试机制(失败3次终止)
|
||||
delay: 30 # 重试间隔30秒
|
||||
become: yes # 使用sudo执行
|
||||
|
||||
# Post-install verification: runs the role's check command and fails the host
# when the registered script output reports zero GPUs.
- name: 验证操作结果(安装时)
  when: operation == "install"
  shell: "{{ driver.service_check }}"
  changed_when: false
  # The whole condition has to be a single YAML scalar; a quoted string
  # followed by bare text (`"GPU count: 0" in ...`) is a YAML syntax error.
  # NOTE(review): this inspects script_result (the install task), not the
  # output of this check command — confirm that is intended.
  failed_when: "'GPU count: 0' in script_result.stderr"
|
||||
|
||||
- name: 记录操作日志(企业级可观测性)
|
||||
uri:
|
||||
url: "http://logging.internal.com/api/ansible"
|
||||
method: POST
|
||||
body_format: json
|
||||
body:
|
||||
host: "{{ inventory_hostname }}"
|
||||
component: "{{ driver.name }}_driver"
|
||||
operation: "{{ operation }}"
|
||||
version: "{{ target_version | default(driver.default_version) }}"
|
||||
status: "{% if script_result.rc == 0 %}success{% else %}failed{% endif %}"
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
# 显卡驱动专属变量 脚本所需参数
|
||||
driver:
|
||||
name: "nvidia"
|
||||
default_version: "545.29.06" # 版本
|
||||
download_url: "http://repo.internal.com/drivers/nvidia" # 安装包下载路径
|
||||
install_script: "nvidia-install.sh" # 脚本文件名
|
||||
service_check: "nvidia-smi --list-gpus" # 安装后验证命令
|
||||
|
||||
# 操作参数
|
||||
operations:
|
||||
install: "--install"
|
||||
uninstall: "--uninstall"
|
||||
|
|
@ -0,0 +1,155 @@
|
|||
<h2 align="center">GPU 环境标准化部署脚本使用说明:</h2>
|
||||
|
||||
<p align="center">
|
||||
<img src="https://img.shields.io/github/languages/code-size/nanchengcyu/TechMindWave-frontend" alt="code size"/>
|
||||
<img src="https://img.shields.io/badge/ofed-17.0.2-blue" alt="ofed"/>
|
||||
<img src="https://img.shields.io/badge/NVIDIA-565.57.01-brightgreen" alt="NVIDIA"/>
|
||||
<img src="https://img.shields.io/badge/fabricmanager-565.57.01-blue" alt="fabricmanager"/>
|
||||
<img src="https://img.shields.io/badge/CUDA-12.6.3-brightgreen" alt="CUDA"/>
|
||||
<br>
|
||||
<img src="https://img.shields.io/badge/Author-王云龙-orange" alt="Author" />
|
||||
</p>
|
||||
<hr>
|
||||
|
||||
|
||||
### 一、脚本概述
|
||||
|
||||
该脚本旨在简化 GPU 相关应用的安装流程,适用于需要快速部署 GPU 环境的场景。
|
||||
|
||||
- **核心功能**:
|
||||
```bash
|
||||
脚本可批量完成网卡驱动、显卡驱动、fabricmanager 互联管理器、CUDA 工具包、Nvidia-dcgm、DCGM-Exporter、Node-Exporter 等核心组件的安装与卸载操作
|
||||
```
|
||||
- **配置说明**:
|
||||
```bash
|
||||
用户管理:若需删除 ubuntu 用户,需手动执行相关用户删除命令,并妥善处理该用户关联的数据与权限。
|
||||
磁盘管理:磁盘分区扩容需通过磁盘管理工具,根据实际需求对磁盘进行分区调整与扩容操作,以满足应用存储需求。
|
||||
网络配置:网卡重命名需手动修改网络配置文件,根据实际网络环境对网卡名称进行重新定义,确保网络连接正常。
|
||||
```
|
||||
- **使用建议**:
|
||||
```bash
|
||||
新系统推荐使用一键自动安装脚本,可快速、全面地完成 GPU 相关应用的部署,具体使用方法详见文章末尾说明。若系统之前已存在相关安装内容,或需要对各组件进行独立、定制化部署,建议使用单独部署脚本安装。
|
||||
```
|
||||
### 二、使用说明
|
||||
|
||||
#### (1)系统初始化
|
||||
|
||||
```bash
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/system_optimize.sh|bash
|
||||
```
|
||||
|
||||
#### (2)MLNX_OFED 网络套件安装/卸载
|
||||
|
||||
```bash
|
||||
#支持版本[23.10-1.1.9.0]
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --install --version '23.10-1.1.9.0'
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --uninstall --version '23.10-1.1.9.0'
|
||||
```
|
||||
|
||||
|
||||
#### (3)Nvidia 显卡驱动安装/卸载
|
||||
|
||||
```bash
|
||||
#支持版本[565.57.01] [570.124.06]
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --install --version '565.57.01'
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --uninstall --version '565.57.01'
|
||||
|
||||
|
||||
```
|
||||
|
||||
#### (4)GPU 互联管理器安装/卸载
|
||||
|
||||
```bash
|
||||
#支持版本[565_565.57.01-1] [570_570.124.06-1]
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --install --version '565_565.57.01-1'
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --uninstall --version '565_565.57.01-1'
|
||||
```
|
||||
|
||||
|
||||
#### (5)NVIDIA CUDA 工具包部署/卸载
|
||||
|
||||
```bash
|
||||
#支持版本[12.6.3_560.35.05] [12.8.1_570.124.06]
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --install --version '12.6.3_560.35.05'
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --uninstall --version '12.6.3_560.35.05'
|
||||
```
|
||||
|
||||
#### (6)dcgm/node exporter 部署/卸载
|
||||
|
||||
```bash
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-dcgm.sh | bash -s -- --install
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/dcgm-exporter.sh | bash -s -- --install
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/node-exporter.sh | bash -s -- --install
|
||||
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-dcgm.sh | bash -s -- --uninstall
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/dcgm-exporter.sh | bash -s -- --uninstall
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/node-exporter.sh | bash -s -- --uninstall
|
||||
```
|
||||
|
||||
#### (7)批量组件安装/卸载
|
||||
|
||||

|
||||

|
||||

|
||||

|
||||

|
||||
```bash
|
||||
安装:---------------------------------------------------------------------------------------------------------------------------------------------
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/system_optimize.sh|bash
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --install --version '23.10-1.1.9.0'
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --install --version '565.57.01'
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --install --version '12.6.3_560.35.05'
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --install --version '565_565.57.01-1'
|
||||
|
||||
卸载:---------------------------------------------------------------------------------------------------------------------------------------------
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh |bash -s -- --uninstall --version '23.10-1.1.9.0'
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --uninstall --version '565.57.01'
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --uninstall --version '12.6.3_560.35.05'
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --uninstall --version '565_565.57.01-1'
|
||||
```
|
||||
|
||||

|
||||

|
||||

|
||||

|
||||

|
||||
```bash
|
||||
安装:---------------------------------------------------------------------------------------------------------------------------------------------
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/system_optimize.sh|bash
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --install --version '23.10-1.1.9.0'
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --install --version '570.124.06'
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --install --version '12.8.1_570.124.06'
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --install --version '570_570.124.06-1'
|
||||
|
||||
|
||||
卸载:--------------------------------------------------------------------------------------------------------------------------------------------
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh | bash -s -- --uninstall --version '23.10-1.1.9.0'
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --uninstall --version '570.124.06'
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --uninstall --version '12.8.1_570.124.06'
|
||||
cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --uninstall --version '570_570.124.06-1'
|
||||
|
||||
```
|
||||

|
||||
```bash
|
||||
#安装/卸载服务(安装或卸载时间较长,建议放后台执行。):
|
||||
#组合[1]-----------------------------------------------------------------------------------------------------------------------------------
|
||||
screen -dmS install_script bash -c "cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/gpu-manager.sh|bash -s -- --install --version 1 --include=exporter > /opt/gpu-manager.log 2>&1";
|
||||
tail -f /opt/gpu-manager.log
|
||||
|
||||
screen -dmS uninstall_script bash -c "cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/gpu-manager.sh|bash -s -- --uninstall --version 1 --include=exporter > /opt/gpu-manager.log 2>&1";
|
||||
tail -f /opt/gpu-manager.log
|
||||
|
||||
#组合[2]-----------------------------------------------------------------------------------------------------------------------------------
|
||||
screen -dmS install_script bash -c "cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/gpu-manager.sh|bash -s -- --install --version 2 --include=exporter > /opt/gpu-manager.log 2>&1";
|
||||
tail -f /opt/gpu-manager.log
|
||||
|
||||
screen -dmS uninstall_script bash -c "cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/gpu-manager.sh|bash -s -- --uninstall --version 2 --include=exporter > /opt/gpu-manager.log 2>&1";
|
||||
tail -f /opt/gpu-manager.log
|
||||
|
||||
#说明:
|
||||
#version 1 表示安装/卸载七.[1]组件版本:mlnx_ofed-23.10.1.1.9.0+nvidia_drive-565.57.01 +cuda-12.6.3.560.35.05 +fabricmanager-565_565.57.01.1
|
||||
#version 2 表示安装/卸载七.[2]组件版本:mlnx_ofed-23.10.1.1.9.0+nvidia_drive-570.124.06+cuda-12.8.1.570.124.06+fabricmanager-570_570.124.06.1
|
||||
#--include=exporter 指定该参数,脚本将安装/卸载exporter组件中的相关服务[dcgm-exporter,node-exporter,nvidia-dcgm],默认不安装/卸载。
|
||||
|
||||
```
|
||||
|
||||
|
|
@ -0,0 +1,275 @@
|
|||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
# 全局变量
|
||||
# Global settings. The defaults apply when --version is not given and are
# recomputed by the argument parser when it is.
# The default must be the full three-part release: the runfile is published as
# cuda_<major.minor.patch>_<driver>_linux.run, so "12.6" produced a package
# name and download URL that do not exist.
CUDA_VERSION="12.6.3"
DRIVER_VERSION="560.35.05"
PACKAGE_NAME="cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run"
PACKAGE_PATH="/opt/${PACKAGE_NAME}"
INTERNAL_BASE_URL="http://10.101.0.51:5588/cuda-linux"
OFFICIAL_BASE_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers"
TEMP_DIR="/tmp/cuda_temp"
# The toolkit directory is named after major.minor only.
CUDA_MAJOR_MINOR=$(echo "$CUDA_VERSION" | cut -d. -f1-2)
CUDA_INSTALL_DIR="/usr/local/cuda-${CUDA_MAJOR_MINOR}"
ENV_PROFILE="/etc/profile"
LOG_FILE="/var/log/cuda_manager_$(date +%Y%m%d%H%M%S).log"
|
||||
|
||||
# 颜色定义
|
||||
GREEN='\033[1;32m'
|
||||
RED='\033[1;31m'
|
||||
YELLOW='\033[1;33m'
|
||||
NC='\033[0m' # 重置颜色
|
||||
|
||||
# 日志函数
|
||||
# Append one timestamped line, built from all arguments, to $LOG_FILE.
# Nothing is written to the console.
log() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  local line="[${stamp}] $*"
  echo "$line" >> "$LOG_FILE"
}
|
||||
|
||||
# 步骤提示
|
||||
# Announce a step: green "==> <text>" on the console, plain text in the log.
# Arguments: $1 - step description
step() {
  local text
  text="==> $1"
  echo -e "${GREEN}${text}${NC}"
  log "${text}"
}
|
||||
|
||||
# 警告提示
|
||||
# Report a non-fatal problem: yellow on the console, plain text in the log.
# Arguments: $1 - warning text
warning() {
  local text
  text="警告: $1"
  echo -e "${YELLOW}${text}${NC}"
  log "${text}"
}
|
||||
|
||||
# 错误提示
|
||||
# Report a fatal problem (red on the console, plain text in the log) and
# terminate the script with status 1.
# Arguments: $1 - error text
error() {
  local text
  text="错误: $1"
  echo -e "${RED}${text}${NC}"
  log "${text}"
  exit 1
}
|
||||
|
||||
# 执行命令并记录日志
|
||||
# Run a command line given as a single string, sending its stdout and stderr
# to the log file; a non-zero status is fatal (reported through error).
# Arguments: $1 - command line, evaluated with `eval`
run_cmd() {
  local command_line=$1

  step "执行: $command_line"
  if ! eval "$command_line" &>> "$LOG_FILE"; then
    error "命令执行失败: $command_line"
  fi
}
|
||||
|
||||
# 检查命令是否存在
|
||||
# Abort through error when the named command cannot be resolved.
# Arguments: $1 - command name
check_cmd() {
  if ! command -v "$1" &>/dev/null; then
    error "未找到命令: $1"
  fi
}
|
||||
|
||||
# 参数解析
|
||||
# Command-line parsing.
#   --install | --uninstall    select the action (one of them is required)
#   --version <cuda>_<driver>  e.g. 12.6.3_560.35.05; recomputes every value
#                              derived from the CUDA and driver versions
#   --force                    sets FORCE=1
ACTION=""
FORCE=0

while [[ $# -gt 0 ]]; do
  case "$1" in
    --install)
      ACTION="install"
      shift
      ;;
    --uninstall)
      ACTION="uninstall"
      shift
      ;;
    --version)
      # ${2:-} keeps `set -u` from aborting with a bare "unbound variable"
      # when --version is the last argument; the empty value then fails the
      # format check below and produces the regular error message.
      CUSTOM_VERSION="${2:-}"
      if [[ $CUSTOM_VERSION =~ ^([0-9]+\.[0-9]+(\.[0-9]+)?)(_([0-9]+\.[0-9]+\.[0-9]+))$ ]]; then
        CUDA_VERSION="${BASH_REMATCH[1]}"
        DRIVER_VERSION="${BASH_REMATCH[4]}"
        PACKAGE_NAME="cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run"
        PACKAGE_PATH="/opt/${PACKAGE_NAME}"
        CUDA_MAJOR_MINOR=$(echo "$CUDA_VERSION" | cut -d. -f1-2)
        CUDA_INSTALL_DIR="/usr/local/cuda-${CUDA_MAJOR_MINOR}"
        OFFICIAL_BASE_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers"
      else
        error "版本格式错误,应为 x.x.x_y.y.y 或 x.x_y.y.y"
      fi
      shift 2
      ;;
    --force)
      FORCE=1
      shift
      ;;
    *)
      error "未知参数: $1"
      ;;
  esac
done

if [[ -z "$ACTION" ]]; then
  error "必须指定 --install 或 --uninstall"
fi
|
||||
|
||||
|
||||
# 下载安装包
|
||||
#######################################
# Make sure the CUDA runfile exists at $PACKAGE_PATH, downloading it from the
# internal mirror first and the official site second.
# Globals:   PACKAGE_PATH, PACKAGE_NAME, INTERNAL_BASE_URL, OFFICIAL_BASE_URL
# Returns:   0 when the package is available; aborts through error otherwise
#######################################
download_package() {
  local url partial
  local -a urls

  step "检查安装包: $PACKAGE_PATH"
  if [[ -f "$PACKAGE_PATH" ]]; then
    step "使用本地安装包"
    return 0
  fi

  step "本地包不存在,开始下载"
  mkdir -p "$(dirname "$PACKAGE_PATH")"

  urls=(
    "${INTERNAL_BASE_URL}/${PACKAGE_NAME}"
    "${OFFICIAL_BASE_URL}/${PACKAGE_NAME}"
  )

  # Download to a side file and rename only on success. `wget -O` creates the
  # target even when the transfer fails, so writing straight to PACKAGE_PATH
  # left an empty or partial file that the next run accepted as a valid
  # local package.
  partial="${PACKAGE_PATH}.part"
  for url in "${urls[@]}"; do
    step "尝试从 $url 下载"
    if wget -q -O "$partial" "$url"; then
      mv -f -- "$partial" "$PACKAGE_PATH"
      step "下载成功,文件大小: $(du -h "$PACKAGE_PATH" | cut -f1)"
      return 0
    fi
    rm -f -- "$partial"
    warning "从 $url 下载失败"
  done

  error "无法下载安装包,请检查网络或手动放置到 $PACKAGE_PATH"
}
|
||||
|
||||
|
||||
# 安装 CUDA
|
||||
#######################################
# Install the CUDA toolkit from the runfile, register its paths in
# $ENV_PROFILE and verify the result with nvcc.
# Globals:   CUDA_VERSION, CUDA_MAJOR_MINOR, CUDA_INSTALL_DIR, PACKAGE_PATH,
#            TEMP_DIR, ENV_PROFILE (all read)
# Returns:   aborts through error when the installer fails, nvcc is missing
#            or nvcc reports a different version
#######################################
install_cuda() {
  local nvcc_version

  step "开始安装 CUDA ${CUDA_VERSION}"

  # Obtain the installer.
  download_package

  # Scratch directory, removed when the script exits.
  mkdir -p "$TEMP_DIR"
  trap 'rm -rf "$TEMP_DIR"' EXIT

  # Probe the installer's options. Only the help text matters here, so a
  # non-zero status from --help must not abort the script under `set -e`.
  step "检查安装包支持的参数"
  sh "$PACKAGE_PATH" --help > "$TEMP_DIR/help.txt" 2>&1 || true

  if grep -q -- '--toolkit' "$TEMP_DIR/help.txt"; then
    step "安装包支持 --toolkit 参数"
    run_cmd "sh $PACKAGE_PATH --silent --toolkit"
  else
    warning "安装包不支持 --toolkit 参数,尝试完整安装"
    run_cmd "sh $PACKAGE_PATH --silent"
  fi

  # Register PATH / LD_LIBRARY_PATH once per major.minor version.
  step "配置 CUDA 环境变量"
  if ! grep -q "cuda-${CUDA_MAJOR_MINOR}" "$ENV_PROFILE"; then
    cat >> "$ENV_PROFILE" << EOF

# CUDA ${CUDA_VERSION}
export PATH=${CUDA_INSTALL_DIR}/bin:\$PATH
export LD_LIBRARY_PATH=${CUDA_INSTALL_DIR}/lib64:\$LD_LIBRARY_PATH
EOF
    step "已添加环境变量到 $ENV_PROFILE"
  else
    step "环境变量已存在,跳过添加"
  fi

  # Load the profile into this shell. Profile scripts are not written for
  # `set -u`: the LD_LIBRARY_PATH line above expands an unset variable on a
  # fresh system, which aborted the script. nounset is therefore suspended
  # while sourcing, and a failing profile command is downgraded to a warning.
  export LC_BYOBU=0
  set +u
  # shellcheck disable=SC1091
  source "/etc/profile" || warning "加载 /etc/profile 失败"
  set -u

  # Verify the installation.
  step "验证 CUDA 安装"
  if command -v nvcc &>/dev/null; then
    nvcc_version=$(nvcc -V | grep release | awk '{print $5}' | tr -d ',')
    if [[ "$nvcc_version" == *"${CUDA_MAJOR_MINOR}"* ]]; then
      step "CUDA ${CUDA_VERSION} 安装成功"
    else
      error "CUDA 版本不匹配,期望 ${CUDA_VERSION},实际 $nvcc_version"
    fi
  else
    error "nvcc 命令未找到,安装失败"
  fi
}
|
||||
|
||||
|
||||
# 卸载 CUDA
|
||||
#######################################
# Remove the CUDA toolkit: vendor uninstaller if present, otherwise the
# runfile's own uninstall support, then profile entries, the install
# directory and leftover files.
# Globals:   CUDA_VERSION, CUDA_MAJOR_MINOR, CUDA_INSTALL_DIR, PACKAGE_PATH,
#            TEMP_DIR, ENV_PROFILE, FORCE (read);
#            OFFICIAL_UNINSTALLER, UNINSTALL_SCRIPT (written)
# Returns:   aborts through error when a removal command fails
#######################################
uninstall_cuda() {
  local leftover

  step "开始卸载 CUDA ${CUDA_VERSION}"

  # Scratch directory, removed when the script exits.
  mkdir -p "$TEMP_DIR"
  trap 'rm -rf "$TEMP_DIR"' EXIT

  OFFICIAL_UNINSTALLER="${CUDA_INSTALL_DIR}/bin/cuda-uninstaller"
  if [[ -x "$OFFICIAL_UNINSTALLER" ]]; then
    step "找到官方卸载脚本: $OFFICIAL_UNINSTALLER"
    step "执行官方卸载程序"
    run_cmd "$OFFICIAL_UNINSTALLER --silent"
  else
    warning "未找到官方卸载脚本,尝试其他方法"

    if [[ -f "$PACKAGE_PATH" ]]; then
      step "找到安装包: $PACKAGE_PATH"
    elif [[ $FORCE -eq 1 ]]; then
      warning "未找到安装包,继续强制卸载"
    else
      step "未找到安装包,开始下载"
      download_package
    fi

    # Only consult the runfile when it is really there. With --force and no
    # package the old code still executed `sh "$PACKAGE_PATH"`, which failed
    # and aborted the forced uninstall under `set -e`.
    if [[ -f "$PACKAGE_PATH" ]]; then
      step "检查安装包是否支持 --uninstall 参数"
      sh "$PACKAGE_PATH" --help > "$TEMP_DIR/help.txt" 2>&1 || true

      if grep -q -- '--uninstall' "$TEMP_DIR/help.txt"; then
        step "安装包支持 --uninstall 参数"
        run_cmd "sh $PACKAGE_PATH --silent --uninstall"
      else
        step "安装包不支持 --uninstall 参数,尝试解压查找卸载脚本"
        step "解压安装包到 $TEMP_DIR"
        run_cmd "sh $PACKAGE_PATH --extract=$TEMP_DIR"

        # -quit stops at the first match without a `| head` pipeline.
        UNINSTALL_SCRIPT=$(find "$TEMP_DIR" -name "uninstall_cuda*" -type f -print -quit)
        if [[ -n "$UNINSTALL_SCRIPT" ]]; then
          step "找到卸载脚本: $UNINSTALL_SCRIPT"
          run_cmd "sh $UNINSTALL_SCRIPT"
        else
          warning "未找到卸载脚本,继续手动清理"
        fi
      fi
    fi
  fi

  # Drop the profile lines that mention this CUDA version.
  step "清理环境变量"
  if grep -q "cuda-${CUDA_MAJOR_MINOR}" "$ENV_PROFILE"; then
    run_cmd "sed -i '/cuda-${CUDA_MAJOR_MINOR}/d' $ENV_PROFILE"
    step "已从 $ENV_PROFILE 移除 CUDA 环境变量"
  else
    step "环境变量已清理"
  fi

  # Remove the installation directory.
  if [[ -d "$CUDA_INSTALL_DIR" ]]; then
    step "删除安装目录: $CUDA_INSTALL_DIR"
    run_cmd "rm -rf $CUDA_INSTALL_DIR"
  else
    step "安装目录不存在,跳过删除"
  fi

  # Remove leftovers anywhere on disk. Process substitution replaces the old
  # `find | while` pipeline: find exits non-zero on unreadable or vanished
  # paths, which under `pipefail` aborted the script before completion was
  # reported. Kernel pseudo-filesystems are pruned and names are read
  # NUL-delimited so unusual file names are handled correctly.
  step "清理残留文件"
  while IFS= read -r -d '' leftover; do
    [[ -e "$leftover" ]] || continue
    step "删除残留文件: $leftover"
    rm -rf -- "$leftover" 2>/dev/null || warning "无法删除: $leftover"
  done < <(
    find / \( -path /proc -o -path /sys -o -path /dev -o -path /run \) -prune \
      -o -name "*cuda-${CUDA_MAJOR_MINOR}*" -print0 2>/dev/null
  )

  step "CUDA ${CUDA_VERSION} 卸载完成"
}
|
||||
|
||||
|
||||
|
||||
# Entry point: announce the flow, run the selected action, report completion.
step "开始 CUDA ${CUDA_VERSION} ${ACTION} 流程"
if [[ "$ACTION" == "install" ]]; then
  install_cuda
elif [[ "$ACTION" == "uninstall" ]]; then
  uninstall_cuda
else
  error "未知操作: $ACTION"
fi
step "CUDA ${CUDA_VERSION} ${ACTION} 流程完成"
|
||||
|
|
@ -0,0 +1,288 @@
|
|||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
# 全局变量
|
||||
LOG_FILE="/var/log/dcgm-exporter-install_$(date +%Y%m%d%H%M%S).log"
|
||||
GO_VERSION="1.21.1"
|
||||
DCGM_EXPORTER_VERSION="4.2.0-4.1.0"
|
||||
DCGM_EXPORTER_DIR="/opt/dcgm-exporter"
|
||||
SERVICE_FILE="/etc/systemd/system/dcgm-exporter.service"
|
||||
|
||||
# 颜色定义
|
||||
GREEN='\033[1;32m'
|
||||
RED='\033[1;31m'
|
||||
YELLOW='\033[1;33m'
|
||||
NC='\033[0m'
|
||||
|
||||
# 日志函数(控制台+日志文件)
|
||||
# Print a timestamped message (escape sequences interpreted) to the console
# and append the same line to $LOG_FILE.
log() {
  local stamp
  stamp="[$(date '+%Y-%m-%d %H:%M:%S')]"
  echo -e "${stamp} $*" | tee -a "$LOG_FILE"
}
|
||||
|
||||
# 成功提示
|
||||
# Log a success message in green with a check mark.
success() {
  local text="$*"
  log "${GREEN}✔ ${text}${NC}"
}
|
||||
|
||||
# 警告提示
|
||||
# Log a warning message in yellow with a warning sign.
warning() {
  local text="$*"
  log "${YELLOW}⚠ ${text}${NC}"
}
|
||||
|
||||
# 错误提示
|
||||
# Log an error message in red and terminate the script with status 1.
error() {
  local text="$*"
  log "${RED}✖ 错误: ${text}${NC}"
  exit 1
}
|
||||
|
||||
# 执行命令(带详细错误处理)
|
||||
#######################################
# Run a command line under a time limit, appending its output to the log.
# Globals:   LOG_FILE (read)
# Arguments: $1 - command line, executed with `bash -c`
#            $2 - message used when the command fails (optional)
#            $3 - time limit passed to timeout(1), default 30 seconds
# Returns:   0 on success; aborts through error on failure or timeout
#######################################
run() {
  local cmd="$1"
  local error_msg="${2:-命令执行失败}"
  local timeout="${3:-30}"
  local exit_code=0

  log "→ 执行: $cmd"

  # Capture the real status with `|| exit_code=$?`. The former
  # `if ! timeout ...; then exit_code=$?` read the status of the negation,
  # which is always 0, so the timeout branch could never be taken and the
  # reported exit code was always 0.
  timeout "$timeout" bash -c "$cmd" &>> "$LOG_FILE" || exit_code=$?

  if [[ $exit_code -eq 0 ]]; then
    return 0
  fi
  # timeout(1) reports an expired limit with status 124.
  if [[ $exit_code -eq 124 ]]; then
    error "命令超时 (${timeout}秒): $cmd"
  else
    error "$error_msg (退出码: $exit_code)"
  fi
}
|
||||
|
||||
# 检测DCGM状态
|
||||
#######################################
# Verify that the nvidia-dcgm service is installed and active.
# Globals:   DCGM_VERSION (written)
# Returns:   aborts through error when systemd or the unit file is missing,
#            or when the service is not active
#######################################
check_dcgm() {
  local status
  local exit_code=0

  log "检测DCGM服务状态..."

  # systemd is required.
  if ! command -v systemctl &> /dev/null; then
    error "未找到systemctl命令,请确保系统支持systemd"
  fi

  # The unit file must exist.
  if [[ ! -f "/lib/systemd/system/nvidia-dcgm.service" ]]; then
    error "未找到nvidia-dcgm服务文件,请确认DCGM已正确安装"
  fi

  # Declaration and assignment are split because `local v=$(cmd)` always
  # yields status 0, which made the old exit-code check dead code.
  # `is-active` exits non-zero for every state other than "active" while
  # still printing the state, so the status is captured rather than fatal.
  status=$(systemctl is-active nvidia-dcgm 2>&1) || exit_code=$?

  # No output at all means the state could not be determined.
  if [[ -z "$status" ]]; then
    error "无法获取DCGM服务状态: 退出码 $exit_code"
  fi

  if [[ "$status" == "active" ]]; then
    DCGM_VERSION=$(dcgmi version 2>&1 | grep -oP 'Version \K[^\s]+' || echo "未知")
    success "DCGM服务运行中 (版本: $DCGM_VERSION)"
  else
    error "DCGM服务状态异常 ($status),请先执行: systemctl start nvidia-dcgm"
  fi
}
|
||||
|
||||
# 安装Go环境
|
||||
#######################################
# Install the Go toolchain under /usr/local/go and load its environment into
# the current shell.
# Globals:   GO_VERSION (read);
#            GO_PACKAGE, DL_URL, TMP_PACKAGE, GO_ENV (written)
# Returns:   aborts through error when download, extraction or the version
#            check fails
#######################################
install_go() {
  local go_version

  log "安装Go环境 (版本: $GO_VERSION)..."
  GO_PACKAGE="go${GO_VERSION}.linux-amd64.tar.gz"
  DL_URL="https://golang.google.cn/dl/$GO_PACKAGE"
  TMP_PACKAGE="/tmp/$GO_PACKAGE"

  # Download to a side file and rename on success: `wget -O` creates the
  # target even when the transfer fails, and a partial tarball would be taken
  # for a cached package on the next run. The default 30 s limit of run is
  # raised because the archive is large.
  if [[ ! -f "$TMP_PACKAGE" ]]; then
    run "wget -qO $TMP_PACKAGE.part $DL_URL && mv -f $TMP_PACKAGE.part $TMP_PACKAGE" \
      "下载Go安装包失败" 600
  fi

  # Remove any previous tree first so files from another Go version are not
  # mixed into the new one.
  run "rm -rf /usr/local/go && tar -xzf $TMP_PACKAGE -C /usr/local" "解压Go安装包失败" 300

  # Write the environment file.
  GO_ENV="/etc/profile.d/go.sh"
  cat > "$GO_ENV" <<'EOF'
export GOROOT=/usr/local/go
export GOPATH=/usr/local/gopath
export PATH=$PATH:$GOROOT/bin
export GO111MODULE=on
export GOPROXY=https://goproxy.cn,direct
EOF
  log "→ 配置Go环境变量"
  if ! chmod +x "$GO_ENV" &>> "$LOG_FILE"; then
    error "设置Go环境变量文件权限失败"
  fi

  # Load the variables into the current shell.
  log "→ 加载Go环境变量"
  # shellcheck disable=SC1090
  if ! source "$GO_ENV"; then
    error "加载Go环境变量失败"
  fi

  # Verify. `|| true` lets a missing go binary reach the readable error
  # below instead of tripping `set -e` on the assignment.
  go_version=$(go version 2>&1) || true
  if [[ "$go_version" == *"go$GO_VERSION"* ]]; then
    success "Go环境安装完成: $go_version"
  else
    error "Go环境验证失败: $go_version"
  fi
}
|
||||
|
||||
# 卸载现有DCGM Exporter
|
||||
# Remove a previously installed DCGM Exporter: stop and disable the service,
# delete the install directory and unit file, then reload systemd.
# Does nothing when neither the directory nor the unit file exists.
uninstall_existing() {
  log "检查是否存在旧版本DCGM Exporter..."

  # Nothing installed: report and leave.
  if [[ ! -d "$DCGM_EXPORTER_DIR" && ! -f "$SERVICE_FILE" ]]; then
    success "未发现旧版本,继续安装..."
    return 0
  fi

  log "发现旧版本,开始卸载..."

  # Stop the service only when it is running.
  if systemctl is-active --quiet dcgm-exporter; then
    run "systemctl stop dcgm-exporter" "停止现有服务失败"
  fi

  # Disable the service only when it is enabled.
  if systemctl is-enabled --quiet dcgm-exporter; then
    run "systemctl disable dcgm-exporter" "禁用现有服务失败"
  fi

  # Delete the installed files.
  run "rm -rf $DCGM_EXPORTER_DIR" "删除旧安装目录失败"
  run "rm -f $SERVICE_FILE" "删除服务文件失败"

  # Let systemd forget the removed unit.
  run "systemctl daemon-reload" "重新加载systemd失败"

  success "旧版本卸载完成"
}
|
||||
|
||||
# 安装DCGM Exporter
|
||||
#######################################
# Build DCGM Exporter from source, install it under $DCGM_EXPORTER_DIR,
# register a systemd unit and wait for the metrics endpoint to answer.
# Globals:   DCGM_EXPORTER_VERSION, DCGM_EXPORTER_DIR, SERVICE_FILE,
#            LOG_FILE (read);
#            DCGM_PACKAGE, DL_URL, TMP_PACKAGE, SOURCE_DIR (written)
# Returns:   0 once the endpoint returns HTTP 200; aborts through error on
#            any failed step or when the service does not answer within 30 s
#######################################
install_exporter() {
  local attempt

  log "安装DCGM Exporter (版本: $DCGM_EXPORTER_VERSION)..."

  # Load the Go environment.
  log "→ 加载Go环境变量"
  if [[ -f "/etc/profile.d/go.sh" ]]; then
    # shellcheck disable=SC1091
    if ! source "/etc/profile.d/go.sh"; then
      error "加载Go环境变量失败"
    fi
  else
    error "未找到Go环境变量配置文件"
  fi

  run "mkdir -p $DCGM_EXPORTER_DIR" "创建工作目录失败"

  # Source archive, named after the configured version so it stays in step
  # with SOURCE_DIR below.
  DCGM_PACKAGE="${DCGM_EXPORTER_VERSION}.tar.gz"
  DL_URL="https://github.com/NVIDIA/dcgm-exporter/archive/refs/tags/$DCGM_PACKAGE"
  TMP_PACKAGE="/tmp/$DCGM_PACKAGE"

  # Download to a side file and rename on success so a failed transfer is
  # never mistaken for a cached archive; the time limit is raised above the
  # 30 s default of run.
  if [[ ! -f "$TMP_PACKAGE" ]]; then
    run "wget -qO $TMP_PACKAGE.part $DL_URL && mv -f $TMP_PACKAGE.part $TMP_PACKAGE" \
      "下载DCGM Exporter源码失败" 300
  fi

  # Unpack.
  run "tar xf $TMP_PACKAGE -C /tmp" "解压DCGM Exporter源码失败"
  SOURCE_DIR="/tmp/dcgm-exporter-$DCGM_EXPORTER_VERSION"

  # Build and install.
  log "→ 编译DCGM Exporter"
  if ! cd "$SOURCE_DIR" &>> "$LOG_FILE"; then
    error "进入源码目录失败"
  fi

  # The make targets run separately so a failure can be attributed.
  if ! make binary &>> "$LOG_FILE"; then
    error "编译DCGM Exporter失败"
  fi

  if ! make install &>> "$LOG_FILE"; then
    error "安装DCGM Exporter失败"
  fi

  # Copy the binary and the default counters into the install directory.
  run "cp -a /usr/bin/dcgm-exporter $DCGM_EXPORTER_DIR/" "复制二进制文件失败"
  run "cp -a /etc/dcgm-exporter/default-counters.csv $DCGM_EXPORTER_DIR/" "复制配置文件失败"

  # Generate the systemd unit.
  cat > "$SERVICE_FILE" <<EOF
[Unit]
Description=DCGM Exporter
After=network.target nvidia-dcgm.service

[Service]
Type=simple
User=root
ExecStart=/opt/dcgm-exporter/dcgm-exporter -f /opt/dcgm-exporter/default-counters.csv -a 0.0.0.0:9411
Restart=always
StandardOutput=file:/var/log/dcgm-exporter.log
StandardError=file:/var/log/dcgm-exporter-error.log

[Install]
WantedBy=multi-user.target
EOF
  run "chmod 644 $SERVICE_FILE" "设置服务文件权限失败"

  # Start the service.
  run "systemctl daemon-reload && systemctl enable --now dcgm-exporter.service" "启动DCGM Exporter服务失败"

  # Poll the metrics endpoint for up to 30 seconds. Returning from inside the
  # loop replaces the old `[[ $i -eq 31 ]] && error` check: the loop variable
  # ends at 30, so the timeout was never detected, and the failed test, being
  # the function's last command, made a successful install return 1 and
  # stopped the script under `set -e`.
  log "等待服务启动..."
  for attempt in {1..30}; do
    if curl -s -o /dev/null -w "%{http_code}" http://127.0.0.1:9411/metrics | grep -q "200"; then
      success "DCGM Exporter服务启动成功 (http://127.0.0.1:9411/metrics)"
      return 0
    fi
    sleep 1
  done
  error "服务启动超时,请检查日志"
}
|
||||
|
||||
# 清理Go环境
|
||||
#######################################
# Remove the Go toolchain and the build artefacts left in /tmp.
# Globals:   DCGM_EXPORTER_VERSION (read)
# Returns:   aborts through error (via run) when a removal fails
#######################################
clean_go() {
  log "清理Go环境..."
  run "rm -rf /usr/local/go" "删除Go安装目录失败"
  run "rm -f /etc/profile.d/go.sh" "删除Go环境变量配置失败"
  # The exporter archive is saved as /tmp/<version>.tar.gz and unpacked to
  # /tmp/dcgm-exporter-<version>; the old glob matched neither, so both were
  # left behind. The original patterns are kept as well.
  run "rm -rf /tmp/go*.tar.gz /tmp/dcgm-exporter*.tar.gz /tmp/${DCGM_EXPORTER_VERSION}.tar.gz /tmp/dcgm-exporter-${DCGM_EXPORTER_VERSION}" "删除临时安装包失败"
  success "Go环境清理完成"
}
|
||||
|
||||
# 主流程
|
||||
# Main flow.
log "================= DCGM Exporter安装 =================="
log "开始时间: $(date '+%Y-%m-%d %H:%M:%S')"

# Parse the single action argument. ${1:-} is required because the script
# runs under `set -u`: with no argument a bare "$1" aborted with "unbound
# variable" before the usage message could be printed.
case "${1:-}" in
  "--install")
    ACTION="install"
    ;;
  "--uninstall")
    ACTION="uninstall"
    ;;
  *)
    error "未知参数: ${1:-}\n用法: $0 [--install|--uninstall]"
    ;;
esac

# Run the selected action.
case "$ACTION" in
  "install")
    check_dcgm
    install_go
    uninstall_existing
    install_exporter
    clean_go
    ;;
  "uninstall")
    uninstall_existing
    success "卸载完成"
    ;;
esac

# Done.
log "================= 操作完成 =================="
log "日志文件: $LOG_FILE"
|
||||
|
|
@ -0,0 +1,188 @@
|
|||
#!/bin/bash
|
||||
|
||||
# ANSI escape sequences used to colour console output.
GREEN='\033[1;32m'
RED='\033[1;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset to the terminal default

# Logging helpers. Only the [LEVEL] tag is coloured; the message follows in
# the default colour. echo -e expands the escape sequences defined above.

# Print an informational message ($1) to stdout.
log_info() {
  local message=$1
  echo -e "${GREEN}[INFO]${NC} ${message}"
}

# Print an error message ($1) to stdout and terminate the script with status 1.
log_error() {
  local message=$1
  echo -e "${RED}[ERROR]${NC} ${message}"
  exit 1
}

# Print a warning message ($1) to stdout.
log_warning() {
  local message=$1
  echo -e "${YELLOW}[WARNING]${NC} ${message}"
}
|
||||
|
||||
# Defaults. ACTION, VERSION and INCLUDE_EXPORTER are overwritten by
# parse_args from the command line.
ACTION=""
VERSION=""
INCLUDE_EXPORTER="no" # exporter components are skipped unless requested

# Base URL the per-component scripts are downloaded from.
SCRIPT_REPO="http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts"
|
||||
|
||||
# Translate the combination number in VERSION into concrete component
# versions.
# Globals read:    VERSION
# Globals written: IB_VERSION, NVIDIA_VERSION, CUDA_VERSION,
#                  FABRICMANAGER_VERSION, EXPORTER_VERSION
# Any VERSION other than "1" or "2" is reported through log_error.
define_versions() {
  case "$VERSION" in
    1)
      # Combination 1: CUDA 12.6.3 + NVIDIA 565.57.01
      IB_VERSION="23.10-1.1.9.0"
      NVIDIA_VERSION="565.57.01"
      CUDA_VERSION="12.6.3_560.35.05"
      FABRICMANAGER_VERSION="565_565.57.01-1"
      EXPORTER_VERSION="1.0.0"
      ;;
    2)
      # Combination 2: CUDA 12.8.1 + NVIDIA 570.124.06
      IB_VERSION="23.10-1.1.9.0"
      NVIDIA_VERSION="570.124.06"
      CUDA_VERSION="12.8.1_570.124.06"
      FABRICMANAGER_VERSION="570_570.124.06-1"
      EXPORTER_VERSION="1.0.0"
      ;;
    *)
      log_error "不支持的版本组合: $VERSION。请选择 1 或 2"
      ;;
  esac
}
|
||||
|
||||
# Print the selected component versions and the exporter flag as a small
# coloured table framed by two separator lines.
# Globals read: IB_VERSION, NVIDIA_VERSION, CUDA_VERSION,
#               FABRICMANAGER_VERSION, INCLUDE_EXPORTER and the colour codes.
show_version_info() {
  local separator="${GREEN}========================================${NC}"
  local -a titles=("组件1: IB驱动" "组件2: NVIDIA驱动" "组件3: CUDA工具包" "组件4: FabricManager")
  local -a releases=("$IB_VERSION" "$NVIDIA_VERSION" "$CUDA_VERSION" "$FABRICMANAGER_VERSION")
  local row

  echo -e "\n${YELLOW}您当前选择的组合版本如下:${NC}"
  echo -e "$separator"
  for row in "${!titles[@]}"; do
    echo -e "${YELLOW}${titles[row]}${NC} ${GREEN}版本: ${releases[row]}${NC}"
  done
  echo -e "${YELLOW}Exporter组件:${NC} ${GREEN}状态: ${INCLUDE_EXPORTER}${NC}"
  # The trailing \n leaves one blank line after the table.
  echo -e "${separator}\n"
}
|
||||
|
||||
# Download one component script from SCRIPT_REPO and run it with bash.
# Arguments: $1  - script file name below SCRIPT_REPO
#            $2… - arguments passed on to the downloaded script
# Any failure (cd, download, empty download, non-zero exit of the script) is
# reported through log_error, which terminates the run.
# NOTE(review): the script is fetched over plain HTTP and executed as root
# without any integrity check — consider pinning a checksum or using HTTPS.
_gpu_manager_run_remote() {
  local script_name=$1
  shift
  local url="${SCRIPT_REPO}/${script_name}"
  local payload

  cd /opt/ || log_error "无法进入 /opt/ 目录"
  # Fetch first, execute second: a failed or empty download must not be
  # mistaken for a successful (empty) script run.
  payload=$(wget -qO- "$url") || log_error "下载脚本失败: ${url}"
  [[ -n "$payload" ]] || log_error "下载到的脚本为空: ${url}"
  printf '%s\n' "$payload" | bash -s -- "$@" \
    || log_error "脚本执行失败: ${script_name}"
}

# Install every component of the selected combination, in dependency order:
# system optimisation, IB driver, NVIDIA driver, CUDA, FabricManager and,
# when INCLUDE_EXPORTER is "yes", the three exporter components.
# Globals read: VERSION, SCRIPT_REPO, IB_VERSION, NVIDIA_VERSION,
#               CUDA_VERSION, FABRICMANAGER_VERSION, INCLUDE_EXPORTER
# The run stops at the first component that fails, so later components are
# never installed on top of a broken earlier one.
run_install() {
  log_info "开始执行组合$VERSION的安装流程..."

  log_info "执行系统优化..."
  _gpu_manager_run_remote system_optimize.sh

  log_info "安装IB驱动..."
  _gpu_manager_run_remote ib-drive.sh --install --version "$IB_VERSION"

  log_info "安装NVIDIA驱动..."
  _gpu_manager_run_remote nvidia-driver.sh --install --version "$NVIDIA_VERSION"

  log_info "安装CUDA工具包..."
  _gpu_manager_run_remote cuda.sh --install --version "$CUDA_VERSION"

  log_info "安装NVIDIA FabricManager..."
  _gpu_manager_run_remote nvidia-fabricmanager.sh --install --version "$FABRICMANAGER_VERSION"

  # Exporter components are optional.
  if [[ "$INCLUDE_EXPORTER" == "yes" ]]; then
    log_info "安装Exporter组件..."

    log_info "安装nvidia-dcgm..."
    _gpu_manager_run_remote nvidia-dcgm.sh --install

    log_info "安装dcgm-exporter..."
    _gpu_manager_run_remote dcgm-exporter.sh --install

    log_info "安装node-exporter..."
    _gpu_manager_run_remote node-exporter.sh --install
  else
    log_info "跳过Exporter组件的安装"
  fi

  log_info "组合$VERSION的安装已完成!"
}
|
||||
|
||||
# Uninstall the components of the selected combination. The four driver
# components are removed in the reverse of their installation order; the
# exporter components follow when INCLUDE_EXPORTER is "yes".
# Globals read: VERSION, SCRIPT_REPO, IB_VERSION, NVIDIA_VERSION,
#               CUDA_VERSION, FABRICMANAGER_VERSION, INCLUDE_EXPORTER
# Each component script is streamed from SCRIPT_REPO straight into bash; its
# exit status is not checked here.
run_uninstall() {
  # Parallel tables: display label, script name, version passed to the script.
  local -a labels=("NVIDIA FabricManager" "CUDA工具包" "NVIDIA驱动" "IB驱动")
  local -a scripts=(nvidia-fabricmanager.sh cuda.sh nvidia-driver.sh ib-drive.sh)
  local -a versions=("$FABRICMANAGER_VERSION" "$CUDA_VERSION" "$NVIDIA_VERSION" "$IB_VERSION")
  local idx component

  log_info "开始执行组合$VERSION的卸载流程..."

  for idx in "${!scripts[@]}"; do
    log_info "卸载${labels[idx]}..."
    cd /opt/ && wget -qO- "${SCRIPT_REPO}/${scripts[idx]}" | bash -s -- --uninstall --version "${versions[idx]}"
  done

  # Exporter components are only touched when they were requested.
  if [[ "$INCLUDE_EXPORTER" == "yes" ]]; then
    log_info "卸载Exporter组件..."
    for component in nvidia-dcgm dcgm-exporter node-exporter; do
      log_info "卸载${component}..."
      cd /opt/ && wget -qO- "${SCRIPT_REPO}/${component}.sh" | bash -s -- --uninstall
    done
  else
    log_info "跳过Exporter组件的卸载"
  fi

  log_info "组合$VERSION的卸载已完成!"
}
|
||||
|
||||
# Parse the command line.
# Recognised: --install | --uninstall | --version <n> | --include=exporter
# Globals written: ACTION, VERSION, INCLUDE_EXPORTER
# Unknown arguments, a missing action or a missing version are reported
# through log_error.
parse_args() {
  local flag

  while (( $# > 0 )); do
    flag=$1
    if [[ "$flag" == "--install" ]]; then
      ACTION="install"
    elif [[ "$flag" == "--uninstall" ]]; then
      ACTION="uninstall"
    elif [[ "$flag" == "--version" ]]; then
      VERSION="$2"
      # Consume the value here; the shared shift below consumes the flag.
      shift
    elif [[ "$flag" == "--include=exporter" ]]; then
      INCLUDE_EXPORTER="yes"
    else
      log_error "未知参数: $1"
    fi
    shift
  done

  if [[ -z "$ACTION" ]]; then
    log_error "请指定操作: --install 或 --uninstall"
  fi

  if [[ -z "$VERSION" ]]; then
    log_error "请指定版本组合: --version 1 或 --version 2"
  fi
}
|
||||
|
||||
# Entry point: require root, truncate /opt/gpu-manager.log, parse the command
# line, resolve and display the component versions, then install or uninstall.
main() {
  # Root check.
  if (( EUID != 0 )); then
    log_error "此脚本需要root权限运行,请使用sudo执行"
  fi

  # Create or empty the log file.
  : > /opt/gpu-manager.log

  parse_args "$@"
  define_versions
  show_version_info

  # parse_args only leaves "install" or "uninstall" in ACTION; everything
  # that is not "install" takes the uninstall path.
  case "$ACTION" in
    install) run_install ;;
    *) run_uninstall ;;
  esac
}

# Run the entry point with the script's arguments.
main "$@"
|
||||
|
|
@ -0,0 +1,260 @@
|
|||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# ANSI escape sequences used to colour console output.
GREEN='\033[1;32m'
RED='\033[1;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset to the terminal default

# Logging helpers. The whole line, tag and message, is printed in the level's
# colour; echo -e expands the escape sequences defined above.

# Print an informational message ($1) to stdout.
log_info() {
  local message=$1
  echo -e "${GREEN}[INFO] ${message}${NC}"
}

# Print an error message ($1) to stdout and terminate the script with status 1.
log_error() {
  local message=$1
  echo -e "${RED}[ERROR] ${message}${NC}"
  exit 1
}

# Print a warning message ($1) to stdout.
log_warning() {
  local message=$1
  echo -e "${YELLOW}[WARNING] ${message}${NC}"
}
|
||||
|
||||
# Defaults. ACTION, DRIVER_VERSION and FORCE are overwritten by parse_args.
ACTION=""
FORCE=0

# Components of the MLNX_OFED package name.
DRIVER_VERSION="5.8-6.0.4.2"
DISTRO="ubuntu22.04"
ARCH="x86_64"
|
||||
|
||||
# Derive the package file name, its local paths and its download URLs from
# DRIVER_VERSION, DISTRO and ARCH.
# Globals written: DRIVER_PACKAGE, PACKAGE_PATH, DRIVER_DIR, INTERNAL_URL,
#                  OFFICIAL_URL
generate_package_info() {
  local stem="MLNX_OFED_LINUX-${DRIVER_VERSION}-${DISTRO}-${ARCH}"

  DRIVER_PACKAGE="${stem}.tgz"
  PACKAGE_PATH="/opt/${DRIVER_PACKAGE}"
  # Directory the tarball unpacks to.
  DRIVER_DIR="/opt/${stem}"
  INTERNAL_URL="http://10.101.0.51:5588/mlnx-ofed/${DRIVER_PACKAGE}"
  # Placeholder host: replace with the vendor's real download address.
  OFFICIAL_URL="https://example.com/${DRIVER_PACKAGE}"
}
|
||||
|
||||
# Parse the command line.
# Recognised: --install | --uninstall | --version <ver> | --force
# Globals written: ACTION, DRIVER_VERSION, FORCE; the package paths are
# regenerated through generate_package_info whenever --version is given.
# Unknown arguments, an empty version or a missing action are reported
# through log_error.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --install | --uninstall)
        # Strip the leading dashes: ACTION becomes "install" or "uninstall".
        ACTION="${1#--}"
        ;;
      --force)
        FORCE=1
        ;;
      --version)
        if [[ -z "$2" ]]; then
          log_error "请指定版本号,如: --version 5.8-6.0.4.2"
        fi
        DRIVER_VERSION="$2"
        generate_package_info
        # Consume the value here; the shared shift below consumes the flag.
        shift
        ;;
      *)
        log_error "未知参数: $1"
        ;;
    esac
    shift
  done

  if [[ -z "$ACTION" ]]; then
    log_error "请指定操作: --install 或 --uninstall"
  fi
}
|
||||
|
||||
# Make sure the driver tarball exists at PACKAGE_PATH.
# A non-empty local file is used as is. Otherwise the package is fetched from
# INTERNAL_URL and, if that fails, from OFFICIAL_URL.
# Globals read: DRIVER_PACKAGE, PACKAGE_PATH, INTERNAL_URL, OFFICIAL_URL
# The download goes to "<PACKAGE_PATH>.part" and is renamed only after wget
# succeeds: wget -O creates its output file even when the transfer fails, and
# such a leftover must never be mistaken for a usable local package.
# When both sources fail the partial file is removed and log_error is called.
download_driver() {
  local partial="${PACKAGE_PATH}.part"

  log_info "开始下载驱动包: $DRIVER_PACKAGE"

  # -s instead of -f: a zero-byte file is never a valid tarball.
  if [[ -s "$PACKAGE_PATH" ]]; then
    log_info "使用本地驱动包: $PACKAGE_PATH"
    return 0
  fi

  log_info "本地包不存在,尝试从内网下载"
  if wget -q -O "$partial" "$INTERNAL_URL"; then
    log_info "内网下载成功"
  else
    log_warning "内网下载失败,尝试从官网下载"
    if wget -q -O "$partial" "$OFFICIAL_URL"; then
      log_info "官网下载成功"
    else
      rm -f -- "$partial"
      log_error "驱动包下载失败,请手动放置到 /opt/"
    fi
  fi

  mv -f -- "$partial" "$PACKAGE_PATH"
}
|
||||
|
||||
# Install the MLNX_OFED driver from the tarball at PACKAGE_PATH.
# Globals read:    DRIVER_VERSION, PACKAGE_PATH, DRIVER_DIR
# Globals written: kernel_version
# Command output is appended to /tmp/mlnx_install.log.
install_driver() {
  local install_log=/tmp/mlnx_install.log
  local -a installer_opts

  log_info "开始安装驱动: $DRIVER_VERSION"

  # An "already installed" guard (ibv_devinfo present and FORCE=0 -> warn and
  # exit 0) existed here as commented-out code; it is disabled, so the
  # installation always proceeds.

  kernel_version=$(uname -r)
  log_info "当前内核版本: $kernel_version"

  log_info "安装依赖包"
  apt update &>> "$install_log"
  apt install -y net-tools bzip2 &>> "$install_log"

  log_info "解压驱动包"
  tar -zxf "$PACKAGE_PATH" -C /opt/

  log_info "执行驱动安装"
  cd "$DRIVER_DIR"
  installer_opts=(
    --without-dkms
    --add-kernel-support
    --kernel "$kernel_version"
    --with-fw-update
    --force
  )
  ./mlnxofedinstall "${installer_opts[@]}" &>> "$install_log"
  sleep 10
}
|
||||
|
||||
# Uninstall the MLNX_OFED driver and undo the naming/modprobe configuration.
# Globals read: DRIVER_VERSION, DRIVER_DIR, PACKAGE_PATH
# Steps: make sure the unpacked driver directory exists (re-downloading and
# unpacking it if necessary, because the vendor's uninstall.sh lives there),
# run uninstall.sh, delete the directory and tarball, stop and disable
# openibd, strip all non-comment lines from the two udev rule files, remove
# the nvidia modprobe option file and rebuild the initramfs.
# Command output is appended to /tmp/mlnx_install.log.
uninstall_driver() {
  local uninstall_log=/tmp/mlnx_install.log
  local rules_file

  log_info "开始卸载驱动: $DRIVER_VERSION"

  # The driver directory is needed for uninstall.sh; recreate it if missing.
  if [[ ! -d "$DRIVER_DIR" ]]; then
    log_warning "驱动目录不存在,尝试重新下载和解压"
    download_driver
    log_info "解压驱动包"
    tar -zxf "$PACKAGE_PATH" -C /opt/
    if [[ ! -d "$DRIVER_DIR" ]]; then
      log_error "解压失败,无法找到驱动目录: $DRIVER_DIR"
    else
      log_info "成功解压驱动包到: $DRIVER_DIR"
    fi
  else
    log_info "找到驱动目录: $DRIVER_DIR"
  fi

  cd "$DRIVER_DIR"
  log_info "执行卸载脚本"
  ./uninstall.sh -q -y &>> "$uninstall_log" || log_warning "卸载脚本执行失败,尝试手动清理"

  log_info "清理残留文件"
  # :? aborts instead of expanding to an empty path.
  rm -rf -- "${DRIVER_DIR:?}" "${PACKAGE_PATH:?}"

  log_info "停止并禁用openibd服务"
  systemctl stop openibd.service &>> "$uninstall_log" || true
  systemctl disable openibd.service &>> "$uninstall_log" || true

  log_info "恢复网卡命名规则"
  # A rules file may not exist on a host that was never configured. Under
  # set -e an unguarded sed failure would abort here and skip the modprobe
  # and initramfs cleanup below, so missing files are skipped and sed errors
  # are tolerated, as configure_naming_rules already does.
  for rules_file in \
    /etc/udev/rules.d/70-persistent-ipoib.rules \
    /etc/udev/rules.d/70-persistent-net.rules; do
    if [[ -f "$rules_file" ]]; then
      sed -i '/^\s*#/!d' "$rules_file" &>> "$uninstall_log" || true
    fi
  done

  rm -f /etc/modprobe.d/nvidia-gsp.conf
  update-initramfs -u &>> "$uninstall_log"
}
|
||||
|
||||
# Write udev rules that give the IB devices fixed names, set the nvidia
# module option, rebuild the initramfs and restart openibd.
# Devices are taken from the first column of `ibdev2netdev -v`, skipping
# every line that contains "200G" or "25G".
# Globals written: ID, IDS, i, j
# Output of the backup/cleanup/initramfs commands is appended to
# /tmp/mlnx_install.log.
configure_naming_rules() {
  local ipoib_rules=/etc/udev/rules.d/70-persistent-ipoib.rules
  local net_rules=/etc/udev/rules.d/70-persistent-net.rules
  local config_log=/tmp/mlnx_install.log

  log_info "配置IB网卡命名规则"

  # Best effort: the rule files may not exist yet.
  log_info "备份原有规则"
  cp "$ipoib_rules" "${ipoib_rules}.bak" &>> "$config_log" || true
  cp "$net_rules" "${net_rules}.bak" &>> "$config_log" || true

  # Keep only comment lines of the existing rule files.
  log_info "清除原有规则"
  sed -i '/^\s*#/!d' "$ipoib_rules" &>> "$config_log" || true
  sed -i '/^\s*#/!d' "$net_rules" &>> "$config_log" || true

  # RDMA device names are numbered mlx5_20, mlx5_21, ...
  log_info "生成IB设备命名规则"
  ID=20
  for i in $(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}'); do
    [[ -n "$i" ]] || continue
    # %%k emits the literal udev placeholder %k.
    printf 'ACTION=="add", KERNELS=="%s", SUBSYSTEM=="infiniband",PROGRAM="rdma_rename %%k NAME_FIXED mlx5_%s"\n' \
      "$i" "$ID" >> "$ipoib_rules"
    ID=$((ID+1))
  done

  # Network interface names are numbered ib0, ib1, ...
  log_info "生成网络设备命名规则"
  IDS=0
  for j in $(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}'); do
    [[ -n "$j" ]] || continue
    printf 'SUBSYSTEM=="net", ACTION=="add", KERNELS=="%s", NAME="ib%s"\n' \
      "$j" "$IDS" >> "$net_rules"
    IDS=$((IDS+1))
  done

  log_info "配置nvidia选项"
  echo "options nvidia NVreg_EnableGpuFirmware=0" > /etc/modprobe.d/nvidia-gsp.conf
  update-initramfs -u &>> "$config_log"

  log_info "重启openibd服务"
  systemctl restart openibd.service
  sleep 15
}
|
||||
|
||||
# Verify the installation: ibv_devinfo must be available, and the devices
# listed by `ibdev2netdev -v` (lines containing "200G" or "25G" skipped) are
# checked for the expected names.
# A device counts as correctly named when udevadm reports a name containing
# "mlx5_" or `ip link show` prints a line matching "ib[0-9]".
# Globals written: valid_count, dev, mlx_name, net_name
check_installation() {
  log_info "检查驱动安装结果"
  if ! command -v ibv_devinfo &> /dev/null; then
    log_error "驱动安装失败"
  else
    log_info "驱动安装成功"
  fi

  log_info "检查网卡命名规则"
  valid_count=0
  for dev in $(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}'); do
    [[ -n "$dev" ]] || continue

    # "|| true" keeps a non-matching grep from aborting the script (set -e).
    mlx_name=$(udevadm info -q name -n "$dev" 2>/dev/null | grep "mlx5_" || true)
    net_name=$(ip link show "$dev" | grep "ib[0-9]" || true)

    if [[ -z "$mlx_name" && -z "$net_name" ]]; then
      log_warning "网卡 $dev 命名规则未生效"
    else
      valid_count=$((valid_count+1))
    fi
  done

  if (( valid_count > 0 )); then
    log_info "网卡命名规则生效,成功配置 $valid_count 个网卡"
  else
    log_warning "所有网卡命名规则均未生效,建议重启系统手工配置!"
  fi
}
|
||||
|
||||
# Report whether the uninstall left anything behind: the ibv_devinfo command
# and the unpacked driver directory (DRIVER_DIR). Leftovers only produce
# warnings; nothing is removed here.
check_uninstallation() {
  log_info "检查卸载结果"

  if command -v ibv_devinfo &> /dev/null; then
    log_warning "驱动命令仍存在,可能需要手动清理"
  else
    log_info "驱动已成功卸载"
  fi

  if [[ -d "$DRIVER_DIR" ]]; then
    log_warning "驱动目录未完全删除: $DRIVER_DIR"
  else
    log_info "驱动目录已删除"
  fi
}
|
||||
|
||||
# Entry point: compute the default package paths, parse the command line and
# run the install or uninstall sequence.
main() {
  # Defaults first; parse_args regenerates them when --version is given.
  generate_package_info
  parse_args "$@"

  log_info "开始执行 ${ACTION^^} 操作,版本: $DRIVER_VERSION"

  if [[ "$ACTION" == "install" ]]; then
    download_driver
    install_driver
    configure_naming_rules
    check_installation
  elif [[ "$ACTION" == "uninstall" ]]; then
    uninstall_driver
    check_uninstallation
  fi

  log_info "操作完成!"
}

# Run the entry point with the script's arguments.
main "$@"
|
||||
|
|
@ -0,0 +1,180 @@
|
|||
|
||||
#!/bin/bash
|
||||
##############################################################################################################################
|
||||
#脚本功能:
|
||||
#1.口令定期更换策略设置为90天,最小密码长度为8位,密码过期警告提前7天。
|
||||
#2.口令复杂度设置:密码长度至少为12位,包含至少四种字符类型(大写字母、小写字母、数字、特殊字符)。
|
||||
#3.登录失败处理策略设置:登录失败次数为5次,锁定时间为10分钟。
|
||||
#4.登录连接超时默认配置设置:登录连接超时时间为10分钟。
|
||||
#5.日志本地保存时间设置为6个月。
|
||||
#6.禁止root ssh远程登录
|
||||
#7.启动日志与审计服务rsyslog和auditd
|
||||
#8.sshd开启PAM认证
|
||||
#9.安装系统工具
|
||||
##############################################################################################################################
|
||||
# Password-ageing policy, each value written exactly as the line should
# appear in /etc/login.defs ("KEY value").
LOGIN_DEFS_POLICY_MAX_DAYS="PASS_MAX_DAYS 90"
LOGIN_DEFS_POLICY_MIN_DAYS="PASS_MIN_DAYS 0"
LOGIN_DEFS_POLICY_MIN_LEN="PASS_MIN_LEN 8"
LOGIN_DEFS_POLICY_WARN_AGE="PASS_WARN_AGE 7"

echo "正在编辑 /etc/login.defs 文件..."

# Rewrite every key that is already active (uncommented) in the file.
# Keys that are absent are left alone; nothing is appended.
for login_defs_line in \
  "$LOGIN_DEFS_POLICY_MAX_DAYS" \
  "$LOGIN_DEFS_POLICY_MIN_DAYS" \
  "$LOGIN_DEFS_POLICY_MIN_LEN" \
  "$LOGIN_DEFS_POLICY_WARN_AGE"; do
  # The key is everything before the first space.
  login_defs_key=${login_defs_line%% *}
  if grep -q "^${login_defs_key}" /etc/login.defs; then
    sed -i "s/^${login_defs_key}.*/${login_defs_line}/" /etc/login.defs
  fi
done
|
||||
|
||||
# Password-complexity policy, each value written exactly as the line should
# appear in /etc/security/pwquality.conf.
PWQUALITY_POLICY_MINLEN="minlen = 12"
PWQUALITY_POLICY_MINCLASS="minclass = 4"
PWQUALITY_POLICY_DCREDIT="dcredit = -1"
PWQUALITY_POLICY_UCREDIT="ucredit = -1"
PWQUALITY_POLICY_LCREDIT="lcredit = -1"
PWQUALITY_POLICY_OCREDIT="ocredit = -1"
PWQUALITY_POLICY_FOR_ROOT="enforce_for_root"
PWQUALITY_POLICY_DIFOK="difok = 5"

echo "正在编辑 /etc/security/pwquality.conf 文件配置文件口令复杂度"

# For each setting: replace the active line if there is one, otherwise
# replace the commented-out "# key ..." template line. A key that appears in
# neither form is left alone; nothing is appended.
for pwq_setting in \
  "$PWQUALITY_POLICY_MINLEN" \
  "$PWQUALITY_POLICY_MINCLASS" \
  "$PWQUALITY_POLICY_DCREDIT" \
  "$PWQUALITY_POLICY_UCREDIT" \
  "$PWQUALITY_POLICY_LCREDIT" \
  "$PWQUALITY_POLICY_OCREDIT" \
  "$PWQUALITY_POLICY_FOR_ROOT" \
  "$PWQUALITY_POLICY_DIFOK"; do
  # The key is everything before the first space.
  pwq_key=${pwq_setting%% *}

  if [[ "$pwq_key" == "enforce_for_root" ]]; then
    # Flag without a value: an active line is already correct, and only the
    # "# enforce_for_root" prefix of a commented line is replaced.
    if grep -q "^enforce_for_root" /etc/security/pwquality.conf; then
      :
    elif grep -q "^# enforce_for_root" /etc/security/pwquality.conf; then
      sed -i "s/^# enforce_for_root/${pwq_setting}/" /etc/security/pwquality.conf
    fi
    continue
  fi

  if grep -q "^${pwq_key}" /etc/security/pwquality.conf; then
    sed -i "s/^${pwq_key}.*/${pwq_setting}/" /etc/security/pwquality.conf
  elif grep -q "^# ${pwq_key}" /etc/security/pwquality.conf; then
    sed -i "s/^# ${pwq_key}.*/${pwq_setting}/" /etc/security/pwquality.conf
  fi
done
|
||||
|
||||
# Make sure the "password requisite pam_pwquality.so ..." line of a PAM file
# carries try_first_pass and retry=5.
# Arguments: $1 - PAM configuration file, edited in place
# Nothing is changed when the file has no such line. Otherwise, in order:
#   1. try_first_pass is prepended to the options unless already present;
#   2. retry=5 is prepended unless some retry=<n> is already present;
#   3. an existing retry=<n> is rewritten to retry=5.
# grep -E replaces the obsolescent egrep, which newer GNU grep releases warn
# about on every call.
ensure_pwquality_pam_options() {
  local pam_file=$1

  grep -Eq '^\s*password\s+requisite\s+pam_pwquality.so\s+' "$pam_file" \
    && sed -ri '/^\s*password\s+requisite\s+pam_pwquality.so\s+/ { /^\s*password\s+requisite\s+pam_pwquality.so(\s+\S+)*(\s+try_first_pass)(\s+.*)?$/! s/^(\s*password\s+requisite\s+pam_pwquality.so\s+)(.*)$/\1try_first_pass \2/ }' "$pam_file" \
    && sed -ri '/^\s*password\s+requisite\s+pam_pwquality.so\s+/ { /^\s*password\s+requisite\s+pam_pwquality.so(\s+\S+)*(\s+retry=[0-9]+)(\s+.*)?$/! s/^(\s*password\s+requisite\s+pam_pwquality.so\s+)(.*)$/\1retry=5 \2/ }' "$pam_file" \
    && sed -ri 's/(^\s*password\s+requisite\s+pam_pwquality.so(\s+\S+)*\s+)retry=[0-9]+(\s+.*)?$/\1retry=5\3/' "$pam_file"
}

# Update system-auth and password-auth.
ensure_pwquality_pam_options /etc/pam.d/system-auth
sleep 2s
ensure_pwquality_pam_options /etc/pam.d/password-auth
|
||||
|
||||
# Login-failure handling: deny=5 and unlock_time=600 via pam_faillock, applied
# to root as well (even_deny_root).
echo "正在编辑 /etc/pam.d/password-auth 文件配置密码验证失败处理策略"

# Line that marks a file as already configured.
faillock_marker='^auth\s*required\s*pam_faillock.so\s*authfail\s*even_deny_root\s*deny=5\s*unlock_time=600\s*$'
# sed command inserting the preauth and authfail lines before the pam_env.so
# auth line.
faillock_insert='/^auth.*pam_env.so$/i auth required pam_faillock.so preauth silent even_deny_root deny=5 unlock_time=600\nauth required pam_faillock.so authfail even_deny_root deny=5 unlock_time=600'

# Both PAM stacks get the same treatment; a file that already contains the
# authfail line is left untouched so the lines are not inserted twice.
for faillock_file in /etc/pam.d/system-auth /etc/pam.d/password-auth; do
  if ! grep -Pq "$faillock_marker" "$faillock_file"; then
    sed -ri "$faillock_insert" "$faillock_file"
  fi
done
|
||||
|
||||
|
||||
# Idle-session timeout: TMOUT=600 for all users through /etc/profile.
echo "正在编辑 /etc/profile 文件配置终端超时自动登出设置要求针对所有用户,自动登退时间为600s"

# Append the export when none exists yet, otherwise rewrite the existing one.
if ! grep -q "^export TMOUT" /etc/profile; then
  echo "export TMOUT=600" >> /etc/profile
else
  sed -i "s/^export TMOUT.*/export TMOUT=600/" /etc/profile
fi
|
||||
|
||||
# Log retention: set the "rotate" directive in /etc/logrotate.conf to 26.
# NOTE(review): 26 corresponds to about six months only with weekly
# rotation — confirm the rotation interval configured in the file.
echo "正在编辑 /etc/logrotate.conf 文件设置日志本地保存时间6个月"

if grep -q "^rotate" /etc/logrotate.conf; then
  # Anchored to the start of the line: an unanchored "rotate.*" also matches
  # inside comments and inside the word "logrotate" and would rewrite those
  # lines too.
  sed -i "s/^rotate.*/rotate 26/" /etc/logrotate.conf
fi
|
||||
|
||||
# sshd hardening: forbid root logins, enable public-key and PAM
# authentication, disable reverse DNS lookups, then restart sshd.
echo "正在编辑 /etc/ssh/sshd_config 文件禁止root ssh远程登录"

# PermitRootLogin: rewrite an active line, otherwise append the setting.
if grep -q "^PermitRootLogin" /etc/ssh/sshd_config; then
  sed -i "s/^PermitRootLogin.*/PermitRootLogin no/" /etc/ssh/sshd_config
else
  echo "PermitRootLogin no" >> /etc/ssh/sshd_config
fi

# PubkeyAuthentication, UseDNS, UsePAM: rewrite an active line, otherwise
# activate the commented-out "#Key ..." line. Nothing is appended when the
# key appears in neither form.
for sshd_setting in "PubkeyAuthentication yes" "UseDNS no" "UsePAM yes"; do
  # The key is everything before the first space.
  sshd_key=${sshd_setting%% *}
  if grep -q "^${sshd_key}" /etc/ssh/sshd_config; then
    sed -i "s/^${sshd_key}.*/${sshd_setting}/" /etc/ssh/sshd_config
  else
    sed -i "s/^#${sshd_key}.*/${sshd_setting}/" /etc/ssh/sshd_config
  fi
done

# Restart sshd to apply the configuration, but only when public-key
# authentication ended up enabled in the file.
echo "正在重启sshd服务....."
if grep -Pq '^PubkeyAuthentication yes' /etc/ssh/sshd_config; then
  systemctl restart sshd
fi
sleep 2s
|
||||
# Start and enable the logging (rsyslog) and audit (auditd) services, then
# print the operator reminders.
echo "正在启动rsyslog和auditd服务"

systemctl restart rsyslog.service
# A service is enabled at boot only if it could be started.
if systemctl start rsyslog.service; then
  systemctl enable rsyslog.service
fi
sleep 2s
if systemctl start auditd.service; then
  systemctl enable auditd.service
fi

echo "请自行修改操作系统默认密码。并做好密码保存。"
echo "已禁止root ssh远程登录,请使用scloudadmin账号登录,如无法登录请通过ipmi远程控制登录"
|
||||
|
||||
# 9. Install the system tools, one dnf transaction per package so that a
# failing package does not prevent the others from being installed.
echo "安装sysstat ipmitool vim pciutils net-tools工具包"

for tool_package in \
  sysstat.x86_64 \
  ipmitool.x86_64 \
  vim \
  pciutils.x86_64 \
  net-tools.x86_64; do
  dnf -y install "$tool_package"
done

echo "所有操作已完成。"
|
||||
|
|
@ -0,0 +1,177 @@
|
|||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
# Release to install; both download URLs are derived from it.
NODE_EXPORTER_VERSION="1.8.2"

# Per-run log file, timestamped at script start.
LOG_FILE="/var/log/node_exporter_$(date +%Y%m%d%H%M%S).log"

# Download sources in order of preference, and the path a pre-placed tarball
# is looked for at.
PRIMARY_DOWNLOAD_URL="http://10.101.0.51:5588/node-exporter/node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64.tar.gz"
BACKUP_DOWNLOAD_URL="https://github.com/prometheus/node_exporter/releases/download/v${NODE_EXPORTER_VERSION}/node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64.tar.gz"
LOCAL_PACKAGE_PATH="/opt/node_exporter.tar.gz"

# ANSI escape sequences used to colour console output.
GREEN='\033[1;32m'
RED='\033[1;31m'
NC='\033[0m' # reset to the terminal default
|
||||
|
||||
# Append a timestamped line made of all arguments to LOG_FILE.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}

# Announce a step: "==> <message>" in green on the console, and the same
# text (uncoloured) in the log file.
step() {
  echo -e "${GREEN}==> $1${NC}"
  log "==> $1"
}

# Report an error in red on the console and in the log file, then terminate
# the script with status 1.
error() {
  echo -e "${RED}错误: $1${NC}"
  log "错误: $1"
  exit 1
}
|
||||
|
||||
# Run a command line and keep its output off the console.
# Arguments: $1 - command line, evaluated in the current shell (so "cd ..."
#                 changes the caller's working directory)
# The command is announced through step; its stdout and stderr are appended
# to LOG_FILE. A non-zero status is reported through error, which terminates
# the script.
# NOTE: eval executes the string as shell code — pass trusted, script-internal
# command lines only.
run_cmd() {
  step "执行: $1"
  if ! eval "$1" &>> "$LOG_FILE"; then
    error "命令执行失败: $1"
  fi
}
|
||||
|
||||
# Check whether a URL is reachable.
# Arguments: $1 - URL to probe
# Returns:   0 when the curl header request succeeds, 1 otherwise
# The request follows redirects, fails on HTTP errors and gives up connecting
# after 10 seconds; curl's output is appended to LOG_FILE.
test_network_connectivity() {
  local url=$1

  step "测试网络连通性: $url"
  curl -fsSLI --connect-timeout 10 "$url" &>> "$LOG_FILE" || return 1
  return 0
}
|
||||
|
||||
# Download a file with wget.
# Arguments: $1 - source URL
#            $2 - destination path
# Returns:   0 when wget succeeds, 1 otherwise
# wget's output is appended to LOG_FILE.
download_file() {
  local url=$1
  local dest=$2

  step "下载文件: $url 到 $dest"
  wget -qO "$dest" "$url" &>> "$LOG_FILE" || return 1
  return 0
}
|
||||
|
||||
# Install node_exporter under /opt/node_exporter and run it as a systemd
# service listening on port 10086.
# Globals read:    NODE_EXPORTER_VERSION, LOCAL_PACKAGE_PATH,
#                  PRIMARY_DOWNLOAD_URL, BACKUP_DOWNLOAD_URL
# Globals written: DOWNLOAD_URL
# A tarball already present at LOCAL_PACKAGE_PATH is used as is; otherwise
# the first reachable download URL is fetched. Every failing step ends the
# script through error (directly or via run_cmd).
install_node_exporter() {
  step "开始安装 node_exporter ${NODE_EXPORTER_VERSION}"

  # All relative paths below are relative to /opt.
  run_cmd "cd /opt"

  if [[ -f "$LOCAL_PACKAGE_PATH" ]]; then
    step "发现本地安装包: $LOCAL_PACKAGE_PATH"
    DOWNLOAD_URL="$LOCAL_PACKAGE_PATH"
  else
    # Pick the first download source that answers.
    if test_network_connectivity "$PRIMARY_DOWNLOAD_URL"; then
      DOWNLOAD_URL="$PRIMARY_DOWNLOAD_URL"
    elif test_network_connectivity "$BACKUP_DOWNLOAD_URL"; then
      DOWNLOAD_URL="$BACKUP_DOWNLOAD_URL"
    else
      error "无法连接到任何下载地址"
    fi

    # Report a failed download explicitly; a bare non-zero return would make
    # the script exit under set -e without any message.
    download_file "$DOWNLOAD_URL" "node_exporter.tar.gz" \
      || error "下载失败: $DOWNLOAD_URL"
  fi

  run_cmd "tar -zxvf node_exporter.tar.gz"

  # Remove a previous installation first: with an existing target directory
  # mv would nest the new release inside it and "mkdir bin" would then fail.
  run_cmd "rm -rf /opt/node_exporter"
  run_cmd "mv node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64/ ./node_exporter"

  # Move the binary into node_exporter/bin.
  run_cmd "cd node_exporter/"
  run_cmd "mkdir bin"
  run_cmd "mv node_exporter bin/"

  # systemd unit.
  cat > /lib/systemd/system/node_exporter.service <<EOF
[Unit]
Description=node_exporter
Documentation=https://prometheus.io/docs/guides/node-exporter/
After=network.target

[Service]
User=root
Group=root
ExecStart=/opt/node_exporter/bin/node_exporter --web.listen-address=:10086
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF

  run_cmd "systemctl daemon-reload"
  run_cmd "systemctl enable node_exporter"
  run_cmd "systemctl restart node_exporter"
  run_cmd "systemctl status node_exporter"

  # Probe the metrics endpoint on the loopback address. The service may need
  # a moment before it accepts connections, so refused connections are
  # retried a few times instead of failing the installation immediately.
  run_cmd "curl -I --retry 5 --retry-delay 1 --retry-connrefused http://127.0.0.1:10086/metrics"

  step "node_exporter 安装成功"
}
|
||||
|
||||
# Remove the node_exporter service, its unit file, its installation
# directory and the downloaded tarball.
# Stopping and disabling are best effort ("|| true"): on a host where the
# unit does not exist or is already stopped, systemctl returns non-zero, and
# that must not abort the rest of the cleanup.
uninstall_node_exporter() {
  step "开始卸载 node_exporter"

  # Stop and disable the service if it is there.
  run_cmd "systemctl stop node_exporter || true"
  run_cmd "systemctl disable node_exporter || true"

  # Drop the unit file and make systemd forget it.
  run_cmd "rm -f /lib/systemd/system/node_exporter.service"
  run_cmd "systemctl daemon-reload"

  # Remove the installed files.
  run_cmd "rm -rf /opt/node_exporter"
  run_cmd "rm -f /opt/node_exporter.tar.gz"

  step "node_exporter 卸载完成"
}
|
||||
|
||||
# Argument handling: exactly one argument is accepted, --install or
# --uninstall.
(( $# == 1 )) || error "请使用 --install 或 --uninstall"

ACTION=$1
if [[ "$ACTION" == "--install" ]]; then
  install_node_exporter
elif [[ "$ACTION" == "--uninstall" ]]; then
  uninstall_node_exporter
else
  error "无效的参数,请使用 --install 或 --uninstall"
fi

step "操作完成,日志路径: $LOG_FILE"
|
||||
|
|
@ -0,0 +1,164 @@
|
|||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
# Per-run log file, timestamped at script start.
LOG_FILE="/var/log/nvidia-dcgm_$(date +%Y%m%d%H%M%S).log"

# ANSI escape sequences used to colour console output.
GREEN='\033[1;32m'
RED='\033[1;31m'
NC='\033[0m' # reset to the terminal default
|
||||
|
||||
# Append a timestamped line made of all arguments to LOG_FILE.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}

# Announce a step: "==> <message>" in green on the console, and the same
# text (uncoloured) in the log file.
step() {
  echo -e "${GREEN}==> $1${NC}"
  log "==> $1"
}

# Report an error in red on the console and in the log file, then terminate
# the script with status 1.
error() {
  echo -e "${RED}错误: $1${NC}"
  log "错误: $1"
  exit 1
}
|
||||
|
||||
# Run a command line and keep its output off the console.
# Arguments: $1 - command line, evaluated in the current shell
# The command is announced through step; its stdout and stderr are appended
# to LOG_FILE. A non-zero status is reported through error, which terminates
# the script.
# NOTE: eval executes the string as shell code — pass trusted, script-internal
# command lines only.
run_cmd() {
  step "执行: $1"
  if ! eval "$1" &>> "$LOG_FILE"; then
    error "命令执行失败: $1"
  fi
}
|
||||
|
||||
# Detect the distribution from an os-release file.
# Arguments: $1 - os-release file to read (optional, default /etc/os-release)
# Globals written: OS_ID, OS_VERSION — plus every variable the os-release
#                  file itself defines, because the file is sourced.
# ID or VERSION_ID missing from the file yield empty strings. Under set -u a
# bare $VERSION_ID would otherwise stop the script with "unbound variable"
# instead of letting the caller print its "unsupported system" message.
# A missing file is reported through error.
detect_os_version() {
  local os_release_file=${1:-/etc/os-release}

  if [[ -f "$os_release_file" ]]; then
    # shellcheck source=/dev/null
    . "$os_release_file"
    OS_ID=${ID:-}
    OS_VERSION=${VERSION_ID:-}
    step "检测到系统: ${OS_ID} ${OS_VERSION}"
  else
    error "无法检测到操作系统版本"
  fi
}
|
||||
|
||||
# Remove a previous DCGM setup before installing: the CUDA apt source list,
# the datacenter-gpu-manager package, and apt's unused dependencies and
# cached package files. The apt steps are best effort ("|| true").
cleanup_dcgm_config() {
  local -a cleanup_cmds=(
    "rm -f /etc/apt/sources.list.d/cuda-ubuntu2004-x86_64.list"
    "apt-get remove -y datacenter-gpu-manager || true"
    "apt-get autoremove -y || true"
    "apt-get autoclean -y || true"
  )
  local cmd

  step "清理现有的 DCGM 配置"
  for cmd in "${cleanup_cmds[@]}"; do
    run_cmd "$cmd"
  done
}
|
||||
|
||||
# Download the cuda-keyring package into the current directory, install it
# with dpkg and delete the downloaded file again.
add_cuda_keyring() {
  local keyring_deb="cuda-keyring_1.0-1_all.deb"
  local repo_url="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64"

  step "添加 CUDA 仓库密钥"
  run_cmd "wget ${repo_url}/${keyring_deb}"
  run_cmd "dpkg -i ${keyring_deb}"
  run_cmd "rm ${keyring_deb}"
}
|
||||
|
||||
# Refresh the apt package index.
update_package_list() {
  local refresh_cmd="apt-get update"

  step "更新包列表"
  run_cmd "$refresh_cmd"
}
|
||||
|
||||
# Install NVIDIA DCGM on Ubuntu 22.04 and start its service.
# Globals written: CUDA_VERSION (major CUDA version reported by nvidia-smi)
# Steps: clean up a previous setup, install the CUDA keyring, add the CUDA
# apt repository, refresh the index, install the DCGM 4 package matching the
# CUDA major version, enable and start nvidia-dcgm, and list the GPUs with
# dcgmi as a smoke test.
# NOTE(review): keyring and repository point at the ubuntu2004 tree although
# this path runs on 22.04 — confirm that is intended.
install_dcgm_for_ubuntu_22() {
  step "开始安装 DCGM for Ubuntu 22"

  cleanup_dcgm_config
  add_cuda_keyring

  # Add the CUDA repository.
  run_cmd 'add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" -y'

  update_package_list

  # Major CUDA version, e.g. "12" from "CUDA Version: 12.7".
  # "|| true": with set -e and pipefail a failing or missing nvidia-smi would
  # otherwise end the script right here, before the explicit message below.
  CUDA_VERSION=$(nvidia-smi | sed -E -n 's/.*CUDA Version: ([0-9]+)[.].*/\1/p') || true
  if [[ -z "$CUDA_VERSION" ]]; then
    error "无法检测到 CUDA 版本"
  fi

  # apt-get rather than apt: apt's command-line interface is not meant for
  # scripts, and the rest of this script already uses apt-get.
  run_cmd "apt-get install --install-recommends datacenter-gpu-manager-4-cuda${CUDA_VERSION} -y"

  # Enable and start the DCGM service.
  run_cmd "systemctl --now enable nvidia-dcgm"
  run_cmd "systemctl restart nvidia-dcgm"
  run_cmd "systemctl status nvidia-dcgm"

  # Smoke test.
  run_cmd "dcgmi discovery -l"

  step "DCGM 安装成功"
}
|
||||
|
||||
# Uninstall NVIDIA DCGM: stop and disable the service, remove the packages,
# remove the CUDA repository key and source list, refresh the apt index.
# The service and package steps are best effort ("|| true").
uninstall_dcgm() {
  step "开始卸载 DCGM"

  # Stop and disable the DCGM service.
  run_cmd "systemctl stop nvidia-dcgm || true"
  run_cmd "systemctl disable nvidia-dcgm || true"

  # The installer in this script installs datacenter-gpu-manager-4-cuda<N>;
  # removing only "datacenter-gpu-manager" would leave that package in place.
  # The two removals are separate commands so that one unknown package name
  # cannot cancel the removal of the other.
  run_cmd "apt-get remove -y 'datacenter-gpu-manager-4-*' || true"
  run_cmd "apt-get remove -y datacenter-gpu-manager || true"
  run_cmd "apt-get autoremove -y || true"
  run_cmd "apt-get autoclean -y || true"

  # Remove the CUDA repository key.
  run_cmd "rm -f /usr/share/keyrings/cuda-archive-keyring.gpg"

  # Remove the CUDA repository source list.
  run_cmd "rm -f /etc/apt/sources.list.d/cuda-ubuntu2004-x86_64.list"

  update_package_list

  step "DCGM 卸载完成"
}
|
||||
|
||||
# Main flow: detect the operating system, require exactly one argument
# (--install or --uninstall) and run the matching routine.
step "NVIDIA DCGM 安装脚本启动"
detect_os_version

(( $# == 1 )) || error "请使用 --install 或 --uninstall"

ACTION=$1
if [[ "$ACTION" == "--install" ]]; then
  # Installation is only implemented for Ubuntu 22.04.
  if [[ "$OS_ID-$OS_VERSION" == "ubuntu-22.04" ]]; then
    install_dcgm_for_ubuntu_22
  else
    error "不支持的操作系统版本: ${OS_ID} ${OS_VERSION}"
  fi
elif [[ "$ACTION" == "--uninstall" ]]; then
  uninstall_dcgm
else
  error "无效的参数,请使用 --install 或 --uninstall"
fi

step "操作完成,日志路径: $LOG_FILE"
|
||||
|
|
@ -0,0 +1,235 @@
|
|||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
# Driver release installed when --version is not given.
DEFAULT_VERSION="565.57.01"

# Download location and package naming. OFFICIAL_BASE_URL depends on the
# requested version and is therefore assigned after argument parsing.
INTERNAL_BASE_URL="http://10.101.0.51:5588/nvidia-linux"
PACKAGE_TEMPLATE="NVIDIA-Linux-x86_64-%s.run" # %s is the driver version
INSTALL_DIR="/opt"

# Per-run log file, timestamped at script start.
LOG_FILE="/var/log/nvidia-driver_$(date +%Y%m%d%H%M%S).log"

# systemd unit files managed by this script.
SERVICE_FILE="/etc/systemd/system/nvidia_peermem.service"
PERSISTENCE_SERVICE="/etc/systemd/system/nvidia-persistenced.service"

# ANSI escape sequences used to colour console output.
GREEN='\033[1;32m'
RED='\033[1;31m'
NC='\033[0m' # reset to the terminal default
|
||||
|
||||
# Append a timestamped line to $LOG_FILE; nothing is printed to the console.
# Arguments: $* - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}
|
||||
|
||||
# Print a progress message in green on the console and record it in the log.
# Arguments: $1 - message text
step() {
  local text="==> $1"
  # %b expands backslash escapes the same way `echo -e` does.
  printf '%b\n' "${GREEN}${text}${NC}"
  log "$text"
}
|
||||
|
||||
# Print an error message in red on stderr, record it in the log, and
# terminate the script with status 1.
# Arguments: $1 - message text
error() {
  local msg="错误: $1"
  # Diagnostics belong on stderr so they do not mix with regular output.
  echo -e "${RED}$msg${NC}" >&2
  log "$msg"
  exit 1
}
|
||||
|
||||
# Run a command line given as a single string. The command is announced via
# step, its stdout and stderr are appended to $LOG_FILE, and any non-zero
# status is turned into a fatal error.
# Arguments: $1 - command line, evaluated with eval (callers pass pipelines
#                 and "|| true" suffixes, so it must stay a string)
run_cmd() {
  step "执行: $1"
  if ! eval "$1" >> "$LOG_FILE" 2>&1; then
    error "命令执行失败: $1"
  fi
}
|
||||
|
||||
# Argument parsing.
#   --install | --uninstall   select the action (required)
#   --version <ver>           driver version (defaults to $DEFAULT_VERSION)
ACTION=""
VERSION="$DEFAULT_VERSION"

while [[ $# -gt 0 ]]; do
  case "$1" in
    --install)
      ACTION="install"
      shift
      ;;
    --uninstall)
      ACTION="uninstall"
      shift
      ;;
    --version)
      # With `set -u` a bare "$2" aborts with an "unbound variable" message
      # when --version is the last argument; report a clear error instead.
      if [[ -z "${2:-}" ]]; then
        error "--version 需要指定版本号"
      fi
      VERSION="$2"
      shift 2
      ;;
    *)
      error "未知参数 $1"
      ;;
  esac
done

# Official download location depends on the version chosen above.
OFFICIAL_BASE_URL="https://cn.download.nvidia.com/tesla/${VERSION}"

if [[ -z "$ACTION" ]]; then
  error "必须指定 --install 或 --uninstall"
fi
|
||||
|
||||
#######################################
# Create and enable a systemd unit that loads the nvidia_peermem kernel
# module at boot.
# Globals:   SERVICE_FILE (read) - path of the unit file to write
# Arguments: none
#######################################
install_peermem_service() {
  step "开始配置 nvidia_peermem 开机启动"

  # modprobe exits right after loading the module. Without Type=oneshot and
  # RemainAfterExit=yes systemd treats that exit as the service stopping and
  # runs ExecStop (rmmod), unloading the module again.
  cat > "$SERVICE_FILE" <<'EOF'
[Unit]
Description=Load the nvidia_peermem kernel module
After=network.target
[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/usr/sbin/modprobe nvidia_peermem
ExecStop=/usr/sbin/rmmod -f nvidia_peermem
[Install]
WantedBy=default.target
EOF

  run_cmd "chmod 644 $SERVICE_FILE"
  step "服务文件已创建:$SERVICE_FILE"

  # Make systemd pick up the new unit, then enable and start it.
  run_cmd "systemctl daemon-reload"
  run_cmd "systemctl enable --now nvidia_peermem.service"
  step "服务已启用并开机自启"
}
|
||||
|
||||
#######################################
# Stop, disable and delete the nvidia_peermem systemd unit if it exists.
# Globals:   SERVICE_FILE (read) - path of the unit file
# Arguments: none
#######################################
uninstall_peermem_service() {
  step "开始移除 nvidia_peermem 开机启动配置"
  if [[ -f "$SERVICE_FILE" ]]; then
    run_cmd "systemctl stop nvidia_peermem.service"
    run_cmd "systemctl disable nvidia_peermem.service"
    run_cmd "rm -f $SERVICE_FILE"
    step "服务文件已删除:$SERVICE_FILE"
    # Without a reload systemd keeps the deleted unit loaded in memory.
    run_cmd "systemctl daemon-reload"
  else
    step "警告:nvidia_peermem 服务文件不存在"
  fi
}
|
||||
|
||||
#######################################
# Write a systemd unit for nvidia-persistenced and enable it at boot.
# Globals:   PERSISTENCE_SERVICE (read) - path of the unit file to write
# Arguments: none
#######################################
install_persistence_service() {
  local unit="nvidia-persistenced.service"

  step "开始配置 nvidia-persistenced 开机启动"

  # Unit definition (written verbatim; the body contains no shell expansions).
  cat <<'EOF' > "$PERSISTENCE_SERVICE"
[Unit]
Description=NVIDIA Persistence Daemon
After=syslog.target network.target
Wants=nvidia-modules.service

[Service]
Type=forking
PIDFile=/var/run/nvidia-persistenced/nvidia-persistenced.pid
ExecStart=/usr/bin/nvidia-persistenced --verbose
ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced

[Install]
WantedBy=multi-user.target
EOF

  run_cmd "chmod 644 $PERSISTENCE_SERVICE"
  step "服务文件已创建:$PERSISTENCE_SERVICE"

  # Reload unit definitions, then enable and start the daemon.
  run_cmd "systemctl daemon-reload"
  run_cmd "systemctl enable --now ${unit}"
  step "nvidia-persistenced 服务已启用并开机自启"
}
|
||||
|
||||
#######################################
# Stop, disable and delete the nvidia-persistenced systemd unit if it exists.
# Globals:   PERSISTENCE_SERVICE (read) - path of the unit file
# Arguments: none
#######################################
uninstall_persistence_service() {
  step "开始移除 nvidia-persistenced 开机启动配置"
  if [[ -f "$PERSISTENCE_SERVICE" ]]; then
    run_cmd "systemctl stop nvidia-persistenced.service"
    run_cmd "systemctl disable nvidia-persistenced.service"
    run_cmd "rm -f $PERSISTENCE_SERVICE"
    step "服务文件已删除:$PERSISTENCE_SERVICE"
    # Without a reload systemd keeps the deleted unit loaded in memory.
    run_cmd "systemctl daemon-reload"
  else
    step "警告:nvidia-persistenced 服务文件不存在"
  fi
}
|
||||
|
||||
#######################################
# Install the NVIDIA driver for $VERSION from the .run installer, load
# nvidia_peermem, enable persistence mode, verify the reported version and
# register the boot-time services.
# Globals:   VERSION, PACKAGE_TEMPLATE, INSTALL_DIR, LOG_FILE (read)
#            PACKAGE_NAME, PACKAGE_PATH (written)
# Arguments: none
# Exits:     via error / run_cmd when any step fails
#######################################
install_driver() {
  step "开始安装显卡驱动,版本:$VERSION"
  # shellcheck disable=SC2059 -- the format is the script's own template
  PACKAGE_NAME=$(printf "$PACKAGE_TEMPLATE" "$VERSION")
  PACKAGE_PATH="${INSTALL_DIR}/${PACKAGE_NAME}"

  # Use the local installer when present, otherwise download it.
  if [[ ! -f "$PACKAGE_PATH" ]]; then
    step "未找到本地包,开始下载"
    download_package "$PACKAGE_NAME" "$PACKAGE_PATH"
  else
    step "使用本地包:$PACKAGE_PATH"
  fi

  # Run the installer from its directory in quiet, silent mode.
  cd "$INSTALL_DIR" || error "无法进入目录: $INSTALL_DIR"
  run_cmd "chmod +x $PACKAGE_NAME"
  run_cmd "./$PACKAGE_NAME -q -s"

  # Load the peer-memory module and switch on persistence mode.
  run_cmd "modprobe nvidia_peermem"
  run_cmd "nvidia-smi -pm 1"

  # Verify once that nvidia-smi reports the requested version. The original
  # ran the check twice (the second copy was unreachable on failure) and used
  # the version as a regex; -F matches it literally so "." is not a wildcard.
  step "验证驱动版本: $VERSION"
  if ! nvidia-smi | grep -F -- "$VERSION" >> "$LOG_FILE" 2>&1; then
    error "版本验证失败"
  fi

  install_peermem_service     # load nvidia_peermem at boot
  install_persistence_service # keep GPU persistence mode across reboots
  step "安装完成"
}
|
||||
|
||||
#######################################
# Uninstall the NVIDIA driver by running the matching .run installer with
# --uninstall, then remove the boot-time services.
# Globals:   VERSION, PACKAGE_TEMPLATE, INSTALL_DIR, LOG_FILE (read)
#            PACKAGE_NAME, PACKAGE_PATH (written)
# Arguments: none
# Returns:   0 on success; 1 when the installer is unavailable or its
#            uninstall run fails (the original returned 0 here, so the script
#            exited successfully although nothing had been removed)
#######################################
uninstall_driver() {
  step "开始卸载显卡驱动,版本:$VERSION"
  # shellcheck disable=SC2059 -- the format is the script's own template
  PACKAGE_NAME=$(printf "$PACKAGE_TEMPLATE" "$VERSION")
  PACKAGE_PATH="${INSTALL_DIR}/${PACKAGE_NAME}"

  # Locate the installer, downloading it when there is no local copy.
  if [[ -f "$PACKAGE_PATH" ]]; then
    step "找到安装包,使用安装包卸载"
  else
    step "未找到本地安装包,尝试下载卸载包"
    download_package "$PACKAGE_NAME" "$PACKAGE_PATH"
    if [[ ! -f "$PACKAGE_PATH" ]]; then
      step "无法找到或下载卸载包,请手动卸载驱动程序"
      return 1
    fi
  fi

  cd "$INSTALL_DIR" || error "无法进入目录: $INSTALL_DIR"
  run_cmd "chmod +x $PACKAGE_NAME"
  # Keep the installer output in the log instead of discarding it, so a
  # failed uninstall can be diagnosed.
  if ! ./"$PACKAGE_NAME" --uninstall -q -s >> "$LOG_FILE" 2>&1; then
    step "卸载失败,请手动卸载驱动程序"
    return 1
  fi

  uninstall_peermem_service     # remove the peermem unit
  uninstall_persistence_service # remove the persistenced unit
  step "卸载完成"
}
|
||||
|
||||
#######################################
# Download the driver installer, trying the internal mirror first and the
# official site second.
# Globals:   INTERNAL_BASE_URL, OFFICIAL_BASE_URL (read)
# Arguments: $1 - package file name
#            $2 - destination path
# Returns:   0 once a download succeeds; calls error (fatal) when every
#            source fails
#######################################
download_package() {
  local package_name="$1"
  local package_path="$2"
  # wget -O creates the output file even when the transfer fails. Download to
  # a side file so a failed attempt never leaves an empty "$package_path"
  # that a later run would mistake for a valid local installer.
  local partial_path="${package_path}.part"
  local url
  local download_urls=(
    "${INTERNAL_BASE_URL}/${package_name}"
    "${OFFICIAL_BASE_URL}/${package_name}"
  )

  for url in "${download_urls[@]}"; do
    step "尝试从 $url 下载"
    if wget -qO "$partial_path" "$url"; then
      mv -f -- "$partial_path" "$package_path"
      return 0
    fi
    rm -f -- "$partial_path"
    step "下载失败,尝试下一个 URL"
  done

  error "无法从任何来源下载 $package_name"
}
|
||||
|
||||
# Dispatch to the routine matching the parsed action.
if [[ "$ACTION" == "install" ]]; then
  install_driver
elif [[ "$ACTION" == "uninstall" ]]; then
  uninstall_driver
fi
|
||||
|
|
@ -0,0 +1,190 @@
|
|||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
# ---- Global configuration ----
# Default Fabric Manager release: driver branch and full package version.
FABRICMANAGER_MAJOR_VERSION="565"
FABRICMANAGER_FULL_VERSION="565.57.01-1"
# .deb file name and the local path it is stored at.
PACKAGE_NAME="nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}_${FABRICMANAGER_FULL_VERSION}_amd64.deb"
PACKAGE_PATH="/opt/${PACKAGE_NAME}"
# Download sources: internal mirror first, official repository second.
INTERNAL_BASE_URL="http://10.101.0.51:5588/nvidia-fabricmanager"
OFFICIAL_BASE_URL="https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64"
# Scratch directory (not referenced by the functions visible in this script).
TEMP_DIR="/tmp/fabricmanager_temp"
# Per-run log file, timestamped to the second.
LOG_FILE="/var/log/fabricmanager_$(date +%Y%m%d%H%M%S).log"

# ---- ANSI colour escapes (expanded when the message is printed) ----
RED='\033[1;31m'
GREEN='\033[1;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset colour
|
||||
|
||||
# Append a timestamped line to $LOG_FILE.
# Arguments: $* - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}
|
||||
|
||||
# Print a progress message in green on the console and record it in the log.
# Arguments: $1 - message text
step() {
  local text="==> $1"
  # %b expands backslash escapes the same way `echo -e` does.
  printf '%b\n' "${GREEN}${text}${NC}"
  log "$text"
}
|
||||
|
||||
# Print a warning in yellow on the console and record it in the log.
# Arguments: $1 - message text
warning() {
  local text="警告: $1"
  # %b expands backslash escapes the same way `echo -e` does.
  printf '%b\n' "${YELLOW}${text}${NC}"
  log "$text"
}
|
||||
|
||||
# Print an error message in red on stderr, record it in the log, and
# terminate the script with status 1.
# Arguments: $1 - message text
error() {
  local msg="错误: $1"
  # Diagnostics belong on stderr so they do not mix with regular output.
  echo -e "${RED}$msg${NC}" >&2
  log "$msg"
  exit 1
}
|
||||
|
||||
# Run a command line given as a single string: announce it via step, append
# its stdout and stderr to $LOG_FILE, and abort through error on failure.
# Arguments: $1 - command line, evaluated with eval
run_cmd() {
  step "执行: $1"
  if ! eval "$1" >> "$LOG_FILE" 2>&1; then
    error "命令执行失败: $1"
  fi
}
|
||||
|
||||
# Abort through error unless the given command can be found.
# Arguments: $1 - command name
check_cmd() {
  if ! command -v "$1" &>/dev/null; then
    error "未找到命令: $1"
  fi
}
|
||||
|
||||
# Argument parsing.
#   --install | --uninstall      select the action (required)
#   --version <major>_<full>     e.g. 565_565.57.01-1; overrides the defaults
#   --force                      continue uninstall when the package is absent
ACTION=""
FORCE=0

while [[ $# -gt 0 ]]; do
  case "$1" in
    --install)
      ACTION="install"
      shift
      ;;
    --uninstall)
      ACTION="uninstall"
      shift
      ;;
    --version)
      # ${2:-} keeps `set -u` from aborting with "unbound variable" when
      # --version is the last argument; the empty value then fails the format
      # check below and produces the regular error message.
      CUSTOM_VERSION="${2:-}"
      if [[ $CUSTOM_VERSION =~ ^([0-9]+)(_[0-9]+\.[0-9]+\.[0-9]+-[0-9]+)$ ]]; then
        FABRICMANAGER_MAJOR_VERSION="${BASH_REMATCH[1]}"
        # Second capture group still carries the leading underscore.
        FABRICMANAGER_FULL_VERSION="${BASH_REMATCH[2]#_}"
        PACKAGE_NAME="nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}_${FABRICMANAGER_FULL_VERSION}_amd64.deb"
        PACKAGE_PATH="/opt/${PACKAGE_NAME}"
      else
        error "版本格式错误,应为 xxxx_xxxx.xx.xx-x"
      fi
      shift 2
      ;;
    --force)
      FORCE=1
      shift
      ;;
    *)
      error "未知参数: $1"
      ;;
  esac
done

if [[ -z "$ACTION" ]]; then
  error "必须指定 --install 或 --uninstall"
fi
|
||||
|
||||
|
||||
#######################################
# Make sure the Fabric Manager .deb exists at $PACKAGE_PATH, downloading it
# from the internal mirror or the official repository when needed.
# Globals:   PACKAGE_PATH, PACKAGE_NAME, INTERNAL_BASE_URL,
#            OFFICIAL_BASE_URL (read)
# Arguments: none
# Returns:   0 when the package is available; calls error (fatal) when every
#            source fails
#######################################
download_package() {
  # wget -O creates the output file even when the transfer fails, so download
  # to a side file and only move it into place on success.
  local partial_path="${PACKAGE_PATH}.part"
  local url

  step "检查安装包: $PACKAGE_PATH"
  # A zero-byte file is the leftover of an earlier failed download; do not
  # treat it as a usable local package.
  if [[ -f "$PACKAGE_PATH" && -s "$PACKAGE_PATH" ]]; then
    step "使用本地安装包"
    return 0
  fi

  step "本地包不存在,开始下载"
  mkdir -p "$(dirname "$PACKAGE_PATH")"

  local urls=(
    "${INTERNAL_BASE_URL}/${PACKAGE_NAME}"
    "${OFFICIAL_BASE_URL}/${PACKAGE_NAME}"
  )

  for url in "${urls[@]}"; do
    step "尝试从 $url 下载"
    if wget -q -O "$partial_path" "$url"; then
      mv -f -- "$partial_path" "$PACKAGE_PATH"
      step "下载成功,文件大小: $(du -h "$PACKAGE_PATH" | cut -f1)"
      return 0
    fi
    rm -f -- "$partial_path"
    warning "从 $url 下载失败"
  done

  error "无法下载安装包,请检查网络或手动放置到 $PACKAGE_PATH"
}
|
||||
|
||||
|
||||
#######################################
# Install NVIDIA Fabric Manager from the .deb package and start its service.
# Globals:   FABRICMANAGER_FULL_VERSION, PACKAGE_PATH (read)
# Arguments: none
#######################################
install_fabricmanager() {
  local unit="nvidia-fabricmanager.service"

  step "开始安装 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION}"

  # Fetch the package unless a local copy is already present.
  download_package

  # Required tools.
  step "检查系统依赖"
  check_cmd dpkg
  check_cmd systemctl

  # Install the package.
  step "安装 NVIDIA Fabric Manager"
  run_cmd "dpkg -i $PACKAGE_PATH"

  # Enable the service at boot and start it now.
  step "启动并启用 NVIDIA Fabric Manager 服务"
  run_cmd "systemctl enable ${unit} --now"

  # The install only counts as successful when the service is active.
  step "验证 NVIDIA Fabric Manager 服务状态"
  if ! systemctl is-active --quiet "$unit"; then
    error "NVIDIA Fabric Manager 服务未运行"
  fi
  step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} 安装成功并运行中"
}
|
||||
|
||||
|
||||
#######################################
# Uninstall NVIDIA Fabric Manager: stop and disable the service, remove the
# package and delete the cached .deb.
# Globals:   FABRICMANAGER_MAJOR_VERSION, FABRICMANAGER_FULL_VERSION,
#            PACKAGE_PATH, FORCE, LOG_FILE (read)
# Arguments: none
# Exits:     via error when the package is not installed and --force was not
#            given, or when a run_cmd step fails
#######################################
uninstall_fabricmanager() {
  local unit="nvidia-fabricmanager.service"
  local pkg="nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}"

  step "开始卸载 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION}"

  # Stop the service only when it is running.
  if systemctl is-active --quiet "$unit"; then
    step "停止 NVIDIA Fabric Manager 服务"
    run_cmd "systemctl stop ${unit}"
  else
    step "NVIDIA Fabric Manager 服务未运行"
  fi

  # Disabling is best-effort: when the unit is not installed `systemctl
  # disable` fails, and a fatal run_cmd here would abort before the --force
  # handling below could ever be reached.
  step "禁用 NVIDIA Fabric Manager 服务"
  if ! systemctl disable "$unit" >> "$LOG_FILE" 2>&1; then
    warning "无法禁用 ${unit},服务可能未安装"
  fi

  # Remove the package; a missing package is only tolerated with --force.
  step "卸载 NVIDIA Fabric Manager 软件包"
  if dpkg -s "$pkg" &>/dev/null; then
    run_cmd "dpkg -r ${pkg}"
  else
    warning "未找到 ${pkg} 软件包"
    if [[ $FORCE -eq 0 ]]; then
      error "请使用 --force 参数强制卸载"
    fi
  fi

  # Delete the cached installer package.
  step "清理残留文件"
  rm -f "$PACKAGE_PATH"

  step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} 卸载完成"
}
|
||||
|
||||
|
||||
# Entry point: announce the flow, dispatch on the parsed action, report
# completion.
step "开始 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} ${ACTION} 流程"
if [[ "$ACTION" == "install" ]]; then
  install_fabricmanager
elif [[ "$ACTION" == "uninstall" ]]; then
  uninstall_fabricmanager
else
  error "未知操作: $ACTION"
fi

step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} ${ACTION} 流程完成"
|
||||
|
|
@ -0,0 +1,124 @@
|
|||
|
||||
#!/bin/bash
|
||||
|
||||
# ANSI colour escapes used by the log helpers (expanded when printed).
RED='\033[1;31m'
GREEN='\033[1;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset colour
|
||||
|
||||
# Print an informational message in green.
# Arguments: $1 - message text
log_info() {
  # %b expands backslash escapes the same way `echo -e` does.
  printf '%b\n' "${GREEN}[INFO] $1${NC}"
}
|
||||
|
||||
# Print an error message in red on stderr.
# Arguments: $1 - message text
log_error() {
  # Diagnostics belong on stderr so they do not mix with regular output.
  echo -e "${RED}[ERROR] $1${NC}" >&2
}
|
||||
|
||||
# Print a warning message in yellow.
# Arguments: $1 - message text
log_warning() {
  # %b expands backslash escapes the same way `echo -e` does.
  printf '%b\n' "${YELLOW}[WARNING] $1${NC}"
}
|
||||
|
||||
#######################################
# Disable apt's periodic update/upgrade jobs by switching every
# APT::Periodic::<key> "1"; setting to "0".
# Arguments: [file...] - apt configuration files to edit; defaults to
#                        /etc/apt/apt.conf.d/10periodic and
#                        /etc/apt/apt.conf.d/20auto-upgrades
# Returns:   0 on success, 1 when editing a file failed
#######################################
disable_apt_periodic_updates() {
  local -a conf_files=("$@")
  local conf
  local rc=0

  if (( ${#conf_files[@]} == 0 )); then
    conf_files=(
      /etc/apt/apt.conf.d/10periodic
      /etc/apt/apt.conf.d/20auto-upgrades
    )
  fi

  log_info "禁用apt的定期更新..."
  for conf in "${conf_files[@]}"; do
    # Not every system ships both files; skip the missing ones.
    if [[ ! -f "$conf" ]]; then
      log_warning "配置文件不存在,已跳过: $conf"
      continue
    fi
    # The original `s/1/0/g` rewrote every digit 1 in the file (turning
    # e.g. "14" into "04"). Only flip APT::Periodic values that are exactly
    # "1", and leave everything else untouched.
    if ! sed -i -E \
      's/^([[:space:]]*APT::Periodic::[^[:space:]]+[[:space:]]+)"1"[[:space:]]*;/\1"0";/' \
      "$conf"; then
      log_error "修改配置文件失败: $conf"
      rc=1
    fi
  done

  if (( rc == 0 )); then
    log_info "apt定期更新已禁用"
  fi
  return "$rc"
}
|
||||
|
||||
#######################################
# Set the system time zone to Asia/Shanghai via timedatectl.
# Arguments: none
# Returns:   0 on success, 1 when timedatectl fails (the original logged
#            success unconditionally)
#######################################
set_timezone_to_shanghai() {
  log_info "设置系统时区为上海..."
  if ! timedatectl set-timezone Asia/Shanghai; then
    log_error "设置系统时区失败"
    return 1
  fi
  log_info "系统时区已设置为上海"
}
|
||||
|
||||
#######################################
# Copy the current system time to the hardware clock (hwclock --systohc).
# Arguments: none
# Returns:   0 on success, 1 when hwclock fails (the original logged success
#            unconditionally)
#######################################
synchronize_hardware_clock() {
  log_info "同步硬件时钟和系统时钟..."
  if ! hwclock --systohc; then
    log_error "同步硬件时钟失败"
    return 1
  fi
  log_info "硬件时钟和系统时钟已同步"
}
|
||||
|
||||
# 删除"ubuntu"用户
|
||||
#remove_ubuntu_user() {
|
||||
# log_info "删除'ubuntu'用户..."
|
||||
# # 使用userdel命令删除ubuntu用户,并递归删除其主目录
|
||||
# # &> /dev/null用于忽略可能的错误输出(例如用户不存在的情况)
|
||||
# userdel -r ubuntu &> /dev/null
|
||||
# log_info "已尝试删除'ubuntu'用户(如果存在)"
|
||||
#}
|
||||
|
||||
#######################################
# Blacklist the nouveau driver: write two modprobe configuration files and
# rebuild the initramfs so the settings take effect.
# Arguments: none
#######################################
disable_nouveau_driver() {
  local modprobe_dir="/etc/modprobe.d"

  log_info "禁止nouveau显卡驱动..."

  # Blacklist entries for nouveau and lbm-nouveau.
  printf '%s\n' \
    'blacklist nouveau' \
    'blacklist lbm-nouveau' \
    'options nouveau modeset=0' \
    'alias nouveau off' \
    'alias lbm-nouveau off' \
    > "${modprobe_dir}/blacklist-nouveau.conf"

  # Turn off nouveau kernel mode setting.
  printf '%s\n' 'options nouveau modeset=0' > "${modprobe_dir}/nouveau-kms.conf"

  # Rebuild the initramfs; its output is discarded.
  update-initramfs -u &> /dev/null

  log_info "nouveau显卡驱动已被禁止"
}
|
||||
|
||||
#######################################
# Set GRUB_CMDLINE_LINUX_DEFAULT="net.ifnames=1 biosdevname=0" in the GRUB
# defaults file and regenerate the GRUB configuration.
# NOTE(review): the log message speaks of "traditional" interface naming,
# which is usually associated with net.ifnames=0 -- confirm the intended
# value.
# Arguments: [$1] - GRUB defaults file (default: /etc/default/grub)
# Returns:   0 on success, 1 when update-grub fails
# Exits:     1 when the file could not be modified
#######################################
update_grub_configuration() {
  local grub_file="${1:-/etc/default/grub}"
  local wanted='GRUB_CMDLINE_LINUX_DEFAULT="net.ifnames=1 biosdevname=0"'

  log_info "更改GRUB配置启用传统网络接口命名..."

  # -x -F: require the exact active line, so a commented-out copy does not
  # count as "already configured".
  if ! grep -qxF -- "$wanted" "$grub_file"; then
    # Comment out every active definition ...
    sed -i -E 's/^([[:space:]]*)(GRUB_CMDLINE_LINUX_DEFAULT=)/\1#\2/' "$grub_file"
    # ... and append the wanted one exactly once. Appending also works when
    # the file had no such line (the original inserted nothing then) and
    # avoids duplicate inserts when several lines matched.
    if [[ -s "$grub_file" && -n "$(tail -c 1 -- "$grub_file")" ]]; then
      printf '\n' >> "$grub_file" # file did not end with a newline
    fi
    printf '%s\n' "$wanted" >> "$grub_file"
  fi

  # Verify the setting is now present.
  if grep -qxF -- "$wanted" "$grub_file"; then
    log_info "文件 ${grub_file} 修改成功!"
  else
    log_error "文件 ${grub_file} 修改失败!"
    exit 1
  fi

  # Regenerate the bootloader configuration; report failure instead of
  # claiming success.
  if ! update-grub &> /dev/null; then
    log_error "update-grub 执行失败"
    return 1
  fi

  log_info "GRUB配置已更新"
}
|
||||
|
||||
# Run every system configuration step in a fixed order.
main() {
  local task

  log_info "开始系统配置..."

  for task in \
    disable_apt_periodic_updates \
    set_timezone_to_shanghai \
    synchronize_hardware_clock \
    disable_nouveau_driver \
    update_grub_configuration; do
    "$task"
  done

  log_info "系统配置完成!"
}

# Script entry point.
main
|
||||
Loading…
Reference in New Issue