diff --git a/group_vars/all.yaml b/group_vars/all.yaml
new file mode 100644
index 0000000..7d7edca
--- /dev/null
+++ b/group_vars/all.yaml
@@ -0,0 +1,8 @@
+---
+# Base paths shared by every role.
+script_dest: "/opt/ansible-scripts"  # directory on the target where role scripts are placed
+log_base_dir: "/var/log/ansible-deploy"  # root directory for per-role deployment logs
+
+# Ansible connection tuning.
+# Reuse an already established SSH connection (ControlMaster), kept alive for 60s.
+ansible_ssh_common_args: "-o ControlMaster=auto -o ControlPersist=60s"
+ansible_pipelining: true
+ansible_ssh_timeout: 120
diff --git a/inventory/prod/prod.ini b/inventory/prod/prod.ini
new file mode 100644
index 0000000..846f7a0
--- /dev/null
+++ b/inventory/prod/prod.ini
@@ -0,0 +1,12 @@
+# GPU compute nodes. gpu_model is passed to the driver script as GPU_MODEL
+# by the gpu_drive role.
+[compute_nodes]
+gpu-node-01 ansible_host=10.0.0.101 gpu_model="NVIDIA A100"
+gpu-node-02 ansible_host=10.0.0.102 gpu_model="NVIDIA H100"
+
+# Umbrella group targeted by the playbooks.
+[all_nodes:children]
+compute_nodes
+
+[all:vars]
+ansible_user=root
+# ansible_port is the current name of the legacy ansible_ssh_port variable.
+ansible_port=22
+# NOTE(review): plaintext SSH password committed to the inventory. Move it to
+# an Ansible Vault encrypted variable or switch to key-based authentication.
+ansible_ssh_pass=xxx
+
diff --git a/playbooks/all_components_deploy.yml b/playbooks/all_components_deploy.yml
new file mode 100644
index 0000000..5ecd56f
--- /dev/null
+++ b/playbooks/all_components_deploy.yml
@@ -0,0 +1,9 @@
+---
+# Full deployment: applies every component role to all nodes.
+- name: 全量组件部署
+  hosts: all_nodes  # umbrella group defined in the inventory
+  vars:
+    operation: "install"  # operation type: install / uninstall
+  roles:
+    - role: system_init  # base system initialisation
+    # The role directory added by this change is roles/gpu_drive, so the role
+    # must be referenced by that name to be found.
+    - role: gpu_drive  # NVIDIA GPU driver
+    - role: node_exporter  # system metrics
+    - role: dcgm_exporter  # GPU metrics
diff --git a/playbooks/deploy_gpu.yml b/playbooks/deploy_gpu.yml
new file mode 100644
index 0000000..e69de29
diff --git a/playbooks/deploy_node-exporter.yml b/playbooks/deploy_node-exporter.yml
new file mode 100644
index 0000000..e69de29
diff --git a/roles/gpu_drive/files/install.sh b/roles/gpu_drive/files/install.sh
new file mode 100644
index 0000000..6199241
--- /dev/null
+++ b/roles/gpu_drive/files/install.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# NVIDIA driver install/uninstall script for the gpu_drive role.
+#
+# Usage: <script> (--install | --uninstall) [--version <driver version>]
+#
+# The {{ ... }} placeholders are Jinja2 expressions: this file has to be
+# rendered by Ansible before it runs, otherwise the literal placeholder text
+# ends up in the log path, default version and download URL.
+# GPU_MODEL is read from the environment and matched against nvidia-smi output
+# after installation.
+set -eo pipefail
+LOG_DIR="{{ log_base_dir }}/{{ driver.name }}"
+LOG_FILE="${LOG_DIR}/install-$(date +%Y%m%d).log"
+mkdir -p "$LOG_DIR"
+# Mirror stdout and stderr into the daily log file.
+exec > >(tee -a "$LOG_FILE") 2>&1
+
+# Defaults injected from the role variables.
+DEFAULT_VERSION="{{ driver.default_version }}"
+DOWNLOAD_URL="{{ driver.download_url }}"
+
+# Argument parsing: only the options listed in the usage line are accepted.
+OPERATION=""
+VERSION=""
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --install) OPERATION="install" ;;
+        --uninstall) OPERATION="uninstall" ;;
+        --version)
+            # Reject "--version" given as the last argument without a value.
+            if [[ $# -lt 2 || -z "$2" ]]; then
+                echo "错误:--version 需要指定版本号" >&2
+                exit 1
+            fi
+            VERSION="$2"
+            shift
+            ;;
+        *) echo "错误:未知参数 $1" >&2; exit 1 ;;
+    esac
+    shift
+done
+
+# Download the driver runfile from the internal mirror, install it silently
+# and verify that nvidia-smi reports the expected GPU model.
+install() {
+    local DRIVER_VERSION="${VERSION:-$DEFAULT_VERSION}"
+    echo "[$(date)] 开始安装NVIDIA驱动(版本:$DRIVER_VERSION,型号:$GPU_MODEL)..."
+
+    wget -q "${DOWNLOAD_URL}/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" \
+        -O "/tmp/nvidia-driver.run"
+
+    # Non-interactive installation.
+    sh "/tmp/nvidia-driver.run" --silent --no-x-check --no-nouveau-check
+
+    # -F: the model name is a literal string, not a regular expression.
+    if ! nvidia-smi --query-gpu=name --format=csv,noheader | grep -qF -- "$GPU_MODEL"; then
+        echo "错误:驱动安装后未识别到目标GPU型号" >&2; exit 1
+    fi
+}
+
+# Remove the driver. Safe to run repeatedly: a missing uninstaller means there
+# is nothing left to remove and is not treated as an error.
+uninstall() {
+    echo "[$(date)] 开始卸载NVIDIA驱动..."
+    if [[ -x /usr/bin/nvidia-uninstall ]]; then
+        /usr/bin/nvidia-uninstall --silent
+    fi
+    rm -f "/tmp/nvidia-driver.run"
+}
+
+# Dispatch on the requested operation. The previous "install || uninstall"
+# always ran install and used uninstall only as a failure fallback.
+case "$OPERATION" in
+    install) install ;;
+    uninstall) uninstall ;;
+    *) echo "错误:必须指定 --install 或 --uninstall" >&2; exit 1 ;;
+esac
+exit 0
diff --git a/roles/gpu_drive/tasks/main.yml b/roles/gpu_drive/tasks/main.yml
new file mode 100644
index 0000000..7d820f8
--- /dev/null
+++ b/roles/gpu_drive/tasks/main.yml
@@ -0,0 +1,42 @@
+---
+# Tasks of the NVIDIA GPU driver role: prepare the log directory, ship the
+# driver script, run the requested operation, verify it and report the result.
+
+- name: 创建角色专属日志目录
+  file:
+    path: "{{ log_base_dir }}/{{ driver.name }}"
+    state: directory
+    mode: "0750"
+
+# The script contains Jinja2 placeholders ({{ log_base_dir }}, {{ driver.* }}),
+# so it is rendered with `template`; `copy` would ship them verbatim.
+# The script is stored under the role's files/ directory, hence the explicit
+# "files/" prefix (template lookups also search relative to the role root).
+# NOTE(review): moving the script to templates/ would make the prefix unnecessary.
+- name: 同步驱动脚本到目标服务器
+  template:
+    src: "files/{{ driver.install_script }}"
+    dest: "{{ script_dest }}/{{ driver.install_script }}"
+    mode: "0755"
+    force: true  # always deploy the latest script
+
+- name: 执行驱动操作(安装/卸载)
+  shell: |
+    {{ script_dest }}/{{ driver.install_script }} \
+    {{ operations[operation] }} \
+    {% if target_version is defined and target_version != "" %}--version {{ target_version }}{% endif %}
+  register: script_result
+  environment:
+    GPU_MODEL: "{{ gpu_model }}"  # host-specific hardware model from the inventory
+  # `retries` needs an `until` condition to define when the task has succeeded.
+  until: script_result is succeeded
+  retries: 3
+  delay: 30  # seconds between attempts
+  become: true
+
+- name: 验证操作结果(安装时)
+  when: operation == "install"
+  shell: "{{ driver.service_check }}"
+  register: driver_check
+  changed_when: false
+  # A custom failed_when replaces the default return-code check, so the return
+  # code of the verification command is tested explicitly.
+  # NOTE(review): the second condition inspects the output of the install run
+  # (script_result), not of this command - confirm that this is intended.
+  failed_when: >-
+    driver_check.rc != 0
+    or 'GPU count: 0' in script_result.stderr
+
+- name: 记录操作日志(企业级可观测性)
+  uri:
+    url: "http://logging.internal.com/api/ansible"
+    method: POST
+    body_format: json
+    body:
+      host: "{{ inventory_hostname }}"
+      component: "{{ driver.name }}_driver"
+      operation: "{{ operation }}"
+      version: "{{ target_version | default(driver.default_version) }}"
+      status: "{% if script_result.rc == 0 %}success{% else %}failed{% endif %}"
diff --git a/roles/gpu_drive/vars/manin.yaml b/roles/gpu_drive/vars/manin.yaml
new file mode 100644
index 0000000..31a7af2
--- /dev/null
+++ b/roles/gpu_drive/vars/manin.yaml
@@ -0,0 +1,12 @@
+---
+# Variables of the GPU driver role, consumed by the role tasks and the driver script.
+# NOTE(review): this file is named vars/manin.yaml; Ansible auto-loads a role's
+# vars/main.yml (or main.yaml) - confirm the name, otherwise these variables
+# are presumably never loaded.
+driver:
+  name: "nvidia"
+  default_version: "545.29.06"  # driver version used when none is requested
+  download_url: "http://repo.internal.com/drivers/nvidia"  # package download location
+  # Must match the script shipped with the role (roles/gpu_drive/files/install.sh).
+  install_script: "install.sh"
+  service_check: "nvidia-smi --list-gpus"  # post-install verification command
+
+# Maps the playbook-level `operation` value to the script's command-line flag.
+operations:
+  install: "--install"
+  uninstall: "--uninstall"
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..c87e2b6
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,155 @@
+
+# GPU 环境标准化部署脚本使用说明
+
+
+
+
+
+
+
+
+
+
+
+
+
+### 一、脚本概述
+
+该脚本旨在简化 GPU 相关应用的安装流程,适用于需要快速部署 GPU 环境的场景。
+
+- **核心功能**:
+ ```bash
+ 脚本可批量完成网卡驱动、显卡驱动、fabricmanager互联管理器、CUDA 工具包、Nvidia-dcgm、DCGM-EXporter、Node-EXporter 核心组件的安装与卸载操作
+ ```
+- **配置说明**:
+ ```bash
+ 用户管理:若需删除 ubuntu 用户,需手动执行相关用户删除命令,并妥善处理该用户关联的数据与权限。
+ 磁盘管理:磁盘分区扩容需通过磁盘管理工具,根据实际需求对磁盘进行分区调整与扩容操作,以满足应用存储需求。
+ 网络配置:网卡重命名需手动修改网络配置文件,根据实际网络环境对网卡名称进行重新定义,确保网络连接正常。
+ ```
+- **使用建议**:
+ ```bash
+ 新系统推荐使用一键自动安装脚本,可快速、全面地完成 GPU 相关应用的部署,具体使用方法详见文章末尾说明。若系统之前已存在相关安装内容,或需要对各组件进行独立、定制化部署,建议使用单独部署脚本安装。
+ ```
+### 二、使用说明
+
+#### (1)系统初始化
+
+ ```bash
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/system_optimize.sh|bash
+ ```
+
+#### (2)MLNX_OFED 网络套件安装/卸载
+
+ ```bash
+ #支持版本[23.10-1.1.9.0]
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --install --version '23.10-1.1.9.0'
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --uninstall --version '23.10-1.1.9.0'
+ ```
+
+
+#### (3)Nvidia 显卡驱动安装/卸载
+
+ ```bash
+ #支持版本[565.57.01] [570.124.06]
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --install --version '565.57.01'
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --uninstall --version '565.57.01'
+
+
+ ```
+
+#### (4)GPU 互联管理器安装/卸载
+
+ ```bash
+ #支持版本[565_565.57.01-1] [570_570.124.06-1]
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --install --version '565_565.57.01-1'
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --uninstall --version '565_565.57.01-1'
+ ```
+
+
+#### (5)NVIDIA CUDA 工具包部署/卸载
+
+ ```bash
+ #支持版本[12.6.3_560.35.05] [12.8.1_570.124.06]
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --install --version '12.6.3_560.35.05'
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --uninstall --version '12.6.3_560.35.05'
+ ```
+
+#### (6)dcgm/node exporter 部署/卸载
+
+ ```bash
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-dcgm.sh | bash -s -- --install
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/dcgm-exporter.sh | bash -s -- --install
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/node-exporter.sh | bash -s -- --install
+
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-dcgm.sh | bash -s -- --uninstall
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/dcgm-exporter.sh | bash -s -- --uninstall
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/node-exporter.sh | bash -s -- --uninstall
+ ```
+
+#### (7)批量组件安装/卸载
+
+
+
+
+
+
+ ```bash
+ 安装:---------------------------------------------------------------------------------------------------------------------------------------------
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/system_optimize.sh|bash
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --install --version '23.10-1.1.9.0'
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --install --version '565.57.01'
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --install --version '12.6.3_560.35.05'
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --install --version '565_565.57.01-1'
+
+ 卸载:---------------------------------------------------------------------------------------------------------------------------------------------
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh |bash -s -- --uninstall --version '23.10-1.1.9.0'
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --uninstall --version '565.57.01'
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --uninstall --version '12.6.3_560.35.05'
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --uninstall --version '565_565.57.01-1'
+ ```
+
+
+
+
+
+
+ ```bash
+ 安装:---------------------------------------------------------------------------------------------------------------------------------------------
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/system_optimize.sh|bash
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --install --version '23.10-1.1.9.0'
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --install --version '570.124.06'
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --install --version '12.8.1_570.124.06'
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --install --version '570_570.124.06-1'
+
+
+ 卸载:--------------------------------------------------------------------------------------------------------------------------------------------
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --uninstall --version '23.10-1.1.9.0'
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --uninstall --version '570.124.06'
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --uninstall --version '12.8.1_570.124.06'
+ cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --uninstall --version '570_570.124.06-1'
+
+ ```
+
+```bash
+#安装/卸载服务(安装或卸载时间较长,建议放后台执行。):
+#组合[1]-----------------------------------------------------------------------------------------------------------------------------------
+screen -dmS install_script bash -c "cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/gpu-manager.sh|bash -s -- --install --version 1 --include=exporter > /opt/gpu-manager.log 2>&1";
+tail -f /opt/gpu-manager.log
+
+screen -dmS uninstall_script bash -c "cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/gpu-manager.sh|bash -s -- --uninstall --version 1 --include=exporter > /opt/gpu-manager.log 2>&1";
+tail -f /opt/gpu-manager.log
+
+#组合[2]-----------------------------------------------------------------------------------------------------------------------------------
+screen -dmS install_script bash -c "cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/gpu-manager.sh|bash -s -- --install --version 2 --include=exporter > /opt/gpu-manager.log 2>&1";
+tail -f /opt/gpu-manager.log
+
+screen -dmS uninstall_script bash -c "cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/gpu-manager.sh|bash -s -- --uninstall --version 2 --include=exporter > /opt/gpu-manager.log 2>&1";
+tail -f /opt/gpu-manager.log
+
+#说明:
+#version 1 表示安装/卸载七.[1]组件版本:mlnx_ofed-23.10.1.1.9.0+nvidia_drive-565.57.01 +cuda-12.6.3.560.35.05 +fabricmanager-565_565.57.01.1
+#version 2 表示安装/卸载七.[2]组件版本:mlnx_ofed-23.10.1.1.9.0+nvidia_drive-570.124.06+cuda-12.8.1.570.124.06+fabricmanager-570_570.124.06.1
+#--include=exporter 指定该参数,脚本将安装/卸载exporter组件中的相关服务[dcgm-exporter,node-exporter,nvidia-dcgm],默认不安装/卸载。
+
+```
+
diff --git a/scripts/cuda.sh b/scripts/cuda.sh
new file mode 100644
index 0000000..ea411b1
--- /dev/null
+++ b/scripts/cuda.sh
@@ -0,0 +1,275 @@
+#!/bin/bash
+set -euo pipefail
+
+# Default versions; both can be replaced through --version.
+CUDA_VERSION="12.6"
+DRIVER_VERSION="560.35.05"
+CUDA_MAJOR_MINOR=$(cut -d. -f1-2 <<< "$CUDA_VERSION")
+
+# Installer runfile: name, local cache path and download sources.
+PACKAGE_NAME="cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run"
+PACKAGE_PATH="/opt/${PACKAGE_NAME}"
+INTERNAL_BASE_URL="http://10.101.0.51:5588/cuda-linux"
+OFFICIAL_BASE_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers"
+
+# Working directory, install prefix, profile file to edit and per-run log file.
+TEMP_DIR="/tmp/cuda_temp"
+CUDA_INSTALL_DIR="/usr/local/cuda-${CUDA_MAJOR_MINOR}"
+ENV_PROFILE="/etc/profile"
+LOG_FILE="/var/log/cuda_manager_$(date +%Y%m%d%H%M%S).log"
+
+# ANSI colour sequences for console output.
+NC='\033[0m' # reset
+RED='\033[1;31m'
+GREEN='\033[1;32m'
+YELLOW='\033[1;33m'
+
+# 日志函数
+# Append a timestamped message to $LOG_FILE; nothing is printed to the console.
+log() {
+    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
+}
+
+# 步骤提示
+# Print a green "==> <text>" progress line and record the same text in the log.
+step() {
+    echo -e "${GREEN}==> $1${NC}"
+    log "==> $1"
+}
+
+# 警告提示
+# Print a yellow warning line and record the same text in the log.
+warning() {
+    echo -e "${YELLOW}警告: $1${NC}"
+    log "警告: $1"
+}
+
+# 错误提示
+# Print a red error line, record it in the log and terminate with status 1.
+error() {
+    echo -e "${RED}错误: $1${NC}"
+    log "错误: $1"
+    exit 1
+}
+
+# 执行命令并记录日志
+run_cmd() {
+ step "执行: $1"
+ eval "$1" &>> "$LOG_FILE" || {
+ error "命令执行失败: $1"
+ }
+}
+
+# 检查命令是否存在
+check_cmd() {
+ command -v "$1" &>/dev/null || error "未找到命令: $1"
+}
+
+# 参数解析
+ACTION=""
+FORCE=0
+
+while [[ $# -gt 0 ]]; do
+ case "$1" in
+ --install) ACTION="install"; shift ;;
+ --uninstall) ACTION="uninstall"; shift ;;
+ --version)
+ CUSTOM_VERSION="$2"
+ if [[ $CUSTOM_VERSION =~ ^([0-9]+\.[0-9]+(\.[0-9]+)?)(_([0-9]+\.[0-9]+\.[0-9]+))$ ]]; then
+ CUDA_VERSION="${BASH_REMATCH[1]}"
+ DRIVER_VERSION="${BASH_REMATCH[4]}"
+ PACKAGE_NAME="cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run"
+ PACKAGE_PATH="/opt/${PACKAGE_NAME}"
+ CUDA_MAJOR_MINOR=$(echo "$CUDA_VERSION" | cut -d. -f1-2)
+ CUDA_INSTALL_DIR="/usr/local/cuda-${CUDA_MAJOR_MINOR}"
+ OFFICIAL_BASE_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers"
+ else
+ error "版本格式错误,应为 x.x.x_y.y.y 或 x.x_y.y.y"
+ fi
+ shift 2 ;;
+ --force) FORCE=1; shift ;;
+ *) error "未知参数: $1" ;;
+ esac
+done
+
+[[ -z "$ACTION" ]] && error "必须指定 --install 或 --uninstall"
+
+
+# 下载安装包
+download_package() {
+ step "检查安装包: $PACKAGE_PATH"
+ if [[ -f "$PACKAGE_PATH" ]]; then
+ step "使用本地安装包"
+ return 0
+ fi
+
+ step "本地包不存在,开始下载"
+ mkdir -p "$(dirname "$PACKAGE_PATH")"
+
+ local urls=(
+ "${INTERNAL_BASE_URL}/${PACKAGE_NAME}"
+ "${OFFICIAL_BASE_URL}/${PACKAGE_NAME}"
+ )
+
+ for url in "${urls[@]}"; do
+ step "尝试从 $url 下载"
+ if wget -q -O "$PACKAGE_PATH" "$url"; then
+ step "下载成功,文件大小: $(du -h "$PACKAGE_PATH" | cut -f1)"
+ return 0
+ else
+ warning "从 $url 下载失败"
+ fi
+ done
+
+ error "无法下载安装包,请检查网络或手动放置到 $PACKAGE_PATH"
+}
+
+
+# 安装 CUDA
+# Install the CUDA toolkit from the runfile, add it to PATH/LD_LIBRARY_PATH in
+# $ENV_PROFILE and verify the result with nvcc.
+install_cuda() {
+    step "开始安装 CUDA ${CUDA_VERSION}"
+
+    download_package
+
+    # Scratch directory, removed when the script exits.
+    mkdir -p "$TEMP_DIR"
+    trap 'rm -rf "$TEMP_DIR"' EXIT
+
+    # Only the help text is needed here; the installer's exit status for
+    # --help must not abort the script under `set -e`.
+    step "检查安装包支持的参数"
+    sh "$PACKAGE_PATH" --help > "$TEMP_DIR/help.txt" 2>&1 || true
+
+    # Prefer a toolkit-only installation when the installer offers it.
+    if grep -q -- '--toolkit' "$TEMP_DIR/help.txt"; then
+        step "安装包支持 --toolkit 参数"
+        run_cmd "sh $PACKAGE_PATH --silent --toolkit"
+    else
+        warning "安装包不支持 --toolkit 参数,尝试完整安装"
+        run_cmd "sh $PACKAGE_PATH --silent"
+    fi
+
+    # Append the environment block once; the install prefix doubles as marker.
+    step "配置 CUDA 环境变量"
+    if ! grep -q "cuda-${CUDA_MAJOR_MINOR}" "$ENV_PROFILE"; then
+        cat >> "$ENV_PROFILE" << EOF
+
+# CUDA ${CUDA_VERSION}
+export PATH=${CUDA_INSTALL_DIR}/bin:\$PATH
+export LD_LIBRARY_PATH=${CUDA_INSTALL_DIR}/lib64:\$LD_LIBRARY_PATH
+EOF
+        step "已添加环境变量到 $ENV_PROFILE"
+    else
+        step "环境变量已存在,跳过添加"
+    fi
+
+    # Load the new environment into this shell. Profile snippets are not
+    # written for strict mode (the block appended above expands a possibly
+    # unset LD_LIBRARY_PATH), so errexit/nounset are suspended while sourcing.
+    export LC_BYOBU=0
+    set +eu
+    source "/etc/profile"
+    set -eu
+
+    # Verify that nvcc is reachable and reports the expected major.minor version.
+    step "验证 CUDA 安装"
+    if command -v nvcc &>/dev/null; then
+        nvcc_version=$(nvcc -V | grep release | awk '{print $5}' | tr -d ',')
+        if [[ "$nvcc_version" == *"${CUDA_MAJOR_MINOR}"* ]]; then
+            step "CUDA ${CUDA_VERSION} 安装成功"
+        else
+            error "CUDA 版本不匹配,期望 ${CUDA_VERSION},实际 $nvcc_version"
+        fi
+    else
+        error "nvcc 命令未找到,安装失败"
+    fi
+}
+
+
+# 卸载 CUDA
+# Remove a CUDA installation: run the best available uninstaller, then strip
+# the environment block, the install directory and leftover files.
+uninstall_cuda() {
+    step "开始卸载 CUDA ${CUDA_VERSION}"
+
+    # Scratch directory, removed when the script exits.
+    mkdir -p "$TEMP_DIR"
+    trap 'rm -rf "$TEMP_DIR"' EXIT
+
+    OFFICIAL_UNINSTALLER="${CUDA_INSTALL_DIR}/bin/cuda-uninstaller"
+    if [[ -x "$OFFICIAL_UNINSTALLER" ]]; then
+        step "找到官方卸载脚本: $OFFICIAL_UNINSTALLER"
+
+        step "执行官方卸载程序"
+        run_cmd "$OFFICIAL_UNINSTALLER --silent"
+    else
+        warning "未找到官方卸载脚本,尝试其他方法"
+
+        # Locate the installer runfile; with --force a missing package is
+        # tolerated and only the manual cleanup below is performed.
+        local have_package=1
+        if [[ -f "$PACKAGE_PATH" ]]; then
+            step "找到安装包: $PACKAGE_PATH"
+        elif [[ $FORCE -eq 1 ]]; then
+            warning "未找到安装包,继续强制卸载"
+            have_package=0
+        else
+            step "未找到安装包,开始下载"
+            download_package
+        fi
+
+        if [[ $have_package -eq 1 ]]; then
+            # Only the help text is needed; ignore the installer's exit status.
+            step "检查安装包是否支持 --uninstall 参数"
+            sh "$PACKAGE_PATH" --help > "$TEMP_DIR/help.txt" 2>&1 || true
+
+            if grep -q -- '--uninstall' "$TEMP_DIR/help.txt"; then
+                step "安装包支持 --uninstall 参数"
+                run_cmd "sh $PACKAGE_PATH --silent --uninstall"
+            else
+                step "安装包不支持 --uninstall 参数,尝试解压查找卸载脚本"
+
+                step "解压安装包到 $TEMP_DIR"
+                run_cmd "sh $PACKAGE_PATH --extract=$TEMP_DIR"
+
+                UNINSTALL_SCRIPT=$(find "$TEMP_DIR" -name "uninstall_cuda*" -type f | head -n1)
+
+                if [[ -n "$UNINSTALL_SCRIPT" ]]; then
+                    step "找到卸载脚本: $UNINSTALL_SCRIPT"
+                    run_cmd "sh $UNINSTALL_SCRIPT"
+                else
+                    warning "未找到卸载脚本,继续手动清理"
+                fi
+            fi
+        fi
+    fi
+
+    # Drop every profile line that mentions this install prefix.
+    step "清理环境变量"
+    if grep -q "cuda-${CUDA_MAJOR_MINOR}" "$ENV_PROFILE"; then
+        run_cmd "sed -i '/cuda-${CUDA_MAJOR_MINOR}/d' $ENV_PROFILE"
+        step "已从 $ENV_PROFILE 移除 CUDA 环境变量"
+    else
+        step "环境变量已清理"
+    fi
+
+    if [[ -d "$CUDA_INSTALL_DIR" ]]; then
+        step "删除安装目录: $CUDA_INSTALL_DIR"
+        run_cmd "rm -rf $CUDA_INSTALL_DIR"
+    else
+        step "安装目录不存在,跳过删除"
+    fi
+
+    # Remove leftovers anywhere on the filesystem whose name contains the
+    # versioned prefix. find regularly exits non-zero on a full scan
+    # (unreadable or vanishing entries); reading from a process substitution
+    # keeps that status from aborting the script under `set -o pipefail`.
+    # NOTE(review): this deletes matching paths on every mounted filesystem -
+    # confirm that an unrestricted scan of / is intended.
+    step "清理残留文件"
+    local file
+    while IFS= read -r file; do
+        if [[ -e "$file" ]]; then
+            step "删除残留文件: $file"
+            rm -rf "$file" 2>/dev/null || warning "无法删除: $file"
+        fi
+    done < <(find / -name "*cuda-${CUDA_MAJOR_MINOR}*" 2>/dev/null || true)
+
+    step "CUDA ${CUDA_VERSION} 卸载完成"
+}
+
+
+
+step "开始 CUDA ${CUDA_VERSION} ${ACTION} 流程"
+case "$ACTION" in
+ install) install_cuda ;;
+ uninstall) uninstall_cuda ;;
+ *) error "未知操作: $ACTION" ;;
+esac
+
+step "CUDA ${CUDA_VERSION} ${ACTION} 流程完成"
diff --git a/scripts/dcgm-exporter.sh b/scripts/dcgm-exporter.sh
new file mode 100644
index 0000000..548cf76
--- /dev/null
+++ b/scripts/dcgm-exporter.sh
@@ -0,0 +1,288 @@
+#!/bin/bash
+set -euo pipefail
+
+# Versions of the Go toolchain and of the exporter to build.
+GO_VERSION="1.21.1"
+DCGM_EXPORTER_VERSION="4.2.0-4.1.0"
+
+# Install directory, systemd unit file and per-run log file.
+DCGM_EXPORTER_DIR="/opt/dcgm-exporter"
+SERVICE_FILE="/etc/systemd/system/dcgm-exporter.service"
+LOG_FILE="/var/log/dcgm-exporter-install_$(date +%Y%m%d%H%M%S).log"
+
+# ANSI colour sequences for console output.
+NC='\033[0m'
+RED='\033[1;31m'
+GREEN='\033[1;32m'
+YELLOW='\033[1;33m'
+
+# 日志函数(控制台+日志文件)
+# Write a timestamped message to the console and append it to $LOG_FILE.
+log() {
+    echo -e "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOG_FILE"
+}
+
+# 成功提示
+# Log a green success message.
+success() {
+    local text="${GREEN}✔ $*${NC}"
+    log "$text"
+}
+
+# 警告提示
+# Log a yellow warning message.
+warning() {
+    local text="${YELLOW}⚠ $*${NC}"
+    log "$text"
+}
+
+# 错误提示
+# Log a red error message and terminate the script with status 1.
+error() {
+    local text="${RED}✖ 错误: $*${NC}"
+    log "$text"
+    exit 1
+}
+
+# 执行命令(带详细错误处理)
+run() {
+ local cmd="$1"
+ local error_msg="${2:-命令执行失败}"
+ local timeout="${3:-30}" # 默认超时30秒
+
+ log "→ 执行: $cmd"
+
+ # 使用timeout防止命令卡死
+ if ! timeout "$timeout" bash -c "$cmd" &>> "$LOG_FILE"; then
+ local exit_code=$?
+ if [[ $exit_code -eq 124 ]]; then
+ error "命令超时 ($timeout秒): $cmd"
+ else
+ error "$error_msg (退出码: $exit_code)"
+ fi
+ fi
+}
+
+# 检测DCGM状态
+check_dcgm() {
+ log "检测DCGM服务状态..."
+
+ # 检查systemctl命令是否存在
+ if ! command -v systemctl &> /dev/null; then
+ error "未找到systemctl命令,请确保系统支持systemd"
+ fi
+
+ # 检查nvidia-dcgm.service文件是否存在
+ if [[ ! -f "/lib/systemd/system/nvidia-dcgm.service" ]]; then
+ error "未找到nvidia-dcgm服务文件,请确认DCGM已正确安装"
+ fi
+
+ # 获取服务状态
+ local status=$(systemctl is-active nvidia-dcgm 2>&1)
+ local exit_code=$?
+
+ if [[ $exit_code -ne 0 ]]; then
+ error "无法获取DCGM服务状态: $status"
+ fi
+
+ if [[ "$status" == "active" ]]; then
+ DCGM_VERSION=$(dcgmi version 2>&1 | grep -oP 'Version \K[^\s]+' || echo "未知")
+ success "DCGM服务运行中 (版本: $DCGM_VERSION)"
+ else
+ error "DCGM服务状态异常 ($status),请先执行: systemctl start nvidia-dcgm"
+ fi
+}
+
+# 安装Go环境
+install_go() {
+ log "安装Go环境 (版本: $GO_VERSION)..."
+ GO_PACKAGE="go${GO_VERSION}.linux-amd64.tar.gz"
+ DL_URL="https://golang.google.cn/dl/$GO_PACKAGE"
+ TMP_PACKAGE="/tmp/$GO_PACKAGE"
+
+ # 下载安装包
+ if [[ ! -f "$TMP_PACKAGE" ]]; then
+ run "wget -qO $TMP_PACKAGE $DL_URL" "下载Go安装包失败"
+ fi
+
+ # 解压安装
+ run "tar -xzf $TMP_PACKAGE -C /usr/local" "解压Go安装包失败"
+
+ # 配置环境变量
+ GO_ENV="/etc/profile.d/go.sh"
+ cat > "$GO_ENV" <<'EOF'
+export GOROOT=/usr/local/go
+export GOPATH=/usr/local/gopath
+export PATH=$PATH:$GOROOT/bin
+export GO111MODULE=on
+export GOPROXY=https://goproxy.cn,direct
+EOF
+ log "→ 配置Go环境变量"
+ if ! chmod +x "$GO_ENV" &>> "$LOG_FILE"; then
+ error "设置Go环境变量文件权限失败"
+ fi
+
+ # 在当前shell中加载环境变量
+ log "→ 加载Go环境变量"
+ if ! source "$GO_ENV"; then
+ error "加载Go环境变量失败"
+ fi
+
+ # 验证安装
+ local go_version=$(go version 2>&1)
+ if [[ "$go_version" == *"go$GO_VERSION"* ]]; then
+ success "Go环境安装完成: $go_version"
+ else
+ error "Go环境验证失败: $go_version"
+ fi
+}
+
+# 卸载现有DCGM Exporter
+uninstall_existing() {
+ log "检查是否存在旧版本DCGM Exporter..."
+
+ if [[ -d "$DCGM_EXPORTER_DIR" || -f "$SERVICE_FILE" ]]; then
+ log "发现旧版本,开始卸载..."
+
+ # 停止服务
+ if systemctl is-active --quiet dcgm-exporter; then
+ run "systemctl stop dcgm-exporter" "停止现有服务失败"
+ fi
+
+ # 禁用服务
+ if systemctl is-enabled --quiet dcgm-exporter; then
+ run "systemctl disable dcgm-exporter" "禁用现有服务失败"
+ fi
+
+ # 删除文件
+ run "rm -rf $DCGM_EXPORTER_DIR" "删除旧安装目录失败"
+ run "rm -f $SERVICE_FILE" "删除服务文件失败"
+
+ # 重新加载systemd
+ run "systemctl daemon-reload" "重新加载systemd失败"
+
+ success "旧版本卸载完成"
+ else
+ success "未发现旧版本,继续安装..."
+ fi
+}
+
+# 安装DCGM Exporter
+install_exporter() {
+ log "安装DCGM Exporter (版本: $DCGM_EXPORTER_VERSION)..."
+
+ # 加载Go环境变量
+ log "→ 加载Go环境变量"
+ if [[ -f "/etc/profile.d/go.sh" ]]; then
+ if ! source "/etc/profile.d/go.sh"; then
+ error "加载Go环境变量失败"
+ fi
+ else
+ error "未找到Go环境变量配置文件"
+ fi
+
+ run "mkdir -p $DCGM_EXPORTER_DIR" "创建工作目录失败"
+
+ # 下载源码(使用固定URL)
+ DCGM_PACKAGE="4.2.0-4.1.0.tar.gz"
+ DL_URL="https://github.com/NVIDIA/dcgm-exporter/archive/refs/tags/$DCGM_PACKAGE"
+ TMP_PACKAGE="/tmp/$DCGM_PACKAGE"
+
+ if [[ ! -f "$TMP_PACKAGE" ]]; then
+ run "wget -qO $TMP_PACKAGE $DL_URL" "下载DCGM Exporter源码失败"
+ fi
+
+ # 解压
+ run "tar xf $TMP_PACKAGE -C /tmp" "解压DCGM Exporter源码失败"
+ SOURCE_DIR="/tmp/dcgm-exporter-$DCGM_EXPORTER_VERSION"
+
+ # 编译安装
+ log "→ 编译DCGM Exporter"
+ if ! cd "$SOURCE_DIR" &>> "$LOG_FILE"; then
+ error "进入源码目录失败"
+ fi
+
+ # 分步骤执行make,便于调试
+ if ! make binary &>> "$LOG_FILE"; then
+ error "编译DCGM Exporter失败"
+ fi
+
+ if ! make install &>> "$LOG_FILE"; then
+ error "安装DCGM Exporter失败"
+ fi
+
+ # 复制文件
+ run "cp -a /usr/bin/dcgm-exporter $DCGM_EXPORTER_DIR/" "复制二进制文件失败"
+ run "cp -a /etc/dcgm-exporter/default-counters.csv $DCGM_EXPORTER_DIR/" "复制配置文件失败"
+
+
+ # 生成服务文件
+ cat > "$SERVICE_FILE" </opt/gpu-manager.log
+ parse_args "$@"
+ define_versions
+ show_version_info
+
+ if [ "$ACTION" = "install" ]; then
+ run_install
+ else
+ run_uninstall
+ fi
+}
+
+# 执行主函数
+main "$@"
diff --git a/scripts/ib-drive.sh b/scripts/ib-drive.sh
new file mode 100644
index 0000000..c9b95df
--- /dev/null
+++ b/scripts/ib-drive.sh
@@ -0,0 +1,260 @@
+#!/bin/bash
+set -e
+
+# 颜色定义
+GREEN='\033[1;32m'
+RED='\033[1;31m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # 重置颜色
+
+# 输出带颜色的信息
+log_info() { echo -e "${GREEN}[INFO] $1${NC}"; }
+log_error() { echo -e "${RED}[ERROR] $1${NC}"; exit 1; }
+log_warning() { echo -e "${YELLOW}[WARNING] $1${NC}"; }
+
+# 默认变量
+ACTION=""
+DRIVER_VERSION="5.8-6.0.4.2"
+DISTRO="ubuntu22.04"
+ARCH="x86_64"
+FORCE=0
+
+# 生成包名和路径
+# Derive the package name, local paths and download URLs from
+# DRIVER_VERSION, DISTRO and ARCH. Call again after any of them changes.
+generate_package_info() {
+    local bundle="MLNX_OFED_LINUX-${DRIVER_VERSION}-${DISTRO}-${ARCH}"
+
+    DRIVER_PACKAGE="${bundle}.tgz"
+    DRIVER_DIR="/opt/${bundle}"
+    PACKAGE_PATH="/opt/${DRIVER_PACKAGE}"
+    INTERNAL_URL="http://10.101.0.51:5588/mlnx-ofed/${DRIVER_PACKAGE}"
+    OFFICIAL_URL="https://example.com/${DRIVER_PACKAGE}" # placeholder: replace with the real vendor download address
+}
+
+# 解析命令行参数
+parse_args() {
+ while [[ $# -gt 0 ]]; do
+ case "$1" in
+ --install)
+ ACTION="install"
+ shift
+ ;;
+ --uninstall)
+ ACTION="uninstall"
+ shift
+ ;;
+ --version)
+ if [[ -z "$2" ]]; then
+ log_error "请指定版本号,如: --version 5.8-6.0.4.2"
+ fi
+ DRIVER_VERSION="$2"
+ generate_package_info
+ shift 2
+ ;;
+ --force)
+ FORCE=1
+ shift
+ ;;
+ *)
+ log_error "未知参数: $1"
+ ;;
+ esac
+ done
+
+ if [[ -z "$ACTION" ]]; then
+ log_error "请指定操作: --install 或 --uninstall"
+ fi
+}
+
+# 下载驱动包
+# Make sure the driver archive exists at $PACKAGE_PATH.
+# An existing file is reused; otherwise the internal mirror is tried first and
+# $OFFICIAL_URL second. Aborts through log_error when both fail.
+download_driver() {
+    log_info "开始下载驱动包: $DRIVER_PACKAGE"
+    if [ -f "$PACKAGE_PATH" ]; then
+        log_info "使用本地驱动包: $PACKAGE_PATH"
+        return 0
+    fi
+
+    # Download to a temporary name: `wget -O` creates the target even when the
+    # transfer fails, and a truncated archive at $PACKAGE_PATH would be
+    # mistaken for a valid local package on the next run.
+    local partial="${PACKAGE_PATH}.part"
+
+    log_info "本地包不存在,尝试从内网下载"
+    if wget -q -O "$partial" "$INTERNAL_URL"; then
+        mv -f "$partial" "$PACKAGE_PATH"
+        log_info "内网下载成功"
+        return 0
+    fi
+
+    log_warning "内网下载失败,尝试从官网下载"
+    if wget -q -O "$partial" "$OFFICIAL_URL"; then
+        mv -f "$partial" "$PACKAGE_PATH"
+        log_info "官网下载成功"
+        return 0
+    fi
+
+    rm -f "$partial"
+    log_error "驱动包下载失败,请手动放置到 /opt/"
+}
+
+# 安装驱动
+# Unpack the MLNX_OFED archive and run its installer for the running kernel.
+# Output of the package manager and of the installer goes to /tmp/mlnx_install.log.
+install_driver() {
+    local install_log="/tmp/mlnx_install.log"
+
+    log_info "开始安装驱动: $DRIVER_VERSION"
+
+    # Disabled check for an already installed driver, kept for reference:
+    #if command -v ibv_devinfo &> /dev/null && [[ $FORCE -eq 0 ]]; then
+    #    log_warning "检测到驱动已安装,使用 --force 覆盖安装"
+    #    exit 0
+    #fi
+
+    kernel_version=$(uname -r)
+    log_info "当前内核版本: $kernel_version"
+
+    log_info "安装依赖包"
+    {
+        apt update
+        apt install -y net-tools bzip2
+    } &>> "$install_log"
+
+    log_info "解压驱动包"
+    tar -zxf "$PACKAGE_PATH" -C /opt/
+
+    log_info "执行驱动安装"
+    cd "$DRIVER_DIR"
+    ./mlnxofedinstall --without-dkms --add-kernel-support --kernel "$kernel_version" --with-fw-update --force &>> "$install_log"
+    sleep 10
+}
+
+# 卸载驱动(修改后版本)
+# Uninstall the MLNX_OFED driver with the uninstaller shipped in the driver
+# bundle, then remove the bundle, disable openibd and reset the udev naming
+# rules and the nvidia modprobe option written during installation.
+uninstall_driver() {
+    log_info "开始卸载驱动: $DRIVER_VERSION"
+
+    # The uninstaller lives inside the unpacked bundle; fetch and unpack the
+    # bundle again when the directory is gone.
+    if [ ! -d "$DRIVER_DIR" ]; then
+        log_warning "驱动目录不存在,尝试重新下载和解压"
+        download_driver
+        log_info "解压驱动包"
+        tar -zxf "$PACKAGE_PATH" -C /opt/
+        if [ ! -d "$DRIVER_DIR" ]; then
+            log_error "解压失败,无法找到驱动目录: $DRIVER_DIR"
+        else
+            log_info "成功解压驱动包到: $DRIVER_DIR"
+        fi
+    else
+        log_info "找到驱动目录: $DRIVER_DIR"
+    fi
+
+    cd "$DRIVER_DIR"
+    log_info "执行卸载脚本"
+    ./uninstall.sh -q -y &>> /tmp/mlnx_install.log || log_warning "卸载脚本执行失败,尝试手动清理"
+
+    log_info "清理残留文件"
+    rm -rf "$DRIVER_DIR" "$PACKAGE_PATH"
+
+    log_info "停止并禁用openibd服务"
+    systemctl stop openibd.service &>> /tmp/mlnx_install.log || true
+    systemctl disable openibd.service &>> /tmp/mlnx_install.log || true
+
+    # Keep only comment lines in the rule files. The files may not exist (for
+    # example when the installation never completed); as in
+    # configure_naming_rules, a failing sed must not abort the cleanup under
+    # `set -e`.
+    log_info "恢复网卡命名规则"
+    sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-ipoib.rules &>> /tmp/mlnx_install.log || true
+    sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-net.rules &>> /tmp/mlnx_install.log || true
+    rm -f /etc/modprobe.d/nvidia-gsp.conf
+    update-initramfs -u &>> /tmp/mlnx_install.log
+}
+
+# 配置网卡命名规则
+configure_naming_rules() {
+ log_info "配置IB网卡命名规则"
+
+ log_info "备份原有规则"
+ cp /etc/udev/rules.d/70-persistent-ipoib.rules /etc/udev/rules.d/70-persistent-ipoib.rules.bak &>> /tmp/mlnx_install.log || true
+ cp /etc/udev/rules.d/70-persistent-net.rules /etc/udev/rules.d/70-persistent-net.rules.bak &>> /tmp/mlnx_install.log || true
+
+ log_info "清除原有规则"
+ sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-ipoib.rules &>> /tmp/mlnx_install.log || true
+ sed -i '/^\s*#/!d' /etc/udev/rules.d/70-persistent-net.rules &>> /tmp/mlnx_install.log || true
+
+ log_info "生成IB设备命名规则"
+ ID=20
+ for i in $(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}'); do
+ if [ -n "$i" ]; then
+ echo "ACTION==\"add\", KERNELS==\"$i\", SUBSYSTEM==\"infiniband\",PROGRAM=\"rdma_rename %k NAME_FIXED mlx5_$ID\"" >> /etc/udev/rules.d/70-persistent-ipoib.rules
+ ID=$((ID+1))
+ fi
+ done
+
+ log_info "生成网络设备命名规则"
+ IDS=0
+ for j in $(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}'); do
+ if [ -n "$j" ]; then
+ echo "SUBSYSTEM==\"net\", ACTION==\"add\", KERNELS==\"$j\", NAME=\"ib$IDS\"" >> /etc/udev/rules.d/70-persistent-net.rules
+ IDS=$((IDS+1))
+ fi
+ done
+
+ log_info "配置nvidia选项"
+ echo "options nvidia NVreg_EnableGpuFirmware=0" > /etc/modprobe.d/nvidia-gsp.conf
+ update-initramfs -u &>> /tmp/mlnx_install.log
+
+ log_info "重启openibd服务"
+ systemctl restart openibd.service
+ sleep 15
+}
+
+# 检查驱动安装结果
+check_installation() {
+ log_info "检查驱动安装结果"
+ if command -v ibv_devinfo &> /dev/null; then
+ log_info "驱动安装成功"
+ else
+ log_error "驱动安装失败"
+ fi
+
+ log_info "检查网卡命名规则"
+ valid_count=0
+ for dev in $(ibdev2netdev -v | grep -v 200G | grep -v 25G | awk '{print $1}'); do
+ if [ -n "$dev" ]; then
+ mlx_name=$(udevadm info -q name -n "$dev" 2>/dev/null | grep "mlx5_" || true)
+ net_name=$(ip link show "$dev" | grep "ib[0-9]" || true)
+
+ if [ -n "$mlx_name" ] || [ -n "$net_name" ]; then
+ valid_count=$((valid_count+1))
+ else
+ log_warning "网卡 $dev 命名规则未生效"
+ fi
+ fi
+ done
+
+ if [ $valid_count -gt 0 ]; then
+ log_info "网卡命名规则生效,成功配置 $valid_count 个网卡"
+ else
+ log_warning "所有网卡命名规则均未生效,建议重启系统手工配置!"
+ fi
+}
+
+# 检查卸载结果
+# Post-uninstall check: report whether the ibv_devinfo command and the driver
+# directory are gone. Only reports; never aborts.
+check_uninstallation() {
+    log_info "检查卸载结果"
+
+    if command -v ibv_devinfo &> /dev/null; then
+        log_warning "驱动命令仍存在,可能需要手动清理"
+    else
+        log_info "驱动已成功卸载"
+    fi
+
+    if [ -d "$DRIVER_DIR" ]; then
+        log_warning "驱动目录未完全删除: $DRIVER_DIR"
+    else
+        log_info "驱动目录已删除"
+    fi
+}
+
+# 主函数
+main() {
+ generate_package_info
+ parse_args "$@"
+
+ log_info "开始执行 ${ACTION^^} 操作,版本: $DRIVER_VERSION"
+
+ case "$ACTION" in
+ install)
+ download_driver
+ install_driver
+ configure_naming_rules
+ check_installation
+ ;;
+ uninstall)
+ uninstall_driver
+ check_uninstallation
+ ;;
+ esac
+
+ log_info "操作完成!"
+}
+
+# 执行主函数
+main "$@"
diff --git a/scripts/linux_security_hardening.sh b/scripts/linux_security_hardening.sh
new file mode 100644
index 0000000..e35f10b
--- /dev/null
+++ b/scripts/linux_security_hardening.sh
@@ -0,0 +1,180 @@
+#!/bin/bash
+##############################################################################################################################
+# Linux security hardening. Steps performed by this script:
+# 1. Password aging: maximum age 90 days, PASS_MIN_LEN 8, expiry warning 7 days ahead.
+# 2. Password complexity: at least 12 characters covering all four classes
+#    (upper case, lower case, digit, special character).
+# 3. Login failure handling: lock after 5 failures, unlock after 10 minutes.
+# 4. Idle session timeout: 10 minutes.
+# 5. Local log retention: 6 months.
+# 6. Disable root login over SSH.
+# 7. Start the rsyslog and auditd services.
+# 8. Enable PAM authentication in sshd.
+# 9. Install system utilities.
+##############################################################################################################################
+# The shebang must be the very first line of the file; a blank line in front
+# of it makes the kernel ignore it.
+
+# Password-aging policy for /etc/login.defs, one "KEY VALUE" pair per variable.
+LOGIN_DEFS_POLICY_MAX_DAYS="PASS_MAX_DAYS 90"
+LOGIN_DEFS_POLICY_MIN_DAYS="PASS_MIN_DAYS 0"
+LOGIN_DEFS_POLICY_MIN_LEN="PASS_MIN_LEN 8"
+LOGIN_DEFS_POLICY_WARN_AGE="PASS_WARN_AGE 7"
+
+echo "正在编辑 /etc/login.defs 文件..."
+# Replace every key that already has an active line; append the key when the
+# file has none, so the policy is applied in both cases.
+for login_defs_policy in \
+    "$LOGIN_DEFS_POLICY_MAX_DAYS" \
+    "$LOGIN_DEFS_POLICY_MIN_DAYS" \
+    "$LOGIN_DEFS_POLICY_MIN_LEN" \
+    "$LOGIN_DEFS_POLICY_WARN_AGE"; do
+    login_defs_key="${login_defs_policy%% *}"
+    if grep -q "^${login_defs_key}" /etc/login.defs; then
+        sed -i "s/^${login_defs_key}.*/${login_defs_policy}/" /etc/login.defs
+    else
+        echo "${login_defs_policy}" >> /etc/login.defs
+    fi
+done
+
+# Password complexity settings for /etc/security/pwquality.conf.
+PWQUALITY_POLICY_MINLEN="minlen = 12"
+PWQUALITY_POLICY_MINCLASS="minclass = 4"
+PWQUALITY_POLICY_DCREDIT="dcredit = -1"
+PWQUALITY_POLICY_UCREDIT="ucredit = -1"
+PWQUALITY_POLICY_LCREDIT="lcredit = -1"
+PWQUALITY_POLICY_OCREDIT="ocredit = -1"
+PWQUALITY_POLICY_FOR_ROOT="enforce_for_root"
+PWQUALITY_POLICY_DIFOK="difok = 5"
+
+echo "正在编辑 /etc/security/pwquality.conf 文件配置文件口令复杂度"
+# For each setting: rewrite the active line, else activate the "# key" line,
+# else append it. Without the last case a setting that is missing from the
+# file altogether would be skipped silently.
+for pwq_setting in \
+    "$PWQUALITY_POLICY_MINLEN" \
+    "$PWQUALITY_POLICY_MINCLASS" \
+    "$PWQUALITY_POLICY_DCREDIT" \
+    "$PWQUALITY_POLICY_UCREDIT" \
+    "$PWQUALITY_POLICY_LCREDIT" \
+    "$PWQUALITY_POLICY_OCREDIT" \
+    "$PWQUALITY_POLICY_FOR_ROOT" \
+    "$PWQUALITY_POLICY_DIFOK"; do
+    # The key is the first word of the setting ("enforce_for_root" has no value).
+    pwq_key="${pwq_setting%% *}"
+    if grep -q "^${pwq_key}" /etc/security/pwquality.conf; then
+        sed -i "s/^${pwq_key}.*/${pwq_setting}/" /etc/security/pwquality.conf
+    elif grep -q "^# ${pwq_key}" /etc/security/pwquality.conf; then
+        sed -i "s/^# ${pwq_key}.*/${pwq_setting}/" /etc/security/pwquality.conf
+    else
+        echo "${pwq_setting}" >> /etc/security/pwquality.conf
+    fi
+done
+
+# Update the pam_pwquality line in system-auth and password-auth: when a
+# "password requisite pam_pwquality.so" line exists, make sure it carries
+# try_first_pass and retry=5 (an existing retry=N is rewritten to retry=5).
+# `grep -E` replaces the obsolescent `egrep`; the sed programs are unchanged.
+grep -Eq "^\s*password\s+requisite\s+pam_pwquality.so\s+" /etc/pam.d/system-auth && sed -ri '/^\s*password\s+requisite\s+pam_pwquality.so\s+/ { /^\s*password\s+requisite\s+pam_pwquality.so(\s+\S+)*(\s+try_first_pass)(\s+.*)?$/! s/^(\s*password\s+requisite\s+pam_pwquality.so\s+)(.*)$/\1try_first_pass \2/ }' /etc/pam.d/system-auth && sed -ri '/^\s*password\s+requisite\s+pam_pwquality.so\s+/ { /^\s*password\s+requisite\s+pam_pwquality.so(\s+\S+)*(\s+retry=[0-9]+)(\s+.*)?$/! s/^(\s*password\s+requisite\s+pam_pwquality.so\s+)(.*)$/\1retry=5 \2/ }' /etc/pam.d/system-auth && sed -ri "s/(^\s*password\s+requisite\s+pam_pwquality.so(\s+\S+)*\s+)retry=[0-9]+(\s+.*)?$/\1retry=5\3/" /etc/pam.d/system-auth
+sleep 2s
+grep -Eq "^\s*password\s+requisite\s+pam_pwquality.so\s+" /etc/pam.d/password-auth && sed -ri '/^\s*password\s+requisite\s+pam_pwquality.so\s+/ { /^\s*password\s+requisite\s+pam_pwquality.so(\s+\S+)*(\s+try_first_pass)(\s+.*)?$/! s/^(\s*password\s+requisite\s+pam_pwquality.so\s+)(.*)$/\1try_first_pass \2/ }' /etc/pam.d/password-auth && sed -ri '/^\s*password\s+requisite\s+pam_pwquality.so\s+/ { /^\s*password\s+requisite\s+pam_pwquality.so(\s+\S+)*(\s+retry=[0-9]+)(\s+.*)?$/! s/^(\s*password\s+requisite\s+pam_pwquality.so\s+)(.*)$/\1retry=5 \2/ }' /etc/pam.d/password-auth && sed -ri "s/(^\s*password\s+requisite\s+pam_pwquality.so(\s+\S+)*\s+)retry=[0-9]+(\s+.*)?$/\1retry=5\3/" /etc/pam.d/password-auth
+
+# Account lockout on repeated authentication failures (pam_faillock):
+# deny=5, unlock_time=600, root included.
+echo "正在编辑 /etc/pam.d/password-auth 文件配置密码验证失败处理策略"
+for pam_stack in /etc/pam.d/system-auth /etc/pam.d/password-auth; do
+    # A file that already holds the exact authfail rule is left alone; otherwise
+    # the preauth/authfail pair is inserted in front of the pam_env.so auth line.
+    if ! grep -Pq "^auth\s*required\s*pam_faillock.so\s*authfail\s*even_deny_root\s*deny=5\s*unlock_time=600\s*$" "$pam_stack"; then
+        sed -ri "/^auth.*pam_env.so$/i auth required pam_faillock.so preauth silent even_deny_root deny=5 unlock_time=600\nauth required pam_faillock.so authfail even_deny_root deny=5 unlock_time=600" "$pam_stack"
+    fi
+done
+
+
+# Idle-shell timeout for all users: TMOUT=600 seconds, set in /etc/profile.
+echo "正在编辑 /etc/profile 文件配置终端超时自动登出设置要求针对所有用户,自动登退时间为600s"
+if ! grep -q "^export TMOUT" /etc/profile; then
+    # No export yet: add one at the end of the file.
+    echo "export TMOUT=600" >> /etc/profile
+else
+    # An export exists: overwrite its value.
+    sed -i "s/^export TMOUT.*/export TMOUT=600/" /etc/profile
+fi
+
+# Keep about six months of local logs: 26 rotations.
+# NOTE(review): 26 equals six months only with a weekly rotation schedule —
+# confirm the schedule configured in /etc/logrotate.conf.
+echo "正在编辑 /etc/logrotate.conf 文件设置日志本地保存时间6个月"
+if grep -q "^rotate" /etc/logrotate.conf; then
+    # Anchored to the start of the line: an unanchored "rotate.*" also rewrites
+    # comments and every other line that merely contains the word "rotate".
+    sed -i "s/^rotate.*/rotate 26/" /etc/logrotate.conf
+fi
+
+# sshd hardening: forbid root login, enable public-key and PAM
+# authentication, and turn off reverse DNS lookups.
+echo "正在编辑 /etc/ssh/sshd_config 文件禁止root ssh远程登录"
+
+# PermitRootLogin: rewrite the active directive, or append one if none exists.
+if ! grep -q "^PermitRootLogin" /etc/ssh/sshd_config; then
+    echo "PermitRootLogin no" >> /etc/ssh/sshd_config
+else
+    sed -i "s/^PermitRootLogin.*/PermitRootLogin no/" /etc/ssh/sshd_config
+fi
+
+# Remaining options: rewrite the active directive, otherwise activate the
+# commented-out "#Option" line. Nothing is appended for these.
+for sshd_option in "PubkeyAuthentication yes" "UseDNS no" "UsePAM yes"; do
+    sshd_key="${sshd_option%% *}"
+    if grep -q "^${sshd_key}" /etc/ssh/sshd_config; then
+        sed -i "s/^${sshd_key}.*/${sshd_option}/" /etc/ssh/sshd_config
+    else
+        sed -i "s/^#${sshd_key}.*/${sshd_option}/" /etc/ssh/sshd_config
+    fi
+done
+
+# Restart sshd so the new configuration takes effect. The restart only happens
+# when sshd_config contains an active "PubkeyAuthentication yes" line.
+echo "正在重启sshd服务....."
+if grep -Pq '^PubkeyAuthentication yes' /etc/ssh/sshd_config;then
+ systemctl restart sshd
+fi
+sleep 2s
+# Restart/start rsyslog and start auditd; each service is enabled at boot only
+# if its start command succeeded.
+echo "正在启动rsyslog和auditd服务"
+systemctl restart rsyslog.service
+systemctl start rsyslog.service && systemctl enable rsyslog.service
+sleep 2s
+systemctl start auditd.service && systemctl enable auditd.service
+# Operator reminders: change the default password; root SSH login is now disabled.
+echo "请自行修改操作系统默认密码。并做好密码保存。"
+echo "已禁止root ssh远程登录,请使用scloudadmin账号登录,如无法登录请通过ipmi远程控制登录"
+
+# 9. Install system utilities, one dnf transaction per package.
+echo "安装sysstat ipmitool vim pciutils net-tools工具包"
+for tool_pkg in sysstat.x86_64 ipmitool.x86_64 vim pciutils.x86_64 net-tools.x86_64; do
+    dnf -y install "$tool_pkg"
+done
+echo "所有操作已完成。"
diff --git a/scripts/node-exporter.sh b/scripts/node-exporter.sh
new file mode 100644
index 0000000..db719bb
--- /dev/null
+++ b/scripts/node-exporter.sh
@@ -0,0 +1,177 @@
+#!/bin/bash
+set -euo pipefail
+
+# Global settings
+# Per-run log file; the name carries a timestamp taken when the script starts.
+LOG_FILE="/var/log/node_exporter_$(date +%Y%m%d%H%M%S).log"
+NODE_EXPORTER_VERSION="1.8.2"
+# Download sources, tried in this order: internal mirror, then GitHub releases.
+PRIMARY_DOWNLOAD_URL="http://10.101.0.51:5588/node-exporter/node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64.tar.gz"
+BACKUP_DOWNLOAD_URL="https://github.com/prometheus/node_exporter/releases/download/v${NODE_EXPORTER_VERSION}/node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64.tar.gz"
+# A tarball already present at this path is used instead of downloading.
+LOCAL_PACKAGE_PATH="/opt/node_exporter.tar.gz"
+
+# ANSI color codes for console output
+GREEN='\033[1;32m'
+RED='\033[1;31m'
+NC='\033[0m' # reset color
+
+# Append a timestamped line to $LOG_FILE. All arguments form the message.
+log() {
+    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
+}
+
+# Announce a step: print "$1" in green on the console and record it in the log.
+step() {
+    echo -e "${GREEN}==> $1${NC}"
+    log "==> $1"
+}
+
+# Report a fatal error: print "$1" in red, record it in the log, exit with 1.
+error() {
+    local text
+    text="错误: $1"
+    echo -e "${RED}${text}${NC}"
+    log "$text"
+    exit 1
+}
+
+# Run the command string "$1" through eval in the current shell, with all of
+# its output appended to $LOG_FILE. A non-zero status aborts via error().
+run_cmd() {
+    step "执行: $1"
+    if ! eval "$1" &>> "$LOG_FILE"; then
+        error "命令执行失败: $1"
+    fi
+}
+
+# Probe a URL with a HEAD request (10 s connect timeout).
+# $1: URL to probe. Returns 0 when curl succeeds, 1 otherwise.
+test_network_connectivity() {
+    local target=$1
+    step "测试网络连通性: $target"
+    curl -fsSLI --connect-timeout 10 "$target" &>> "$LOG_FILE" || return 1
+    return 0
+}
+
+# Download a file with wget.
+# $1: source URL, $2: destination path.
+# Returns 0 on success, 1 on failure. On failure the destination is removed.
+download_file() {
+    local url=$1
+    local dest=$2
+    step "下载文件: $url 到 $dest"
+    if wget -qO "$dest" "$url" &>> "$LOG_FILE"; then
+        return 0
+    fi
+    # wget -O creates/truncates the destination before the transfer starts, so
+    # a failed download leaves an empty or partial file behind. Remove it so a
+    # later "-f" existence check does not mistake it for a valid package.
+    rm -f "$dest"
+    return 1
+}
+
+# 安装 node_exporter
+install_node_exporter() {
+ step "开始安装 node_exporter ${NODE_EXPORTER_VERSION}"
+
+ # 切换到 /opt 目录
+ run_cmd "cd /opt"
+
+ # 检查本地是否存在安装包
+ if [[ -f "$LOCAL_PACKAGE_PATH" ]]; then
+ step "发现本地安装包: $LOCAL_PACKAGE_PATH"
+ DOWNLOAD_URL="$LOCAL_PACKAGE_PATH"
+ else
+ # 测试主要下载地址的连通性
+ if test_network_connectivity "$PRIMARY_DOWNLOAD_URL"; then
+ DOWNLOAD_URL="$PRIMARY_DOWNLOAD_URL"
+ elif test_network_connectivity "$BACKUP_DOWNLOAD_URL"; then
+ DOWNLOAD_URL="$BACKUP_DOWNLOAD_URL"
+ else
+ error "无法连接到任何下载地址"
+ fi
+
+ # 下载 node_exporter
+ download_file "$DOWNLOAD_URL" "node_exporter.tar.gz"
+ fi
+
+ # 解压 tar 包
+ run_cmd "tar -zxvf node_exporter.tar.gz"
+
+ # 移动文件夹
+ run_cmd "mv node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64/ ./node_exporter"
+
+ # 进入 node_exporter 目录
+ run_cmd "cd node_exporter/"
+
+ # 创建 bin 目录并移动二进制文件
+ run_cmd "mkdir bin"
+ run_cmd "mv node_exporter bin/"
+
+ # 配置 systemd 服务
+ cat > /lib/systemd/system/node_exporter.service <> "$LOG_FILE"
+}
+
+# Announce a step: print "$1" in green on the console and record it in the log.
+step() {
+    echo -e "${GREEN}==> $1${NC}"
+    log "==> $1"
+}
+
+# Report a fatal error: print "$1" in red, record it in the log, exit with 1.
+error() {
+    local text
+    text="错误: $1"
+    echo -e "${RED}${text}${NC}"
+    log "$text"
+    exit 1
+}
+
+# Run the command string "$1" through eval in the current shell, with all of
+# its output appended to $LOG_FILE. A non-zero status aborts via error().
+run_cmd() {
+    step "执行: $1"
+    if ! eval "$1" &>> "$LOG_FILE"; then
+        error "命令执行失败: $1"
+    fi
+}
+
+# Read the distribution id and version from /etc/os-release.
+# Globals written: OS_ID, OS_VERSION. Aborts when the file is missing.
+detect_os_version() {
+    [[ -f /etc/os-release ]] || error "无法检测到操作系统版本"
+
+    . /etc/os-release
+    OS_ID=$ID
+    OS_VERSION=$VERSION_ID
+    step "检测到系统: ${OS_ID} ${OS_VERSION}"
+}
+
+# Remove an existing DCGM setup before (re)installing: the CUDA apt source
+# list and any installed DCGM packages. Package steps are best effort.
+cleanup_dcgm_config() {
+    step "清理现有的 DCGM 配置"
+    run_cmd "rm -f /etc/apt/sources.list.d/cuda-ubuntu2004-x86_64.list"
+    # The installer in this script installs datacenter-gpu-manager-4-cuda<N>,
+    # which the bare name "datacenter-gpu-manager" does not match. The pattern
+    # covers both the plain and the versioned package names.
+    run_cmd "apt-get remove -y 'datacenter-gpu-manager*' || true"
+    run_cmd "apt-get autoremove -y || true"
+    run_cmd "apt-get autoclean -y || true"
+}
+
+# Install NVIDIA's cuda-keyring package: download the .deb into the current
+# directory, install it with dpkg, then delete the downloaded file.
+# NOTE(review): the URL points at the ubuntu2004 repository while the caller
+# is only dispatched for ubuntu-22.04 — confirm this is intended.
+add_cuda_keyring() {
+ step "添加 CUDA 仓库密钥"
+ run_cmd "wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb"
+ run_cmd "dpkg -i cuda-keyring_1.0-1_all.deb"
+ run_cmd "rm cuda-keyring_1.0-1_all.deb"
+}
+
+# Refresh the apt package index; a failure aborts the script through run_cmd.
+update_package_list() {
+ step "更新包列表"
+ run_cmd "apt-get update"
+}
+
+# Install DCGM on Ubuntu 22.04: clean up, register the CUDA apt repository,
+# install the DCGM package matching the CUDA major version reported by
+# nvidia-smi, then enable, restart and verify the nvidia-dcgm service.
+# Every run_cmd step aborts the script on failure.
+install_dcgm_for_ubuntu_22() {
+ step "开始安装 DCGM for Ubuntu 22"
+
+ # Remove any existing DCGM configuration
+ cleanup_dcgm_config
+
+ # Install the CUDA repository keyring
+ add_cuda_keyring
+
+ # Add the CUDA repository
+ # NOTE(review): the path is ubuntu2004 although this function targets 22.04,
+ # and cuda-keyring may already register this repository — confirm both.
+ run_cmd "add-apt-repository \"deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /\" -y"
+
+ # Refresh the package index
+ update_package_list
+
+ # Extract the CUDA major version from the "CUDA Version: X.Y" text of nvidia-smi
+ CUDA_VERSION=$(nvidia-smi | sed -E -n 's/.*CUDA Version: ([0-9]+)[.].*/\1/p')
+ if [[ -z "$CUDA_VERSION" ]]; then
+ error "无法检测到 CUDA 版本"
+ fi
+
+ # Install the DCGM package built for that CUDA major version
+ run_cmd "apt install --install-recommends datacenter-gpu-manager-4-cuda${CUDA_VERSION} -y"
+
+ # Enable and start the DCGM service, then restart it and log its status
+ run_cmd "systemctl --now enable nvidia-dcgm"
+ run_cmd "systemctl restart nvidia-dcgm"
+ run_cmd "systemctl status nvidia-dcgm"
+
+ # Verify the installation by listing the discovered devices
+ run_cmd "dcgmi discovery -l"
+
+ step "DCGM 安装成功"
+}
+
+# Uninstall DCGM: stop and disable the service, remove the packages, delete
+# the CUDA keyring and apt source list, then refresh the package index.
+# Service and package steps are best effort ("|| true").
+uninstall_dcgm() {
+    step "开始卸载 DCGM"
+
+    # Stop and disable the DCGM service
+    run_cmd "systemctl stop nvidia-dcgm || true"
+    run_cmd "systemctl disable nvidia-dcgm || true"
+
+    # Remove the DCGM packages. The installer in this script installs
+    # datacenter-gpu-manager-4-cuda<N>; the bare name "datacenter-gpu-manager"
+    # would leave that package installed, so match by pattern instead.
+    run_cmd "apt-get remove -y 'datacenter-gpu-manager*' || true"
+    run_cmd "apt-get autoremove -y || true"
+    run_cmd "apt-get autoclean -y || true"
+
+    # Remove the CUDA repository keyring
+    run_cmd "rm -f /usr/share/keyrings/cuda-archive-keyring.gpg"
+
+    # Remove the CUDA repository source list
+    run_cmd "rm -f /etc/apt/sources.list.d/cuda-ubuntu2004-x86_64.list"
+
+    # Refresh the package index
+    update_package_list
+
+    step "DCGM 卸载完成"
+}
+
+# Main flow: detect the OS, require exactly one argument, then dispatch.
+step "NVIDIA DCGM 安装脚本启动"
+detect_os_version
+
+[[ $# -eq 1 ]] || error "请使用 --install 或 --uninstall"
+
+ACTION=$1
+if [[ "$ACTION" == "--install" ]]; then
+    # Installation is only implemented for Ubuntu 22.04.
+    if [[ "$OS_ID-$OS_VERSION" == "ubuntu-22.04" ]]; then
+        install_dcgm_for_ubuntu_22
+    else
+        error "不支持的操作系统版本: ${OS_ID} ${OS_VERSION}"
+    fi
+elif [[ "$ACTION" == "--uninstall" ]]; then
+    uninstall_dcgm
+else
+    error "无效的参数,请使用 --install 或 --uninstall"
+fi
+
+step "操作完成,日志路径: $LOG_FILE"
diff --git a/scripts/nvidia-driver.sh b/scripts/nvidia-driver.sh
new file mode 100644
index 0000000..a323ce7
--- /dev/null
+++ b/scripts/nvidia-driver.sh
@@ -0,0 +1,235 @@
+#!/bin/bash
+set -euo pipefail
+
+# Global settings
+# Driver version used when --version is not given.
+DEFAULT_VERSION="565.57.01"
+# Internal mirror for the driver packages.
+INTERNAL_BASE_URL="http://10.101.0.51:5588/nvidia-linux"
+# OFFICIAL_BASE_URL is assigned after argument parsing because it depends on VERSION.
+#OFFICIAL_BASE_URL="https://cn.download.nvidia.com/tesla/${VERSION}"
+# printf template for the installer file name; %s is the version.
+PACKAGE_TEMPLATE="NVIDIA-Linux-x86_64-%s.run"
+INSTALL_DIR="/opt"
+# Per-run log file; the name carries a timestamp taken when the script starts.
+LOG_FILE="/var/log/nvidia-driver_$(date +%Y%m%d%H%M%S).log"
+# systemd unit files managed by this script.
+SERVICE_FILE="/etc/systemd/system/nvidia_peermem.service"
+PERSISTENCE_SERVICE="/etc/systemd/system/nvidia-persistenced.service"
+
+# ANSI color codes for console output
+GREEN='\033[1;32m'
+RED='\033[1;31m'
+NC='\033[0m' # reset color
+
+# Append a timestamped line to $LOG_FILE. All arguments form the message.
+log() {
+    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
+}
+
+# Announce a step: print "$1" in green on the console and record it in the log.
+step() {
+    echo -e "${GREEN}==> $1${NC}"
+    log "==> $1"
+}
+
+# Report a fatal error: print "$1" in red, record it in the log, exit with 1.
+error() {
+    local text
+    text="错误: $1"
+    echo -e "${RED}${text}${NC}"
+    log "$text"
+    exit 1
+}
+
+# Run the command string "$1" through eval in the current shell, with all of
+# its output appended to $LOG_FILE. A non-zero status aborts via error().
+run_cmd() {
+    step "执行: $1"
+    if ! eval "$1" &>> "$LOG_FILE"; then
+        error "命令执行失败: $1"
+    fi
+}
+
+# Command-line parsing.
+#   --install | --uninstall   select the action (one of them is required)
+#   --version <ver>           override DEFAULT_VERSION
+ACTION=""
+VERSION="$DEFAULT_VERSION"
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --install) ACTION="install"; shift ;;
+        --uninstall) ACTION="uninstall"; shift ;;
+        --version)
+            # Under "set -u" a missing value would abort on "$2" with an
+            # "unbound variable" message; report a proper error instead.
+            [[ $# -ge 2 ]] || error "--version 缺少版本号"
+            VERSION="$2"; shift 2 ;;
+        *) error "未知参数 $1" ;;
+    esac
+done
+# Depends on the final VERSION, hence assigned after parsing.
+OFFICIAL_BASE_URL="https://cn.download.nvidia.com/tesla/${VERSION}"
+
+[[ -z "$ACTION" ]] && { error "必须指定 --install 或 --uninstall"; }
+
+# peermem_service 开机启动函数
+install_peermem_service() {
+ step "开始配置 nvidia_peermem 开机启动"
+ # 创建服务文件
+ cat > "$SERVICE_FILE" < "$PERSISTENCE_SERVICE" <> "$LOG_FILE" 2>&1 || {
+ log "版本验证失败"; exit 1
+ }
+
+ install_peermem_service # 开机启动加载:nvidia_peermem
+ install_persistence_service # 开机启动 GPU 持久模式
+ step "安装完成"
+}
+
+# Uninstall the GPU driver with the vendor installer of the selected version.
+# The installer is taken from $INSTALL_DIR, or downloaded when absent. If the
+# installer cannot be obtained or its --uninstall run fails, a hint is printed
+# and the function returns without removing the systemd services.
+# Globals written: PACKAGE_NAME, PACKAGE_PATH.
+uninstall_driver() {
+    step "开始卸载显卡驱动,版本:$VERSION"
+    PACKAGE_NAME=$(printf "$PACKAGE_TEMPLATE" "$VERSION")
+    PACKAGE_PATH="${INSTALL_DIR}/${PACKAGE_NAME}"
+
+    # Make sure the installer is available locally.
+    if [[ -f "$PACKAGE_PATH" ]]; then
+        step "找到安装包,使用安装包卸载"
+    else
+        step "未找到本地安装包,尝试下载卸载包"
+        download_package "$PACKAGE_NAME" "$PACKAGE_PATH"
+        if [[ ! -f "$PACKAGE_PATH" ]]; then
+            step "无法找到或下载卸载包,请手动卸载驱动程序"
+            return
+        fi
+    fi
+
+    # Run the installer in uninstall mode.
+    cd "$INSTALL_DIR"
+    run_cmd "chmod +x $PACKAGE_NAME"
+    if ! ./"$PACKAGE_NAME" --uninstall -q -s &>/dev/null; then
+        step "卸载失败,请手动卸载驱动程序"
+        return
+    fi
+
+    uninstall_peermem_service # remove the peermem service
+    uninstall_persistence_service # remove the persistenced service
+    step "卸载完成"
+}
+
+# Download the driver installer, trying the internal mirror first and the
+# official site second.
+# $1: package file name, $2: destination path.
+# Returns 0 after the first successful download; aborts via error() when
+# every source fails.
+download_package() {
+    local package_name="$1"
+    local package_path="$2"
+    local download_urls=(
+        "${INTERNAL_BASE_URL}/${package_name}"
+        "${OFFICIAL_BASE_URL}/${package_name}"
+    )
+
+    for url in "${download_urls[@]}"; do
+        step "尝试从 $url 下载"
+        wget -qO "$package_path" "$url" && return 0
+        # wget -O creates/truncates the destination before the transfer starts;
+        # drop the empty or partial file so a later run does not treat it as a
+        # valid local installer.
+        rm -f "$package_path"
+        step "下载失败,尝试下一个 URL"
+    done
+
+    error "无法从任何来源下载 $package_name"
+}
+
+# Call the function that matches the parsed action.
+if [[ "$ACTION" == "install" ]]; then
+    install_driver
+elif [[ "$ACTION" == "uninstall" ]]; then
+    uninstall_driver
+fi
diff --git a/scripts/nvidia-fabricmanager.sh b/scripts/nvidia-fabricmanager.sh
new file mode 100644
index 0000000..fbefd11
--- /dev/null
+++ b/scripts/nvidia-fabricmanager.sh
@@ -0,0 +1,190 @@
+#!/bin/bash
+set -euo pipefail
+
+# Global settings
+# Default version; --version <major>_<full> replaces both parts.
+FABRICMANAGER_MAJOR_VERSION="565"
+FABRICMANAGER_FULL_VERSION="565.57.01-1"
+PACKAGE_NAME="nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}_${FABRICMANAGER_FULL_VERSION}_amd64.deb"
+PACKAGE_PATH="/opt/${PACKAGE_NAME}"
+# Download sources, tried in this order: internal mirror, then NVIDIA's repository.
+INTERNAL_BASE_URL="http://10.101.0.51:5588/nvidia-fabricmanager"
+OFFICIAL_BASE_URL="https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64"
+TEMP_DIR="/tmp/fabricmanager_temp"
+# Per-run log file; the name carries a timestamp taken when the script starts.
+LOG_FILE="/var/log/fabricmanager_$(date +%Y%m%d%H%M%S).log"
+
+# ANSI color codes for console output
+GREEN='\033[1;32m'
+RED='\033[1;31m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # reset color
+
+# Append a timestamped line to $LOG_FILE. All arguments form the message.
+log() {
+    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
+}
+
+# Announce a step: print "$1" in green on the console and record it in the log.
+step() {
+    echo -e "${GREEN}==> $1${NC}"
+    log "==> $1"
+}
+
+# Report a warning: print "$1" in yellow on the console and record it in the log.
+warning() {
+    echo -e "${YELLOW}警告: $1${NC}"
+    log "警告: $1"
+}
+
+# Report a fatal error: print "$1" in red, record it in the log, exit with 1.
+error() {
+    local text
+    text="错误: $1"
+    echo -e "${RED}${text}${NC}"
+    log "$text"
+    exit 1
+}
+
+# Run the command string "$1" through eval in the current shell, with all of
+# its output appended to $LOG_FILE. A non-zero status aborts via error().
+run_cmd() {
+    step "执行: $1"
+    if ! eval "$1" &>> "$LOG_FILE"; then
+        error "命令执行失败: $1"
+    fi
+}
+
+# Abort via error() unless the command named "$1" can be found.
+check_cmd() {
+    if ! command -v "$1" &>/dev/null; then
+        error "未找到命令: $1"
+    fi
+}
+
+# Command-line parsing.
+#   --install | --uninstall     select the action (one of them is required)
+#   --version <major>_<full>    e.g. 565_565.57.01-1; rebuilds PACKAGE_NAME/PACKAGE_PATH
+#   --force                     let uninstall continue when the package is not installed
+ACTION=""
+FORCE=0
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --install) ACTION="install"; shift ;;
+        --uninstall) ACTION="uninstall"; shift ;;
+        --version)
+            # Under "set -u" a missing value would abort on "$2" with an
+            # "unbound variable" message; report a proper error instead.
+            [[ $# -ge 2 ]] || error "--version 缺少版本号"
+            CUSTOM_VERSION="$2"
+            if [[ $CUSTOM_VERSION =~ ^([0-9]+)(_[0-9]+\.[0-9]+\.[0-9]+-[0-9]+)$ ]]; then
+                FABRICMANAGER_MAJOR_VERSION="${BASH_REMATCH[1]}"
+                # The second capture group includes the leading "_"; strip it.
+                FABRICMANAGER_FULL_VERSION="${BASH_REMATCH[2]#_}"
+                PACKAGE_NAME="nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}_${FABRICMANAGER_FULL_VERSION}_amd64.deb"
+                PACKAGE_PATH="/opt/${PACKAGE_NAME}"
+            else
+                error "版本格式错误,应为 xxxx_xxxx.xx.xx-x"
+            fi
+            shift 2 ;;
+        --force) FORCE=1; shift ;;
+        *) error "未知参数: $1" ;;
+    esac
+done
+
+[[ -z "$ACTION" ]] && error "必须指定 --install 或 --uninstall"
+
+
+# Make the .deb available at $PACKAGE_PATH.
+# An existing file is used as is; otherwise the internal mirror is tried
+# first and the official repository second. Aborts via error() when every
+# source fails.
+download_package() {
+    step "检查安装包: $PACKAGE_PATH"
+    if [[ -f "$PACKAGE_PATH" ]]; then
+        step "使用本地安装包"
+        return 0
+    fi
+
+    step "本地包不存在,开始下载"
+    mkdir -p "$(dirname "$PACKAGE_PATH")"
+
+    local urls=(
+        "${INTERNAL_BASE_URL}/${PACKAGE_NAME}"
+        "${OFFICIAL_BASE_URL}/${PACKAGE_NAME}"
+    )
+
+    for url in "${urls[@]}"; do
+        step "尝试从 $url 下载"
+        if wget -q -O "$PACKAGE_PATH" "$url"; then
+            step "下载成功,文件大小: $(du -h "$PACKAGE_PATH" | cut -f1)"
+            return 0
+        else
+            # wget -O creates/truncates the destination before the transfer
+            # starts; remove the empty or partial file so the "-f" check above
+            # does not accept it as a local package on the next run.
+            rm -f "$PACKAGE_PATH"
+            warning "从 $url 下载失败"
+        fi
+    done
+
+    error "无法下载安装包,请检查网络或手动放置到 $PACKAGE_PATH"
+}
+
+
+# Install NVIDIA Fabric Manager: obtain the .deb, install it with dpkg,
+# enable and start the service, then verify that the service is active.
+# Aborts via error() when a step fails or the service is not running.
+install_fabricmanager() {
+ step "开始安装 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION}"
+
+ # Obtain the package (local file or download)
+ download_package
+
+ # Pre-install check: required commands
+ step "检查系统依赖"
+ check_cmd dpkg
+ check_cmd systemctl
+
+ # Install Fabric Manager
+ step "安装 NVIDIA Fabric Manager"
+ run_cmd "dpkg -i $PACKAGE_PATH"
+
+ # Enable the service at boot and start it now
+ step "启动并启用 NVIDIA Fabric Manager 服务"
+ run_cmd "systemctl enable nvidia-fabricmanager.service --now"
+
+ # Verify that the service is active
+ step "验证 NVIDIA Fabric Manager 服务状态"
+ if systemctl is-active --quiet nvidia-fabricmanager.service; then
+ step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} 安装成功并运行中"
+ else
+ error "NVIDIA Fabric Manager 服务未运行"
+ fi
+}
+
+
+# Uninstall NVIDIA Fabric Manager: stop and disable the service, remove the
+# package, then delete the cached .deb.
+# When the package is not installed the function aborts unless --force
+# (FORCE=1) was given.
+uninstall_fabricmanager() {
+    step "开始卸载 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION}"
+
+    # Stop the service when it is running
+    if systemctl is-active --quiet nvidia-fabricmanager.service; then
+        step "停止 NVIDIA Fabric Manager 服务"
+        run_cmd "systemctl stop nvidia-fabricmanager.service"
+    else
+        step "NVIDIA Fabric Manager 服务未运行"
+    fi
+
+    # Disable the service. Best effort: when the unit file does not exist
+    # (package not installed) "systemctl disable" fails, which would abort
+    # here and make the --force handling below unreachable.
+    step "禁用 NVIDIA Fabric Manager 服务"
+    run_cmd "systemctl disable nvidia-fabricmanager.service || true"
+
+    # Remove the package
+    step "卸载 NVIDIA Fabric Manager 软件包"
+    if dpkg -s "nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}" &>/dev/null; then
+        run_cmd "dpkg -r nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}"
+    else
+        warning "未找到 nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION} 软件包"
+        if [[ $FORCE -eq 0 ]]; then
+            error "请使用 --force 参数强制卸载"
+        fi
+    fi
+
+    # Remove the cached package file
+    step "清理残留文件"
+    rm -f "$PACKAGE_PATH"
+
+    step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} 卸载完成"
+}
+
+
+step "开始 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} ${ACTION} 流程"
+# Call the function that matches the parsed action.
+if [[ "$ACTION" == "install" ]]; then
+    install_fabricmanager
+elif [[ "$ACTION" == "uninstall" ]]; then
+    uninstall_fabricmanager
+else
+    error "未知操作: $ACTION"
+fi
+
+step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} ${ACTION} 流程完成"
\ No newline at end of file
diff --git a/scripts/system_optimize.sh b/scripts/system_optimize.sh
new file mode 100644
index 0000000..25b71ac
--- /dev/null
+++ b/scripts/system_optimize.sh
@@ -0,0 +1,124 @@
+#!/bin/bash
+# The shebang must be the very first line of the file; a blank line in front
+# of it makes the kernel ignore it.
+
+# ANSI color codes for console output
+GREEN='\033[1;32m'
+RED='\033[1;31m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # reset color
+
+# Print an informational message ("$1") in green.
+log_info() {
+    local line="${GREEN}[INFO] $1${NC}"
+    echo -e "$line"
+}
+
+# Print an error message ("$1") in red. Does not exit.
+log_error() {
+    local line="${RED}[ERROR] $1${NC}"
+    echo -e "$line"
+}
+
+# Print a warning message ("$1") in yellow.
+log_warning() {
+    local line="${YELLOW}[WARNING] $1${NC}"
+    echo -e "$line"
+}
+
+# Disable apt's periodic jobs by setting every APT::Periodic::* value in
+# 10periodic and 20auto-upgrades to "0".
+#
+# Only the quoted numeric value of APT::Periodic lines is touched. A blanket
+# "s/1/0/g" rewrites every "1" in the file (comments included) and turns a
+# value such as "10" into "00" only by accident. Missing files are reported
+# instead of letting sed fail.
+disable_apt_periodic_updates() {
+    log_info "禁用apt的定期更新..."
+    local periodic_conf
+    for periodic_conf in /etc/apt/apt.conf.d/10periodic /etc/apt/apt.conf.d/20auto-upgrades; do
+        if [[ -f "$periodic_conf" ]]; then
+            sed -i -E 's/^([[:space:]]*APT::Periodic::[^[:space:]]+[[:space:]]+)"[0-9]+";/\1"0";/' "$periodic_conf"
+        else
+            log_warning "未找到配置文件: $periodic_conf"
+        fi
+    done
+    log_info "apt定期更新已禁用"
+}
+
+# Set the system time zone to Asia/Shanghai with timedatectl.
+# The exit status of timedatectl is not checked; the success message is
+# logged either way.
+set_timezone_to_shanghai() {
+ log_info "设置系统时区为上海..."
+ # Set the system time zone to Asia/Shanghai
+ timedatectl set-timezone Asia/Shanghai
+ log_info "系统时区已设置为上海"
+}
+
+# Write the current system time to the hardware clock.
+# The exit status of hwclock is not checked; the success message is logged
+# either way.
+synchronize_hardware_clock() {
+ log_info "同步硬件时钟和系统时钟..."
+ # Copy the system time to the hardware clock
+ hwclock --systohc
+ log_info "硬件时钟和系统时钟已同步"
+}
+
+# 删除"ubuntu"用户
+#remove_ubuntu_user() {
+# log_info "删除'ubuntu'用户..."
+# # 使用userdel命令删除ubuntu用户,并递归删除其主目录
+# # &> /dev/null用于忽略可能的错误输出(例如用户不存在的情况)
+# userdel -r ubuntu &> /dev/null
+# log_info "已尝试删除'ubuntu'用户(如果存在)"
+#}
+
+# Blacklist the nouveau driver: write two modprobe.d files and rebuild the
+# initramfs. Output and exit status of update-initramfs are discarded.
+disable_nouveau_driver() {
+ log_info "禁止nouveau显卡驱动..."
+
+ # Write blacklist-nouveau.conf (overwrites an existing file)
+ cat > /etc/modprobe.d/blacklist-nouveau.conf << EOF
+blacklist nouveau
+blacklist lbm-nouveau
+options nouveau modeset=0
+alias nouveau off
+alias lbm-nouveau off
+EOF
+
+ # Write nouveau-kms.conf to turn off nouveau kernel mode setting
+ cat > /etc/modprobe.d/nouveau-kms.conf << EOF
+options nouveau modeset=0
+EOF
+
+ # Rebuild the initramfs so the blacklist applies at boot
+ update-initramfs -u &> /dev/null
+
+ log_info "nouveau显卡驱动已被禁止"
+}
+
+# Set GRUB_CMDLINE_LINUX_DEFAULT="net.ifnames=1 biosdevname=0" in
+# /etc/default/grub and regenerate the GRUB configuration.
+#
+# An active GRUB_CMDLINE_LINUX_DEFAULT line is commented out and the new line
+# is placed right after it; when the file has no active line the new one is
+# appended. Exits with 1 when the file could not be changed; returns 1 when
+# update-grub fails.
+# NOTE(review): the log message speaks of "traditional" interface naming while
+# the value written is net.ifnames=1 — confirm which one is intended.
+update_grub_configuration() {
+    log_info "更改GRUB配置启用传统网络接口命名..."
+
+    local grub_file=/etc/default/grub
+    local wanted='GRUB_CMDLINE_LINUX_DEFAULT="net.ifnames=1 biosdevname=0"'
+
+    # Only an active (uncommented) line counts as "already configured".
+    if ! grep -q "^${wanted}" "$grub_file"; then
+        if grep -q '^GRUB_CMDLINE_LINUX_DEFAULT=' "$grub_file"; then
+            # "&" is the matched line: keep it as a comment, then add the new
+            # line. Anchoring at ^ leaves already commented lines untouched.
+            sed -i "s/^GRUB_CMDLINE_LINUX_DEFAULT=.*/#&\n${wanted}/" "$grub_file"
+        else
+            echo "$wanted" >> "$grub_file"
+        fi
+    fi
+
+    # Verify that the setting is now present
+    if grep -q "^${wanted}" "$grub_file"; then
+        log_info "文件 /etc/default/grub 修改成功!"
+    else
+        log_error "文件 /etc/default/grub 修改失败!"
+        exit 1
+    fi
+
+    # Regenerate the bootloader configuration; report a failure instead of
+    # claiming success.
+    if ! update-grub &> /dev/null; then
+        log_error "update-grub 执行失败"
+        return 1
+    fi
+
+    log_info "GRUB配置已更新"
+}
+
+# Entry point: run every configuration step in a fixed order.
+main() {
+    log_info "开始系统配置..."
+
+    local task
+    for task in \
+        disable_apt_periodic_updates \
+        set_timezone_to_shanghai \
+        synchronize_hardware_clock \
+        disable_nouveau_driver \
+        update_grub_configuration; do
+        "$task"
+    done
+
+    log_info "系统配置完成!"
+}
+
+# Run the script.
+main