#!/bin/bash
#
# NVIDIA driver install/uninstall helper shipped with the GPU driver role.
#
# Usage:
#   install.sh (--install | --uninstall) [--version <driver-version>]
#
# Environment:
#   GPU_MODEL  GPU name expected in the `nvidia-smi` output (required for
#              --install; used to verify the driver sees the hardware).
#
# NOTE(review): the {{ ... }} expressions below are Jinja2 placeholders. They
# are only rendered when this file is deployed through Ansible's `template`
# module; a plain `copy` ships them verbatim — confirm against the role tasks.
set -eo pipefail

LOG_DIR="{{ log_base_dir }}/{{ driver.name }}"
LOG_FILE="${LOG_DIR}/install-$(date +%Y%m%d).log"
DEFAULT_VERSION="{{ driver.default_version }}"
DOWNLOAD_URL="{{ driver.download_url }}"
INSTALLER="/tmp/nvidia-driver.run"

mkdir -p "$LOG_DIR"
# Mirror everything (stdout + stderr) into the per-day log file.
exec > >(tee -a "$LOG_FILE") 2>&1

# ---------------------------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------------------------
OPERATION=""
VERSION=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --install)
      OPERATION="install"
      ;;
    --uninstall)
      OPERATION="uninstall"
      ;;
    --version)
      # Reject a missing value or another flag swallowed as the value.
      if [[ $# -lt 2 || -z "$2" || "$2" == --* ]]; then
        echo "错误:--version 需要指定版本号" >&2
        exit 1
      fi
      VERSION="$2"
      shift
      ;;
    *)
      echo "错误:未知参数 $1" >&2
      exit 1
      ;;
  esac
  shift
done

#######################################
# Download and silently install the driver, then verify the GPU is visible.
# Globals:   VERSION, DEFAULT_VERSION, DOWNLOAD_URL, INSTALLER, GPU_MODEL (read)
# Returns:   0 on success, non-zero when a step fails or the GPU is not found
#######################################
install() {
  local driver_version="${VERSION:-$DEFAULT_VERSION}"
  local detected

  # An empty model would make the verification below match anything.
  if [[ -z "${GPU_MODEL:-}" ]]; then
    echo "错误:未设置 GPU_MODEL,无法验证驱动安装结果" >&2
    return 1
  fi

  echo "[$(date)] 开始安装NVIDIA驱动(版本:${driver_version},型号:${GPU_MODEL})..."

  wget -q "${DOWNLOAD_URL}/${driver_version}/NVIDIA-Linux-x86_64-${driver_version}.run" \
    -O "$INSTALLER"

  sh "$INSTALLER" --silent --no-x-check --no-nouveau-check

  # Capture first, then match: `grep -q` closing a pipe early can make the
  # producer fail under `pipefail` and turn a match into a false error.
  detected=$(nvidia-smi --query-gpu=name --format=csv,noheader)
  if ! grep -qF -- "$GPU_MODEL" <<<"$detected"; then
    echo "错误:驱动安装后未识别到目标GPU型号" >&2
    return 1
  fi
}

#######################################
# Remove the driver. Safe to run repeatedly: a missing uninstaller is treated
# as "already uninstalled".
# Globals:   INSTALLER (read)
#######################################
uninstall() {
  echo "[$(date)] 开始卸载NVIDIA驱动..."
  if [[ -x /usr/bin/nvidia-uninstall ]]; then
    /usr/bin/nvidia-uninstall --silent
  else
    echo "[$(date)] 未找到 /usr/bin/nvidia-uninstall,驱动未安装,跳过卸载"
  fi
  rm -f "$INSTALLER"
}

# ---------------------------------------------------------------------------
# Main: dispatch strictly on the requested operation. The functions are called
# as plain commands so that `set -e` stays active inside them, and the script's
# exit status reflects the real outcome.
# ---------------------------------------------------------------------------
case "$OPERATION" in
  install)
    install
    ;;
  uninstall)
    uninstall
    ;;
  *)
    echo "错误:必须指定 --install 或 --uninstall" >&2
    exit 1
    ;;
esac
# GPU driver role variables, read by the role tasks and passed to the script.
# NOTE(review): Ansible auto-loads role variables from vars/main.yml (or
# vars/main.yaml); this file is added as vars/manin.yaml — confirm the name.
driver:
  name: "nvidia"
  default_version: "545.29.06"                            # used when target_version is not set
  download_url: "http://repo.internal.com/drivers/nvidia" # base URL of the installer mirror
  install_script: "install.sh"                            # must match the file shipped in the role's files/ directory
  service_check: "nvidia-smi --list-gpus"                 # post-install verification command

# Maps the playbook-level `operation` value to the script's command-line flag.
operations:
  install: "--install"
  uninstall: "--uninstall"

GPU 环境标准化部署脚本使用说明:

+ +

+ code size + ofed + NVIDIA + fabricmanager + CUDA +
+ Author +

+
+ + +### 一、脚本概述 + +该脚本旨在简化 GPU 相关应用的安装流程,适用于需要快速部署 GPU 环境的场景。 + +- **核心功能**: + ```bash + 脚本可批量完成网卡驱动、显卡驱动、fabricmanager互联管理器、CUDA 工具包、Nvidia-dcgm、DCGM-EXporter、Node-EXporter 核心组件的安装与卸载操作 + ``` +- **配置说明**: + ```bash + 用户管理:若需删除 ubuntu 用户,需手动执行相关用户删除命令,并妥善处理该用户关联的数据与权限。​ + 磁盘管理:磁盘分区扩容需通过磁盘管理工具,根据实际需求对磁盘进行分区调整与扩容操作,以满足应用存储需求。​ + 网络配置:网卡重命名需手动修改网络配置文件,根据实际网络环境对网卡名称进行重新定义,确保网络连接正常。 + ``` +- **使用建议**: + ```bash + 新系统推荐使用一键自动安装脚本,可快速、全面地完成 GPU 相关应用的部署,具体使用方法详见文章末尾说明。​若系统之前已存在相关安装内容,或需要对各组件进行独立、定制化部署,建议使用单独部署脚本安装。 + ``` +### 二、使用说明 + +#### (1)系统初始化 + + ```bash + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/system_optimize.sh|bash + ``` + +#### (2)MLNX_OFED 网络套件安装/卸载 + + ```bash + #支持版本[23.10-1.1.9.0] + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --install --version '23.10-1.1.9.0' + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --uninstall --version '23.10-1.1.9.0' + ``` + + +#### (3)Nvidia 显卡驱动安装/卸载 + + ```bash + #支持版本[565.57.01] [570.124.06] + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --install --version '565.57.01' + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --uninstall --version '565.57.01' + + + ``` + +#### (4)GPU 互联管理器安装/卸载 + + ```bash + #支持版本[565_565.57.01-1] [570_570.124.06-1] + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --install --version '565_565.57.01-1' + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --uninstall --version '565_565.57.01-1' + ``` + + +#### (5)NVIDIA CUDA 工具包部署/卸载 + + ```bash + #支持版本[12.6.3_560.35.05] [12.8.1_570.124.06] + cd /opt/ && wget 
-qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --install --version '12.6.3_560.35.05' + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --uninstall --version '12.6.3_560.35.05' + ``` + +#### (6)dcgm/node exporter 部署/卸载 + + ```bash + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-dcgm.sh | bash -s -- --install + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/dcgm-exporter.sh | bash -s -- --install + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/node-exporter.sh | bash -s -- --install + + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-dcgm.sh | bash -s -- --uninstall + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/dcgm-exporter.sh | bash -s -- --uninstall + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/node-exporter.sh | bash -s -- --uninstall + ``` + +#### (7)批量组件安装/卸载 + +![Static Badge](https://img.shields.io/badge/组件[1]-orange?style=flat-square) +![Static Badge](https://img.shields.io/badge/mlnx_ofed-23.10.1.1.9.0-brightgreen?style=plastic) +![Static Badge](https://img.shields.io/badge/nvidia_drive-565.57.01-brightgreen?style=plastic) +![Static Badge](https://img.shields.io/badge/cuda-12.6.3.560.35.05-brightgreen?style=plastic) +![Static Badge](https://img.shields.io/badge/fabricmanager-565_565.57.01.1-brightgreen?style=plastic) + ```bash + 安装:--------------------------------------------------------------------------------------------------------------------------------------------- + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/system_optimize.sh|bash + cd /opt/ && wget -qO- 
http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --install --version '23.10-1.1.9.0' + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --install --version '565.57.01' + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --install --version '12.6.3_560.35.05' + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --install --version '565_565.57.01-1' + + 卸载:--------------------------------------------------------------------------------------------------------------------------------------------- + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh |bash -s -- --uninstall --version '23.10-1.1.9.0' + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --uninstall --version '565.57.01' + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --uninstall --version '12.6.3_560.35.05' + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --uninstall --version '565_565.57.01-1' + ``` + +![Static Badge](https://img.shields.io/badge/组件[2]-orange?style=flat-square) +![Static Badge](https://img.shields.io/badge/mlnx_ofed-23.10.1.1.9.0-brightgreen?style=plastic) +![Static Badge](https://img.shields.io/badge/nvidia_drive-570.124.06-brightgreen?style=plastic) +![Static Badge](https://img.shields.io/badge/cuda-12.8.1.570.124.06-brightgreen?style=plastic) +![Static Badge](https://img.shields.io/badge/fabricmanager-570.124.06.1-brightgreen?style=plastic) + ```bash + 
安装:--------------------------------------------------------------------------------------------------------------------------------------------- + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/system_optimize.sh|bash + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --install --version '23.10-1.1.9.0' + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --install --version '570.124.06' + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --install --version '12.8.1_570.124.06' + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --install --version '570_570.124.06-1' + + + 卸载:-------------------------------------------------------------------------------------------------------------------------------------------- + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/ib-drive.sh|bash -s -- --uninstall --version '23.10-1.1.9.0' + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-driver.sh | bash -s -- --uninstall --version '570.124.06' + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/cuda.sh | bash -s -- --uninstall --version '12.8.1_570.124.06' + cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/nvidia-fabricmanager.sh|bash -s -- --uninstall --version '570_570.124.06-1' + + ``` +![Static Badge](https://img.shields.io/badge/推荐一键安装脚本-orange?style=flat-square) +```bash +#安装/卸载服务(安装或卸载时间较长,建议放后台执行。): +#组合[1]----------------------------------------------------------------------------------------------------------------------------------- +screen -dmS install_script bash -c "cd /opt/ 
&& wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/gpu-manager.sh|bash -s -- --install --version 1 --include=exporter > /opt/gpu-manager.log 2>&1"; +tail -f /opt/gpu-manager.log + +screen -dmS uninstall_script bash -c "cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/gpu-manager.sh|bash -s -- --uninstall --version 1 --include=exporter > /opt/gpu-manager.log 2>&1"; +tail -f /opt/gpu-manager.log + +#组合[2]----------------------------------------------------------------------------------------------------------------------------------- +screen -dmS install_script bash -c "cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/gpu-manager.sh|bash -s -- --install --version 2 --include=exporter > /opt/gpu-manager.log 2>&1"; +tail -f /opt/gpu-manager.log + +screen -dmS uninstall_script bash -c "cd /opt/ && wget -qO- http://10.101.0.51:3000/yindun/ansible-devops/raw/branch/main/scripts/gpu-manager.sh|bash -s -- --uninstall --version 2 --include=exporter > /opt/gpu-manager.log 2>&1"; +tail -f /opt/gpu-manager.log + +#说明: +#version 1 表示安装/卸载七.[1]组件版本:mlnx_ofed-23.10.1.1.9.0+nvidia_drive-565.57.01+cuda-12.6.3.560.35.05+fabricmanager-565_565.57.01.1 +#version 2 表示安装/卸载七.[2]组件版本:mlnx_ofed-23.10.1.1.9.0+nvidia_drive-570.124.06+cuda-12.8.1.570.124.06+fabricmanager-570.124.06.1 +#--include=exporter 指定该参数,脚本将安装/卸载exporter组件中的相关服务[dcgm-exporter,node-exporter,nvidia-dcgm],默认不安装/卸载。 + +``` + diff --git a/scripts/cuda.sh b/scripts/cuda.sh new file mode 100644 index 0000000..ea411b1 --- /dev/null +++ b/scripts/cuda.sh @@ -0,0 +1,275 @@ +#!/bin/bash +set -euo pipefail + +# 全局变量 +CUDA_VERSION="12.6" +DRIVER_VERSION="560.35.05" +PACKAGE_NAME="cuda_${CUDA_VERSION}_${DRIVER_VERSION}_linux.run" +PACKAGE_PATH="/opt/${PACKAGE_NAME}" +INTERNAL_BASE_URL="http://10.101.0.51:5588/cuda-linux" 
#######################################
# Append a timestamped line to the log file.
# Globals:   LOG_FILE (read)
# Arguments: message words, joined with spaces
#######################################
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo "[${stamp}] $*" >> "$LOG_FILE"
}

#######################################
# Announce a step on the console (green) and record it in the log.
# Globals:   GREEN, NC (read)
# Arguments: $1 - step description
#######################################
step() {
  local line="==> $1"
  echo -e "${GREEN}${line}${NC}"
  log "$line"
}

#######################################
# Print a warning on the console (yellow) and record it in the log.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - warning text
#######################################
warning() {
  local line="警告: $1"
  echo -e "${YELLOW}${line}${NC}"
  log "$line"
}

#######################################
# Print an error on the console (red), record it, and terminate with status 1.
# Globals:   RED, NC (read)
# Arguments: $1 - error text
#######################################
error() {
  local line="错误: $1"
  echo -e "${RED}${line}${NC}"
  log "$line"
  exit 1
}

#######################################
# Evaluate a command string, sending its stdout/stderr to the log file.
# Terminates the script through error() when the command fails.
# Globals:   LOG_FILE (read)
# Arguments: $1 - command line to evaluate
#######################################
run_cmd() {
  step "执行: $1"
  if eval "$1" &>> "$LOG_FILE"; then
    return 0
  fi
  error "命令执行失败: $1"
}

#######################################
# Terminate through error() unless the given command is available.
# Arguments: $1 - command name
#######################################
check_cmd() {
  if ! command -v "$1" &>/dev/null; then
    error "未找到命令: $1"
  fi
}
#######################################
# Make sure the CUDA installer exists at $PACKAGE_PATH, downloading it from
# the internal mirror first and the official site second.
#
# The download goes to "$PACKAGE_PATH.part" and is renamed only after wget
# succeeded and produced a non-empty file. wget -O creates its target even
# when the transfer fails, so writing straight to $PACKAGE_PATH would leave a
# truncated file that the next run mistakes for a valid local package.
#
# Globals:   PACKAGE_PATH, PACKAGE_NAME, INTERNAL_BASE_URL,
#            OFFICIAL_BASE_URL (read)
# Returns:   0 when the package is available; otherwise calls error()
#######################################
download_package() {
  step "检查安装包: $PACKAGE_PATH"
  # -s (not -f): a zero-byte leftover from an earlier failed download is not
  # a usable package.
  if [[ -s "$PACKAGE_PATH" ]]; then
    step "使用本地安装包"
    return 0
  fi

  step "本地包不存在,开始下载"
  mkdir -p "$(dirname "$PACKAGE_PATH")"

  local partial="${PACKAGE_PATH}.part"
  local url
  local urls=(
    "${INTERNAL_BASE_URL}/${PACKAGE_NAME}"
    "${OFFICIAL_BASE_URL}/${PACKAGE_NAME}"
  )

  for url in "${urls[@]}"; do
    step "尝试从 $url 下载"
    if wget -q -O "$partial" "$url" && [[ -s "$partial" ]]; then
      mv -f -- "$partial" "$PACKAGE_PATH"
      step "下载成功,文件大小: $(du -h "$PACKAGE_PATH" | cut -f1)"
      return 0
    fi
    rm -f -- "$partial"
    warning "从 $url 下载失败"
  done

  error "无法下载安装包,请检查网络或手动放置到 $PACKAGE_PATH"
}
grep -q "cuda-${CUDA_MAJOR_MINOR}" "$ENV_PROFILE"; then + cat >> "$ENV_PROFILE" << EOF + +# CUDA ${CUDA_VERSION} +export PATH=${CUDA_INSTALL_DIR}/bin:\$PATH +export LD_LIBRARY_PATH=${CUDA_INSTALL_DIR}/lib64:\$LD_LIBRARY_PATH +EOF + step "已添加环境变量到 $ENV_PROFILE" + else + step "环境变量已存在,跳过添加" + fi + + # 生效环境变量 + export LC_BYOBU=0 && source "/etc/profile" #临时解决 + + # 验证安装 + step "验证 CUDA 安装" + if command -v nvcc &>/dev/null; then + nvcc_version=$(nvcc -V | grep release | awk '{print $5}' | tr -d ',') + if [[ "$nvcc_version" == *"${CUDA_MAJOR_MINOR}"* ]]; then + step "CUDA ${CUDA_VERSION} 安装成功" + else + error "CUDA 版本不匹配,期望 ${CUDA_VERSION},实际 $nvcc_version" + fi + else + error "nvcc 命令未找到,安装失败" + fi +} + + +# 卸载 CUDA +uninstall_cuda() { + step "开始卸载 CUDA ${CUDA_VERSION}" + + # 创建临时目录 + mkdir -p "$TEMP_DIR" + trap 'rm -rf "$TEMP_DIR"' EXIT + + # 检查官方卸载脚本 + OFFICIAL_UNINSTALLER="${CUDA_INSTALL_DIR}/bin/cuda-uninstaller" + if [[ -x "$OFFICIAL_UNINSTALLER" ]]; then + step "找到官方卸载脚本: $OFFICIAL_UNINSTALLER" + + # 执行官方卸载脚本 + step "执行官方卸载程序" + run_cmd "$OFFICIAL_UNINSTALLER --silent" + else + warning "未找到官方卸载脚本,尝试其他方法" + + # 检查安装包是否存在 + if [[ -f "$PACKAGE_PATH" ]]; then + step "找到安装包: $PACKAGE_PATH" + else + if [[ $FORCE -eq 1 ]]; then + warning "未找到安装包,继续强制卸载" + else + step "未找到安装包,开始下载" + download_package + fi + fi + + # 检查安装包是否支持 --uninstall 参数 + step "检查安装包是否支持 --uninstall 参数" + sh "$PACKAGE_PATH" --help > "$TEMP_DIR/help.txt" 2>&1 + + if grep -q -- '--uninstall' "$TEMP_DIR/help.txt"; then + step "安装包支持 --uninstall 参数" + run_cmd "sh $PACKAGE_PATH --silent --uninstall" + else + step "安装包不支持 --uninstall 参数,尝试解压查找卸载脚本" + + # 解压安装包 + step "解压安装包到 $TEMP_DIR" + run_cmd "sh $PACKAGE_PATH --extract=$TEMP_DIR" + + # 查找卸载脚本 + UNINSTALL_SCRIPT=$(find "$TEMP_DIR" -name "uninstall_cuda*" -type f | head -n1) + + if [[ -n "$UNINSTALL_SCRIPT" ]]; then + step "找到卸载脚本: $UNINSTALL_SCRIPT" + run_cmd "sh $UNINSTALL_SCRIPT" + else + warning "未找到卸载脚本,继续手动清理" + fi + fi + fi + + # 清理环境变量 + step 
#######################################
# Run a command string under a time limit, logging its output.
#
# Globals:   LOG_FILE (read)
# Arguments: $1 - command line, executed with `bash -c`
#            $2 - message used when the command fails (default: 命令执行失败)
#            $3 - time limit passed to timeout(1) (default: 30)
# Returns:   0 on success; on failure calls error(), which terminates the
#            script, reporting either a timeout or the real exit code
#######################################
run() {
  local cmd="$1"
  local error_msg="${2:-命令执行失败}"
  local limit="${3:-30}"
  local exit_code=0

  log "→ 执行: $cmd"

  # Capture the status with `|| exit_code=$?`. Reading $? inside
  # `if ! cmd; then` yields the status of the negation, which is always 0.
  timeout "$limit" bash -c "$cmd" &>> "$LOG_FILE" || exit_code=$?

  if [[ $exit_code -eq 0 ]]; then
    return 0
  fi

  # timeout(1) exits with 124 when it had to kill the command.
  if [[ $exit_code -eq 124 ]]; then
    error "命令超时 (${limit}秒): $cmd"
  else
    error "$error_msg (退出码: $exit_code)"
  fi
}
+ + # 停止服务 + if systemctl is-active --quiet dcgm-exporter; then + run "systemctl stop dcgm-exporter" "停止现有服务失败" + fi + + # 禁用服务 + if systemctl is-enabled --quiet dcgm-exporter; then + run "systemctl disable dcgm-exporter" "禁用现有服务失败" + fi + + # 删除文件 + run "rm -rf $DCGM_EXPORTER_DIR" "删除旧安装目录失败" + run "rm -f $SERVICE_FILE" "删除服务文件失败" + + # 重新加载systemd + run "systemctl daemon-reload" "重新加载systemd失败" + + success "旧版本卸载完成" + else + success "未发现旧版本,继续安装..." + fi +} + +# 安装DCGM Exporter +install_exporter() { + log "安装DCGM Exporter (版本: $DCGM_EXPORTER_VERSION)..." + + # 加载Go环境变量 + log "→ 加载Go环境变量" + if [[ -f "/etc/profile.d/go.sh" ]]; then + if ! source "/etc/profile.d/go.sh"; then + error "加载Go环境变量失败" + fi + else + error "未找到Go环境变量配置文件" + fi + + run "mkdir -p $DCGM_EXPORTER_DIR" "创建工作目录失败" + + # 下载源码(使用固定URL) + DCGM_PACKAGE="4.2.0-4.1.0.tar.gz" + DL_URL="https://github.com/NVIDIA/dcgm-exporter/archive/refs/tags/$DCGM_PACKAGE" + TMP_PACKAGE="/tmp/$DCGM_PACKAGE" + + if [[ ! -f "$TMP_PACKAGE" ]]; then + run "wget -qO $TMP_PACKAGE $DL_URL" "下载DCGM Exporter源码失败" + fi + + # 解压 + run "tar xf $TMP_PACKAGE -C /tmp" "解压DCGM Exporter源码失败" + SOURCE_DIR="/tmp/dcgm-exporter-$DCGM_EXPORTER_VERSION" + + # 编译安装 + log "→ 编译DCGM Exporter" + if ! cd "$SOURCE_DIR" &>> "$LOG_FILE"; then + error "进入源码目录失败" + fi + + # 分步骤执行make,便于调试 + if ! make binary &>> "$LOG_FILE"; then + error "编译DCGM Exporter失败" + fi + + if ! 
#######################################
# Parse the command line.
#
# Globals:   ACTION, DRIVER_VERSION, FORCE (written)
# Arguments: --install | --uninstall   operation to perform (required)
#            --version <ver>           driver version; regenerates the
#                                      package name/paths for that version
#            --force                   sets FORCE=1
# Returns:   0 on success; calls log_error (which exits) on bad input
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install)
        ACTION="install"
        shift
        ;;
      --uninstall)
        ACTION="uninstall"
        shift
        ;;
      --version)
        # A missing value, or the next flag taken as the value
        # (e.g. `--version --force`), is an error. ${2:-} keeps this safe
        # when the script is run with `set -u`.
        if [[ -z "${2:-}" || "${2:-}" == -* ]]; then
          log_error "请指定版本号,如: --version 5.8-6.0.4.2"
        fi
        DRIVER_VERSION="$2"
        generate_package_info
        shift 2
        ;;
      --force)
        FORCE=1
        shift
        ;;
      *)
        log_error "未知参数: $1"
        ;;
    esac
  done

  if [[ -z "$ACTION" ]]; then
    log_error "请指定操作: --install 或 --uninstall"
  fi
}
#######################################
# Uninstall MLNX_OFED and undo the naming-rule / modprobe changes.
#
# If the unpacked driver directory is missing, the package is downloaded and
# unpacked again so that its bundled uninstall.sh can be used.
#
# Globals:   DRIVER_VERSION, DRIVER_DIR, PACKAGE_PATH (read)
# Outputs:   command output is appended to /tmp/mlnx_install.log
# Returns:   0 on success; calls log_error (which exits) when the driver
#            directory cannot be obtained
#######################################
uninstall_driver() {
  local log_file="/tmp/mlnx_install.log"
  local rules_file

  log_info "开始卸载驱动: $DRIVER_VERSION"

  if [[ -d "$DRIVER_DIR" ]]; then
    log_info "找到驱动目录: $DRIVER_DIR"
  else
    log_warning "驱动目录不存在,尝试重新下载和解压"
    download_driver
    log_info "解压驱动包"
    # Report the failure explicitly instead of letting `set -e` abort silently.
    tar -zxf "$PACKAGE_PATH" -C /opt/ \
      || log_error "解压失败,无法找到驱动目录: $DRIVER_DIR"
    if [[ ! -d "$DRIVER_DIR" ]]; then
      log_error "解压失败,无法找到驱动目录: $DRIVER_DIR"
    fi
    log_info "成功解压驱动包到: $DRIVER_DIR"
  fi

  log_info "执行卸载脚本"
  # Subshell: the caller's working directory must not end up inside the
  # directory that is removed right below.
  ( cd "$DRIVER_DIR" && ./uninstall.sh -q -y ) &>> "$log_file" \
    || log_warning "卸载脚本执行失败,尝试手动清理"

  log_info "清理残留文件"
  # :? aborts instead of expanding to an empty path.
  rm -rf -- "${DRIVER_DIR:?}" "${PACKAGE_PATH:?}"

  log_info "停止并禁用openibd服务"
  systemctl stop openibd.service &>> "$log_file" || true
  systemctl disable openibd.service &>> "$log_file" || true

  log_info "恢复网卡命名规则"
  # Keep only comment lines. A rules file that does not exist needs no
  # cleanup and must not abort the remaining steps under `set -e`.
  for rules_file in \
    /etc/udev/rules.d/70-persistent-ipoib.rules \
    /etc/udev/rules.d/70-persistent-net.rules; do
    if [[ -f "$rules_file" ]]; then
      sed -i '/^\s*#/!d' "$rules_file"
    fi
  done

  rm -f /etc/modprobe.d/nvidia-gsp.conf
  update-initramfs -u &>> "$log_file"
}
#######################################
# Report whether the uninstall left anything behind.
# Checks that the ibv_devinfo command is no longer resolvable and that the
# unpacked driver directory is gone; findings are only logged.
# Globals:   DRIVER_DIR (read)
#######################################
check_uninstallation() {
  log_info "检查卸载结果"

  if command -v ibv_devinfo > /dev/null 2>&1; then
    log_warning "驱动命令仍存在,可能需要手动清理"
  else
    log_info "驱动已成功卸载"
  fi

  if [[ -d "$DRIVER_DIR" ]]; then
    log_warning "驱动目录未完全删除: $DRIVER_DIR"
  else
    log_info "驱动目录已删除"
  fi
}
+} + +# 执行主函数 +main "$@" diff --git a/scripts/linux_security_hardening.sh b/scripts/linux_security_hardening.sh new file mode 100644 index 0000000..e35f10b --- /dev/null +++ b/scripts/linux_security_hardening.sh @@ -0,0 +1,180 @@ + +#!/bin/bash +############################################################################################################################## +#脚本功能: +#1.口令定期更换策略设置个90天,最小密码长度为8位,密码过期警告提前7天。 +#2.口令复杂度设置:密码长度至少为12位,包含至少四种字符类型(大写字母、小写字母、数字、特殊字符)。 +#3.登录失败处理策略设置:登录失败次数为5次,锁定时间为10分钟。 +#4.登录连接超时默认配置设置:登录连接超时时间为10分钟。 +#5.日志本地保存时间设置为6个月。 +#6.禁止root ssh远程登录 +#7.启动日志与审计服务rsyslog和auditd +#8.sshd开启PAM认证 +#9.安装系统工具 +############################################################################################################################## +# 定义新的配置参数 +LOGIN_DEFS_POLICY_MAX_DAYS="PASS_MAX_DAYS 90" +LOGIN_DEFS_POLICY_MIN_DAYS="PASS_MIN_DAYS 0" +LOGIN_DEFS_POLICY_MIN_LEN="PASS_MIN_LEN 8" +LOGIN_DEFS_POLICY_WARN_AGE="PASS_WARN_AGE 7" +# 编辑/etc/login.defs配置文件 +echo "正在编辑 /etc/login.defs 文件..." 
# Check each setting and replace it, or add it when it is missing.

#######################################
# Make LINE the active setting for KEY in FILE.
# Arguments:
#   $1 - FILE: configuration file to edit in place
#   $2 - KEY:  directive name; must not contain regex metacharacters or '/'
#   $3 - LINE: complete replacement line; must not contain '/', '&' or '\'
#   $4 - MODE (optional, default "full"):
#          full    - replace active line(s); else turn the first commented
#                    "#KEY" / "# KEY" line into LINE; else append LINE
#          append  - replace active line(s); else append LINE
#          replace - replace active line(s) only, never add
# Returns:
#   0 on success, 1 when FILE is missing or LINE is empty.
#######################################
ensure_setting() {
  local file=$1 key=$2 line=$3 mode=${4:-full}
  local active_re="^[[:space:]]*${key}([[:space:]=].*)?\$"
  local commented_re="^#[[:space:]]*${key}([[:space:]=].*)?\$"

  if [[ -z "$line" ]]; then
    # An empty replacement would blank the directive instead of setting it.
    echo "警告: ${key} 的目标配置为空,跳过 ${file}" >&2
    return 1
  fi
  if [[ ! -f "$file" ]]; then
    echo "警告: 未找到配置文件 ${file},跳过 ${key}" >&2
    return 1
  fi

  if grep -Eq "$active_re" "$file"; then
    sed -i -E "s/${active_re}/${line}/" "$file"
    return
  fi

  if [[ "$mode" == full ]] && grep -Eq "$commented_re" "$file"; then
    # GNU sed "0,/re/" limits the substitution to the first commented
    # occurrence so the setting is not duplicated.
    sed -i -E "0,/${commented_re}/s//${line}/" "$file"
    return
  fi

  [[ "$mode" == replace ]] && return 0

  # Keep the appended directive on its own line even when the file does not
  # end with a newline.
  if [[ -s "$file" && -n "$(tail -c1 "$file")" ]]; then
    printf '\n' >> "$file"
  fi
  printf '%s\n' "$line" >> "$file"
}

#######################################
# Make sure the existing "password requisite pam_pwquality.so" rule in a PAM
# file carries try_first_pass and retry=5. Does nothing when the rule is
# absent.
# Arguments:
#   $1 - PAM service file to edit in place
#######################################
enforce_pwquality_pam_options() {
  local pam_file=$1

  if [[ ! -f "$pam_file" ]]; then
    echo "警告: 未找到 PAM 配置文件 ${pam_file},跳过" >&2
    return 1
  fi
  grep -Eq "^\s*password\s+requisite\s+pam_pwquality.so\s+" "$pam_file" || return 0

  # 1) add try_first_pass if missing, 2) add retry=5 if no retry= option is
  # present, 3) normalise any existing retry=<n> to retry=5.
  sed -ri '/^\s*password\s+requisite\s+pam_pwquality.so\s+/ { /^\s*password\s+requisite\s+pam_pwquality.so(\s+\S+)*(\s+try_first_pass)(\s+.*)?$/! s/^(\s*password\s+requisite\s+pam_pwquality.so\s+)(.*)$/\1try_first_pass \2/ }' "$pam_file" \
    && sed -ri '/^\s*password\s+requisite\s+pam_pwquality.so\s+/ { /^\s*password\s+requisite\s+pam_pwquality.so(\s+\S+)*(\s+retry=[0-9]+)(\s+.*)?$/! s/^(\s*password\s+requisite\s+pam_pwquality.so\s+)(.*)$/\1retry=5 \2/ }' "$pam_file" \
    && sed -ri 's/(^\s*password\s+requisite\s+pam_pwquality.so(\s+\S+)*\s+)retry=[0-9]+(\s+.*)?$/\1retry=5\3/' "$pam_file"
}

#######################################
# Insert the pam_faillock preauth/authfail rules (deny=5, unlock_time=600)
# before the pam_env.so auth line, unless the authfail rule already exists.
# Arguments:
#   $1 - PAM service file to edit in place
#######################################
add_faillock_rules() {
  local pam_file=$1

  if [[ ! -f "$pam_file" ]]; then
    echo "警告: 未找到 PAM 配置文件 ${pam_file},跳过" >&2
    return 1
  fi

  # NOTE(review): both rules land ahead of pam_env.so, i.e. before the
  # password module. pam_faillock(8) shows authfail *after* pam_unix.so;
  # confirm this ordering does not count successful logins as failures.
  if ! grep -Pq "^auth\s*required\s*pam_faillock.so\s*authfail\s*even_deny_root\s*deny=5\s*unlock_time=600\s*$" "$pam_file"; then
    sed -ri "/^auth.*pam_env.so$/i auth required pam_faillock.so preauth silent even_deny_root deny=5 unlock_time=600\nauth required pam_faillock.so authfail even_deny_root deny=5 unlock_time=600" "$pam_file"
  fi
}

# /etc/login.defs password ageing. PASS_MIN_LEN is only replaced, never
# added: some shadow-utils builds report it as an unknown item.
ensure_setting /etc/login.defs PASS_MAX_DAYS "${LOGIN_DEFS_POLICY_MAX_DAYS}" append
ensure_setting /etc/login.defs PASS_MIN_DAYS "${LOGIN_DEFS_POLICY_MIN_DAYS}" append
ensure_setting /etc/login.defs PASS_MIN_LEN "${LOGIN_DEFS_POLICY_MIN_LEN}" replace
ensure_setting /etc/login.defs PASS_WARN_AGE "${LOGIN_DEFS_POLICY_WARN_AGE}" append

# Password complexity in /etc/security/pwquality.conf
PWQUALITY_POLICY_MINLEN="minlen = 12"
PWQUALITY_POLICY_MINCLASS="minclass = 4"
PWQUALITY_POLICY_DCREDIT="dcredit = -1"
PWQUALITY_POLICY_UCREDIT="ucredit = -1"
PWQUALITY_POLICY_LCREDIT="lcredit = -1"
PWQUALITY_POLICY_OCREDIT="ocredit = -1"
PWQUALITY_POLICY_FOR_ROOT="enforce_for_root"
PWQUALITY_POLICY_DIFOK="difok = 5"

echo "正在编辑 /etc/security/pwquality.conf 文件配置文件口令复杂度"
ensure_setting /etc/security/pwquality.conf minlen "${PWQUALITY_POLICY_MINLEN}"
ensure_setting /etc/security/pwquality.conf minclass "${PWQUALITY_POLICY_MINCLASS}"
ensure_setting /etc/security/pwquality.conf dcredit "${PWQUALITY_POLICY_DCREDIT}"
ensure_setting /etc/security/pwquality.conf ucredit "${PWQUALITY_POLICY_UCREDIT}"
ensure_setting /etc/security/pwquality.conf lcredit "${PWQUALITY_POLICY_LCREDIT}"
ensure_setting /etc/security/pwquality.conf ocredit "${PWQUALITY_POLICY_OCREDIT}"
ensure_setting /etc/security/pwquality.conf enforce_for_root "${PWQUALITY_POLICY_FOR_ROOT}"
ensure_setting /etc/security/pwquality.conf difok "${PWQUALITY_POLICY_DIFOK}"

# Update the pam_pwquality rule in `system-auth` and `password-auth`
enforce_pwquality_pam_options /etc/pam.d/system-auth
enforce_pwquality_pam_options /etc/pam.d/password-auth

# Lock-out policy after failed password authentication
echo "正在编辑 /etc/pam.d/password-auth 文件配置密码验证失败处理策略"
add_faillock_rules /etc/pam.d/system-auth
add_faillock_rules /etc/pam.d/password-auth

# Idle terminal sessions are logged out after 600 s, for all users
echo "正在编辑 /etc/profile 文件配置终端超时自动登出设置要求针对所有用户,自动登退时间为600s"
ensure_setting /etc/profile "export TMOUT" "export TMOUT=600" append

# Keep local logs for six months (26 rotations; assumes weekly rotation)
echo "正在编辑 /etc/logrotate.conf 文件设置日志本地保存时间6个月"
if [[ ! -f /etc/logrotate.conf ]]; then
  echo "警告: 未找到配置文件 /etc/logrotate.conf,跳过 rotate" >&2
elif grep -Eq '^rotate[[:space:]]+[0-9]+' /etc/logrotate.conf; then
  # Only rewrite real "rotate <n>" directives. An unanchored match would
  # also hit "include /etc/logrotate.d" and comments containing "rotate".
  sed -i -E 's/^([[:space:]]*)rotate[[:space:]]+[0-9]+[[:space:]]*$/\1rotate 26/' /etc/logrotate.conf
fi

# Forbid root login over ssh
echo "正在编辑 /etc/ssh/sshd_config 文件禁止root ssh远程登录"
ensure_setting /etc/ssh/sshd_config PermitRootLogin "PermitRootLogin no" append
ensure_setting /etc/ssh/sshd_config PubkeyAuthentication "PubkeyAuthentication yes"
# Disable ssh DNS lookups
ensure_setting /etc/ssh/sshd_config UseDNS "UseDNS no"
# Enable PAM authentication for ssh
ensure_setting /etc/ssh/sshd_config UsePAM "UsePAM yes"

# Restart sshd so the configuration takes effect
echo "正在重启sshd服务....."
+if grep -Pq '^PubkeyAuthentication yes' /etc/ssh/sshd_config;then + systemctl restart sshd +fi +sleep 2s +# 开启rsyslog服务,开启auditd服务 +echo "正在启动rsyslog和auditd服务" +systemctl restart rsyslog.service +systemctl start rsyslog.service && systemctl enable rsyslog.service +sleep 2s +systemctl start auditd.service && systemctl enable auditd.service +echo "请自行修改操作系统默认密码。并做好密码保存。" +echo "已禁止root ssh远程登录,请使用scloudadmin账号登录,如无法登录请通过ipmi远程控制登录" + +#9.安装系统工具 +echo "安装sysstat ipmitool vim pciutils net-tools工具包" +dnf -y install sysstat.x86_64 +dnf -y install ipmitool.x86_64 +dnf -y install vim +dnf -y install pciutils.x86_64 +dnf -y install net-tools.x86_64 +echo "所有操作已完成。" diff --git a/scripts/node-exporter.sh b/scripts/node-exporter.sh new file mode 100644 index 0000000..db719bb --- /dev/null +++ b/scripts/node-exporter.sh @@ -0,0 +1,177 @@ +#!/bin/bash +set -euo pipefail + +# 全局变量 +LOG_FILE="/var/log/node_exporter_$(date +%Y%m%d%H%M%S).log" +NODE_EXPORTER_VERSION="1.8.2" +PRIMARY_DOWNLOAD_URL="http://10.101.0.51:5588/node-exporter/node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64.tar.gz" +BACKUP_DOWNLOAD_URL="https://github.com/prometheus/node_exporter/releases/download/v${NODE_EXPORTER_VERSION}/node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64.tar.gz" +LOCAL_PACKAGE_PATH="/opt/node_exporter.tar.gz" + +# 颜色定义 +GREEN='\033[1;32m' +RED='\033[1;31m' +NC='\033[0m' # 重置颜色 + +# 日志函数 - 记录所有操作到日志 +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$LOG_FILE" +} + +# 步骤提示 - 绿色输出到控制台并记录日志 +step() { + local msg="==> $1" + echo -e "${GREEN}$msg${NC}" + log "$msg" +} + +# 错误提示 - 红色输出到控制台并记录日志 +error() { + local msg="错误: $1" + echo -e "${RED}$msg${NC}" + log "$msg" + exit 1 +} + +# 执行命令并屏蔽输出 +run_cmd() { + step "执行: $1" + eval "$1" &>> "$LOG_FILE" || { + error "命令执行失败: $1" + } +} + +# 测试网络连通性 +test_network_connectivity() { + local url=$1 + step "测试网络连通性: $url" + if curl -fsSLI --connect-timeout 10 "$url" &>> "$LOG_FILE"; then + return 0 + else + return 1 + fi +} + +# 下载文件 
+download_file() { + local url=$1 + local dest=$2 + step "下载文件: $url 到 $dest" + if wget -qO "$dest" "$url" &>> "$LOG_FILE"; then + return 0 + else + return 1 + fi +} + +# 安装 node_exporter +install_node_exporter() { + step "开始安装 node_exporter ${NODE_EXPORTER_VERSION}" + + # 切换到 /opt 目录 + run_cmd "cd /opt" + + # 检查本地是否存在安装包 + if [[ -f "$LOCAL_PACKAGE_PATH" ]]; then + step "发现本地安装包: $LOCAL_PACKAGE_PATH" + DOWNLOAD_URL="$LOCAL_PACKAGE_PATH" + else + # 测试主要下载地址的连通性 + if test_network_connectivity "$PRIMARY_DOWNLOAD_URL"; then + DOWNLOAD_URL="$PRIMARY_DOWNLOAD_URL" + elif test_network_connectivity "$BACKUP_DOWNLOAD_URL"; then + DOWNLOAD_URL="$BACKUP_DOWNLOAD_URL" + else + error "无法连接到任何下载地址" + fi + + # 下载 node_exporter + download_file "$DOWNLOAD_URL" "node_exporter.tar.gz" + fi + + # 解压 tar 包 + run_cmd "tar -zxvf node_exporter.tar.gz" + + # 移动文件夹 + run_cmd "mv node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64/ ./node_exporter" + + # 进入 node_exporter 目录 + run_cmd "cd node_exporter/" + + # 创建 bin 目录并移动二进制文件 + run_cmd "mkdir bin" + run_cmd "mv node_exporter bin/" + + # 配置 systemd 服务 + cat > /lib/systemd/system/node_exporter.service <> "$LOG_FILE" +} + +# 步骤提示 - 绿色输出到控制台并记录日志 +step() { + local msg="==> $1" + echo -e "${GREEN}$msg${NC}" + log "$msg" +} + +# 错误提示 - 红色输出到控制台并记录日志 +error() { + local msg="错误: $1" + echo -e "${RED}$msg${NC}" + log "$msg" + exit 1 +} + +# 执行命令并屏蔽输出,仅记录关键信息 +run_cmd() { + step "执行: $1" + eval "$1" &>> "$LOG_FILE" || { + error "命令执行失败: $1" + } +} + +# 检测系统版本 +detect_os_version() { + if [[ -f /etc/os-release ]]; then + . 
/etc/os-release + OS_ID=$ID + OS_VERSION=$VERSION_ID + step "检测到系统: ${OS_ID} ${OS_VERSION}" + else + error "无法检测到操作系统版本" + fi +} + +# 清理现有的 DCGM 配置 +cleanup_dcgm_config() { + step "清理现有的 DCGM 配置" + run_cmd "rm -f /etc/apt/sources.list.d/cuda-ubuntu2004-x86_64.list" + run_cmd "apt-get remove -y datacenter-gpu-manager || true" + run_cmd "apt-get autoremove -y || true" + run_cmd "apt-get autoclean -y || true" +} + +# 添加 CUDA 仓库密钥 +add_cuda_keyring() { + step "添加 CUDA 仓库密钥" + run_cmd "wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb" + run_cmd "dpkg -i cuda-keyring_1.0-1_all.deb" + run_cmd "rm cuda-keyring_1.0-1_all.deb" +} + +# 更新包列表 +update_package_list() { + step "更新包列表" + run_cmd "apt-get update" +} + +# 安装 DCGM for Ubuntu 22.04 +install_dcgm_for_ubuntu_22() { + step "开始安装 DCGM for Ubuntu 22" + + # 清理现有的 DCGM 配置 + cleanup_dcgm_config + + # 添加 CUDA 仓库密钥 + add_cuda_keyring + + # 添加 CUDA 仓库 + run_cmd "add-apt-repository \"deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /\" -y" + + # 更新包列表 + update_package_list + + # 获取 CUDA 版本 + CUDA_VERSION=$(nvidia-smi | sed -E -n 's/.*CUDA Version: ([0-9]+)[.].*/\1/p') + if [[ -z "$CUDA_VERSION" ]]; then + error "无法检测到 CUDA 版本" + fi + + # 安装 DCGM + run_cmd "apt install --install-recommends datacenter-gpu-manager-4-cuda${CUDA_VERSION} -y" + + # 启动并启用 DCGM 服务 + run_cmd "systemctl --now enable nvidia-dcgm" + run_cmd "systemctl restart nvidia-dcgm" + run_cmd "systemctl status nvidia-dcgm" + + # 验证 DCGM 安装 + run_cmd "dcgmi discovery -l" + + step "DCGM 安装成功" +} + +# 卸载 DCGM +uninstall_dcgm() { + step "开始卸载 DCGM" + + # 停止并禁用 DCGM 服务 + run_cmd "systemctl stop nvidia-dcgm || true" + run_cmd "systemctl disable nvidia-dcgm || true" + + # 移除 DCGM 包 + run_cmd "apt-get remove -y datacenter-gpu-manager || true" + run_cmd "apt-get autoremove -y || true" + run_cmd "apt-get autoclean -y || true" + + # 清理 CUDA 仓库密钥 + run_cmd "rm -f 
/usr/share/keyrings/cuda-archive-keyring.gpg" + + # 删除 CUDA 仓库配置文件 + run_cmd "rm -f /etc/apt/sources.list.d/cuda-ubuntu2004-x86_64.list" + + # 更新包列表 + update_package_list + + step "DCGM 卸载完成" +} + +# 主流程 +step "NVIDIA DCGM 安装脚本启动" +detect_os_version + +if [[ $# -ne 1 ]]; then + error "请使用 --install 或 --uninstall" +fi + +ACTION=$1 +case "$ACTION" in + "--install") + case "$OS_ID-$OS_VERSION" in + ubuntu-22.04) + install_dcgm_for_ubuntu_22 + ;; + *) + error "不支持的操作系统版本: ${OS_ID} ${OS_VERSION}" + ;; + esac + ;; + "--uninstall") + uninstall_dcgm + ;; + *) + error "无效的参数,请使用 --install 或 --uninstall" + ;; +esac + +step "操作完成,日志路径: $LOG_FILE" diff --git a/scripts/nvidia-driver.sh b/scripts/nvidia-driver.sh new file mode 100644 index 0000000..a323ce7 --- /dev/null +++ b/scripts/nvidia-driver.sh @@ -0,0 +1,235 @@ +#!/bin/bash +set -euo pipefail + +# 全局变量 +DEFAULT_VERSION="565.57.01" +INTERNAL_BASE_URL="http://10.101.0.51:5588/nvidia-linux" +#OFFICIAL_BASE_URL="https://cn.download.nvidia.com/tesla/${VERSION}" +PACKAGE_TEMPLATE="NVIDIA-Linux-x86_64-%s.run" +INSTALL_DIR="/opt" +LOG_FILE="/var/log/nvidia-driver_$(date +%Y%m%d%H%M%S).log" +SERVICE_FILE="/etc/systemd/system/nvidia_peermem.service" +PERSISTENCE_SERVICE="/etc/systemd/system/nvidia-persistenced.service" + +# 颜色定义 +GREEN='\033[1;32m' +RED='\033[1;31m' +NC='\033[0m' # 重置颜色 + +# 日志函数 - 记录所有操作到日志 +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$LOG_FILE" +} + +# 步骤提示 - 绿色输出到控制台并记录日志 +step() { + local msg="==> $1" + echo -e "${GREEN}$msg${NC}" + log "$msg" +} + +# 错误提示 - 红色输出到控制台并记录日志 +error() { + local msg="错误: $1" + echo -e "${RED}$msg${NC}" + log "$msg" + exit 1 +} + +# 执行命令并屏蔽输出,仅记录关键信息 +run_cmd() { + step "执行: $1" + eval "$1" &>> "$LOG_FILE" || { + error "命令执行失败: $1" + } +} + +# 参数解析 +ACTION="" +VERSION="$DEFAULT_VERSION" + +while [[ $# -gt 0 ]]; do + case "$1" in + --install) ACTION="install"; shift ;; + --uninstall) ACTION="uninstall"; shift ;; + --version) VERSION="$2"; shift 2 ;; + *) error "未知参数 $1" ;; 
+ esac +done +OFFICIAL_BASE_URL="https://cn.download.nvidia.com/tesla/${VERSION}" + +[[ -z "$ACTION" ]] && { error "必须指定 --install 或 --uninstall"; } + +# peermem_service 开机启动函数 +install_peermem_service() { + step "开始配置 nvidia_peermem 开机启动" + # 创建服务文件 + cat > "$SERVICE_FILE" < "$PERSISTENCE_SERVICE" <> "$LOG_FILE" 2>&1 || { + log "版本验证失败"; exit 1 + } + + install_peermem_service # 开机启动加载:nvidia_peermem + install_persistence_service # 开机启动 GPU 持久模式 + step "安装完成" +} + +# 显卡驱动卸载函数 +uninstall_driver() { + step "开始卸载显卡驱动,版本:$VERSION" + PACKAGE_NAME=$(printf "$PACKAGE_TEMPLATE" "$VERSION") + PACKAGE_PATH="${INSTALL_DIR}/${PACKAGE_NAME}" + + # 检查卸载脚本 + if [[ -f "$PACKAGE_PATH" ]]; then + step "找到安装包,使用安装包卸载" + cd "$INSTALL_DIR" + run_cmd "chmod +x $PACKAGE_NAME" + if ! ./"$PACKAGE_NAME" --uninstall -q -s &>/dev/null; then + step "卸载失败,请手动卸载驱动程序" + return + fi + else + step "未找到本地安装包,尝试下载卸载包" + download_package "$PACKAGE_NAME" "$PACKAGE_PATH" + if [[ -f "$PACKAGE_PATH" ]]; then + cd "$INSTALL_DIR" + run_cmd "chmod +x $PACKAGE_NAME" + if ! 
./"$PACKAGE_NAME" --uninstall -q -s &>/dev/null; then + step "卸载失败,请手动卸载驱动程序" + return + fi + else + step "无法找到或下载卸载包,请手动卸载驱动程序" + return + fi + fi + + uninstall_peermem_service # 移除 peermem 服务 + uninstall_persistence_service # 移除 persistenced 服务 + step "卸载完成" +} + +# 包下载函数 +download_package() { + local package_name="$1" + local package_path="$2" + local download_urls=( + "${INTERNAL_BASE_URL}/${package_name}" + "${OFFICIAL_BASE_URL}/${package_name}" + ) + + for url in "${download_urls[@]}"; do + step "尝试从 $url 下载" + wget -qO "$package_path" "$url" && return 0 + step "下载失败,尝试下一个 URL" + done + + error "无法从任何来源下载 $package_name" +} + +# 根据动作调用对应函数 +case "$ACTION" in + install) install_driver ;; + uninstall) uninstall_driver ;; +esac diff --git a/scripts/nvidia-fabricmanager.sh b/scripts/nvidia-fabricmanager.sh new file mode 100644 index 0000000..fbefd11 --- /dev/null +++ b/scripts/nvidia-fabricmanager.sh @@ -0,0 +1,190 @@ +#!/bin/bash +set -euo pipefail + +# 全局变量 +FABRICMANAGER_MAJOR_VERSION="565" +FABRICMANAGER_FULL_VERSION="565.57.01-1" +PACKAGE_NAME="nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}_${FABRICMANAGER_FULL_VERSION}_amd64.deb" +PACKAGE_PATH="/opt/${PACKAGE_NAME}" +INTERNAL_BASE_URL="http://10.101.0.51:5588/nvidia-fabricmanager" +OFFICIAL_BASE_URL="https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64" +TEMP_DIR="/tmp/fabricmanager_temp" +LOG_FILE="/var/log/fabricmanager_$(date +%Y%m%d%H%M%S).log" + +# 颜色定义 +GREEN='\033[1;32m' +RED='\033[1;31m' +YELLOW='\033[1;33m' +NC='\033[0m' # 重置颜色 + +# 日志函数 +log() { + local msg="[$(date '+%Y-%m-%d %H:%M:%S')] $*" + echo "$msg" >> "$LOG_FILE" +} + +# 步骤提示 +step() { + local msg="==> $1" + echo -e "${GREEN}$msg${NC}" + log "$msg" +} + +# 警告提示 +warning() { + local msg="警告: $1" + echo -e "${YELLOW}$msg${NC}" + log "$msg" +} + +# 错误提示 +error() { + local msg="错误: $1" + echo -e "${RED}$msg${NC}" + log "$msg" + exit 1 +} + +# 执行命令并记录日志 +run_cmd() { + step "执行: $1" + eval "$1" &>> "$LOG_FILE" || { + 
error "命令执行失败: $1" + } +} + +# 检查命令是否存在 +check_cmd() { + command -v "$1" &>/dev/null || error "未找到命令: $1" +} + +# 参数解析 +ACTION="" +FORCE=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --install) ACTION="install"; shift ;; + --uninstall) ACTION="uninstall"; shift ;; + --version) + CUSTOM_VERSION="$2" + if [[ $CUSTOM_VERSION =~ ^([0-9]+)(_[0-9]+\.[0-9]+\.[0-9]+-[0-9]+)$ ]]; then + FABRICMANAGER_MAJOR_VERSION="${BASH_REMATCH[1]}" + FABRICMANAGER_FULL_VERSION="${BASH_REMATCH[2]#_}" + PACKAGE_NAME="nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}_${FABRICMANAGER_FULL_VERSION}_amd64.deb" + PACKAGE_PATH="/opt/${PACKAGE_NAME}" + else + error "版本格式错误,应为 xxxx_xxxx.xx.xx-x" + fi + shift 2 ;; + --force) FORCE=1; shift ;; + *) error "未知参数: $1" ;; + esac +done + +[[ -z "$ACTION" ]] && error "必须指定 --install 或 --uninstall" + + +# 下载安装包 +download_package() { + step "检查安装包: $PACKAGE_PATH" + if [[ -f "$PACKAGE_PATH" ]]; then + step "使用本地安装包" + return 0 + fi + + step "本地包不存在,开始下载" + mkdir -p "$(dirname "$PACKAGE_PATH")" + + local urls=( + "${INTERNAL_BASE_URL}/${PACKAGE_NAME}" + "${OFFICIAL_BASE_URL}/${PACKAGE_NAME}" + ) + + for url in "${urls[@]}"; do + step "尝试从 $url 下载" + if wget -q -O "$PACKAGE_PATH" "$url"; then + step "下载成功,文件大小: $(du -h "$PACKAGE_PATH" | cut -f1)" + return 0 + else + warning "从 $url 下载失败" + fi + done + + error "无法下载安装包,请检查网络或手动放置到 $PACKAGE_PATH" +} + + +# 安装 NVIDIA Fabric Manager +install_fabricmanager() { + step "开始安装 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION}" + + # 下载安装包 + download_package + + # 安装前检查 + step "检查系统依赖" + check_cmd dpkg + check_cmd systemctl + + # 安装 Fabric Manager + step "安装 NVIDIA Fabric Manager" + run_cmd "dpkg -i $PACKAGE_PATH" + + # 启动并启用服务 + step "启动并启用 NVIDIA Fabric Manager 服务" + run_cmd "systemctl enable nvidia-fabricmanager.service --now" + + # 验证安装 + step "验证 NVIDIA Fabric Manager 服务状态" + if systemctl is-active --quiet nvidia-fabricmanager.service; then + step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} 安装成功并运行中" 
+ else + error "NVIDIA Fabric Manager 服务未运行" + fi +} + + +# 卸载 NVIDIA Fabric Manager +uninstall_fabricmanager() { + step "开始卸载 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION}" + + # 检查服务状态 + if systemctl is-active --quiet nvidia-fabricmanager.service; then + step "停止 NVIDIA Fabric Manager 服务" + run_cmd "systemctl stop nvidia-fabricmanager.service" + else + step "NVIDIA Fabric Manager 服务未运行" + fi + + # 禁用服务 + step "禁用 NVIDIA Fabric Manager 服务" + run_cmd "systemctl disable nvidia-fabricmanager.service" + + # 卸载软件包 + step "卸载 NVIDIA Fabric Manager 软件包" + if dpkg -s "nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}" &>/dev/null; then + run_cmd "dpkg -r nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION}" + else + warning "未找到 nvidia-fabricmanager-${FABRICMANAGER_MAJOR_VERSION} 软件包" + if [[ $FORCE -eq 0 ]]; then + error "请使用 --force 参数强制卸载" + fi + fi + + # 清理残留文件 + step "清理残留文件" + rm -f "$PACKAGE_PATH" + + step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} 卸载完成" +} + + +step "开始 NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} ${ACTION} 流程" +case "$ACTION" in + install) install_fabricmanager ;; + uninstall) uninstall_fabricmanager ;; + *) error "未知操作: $ACTION" ;; +esac + +step "NVIDIA Fabric Manager ${FABRICMANAGER_FULL_VERSION} ${ACTION} 流程完成" \ No newline at end of file diff --git a/scripts/system_optimize.sh b/scripts/system_optimize.sh new file mode 100644 index 0000000..25b71ac --- /dev/null +++ b/scripts/system_optimize.sh @@ -0,0 +1,124 @@ + +#!/bin/bash + +# 颜色定义 +GREEN='\033[1;32m' +RED='\033[1;31m' +YELLOW='\033[1;33m' +NC='\033[0m' # 重置颜色 + +# 日志函数 - 绿色输出 +log_info() { + echo -e "${GREEN}[INFO] $1${NC}" +} + +# 错误函数 - 红色输出 +log_error() { + echo -e "${RED}[ERROR] $1${NC}" +} + +# 警告函数 - 黄色输出 +log_warning() { + echo -e "${YELLOW}[WARNING] $1${NC}" +} + +# 禁用apt的定期更新 +disable_apt_periodic_updates() { + log_info "禁用apt的定期更新..." 
+ # 修改10periodic配置文件,将所有的1改为0,禁用自动更新检查 + sed -i 's/1/0/g' /etc/apt/apt.conf.d/10periodic + # 修改20auto-upgrades配置文件,将所有的1改为0,禁用自动升级 + sed -i 's/1/0/g' /etc/apt/apt.conf.d/20auto-upgrades + log_info "apt定期更新已禁用" +} + +# 设置系统时区为上海 +set_timezone_to_shanghai() { + log_info "设置系统时区为上海..." + # 使用timedatectl命令设置系统时区为Asia/Shanghai + timedatectl set-timezone Asia/Shanghai + log_info "系统时区已设置为上海" +} + +# 同步硬件时钟和系统时钟 +synchronize_hardware_clock() { + log_info "同步硬件时钟和系统时钟..." + # 使用hwclock命令将系统时间同步到硬件时钟 + hwclock --systohc + log_info "硬件时钟和系统时钟已同步" +} + +# 删除"ubuntu"用户 +#remove_ubuntu_user() { +# log_info "删除'ubuntu'用户..." +# # 使用userdel命令删除ubuntu用户,并递归删除其主目录 +# # &> /dev/null用于忽略可能的错误输出(例如用户不存在的情况) +# userdel -r ubuntu &> /dev/null +# log_info "已尝试删除'ubuntu'用户(如果存在)" +#} + +# 禁止显卡驱动 +disable_nouveau_driver() { + log_info "禁止nouveau显卡驱动..." + + # 创建blacklist-nouveau.conf文件,添加禁止nouveau驱动的配置 + cat > /etc/modprobe.d/blacklist-nouveau.conf << EOF +blacklist nouveau +blacklist lbm-nouveau +options nouveau modeset=0 +alias nouveau off +alias lbm-nouveau off +EOF + + # 创建nouveau-kms.conf文件,禁用nouveau的KMS(内核模式设置) + cat > /etc/modprobe.d/nouveau-kms.conf << EOF +options nouveau modeset=0 +EOF + + # 更新initramfs,使驱动禁用配置生效 + update-initramfs -u &> /dev/null + + log_info "nouveau显卡驱动已被禁止" +} + +# 更改GRUB配置并更新 +update_grub_configuration() { + log_info "更改GRUB配置启用传统网络接口命名..." + + # 检查GRUB配置中是否已存在所需的网络接口命名设置 + if ! grep -q 'GRUB_CMDLINE_LINUX_DEFAULT="net.ifnames=1 biosdevname=0"' /etc/default/grub; then + # 如果不存在,则注释掉原有的GRUB_CMDLINE_LINUX_DEFAULT行 + sed -i '/GRUB_CMDLINE_LINUX_DEFAULT=.*/s/^/#/' /etc/default/grub + # 添加新的GRUB_CMDLINE_LINUX_DEFAULT行,启用传统网络接口命名 + sed -i '/GRUB_CMDLINE_LINUX_DEFAULT=.*/a\GRUB_CMDLINE_LINUX_DEFAULT="net.ifnames=1 biosdevname=0"' /etc/default/grub + fi + + # 再次检查配置是否已成功添加 + if grep -q 'GRUB_CMDLINE_LINUX_DEFAULT="net.ifnames=1 biosdevname=0"' /etc/default/grub; then + log_info "文件 /etc/default/grub 修改成功!" + else + log_error "文件 /etc/default/grub 修改失败!" 
+ exit 1 + fi + + # 更新GRUB引导加载程序配置 + update-grub &> /dev/null + + log_info "GRUB配置已更新" +} + +# 主函数:按顺序执行所有配置步骤 +main() { + log_info "开始系统配置..." + + disable_apt_periodic_updates + set_timezone_to_shanghai + synchronize_hardware_clock + disable_nouveau_driver + update_grub_configuration + + log_info "系统配置完成!" +} + +# 执行主函数 +main