#!/bin/bash
#
# Readiness check for a GPU server.
#
# Prints one status line (plus optional detail lines) for each of: NVIDIA
# driver, Mellanox/InfiniBand network driver, root partition and its LVM
# layout, regular user accounts, CUDA toolkit, OS/kernel version, kernel
# pinning, nvidia-fabricmanager, node_exporter and dcgm-exporter.
#
# The script is purely diagnostic: a failed probe is reported and the run
# continues, so strict mode (set -e) is intentionally not enabled.
# NOTE(review): lshw, lvs/vgs and parted usually need root to return data;
# without it the corresponding detail lines are simply omitted.

# Color definitions. These hold real escape sequences ($'...') so they can be
# emitted with printf '%s' without interpreting backslashes in command output.
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[0;33m'
BLUE=$'\033[0;34m'
MAGENTA=$'\033[0;35m'
CYAN=$'\033[0;36m'
WHITE=$'\033[0;37m'
NC=$'\033[0m' # reset to the default color

#######################################
# Print a section title framed by two separator lines.
# Globals:   CYAN, NC (read)
# Arguments: $1 - title text
# Outputs:   three lines on stdout
#######################################
print_title() {
  printf '%s\n' "${CYAN}========================================${NC}"
  printf '%s\n' "${CYAN}$1${NC}"
  printf '%s\n' "${CYAN}========================================${NC}"
}

#######################################
# Report the NVIDIA driver version.
# Fails when nvidia-smi is missing or when it cannot report a version
# (e.g. the kernel module is not loaded).
# Globals:   RED, GREEN, YELLOW, NC (read)
# Outputs:   one status line on stdout
#######################################
check_gpu_driver() {
  local driver_version

  printf '%s' "显卡驱动检查:"
  if ! command -v nvidia-smi &>/dev/null; then
    printf '%s\n' "${RED}失败${NC} 未找到nvidia-smi命令"
    return 0
  fi

  # Preferred: machine-readable query (one line per GPU, all identical).
  driver_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader \
    2>/dev/null | head -n 1)

  # Anything that is not a dotted number is an error message, not a version;
  # fall back to the "Driver Version:" field of the banner line.
  if [[ ! "$driver_version" =~ ^[0-9]+(\.[0-9]+)*$ ]]; then
    driver_version=$(nvidia-smi 2>/dev/null \
      | awk 'tolower($0) ~ /driver version/ { print $6; exit }')
  fi

  if [[ "$driver_version" =~ ^[0-9]+(\.[0-9]+)*$ ]]; then
    printf '%s\n' "${GREEN}OK${NC} 版本:${YELLOW}${driver_version}${NC}"
  else
    printf '%s\n' "${RED}失败${NC} nvidia-smi无法获取驱动版本"
  fi
}

#######################################
# Report the mlx5_core driver version, InfiniBand device mapping and a
# product/vendor/driver summary of every Ethernet device known to lshw.
# Globals:   RED, GREEN, YELLOW, NC (read)
# Outputs:   status line and detail lines on stdout
#######################################
check_network_driver() {
  local mlx5_version ib_lines line nic_info

  printf '%s' "网卡驱动检查:"

  # mlx5_core is the driver used by the CX7 NICs.
  if modinfo mlx5_core &>/dev/null; then
    # -F prints only the requested field; grepping for "version" would also
    # match the "srcversion" line.
    mlx5_version=$(modinfo -F version mlx5_core 2>/dev/null)
    printf '%s\n' "${GREEN}OK${NC}"
    printf '%s\n' " ${YELLOW}Mellanox CX7 驱动版本:${NC} ${mlx5_version}"
  else
    printf '%s\n' "${RED}失败${NC} 未找到mlx5_core驱动"
  fi

  # InfiniBand tooling and device-to-netdev mapping.
  if command -v ibdev2netdev &>/dev/null; then
    printf '%s\n' " ${YELLOW}InfiniBand 状态:${NC} 已安装"
    # Run the tool once and print its lines verbatim, one per device.
    ib_lines=$(ibdev2netdev 2>/dev/null)
    if [[ -n "$ib_lines" ]]; then
      printf '%s\n' " ${YELLOW}IB 设备:${NC}"
      while IFS= read -r line; do
        if [[ -n "$line" ]]; then
          printf '%s\n' " - ${line}"
        fi
      done <<<"$ib_lines"
    fi
  else
    printf '%s\n' " ${YELLOW}InfiniBand 状态:${NC} 未安装或未启用"
  fi

  # Condensed driver info for all Ethernet devices. lshw reports the driver
  # as "driver=<name>" inside the "configuration:" line, so it is extracted
  # from there and printed as "driver: <name>".
  nic_info=$(lshw -C network 2>/dev/null | awk '
    /^[ \t]*\*-/            { in_eth = 0 }            # a new device section starts
    /description: Ethernet/ { in_eth = 1; next }
    !in_eth                 { next }
    /^[ \t]*(product|vendor):/ {
      sub(/^[ \t]+/, "")
      print " - " $0
      next
    }
    /^[ \t]*configuration:/ {
      if (match($0, /driver=[^ \t]+/)) {
        print " - driver: " substr($0, RSTART + 7, RLENGTH - 7)
      }
    }
  ')
  if [[ -n "$nic_info" ]]; then
    printf '%s\n' " ${YELLOW}所有网卡驱动信息:${NC}"
    printf '%s\n' "$nic_info"
  fi
}

#######################################
# Helper: print LVM details (volume group, free space, whether the LV fills
# the VG) for a root filesystem that lives on a device-mapper path.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - root device path, e.g. /dev/mapper/vg0-root
# Outputs:   detail lines on stdout; nothing if lvs cannot resolve the device
#######################################
_report_lvm_root() {
  local root_dev=$1
  local vg_name vg_free vg_free_bytes lv_bytes vg_bytes

  # lvs accepts the device path itself, so no fragile parsing of the
  # "vg-lv" name is needed.
  vg_name=$(lvs --noheadings -o vg_name "$root_dev" 2>/dev/null | tr -d '[:space:]')
  if [[ -z "$vg_name" ]]; then
    return 0
  fi
  printf '%s\n' " ${YELLOW}卷组名称:${NC} ${vg_name}"

  # Free space in the VG: human-readable value for display, byte count for
  # the decision (the human-readable form of zero is not a fixed string).
  vg_free=$(vgs --noheadings -o vg_free "$vg_name" 2>/dev/null | tr -d '[:space:]')
  vg_free_bytes=$(vgs --noheadings --units b --nosuffix -o vg_free "$vg_name" \
    2>/dev/null | tr -d '[:space:]')
  if [[ "$vg_free_bytes" =~ ^[0-9]+$ ]] && ((vg_free_bytes > 0)); then
    printf '%s\n' " ${YELLOW}卷组可用空间:${NC} ${vg_free} (可扩容)"
  else
    printf '%s\n' " ${YELLOW}卷组可用空间:${NC} ${vg_free} (无扩容空间)"
  fi

  # Does the LV already occupy the whole VG? Compare exact byte sizes.
  lv_bytes=$(lvs --noheadings --units b --nosuffix -o lv_size "$root_dev" \
    2>/dev/null | tr -d '[:space:]')
  vg_bytes=$(vgs --noheadings --units b --nosuffix -o vg_size "$vg_name" \
    2>/dev/null | tr -d '[:space:]')
  if [[ -n "$lv_bytes" && -n "$vg_bytes" ]]; then
    if [[ "$lv_bytes" == "$vg_bytes" ]]; then
      printf '%s\n' " ${YELLOW}逻辑卷状态:${NC} 已占满卷组全部空间"
    else
      printf '%s\n' " ${YELLOW}逻辑卷状态:${NC} 未占满卷组空间 (可扩容)"
    fi
  fi
}

#######################################
# Helper: for a root filesystem on a plain partition, report the size of the
# last free region on the parent disk as listed by parted.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - root device path, e.g. /dev/sda2
# Outputs:   one detail line on stdout; nothing if the parent disk is unknown
#######################################
_report_plain_root() {
  local root_dev=$1
  local parent_disk unallocated

  parent_disk=$(lsblk -no pkname "$root_dev" 2>/dev/null | head -n 1)
  if [[ -z "$parent_disk" ]]; then
    return 0
  fi

  # Third column of a "Free Space" row is its size; keep the last such row.
  unallocated=$(parted -s "/dev/${parent_disk}" unit GB print free 2>/dev/null \
    | awk '/Free Space/ { size = $3 } END { print size }')
  if [[ -n "$unallocated" && "$unallocated" != "0.00GB" ]]; then
    printf '%s\n' " ${YELLOW}分区未分配空间:${NC} ${unallocated} (可扩容)"
  else
    printf '%s\n' " ${YELLOW}分区未分配空间:${NC} 无 (不可扩容)"
  fi
}

#######################################
# Report size/usage of the root filesystem and whether it can be grown,
# either through free space in its LVM volume group or through unallocated
# space on its disk.
# Globals:   RED, GREEN, YELLOW, NC (read)
# Outputs:   status line and detail lines on stdout
#######################################
check_root_partition() {
  local root_info root_fs

  printf '%s' "根分区检查:"
  # -P keeps each filesystem on a single line even for long device names.
  root_info=$(df -hP / 2>/dev/null \
    | awk 'NR==2 {print "总大小: " $2 ", 已用: " $3 ", 可用: " $4 ", 使用率: " $5}')
  if [[ -z "$root_info" ]]; then
    printf '%s\n' "${RED}失败${NC} 无法获取根分区信息"
    return 0
  fi
  printf '%s\n' "${GREEN}OK${NC} 信息:${YELLOW}${root_info}${NC}"

  # A root device under /dev/mapper/ is treated as an LVM logical volume.
  root_fs=$(df -P / 2>/dev/null | awk 'NR==2 {print $1}')
  if [[ "$root_fs" == /dev/mapper/* ]]; then
    printf '%s\n' " ${YELLOW}根分区使用LVM:${NC} ${GREEN}是${NC}"
    _report_lvm_root "$root_fs"
  else
    printf '%s\n' " ${YELLOW}根分区使用LVM:${NC} ${RED}否${NC}"
    _report_plain_root "$root_fs"
  fi
}

#######################################
# List regular user accounts: UID in [1000, 65534) whose login shell is not
# nologin/false.
# Globals:   RED, GREEN, YELLOW, NC (read)
# Outputs:   status line and, on success, the user names on one line
#######################################
check_normal_users() {
  local normal_users

  printf '%s' "普通用户检查:"
  # Only the shell field ($7) is tested, so a user whose name or home
  # directory happens to contain "false"/"nologin" is still listed.
  normal_users=$(getent passwd | awk -F: '
    $3 >= 1000 && $3 < 65534 && $7 !~ /(nologin|false)$/ {
      names = names (names == "" ? "" : " ") $1
    }
    END { print names }
  ')
  if [[ -n "$normal_users" ]]; then
    printf '%s\n' "${GREEN}OK${NC}"
    printf '%s\n' " ${YELLOW}已存在普通用户:${NC} ${normal_users}"
  else
    printf '%s\n' "${RED}失败${NC} 未找到普通用户"
  fi
}

#######################################
# Report the CUDA toolkit version from nvcc, or, when nvcc is not on PATH,
# from /usr/local/cuda/version.txt if that file exists.
# Globals:   RED, GREEN, YELLOW, NC (read)
# Outputs:   status line and optional detail line on stdout
#######################################
check_cuda() {
  local cuda_version cuda_version_file

  printf '%s' "CUDA检查:"
  if command -v nvcc &>/dev/null; then
    # The release line reads "... release 12.2, V12.2.140"; field 5 is the
    # version followed by a comma.
    cuda_version=$(nvcc --version 2>/dev/null \
      | awk 'tolower($0) ~ /release/ { gsub(/,/, "", $5); print $5; exit }')
    printf '%s\n' "${GREEN}OK${NC} 版本:${YELLOW}${cuda_version}${NC}"
    return 0
  fi

  printf '%s\n' "${RED}失败${NC} nvcc未找到"
  # Try to recover the version from the toolkit directory instead.
  if [[ -f /usr/local/cuda/version.txt ]]; then
    cuda_version_file=$(</usr/local/cuda/version.txt)
    printf '%s\n' " CUDA版本(从文件): ${YELLOW}${cuda_version_file}${NC}"
  fi
}

#######################################
# Report the OS description and the running kernel release.
# The OS description comes from lsb_release, then /etc/os-release
# (PRETTY_NAME), then falls back to "未知".
# Globals:   GREEN, YELLOW, NC (read)
# Outputs:   one status line on stdout
#######################################
check_system_version() {
  local os_info kernel_info

  printf '%s' "系统版本检查:"
  os_info=$(lsb_release -ds 2>/dev/null)
  if [[ -z "$os_info" && -r /etc/os-release ]]; then
    # os-release is a shell-compatible file; source it in a subshell so its
    # variables do not leak into this script.
    os_info=$(. /etc/os-release 2>/dev/null && printf '%s' "${PRETTY_NAME:-}")
  fi
  os_info=${os_info:-未知}
  kernel_info=$(uname -r)
  printf '%s\n' "${GREEN}OK${NC} 系统:${YELLOW}${os_info}${NC}, 内核:${YELLOW}${kernel_info}${NC}"
}

#######################################
# Check whether the kernel is pinned, judged by the presence of
# /etc/apt/preferences.d/kernel.pref.
# Globals:   RED, GREEN, NC (read)
# Outputs:   one status line on stdout
#######################################
check_kernel_lock() {
  printf '%s' "内核锁定检查:"
  if [[ -f /etc/apt/preferences.d/kernel.pref ]]; then
    printf '%s\n' "${GREEN}OK${NC} 存在内核锁定配置"
  else
    printf '%s\n' "${RED}失败${NC} 未找到内核锁定配置"
  fi
}

#######################################
# Check that the nvidia-fabricmanager systemd service is active.
# Globals:   RED, GREEN, NC (read)
# Outputs:   one status line on stdout
#######################################
check_fabricmanager() {
  printf '%s' "nvidia-fabricmanager检查:"
  if systemctl is-active --quiet nvidia-fabricmanager 2>/dev/null; then
    printf '%s\n' "${GREEN}OK${NC} 服务正在运行"
  else
    printf '%s\n' "${RED}失败${NC} 服务未运行"
  fi
}

#######################################
# Helper: report whether an exporter binary is installed and whether its
# systemd service is active.
# Globals:   RED, GREEN, YELLOW, NC (read)
# Arguments: $1 - label shown in the status line
#            $2 - executable name looked up on PATH
#            $3 - systemd service name
# Outputs:   one status line on stdout
#######################################
_check_exporter() {
  local label=$1 binary=$2 service=$3
  local svc_state

  printf '%s' "${label}检查:"
  if ! command -v "$binary" &>/dev/null; then
    printf '%s\n' "${RED}失败${NC} 未安装"
    return 0
  fi

  if systemctl is-active --quiet "$service" 2>/dev/null; then
    svc_state="${GREEN}运行中${NC}"
  else
    svc_state="${YELLOW}已安装但未运行${NC}"
  fi
  printf '%s\n' "${GREEN}OK${NC} 状态:${svc_state}"
}

# Check node_exporter (binary and service are both named node_exporter).
check_node_exporter() {
  _check_exporter "node_exporter" "node_exporter" "node_exporter"
}

# Check dcgm-exporter (binary and service are both named dcgm-exporter).
check_dcgm_exporter() {
  _check_exporter "dcgm_exporter" "dcgm-exporter" "dcgm-exporter"
}

#######################################
# Entry point: print the banner, run every check in order, print the footer.
# Globals:   MAGENTA, NC (read)
# Outputs:   the full report on stdout
#######################################
main() {
  printf '%s\n' "${MAGENTA}========================================${NC}"
  printf '%s\n' "${MAGENTA} 系统硬件和软件检查工具 ${NC}"
  printf '%s\n' "${MAGENTA}========================================${NC}"

  check_gpu_driver
  check_network_driver
  check_root_partition
  # Regular-user check. The former check_ubuntu_user call was dropped: no
  # function of that name is defined, so it only produced "command not found".
  check_normal_users
  check_cuda
  check_system_version
  check_kernel_lock
  check_fabricmanager
  check_node_exporter
  check_dcgm_exporter

  printf '%s\n' "${MAGENTA}========================================${NC}"
  printf '%s\n' "${MAGENTA} 检查完成! ${NC}"
  printf '%s\n' "${MAGENTA}========================================${NC}"
}

# Run the report.
main "$@"