#!/bin/bash
#
# install-checker.sh - post-install report for a GPU compute node.
#
# Prints one colored status line (plus optional detail lines) for each of:
# NVIDIA driver, network drivers (mlx5 / InfiniBand), root partition and its
# expansion potential, regular user accounts, CUDA toolkit, OS/kernel version,
# kernel pinning, nvidia-fabricmanager, node_exporter and dcgm-exporter.
#
# The script is report-only: every check prints its own verdict and the
# script never aborts on a failed check, so `set -e` is deliberately not used.
#
# Usage: install-checker.sh   (some detail lines need root: lshw, lvs, parted)

# ANSI color escapes; they are interpreted by `echo -e` at print time.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[0;33m'
readonly BLUE='\033[0;34m'
readonly MAGENTA='\033[0;35m'
readonly CYAN='\033[0;36m'
readonly WHITE='\033[0;37m'
readonly NC='\033[0m' # reset to the default color

#######################################
# Print a cyan banner with a title.
# Arguments: $1 - title text
# Outputs:   three lines on stdout
#######################################
print_title() {
  echo -e "${CYAN}========================================${NC}"
  echo -e "${CYAN}$1${NC}"
  echo -e "${CYAN}========================================${NC}"
}

#######################################
# Report the NVIDIA driver version.
# The version is read from the "Driver Version:" field of the nvidia-smi
# banner; if that is absent, nvidia-smi's query interface is tried. When
# nvidia-smi is installed but cannot report a version (e.g. the kernel module
# is not loaded) the check is reported as failed instead of "OK" with an
# empty version.
# Outputs: one status line on stdout
#######################################
check_gpu_driver() {
  local driver_version query_out

  echo -n "显卡驱动检查："
  if ! command -v nvidia-smi &>/dev/null; then
    echo -e "${RED}失败${NC} 未找到nvidia-smi命令"
    return 0
  fi

  # Locate the token that follows "Driver Version:" rather than relying on a
  # fixed column position.
  driver_version=$(nvidia-smi 2>/dev/null | awk '
    {
      for (i = 1; i < NF; i++) {
        if (tolower($i) == "driver" && tolower($(i + 1)) == "version:") {
          print $(i + 2)
          exit
        }
      }
    }')

  if [[ -z "$driver_version" ]]; then
    # Only trust the query output when nvidia-smi itself succeeded; on
    # failure it prints an error message on stdout.
    if query_out=$(nvidia-smi --query-gpu=driver_version \
      --format=csv,noheader 2>/dev/null); then
      driver_version=${query_out%%$'\n'*}
    fi
  fi

  if [[ -n "$driver_version" ]]; then
    echo -e "${GREEN}OK${NC} 版本：${YELLOW}${driver_version}${NC}"
  else
    echo -e "${RED}失败${NC} nvidia-smi无法获取驱动版本"
  fi
}

#######################################
# Report network driver state: mlx5_core module version, InfiniBand devices
# (via ibdev2netdev) and product/vendor/driver of every Ethernet adapter
# listed by lshw.
# Outputs: status line plus detail lines on stdout
#######################################
check_network_driver() {
  local mlx5_version ib_output line all_drivers

  echo -n "网卡驱动检查："

  # mlx5_core is the driver used by ConnectX-7 adapters.
  if modinfo mlx5_core &>/dev/null; then
    # -F version prints only the "version" field; grepping for "version"
    # would also match srcversion and vermagic.
    mlx5_version=$(modinfo -F version mlx5_core 2>/dev/null)
    echo -e "${GREEN}OK${NC}"
    echo -e " ${YELLOW}Mellanox CX7 驱动版本:${NC} ${mlx5_version:-未知}"
  else
    echo -e "${RED}失败${NC} 未找到mlx5_core驱动"
  fi

  # InfiniBand tooling present?
  if command -v ibdev2netdev &>/dev/null; then
    echo -e " ${YELLOW}InfiniBand 状态:${NC} 已安装"

    # Each ibdev2netdev line already describes one device, so run it once
    # and print the lines as-is.
    ib_output=$(ibdev2netdev 2>/dev/null)
    if [[ -n "$ib_output" ]]; then
      echo -e " ${YELLOW}IB 设备:${NC}"
      while IFS= read -r line; do
        [[ -n "$line" ]] || continue
        printf ' - %s\n' "$line"
      done <<<"$ib_output"
    fi
  else
    echo -e " ${YELLOW}InfiniBand 状态:${NC} 未安装或未启用"
  fi

  # Condensed per-adapter summary. lshw reports the driver as "driver=<name>"
  # inside the "configuration:" line, not as a "driver:" field.
  all_drivers=$(lshw -C network 2>/dev/null | awk '
    /^[[:space:]]*\*-/        { in_eth = 0; next }
    /description: Ethernet/   { in_eth = 1; next }
    in_eth && /^[[:space:]]*(product|vendor):/ {
      sub(/^[[:space:]]+/, "")
      print " - " $0
      next
    }
    in_eth && /^[[:space:]]*configuration:/ {
      if (match($0, /driver=[^ ]+/)) {
        print " - driver: " substr($0, RSTART + 7, RLENGTH - 7)
      }
    }')
  if [[ -n "$all_drivers" ]]; then
    echo -e " ${YELLOW}所有网卡驱动信息:${NC}"
    printf '%s\n' "$all_drivers"
  fi
}

#######################################
# Succeed when a size string as printed by LVM/parted denotes zero
# ("0", "0.00m", "0.00GB", ...).
# Arguments: $1 - size string
# Returns:   0 if the size is zero, 1 otherwise
#######################################
_is_zero_size() {
  [[ "$1" =~ ^0+(\.0+)?[[:alpha:]]*$ ]]
}

#######################################
# Print volume-group details for an LVM-backed root filesystem.
# Arguments: $1 - device of the root filesystem (/dev/mapper/<vg>-<lv>)
# Outputs:   detail lines on stdout; nothing if the VG cannot be resolved
#######################################
_report_lvm_capacity() {
  local root_dev=$1
  local vg_name vg_free lv_size vg_size

  # lvs accepts the device-mapper path directly. Splitting the path on "/"
  # yields the literal "mapper", which is not an LV name.
  vg_name=$(lvs --noheadings -o vg_name "$root_dev" 2>/dev/null | tr -d ' ')
  [[ -n "$vg_name" ]] || return 0

  echo -e " ${YELLOW}卷组名称:${NC} ${vg_name}"

  # Free extents in the VG mean the LV can be grown.
  vg_free=$(vgs --noheadings -o vg_free "$vg_name" 2>/dev/null | tr -d ' ')
  if [[ -n "$vg_free" ]] && ! _is_zero_size "$vg_free"; then
    echo -e " ${YELLOW}卷组可用空间:${NC} ${vg_free} (可扩容)"
  else
    echo -e " ${YELLOW}卷组可用空间:${NC} ${vg_free} (无扩容空间)"
  fi

  # Does the root LV already span the whole VG?
  lv_size=$(lvs --noheadings -o lv_size "$root_dev" 2>/dev/null | tr -d ' ')
  vg_size=$(vgs --noheadings -o vg_size "$vg_name" 2>/dev/null | tr -d ' ')
  if [[ -n "$lv_size" && -n "$vg_size" ]]; then
    if [[ "$lv_size" == "$vg_size" ]]; then
      echo -e " ${YELLOW}逻辑卷状态:${NC} 已占满卷组全部空间"
    else
      echo -e " ${YELLOW}逻辑卷状态:${NC} 未占满卷组空间 (可扩容)"
    fi
  fi
}

#######################################
# Print unallocated space on the disk holding a plain (non-LVM) root
# partition.
# Arguments: $1 - device of the root filesystem
# Outputs:   one detail line on stdout; nothing if the parent disk is unknown
#######################################
_report_partition_free_space() {
  local root_dev=$1
  local parent_disk unallocated

  parent_disk=$(lsblk -no pkname "$root_dev" 2>/dev/null)
  [[ -n "$parent_disk" ]] || return 0

  # Third column of the last "Free Space" row is the size of that gap.
  unallocated=$(parted -s "/dev/${parent_disk}" unit GB print free 2>/dev/null \
    | grep "Free Space" | tail -n 1 | awk '{print $3}')
  if [[ -n "$unallocated" ]] && ! _is_zero_size "$unallocated"; then
    echo -e " ${YELLOW}分区未分配空间:${NC} ${unallocated} (可扩容)"
  else
    echo -e " ${YELLOW}分区未分配空间:${NC} 无 (不可扩容)"
  fi
}

#######################################
# Report root filesystem usage and whether it can be expanded (LVM free
# space or unallocated disk space).
# Outputs: status line plus detail lines on stdout
#######################################
check_root_partition() {
  local root_info root_fs

  echo -n "根分区检查："
  root_info=$(df -hP / 2>/dev/null \
    | awk 'NR==2 {print "总大小: " $2 ", 已用: " $3 ", 可用: " $4 ", 使用率: " $5}')
  if [[ -z "$root_info" ]]; then
    echo -e "${RED}失败${NC} 无法获取根分区信息"
    return 0
  fi
  echo -e "${GREEN}OK${NC} 信息：${YELLOW}${root_info}${NC}"

  # A root device under /dev/mapper/ is treated as LVM.
  root_fs=$(df -P / | tail -n 1 | awk '{print $1}')
  if [[ "$root_fs" == /dev/mapper/* ]]; then
    echo -e " ${YELLOW}根分区使用LVM:${NC} ${GREEN}是${NC}"
    _report_lvm_capacity "$root_fs"
  else
    echo -e " ${YELLOW}根分区使用LVM:${NC} ${RED}否${NC}"
    _report_partition_free_space "$root_fs"
  fi
}

#######################################
# List regular login accounts: 1000 <= UID < 65534 and a login shell that is
# not nologin/false. The filter is applied to the shell field only, so a
# user name or home directory containing "false" does not hide the account.
# Outputs: status line plus the space-separated user list on stdout
#######################################
check_normal_users() {
  local normal_users

  echo -n "普通用户检查："
  normal_users=$(getent passwd | awk -F: '
    $3 >= 1000 && $3 < 65534 && $7 !~ /(nologin|false)$/ {
      printf "%s%s", sep, $1
      sep = " "
    }')

  if [[ -n "$normal_users" ]]; then
    echo -e "${GREEN}OK${NC}"
    echo -e " ${YELLOW}已存在普通用户:${NC} ${normal_users}"
  else
    echo -e "${RED}失败${NC} 未找到普通用户"
  fi
}

#######################################
# Report the CUDA toolkit version from `nvcc --version`; when nvcc is not on
# PATH, additionally show /usr/local/cuda/version.txt if it exists.
# Outputs: status line (and optional detail line) on stdout
#######################################
check_cuda() {
  local cuda_version cuda_version_file

  echo -n "CUDA检查："
  if command -v nvcc &>/dev/null; then
    # "... release 12.2, V12.2.140" -> field 5 is "12.2," ; drop the comma.
    cuda_version=$(nvcc --version | grep -i release | awk '{print $5}' | tr -d ',')
    echo -e "${GREEN}OK${NC} 版本：${YELLOW}${cuda_version}${NC}"
  else
    echo -e "${RED}失败${NC} nvcc未找到"
    # Try the version file as a hint.
    if [[ -f /usr/local/cuda/version.txt ]]; then
      cuda_version_file=$(</usr/local/cuda/version.txt)
      echo -e " CUDA版本(从文件): ${YELLOW}${cuda_version_file}${NC}"
    fi
  fi
}

#######################################
# Report distribution name and running kernel release.
# Tries lsb_release, then PRETTY_NAME from /etc/os-release, then "未知".
# Outputs: one status line on stdout
#######################################
check_system_version() {
  local os_info kernel_info

  echo -n "系统版本检查："
  os_info=$(lsb_release -ds 2>/dev/null)
  if [[ -z "$os_info" && -r /etc/os-release ]]; then
    # os-release is shell-sourceable; source it in the substitution subshell
    # so its variables do not leak into this script.
    os_info=$(. /etc/os-release 2>/dev/null && printf '%s' "${PRETTY_NAME:-}")
  fi
  [[ -n "$os_info" ]] || os_info="未知"

  kernel_info=$(uname -r)
  echo -e "${GREEN}OK${NC} 系统：${YELLOW}${os_info}${NC}, 内核：${YELLOW}${kernel_info}${NC}"
}

#######################################
# Report whether the APT kernel pin file exists.
# Outputs: one status line on stdout
#######################################
check_kernel_lock() {
  echo -n "内核锁定检查："
  if [[ -f /etc/apt/preferences.d/kernel.pref ]]; then
    echo -e "${GREEN}OK${NC} 存在内核锁定配置"
  else
    echo -e "${RED}失败${NC} 未找到内核锁定配置"
  fi
}

#######################################
# Report whether the nvidia-fabricmanager systemd unit is active.
# Outputs: one status line on stdout
#######################################
check_fabricmanager() {
  echo -n "nvidia-fabricmanager检查："
  if systemctl is-active --quiet nvidia-fabricmanager 2>/dev/null; then
    echo -e "${GREEN}OK${NC} 服务正在运行"
  else
    echo -e "${RED}失败${NC} 服务未运行"
  fi
}

#######################################
# Shared implementation for the exporter checks: the binary must be on PATH;
# the systemd unit state is then shown as running / installed-but-stopped.
# Arguments: $1 - label printed before "检查："
#            $2 - executable name looked up on PATH
#            $3 - systemd unit name
# Outputs:   one status line on stdout
#######################################
_check_exporter() {
  local label=$1 binary=$2 unit=$3
  local status

  echo -n "${label}检查："
  if ! command -v "$binary" &>/dev/null; then
    echo -e "${RED}失败${NC} 未安装"
    return 0
  fi

  if systemctl is-active --quiet "$unit" 2>/dev/null; then
    status="${GREEN}运行中${NC}"
  else
    status="${YELLOW}已安装但未运行${NC}"
  fi
  echo -e "${GREEN}OK${NC} 状态：${status}"
}

# Report node_exporter installation and service state.
check_node_exporter() {
  _check_exporter "node_exporter" node_exporter node_exporter
}

# Report dcgm-exporter installation and service state.
check_dcgm_exporter() {
  _check_exporter "dcgm_exporter" dcgm-exporter dcgm-exporter
}

#######################################
# Entry point: print the banner, run every check in order, print the footer.
#######################################
main() {
  echo -e "${MAGENTA}========================================${NC}"
  echo -e "${MAGENTA} 系统硬件和软件检查工具 ${NC}"
  echo -e "${MAGENTA}========================================${NC}"

  check_gpu_driver
  check_network_driver
  check_root_partition
  check_normal_users
  # check_ubuntu_user is not defined in this script; run it only if some
  # wrapper provides it, instead of failing with "command not found".
  if declare -F check_ubuntu_user >/dev/null; then
    check_ubuntu_user
  fi
  check_cuda
  check_system_version
  check_kernel_lock
  check_fabricmanager
  check_node_exporter
  check_dcgm_exporter

  echo -e "${MAGENTA}========================================${NC}"
  echo -e "${MAGENTA} 检查完成! ${NC}"
  echo -e "${MAGENTA}========================================${NC}"
}

main "$@"