ansible-devops/scripts/install-checker.sh

234 lines
8.4 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# ANSI color escape sequences used by the checks below. They are stored as
# literal backslash sequences and only become real escapes when printed
# through `echo -e`.
NC='\033[0m' # reset to the terminal's default color
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
MAGENTA='\033[0;35m'
CYAN='\033[0;36m'
WHITE='\033[0;37m'
# Print a section title framed by two horizontal rules, all in cyan.
# Arguments: $1 - title text (escape sequences in it are interpreted by echo -e)
# Outputs:   three lines on stdout
print_title() {
  local rule="${CYAN}========================================${NC}"
  local row
  for row in "$rule" "${CYAN}$1${NC}" "$rule"; do
    echo -e "$row"
  done
}
# Check the NVIDIA GPU driver.
# Globals:  RED, GREEN, YELLOW, NC (read)
# Outputs:  one status line on stdout:
#             - OK plus the driver version when nvidia-smi reports one
#             - failure when nvidia-smi is missing, or when it is present but
#               no driver version can be obtained (e.g. kernel module not loaded)
check_gpu_driver() {
  local driver_version=""

  echo -n "显卡驱动检查:"
  if ! command -v nvidia-smi &>/dev/null; then
    echo -e "${RED}失败${NC} 未找到nvidia-smi命令"
    return
  fi

  # Ask for the field directly instead of scraping the banner by column
  # position. Only accept something that looks like a version so that an error
  # message printed by nvidia-smi is never mistaken for one.
  driver_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null \
    | grep -Eom1 '^[0-9]+(\.[0-9]+)+')

  # Fallback: parse the "Driver Version: x.y.z" token from the banner.
  if [ -z "$driver_version" ]; then
    driver_version=$(nvidia-smi 2>/dev/null \
      | sed -n 's/.*Driver Version: *\([0-9][0-9.]*\).*/\1/p' \
      | head -n 1)
  fi

  if [ -n "$driver_version" ]; then
    echo -e "${GREEN}OK${NC} 版本:${YELLOW}${driver_version}${NC}"
  else
    # nvidia-smi exists but cannot report a version: do not claim success.
    echo -e "${RED}失败${NC} nvidia-smi无法获取驱动版本"
  fi
}
# Check network drivers: the Mellanox mlx5_core module (used by CX7 NICs),
# the InfiniBand device-to-netdev mapping, and a condensed lshw summary.
# Globals:  RED, GREEN, YELLOW, NC (read)
# Outputs:  status lines on stdout
check_network_driver() {
  local mlx5_version ib_links ib_line all_drivers

  echo -n "网卡驱动检查:"
  if modinfo mlx5_core &>/dev/null; then
    # `-F version` prints only the "version" field. Grepping the full modinfo
    # output for "version" also matches "srcversion" and yields two values.
    mlx5_version=$(modinfo -F version mlx5_core 2>/dev/null | head -n 1)
    echo -e "${GREEN}OK${NC}"
    # A module may not declare a version at all; show "unknown" in that case.
    echo -e " ${YELLOW}Mellanox CX7 驱动版本:${NC} ${mlx5_version:-未知}"
  else
    echo -e "${RED}失败${NC} 未找到mlx5_core驱动"
  fi

  # InfiniBand tooling: list every device mapping exactly once.
  if command -v ibdev2netdev &>/dev/null; then
    echo -e " ${YELLOW}InfiniBand 状态:${NC} 已安装"
    # Run the tool once and print its lines as-is. Re-grepping per device name
    # duplicated entries, because "mlx5_1" also matches "mlx5_10".
    ib_links=$(ibdev2netdev 2>/dev/null)
    if [ -n "$ib_links" ]; then
      echo -e " ${YELLOW}IB 设备:${NC}"
      while IFS= read -r ib_line; do
        [ -n "$ib_line" ] || continue
        echo -e " - ${ib_line}"
      done <<<"$ib_links"
    fi
  else
    echo -e " ${YELLOW}InfiniBand 状态:${NC} 未安装或未启用"
  fi

  # Condensed driver info for all Ethernet devices. The pipeline joins the
  # matching lshw lines with ';' and then turns each "; " back into a bullet.
  all_drivers=$(lshw -C network 2>/dev/null \
    | grep -A5 'description: Ethernet' \
    | grep -e 'product:' -e 'vendor:' -e 'driver:' \
    | tr '\n' '; ' \
    | sed 's/; /\n - /g')
  if [ -n "$all_drivers" ]; then
    echo -e " ${YELLOW}所有网卡驱动信息:${NC}"
    echo -e " - ${all_drivers}"
  fi
}
# Report LVM details for a root filesystem that lives on a device-mapper node.
# Arguments: $1 - root device path as printed by df (e.g. /dev/mapper/vg-lv)
# Outputs:   volume group name, free space, and whether the LV fills the VG
_root_partition_lvm_report() {
  local root_dev=$1
  local vg_name vg_free vg_free_extents lv_bytes vg_bytes

  echo -e " ${YELLOW}根分区使用LVM:${NC} ${GREEN}是${NC}"

  # Pass the device path itself to lvs. Cutting a name out of
  # "/dev/mapper/<vg>-<lv>" by '/' yields "mapper", which lvs cannot resolve.
  vg_name=$(lvs --noheadings -o vg_name "$root_dev" 2>/dev/null | tr -d ' ')
  [ -n "$vg_name" ] || return 0
  echo -e " ${YELLOW}卷组名称:${NC} ${vg_name}"

  # Human-readable size for display; free extent count for the decision, so
  # the result does not depend on how LVM formats a zero size.
  vg_free=$(vgs --noheadings -o vg_free "$vg_name" 2>/dev/null | tr -d ' ')
  vg_free_extents=$(vgs --noheadings -o vg_free_count "$vg_name" 2>/dev/null | tr -d ' ')
  if [[ "$vg_free_extents" =~ ^[0-9]+$ ]] && [ "$vg_free_extents" -gt 0 ]; then
    echo -e " ${YELLOW}卷组可用空间:${NC} ${vg_free} (可扩容)"
  else
    echo -e " ${YELLOW}卷组可用空间:${NC} ${vg_free} (无扩容空间)"
  fi

  # Compare exact byte counts; rounded human-readable sizes can collide.
  lv_bytes=$(lvs --noheadings --units b --nosuffix -o lv_size "$root_dev" 2>/dev/null | tr -d ' ')
  vg_bytes=$(vgs --noheadings --units b --nosuffix -o vg_size "$vg_name" 2>/dev/null | tr -d ' ')
  if [ -n "$lv_bytes" ] && [ -n "$vg_bytes" ]; then
    if [ "$lv_bytes" = "$vg_bytes" ]; then
      echo -e " ${YELLOW}逻辑卷状态:${NC} 已占满卷组全部空间"
    else
      echo -e " ${YELLOW}逻辑卷状态:${NC} 未占满卷组空间 (可扩容)"
    fi
  fi
}

# Report unallocated disk space for a root filesystem on a plain partition.
# Arguments: $1 - root device path as printed by df (e.g. /dev/sda2)
# Outputs:   size of the last "Free Space" region on the parent disk, if any
_root_partition_plain_report() {
  local root_dev=$1
  local parent_disk unallocated

  echo -e " ${YELLOW}根分区使用LVM:${NC} ${RED}否${NC}"

  parent_disk=$(lsblk -no pkname "$root_dev" 2>/dev/null | head -n 1)
  [ -n "$parent_disk" ] || return 0

  # Size column ($3) of the last "Free Space" row printed by parted.
  unallocated=$(parted -s "/dev/${parent_disk}" unit GB print free \
    | awk '/Free Space/ { size = $3 } END { print size }')
  if [ -n "$unallocated" ] && [ "$unallocated" != "0.00GB" ]; then
    echo -e " ${YELLOW}分区未分配空间:${NC} ${unallocated} (可扩容)"
  else
    echo -e " ${YELLOW}分区未分配空间:${NC} 无 (不可扩容)"
  fi
}

# Check the root filesystem: size/usage summary, whether it sits on LVM, and
# whether there is room to grow it.
# Globals:  RED, GREEN, YELLOW, NC (read)
# Outputs:  status lines on stdout
check_root_partition() {
  local root_info root_dev

  echo -n "根分区检查:"
  # -P keeps each filesystem on a single line even for long device names.
  root_info=$(df -Ph / | awk 'NR==2 {print "总大小: " $2 ", 已用: " $3 ", 可用: " $4 ", 使用率: " $5}')
  if [ -z "$root_info" ]; then
    echo -e "${RED}失败${NC} 无法获取根分区信息"
    return
  fi
  echo -e "${GREEN}OK${NC} 信息:${YELLOW}${root_info}${NC}"

  root_dev=$(df -P / | awk 'NR==2 {print $1}')
  if [[ "$root_dev" == /dev/mapper/* ]]; then
    _root_partition_lvm_report "$root_dev"
  else
    _root_partition_plain_report "$root_dev"
  fi
}
# Check that at least one regular login user exists.
# A regular user here is a passwd entry with 1000 <= UID < 65534 whose login
# shell does not end in "nologin" or "false".
# Globals:  RED, GREEN, YELLOW, NC (read)
# Outputs:  status line, plus a comma-separated list of matching user names
check_normal_users() {
  local normal_users

  echo -n "普通用户检查:"
  # Test only the shell field ($7). Matching the whole record would also drop
  # users whose name, comment or home directory happens to contain
  # "false"/"nologin". Names are joined on one line so the list stays readable.
  normal_users=$(getent passwd | awk -F: '
    $3 >= 1000 && $3 < 65534 && $7 !~ /(nologin|false)$/ {
      printf "%s%s", sep, $1
      sep = ", "
    }')

  if [ -n "$normal_users" ]; then
    echo -e "${GREEN}OK${NC}"
    echo -e " ${YELLOW}已存在普通用户:${NC} ${normal_users}"
  else
    echo -e "${RED}失败${NC} 未找到普通用户"
  fi
}
# Check the CUDA toolkit via nvcc.
# Globals:  RED, GREEN, YELLOW, NC (read)
# Outputs:  OK plus the toolkit release when nvcc is on PATH; otherwise a
#           failure line, followed by the contents of
#           /usr/local/cuda/version.txt when that file exists
check_cuda() {
  local cuda_version cuda_version_file

  # The label carries the same "：" separator as the other checks so the
  # status does not run into it.
  echo -n "CUDA检查:"
  if command -v nvcc &>/dev/null; then
    # Extract the number after "release" instead of relying on field position.
    cuda_version=$(nvcc --version 2>/dev/null \
      | sed -n 's/.*release \([0-9][0-9.]*\).*/\1/p' \
      | head -n 1)
    echo -e "${GREEN}OK${NC} 版本:${YELLOW}${cuda_version:-未知}${NC}"
  else
    echo -e "${RED}失败${NC} nvcc未找到"
    # Try to get the CUDA version from the toolkit's version file instead.
    if [ -f /usr/local/cuda/version.txt ]; then
      cuda_version_file=$(cat /usr/local/cuda/version.txt)
      echo -e " CUDA版本(从文件): ${YELLOW}${cuda_version_file}${NC}"
    fi
  fi
}
# Report the OS description and the running kernel release.
# The OS description comes from `lsb_release -ds`, then from PRETTY_NAME in
# /etc/os-release, and finally falls back to "未知" (unknown).
# Globals:  GREEN, YELLOW, NC (read)
# Outputs:  one status line on stdout
check_system_version() {
  local os_info kernel_info

  echo -n "系统版本检查:"

  # Each source is tried explicitly. Chaining them as `a || b | c || d` never
  # reaches the last fallback, because the pipeline's status is that of its
  # final stage, which succeeds even on empty input.
  os_info=$(lsb_release -ds 2>/dev/null)
  if [ -z "$os_info" ]; then
    os_info=$(grep -m1 '^PRETTY_NAME=' /etc/os-release 2>/dev/null)
    os_info=${os_info#PRETTY_NAME=}
  fi
  # Either source may wrap the value in double quotes.
  os_info=${os_info#\"}
  os_info=${os_info%\"}
  [ -n "$os_info" ] || os_info="未知"

  kernel_info=$(uname -r)
  echo -e "${GREEN}OK${NC} 系统:${YELLOW}${os_info}${NC}, 内核:${YELLOW}${kernel_info}${NC}"
}
# Check whether the kernel is pinned, judged by the presence of the apt
# preferences file /etc/apt/preferences.d/kernel.pref.
# Globals:  RED, GREEN, NC (read)
# Outputs:  one status line on stdout
check_kernel_lock() {
  local pref_file=/etc/apt/preferences.d/kernel.pref

  echo -n "内核锁定检查:"
  if [ ! -f "$pref_file" ]; then
    echo -e "${RED}失败${NC} 未找到内核锁定配置"
    return
  fi
  echo -e "${GREEN}OK${NC} 存在内核锁定配置"
}
# Check that the nvidia-fabricmanager systemd service is active.
# Globals:  RED, GREEN, NC (read)
# Outputs:  one status line on stdout
check_fabricmanager() {
  # The label carries the same "：" separator as the other checks so the
  # status does not run into it.
  echo -n "nvidia-fabricmanager检查:"
  # stderr is silenced so a missing systemctl does not garble the status line.
  if systemctl is-active --quiet nvidia-fabricmanager 2>/dev/null; then
    echo -e "${GREEN}OK${NC} 服务正在运行"
  else
    echo -e "${RED}失败${NC} 服务未运行"
  fi
}
# Check that node_exporter is installed (found on PATH) and report whether its
# systemd service is active.
# Globals:  RED, GREEN, YELLOW, NC (read)
# Outputs:  one status line on stdout
check_node_exporter() {
  local svc_state

  # The label carries the same "：" separator as the other checks so the
  # status does not run into it.
  echo -n "node_exporter检查:"
  if ! command -v node_exporter &>/dev/null; then
    echo -e "${RED}失败${NC} 未安装"
    return
  fi

  # Plain if/else: with `a && b || c`, c would also run whenever b fails.
  if systemctl is-active --quiet node_exporter 2>/dev/null; then
    svc_state="${GREEN}运行中${NC}"
  else
    svc_state="${YELLOW}已安装但未运行${NC}"
  fi
  echo -e "${GREEN}OK${NC} 状态:${svc_state}"
}
# Check that dcgm-exporter is installed (found on PATH) and report whether its
# systemd service is active.
# Globals:  RED, GREEN, YELLOW, NC (read)
# Outputs:  one status line on stdout
check_dcgm_exporter() {
  local svc_state

  # The label carries the same "：" separator as the other checks so the
  # status does not run into it.
  echo -n "dcgm_exporter检查:"
  if ! command -v dcgm-exporter &>/dev/null; then
    echo -e "${RED}失败${NC} 未安装"
    return
  fi

  # Plain if/else: with `a && b || c`, c would also run whenever b fails.
  if systemctl is-active --quiet dcgm-exporter 2>/dev/null; then
    svc_state="${GREEN}运行中${NC}"
  else
    svc_state="${YELLOW}已安装但未运行${NC}"
  fi
  echo -e "${GREEN}OK${NC} 状态:${svc_state}"
}
# Entry point: print a banner, run every check in order, print a footer.
# Globals:  MAGENTA, NC (read)
# Outputs:  banner, the output of each check, and a completion footer
main() {
  local banner="${MAGENTA}========================================${NC}"

  echo -e "$banner"
  echo -e "${MAGENTA} 系统硬件和软件检查工具 ${NC}"
  echo -e "$banner"

  check_gpu_driver
  check_network_driver
  check_root_partition
  check_normal_users
  # check_ubuntu_user is not defined in this script, so calling it
  # unconditionally prints "command not found" on every run. Run it only when
  # something else (e.g. a sourced file) provides it.
  if declare -F check_ubuntu_user >/dev/null; then
    check_ubuntu_user
  fi
  check_cuda
  check_system_version
  check_kernel_lock
  check_fabricmanager
  check_node_exporter
  check_dcgm_exporter

  echo -e "$banner"
  echo -e "${MAGENTA} 检查完成! ${NC}"
  echo -e "$banner"
}
# Run the checks. main does not read its positional parameters; they are
# forwarded only by convention.
main "$@"