ansible-devops/scripts/install-checker.sh

234 lines
8.4 KiB
Bash
Raw Normal View History

2025-07-15 18:09:02 +08:00
#!/bin/bash
# ANSI color escape sequences. They are stored as literal backslash strings
# and only turned into real escape bytes when printed with `echo -e`.
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[0;33m"
BLUE="\033[0;34m"
MAGENTA="\033[0;35m"
CYAN="\033[0;36m"
WHITE="\033[0;37m"
NC="\033[0m" # "no color": resets the terminal to its default color
# Print a section title framed by two rule lines, all in cyan.
# Globals:   CYAN, NC (read)
# Arguments: $1 - title text (backslash escapes in it are expanded by echo -e)
# Outputs:   three lines on stdout
print_title() {
  local rule='========================================'
  local row
  for row in "$rule" "$1" "$rule"; do
    echo -e "${CYAN}${row}${NC}"
  done
}
# Check the NVIDIA GPU driver by asking nvidia-smi for the driver version.
# Globals:  RED, GREEN, YELLOW, NC (read)
# Outputs:  one status line on stdout: OK plus the version, or a failure
#           reason when nvidia-smi is missing or yields no version.
# Returns:  0 always; the result is conveyed only through the printed text.
check_gpu_driver() {
  local driver_version="" query_out=""

  echo -n "显卡驱动检查:"
  if ! command -v nvidia-smi &>/dev/null; then
    echo -e "${RED}失败${NC} 未找到nvidia-smi命令"
    return 0
  fi

  # Preferred source: the machine-readable query. It prints one line per GPU;
  # only the first line is used.
  if query_out=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null); then
    driver_version=$(head -n 1 <<<"$query_out" | tr -d '[:space:]')
  fi

  # Fallback: locate the "Driver Version:" label in the banner and take the
  # token right after it, instead of relying on a fixed column number.
  if [[ -z "$driver_version" ]]; then
    driver_version=$(nvidia-smi 2>/dev/null | awk '
      tolower($0) ~ /driver version/ {
        for (i = 1; i + 2 <= NF; i++) {
          if (tolower($i) == "driver" && tolower($(i + 1)) == "version:") {
            print $(i + 2)
            exit
          }
        }
      }')
  fi

  # nvidia-smi being on PATH does not prove the driver works (e.g. it may be
  # unable to talk to the kernel module), so an empty version is a failure.
  if [[ -n "$driver_version" ]]; then
    echo -e "${GREEN}OK${NC} 版本:${YELLOW}${driver_version}${NC}"
  else
    echo -e "${RED}失败${NC} nvidia-smi未返回驱动版本(驱动可能未加载)"
  fi
  return 0
}
# Check NIC drivers: the mlx5_core kernel module, InfiniBand device mapping,
# and a per-adapter summary of Ethernet devices reported by lshw.
# Globals:  RED, GREEN, YELLOW, NC (read)
# Outputs:  a multi-line report on stdout
# Returns:  0 always; results are conveyed only through the printed text.
check_network_driver() {
  local mlx5_version="" ib_links="" link="" nic_summary=""

  echo -n "网卡驱动检查:"

  # mlx5_core module version (labelled as the CX7 driver in the report).
  if modinfo mlx5_core &>/dev/null; then
    # -F prints only the requested field, so the "srcversion:" and "vermagic:"
    # lines can never be mistaken for the version.
    mlx5_version=$(modinfo -F version mlx5_core 2>/dev/null | head -n 1)
    echo -e "${GREEN}OK${NC}"
    # The module may carry no version field at all; show a placeholder then.
    echo -e " ${YELLOW}Mellanox CX7 驱动版本:${NC} ${mlx5_version:-未知}"
  else
    echo -e "${RED}失败${NC} 未找到mlx5_core驱动"
  fi

  # InfiniBand tooling and device-to-netdev mapping.
  if command -v ibdev2netdev &>/dev/null; then
    echo -e " ${YELLOW}InfiniBand 状态:${NC} 已安装"
    # Run the tool once and print each mapping line as-is; filtering by device
    # name with grep would substring-match (mlx5_1 also hits mlx5_10).
    ib_links=$(ibdev2netdev 2>/dev/null)
    if [[ -n "$ib_links" ]]; then
      echo -e " ${YELLOW}IB 设备:${NC}"
      while IFS= read -r link; do
        if [[ -n "$link" ]]; then
          printf ' - %s\n' "$link"
        fi
      done <<<"$ib_links"
    fi
  else
    echo -e " ${YELLOW}InfiniBand 状态:${NC} 未安装或未启用"
  fi

  # One summary line per Ethernet adapter. lshw reports the driver as
  # "driver=<name>" inside the "configuration:" line, so the output is parsed
  # per "*-network" section rather than by a fixed number of following lines.
  nic_summary=$(lshw -C network 2>/dev/null | awk '
    function emit() {
      if (is_eth) {
        printf " - product: %s; vendor: %s; driver: %s\n",
          (product != "" ? product : "?"),
          (vendor != "" ? vendor : "?"),
          (driver != "" ? driver : "?")
      }
      is_eth = 0; product = ""; vendor = ""; driver = ""
    }
    { sub(/^[ \t]+/, "") }
    /^\*-/ { emit(); next }
    /^description: Ethernet/ { is_eth = 1; next }
    /^product: / { product = substr($0, 10); next }
    /^vendor: / { vendor = substr($0, 9); next }
    /^configuration: / {
      if (match($0, /driver=[^ ]+/)) {
        driver = substr($0, RSTART + 7, RLENGTH - 7)
      }
      next
    }
    END { emit() }')
  if [[ -n "$nic_summary" ]]; then
    echo -e " ${YELLOW}所有网卡驱动信息:${NC}"
    printf '%s\n' "$nic_summary"
  fi
  return 0
}
# Report root filesystem usage and whether it could be grown, either from free
# space in its LVM volume group or from unallocated space on its parent disk.
# Globals:  RED, GREEN, YELLOW, NC (read)
# Outputs:  a multi-line report on stdout
# Returns:  0 always; results are conveyed only through the printed text.
check_root_partition() {
  local root_info="" root_fs="" parent_disk="" unallocated=""
  local vg_name="" lv_bytes="" vg_bytes="" free_bytes="" vg_free=""

  echo -n "根分区检查:"
  root_info=$(df -h / | awk 'NR==2 {print "总大小: " $2 ", 已用: " $3 ", 可用: " $4 ", 使用率: " $5}')
  if [[ -z "$root_info" ]]; then
    echo -e "${RED}失败${NC} 无法获取根分区信息"
    return 0
  fi
  echo -e "${GREEN}OK${NC} 信息:${YELLOW}${root_info}${NC}"

  # Device backing "/"; a /dev/mapper/ path is treated as an LVM volume.
  root_fs=$(df -P / | tail -n 1 | awk '{print $1}')

  if [[ "$root_fs" == /dev/mapper/* ]]; then
    echo -e " ${YELLOW}根分区使用LVM:${NC} ${GREEN}是${NC}"

    # Query by device path: splitting the path on "/" would yield "mapper",
    # not an LV name. Sizes are requested in plain bytes so they can be
    # compared numerically instead of as rounded display strings.
    read -r vg_name lv_bytes vg_bytes free_bytes < <(
      lvs --noheadings --units b --nosuffix \
        -o vg_name,lv_size,vg_size,vg_free "$root_fs" 2>/dev/null
    )
    if [[ -z "$vg_name" ]]; then
      return 0
    fi
    echo -e " ${YELLOW}卷组名称:${NC} ${vg_name}"

    # Human-readable free space, for display only.
    vg_free=$(vgs --noheadings -o vg_free "$vg_name" 2>/dev/null | tr -d ' ')
    if [[ "$free_bytes" =~ ^[0-9]+$ ]] && (( free_bytes > 0 )); then
      echo -e " ${YELLOW}卷组可用空间:${NC} ${vg_free} (可扩容)"
    else
      echo -e " ${YELLOW}卷组可用空间:${NC} ${vg_free} (无扩容空间)"
    fi

    # Does the logical volume already span the whole volume group?
    if [[ "$lv_bytes" =~ ^[0-9]+$ && "$vg_bytes" =~ ^[0-9]+$ ]]; then
      if (( lv_bytes == vg_bytes )); then
        echo -e " ${YELLOW}逻辑卷状态:${NC} 已占满卷组全部空间"
      else
        echo -e " ${YELLOW}逻辑卷状态:${NC} 未占满卷组空间 (可扩容)"
      fi
    fi
    return 0
  fi

  echo -e " ${YELLOW}根分区使用LVM:${NC} ${RED}否${NC}"
  # Plain partition: look at the last "Free Space" row parted reports for the
  # parent disk.
  parent_disk=$(lsblk -no pkname "$root_fs" 2>/dev/null)
  if [[ -z "$parent_disk" ]]; then
    return 0
  fi
  unallocated=$(parted -s "/dev/${parent_disk}" unit GB print free 2>/dev/null \
    | grep "Free Space" | tail -n 1 | awk '{print $3}')
  if [[ -n "$unallocated" && "$unallocated" != "0.00GB" ]]; then
    echo -e " ${YELLOW}分区未分配空间:${NC} ${unallocated} (可扩容)"
  else
    echo -e " ${YELLOW}分区未分配空间:${NC} 无 (不可扩容)"
  fi
  return 0
}
# List regular (non-system) accounts.
# An account counts as regular when its UID is in [1000, 65534) and its passwd
# entry contains neither "nologin" nor "false" anywhere on the line.
# Globals:  RED, GREEN, YELLOW, NC (read); normal_users (written)
# Outputs:  status line on stdout, followed by the account names when found
check_normal_users() {
  echo -n "普通用户检查:"
  normal_users=$(getent passwd | awk -F: '
    $3 < 1000 || $3 >= 65534 { next }
    /nologin|false/ { next }
    { print $1 }')
  if [ -z "$normal_users" ]; then
    echo -e "${RED}失败${NC} 未找到普通用户"
    return
  fi
  echo -e "${GREEN}OK${NC}"
  echo -e " ${YELLOW}已存在普通用户:${NC} ${normal_users}"
}
# Check the CUDA toolkit by asking nvcc for its release number.
# nvcc is looked up on PATH first, then under the toolkit directory, because
# shells that do not source the user's profile often lack <cuda>/bin on PATH.
# Globals:  RED, GREEN, YELLOW, NC (read)
#           CUDA_HOME (read, optional) - toolkit directory,
#           defaults to /usr/local/cuda
# Outputs:  status line on stdout; when nvcc is missing, also the contents of
#           <cuda>/version.txt if that file exists
# Returns:  0 always; results are conveyed only through the printed text.
check_cuda() {
  local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
  local nvcc_bin="" cuda_version="" cuda_version_file=""

  echo -n "CUDA检查"
  if command -v nvcc &>/dev/null; then
    nvcc_bin="nvcc"
  elif [[ -x "${cuda_home}/bin/nvcc" ]]; then
    nvcc_bin="${cuda_home}/bin/nvcc"
  fi

  if [[ -n "$nvcc_bin" ]]; then
    # The release line looks like "... release 12.2, V12.2.140"; the fifth
    # field is the release number with a trailing comma.
    cuda_version=$("$nvcc_bin" --version 2>/dev/null \
      | grep -i release | awk '{print $5}' | tr -d ',')
    if [[ "$nvcc_bin" == "nvcc" ]]; then
      echo -e "${GREEN}OK${NC} 版本:${YELLOW}${cuda_version}${NC}"
    else
      # Installed but not on PATH: still a pass, with a hint for the operator.
      echo -e "${GREEN}OK${NC} 版本:${YELLOW}${cuda_version}${NC} (nvcc不在PATH中: ${nvcc_bin})"
    fi
    return 0
  fi

  echo -e "${RED}失败${NC} nvcc未找到"
  # Last resort: a version file left in the toolkit directory.
  if [[ -f "${cuda_home}/version.txt" ]]; then
    cuda_version_file=$(<"${cuda_home}/version.txt")
    echo -e " CUDA版本(从文件): ${YELLOW}${cuda_version_file}${NC}"
  fi
  return 0
}
# Report the OS description and the running kernel release.
# The OS description comes from `lsb_release -ds`, else from PRETTY_NAME in
# the os-release file, else the placeholder "未知".
# Globals:  GREEN, YELLOW, NC (read)
#           OS_RELEASE_FILE (read, optional) - os-release path,
#           defaults to /etc/os-release
# Outputs:  one status line on stdout
# Returns:  0 always.
check_system_version() {
  local os_release_file="${OS_RELEASE_FILE:-/etc/os-release}"
  local os_info="" kernel_info=""

  echo -n "系统版本检查:"

  # Each source is tried explicitly. Chaining them with `||` after a pipeline
  # does not work: the pipeline's status is that of its last command, so the
  # final placeholder would never be reached.
  if command -v lsb_release &>/dev/null; then
    os_info=$(lsb_release -ds 2>/dev/null)
    # Drop surrounding double quotes, if the tool printed any.
    os_info="${os_info#\"}"
    os_info="${os_info%\"}"
  fi
  if [[ -z "$os_info" && -r "$os_release_file" ]]; then
    os_info=$(sed -n 's/^PRETTY_NAME=//p' "$os_release_file" | head -n 1 | tr -d '"')
  fi
  os_info="${os_info:-未知}"

  kernel_info=$(uname -r)
  echo -e "${GREEN}OK${NC} 系统:${YELLOW}${os_info}${NC}, 内核:${YELLOW}${kernel_info}${NC}"
}
# Check whether the kernel is pinned, judged solely by the presence of the
# APT preferences file /etc/apt/preferences.d/kernel.pref.
# Globals:  RED, GREEN, NC (read)
# Outputs:  one status line on stdout
check_kernel_lock() {
  echo -n "内核锁定检查:"
  if [ ! -f /etc/apt/preferences.d/kernel.pref ]; then
    echo -e "${RED}失败${NC} 未找到内核锁定配置"
    return
  fi
  echo -e "${GREEN}OK${NC} 存在内核锁定配置"
}
# Check that the nvidia-fabricmanager systemd unit is active.
# Globals:  RED, GREEN, NC (read)
# Outputs:  one status line on stdout
check_fabricmanager() {
  echo -n "nvidia-fabricmanager检查"
  systemctl is-active --quiet nvidia-fabricmanager || {
    echo -e "${RED}失败${NC} 服务未运行"
    return
  }
  echo -e "${GREEN}OK${NC} 服务正在运行"
}
# Check node_exporter: the command must be resolvable, and the state of the
# node_exporter systemd unit is reported alongside.
# Globals:  RED, GREEN, YELLOW, NC (read); status (written)
# Outputs:  one status line on stdout
check_node_exporter() {
  echo -n "node_exporter检查"
  if ! command -v node_exporter &>/dev/null; then
    echo -e "${RED}失败${NC} 未安装"
    return
  fi
  # Installed either way; an inactive unit is only flagged in yellow.
  if systemctl is-active --quiet node_exporter; then
    status="${GREEN}运行中${NC}"
  else
    status="${YELLOW}已安装但未运行${NC}"
  fi
  echo -e "${GREEN}OK${NC} 状态:${status}"
}
# Check dcgm-exporter: the command must be resolvable, and the state of the
# dcgm-exporter systemd unit is reported alongside.
# Globals:  RED, GREEN, YELLOW, NC (read); status (written)
# Outputs:  one status line on stdout
check_dcgm_exporter() {
  echo -n "dcgm_exporter检查"
  if ! command -v dcgm-exporter &>/dev/null; then
    echo -e "${RED}失败${NC} 未安装"
    return
  fi
  # Installed either way; an inactive unit is only flagged in yellow.
  if systemctl is-active --quiet dcgm-exporter; then
    status="${GREEN}运行中${NC}"
  else
    status="${YELLOW}已安装但未运行${NC}"
  fi
  echo -e "${GREEN}OK${NC} 状态:${status}"
}
# Entry point: print the banner, run every check in order, print the footer.
# Globals:  MAGENTA, NC (read)
# Outputs:  the full report on stdout
main() {
  local rule="${MAGENTA}========================================${NC}"

  echo -e "$rule"
  echo -e "${MAGENTA} 系统硬件和软件检查工具 ${NC}"
  echo -e "$rule"

  check_gpu_driver
  check_network_driver
  check_root_partition
  check_normal_users # regular-account check
  # check_ubuntu_user has no definition in this script; only call it when one
  # has been provided, instead of emitting "command not found" mid-report.
  if declare -F check_ubuntu_user >/dev/null; then
    check_ubuntu_user
  fi
  check_cuda
  check_system_version
  check_kernel_lock
  check_fabricmanager
  check_node_exporter
  check_dcgm_exporter

  echo -e "$rule"
  echo -e "${MAGENTA} 检查完成! ${NC}"
  echo -e "$rule"
}
# Run the entry point, forwarding any command-line arguments.
main "$@"