ansible-devops/scripts/node-exporter.sh

350 lines
11 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
#========================================================================
# Node Exporter install/uninstall script - final optimized edition
# Purpose: automate installing and uninstalling the Prometheus Node
#          Exporter on Linux systems
# Version: 1.3.3
#========================================================================
# Strict mode: abort on command errors, on unset variables, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail
IFS=$'\n\t'
#================================
# Global configuration
#================================
# Application identity
APP_NAME="node_exporter"
APP_USER="node_exporter"
APP_VERSION="1.8.2"
# Filesystem layout, derived from APP_NAME
APP_DIR="/opt/$APP_NAME"
BINARY_DIR="$APP_DIR/bin"
SYSTEMD_SERVICE_FILE="/lib/systemd/system/$APP_NAME.service"
# Value passed to --web.listen-address in the generated systemd unit
WEB_LISTEN_ADDRESS=":10086"
# One timestamped log file per run
LOG_FILE="/var/log/node_exporter_install_$(date '+%Y%m%d%H%M%S').log"
# Color definitions (stored as literal escape text, rendered by `echo -e`)
GREEN='\033[1;32m' # green  - success
RED='\033[1;31m'   # red    - failure/error
BLUE='\033[1;34m'  # blue   - titles/info
NC='\033[0m'       # reset color
#================================
# Logging helper
#================================
# Print a colored, level-tagged message on stdout and append a timestamped
# copy to the log file.
# Globals:   GREEN, RED, BLUE, NC, LOG_FILE (read)
# Arguments: $1 - level tag; SUCCESS is printed green, ERROR red, and any
#                 other tag blue
#            $2 - message text (backslash escapes are rendered on the
#                 console because it is printed with `echo -e`)
# Outputs:   colored line on stdout; "[timestamp] [level] message" appended
#            to $LOG_FILE
# Returns:   0 always - a log file that cannot be written must not abort
#            the script under `set -e`
log() {
  local level="$1"
  local message="$2"
  # Declared separately from the assignment so `local` cannot mask a
  # failing command substitution.
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S') || timestamp="unknown-time"

  # Console output - only SUCCESS and ERROR use a special color
  local color
  case "$level" in
    "SUCCESS") color="$GREEN" ;;
    "ERROR") color="$RED" ;;
    *) color="$BLUE" ;;
  esac
  echo -e "${color}[${level}]${NC} $message"

  # File output. The redirection is wrapped in a group so that a failing
  # open (missing directory, no permission) is caught here instead of
  # killing the caller; warn once on stderr rather than on every message.
  if ! { echo "[$timestamp] [$level] $message" >> "$LOG_FILE"; } 2>/dev/null; then
    if [[ -z "${_LOG_WRITE_WARNED:-}" ]]; then
      _LOG_WRITE_WARNED=1
      echo "[WARN] 无法写入日志文件: $LOG_FILE" >&2
    fi
  fi
  return 0
}
#================================
# Fatal error handling
#================================
# Report a fatal error and terminate the script.
# Globals:   LOG_FILE (read)
# Arguments: $1 - description of what went wrong
# Outputs:   two ERROR entries via log(): the description, then a pointer
#            to the log file
# Returns:   never returns; exits the shell with status 1
error() {
  local reason="$1"
  local entry
  for entry in "$reason" "详细日志请查看: $LOG_FILE"; do
    log "ERROR" "$entry"
  done
  exit 1
}
#================================
# Command runner
#================================
# Run a shell command string, logging what is executed and what it printed.
# Globals:   LOG_FILE (written)
# Arguments: $1 - command line, evaluated by the shell
#            $2 - optional human-readable description (default "执行命令")
# Outputs:   INFO entry before running; on success the captured
#            stdout/stderr is appended to $LOG_FILE only (the console stays
#            quiet); on failure two ERROR entries including the output
# Returns:   0 on success, 1 if the command exited non-zero
run_cmd() {
  local command="$1"
  local description="${2:-"执行命令"}"
  local output
  local status=0

  log "INFO" "$description: $command"

  # NOTE(review): the command is passed through eval, so it must only ever
  # be built from trusted, script-internal values - never from user input.
  output=$(eval "$command" 2>&1) || status=$?

  if (( status != 0 )); then
    log "ERROR" "命令执行失败: $command"
    log "ERROR" "错误详情: $output"
    return 1
  fi

  # Keep the output of successful commands in the log file instead of
  # discarding it; best-effort so an unwritable log cannot fail the command.
  if [[ -n "$output" ]]; then
    { printf '%s\n' "$output" >> "$LOG_FILE"; } 2>/dev/null || true
  fi
  return 0
}
#================================
# Network reachability probe (minimal logging)
#================================
# Check whether a URL answers a HEAD request.
# Globals:   LOG_FILE (written - curl's output is appended to it)
# Arguments: $1 - URL to probe
# Returns:   0 if curl succeeds, 1 otherwise
test_network() {
  local target="$1"
  log "INFO" "测试网络连接: $target"
  # HEAD request, follow redirects, fail on HTTP errors, 5s connect timeout
  curl -fsSLI --connect-timeout 5 "$target" &>> "$LOG_FILE" || return 1
  return 0
}
#================================
# File download
#================================
# Download a URL to a local path with wget.
# Globals:   LOG_FILE (written - wget's messages are appended to it)
# Arguments: $1 - source URL
#            $2 - destination file path
# Outputs:   INFO entries before and after the transfer
# Returns:   0 on success; on failure removes the partial file and calls
#            error(), which exits the script
download_file() {
  local url="$1"
  local dest="$2"

  log "INFO" "开始下载: $url"

  # --show-progress is deliberately not used: it only exists in wget >= 1.16
  # (older releases reject it and the download fails), and the output is
  # redirected to the log file, where a progress bar is just noise.
  # -nv keeps wget's error messages so the log explains a failure.
  # Retries and timeout are bounded so a dead mirror cannot stall the run.
  if ! wget -nv --tries=3 --timeout=30 -O "$dest" "$url" &>> "$LOG_FILE"; then
    # `wget -O` creates/truncates the target before the transfer, so a
    # failed download leaves an empty or partial file behind.
    rm -f -- "$dest"
    error "下载失败: $url"
  fi

  log "INFO" "下载完成: $dest"
}
#================================
# Install Node Exporter
#================================
# Install Node Exporter as a systemd service.
# Steps: check required tools, pick the first usable package source (local
# file, then network mirrors), fetch and unpack it in a private temporary
# directory, install the binary, create the service user, write the systemd
# unit, start the service and verify that the metrics endpoint answers.
# Globals:   APP_NAME, APP_VERSION, APP_USER, APP_DIR, BINARY_DIR,
#            SYSTEMD_SERVICE_FILE, WEB_LISTEN_ADDRESS, LOG_FILE,
#            BLUE, NC (read)
# Arguments: none
# Outputs:   progress and result via log(); writes $SYSTEMD_SERVICE_FILE
# Returns:   0 on success; exits the script with status 1 on any failure
install_node_exporter() {
  local -r package="${APP_NAME}-${APP_VERSION}.linux-amd64"
  local -r textfile_dir="/var/lib/node_exporter/textfile_collector"
  local -a download_sources=()
  local tool candidate download_url=""
  local work_dir temp_file file_size
  local probe_host metrics_port server_ip

  log "TITLE" "${BLUE}开始安装 Node Exporter ${APP_VERSION}${NC}"

  # Check for the required tools
  log "INFO" "检查必备工具..."
  for tool in wget curl tar; do
    if ! command -v "$tool" &>> "$LOG_FILE"; then
      log "ERROR" "未找到必备工具: $tool"
      log "ERROR" "请先安装: yum install -y $tool 或 apt-get install -y $tool"
      exit 1
    fi
  done

  # Create the directory layout
  run_cmd "mkdir -p $BINARY_DIR" "创建目录结构"

  # Candidate package sources, in priority order
  download_sources=(
    "/opt/${package}.tar.gz" # local file
    "http://10.101.0.51:5588/${APP_NAME}/${package}.tar.gz" # internal mirror 1
    "http://10.102.32.207:5588/${APP_NAME}/${package}.tar.gz" # internal mirror 2
    "https://github.com/prometheus/${APP_NAME}/releases/download/v${APP_VERSION}/${package}.tar.gz" # public source
    "https://mirrors.tuna.tsinghua.edu.cn/github-release/prometheus/node_exporter/Node-${APP_VERSION}/${package}.tar.gz" # Tsinghua mirror
  )

  # Pick the first source that exists (local path) or responds (URL)
  log "INFO" "开始查找可用的下载源..."
  for candidate in "${download_sources[@]}"; do
    if [[ "$candidate" == /* ]]; then
      # Local file check
      log "INFO" "检查本地文件: $candidate"
      if [[ -f "$candidate" ]]; then
        download_url="$candidate"
        log "INFO" "${BLUE}找到本地文件: $download_url${NC}"
        break
      fi
      log "WARN" "本地文件不存在: $candidate"
    else
      # Network URL check; unreachable sources are skipped
      if ! test_network "$candidate"; then
        log "WARN" "网络源不可用: $candidate"
        continue
      fi
      download_url="$candidate"
      log "INFO" "${BLUE}找到可用下载源: $download_url${NC}"
      break
    fi
  done

  # Give up if nothing was usable
  if [[ -z "$download_url" ]]; then
    log "ERROR" "无法找到可用的下载源"
    log "ERROR" "请检查网络连接或手动下载安装包到/opt目录"
    log "ERROR" "下载地址: https://github.com/prometheus/node_exporter/releases/download/v${APP_VERSION}/${APP_NAME}-${APP_VERSION}.linux-amd64.tar.gz"
    exit 1
  fi

  # Stage the archive in a private temporary directory. A fixed name under
  # /tmp is predictable (unsafe when running as root) and was never cleaned
  # up. The path is expanded now, while the local variable is in scope, so
  # the trap still works when it fires on an early exit.
  # NOTE: this replaces any EXIT trap that was already installed.
  work_dir=$(mktemp -d) || error "创建临时目录失败"
  # shellcheck disable=SC2064
  trap "rm -rf -- $(printf '%q' "$work_dir")" EXIT
  temp_file="${work_dir}/${APP_NAME}.tar.gz"

  log "INFO" "准备获取安装包..."
  if [[ "$download_url" == /* ]]; then
    # Use the local file
    log "INFO" "使用本地文件: $download_url"
    run_cmd "cp '$download_url' '$temp_file'" "复制本地文件到临时目录"
  else
    # Download from the network
    log "INFO" "从网络下载: $download_url"
    download_file "$download_url" "$temp_file"
  fi

  # Sanity-check the archive: anything below 1 MiB is treated as broken
  log "INFO" "验证下载文件的完整性..."
  file_size=$(stat -c%s "$temp_file")
  if (( file_size < 1048576 )); then
    log "ERROR" "下载的文件大小异常: $file_size 字节"
    log "ERROR" "请检查网络连接或下载源的可用性"
    exit 1
  fi
  log "INFO" "文件大小正常: $file_size 字节"

  # Unpack and install the binary, then drop the staging directory
  run_cmd "tar -xzf '$temp_file' -C '$work_dir'" "解压安装包"
  run_cmd "mv '${work_dir}/${package}/${APP_NAME}' '$BINARY_DIR/'" "移动二进制文件"
  rm -rf -- "$work_dir"
  trap - EXIT

  # Create the service user if it does not exist yet
  if ! id "$APP_USER" &>> "$LOG_FILE"; then
    run_cmd "useradd --system --no-create-home --shell /bin/false $APP_USER" "创建系统用户"
  fi

  # Hand the installation directory to the service user
  run_cmd "chown -R $APP_USER:$APP_USER $APP_DIR" "设置文件权限"

  # Write the systemd unit
  log "INFO" "配置systemd服务"
  cat > "$SYSTEMD_SERVICE_FILE" <<EOF
[Unit]
Description=Node Exporter
After=network.target
[Service]
User=$APP_USER
ExecStart=$BINARY_DIR/${APP_NAME} --collector.textfile.directory=$textfile_dir --web.listen-address=$WEB_LISTEN_ADDRESS
Restart=always
[Install]
WantedBy=multi-user.target
EOF

  # Prepare the textfile collector directory and start the service
  run_cmd "mkdir -p $textfile_dir" "开启textfile扩展"
  run_cmd "chmod -R 755 $textfile_dir"
  run_cmd "systemctl daemon-reload" "重新加载systemd配置"
  run_cmd "systemctl enable ${APP_NAME}.service" "启用服务"
  run_cmd "systemctl restart ${APP_NAME}.service" "启动服务"

  # Verify the installation
  log "INFO" "验证安装..."
  sleep 2 # give the service a moment to start
  if ! systemctl is-active --quiet "${APP_NAME}.service"; then
    log "ERROR" "服务启动失败"
    # `systemctl status` exits non-zero for a dead unit; run_cmd records its
    # output either way, so its status is ignored to reach the explicit exit.
    run_cmd "systemctl status ${APP_NAME}.service" "查看服务状态" || true
    exit 1
  fi

  # Probe the endpoint the service was actually configured with instead of
  # a hard-coded port. A wildcard or empty host is probed through loopback.
  metrics_port="${WEB_LISTEN_ADDRESS##*:}"
  probe_host="${WEB_LISTEN_ADDRESS%:*}"
  case "$probe_host" in
    "" | "0.0.0.0" | "[::]") probe_host="127.0.0.1" ;;
  esac
  # -o /dev/null: only reachability matters; keep the metrics page out of
  # the log file. -S still lets curl report why a request failed.
  if ! curl -sS -f -o /dev/null "http://${probe_host}:${metrics_port}/metrics" &>> "$LOG_FILE"; then
    log "ERROR" "指标端点不可访问"
    log "ERROR" "请检查服务是否正常运行或端口是否被占用"
    exit 1
  fi

  # Server IP for the summary: prefer the second address reported by
  # `hostname -I`, fall back to the first. A failure here must not abort an
  # installation that already succeeded.
  server_ip=$(hostname -I 2>/dev/null | awk '{print ($2 != "" ? $2 : $1)}') || server_ip=""
  server_ip="${server_ip:-127.0.0.1}"

  # Result summary. Each separator resets the color at the end so the
  # terminal is not left blue once the script exits.
  log "TITLE" "${BLUE}========================================${NC}"
  log "INFO" "服务名称:Node Exporter"
  log "SUCCESS" "安装结果:成功"
  log "INFO" "访问地址:http://${server_ip}:${metrics_port}/metrics"
  log "INFO" "日志位置:$LOG_FILE"
  log "TITLE" "${BLUE}========================================${NC}"
}
#================================
# Uninstall Node Exporter
#================================
# Remove the Node Exporter service and its installation directory.
# Globals:   APP_NAME, APP_USER, APP_DIR, SYSTEMD_SERVICE_FILE, LOG_FILE,
#            BLUE, GREEN, NC (read)
# Arguments: none
# Outputs:   progress and result via log()
# Returns:   0 on success; the service user is intentionally left in place
uninstall_node_exporter() {
  local unit="${APP_NAME}.service"

  log "TITLE" "${BLUE}开始卸载 Node Exporter${NC}"

  # Stop the service, but only if it is currently running
  if systemctl is-active --quiet "$unit"; then
    run_cmd "systemctl stop $unit" "停止服务"
  fi

  # Disable the service, but only if it is currently enabled
  if systemctl is-enabled --quiet "$unit"; then
    run_cmd "systemctl disable $unit" "禁用服务"
  fi

  # Remove the unit file when present, then make systemd forget it
  [[ ! -f "$SYSTEMD_SERVICE_FILE" ]] || run_cmd "rm -f $SYSTEMD_SERVICE_FILE" "删除服务文件"
  run_cmd "systemctl daemon-reload" "重新加载systemd配置"

  # Remove the installation directory when present
  [[ ! -d "$APP_DIR" ]] || run_cmd "rm -rf $APP_DIR" "删除应用目录"

  # The service user is kept; tell the operator how to remove it by hand
  log "SUCCESS" "${GREEN}Node Exporter 卸载完成!${NC}"
  log "INFO" "注意: 系统用户 '$APP_USER' 未被删除,如需删除请手动执行: userdel $APP_USER"
  log "INFO" "日志文件位置: $LOG_FILE"
}
#================================
# Usage help
#================================
# Print the usage summary.
# Arguments: none ($0 is shown as the program name)
# Outputs:   five lines of help text on stdout
show_help() {
  printf '%s\n' \
    "用法: $0 [选项]" \
    "选项:" \
    " --install 安装 Node Exporter" \
    " --uninstall 卸载 Node Exporter" \
    " --help 显示此帮助信息"
}
#================================
# Main entry point
#================================
# Parse the command line and dispatch to install or uninstall.
# Globals:   RED, BLUE, NC, EUID (read)
# Arguments: exactly one of --install, --uninstall, --help
# Outputs:   help text on stdout for --help or a wrong argument count;
#            the root-privilege error on stderr
# Returns:   exits 0 for --help; exits 1 for a usage error, a missing root
#            privilege or an unknown option; otherwise the status of the
#            selected action
main() {
  # Usage errors and --help are handled before anything else: they need
  # neither root nor a log file.
  if [[ $# -ne 1 ]]; then
    show_help
    exit 1
  fi
  if [[ "$1" == "--help" ]]; then
    show_help
    exit 0
  fi

  # Check for root before the first log() call. The log file lives in a
  # root-only location, so logging as an unprivileged user would fail and
  # hide this message; it is therefore printed directly to stderr.
  if [[ $EUID -ne 0 ]]; then
    echo -e "${RED}[ERROR]${NC} 此脚本需要root权限运行" >&2
    exit 1
  fi

  log "TITLE" "${BLUE}Node Exporter 管理脚本启动${NC}"

  # Dispatch on the requested action
  case "$1" in
    "--install")
      install_node_exporter
      ;;
    "--uninstall")
      uninstall_node_exporter
      ;;
    *)
      error "未知参数: $1"
      ;;
  esac
}
# Run the main function, forwarding all command-line arguments unchanged
main "$@"