2025-07-28 16:51:33 +08:00
|
|
|
|
#!/bin/bash
# Deployment tool: downloads and configures the GPU monitoring script.

# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail
# Word-split on newline and tab only, never on spaces.
IFS=$'\n\t'

# ANSI color escape sequences. They are stored as literal backslash sequences
# and only become real escapes when printed with `echo -e`.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[0;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # reset to the default color

# Configuration. These values are never reassigned by the script, so they are
# declared readonly to turn an accidental overwrite into an immediate error.
# NOTE(review): SCRIPT_URL is plain HTTP and the downloaded file is later run
# as root without any integrity check - consider HTTPS and/or a pinned checksum.
readonly SCRIPT_URL="http://116.205.97.109/scripts/gpu_metrics.sh" # download URL of the monitoring script
readonly TARGET_DIR="/opt/scripts"                                 # directory the script is stored in
readonly TARGET_SCRIPT="${TARGET_DIR}/gpu_monitor.sh"              # installed script path
readonly CRON_SCHEDULE="*/3 * * * *"                               # cron schedule of the monitoring job
readonly LOG_FILE="/var/log/gpu_monitor.log"                       # log file written by the cron job
readonly METRICS_DIR="/var/lib/node_exporter/textfile_collector"   # directory holding the metrics file
readonly METRICS_FILE="${METRICS_DIR}/gpu_metrics.prom"            # metrics file
readonly NODE_EXPORTER_SERVICE="node_exporter.service"             # systemd unit name of node_exporter
|
2025-07-28 16:51:33 +08:00
|
|
|
|
|
|
|
|
|
|
|
2025-07-28 17:28:00 +08:00
|
|
|
|
# Print the usage text (colorized) to stdout.
# Reads the color constants BLUE, GREEN, YELLOW and NC, and $0 for the
# program name shown in the usage line.
show_help() {
  local -a help_lines=(
    "${BLUE}用法: $0 [选项]${NC}"
    "${BLUE}GPU监控脚本部署工具(必须指定--install或--uninstall)${NC}"
    ""
    "${GREEN}选项:${NC}"
    " ${YELLOW}--install${NC} 安装并配置GPU监控脚本,重启node_exporter"
    " ${YELLOW}--uninstall${NC} 卸载GPU监控脚本及配置,删除指标文件并重启node_exporter"
    " ${YELLOW}--help${NC} 显示此帮助信息"
    ""
  )
  local help_line

  # One output line per array element; `echo -e` expands the color escapes.
  for help_line in "${help_lines[@]}"; do
    echo -e "${help_line}"
  done
}
|
2025-07-28 16:51:33 +08:00
|
|
|
|
|
2025-07-28 17:28:00 +08:00
|
|
|
|
# Abort the script unless it is running with an effective UID of 0.
# Prints an error to stderr and exits with status 1 when not root;
# otherwise returns 0.
check_root() {
  # Guard clause: nothing to do unless `id -u` reports a non-zero UID.
  [ "$(id -u)" -ne 0 ] || return 0

  echo -e "${RED}错误: 此脚本需要root权限运行,请使用sudo${NC}" >&2
  exit 1
}
|
2025-07-28 16:51:33 +08:00
|
|
|
|
|
2025-07-28 17:28:00 +08:00
|
|
|
|
# Report whether a systemd service unit is known to the system.
# Arguments: $1 - unit name, e.g. "node_exporter.service"
# Returns:   0 if the unit is installed or currently loaded, non-zero otherwise
service_exists() {
  local unit=$1
  local listing

  # `list-units` only reports units systemd currently holds in memory, so an
  # installed but stopped and disabled service would be missed. Also query
  # `list-unit-files`, which reports unit files installed on disk.
  # `|| true`: a query that matches nothing may exit non-zero; an empty
  # listing is the signal we act on.
  listing=$(
    systemctl list-unit-files --type=service --no-legend --full "${unit}" 2>/dev/null || true
    systemctl list-units --all --full --type=service --no-legend "${unit}" 2>/dev/null || true
  )

  # Fixed-string match: the dots in a unit name must not act as regex wildcards.
  grep -qF -- "${unit}" <<<"${listing}"
}
|
|
|
|
|
|
|
|
|
|
|
|
# Restart the node_exporter service (if it exists) and report whether it is
# active afterwards. Never fails the caller: a missing service or a failed
# restart only produces a warning on stderr.
restart_node_exporter() {
  echo -e "${BLUE}检查node_exporter服务状态...${NC}"

  # Guard clause: without the unit there is nothing to restart.
  if ! service_exists "${NODE_EXPORTER_SERVICE}"; then
    echo -e "${YELLOW}警告: node_exporter服务不存在,跳过重启${NC}" >&2
    return 0
  fi

  echo -e "${BLUE}正在重启node_exporter服务...${NC}"
  # A failed restart is tolerated on purpose; the status check below reports it.
  systemctl restart "${NODE_EXPORTER_SERVICE}" || true

  echo -e "${BLUE}检查node_exporter状态...${NC}"
  if ! systemctl is-active --quiet "${NODE_EXPORTER_SERVICE}"; then
    echo -e "${YELLOW}警告: node_exporter未正常运行,请查看日志:journalctl -u ${NODE_EXPORTER_SERVICE}${NC}" >&2
    return 0
  fi

  echo -e "${GREEN}node_exporter状态: 运行中${NC}"
}
|
2025-07-28 16:51:33 +08:00
|
|
|
|
|
2025-07-28 17:28:00 +08:00
|
|
|
|
# Add the monitoring cron entry to the invoking user's crontab, without
# creating a duplicate.
# Globals:  CRON_SCHEDULE, TARGET_SCRIPT, LOG_FILE and the color constants (read)
# Returns:  0 if the entry was already present or was added and verified;
#           1 if it could not be installed (the previous crontab is put back)
add_cron_job() {
  local cron_line="${CRON_SCHEDULE} bash ${TARGET_SCRIPT} > ${LOG_FILE} 2>&1"
  local previous

  # Snapshot the current crontab in memory. This doubles as the rollback copy,
  # so root never writes a backup to a predictable path under /tmp.
  # `crontab -l` fails when no crontab exists yet; treat that as empty.
  previous=$(crontab -l 2>/dev/null || true)

  # Exact whole-line match. Matching against the captured text (instead of
  # piping `crontab -l` into `grep -q`) avoids a spurious pipefail failure
  # when grep exits before the producer has finished writing.
  if grep -qxF -- "${cron_line}" <<<"${previous}"; then
    echo -e "${YELLOW}定时任务已存在,跳过添加${NC}"
    return 0
  fi

  echo -e "${BLUE}添加定时任务: ${cron_line}${NC}"
  # Existing entries (if any) followed by the new one. A failure here is not
  # fatal on its own: the verification below decides success.
  {
    if [[ -n "${previous}" ]]; then
      printf '%s\n' "${previous}"
    fi
    printf '%s\n' "${cron_line}"
  } | crontab - || true

  # Verify against a fresh read of the crontab.
  if grep -qxF -- "${cron_line}" <<<"$(crontab -l 2>/dev/null || true)"; then
    echo -e "${GREEN}定时任务添加成功${NC}"
    return 0
  fi

  echo -e "${RED}错误: 定时任务添加失败${NC}" >&2
  # Roll back to the snapshot; only claim success if the restore worked.
  if {
    if [[ -n "${previous}" ]]; then
      printf '%s\n' "${previous}"
    fi
  } | crontab -; then
    echo -e "${BLUE}已恢复原crontab${NC}"
  fi
  return 1
}
|
2025-07-28 16:51:33 +08:00
|
|
|
|
|
2025-07-28 17:28:00 +08:00
|
|
|
|
# Install the GPU monitoring script: download it, configure it, register the
# cron job, run it once, restart node_exporter and print a summary.
# Globals:  SCRIPT_URL, TARGET_DIR, TARGET_SCRIPT, METRICS_DIR, METRICS_FILE,
#           LOG_FILE, NODE_EXPORTER_SERVICE and the color constants (read)
# Exits:    status 1 when the download, the configuration, the cron setup or
#           the first run of the script fails
# NOTE(review): the script is fetched from SCRIPT_URL and then executed without
# any integrity verification - consider pinning a checksum.
install_script() {
  echo -e "${GREEN}===== 开始安装GPU监控脚本 =====${NC}"

  # 1. Remove a stale metrics file so no leftover data is exported.
  echo -e "${BLUE}清理旧指标文件...${NC}"
  if [ -f "${METRICS_FILE}" ]; then
    rm -f "${METRICS_FILE}"
    echo -e "${GREEN}已删除旧指标文件: ${METRICS_FILE}${NC}"
  fi

  # 2. Create the target directory and download the script.
  mkdir -p "${TARGET_DIR}"
  cd "${TARGET_DIR}"

  echo -e "${BLUE}正在下载GPU监控脚本...${NC}"
  # Download into a staging file next to the target and move it into place
  # only once it is complete and configured. A failed download therefore
  # never truncates a previously installed script that cron may still run.
  local staged
  staged=$(mktemp "${TARGET_DIR}/.gpu_monitor.XXXXXX") || exit 1
  if ! wget -q -O "${staged}" "${SCRIPT_URL}" || [ ! -s "${staged}" ]; then
    rm -f "${staged}"
    echo -e "${RED}错误: 无法从 ${SCRIPT_URL} 下载脚本${NC}" >&2
    exit 1
  fi
  # mktemp creates the file with mode 0600; make it executable.
  chmod 755 "${staged}"

  # 3. Configure the script: force the expected GPU list to indices 0-7.
  echo -e "${BLUE}配置脚本参数...${NC}"
  if ! sed -i 's/EXPECTED_GPUS=".*"/EXPECTED_GPUS="0 1 2 3 4 5 6 7"/' "${staged}"; then
    rm -f "${staged}"
    exit 1
  fi
  mv -f "${staged}" "${TARGET_SCRIPT}"

  # 4. Prepare the metrics directory.
  mkdir -p "${METRICS_DIR}"
  chmod 755 "${METRICS_DIR}"

  # 5. Register the cron job (add_cron_job avoids duplicates).
  echo -e "${BLUE}配置定时任务...${NC}"
  add_cron_job || exit 1

  # 6. Run the script once so metrics exist right away.
  echo -e "${BLUE}测试执行脚本...${NC}"
  "${TARGET_SCRIPT}" || {
    echo -e "${RED}错误: 脚本执行失败,请检查日志: ${LOG_FILE}${NC}" >&2
    exit 1
  }

  # 7. Restart node_exporter.
  restart_node_exporter

  # 8. Summary.
  # `systemctl is-active` prints the state AND exits non-zero for every state
  # other than "active", so a `cmd || echo fallback` form would print both.
  # Decide "not installed" explicitly instead.
  local exporter_state
  if service_exists "${NODE_EXPORTER_SERVICE}"; then
    exporter_state=$(systemctl is-active "${NODE_EXPORTER_SERVICE}" 2>/dev/null) || true
  else
    exporter_state="${YELLOW}未安装${NC}"
  fi

  # Only claim the metrics file was generated when it actually exists.
  local metrics_note
  if [ -f "${METRICS_FILE}" ]; then
    metrics_note="${GREEN}(已生成)${NC}"
  else
    metrics_note="${YELLOW}(未生成)${NC}"
  fi

  echo -e "${GREEN}===== 安装完成 =====${NC}"
  echo -e "${BLUE}脚本位置:${NC} ${TARGET_SCRIPT}"
  echo -e "${BLUE}定时任务:${NC} $(crontab -l | grep -F -- "${TARGET_SCRIPT}")"
  echo -e "${BLUE}日志文件:${NC} ${LOG_FILE}"
  echo -e "${BLUE}指标文件:${NC} ${METRICS_FILE}${metrics_note}"
  echo -e "${BLUE}node_exporter状态:${NC} ${exporter_state}"
}
|
2025-07-28 16:51:33 +08:00
|
|
|
|
|
2025-07-28 17:28:00 +08:00
|
|
|
|
# Uninstall the GPU monitoring script: remove its cron entries, delete the
# script, the metrics file and the log file, then restart node_exporter.
# Globals:  TARGET_SCRIPT, METRICS_FILE, LOG_FILE, NODE_EXPORTER_SERVICE and
#           the color constants (read)
# Exits:    status 1 if the cron entry cannot be removed
uninstall_script() {
  echo -e "${YELLOW}===== 开始卸载GPU监控脚本 =====${NC}"

  # 1. Remove the cron job.
  echo -e "${BLUE}删除定时任务...${NC}"
  # Entries are identified by the script path rather than by one exact line:
  # the line written at install time is
  # "<schedule> bash <script> > <log> 2>&1", which an exact match against
  # "<schedule> <script> >> <log> 2>&1" never finds, leaving the job running
  # after the script is deleted.
  local previous remaining
  # In-memory snapshot, also used for rollback, so nothing is written to a
  # predictable path under /tmp. No crontab at all is treated as empty.
  previous=$(crontab -l 2>/dev/null || true)

  if grep -qF -- "${TARGET_SCRIPT}" <<<"${previous}"; then
    # grep -v exits 1 when nothing is left (the job was the only entry);
    # that is a valid outcome, not an error.
    remaining=$(grep -vF -- "${TARGET_SCRIPT}" <<<"${previous}" || true)

    # Install what is left (possibly nothing). A failure here is caught by
    # the verification below.
    if [[ -n "${remaining}" ]]; then
      printf '%s\n' "${remaining}"
    fi | crontab - || true

    # Verify against a fresh read of the crontab.
    if ! grep -qF -- "${TARGET_SCRIPT}" <<<"$(crontab -l 2>/dev/null || true)"; then
      echo -e "${GREEN}已成功删除定时任务${NC}"
    else
      echo -e "${RED}错误: 定时任务删除失败${NC}" >&2
      # Roll back to the snapshot; only claim success if the restore worked.
      if printf '%s\n' "${previous}" | crontab -; then
        echo -e "${BLUE}已恢复原crontab${NC}"
      fi
      exit 1
    fi
  else
    echo -e "${YELLOW}未找到相关定时任务,跳过${NC}"
  fi

  # 2. Delete the script file.
  echo -e "${BLUE}删除监控脚本...${NC}"
  if [ -f "${TARGET_SCRIPT}" ]; then
    rm -f "${TARGET_SCRIPT}"
    echo -e "${GREEN}已删除脚本: ${TARGET_SCRIPT}${NC}"
  else
    echo -e "${YELLOW}脚本文件不存在,跳过${NC}"
  fi

  # 3. Delete the metrics file.
  echo -e "${BLUE}删除指标文件...${NC}"
  if [ -f "${METRICS_FILE}" ]; then
    rm -f "${METRICS_FILE}"
    echo -e "${GREEN}已删除指标文件: ${METRICS_FILE}${NC}"
  else
    echo -e "${YELLOW}指标文件不存在,跳过${NC}"
  fi

  # 4. Delete the log file.
  echo -e "${BLUE}删除日志文件...${NC}"
  if [ -f "${LOG_FILE}" ]; then
    rm -f "${LOG_FILE}"
    echo -e "${GREEN}已删除日志文件: ${LOG_FILE}${NC}"
  else
    echo -e "${YELLOW}日志文件不存在,跳过${NC}"
  fi

  # 5. Restart node_exporter.
  restart_node_exporter

  # 6. Summary.
  # `systemctl is-active` prints the state AND exits non-zero for every state
  # other than "active", so a `cmd || echo fallback` form would print both.
  # Decide "not installed" explicitly instead.
  local exporter_state
  if service_exists "${NODE_EXPORTER_SERVICE}"; then
    exporter_state=$(systemctl is-active "${NODE_EXPORTER_SERVICE}" 2>/dev/null) || true
  else
    exporter_state="${YELLOW}未安装${NC}"
  fi

  echo -e "${YELLOW}===== 卸载完成 =====${NC}"
  echo -e "${BLUE}node_exporter状态:${NC} ${exporter_state}"
}
|
2025-07-28 16:51:33 +08:00
|
|
|
|
|
2025-07-28 17:28:00 +08:00
|
|
|
|
# Main logic: validate the command line, then dispatch.
# Exactly one argument is required: --install, --uninstall or --help.
if [ $# -ne 1 ]; then
  echo -e "${RED}错误: 必须指定--install或--uninstall${NC}" >&2
  show_help
  exit 1
fi

# Root is only demanded by the actions that need it, so --help and usage
# errors also work for unprivileged users.
case "$1" in
  --install)
    check_root
    install_script
    ;;
  --uninstall)
    check_root
    uninstall_script
    ;;
  --help)
    show_help
    exit 0
    ;;
  *)
    echo -e "${RED}错误: 未知参数 '$1'${NC}" >&2
    show_help
    exit 1
    ;;
esac
|