#!/bin/bash
#
# Deployment tool: downloads the GPU monitoring script and configures it.

# Strict mode: abort on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -euo pipefail
# Split words only on newline and tab, never on spaces.
IFS=$'\n\t'
# Configuration
# NOTE(review): the script is fetched over plain HTTP from a bare IP address
# and is later executed from cron; consider HTTPS and/or a checksum check.
readonly SCRIPT_URL="http://116.205.97.109/scripts/gpu_metrics.sh" # download URL of the script
readonly TARGET_DIR="/opt/scripts"                                 # directory the script is stored in
readonly TARGET_SCRIPT="${TARGET_DIR}/gpu_monitor.sh"              # full path of the installed script
readonly CRON_SCHEDULE="*/2 * * * *"                               # cron schedule for the job
readonly LOG_FILE="/var/log/gpu_monitor.log"                       # log file path
# Create the target directory (and any missing parents), then work from it.
mkdir -p -- "${TARGET_DIR}"
cd -- "${TARGET_DIR}"
# Download the script.
# The download goes to a temporary file next to the target and is moved into
# place only after it succeeded and produced a non-empty file. Writing
# straight to ${TARGET_SCRIPT} with `wget -O` would leave an empty or partial
# file behind on failure, clobbering a previously working copy.
echo "正在下载GPU监控脚本..."
download_tmp="$(mktemp "${TARGET_SCRIPT}.XXXXXX")"
if ! wget -q -O "${download_tmp}" "${SCRIPT_URL}" || [[ ! -s "${download_tmp}" ]]; then
  rm -f -- "${download_tmp}"
  echo "错误: 无法从 ${SCRIPT_URL} 下载脚本" >&2
  exit 1
fi
# mktemp creates the file with mode 0600; restore the usual 0644 so the
# installed script stays readable by other users.
chmod 0644 -- "${download_tmp}"
mv -f -- "${download_tmp}" "${TARGET_SCRIPT}"
# Make the downloaded script executable.
chmod +x "${TARGET_SCRIPT}"
# Patch the downloaded script (example: set the GPU list to the 8 GPUs 0-7).
echo "正在修改脚本配置..."
# "[^"]*" matches only up to the closing quote of the value, so any text
# after it on the same line (e.g. a trailing comment with quotes) is kept.
sed -i 's/BASELINE_GPUS="[^"]*"/BASELINE_GPUS="0 1 2 3 4 5 6 7"/' "${TARGET_SCRIPT}"
# sed exits 0 even when nothing matched; warn instead of silently deploying
# an unpatched script. Deployment continues either way.
if ! grep -qF -- 'BASELINE_GPUS="0 1 2 3 4 5 6 7"' "${TARGET_SCRIPT}"; then
  echo "警告: 未在 ${TARGET_SCRIPT} 中找到 BASELINE_GPUS 配置,脚本内容未被修改" >&2
fi
# Make sure the directory for the metrics file exists.
# NOTE(review): the path looks like a node_exporter textfile-collector
# directory; confirm it matches the exporter's configured directory.
readonly METRICS_DIR="/var/lib/node_exporter/textfile_collector"
mkdir -p -- "${METRICS_DIR}"
chmod 755 -- "${METRICS_DIR}"
# Add the scheduled job to the current user's crontab.
echo "正在配置定时任务..."
cron_entry="${CRON_SCHEDULE} ${TARGET_SCRIPT} >> ${LOG_FILE} 2>&1"
{
  # `crontab -l` fails when the user has no crontab yet; treat that as empty.
  # An identical entry left by a previous run is dropped first so re-running
  # this deployment does not pile up duplicate jobs. grep exits 1 when it
  # prints nothing, which is fine here, hence the `|| true`.
  { crontab -l 2>/dev/null || true; } | { grep -vxF -- "${cron_entry}" || true; }
  printf '%s\n' "${cron_entry}"
} | crontab -
# Verify the scheduled job: print the crontab lines that reference the script
# and abort if there are none.
echo "验证定时任务配置:"
# -F matches the path literally (its dots are not regex wildcards).
if ! crontab -l | grep -F -- "${TARGET_SCRIPT}"; then
  echo "错误: 定时任务添加失败" >&2
  exit 1
fi
# Run the script once as a smoke test.
# NOTE(review): this run's output goes to the terminal, not to ${LOG_FILE};
# only the cron entry redirects there, unless the script logs by itself.
# Confirm the hint in the error message below is still useful.
echo "执行脚本进行测试..."
if ! "${TARGET_SCRIPT}"; then
  echo "错误: 脚本首次执行失败,请检查 ${LOG_FILE}" >&2
  exit 1
fi
# Print a summary of what was deployed and where.
cat <<EOF
=====================================================
GPU监控脚本已成功部署!
脚本位置: ${TARGET_SCRIPT}
定时任务: ${CRON_SCHEDULE}
日志文件: ${LOG_FILE}
指标文件: ${METRICS_DIR}/gpu_metrics.prom
=====================================================
EOF