ansible-devops/scripts/deploy_gpu_monitor.sh

65 lines
1.9 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Deployment helper: downloads and configures the GPU monitoring script.
#
# Steps, in order:
#   1. Download the monitor script from SCRIPT_URL to TARGET_SCRIPT.
#   2. Rewrite its BASELINE_GPUS="..." setting to list GPUs 0-7.
#   3. Create METRICS_DIR with mode 755.
#   4. Install a crontab entry (for the invoking user) that runs the script on
#      CRON_SCHEDULE and appends its output to LOG_FILE.
#   5. Run the script once as a smoke test.
#
# Re-running is safe: the crontab entry is replaced rather than appended, and
# the script at TARGET_SCRIPT is only swapped in after a complete download.
set -euo pipefail
IFS=$'\n\t'

# Configuration
readonly SCRIPT_URL="http://116.205.97.109/scripts/gpu_metrics.sh" # download URL of the script
readonly TARGET_DIR="/opt/scripts"                                 # directory the script is stored in
readonly TARGET_SCRIPT="${TARGET_DIR}/gpu_monitor.sh"              # final path of the script
readonly CRON_SCHEDULE="*/2 * * * *"                               # cron schedule for the job
readonly LOG_FILE="/var/log/gpu_monitor.log"                       # log file the job appends to
readonly METRICS_DIR="/var/lib/node_exporter/textfile_collector"   # directory for the metrics file

# Create the target directory
mkdir -p "${TARGET_DIR}"
cd "${TARGET_DIR}"

# Download into a temporary file next to the target so that a failed or
# interrupted transfer never leaves a truncated script at TARGET_SCRIPT
# (which an already-installed cron job could otherwise execute).
echo "正在下载GPU监控脚本..."
tmp_script="$(mktemp "${TARGET_SCRIPT}.XXXXXX")" || {
  echo "错误: 无法在 ${TARGET_DIR} 中创建临时文件" >&2
  exit 1
}
# Remove the temp file on every exit path; a no-op once it has been renamed.
trap 'rm -f -- "${tmp_script}"' EXIT

wget -q -O "${tmp_script}" "${SCRIPT_URL}" || {
  echo "错误: 无法从 ${SCRIPT_URL} 下载脚本" >&2
  exit 1
}
# Guard against a "successful" transfer that produced no content.
if [[ ! -s "${tmp_script}" ]]; then
  echo "错误: 从 ${SCRIPT_URL} 下载的脚本为空" >&2
  exit 1
fi

# Adjust the script configuration: set the baseline GPU list to 8 GPUs (0-7)
echo "正在修改脚本配置..."
sed -i 's/BASELINE_GPUS=".*"/BASELINE_GPUS="0 1 2 3 4 5 6 7"/' "${tmp_script}"
# sed exits 0 even when nothing matched, so check that the setting is present.
if ! grep -qF -- 'BASELINE_GPUS="0 1 2 3 4 5 6 7"' "${tmp_script}"; then
  echo "警告: 未在脚本中找到 BASELINE_GPUS 配置, 脚本内容未被修改" >&2
fi

# Make it executable. mktemp creates the file with mode 0600, so set the full
# mode explicitly instead of relying on "+x".
chmod 755 "${tmp_script}"

# Rename within the same directory, so the configured script replaces any
# previous version in a single step.
mv -f -- "${tmp_script}" "${TARGET_SCRIPT}"

# Make sure the metrics file directory exists
mkdir -p "${METRICS_DIR}"
chmod 755 "${METRICS_DIR}"

# Install the cron job
echo "正在配置定时任务..."
cron_entry="${CRON_SCHEDULE} ${TARGET_SCRIPT} >> ${LOG_FILE} 2>&1"
{
  # Keep the existing crontab minus any line that references TARGET_SCRIPT, so
  # repeated deployments replace the entry instead of stacking duplicates.
  # Both commands are allowed to "fail": crontab -l exits non-zero when the
  # user has no crontab yet, and grep -v exits 1 when it prints no lines.
  { crontab -l 2>/dev/null || true; } | { grep -vF -- "${TARGET_SCRIPT}" || true; }
  printf '%s\n' "${cron_entry}"
} | crontab -

# Verify the cron job (fixed-string match: the path contains regex metacharacters)
echo "验证定时任务配置:"
crontab -l | grep -F -- "${TARGET_SCRIPT}" || {
  echo "错误: 定时任务添加失败" >&2
  exit 1
}

# Run the script once as a test. Output is shown on the console and also
# appended to LOG_FILE, so the error message below points at a file that
# actually contains the failure output. pipefail makes the pipeline fail when
# the script itself fails.
echo "执行脚本进行测试..."
"${TARGET_SCRIPT}" 2>&1 | tee -a "${LOG_FILE}" || {
  echo "错误: 脚本首次执行失败,请检查 ${LOG_FILE}" >&2
  exit 1
}

echo "====================================================="
echo "GPU监控脚本已成功部署!"
echo "脚本位置: ${TARGET_SCRIPT}"
echo "定时任务: ${CRON_SCHEDULE}"
echo "日志文件: ${LOG_FILE}"
echo "指标文件: ${METRICS_DIR}/gpu_metrics.prom"
echo "====================================================="