ansible-devops/scripts/dcgm-exporter.sh

289 lines
7.9 KiB
Bash
Raw Permalink Normal View History

2025-07-05 15:49:53 +08:00
#!/bin/bash
#
# Installs or removes the NVIDIA DCGM Exporter as a systemd service.
# Usage: <script> [--install|--uninstall]

# -u: referencing an unset variable is an error.
# pipefail: a pipeline fails if any stage fails.
# -e is not enabled; the functions below check command results explicitly.
set -uo pipefail

# Global settings (constants; marked readonly so they cannot be clobbered by accident)
LOG_FILE="/var/log/dcgm-exporter-install_$(date +%Y%m%d%H%M%S).log"
GO_VERSION="1.21.1"
DCGM_EXPORTER_VERSION="4.2.0-4.1.0"
DCGM_EXPORTER_DIR="/opt/dcgm-exporter"
SERVICE_FILE="/etc/systemd/system/dcgm-exporter.service"
readonly LOG_FILE GO_VERSION DCGM_EXPORTER_VERSION DCGM_EXPORTER_DIR SERVICE_FILE

# Color definitions (backslash escapes; interpreted when the message is printed)
GREEN='\033[1;32m'
RED='\033[1;31m'
YELLOW='\033[1;33m'
NC='\033[0m'
readonly GREEN RED YELLOW NC
# Logging helper: prints a timestamped message to the console and appends
# the same line to LOG_FILE.
# Arguments: the message words; backslash escapes (e.g. color codes) are interpreted.
log() {
  local stamp
  stamp="[$(date '+%Y-%m-%d %H:%M:%S')]"
  printf '%b\n' "${stamp} $*" | tee -a "$LOG_FILE"
}
# Success notice: logs the message wrapped in the green color code.
success() {
  local text="$*"
  log "${GREEN}${text}${NC}"
}
# Warning notice: logs the message wrapped in the yellow color code.
warning() {
  local text="$*"
  log "${YELLOW}${text}${NC}"
}
# Error notice: logs the message in red with the error prefix, then
# terminates the script with exit status 1.
error() {
  local text="$*"
  log "${RED}✖ 错误: ${text}${NC}"
  exit 1
}
# Runs a shell command string under a timeout, appending its output to LOG_FILE.
# Arguments:
#   $1 - command string, executed via `bash -c`
#   $2 - message reported when the command fails (default: 命令执行失败)
#   $3 - duration handed to `timeout` (default: 30 seconds)
# Returns 0 on success. On failure calls error (which exits the script);
# exit status 124 from `timeout` is reported as a timeout.
run() {
  local cmd="$1"
  local error_msg="${2:-命令执行失败}"
  local timeout="${3:-30}" # default timeout: 30 seconds
  local exit_code=0

  log "→ 执行: $cmd"
  # Capture the real status with `||`. Reading $? inside `if ! cmd; then`
  # yields the status of the negation, which is always 0 there.
  timeout "$timeout" bash -c "$cmd" &>> "$LOG_FILE" || exit_code=$?

  if (( exit_code == 0 )); then
    return 0
  fi
  if (( exit_code == 124 )); then
    # Braces keep the variable name from running into the following character.
    error "命令超时 (${timeout}秒): $cmd"
  else
    error "$error_msg (退出码: $exit_code)"
  fi
}
# Verifies that the NVIDIA DCGM service is installed and active.
# Calls error (which exits the script) when systemctl is missing, the unit
# file is missing, the service state cannot be read, or the service is not active.
# Globals: DCGM_VERSION (written)
check_dcgm() {
  local status

  log "检测DCGM服务状态..."

  # systemctl must be available
  if ! command -v systemctl &> /dev/null; then
    error "未找到systemctl命令请确保系统支持systemd"
  fi

  # The nvidia-dcgm unit file must exist
  if [[ ! -f "/lib/systemd/system/nvidia-dcgm.service" ]]; then
    error "未找到nvidia-dcgm服务文件请确认DCGM已正确安装"
  fi

  # `systemctl is-active` exits non-zero for every state other than active,
  # so the exit status cannot separate "inactive" from "query failed".
  # Classify by the printed text instead: a single lowercase word is a unit
  # state; anything else (empty output, an error sentence) is a failed query.
  status=$(systemctl is-active nvidia-dcgm 2>&1) || true

  if [[ "$status" == "active" ]]; then
    # `|| true` + default expansion: fall back to 未知 only when nothing matched,
    # without appending it to a valid version when dcgmi itself exits non-zero.
    DCGM_VERSION=$(dcgmi version 2>&1 | grep -oP 'Version \K[^\s]+' || true)
    DCGM_VERSION="${DCGM_VERSION:-未知}"
    success "DCGM服务运行中 (版本: $DCGM_VERSION)"
  elif [[ "$status" =~ ^[a-z-]+$ ]]; then
    error "DCGM服务状态异常 ($status),请先执行: systemctl start nvidia-dcgm"
  else
    error "无法获取DCGM服务状态: $status"
  fi
}
# Installs the Go toolchain (GO_VERSION) into /usr/local/go, writes the Go
# environment file under /etc/profile.d and loads it into the current shell.
# Calls error (which exits the script) when any step fails or the installed
# `go version` output does not contain the expected version.
install_go() {
  local go_package="go${GO_VERSION}.linux-amd64.tar.gz"
  local dl_url="http://10.101.0.51:5588/yindun-iso/go/${go_package}"
  local tmp_package="/tmp/${go_package}"
  local go_env="/etc/profile.d/go.sh"
  local go_version

  log "安装Go环境 (版本: $GO_VERSION)..."

  # Download the package unless a previously downloaded copy exists.
  # `wget -O` creates the target even when the transfer fails, so download to
  # a .part file and rename only on success; otherwise a truncated file would
  # be taken for a valid cached package on the next run.
  if [[ ! -f "$tmp_package" ]]; then
    run "wget -qO ${tmp_package}.part $dl_url && mv -f ${tmp_package}.part $tmp_package" "下载Go安装包失败"
  fi

  # Extract. Remove any existing tree first: unpacking a Go release over an
  # existing /usr/local/go can leave a broken, mixed-version installation.
  run "rm -rf /usr/local/go && tar -xzf $tmp_package -C /usr/local" "解压Go安装包失败"

  # Write the environment file (quoted delimiter: content is written literally)
  cat > "$go_env" <<'EOF'
export GOROOT=/usr/local/go
export GOPATH=/usr/local/gopath
export PATH=$PATH:$GOROOT/bin
export GO111MODULE=on
export GOPROXY=https://goproxy.cn,direct
EOF
  log "→ 配置Go环境变量"
  if ! chmod +x "$go_env" &>> "$LOG_FILE"; then
    error "设置Go环境变量文件权限失败"
  fi

  # Load the variables into the current shell
  log "→ 加载Go环境变量"
  if ! source "$go_env"; then
    error "加载Go环境变量失败"
  fi

  # Verify the installation by checking the reported version string
  go_version=$(go version 2>&1)
  if [[ "$go_version" == *"go$GO_VERSION"* ]]; then
    success "Go环境安装完成: $go_version"
  else
    error "Go环境验证失败: $go_version"
  fi
}
# Removes a previously installed DCGM Exporter, if one is present: stops and
# disables the service when applicable, deletes the install directory and the
# unit file, then reloads systemd.
uninstall_existing() {
  local unit="dcgm-exporter"

  log "检查是否存在旧版本DCGM Exporter..."

  # Nothing to remove when neither the install directory nor the unit file exists.
  if [[ ! -d "$DCGM_EXPORTER_DIR" && ! -f "$SERVICE_FILE" ]]; then
    success "未发现旧版本,继续安装..."
    return
  fi

  log "发现旧版本,开始卸载..."

  # Stop the service only when it is currently running
  if systemctl is-active --quiet "$unit"; then
    run "systemctl stop $unit" "停止现有服务失败"
  fi

  # Disable the service only when it is currently enabled
  if systemctl is-enabled --quiet "$unit"; then
    run "systemctl disable $unit" "禁用现有服务失败"
  fi

  # Delete installed files, then let systemd forget the unit
  run "rm -rf $DCGM_EXPORTER_DIR" "删除旧安装目录失败"
  run "rm -f $SERVICE_FILE" "删除服务文件失败"
  run "systemctl daemon-reload" "重新加载systemd失败"

  success "旧版本卸载完成"
}
# Builds DCGM Exporter (DCGM_EXPORTER_VERSION) from source, copies the binary
# and default counters file into DCGM_EXPORTER_DIR, writes the systemd unit
# to SERVICE_FILE, starts the service and waits for its metrics endpoint.
# Calls error (which exits the script) when any step fails or the endpoint
# does not answer with HTTP 200 within 30 attempts (one second apart).
install_exporter() {
  local go_env="/etc/profile.d/go.sh"
  # Derive the archive name from the version constant instead of repeating it.
  local package="${DCGM_EXPORTER_VERSION}.tar.gz"
  local dl_url="https://github.com/NVIDIA/dcgm-exporter/archive/refs/tags/${package}"
  local tmp_package="/tmp/${package}"
  local source_dir="/tmp/dcgm-exporter-${DCGM_EXPORTER_VERSION}"
  local metrics_url="http://127.0.0.1:9411/metrics"
  local started=0
  local i

  log "安装DCGM Exporter (版本: $DCGM_EXPORTER_VERSION)..."

  # Load the Go environment variables
  log "→ 加载Go环境变量"
  if [[ -f "$go_env" ]]; then
    if ! source "$go_env"; then
      error "加载Go环境变量失败"
    fi
  else
    error "未找到Go环境变量配置文件"
  fi

  run "mkdir -p $DCGM_EXPORTER_DIR" "创建工作目录失败"

  # Download the source archive unless a previously downloaded copy exists.
  # `wget -O` creates the target even when the transfer fails, so download to
  # a .part file and rename only on success.
  if [[ ! -f "$tmp_package" ]]; then
    run "wget -qO ${tmp_package}.part $dl_url && mv -f ${tmp_package}.part $tmp_package" "下载DCGM Exporter源码失败"
  fi

  # Extract
  run "tar xf $tmp_package -C /tmp" "解压DCGM Exporter源码失败"

  # Build and install
  log "→ 编译DCGM Exporter"
  if ! cd "$source_dir" &>> "$LOG_FILE"; then
    error "进入源码目录失败"
  fi
  # Run the make targets separately so a failure can be attributed to one step
  if ! make binary &>> "$LOG_FILE"; then
    error "编译DCGM Exporter失败"
  fi
  if ! make install &>> "$LOG_FILE"; then
    error "安装DCGM Exporter失败"
  fi

  # Copy the installed files into the service directory
  run "cp -a /usr/bin/dcgm-exporter $DCGM_EXPORTER_DIR/" "复制二进制文件失败"
  run "cp -a /etc/dcgm-exporter/default-counters.csv $DCGM_EXPORTER_DIR/" "复制配置文件失败"

  # Generate the unit file (unquoted delimiter: DCGM_EXPORTER_DIR is expanded,
  # so the unit always points at the directory the files were copied to)
  cat > "$SERVICE_FILE" <<EOF
[Unit]
Description=DCGM Exporter
After=network.target nvidia-dcgm.service
[Service]
Type=simple
User=root
ExecStart=${DCGM_EXPORTER_DIR}/dcgm-exporter -f ${DCGM_EXPORTER_DIR}/default-counters.csv -a 0.0.0.0:9411
Restart=always
StandardOutput=file:/var/log/dcgm-exporter.log
StandardError=file:/var/log/dcgm-exporter-error.log
[Install]
WantedBy=multi-user.target
EOF
  run "chmod 644 $SERVICE_FILE" "设置服务文件权限失败"

  # Start the service
  run "systemctl daemon-reload && systemctl enable --now dcgm-exporter.service" "启动DCGM Exporter服务失败"

  # Wait for the metrics endpoint. Track success with a flag: after the loop
  # ends the counter holds the last value iterated (30), so comparing it to 31
  # could never detect a timeout.
  log "等待服务启动..."
  for i in {1..30}; do
    if [[ "$(curl -s -o /dev/null -w '%{http_code}' "$metrics_url")" == "200" ]]; then
      started=1
      break
    fi
    sleep 1
  done

  if (( started )); then
    success "DCGM Exporter服务启动成功 (http://127.0.0.1:9411/metrics)"
  else
    error "服务启动超时,请检查日志"
  fi
}
# Removes the Go toolchain, its environment file and the temporary build
# artifacts left in /tmp. Each removal goes through run, so a failure calls
# error and exits the script.
# NOTE: the GOPATH directory named in the Go environment file
# (/usr/local/gopath) is not removed here.
clean_go() {
  log "清理Go环境..."
  run "rm -rf /usr/local/go" "删除Go安装目录失败"
  run "rm -f /etc/profile.d/go.sh" "删除Go环境变量配置失败"
  # The exporter source is downloaded as /tmp/<version>.tar.gz and extracted to
  # /tmp/dcgm-exporter-<version>; the dcgm-exporter*.tar.gz glob alone matches
  # neither, so both are named explicitly.
  run "rm -rf /tmp/go*.tar.gz /tmp/dcgm-exporter*.tar.gz /tmp/${DCGM_EXPORTER_VERSION}.tar.gz /tmp/dcgm-exporter-${DCGM_EXPORTER_VERSION}" "删除临时安装包失败"
  success "Go环境清理完成"
}
# Main flow
log "================= DCGM Exporter安装 =================="
log "开始时间: $(date '+%Y-%m-%d %H:%M:%S')"

# Parse the command-line argument.
# ${1:-} is used because the script runs under `set -u`: a bare $1 with no
# argument aborts with "unbound variable" before the usage message is shown.
case "${1:-}" in
  "--install")
    ACTION="install"
    ;;
  "--uninstall")
    ACTION="uninstall"
    ;;
  *)
    error "未知参数: ${1:-}\n用法: $0 [--install|--uninstall]"
    ;;
esac

# Perform the selected action
case "$ACTION" in
  "install")
    check_dcgm
    install_go
    uninstall_existing
    install_exporter
    clean_go
    ;;
  "uninstall")
    uninstall_existing
    success "卸载完成"
    ;;
esac

# Done
log "================= 操作完成 =================="
log "日志文件: $LOG_FILE"