ansible-devops/scripts/dcgm-exporter.sh

289 lines
7.9 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Strict-ish mode: unset variables are errors and a pipeline fails if any
# stage fails. `-e` is deliberately not enabled; failures are handled by the
# explicit checks in each function.
set -u
set -o pipefail

# ANSI colour sequences. They are stored unexpanded (single quotes) and are
# interpreted later by `echo -e` inside log().
NC='\033[0m'
RED='\033[1;31m'
GREEN='\033[1;32m'
YELLOW='\033[1;33m'

# Versions of the toolchain and of the exporter that get installed.
GO_VERSION="1.21.1"
DCGM_EXPORTER_VERSION="4.2.0-4.1.0"

# Installation targets.
DCGM_EXPORTER_DIR="/opt/dcgm-exporter"
SERVICE_FILE="/etc/systemd/system/dcgm-exporter.service"

# One log file per invocation, named after the start time.
LOG_FILE="/var/log/dcgm-exporter-install_$(date +%Y%m%d%H%M%S).log"
# Print a timestamped message to the console and append it to $LOG_FILE.
# Arguments: the message words; they are joined with "$*" and backslash
#            escapes (e.g. colour codes) are interpreted by `echo -e`.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "[${stamp}] $*" | tee -a "$LOG_FILE"
}
# Log a message wrapped in the green colour sequence.
# Arguments: the message words (joined with "$*").
success() {
  local msg="$*"
  log "${GREEN}${msg}${NC}"
}
# Log a message wrapped in the yellow colour sequence.
# Arguments: the message words (joined with "$*").
warning() {
  local msg="$*"
  log "${YELLOW}${msg}${NC}"
}
# Log a message in red with an error prefix, then terminate the script.
# Arguments: the message words (joined with "$*").
# Exits:     always, with status 1.
error() {
  local detail="$*"
  log "${RED}✖ 错误: ${detail}${NC}"
  exit 1
}
# Run a shell command string under a time limit, appending its stdout and
# stderr to $LOG_FILE.
# Arguments:
#   $1 - command line, executed through `bash -c`
#   $2 - message reported when the command fails (default: "命令执行失败")
#   $3 - time limit handed to `timeout`, in seconds (default: 60)
# On failure error() is called (which exits the script) with either a timeout
# message (exit status 124) or the caller's message plus the real exit status.
run() {
  local cmd="$1"
  local error_msg="${2:-命令执行失败}"
  local timeout="${3:-60}" # default time limit: 60 seconds
  local exit_code=0
  log "→ 执行: $cmd"
  # Capture the status with `||`. Inside `if ! cmd; then`, $? holds the
  # status of the negated pipeline (always 0), which would hide both the
  # timeout status and the command's real exit code.
  timeout "$timeout" bash -c "$cmd" &>> "$LOG_FILE" || exit_code=$?
  if [[ $exit_code -eq 124 ]]; then
    # `timeout` exits with 124 when the time limit was hit.
    error "命令超时 (${timeout}秒): $cmd"
  elif [[ $exit_code -ne 0 ]]; then
    error "$error_msg (退出码: $exit_code)"
  fi
}
# Verify that the NVIDIA DCGM host service is installed and active.
# Globals:
#   DCGM_UNIT_FILE (read, optional) - unit file whose presence marks DCGM as
#       installed; defaults to /lib/systemd/system/nvidia-dcgm.service
#   DCGM_VERSION   (written)        - version parsed from `dcgmi version`,
#       or "未知" when it cannot be determined
# Calls error() (which exits) when systemctl is missing, the unit file is
# absent, the state cannot be read, or the service is not "active".
check_dcgm() {
  local unit_file="${DCGM_UNIT_FILE:-/lib/systemd/system/nvidia-dcgm.service}"
  local status
  local exit_code=0

  log "检测DCGM服务状态..."
  # systemd is required both for this check and for installing the exporter.
  if ! command -v systemctl &> /dev/null; then
    error "未找到systemctl命令请确保系统支持systemd"
  fi
  if [[ ! -f "$unit_file" ]]; then
    error "未找到nvidia-dcgm服务文件请确认DCGM已正确安装"
  fi

  # Declaration and assignment are split: `local v=$(cmd)` would report the
  # status of `local` (always 0) instead of the command's.
  status=$(systemctl is-active nvidia-dcgm 2>&1) || exit_code=$?
  # `is-active` exits non-zero for every state other than "active" while
  # still printing the state word, so a non-zero status alone is not a query
  # failure. Only empty or multi-word output (an error message) is.
  if [[ -z "$status" || "$status" == *[[:space:]]* ]]; then
    error "无法获取DCGM服务状态 (退出码: $exit_code): $status"
  fi
  if [[ "$status" != "active" ]]; then
    error "DCGM服务状态异常 ($status),请先执行: systemctl start nvidia-dcgm"
  fi

  # Under pipefail a failing `dcgmi` combined with a matching grep would
  # append the fallback to a real version if `|| echo` were used, so the
  # fallback is applied only when nothing was captured.
  DCGM_VERSION=$(dcgmi version 2>&1 | grep -oP 'Version \K[^\s]+' | head -n 1) || true
  DCGM_VERSION="${DCGM_VERSION:-未知}"
  success "DCGM服务运行中 (版本: $DCGM_VERSION)"
}
# Download and install the Go toolchain into /usr/local/go, write its
# environment file to /etc/profile.d/go.sh and load it into this shell.
# Globals:  GO_VERSION (read)
# Calls error() (which exits) on any failed step or if `go version` does not
# report the expected version.
install_go() {
  local go_package="go${GO_VERSION}.linux-amd64.tar.gz"
  local dl_url="https://mirrors.aliyun.com/golang/${go_package}"
  local tmp_package="/tmp/${go_package}"
  local go_env="/etc/profile.d/go.sh"
  local go_version

  log "安装Go环境 (版本: $GO_VERSION)..."

  # Download to a ".part" file and rename only on success. `wget -O` creates
  # the output file even when the transfer fails, and a truncated tarball
  # would otherwise be treated as a valid cached download on the next run.
  if [[ ! -f "$tmp_package" ]]; then
    run "wget -qO ${tmp_package}.part ${dl_url} && mv -f ${tmp_package}.part ${tmp_package}" "下载Go安装包失败"
  fi

  # Remove any previous tree first: extracting over an existing /usr/local/go
  # mixes files from two Go versions.
  run "rm -rf /usr/local/go && tar -xzf $tmp_package -C /usr/local" "解压Go安装包失败"

  # Quoted delimiter: the variables below are expanded when the file is
  # sourced, not when it is written.
  if ! cat > "$go_env" <<'EOF'
export GOROOT=/usr/local/go
export GOPATH=/usr/local/gopath
export PATH=$PATH:$GOROOT/bin
export GO111MODULE=on
export GOPROXY=https://goproxy.cn,direct
EOF
  then
    error "写入Go环境变量文件失败"
  fi
  log "→ 配置Go环境变量"
  if ! chmod +x "$go_env" &>> "$LOG_FILE"; then
    error "设置Go环境变量文件权限失败"
  fi

  # Load the variables into the current shell so later steps can call `go`.
  log "→ 加载Go环境变量"
  if ! source "$go_env"; then
    error "加载Go环境变量失败"
  fi

  # Verify that the `go` found on PATH is the version just installed.
  go_version=$(go version 2>&1)
  if [[ "$go_version" == *"go$GO_VERSION"* ]]; then
    success "Go环境安装完成: $go_version"
  else
    error "Go环境验证失败: $go_version"
  fi
}
# Remove a previously installed DCGM Exporter, if one is present.
# Globals:  DCGM_EXPORTER_DIR (read), SERVICE_FILE (read)
# An installation counts as present when the install directory or the
# systemd unit file exists. Each removal step goes through run(), so any
# failure ends the script via error().
uninstall_existing() {
  local unit="dcgm-exporter"

  log "检查是否存在旧版本DCGM Exporter..."

  # Nothing on disk: report and leave early.
  if [[ ! -d "$DCGM_EXPORTER_DIR" && ! -f "$SERVICE_FILE" ]]; then
    success "未发现旧版本,继续安装..."
    return
  fi

  log "发现旧版本,开始卸载..."

  # Stop and disable only when systemd reports the unit as active / enabled.
  if systemctl is-active --quiet "$unit"; then
    run "systemctl stop $unit" "停止现有服务失败"
  fi
  if systemctl is-enabled --quiet "$unit"; then
    run "systemctl disable $unit" "禁用现有服务失败"
  fi

  # Delete the installed files, then make systemd forget the unit.
  run "rm -rf $DCGM_EXPORTER_DIR" "删除旧安装目录失败"
  run "rm -f $SERVICE_FILE" "删除服务文件失败"
  run "systemctl daemon-reload" "重新加载systemd失败"

  success "旧版本卸载完成"
}
# Build DCGM Exporter from source, install it under $DCGM_EXPORTER_DIR,
# register it as a systemd service and wait for its metrics endpoint.
# Globals:
#   DCGM_EXPORTER_VERSION (read) - release tag to download and build
#   DCGM_EXPORTER_DIR     (read) - directory receiving the binary and CSV
#   SERVICE_FILE          (read) - path of the systemd unit to generate
#   LOG_FILE              (read) - receives build output
# Calls error() (which exits) on any failed step, including the service not
# answering with HTTP 200 within 30 attempts one second apart.
install_exporter() {
  local go_env="/etc/profile.d/go.sh"
  # Derive every name from the version so the download, the extracted
  # directory and the log message can never disagree.
  local package="${DCGM_EXPORTER_VERSION}.tar.gz"
  local dl_url="https://github.com/NVIDIA/dcgm-exporter/archive/refs/tags/${package}"
  local tmp_package="/tmp/${package}"
  local source_dir="/tmp/dcgm-exporter-${DCGM_EXPORTER_VERSION}"
  local metrics_url="http://127.0.0.1:9411/metrics"
  local started=0

  log "安装DCGM Exporter (版本: $DCGM_EXPORTER_VERSION)..."

  # The build needs the Go environment written by the Go installation step.
  log "→ 加载Go环境变量"
  if [[ ! -f "$go_env" ]]; then
    error "未找到Go环境变量配置文件"
  fi
  if ! source "$go_env"; then
    error "加载Go环境变量失败"
  fi

  run "mkdir -p $DCGM_EXPORTER_DIR" "创建工作目录失败"

  # Download to a ".part" file and rename only on success. `wget -O` leaves
  # a file behind even when the transfer fails, which would otherwise be
  # reused as a valid cached tarball on the next run.
  if [[ ! -f "$tmp_package" ]]; then
    run "wget -qO ${tmp_package}.part ${dl_url} && mv -f ${tmp_package}.part ${tmp_package}" "下载DCGM Exporter源码失败"
  fi
  run "tar xf $tmp_package -C /tmp" "解压DCGM Exporter源码失败"

  # Build and install. These run outside run() so they are not subject to
  # its time limit.
  log "→ 编译DCGM Exporter"
  if ! cd "$source_dir" &>> "$LOG_FILE"; then
    error "进入源码目录失败"
  fi
  if ! make binary &>> "$LOG_FILE"; then
    error "编译DCGM Exporter失败"
  fi
  if ! make install &>> "$LOG_FILE"; then
    error "安装DCGM Exporter失败"
  fi

  # Copy the artefacts placed by `make install` into the install directory.
  run "cp -a /usr/bin/dcgm-exporter $DCGM_EXPORTER_DIR/" "复制二进制文件失败"
  run "cp -a /etc/dcgm-exporter/default-counters.csv $DCGM_EXPORTER_DIR/" "复制配置文件失败"

  # Unquoted delimiter: the install directory is expanded into ExecStart so
  # the unit always points at the directory the files were copied to.
  if ! cat > "$SERVICE_FILE" <<EOF
[Unit]
Description=DCGM Exporter
After=network.target nvidia-dcgm.service
[Service]
Type=simple
User=root
ExecStart=${DCGM_EXPORTER_DIR}/dcgm-exporter -f ${DCGM_EXPORTER_DIR}/default-counters.csv -a 0.0.0.0:9411
Restart=always
StandardOutput=file:/var/log/dcgm-exporter.log
StandardError=file:/var/log/dcgm-exporter-error.log
[Install]
WantedBy=multi-user.target
EOF
  then
    error "写入服务文件失败"
  fi
  run "chmod 644 $SERVICE_FILE" "设置服务文件权限失败"

  run "systemctl daemon-reload && systemctl enable --now dcgm-exporter.service" "启动DCGM Exporter服务失败"

  # Poll the metrics endpoint. An explicit flag records success: the loop
  # variable of `for … in {1..30}` ends at 30, so it cannot signal that the
  # attempts were exhausted.
  log "等待服务启动..."
  for _ in {1..30}; do
    if [[ "$(curl -s -o /dev/null -w '%{http_code}' "$metrics_url")" == "200" ]]; then
      started=1
      break
    fi
    sleep 1
  done
  if (( started )); then
    success "DCGM Exporter服务启动成功 ($metrics_url)"
  else
    error "服务启动超时,请检查日志"
  fi
}
# Remove the Go toolchain and the temporary build inputs once the exporter
# has been installed.
# Globals:  DCGM_EXPORTER_VERSION (read) - names the exporter tarball and the
#           extracted source directory under /tmp
# Each removal goes through run(), so a failure ends the script via error().
clean_go() {
  log "清理Go环境..."
  run "rm -rf /usr/local/go" "删除Go安装目录失败"
  run "rm -f /etc/profile.d/go.sh" "删除Go环境变量配置失败"
  # The exporter source is downloaded as /tmp/<version>.tar.gz and unpacked
  # to /tmp/dcgm-exporter-<version>; neither matches the two globs, so both
  # are listed explicitly. `:?` aborts instead of expanding an empty version.
  run "rm -rf /tmp/go*.tar.gz /tmp/dcgm-exporter*.tar.gz /tmp/${DCGM_EXPORTER_VERSION:?}.tar.gz /tmp/dcgm-exporter-${DCGM_EXPORTER_VERSION:?}" "删除临时安装包失败"
  success "Go环境清理完成"
}
# Main flow: log a banner, parse the single command-line flag, then run the
# install or uninstall sequence.
log "================= DCGM Exporter安装 =================="
log "开始时间: $(date '+%Y-%m-%d %H:%M:%S')"

# Parse the command-line argument. `${1:-}` keeps `set -u` from aborting
# with "unbound variable" when the script is started without arguments, so
# the usage message below is shown instead.
case "${1:-}" in
  "--install")
    ACTION="install"
    ;;
  "--uninstall")
    ACTION="uninstall"
    ;;
  *)
    error "未知参数: ${1:-}\n用法: $0 [--install|--uninstall]"
    ;;
esac

# Run the selected action.
case "$ACTION" in
  "install")
    # The Go toolchain is needed only for the build and is removed afterwards.
    check_dcgm
    install_go
    uninstall_existing
    install_exporter
    clean_go
    ;;
  "uninstall")
    uninstall_existing
    success "卸载完成"
    ;;
esac

# Done.
log "================= 操作完成 =================="
log "日志文件: $LOG_FILE"