2025-07-05 15:49:53 +08:00
|
|
|
|
#!/bin/bash
|
2025-09-20 18:29:00 +08:00
|
|
|
|
set -uo pipefail
|
2025-07-05 15:49:53 +08:00
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Global settings
# ---------------------------------------------------------------------------

# Versions of the toolchain and of the exporter that get installed.
GO_VERSION="1.21.1"
DCGM_EXPORTER_VERSION="4.2.0-4.1.0"

# Locations written by the installer.
DCGM_EXPORTER_DIR="/opt/dcgm-exporter"
SERVICE_FILE="/etc/systemd/system/dcgm-exporter.service"

# One log file per run; the name carries the start timestamp.
LOG_FILE="/var/log/dcgm-exporter-install_$(date +%Y%m%d%H%M%S).log"

# ---------------------------------------------------------------------------
# Color definitions (escape sequences, rendered by `echo -e` in log)
# ---------------------------------------------------------------------------
NC='\033[0m'
RED='\033[1;31m'
GREEN='\033[1;32m'
YELLOW='\033[1;33m'
|
|
|
|
|
|
|
|
|
|
|
|
# Logging helper (console + log file).
# Prefixes the message with a "[YYYY-MM-DD HH:MM:SS]" timestamp, prints it
# with `echo -e` (so the color escapes are rendered) and appends the same
# line to $LOG_FILE through tee.
# Arguments: the message words; they are joined into a single string.
log() {
    local stamp
    stamp="[$(date '+%Y-%m-%d %H:%M:%S')]"
    echo -e "${stamp} $*" | tee -a "$LOG_FILE"
}
|
|
|
|
|
|
|
|
|
|
|
|
# Success notice: logs the message in green, prefixed with a check mark.
# Arguments: the message words.
success() {
    local text="$*"
    log "${GREEN}✔ ${text}${NC}"
}
|
|
|
|
|
|
|
|
|
|
|
|
# Warning notice: logs the message in yellow, prefixed with a warning sign.
# Arguments: the message words.
warning() {
    local text="$*"
    log "${YELLOW}⚠ ${text}${NC}"
}
|
|
|
|
|
|
|
|
|
|
|
|
# Error notice: logs the message in red and terminates the script.
# Arguments: the message words.
# Does not return; the shell exits with status 1.
error() {
    local reason="$*"
    log "${RED}✖ 错误: ${reason}${NC}"
    exit 1
}
|
|
|
|
|
|
|
|
|
|
|
|
# Run a command line with a time limit and detailed error handling.
#
# The command string is executed by `bash -c` under `timeout`; its stdout
# and stderr are appended to $LOG_FILE. On failure the script is aborted
# through error().
#
# Arguments:
#   $1 - command line to execute (passed verbatim to `bash -c`)
#   $2 - message reported when the command fails (default: 命令执行失败)
#   $3 - time limit handed to `timeout`, in seconds (default: 120)
# Returns:
#   0 when the command succeeds; otherwise error() is called, which exits.
run() {
    local cmd="$1"
    local error_msg="${2:-命令执行失败}"
    local timeout_secs="${3:-120}"   # default time limit: 120 seconds
    local exit_code=0

    log "→ 执行: $cmd"

    # Capture the real status right at the call site. Reading $? inside an
    # `if ! cmd; then` body yields the status of the negation (always 0),
    # which would hide both the exit code and the timeout case.
    timeout "$timeout_secs" bash -c "$cmd" &>> "$LOG_FILE" || exit_code=$?

    if (( exit_code == 0 )); then
        return 0
    fi

    # `timeout` reports 124 when it had to stop the command.
    if (( exit_code == 124 )); then
        error "命令超时 (${timeout_secs}秒): $cmd"
    else
        error "$error_msg (退出码: $exit_code)"
    fi
}
|
|
|
|
|
|
|
|
|
|
|
|
# Check that the NVIDIA DCGM service is installed and running.
#
# Aborts through error() when systemctl is missing, when systemd does not
# know the nvidia-dcgm unit, when the service state cannot be determined,
# or when the service is in any state other than "active".
#
# Globals:
#   DCGM_VERSION (written) - version parsed from `dcgmi version`, or 未知
check_dcgm() {
    local status

    log "检测DCGM服务状态..."

    # systemd is required.
    if ! command -v systemctl &> /dev/null; then
        error "未找到systemctl命令,请确保系统支持systemd"
    fi

    # Ask systemd for the unit instead of probing one fixed directory, so
    # the unit is found wherever it is installed.
    if ! systemctl cat nvidia-dcgm.service &> /dev/null; then
        error "未找到nvidia-dcgm服务文件,请确认DCGM已正确安装"
    fi

    # `systemctl is-active` exits non-zero for every state except "active",
    # so the exit status cannot be used to tell "query failed" apart from
    # "service not running"; decide on the printed state instead.
    status=$(systemctl is-active nvidia-dcgm 2>&1) || true

    case "$status" in
        active)
            DCGM_VERSION=$(dcgmi version 2>&1 | grep -oP 'Version \K[^\s]+' || echo "未知")
            success "DCGM服务运行中 (版本: $DCGM_VERSION)"
            ;;
        inactive | failed | activating | deactivating | reloading)
            error "DCGM服务状态异常 ($status),请先执行: systemctl start nvidia-dcgm"
            ;;
        *)
            # Anything else (empty output, bus errors, ...) is not a unit state.
            error "无法获取DCGM服务状态: $status"
            ;;
    esac
}
|
|
|
|
|
|
|
|
|
|
|
|
# Install the Go toolchain (version $GO_VERSION) under /usr/local/go.
#
# Steps: download the linux-amd64 tarball (unless a complete copy is already
# in /tmp), unpack it into /usr/local, write /etc/profile.d/go.sh, load that
# file into the current shell and verify `go version`.
# Any failing step aborts the script through run()/error().
#
# Globals:
#   GO_VERSION (read), LOG_FILE (read)
#   GOROOT, GOPATH, PATH, GO111MODULE, GOPROXY (exported by the sourced file)
install_go() {
    local go_package="go${GO_VERSION}.linux-amd64.tar.gz"
    local dl_url="https://mirrors.aliyun.com/golang/${go_package}"
    local tmp_package="/tmp/${go_package}"
    local go_env="/etc/profile.d/go.sh"
    local go_version

    log "安装Go环境 (版本: $GO_VERSION)..."

    # Download to a ".part" name and rename only on success: `wget -O`
    # creates the output file even when the transfer fails, and a truncated
    # file must not be mistaken for a cached download on the next run.
    if [[ ! -f "$tmp_package" ]]; then
        run "wget -qO ${tmp_package}.part ${dl_url} && mv -f ${tmp_package}.part ${tmp_package}" "下载Go安装包失败"
    fi

    # Drop any previous tree first so files from another Go release cannot
    # get mixed into the new installation.
    run "rm -rf /usr/local/go && tar -xzf ${tmp_package} -C /usr/local" "解压Go安装包失败"

    # Environment file; the quoted delimiter keeps $PATH/$GOROOT literal.
    cat > "$go_env" <<'EOF'
export GOROOT=/usr/local/go
export GOPATH=/usr/local/gopath
export PATH=$PATH:$GOROOT/bin
export GO111MODULE=on
export GOPROXY=https://goproxy.cn,direct
EOF
    log "→ 配置Go环境变量"
    if ! chmod +x "$go_env" &>> "$LOG_FILE"; then
        error "设置Go环境变量文件权限失败"
    fi

    # Make the toolchain available in the current shell.
    log "→ 加载Go环境变量"
    if ! source "$go_env"; then
        error "加载Go环境变量失败"
    fi

    # Verify that the expected release is the one found on PATH.
    go_version=$(go version 2>&1)
    if [[ "$go_version" == *"go$GO_VERSION"* ]]; then
        success "Go环境安装完成: $go_version"
    else
        error "Go环境验证失败: $go_version"
    fi
}
|
|
|
|
|
|
|
|
|
|
|
|
# Remove an existing DCGM Exporter installation, if there is one.
#
# An installation is considered present when $DCGM_EXPORTER_DIR is a
# directory or $SERVICE_FILE is a regular file. The service is stopped and
# disabled when systemd reports it active/enabled, then the directory and
# the unit file are deleted and systemd is reloaded.
# Globals: DCGM_EXPORTER_DIR (read), SERVICE_FILE (read)
uninstall_existing() {
    log "检查是否存在旧版本DCGM Exporter..."

    # Nothing installed: report and leave.
    if [[ ! -d "$DCGM_EXPORTER_DIR" && ! -f "$SERVICE_FILE" ]]; then
        success "未发现旧版本,继续安装..."
        return
    fi

    log "发现旧版本,开始卸载..."

    # Stop the service only when it is currently running.
    if systemctl is-active --quiet dcgm-exporter; then
        run "systemctl stop dcgm-exporter" "停止现有服务失败"
    fi

    # Disable the service only when it is currently enabled.
    if systemctl is-enabled --quiet dcgm-exporter; then
        run "systemctl disable dcgm-exporter" "禁用现有服务失败"
    fi

    # Delete the installed files.
    run "rm -rf $DCGM_EXPORTER_DIR" "删除旧安装目录失败"
    run "rm -f $SERVICE_FILE" "删除服务文件失败"

    # Let systemd forget the removed unit.
    run "systemctl daemon-reload" "重新加载systemd失败"

    success "旧版本卸载完成"
}
|
|
|
|
|
|
|
|
|
|
|
|
# Build DCGM Exporter from source, install it and start it as a service.
#
# Steps: load the Go environment from /etc/profile.d/go.sh, download and
# unpack the source archive of tag $DCGM_EXPORTER_VERSION, run
# `make binary` and `make install`, copy the binary and the default counter
# list into $DCGM_EXPORTER_DIR, write the systemd unit to $SERVICE_FILE,
# enable and start it, then poll the metrics endpoint for up to 30 seconds.
# Any failing step aborts the script through run()/error().
#
# Globals:
#   DCGM_EXPORTER_VERSION, DCGM_EXPORTER_DIR, SERVICE_FILE, LOG_FILE (read)
# Returns:
#   0 once the metrics endpoint answers with HTTP 200.
install_exporter() {
    local go_env="/etc/profile.d/go.sh"
    # The archive is named after the tag; derive it from the version
    # variable so both can never drift apart.
    local package="${DCGM_EXPORTER_VERSION}.tar.gz"
    local dl_url="https://github.com/NVIDIA/dcgm-exporter/archive/refs/tags/${package}"
    local tmp_package="/tmp/${package}"
    local source_dir="/tmp/dcgm-exporter-${DCGM_EXPORTER_VERSION}"
    local metrics_url="http://127.0.0.1:9411/metrics"
    local http_code attempt

    log "安装DCGM Exporter (版本: $DCGM_EXPORTER_VERSION)..."

    # Load the Go environment.
    log "→ 加载Go环境变量"
    if [[ ! -f "$go_env" ]]; then
        error "未找到Go环境变量配置文件"
    fi
    if ! source "$go_env"; then
        error "加载Go环境变量失败"
    fi

    run "mkdir -p $DCGM_EXPORTER_DIR" "创建工作目录失败"

    # Download the source archive. It goes to a ".part" name first because
    # `wget -O` leaves a file behind even when the transfer fails, and that
    # file would be taken for a finished download on the next run.
    if [[ ! -f "$tmp_package" ]]; then
        run "wget -qO ${tmp_package}.part ${dl_url} && mv -f ${tmp_package}.part ${tmp_package}" "下载DCGM Exporter源码失败"
    fi

    # Unpack.
    run "tar xf $tmp_package -C /tmp" "解压DCGM Exporter源码失败"

    # Build and install. `make -C` is used so the working directory of the
    # installer itself is left untouched.
    log "→ 编译DCGM Exporter"
    if [[ ! -d "$source_dir" ]]; then
        error "进入源码目录失败"
    fi

    # The make targets are run one at a time to make failures easy to locate.
    if ! make -C "$source_dir" binary &>> "$LOG_FILE"; then
        error "编译DCGM Exporter失败"
    fi

    if ! make -C "$source_dir" install &>> "$LOG_FILE"; then
        error "安装DCGM Exporter失败"
    fi

    # Copy the installed files into the service directory.
    run "cp -a /usr/bin/dcgm-exporter $DCGM_EXPORTER_DIR/" "复制二进制文件失败"
    run "cp -a /etc/dcgm-exporter/default-counters.csv $DCGM_EXPORTER_DIR/" "复制配置文件失败"

    # Generate the unit file; paths follow DCGM_EXPORTER_DIR.
    cat > "$SERVICE_FILE" <<EOF
[Unit]
Description=DCGM Exporter
After=network.target nvidia-dcgm.service

[Service]
Type=simple
User=root
ExecStart=${DCGM_EXPORTER_DIR}/dcgm-exporter -f ${DCGM_EXPORTER_DIR}/default-counters.csv -a 0.0.0.0:9411
Restart=always
StandardOutput=file:/var/log/dcgm-exporter.log
StandardError=file:/var/log/dcgm-exporter-error.log

[Install]
WantedBy=multi-user.target
EOF
    run "chmod 644 $SERVICE_FILE" "设置服务文件权限失败"

    # Start the service.
    run "systemctl daemon-reload && systemctl enable --now dcgm-exporter.service" "启动DCGM Exporter服务失败"

    # Verify: poll the metrics endpoint once per second, 30 attempts.
    log "等待服务启动..."
    for attempt in {1..30}; do
        # A failed connection makes curl exit non-zero; keep polling.
        http_code=$(curl -s -o /dev/null -w "%{http_code}" "$metrics_url") || true
        if [[ "$http_code" == "200" ]]; then
            success "DCGM Exporter服务启动成功 (http://127.0.0.1:9411/metrics)"
            return 0
        fi
        sleep 1
    done

    # Reached only when no attempt succeeded. Testing the loop counter for
    # 31 can never detect this, because the counter stops at 30.
    error "服务启动超时,请检查日志"
}
|
|
|
|
|
|
|
|
|
|
|
|
# Remove the Go toolchain and the build leftovers once the exporter is
# installed.
#
# Deletes /usr/local/go, the GOPATH declared in /etc/profile.d/go.sh
# (/usr/local/gopath), the profile file itself, the downloaded archives and
# the unpacked exporter source tree. Any failing step aborts the script
# through run().
#
# Globals: DCGM_EXPORTER_VERSION (read)
clean_go() {
    # The exporter archive is stored as /tmp/<version>.tar.gz and unpacks to
    # /tmp/dcgm-exporter-<version>; neither matches the
    # /tmp/dcgm-exporter*.tar.gz glob, so both are listed explicitly.
    # `:?` aborts instead of building a path from an empty version.
    local exporter_archive="/tmp/${DCGM_EXPORTER_VERSION:?}.tar.gz"
    local exporter_source="/tmp/dcgm-exporter-${DCGM_EXPORTER_VERSION:?}"

    log "清理Go环境..."
    run "rm -rf /usr/local/go /usr/local/gopath" "删除Go安装目录失败"
    run "rm -f /etc/profile.d/go.sh" "删除Go环境变量配置失败"
    run "rm -rf /tmp/go*.tar.gz /tmp/dcgm-exporter*.tar.gz ${exporter_archive} ${exporter_source}" "删除临时安装包失败"
    success "Go环境清理完成"
}
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Main flow
# ---------------------------------------------------------------------------
log "================= DCGM Exporter安装 =================="
log "开始时间: $(date '+%Y-%m-%d %H:%M:%S')"

# Parse the command line. `${1:-}` is required because the script runs with
# `set -u`: a bare "$1" would abort with "unbound variable" when no argument
# is given, before any usage text could be printed.
case "${1:-}" in
    "--install")
        ACTION="install"
        ;;
    "--uninstall")
        ACTION="uninstall"
        ;;
    "")
        error "缺少参数\n用法: $0 [--install|--uninstall]"
        ;;
    *)
        error "未知参数: $1\n用法: $0 [--install|--uninstall]"
        ;;
esac

# Run the selected action.
case "$ACTION" in
    "install")
        check_dcgm
        install_go
        uninstall_existing
        install_exporter
        clean_go
        ;;
    "uninstall")
        uninstall_existing
        success "卸载完成"
        ;;
esac

# Done.
log "================= 操作完成 =================="
log "日志文件: $LOG_FILE"
|