#!/bin/bash set -uo pipefail # 全局变量 LOG_FILE="/var/log/dcgm-exporter-install_$(date +%Y%m%d%H%M%S).log" GO_VERSION="1.21.1" DCGM_EXPORTER_VERSION="4.2.0-4.1.0" DCGM_EXPORTER_DIR="/opt/dcgm-exporter" SERVICE_FILE="/etc/systemd/system/dcgm-exporter.service" # 颜色定义 GREEN='\033[1;32m' RED='\033[1;31m' YELLOW='\033[1;33m' NC='\033[0m' # 日志函数(控制台+日志文件) log() { local timestamp="[$(date '+%Y-%m-%d %H:%M:%S')]" echo -e "$timestamp $*" | tee -a "$LOG_FILE" } # 成功提示 success() { log "${GREEN}✔ $*${NC}" } # 警告提示 warning() { log "${YELLOW}⚠ $*${NC}" } # 错误提示 error() { log "${RED}✖ 错误: $*${NC}" exit 1 } # 执行命令(带详细错误处理) run() { local cmd="$1" local error_msg="${2:-命令执行失败}" local timeout="${3:-120}" # 默认超时120秒 log "→ 执行: $cmd" # 使用timeout防止命令卡死 if ! timeout "$timeout" bash -c "$cmd" &>> "$LOG_FILE"; then local exit_code=$? if [[ $exit_code -eq 124 ]]; then error "命令超时 ($timeout秒): $cmd" else error "$error_msg (退出码: $exit_code)" fi fi } # 检测DCGM状态 check_dcgm() { log "检测DCGM服务状态..." # 检查systemctl命令是否存在 if ! command -v systemctl &> /dev/null; then error "未找到systemctl命令,请确保系统支持systemd" fi # 检查nvidia-dcgm.service文件是否存在 if [[ ! -f "/lib/systemd/system/nvidia-dcgm.service" ]]; then error "未找到nvidia-dcgm服务文件,请确认DCGM已正确安装" fi # 获取服务状态 local status=$(systemctl is-active nvidia-dcgm 2>&1) local exit_code=$? if [[ $exit_code -ne 0 ]]; then error "无法获取DCGM服务状态: $status" fi if [[ "$status" == "active" ]]; then DCGM_VERSION=$(dcgmi version 2>&1 | grep -oP 'Version \K[^\s]+' || echo "未知") success "DCGM服务运行中 (版本: $DCGM_VERSION)" else error "DCGM服务状态异常 ($status),请先执行: systemctl start nvidia-dcgm" fi } # 安装Go环境 install_go() { log "安装Go环境 (版本: $GO_VERSION)..." GO_PACKAGE="go${GO_VERSION}.linux-amd64.tar.gz" DL_URL="https://mirrors.aliyun.com/golang/$GO_PACKAGE" TMP_PACKAGE="/tmp/$GO_PACKAGE" # 下载安装包 if [[ ! -f "$TMP_PACKAGE" ]]; then run "wget -qO $TMP_PACKAGE $DL_URL" "下载Go安装包失败" fi # 解压安装 run "tar -xzf $TMP_PACKAGE -C /usr/local" "解压Go安装包失败" # 配置环境变量 GO_ENV="/etc/profile.d/go.sh" cat > "$GO_ENV" <<'EOF' export GOROOT=/usr/local/go export GOPATH=/usr/local/gopath export PATH=$PATH:$GOROOT/bin export GO111MODULE=on export GOPROXY=https://goproxy.cn,direct EOF log "→ 配置Go环境变量" if ! chmod +x "$GO_ENV" &>> "$LOG_FILE"; then error "设置Go环境变量文件权限失败" fi # 在当前shell中加载环境变量 log "→ 加载Go环境变量" if ! source "$GO_ENV"; then error "加载Go环境变量失败" fi # 验证安装 local go_version=$(go version 2>&1) if [[ "$go_version" == *"go$GO_VERSION"* ]]; then success "Go环境安装完成: $go_version" else error "Go环境验证失败: $go_version" fi } # 卸载现有DCGM Exporter uninstall_existing() { log "检查是否存在旧版本DCGM Exporter..." if [[ -d "$DCGM_EXPORTER_DIR" || -f "$SERVICE_FILE" ]]; then log "发现旧版本,开始卸载..." # 停止服务 if systemctl is-active --quiet dcgm-exporter; then run "systemctl stop dcgm-exporter" "停止现有服务失败" fi # 禁用服务 if systemctl is-enabled --quiet dcgm-exporter; then run "systemctl disable dcgm-exporter" "禁用现有服务失败" fi # 删除文件 run "rm -rf $DCGM_EXPORTER_DIR" "删除旧安装目录失败" run "rm -f $SERVICE_FILE" "删除服务文件失败" # 重新加载systemd run "systemctl daemon-reload" "重新加载systemd失败" success "旧版本卸载完成" else success "未发现旧版本,继续安装..." fi } # 安装DCGM Exporter install_exporter() { log "安装DCGM Exporter (版本: $DCGM_EXPORTER_VERSION)..." # 加载Go环境变量 log "→ 加载Go环境变量" if [[ -f "/etc/profile.d/go.sh" ]]; then if ! source "/etc/profile.d/go.sh"; then error "加载Go环境变量失败" fi else error "未找到Go环境变量配置文件" fi run "mkdir -p $DCGM_EXPORTER_DIR" "创建工作目录失败" # 下载源码(使用固定URL) DCGM_PACKAGE="4.2.0-4.1.0.tar.gz" DL_URL="https://github.com/NVIDIA/dcgm-exporter/archive/refs/tags/$DCGM_PACKAGE" TMP_PACKAGE="/tmp/$DCGM_PACKAGE" if [[ ! -f "$TMP_PACKAGE" ]]; then run "wget -qO $TMP_PACKAGE $DL_URL" "下载DCGM Exporter源码失败" fi # 解压 run "tar xf $TMP_PACKAGE -C /tmp" "解压DCGM Exporter源码失败" SOURCE_DIR="/tmp/dcgm-exporter-$DCGM_EXPORTER_VERSION" # 编译安装 log "→ 编译DCGM Exporter" if ! cd "$SOURCE_DIR" &>> "$LOG_FILE"; then error "进入源码目录失败" fi # 分步骤执行make,便于调试 if ! make binary &>> "$LOG_FILE"; then error "编译DCGM Exporter失败" fi if ! make install &>> "$LOG_FILE"; then error "安装DCGM Exporter失败" fi # 复制文件 run "cp -a /usr/bin/dcgm-exporter $DCGM_EXPORTER_DIR/" "复制二进制文件失败" run "cp -a /etc/dcgm-exporter/default-counters.csv $DCGM_EXPORTER_DIR/" "复制配置文件失败" # 生成服务文件 cat > "$SERVICE_FILE" <