From edb9ee6c288739470866755110c09209c6a2bded Mon Sep 17 00:00:00 2001 From: joy Date: Fri, 31 Oct 2025 09:41:35 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20scripts/k8s-cluster-deploy?= =?UTF-8?q?.sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 12 --- scripts/k8s-cluster-deploy.sh | 237 +++++++++++++++++++++------------- 1 file changed, 150 insertions(+), 87 deletions(-) diff --git a/scripts/k8s-cluster-deploy.sh b/scripts/k8s-cluster-deploy.sh index 39be23e..46219cf 100644 --- a/scripts/k8s-cluster-deploy.sh +++ b/scripts/k8s-cluster-deploy.sh @@ -9,34 +9,60 @@ YELLOW='\033[1;33m' BLUE='\033[0;34m' NC='\033[0m' +# 命令记录存储(用于最后汇总展示) +declare -a EXECUTED_COMMANDS=() +LOG_FILE="/var/log/k8s-deploy-$(date +%Y%m%d-%H%M%S).log" + # 打印函数 -info() { echo -e "${YELLOW}[INFO]${NC} $1"; } -success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; } -error() { echo -e "${RED}[ERROR]${NC} $1"; exit 1; } -step() { echo -e "\n${BLUE}===== $1 =====${NC}"; } +info() { + echo -e "${YELLOW}[INFO]${NC} $1" | tee -a "$LOG_FILE" +} + +success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" | tee -a "$LOG_FILE" +} + +error() { + echo -e "${RED}[ERROR]${NC} $1" | tee -a "$LOG_FILE" + exit 1 +} + +step() { + echo -e "\n${BLUE}===== $1 =====${NC}" | tee -a "$LOG_FILE" +} + +# 远程执行命令并记录 +remote_exec() { + local ip=$1 + local cmd=$2 + local desc=${3:-"执行命令"} + + info "在 $ip 上 $desc: ${YELLOW}$cmd${NC}" + EXECUTED_COMMANDS+=("在 $ip 上: $cmd") + + # 执行远程命令并记录输出 + if ! ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 "$REMOTE_USER@$ip" "$cmd" >> "$LOG_FILE" 2>&1; then + error "在 $ip 上执行命令失败: $cmd,请查看日志: $LOG_FILE" + fi +} # 固定路径(可根据实际调整) -KUBEADM_CONF="/opt/k8s-install-conf/kubeadm-conf.yaml" # Master1的kubeadm配置文件 +KUBEADM_CONF="/opt/k8s-install-conf/kubeadm-conf.yaml" # Master节点的kubeadm配置文件 INIT_RESULT_FILE="/opt/k8s-install-conf/kubeadm-init-result.txt" # 初始化结果保存文件 -# 关键修改1:Calico v3.25官方YAML地址(替换原v3.26) CALICO_YAML_OFFICIAL="https://docs.projectcalico.org/v3.25/manifests/calico.yaml" -# 本地临时Calico YAML路径(用于镜像替换) LOCAL_CALICO_YAML="/tmp/calico-v3.25-modified.yaml" -# 关键修改2:华为云SWR镜像前缀(替换原docker.io/calico) SWR_CALICO_PREFIX="swr.cn-north-4.myhuaweicloud.com/ddn-k8s/docker.io/calico" REMOTE_USER="root" # 远程节点登录用户 -# ========================== 参数解析(核心:指定Master/Node IP) ========================== -# 初始化参数 +# ========================== 参数解析 ========================== MASTER_IPS=() # 3个Master节点IP(顺序:Master1, Master2, Master3) -NODE_IPS=() # Node节点IP列表(支持多个,用逗号分隔) +NODE_IPS=() # Node节点IP列表 parse_args() { while [[ $# -gt 0 ]]; do case "$1" in --master-ips) - # 解析3个Master IP(格式:"192.168.61.10,192.168.61.11,192.168.61.12") IFS=',' read -ra IPS <<< "$2" if [[ ${#IPS[@]} -ne 3 ]]; then error "--master-ips必须包含3个IP(逗号分隔),当前提供${#IPS[@]}个" @@ -45,14 +71,13 @@ parse_args() { shift 2 ;; --node-ips) - # 解析Node IP列表(格式:"192.168.61.20,192.168.61.21") IFS=',' read -ra IPS <<< "$2" NODE_IPS=("${IPS[@]}") shift 2 ;; --help) echo "用法:$0 --master-ips [--node-ips ]" - echo "功能:Master1初始化→远程Master/Node加入→Calico(v3.25+华为云SWR)部署" + echo "功能:部署K8s集群,包含3个Master节点和可选的Node节点" echo "参数说明:" echo " --master-ips 必选,3个Master节点IP(顺序:Master1(初始化节点), Master2, Master3)" echo " --node-ips 可选,Node节点IP列表(逗号分隔,如不指定则仅部署Master集群)" @@ -71,7 +96,7 @@ parse_args() { error "缺少必选参数--master-ips(需指定3个Master节点IP)" fi - # 提取Master1 IP(初始化节点) + # 提取Master IP export MASTER1_IP="${MASTER_IPS[0]}" export MASTER2_IP="${MASTER_IPS[1]}" export MASTER3_IP="${MASTER_IPS[2]}" @@ -83,7 +108,31 @@ parse_args() { } -# ========================== 步骤1:Master1初始化(核心) ========================== +# ========================== 免密登录检测 ========================== +check_ssh_access() { + step "前置检查:SSH免密登录验证" + local all_ips=("${MASTER_IPS[@]}" "${NODE_IPS[@]}") + local failed_ips=() + + for ip in "${all_ips[@]}"; do + info "验证SSH免密登录:$REMOTE_USER@$ip" + if ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o BatchMode=yes "$REMOTE_USER@$ip" "echo 'SSH连通正常'" >/dev/null 2>&1; then + info "$ip SSH免密登录验证通过" + else + error "$ip SSH免密登录验证失败,请先执行: ssh-copy-id $REMOTE_USER@$ip 配置免密" + failed_ips+=("$ip") + fi + done + + if [[ ${#failed_ips[@]} -gt 0 ]]; then + error "以下节点SSH免密登录配置失败: ${failed_ips[*]}" + fi + + success "所有节点SSH免密登录验证通过" +} + + +# ========================== 步骤1:Master1初始化 ========================== init_master1() { step "Step 1:Master1节点初始化($MASTER1_IP)" @@ -92,16 +141,18 @@ init_master1() { error "Master1的kubeadm配置文件不存在:$MASTER1_IP:$KUBEADM_CONF(请先执行配置分发脚本)" fi - # 执行kubeadm init(带证书上传,支持其他Master加入) - info "执行初始化命令:kubeadm init --upload-certs --config $KUBEADM_CONF" + # 执行kubeadm init + local init_cmd="kubeadm init --upload-certs --config $KUBEADM_CONF 2>&1" + info "执行初始化命令:$init_cmd" info "初始化结果将保存到:$INIT_RESULT_FILE" # 远程执行初始化,同时保存输出到本地文件 - ssh "$REMOTE_USER@$MASTER1_IP" "kubeadm init --upload-certs --config $KUBEADM_CONF 2>&1" | tee "$INIT_RESULT_FILE" - local init_exit_code=${PIPESTATUS[0]} # 获取远程命令退出码 + ssh "$REMOTE_USER@$MASTER1_IP" "$init_cmd" | tee "$INIT_RESULT_FILE" + local init_exit_code=${PIPESTATUS[0]} if [[ $init_exit_code -ne 0 ]]; then error "Master1初始化失败!查看详情:cat $INIT_RESULT_FILE" fi + EXECUTED_COMMANDS+=("在 $MASTER1_IP 上初始化: $init_cmd") success "Master1初始化完成,结果已保存到:$INIT_RESULT_FILE" info "初始化输出预览(前20行):" @@ -123,51 +174,51 @@ config_master1_kubeconfig() { ) for cmd in "${kube_cmds[@]}"; do - info "在Master1执行:$cmd" - ssh "$REMOTE_USER@$MASTER1_IP" "$cmd" || error "执行命令失败:$cmd" + remote_exec "$MASTER1_IP" "$cmd" "配置kubeconfig" done # 验证kubectl是否可用 - info "验证kubectl:在Master1执行 kubectl get nodes" - ssh "$REMOTE_USER@$MASTER1_IP" "kubectl get nodes" || error "kubectl配置失败,无法获取节点信息" + remote_exec "$MASTER1_IP" "kubectl get nodes" "验证kubectl" success "Master1 kubeconfig配置完成" } -# ========================== 步骤3:提取join命令(从初始化结果中) ========================== +# ========================== 步骤3:提取join命令 ========================== extract_join_commands() { step "Step 3:从初始化结果中提取join命令" # 检查结果文件是否存在 [[ -f "$INIT_RESULT_FILE" ]] || error "初始化结果文件不存在:$INIT_RESULT_FILE" - # 1. 提取Master节点join命令(含--control-plane和--certificate-key) + # 提取Master节点join命令 info "提取Master节点join命令" local master_join_cmd master_join_cmd=$(grep -A 3 "You can now join any number of control-plane nodes by running the following command on each as root" "$INIT_RESULT_FILE" | grep -v "You can now" | tr -d '\n' | sed 's/ */ /g') [[ -z "$master_join_cmd" ]] && error "未从初始化结果中提取到Master join命令" - # 补充--config参数(其他Master也需用自己的配置文件) + # 补充--config参数 master_join_cmd="$master_join_cmd --config $KUBEADM_CONF" export MASTER_JOIN_CMD="$master_join_cmd" + EXECUTED_COMMANDS+=("Master节点加入命令: $MASTER_JOIN_CMD") success "Master join命令提取完成:" echo -e "${YELLOW}${MASTER_JOIN_CMD}${NC}" - # 2. 提取Node节点join命令 + # 提取Node节点join命令 info "提取Node节点join命令" local node_join_cmd node_join_cmd=$(grep -A 2 "Then you can join any number of worker nodes by running the following on each as root" "$INIT_RESULT_FILE" | grep -v "Then you can" | tr -d '\n' | sed 's/ */ /g') [[ -z "$node_join_cmd" ]] && error "未从初始化结果中提取到Node join命令" export NODE_JOIN_CMD="$node_join_cmd" + EXECUTED_COMMANDS+=("Node节点加入命令: $NODE_JOIN_CMD") success "Node join命令提取完成:" echo -e "${YELLOW}${NODE_JOIN_CMD}${NC}" } -# ========================== 步骤4:远程其他Master节点加入 ========================== +# ========================== 步骤4:其他Master节点加入 ========================== join_other_masters() { step "Step 4:远程Master2($MASTER2_IP)和Master3($MASTER3_IP)加入集群" - # 定义要加入的Master节点列表(排除Master1) + # 定义要加入的Master节点列表 local other_masters=("$MASTER2_IP" "$MASTER3_IP") for master_ip in "${other_masters[@]}"; do @@ -175,28 +226,32 @@ join_other_masters() { # 检查该Master的kubeadm配置文件是否存在 if ! ssh "$REMOTE_USER@$master_ip" "test -f $KUBEADM_CONF"; then - error "Master $master_ip的kubeconfig配置文件不存在:$KUBEADM_CONF" + error "Master $master_ip的配置文件不存在:$KUBEADM_CONF" fi # 远程执行join命令 - info "在$master_ip执行join命令:$MASTER_JOIN_CMD" - ssh "$REMOTE_USER@$master_ip" "$MASTER_JOIN_CMD" || error "Master $master_ip加入失败" + remote_exec "$master_ip" "$MASTER_JOIN_CMD" "执行加入集群命令" - # 配置该Master的kubeconfig(否则无法本地执行kubectl) - info "配置$master_ip的kubeconfig" - ssh "$REMOTE_USER@$master_ip" "mkdir -p \$HOME/.kube && cp -i /etc/kubernetes/admin.conf \$HOME/.kube/config && chown \$(id -u):\$(id -g) \$HOME/.kube/config" + # 配置该Master的kubeconfig + local kube_config_cmds=( + "mkdir -p \$HOME/.kube" + "cp -i /etc/kubernetes/admin.conf \$HOME/.kube/config" + "chown \$(id -u):\$(id -g) \$HOME/.kube/config" + ) + for cmd in "${kube_config_cmds[@]}"; do + remote_exec "$master_ip" "$cmd" "配置kubeconfig" + done success "Master $master_ip加入集群并配置完成" done # 验证所有Master节点状态 - info "验证所有Master节点状态(从Master1执行)" - ssh "$REMOTE_USER@$MASTER1_IP" "kubectl get nodes | grep master" || error "Master节点状态验证失败" + remote_exec "$MASTER1_IP" "kubectl get nodes | grep master" "验证Master节点状态" success "所有Master节点加入完成" } -# ========================== 步骤5:远程Node节点加入(可选) ========================== +# ========================== 步骤5:Node节点加入 ========================== join_nodes() { if [[ ${#NODE_IPS[@]} -eq 0 ]]; then info "未指定Node节点,跳过Node加入步骤" @@ -208,30 +263,29 @@ join_nodes() { for node_ip in "${NODE_IPS[@]}"; do info "处理Node节点:$node_ip" - # 检查Node节点是否已安装基础组件(kubeadm、kubelet、containerd) - if ! ssh "$REMOTE_USER@$node_ip" "command -v kubeadm &>/dev/null && command -v kubelet &>/dev/null && systemctl is-active --quiet containerd"; then + # 检查Node节点是否已安装基础组件 + local check_cmd="command -v kubeadm &>/dev/null && command -v kubelet &>/dev/null && systemctl is-active --quiet containerd" + if ! ssh "$REMOTE_USER@$node_ip" "$check_cmd"; then error "Node $node_ip未安装基础组件(需先安装kubeadm、kubelet、containerd)" fi # 远程执行Node join命令 - info "在$node_ip执行join命令:$NODE_JOIN_CMD" - ssh "$REMOTE_USER@$node_ip" "$NODE_JOIN_CMD" || error "Node $node_ip加入失败" + remote_exec "$node_ip" "$NODE_JOIN_CMD" "执行加入集群命令" success "Node $node_ip加入集群完成" done # 验证所有节点状态 - info "验证所有节点状态(从Master1执行)" - ssh "$REMOTE_USER@$MASTER1_IP" "kubectl get nodes" || error "节点状态验证失败" + remote_exec "$MASTER1_IP" "kubectl get nodes" "验证所有节点状态" success "所有Node节点加入完成" } -# ========================== 步骤6:部署Calico网络插件(v3.25+华为云SWR) ========================== +# ========================== 步骤6:部署Calico网络插件 ========================== deploy_calico() { step "Step 6:部署Calico v3.25(华为云SWR镜像)" - # 子步骤1:下载Calico v3.25官方YAML到本地 + # 下载Calico YAML到本地 info "下载Calico v3.25官方YAML:$CALICO_YAML_OFFICIAL" if command -v wget &>/dev/null; then wget -q -O "$LOCAL_CALICO_YAML" "$CALICO_YAML_OFFICIAL" || error "wget下载Calico YAML失败" @@ -243,9 +297,8 @@ deploy_calico() { [[ -f "$LOCAL_CALICO_YAML" ]] || error "Calico YAML下载失败,文件不存在" success "Calico YAML下载完成(本地路径:$LOCAL_CALICO_YAML)" - # 子步骤2:替换YAML中的所有镜像地址为华为云SWR(核心修改) + # 替换YAML中的所有镜像地址为华为云SWR info "替换镜像地址为华为云SWR:$SWR_CALICO_PREFIX" - # 替换3个核心镜像(cni、node、kube-controllers,均为v3.25.0版本) sed -i "s#docker.io/calico/cni:v3.25.0#${SWR_CALICO_PREFIX}/cni:v3.25.0#g" "$LOCAL_CALICO_YAML" sed -i "s#docker.io/calico/node:v3.25.0#${SWR_CALICO_PREFIX}/node:v3.25.0#g" "$LOCAL_CALICO_YAML" sed -i "s#docker.io/calico/kube-controllers:v3.25.0#${SWR_CALICO_PREFIX}/kube-controllers:v3.25.0#g" "$LOCAL_CALICO_YAML" @@ -255,27 +308,27 @@ deploy_calico() { grep -n "image:" "$LOCAL_CALICO_YAML" | head -n 10 || error "Calico YAML中未找到镜像配置,替换失败" success "Calico镜像地址替换完成" - # 子步骤3:将修改后的YAML上传到Master1节点 + # 将修改后的YAML上传到Master1节点 local remote_calico_yaml="/opt/k8s-install-conf/calico-v3.25-swr.yaml" info "上传修改后的YAML到Master1:$remote_calico_yaml" scp -o StrictHostKeyChecking=no "$LOCAL_CALICO_YAML" "$REMOTE_USER@$MASTER1_IP:$remote_calico_yaml" || error "YAML上传失败" + EXECUTED_COMMANDS+=("上传Calico配置到 $MASTER1_IP: scp $LOCAL_CALICO_YAML $REMOTE_USER@$MASTER1_IP:$remote_calico_yaml") success "YAML上传完成" - # 子步骤4:在Master1执行Calico部署 - info "执行部署命令:kubectl apply -f $remote_calico_yaml" - ssh "$REMOTE_USER@$MASTER1_IP" "kubectl apply -f $remote_calico_yaml" || error "Calico部署失败" + # 在Master1执行Calico部署 + local deploy_cmd="kubectl apply -f $remote_calico_yaml" + remote_exec "$MASTER1_IP" "$deploy_cmd" "执行Calico部署" - # 子步骤5:等待Calico Pod启动(最多等待5分钟) + # 等待Calico Pod启动 info "等待Calico Pod启动(最多5分钟)..." local wait_time=0 while true; do - # 检查calico-system命名空间下所有Pod是否Running local pod_status pod_status=$(ssh "$REMOTE_USER@$MASTER1_IP" "kubectl get pods -n calico-system -o jsonpath='{.items[*].status.phase}' 2>/dev/null") if [[ "$pod_status" =~ ^(Running\ )*Running$ && -n "$pod_status" ]]; then break fi - if [[ $wait_time -ge 300 ]]; then # 5分钟=300秒 + if [[ $wait_time -ge 300 ]]; then error "Calico Pod启动超时(5分钟),请手动检查:kubectl get pods -n calico-system" fi sleep 10 @@ -283,58 +336,68 @@ deploy_calico() { info "已等待${wait_time}秒,Calico Pod状态:$pod_status" done - # 子步骤6:验证Calico状态 - info "验证Calico状态(从Master1执行)" - ssh "$REMOTE_USER@$MASTER1_IP" "kubectl get pods -n calico-system" || error "Calico状态验证失败" - # 额外验证网络插件是否就绪 - ssh "$REMOTE_USER@$MASTER1_IP" "kubectl get nodes -o wide | grep -E 'STATUS|Ready'" || error "节点网络未就绪" + # 验证Calico状态 + remote_exec "$MASTER1_IP" "kubectl get pods -n calico-system" "验证Calico状态" + remote_exec "$MASTER1_IP" "kubectl get nodes -o wide | grep -E 'STATUS|Ready'" "验证节点网络状态" success "Calico v3.25(华为云SWR)部署完成,集群网络已就绪" - # 子步骤7:清理本地临时YAML文件 + # 清理本地临时YAML文件 info "清理本地临时文件:$LOCAL_CALICO_YAML" rm -f "$LOCAL_CALICO_YAML" || info "本地文件清理失败,可手动删除" } -# ========================== 主流程:串联所有步骤 ========================== +# ========================== 步骤7:展示执行的所有命令 ========================== +show_executed_commands() { + step "Step 7:执行命令汇总" + echo -e "${YELLOW}以下是部署过程中执行的关键命令:${NC}" + echo -e "----------------------------------------------------------------------" + local count=1 + for cmd in "${EXECUTED_COMMANDS[@]}"; do + echo -e "${count}. ${cmd}" + ((count++)) + done + echo -e "----------------------------------------------------------------------" +} + + +# ========================== 主流程 ========================== main() { + # 记录开始时间 + local start_time=$(date +%s) + # 1. 解析参数 parse_args "$@" - - # 2. 前置检查:确保SSH免密登录正常 - step "前置检查:SSH免密登录验证" - local all_ips=("${MASTER_IPS[@]}" "${NODE_IPS[@]}") - for ip in "${all_ips[@]}"; do - info "验证SSH免密登录:$REMOTE_USER@$ip" - if ! ssh -o StrictHostKeyChecking=no "$REMOTE_USER@$ip" "echo 'SSH连通正常' >/dev/null"; then - error "SSH免密登录失败:$ip(请先执行 ssh-copy-id $REMOTE_USER@$ip 配置免密)" - fi - done - success "所有节点SSH免密登录验证通过" - + + # 2. 检查SSH免密登录 + check_ssh_access + # 3. 执行核心步骤 - init_master1 # Master1初始化 - config_master1_kubeconfig # Master1配置kubeconfig - extract_join_commands # 提取join命令 - join_other_masters # 其他Master加入 - join_nodes # Node加入(可选) - deploy_calico # 部署Calico(v3.25+SWR) - - # 4. 集群部署完成总结 + init_master1 + config_master1_kubeconfig + extract_join_commands + join_other_masters + join_nodes + deploy_calico + + # 4. 展示执行的命令 + show_executed_commands + + # 5. 集群部署完成总结 step "集群部署完成总结" + local end_time=$(date +%s) + local duration=$((end_time - start_time)) echo -e "${GREEN}========================================" echo -e "K8s集群部署全流程完成!" + echo -e "部署耗时:$((duration/60))分$((duration%60))秒" echo -e "关键信息:" echo -e " 1. Calico版本:v3.25(镜像源:华为云SWR)" echo -e " 2. 集群节点状态:ssh $REMOTE_USER@$MASTER1_IP 'kubectl get nodes'" echo -e " 3. Calico状态:ssh $REMOTE_USER@$MASTER1_IP 'kubectl get pods -n calico-system'" echo -e " 4. 初始化结果:$INIT_RESULT_FILE" + echo -e " 5. 部署日志:$LOG_FILE" echo -e "========================================" } # 启动主流程 main "$@" -chmod +x k8s-cluster-deploy.sh - -# 部署3个Master + 2个Node(替换为你的IP) -#bash k8s-cluster-deploy.sh --master-ips 192.168.61.10,192.168.61.11,192.168.61.12 --node-ips 192.168.61.20,192.168.61.21