diff --git a/scripts/k8s-cluster-deploy.sh b/scripts/k8s-cluster-deploy.sh index 46219cf..ab73fb8 100644 --- a/scripts/k8s-cluster-deploy.sh +++ b/scripts/k8s-cluster-deploy.sh @@ -1,403 +1,189 @@ #!/bin/bash set -euo pipefail -# ========================== 全局配置与工具函数 ========================== -# 颜色定义 -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -# 命令记录存储(用于最后汇总展示) -declare -a EXECUTED_COMMANDS=() -LOG_FILE="/var/log/k8s-deploy-$(date +%Y%m%d-%H%M%S).log" - -# 打印函数 -info() { - echo -e "${YELLOW}[INFO]${NC} $1" | tee -a "$LOG_FILE" -} - -success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" | tee -a "$LOG_FILE" -} - -error() { - echo -e "${RED}[ERROR]${NC} $1" | tee -a "$LOG_FILE" +# 校验是否为root用户执行 +if [ "$(id -u)" -ne 0 ]; then + echo "错误:脚本需以root权限执行" exit 1 -} +fi -step() { - echo -e "\n${BLUE}===== $1 =====${NC}" | tee -a "$LOG_FILE" -} +# 初始化参数变量 +master_ips="" +node_ips="" -# 远程执行命令并记录 -remote_exec() { - local ip=$1 - local cmd=$2 - local desc=${3:-"执行命令"} - - info "在 $ip 上 $desc: ${YELLOW}$cmd${NC}" - EXECUTED_COMMANDS+=("在 $ip 上: $cmd") - - # 执行远程命令并记录输出 - if ! ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 "$REMOTE_USER@$ip" "$cmd" >> "$LOG_FILE" 2>&1; then - error "在 $ip 上执行命令失败: $cmd,请查看日志: $LOG_FILE" +# 解析命令行参数 +while [[ $# -gt 0 ]]; do + case "$1" in + --master-ips) + master_ips="$2" + shift 2 + ;; + --node-ips) + node_ips="$2" + shift 2 + ;; + *) + echo "错误:未知参数 $1" + echo "使用方式:$0 --master-ips 192.168.61.131,192.168.61.132 --node-ips 192.168.61.134" + exit 1 + ;; + esac +done + +# 校验参数是否完整 +if [ -z "$master_ips" ] && [ -z "$node_ips" ]; then + echo "错误:必须指定 --master-ips 或 --node-ips" + echo "使用方式:$0 --master-ips 192.168.61.131,192.168.61.132 --node-ips 192.168.61.134" + exit 1 +fi + +# 定义目标目录 +target_dir="/opt/k8s-install-conf" + +# 校验kubeadm-conf.yaml是否存在 +if [ ! 
-f "${target_dir}/kubeadm-conf.yaml" ]; then + echo "错误:未找到 ${target_dir}/kubeadm-conf.yaml 配置文件" + exit 1 +fi + + +# -------------------------- 第一步:初始化第一台Master节点 -------------------------- +echo "=== 开始初始化第一台Master节点 ===" + +# 执行kubeadm初始化(带证书上传) +echo "执行 kubeadm init 初始化集群..." +kubeadm init --config ${target_dir}/kubeadm-conf.yaml --upload-certs + +# 配置当前用户kubeconfig +echo "配置当前用户kubeconfig..." +mkdir -p "$HOME/.kube" +cp -i /etc/kubernetes/admin.conf "$HOME/.kube/config" +chown "$(id -u):$(id -g)" "$HOME/.kube/config" +export KUBECONFIG=/etc/kubernetes/admin.conf +# 将环境变量写入profile,永久生效 +echo 'export KUBECONFIG=/etc/kubernetes/admin.conf' >> "$HOME/.profile" + +echo "=== 第一台Master节点初始化完成 ===" + +# -------------------------- 第二步:生成添加节点的脚本 -------------------------- +echo "=== 开始生成添加节点脚本 ===" + +# 创建目标目录 +mkdir -p "$target_dir" + +# 生成certificate-key和基础join命令 +echo "生成集群加入所需参数..." +CERT_KEY=$(kubeadm init phase upload-certs --upload-certs 2>/dev/null | grep -E '^[0-9a-f]{64,}$') +JOIN_CMD_BASE=$(kubeadm token create --print-join-command 2>/dev/null) + +# 校验参数生成结果 +if [ -z "$CERT_KEY" ] || [ -z "$JOIN_CMD_BASE" ]; then + echo "错误:生成集群加入参数失败" + exit 1 +fi + +# 生成添加Master节点的脚本(包含kubeconfig配置) +echo "生成 $target_dir/add_master.sh ..." +cat > "$target_dir/add_master.sh" <> \$HOME/.profile + +echo "Master节点加入集群并配置完成" +EOF + +# 生成添加Node节点的脚本 +echo "生成 $target_dir/add_node.sh ..." 
+cat > "$target_dir/add_node.sh" < [--node-ips ]" - echo "功能:部署K8s集群,包含3个Master节点和可选的Node节点" - echo "参数说明:" - echo " --master-ips 必选,3个Master节点IP(顺序:Master1(初始化节点), Master2, Master3)" - echo " --node-ips 可选,Node节点IP列表(逗号分隔,如不指定则仅部署Master集群)" - echo "示例:" - echo " $0 --master-ips 192.168.61.10,192.168.61.11,192.168.61.12 --node-ips 192.168.61.20,192.168.61.21" - exit 0 - ;; - *) - error "未知参数:$1(执行$0 --help查看用法)" - ;; - esac +# -------------------------- 第四步:处理Master节点加入 -------------------------- +if [ -n "$master_ips" ]; then + echo -e "\n=== 开始处理Master节点加入 ===" + # 分割IP列表(处理逗号分隔,去除空格) + IFS=',' read -ra master_ip_arr <<< "$(echo "$master_ips" | tr -d ' ')" + + for ip in "${master_ip_arr[@]}"; do + echo -e "\n--- 处理Master节点:$ip ---" + # 验证免密 + check_ssh_auth "$ip" + # 拷贝脚本 + copy_script "$ip" "$target_dir/add_master.sh" + # 远程执行 + run_remote_script "$ip" "add_master.sh" + echo "--- Master节点 $ip 处理完成 ---" done +fi - # 校验必选参数 - if [[ ${#MASTER_IPS[@]} -eq 0 ]]; then - error "缺少必选参数--master-ips(需指定3个Master节点IP)" - fi - - # 提取Master IP - export MASTER1_IP="${MASTER_IPS[0]}" - export MASTER2_IP="${MASTER_IPS[1]}" - export MASTER3_IP="${MASTER_IPS[2]}" - info "已识别节点:" - info "Master1(初始化):$MASTER1_IP" - info "Master2:$MASTER2_IP" - info "Master3:$MASTER3_IP" - [[ ${#NODE_IPS[@]} -gt 0 ]] && info "Node节点:${NODE_IPS[*]}" || info "未指定Node节点,仅部署Master集群" -} - - -# ========================== 免密登录检测 ========================== -check_ssh_access() { - step "前置检查:SSH免密登录验证" - local all_ips=("${MASTER_IPS[@]}" "${NODE_IPS[@]}") - local failed_ips=() - - for ip in "${all_ips[@]}"; do - info "验证SSH免密登录:$REMOTE_USER@$ip" - if ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o BatchMode=yes "$REMOTE_USER@$ip" "echo 'SSH连通正常'" >/dev/null 2>&1; then - info "$ip SSH免密登录验证通过" - else - error "$ip SSH免密登录验证失败,请先执行: ssh-copy-id $REMOTE_USER@$ip 配置免密" - failed_ips+=("$ip") - fi +# -------------------------- 第五步:处理Node节点加入 -------------------------- +if [ -n "$node_ips" ]; then + echo -e 
"\n=== 开始处理Node节点加入 ===" + # 分割IP列表(处理逗号分隔,去除空格) + IFS=',' read -ra node_ip_arr <<< "$(echo "$node_ips" | tr -d ' ')" + + for ip in "${node_ip_arr[@]}"; do + echo -e "\n--- 处理Node节点:$ip ---" + # 验证免密 + check_ssh_auth "$ip" + # 拷贝脚本 + copy_script "$ip" "$target_dir/add_node.sh" + # 远程执行 + run_remote_script "$ip" "add_node.sh" + echo "--- Node节点 $ip 处理完成 ---" done +fi - if [[ ${#failed_ips[@]} -gt 0 ]]; then - error "以下节点SSH免密登录配置失败: ${failed_ips[*]}" - fi - - success "所有节点SSH免密登录验证通过" -} - - -# ========================== 步骤1:Master1初始化 ========================== -init_master1() { - step "Step 1:Master1节点初始化($MASTER1_IP)" - - # 检查Master1的kubeadm配置文件是否存在 - if ! ssh "$REMOTE_USER@$MASTER1_IP" "test -f $KUBEADM_CONF"; then - error "Master1的kubeadm配置文件不存在:$MASTER1_IP:$KUBEADM_CONF(请先执行配置分发脚本)" - fi - - # 执行kubeadm init - local init_cmd="kubeadm init --upload-certs --config $KUBEADM_CONF 2>&1" - info "执行初始化命令:$init_cmd" - info "初始化结果将保存到:$INIT_RESULT_FILE" - - # 远程执行初始化,同时保存输出到本地文件 - ssh "$REMOTE_USER@$MASTER1_IP" "$init_cmd" | tee "$INIT_RESULT_FILE" - local init_exit_code=${PIPESTATUS[0]} - if [[ $init_exit_code -ne 0 ]]; then - error "Master1初始化失败!查看详情:cat $INIT_RESULT_FILE" - fi - EXECUTED_COMMANDS+=("在 $MASTER1_IP 上初始化: $init_cmd") - - success "Master1初始化完成,结果已保存到:$INIT_RESULT_FILE" - info "初始化输出预览(前20行):" - head -n 20 "$INIT_RESULT_FILE" -} - - -# ========================== 步骤2:Master1配置kubeconfig ========================== -config_master1_kubeconfig() { - step "Step 2:配置Master1的kubeconfig" - - # 远程执行kubeconfig配置命令 - local kube_cmds=( - "mkdir -p \$HOME/.kube" - "cp -i /etc/kubernetes/admin.conf \$HOME/.kube/config" - "chown \$(id -u):\$(id -g) \$HOME/.kube/config" - "echo 'export KUBECONFIG=/etc/kubernetes/admin.conf' >> ~/.bash_profile" - "source ~/.bash_profile" - ) - - for cmd in "${kube_cmds[@]}"; do - remote_exec "$MASTER1_IP" "$cmd" "配置kubeconfig" - done - - # 验证kubectl是否可用 - remote_exec "$MASTER1_IP" "kubectl get nodes" "验证kubectl" - success "Master1 
kubeconfig配置完成" -} - - -# ========================== 步骤3:提取join命令 ========================== -extract_join_commands() { - step "Step 3:从初始化结果中提取join命令" - - # 检查结果文件是否存在 - [[ -f "$INIT_RESULT_FILE" ]] || error "初始化结果文件不存在:$INIT_RESULT_FILE" - - # 提取Master节点join命令 - info "提取Master节点join命令" - local master_join_cmd - master_join_cmd=$(grep -A 3 "You can now join any number of control-plane nodes by running the following command on each as root" "$INIT_RESULT_FILE" | grep -v "You can now" | tr -d '\n' | sed 's/ */ /g') - [[ -z "$master_join_cmd" ]] && error "未从初始化结果中提取到Master join命令" - # 补充--config参数 - master_join_cmd="$master_join_cmd --config $KUBEADM_CONF" - export MASTER_JOIN_CMD="$master_join_cmd" - EXECUTED_COMMANDS+=("Master节点加入命令: $MASTER_JOIN_CMD") - success "Master join命令提取完成:" - echo -e "${YELLOW}${MASTER_JOIN_CMD}${NC}" - - # 提取Node节点join命令 - info "提取Node节点join命令" - local node_join_cmd - node_join_cmd=$(grep -A 2 "Then you can join any number of worker nodes by running the following on each as root" "$INIT_RESULT_FILE" | grep -v "Then you can" | tr -d '\n' | sed 's/ */ /g') - [[ -z "$node_join_cmd" ]] && error "未从初始化结果中提取到Node join命令" - export NODE_JOIN_CMD="$node_join_cmd" - EXECUTED_COMMANDS+=("Node节点加入命令: $NODE_JOIN_CMD") - success "Node join命令提取完成:" - echo -e "${YELLOW}${NODE_JOIN_CMD}${NC}" -} - - -# ========================== 步骤4:其他Master节点加入 ========================== -join_other_masters() { - step "Step 4:远程Master2($MASTER2_IP)和Master3($MASTER3_IP)加入集群" - - # 定义要加入的Master节点列表 - local other_masters=("$MASTER2_IP" "$MASTER3_IP") - - for master_ip in "${other_masters[@]}"; do - info "处理Master节点:$master_ip" - - # 检查该Master的kubeadm配置文件是否存在 - if ! 
ssh "$REMOTE_USER@$master_ip" "test -f $KUBEADM_CONF"; then - error "Master $master_ip的配置文件不存在:$KUBEADM_CONF" - fi - - # 远程执行join命令 - remote_exec "$master_ip" "$MASTER_JOIN_CMD" "执行加入集群命令" - - # 配置该Master的kubeconfig - local kube_config_cmds=( - "mkdir -p \$HOME/.kube" - "cp -i /etc/kubernetes/admin.conf \$HOME/.kube/config" - "chown \$(id -u):\$(id -g) \$HOME/.kube/config" - ) - for cmd in "${kube_config_cmds[@]}"; do - remote_exec "$master_ip" "$cmd" "配置kubeconfig" - done - - success "Master $master_ip加入集群并配置完成" - done - - # 验证所有Master节点状态 - remote_exec "$MASTER1_IP" "kubectl get nodes | grep master" "验证Master节点状态" - success "所有Master节点加入完成" -} - - -# ========================== 步骤5:Node节点加入 ========================== -join_nodes() { - if [[ ${#NODE_IPS[@]} -eq 0 ]]; then - info "未指定Node节点,跳过Node加入步骤" - return - fi - - step "Step 5:远程Node节点加入集群(共${#NODE_IPS[@]}个)" - - for node_ip in "${NODE_IPS[@]}"; do - info "处理Node节点:$node_ip" - - # 检查Node节点是否已安装基础组件 - local check_cmd="command -v kubeadm &>/dev/null && command -v kubelet &>/dev/null && systemctl is-active --quiet containerd" - if ! 
ssh "$REMOTE_USER@$node_ip" "$check_cmd"; then - error "Node $node_ip未安装基础组件(需先安装kubeadm、kubelet、containerd)" - fi - - # 远程执行Node join命令 - remote_exec "$node_ip" "$NODE_JOIN_CMD" "执行加入集群命令" - - success "Node $node_ip加入集群完成" - done - - # 验证所有节点状态 - remote_exec "$MASTER1_IP" "kubectl get nodes" "验证所有节点状态" - success "所有Node节点加入完成" -} - - -# ========================== 步骤6:部署Calico网络插件 ========================== -deploy_calico() { - step "Step 6:部署Calico v3.25(华为云SWR镜像)" - - # 下载Calico YAML到本地 - info "下载Calico v3.25官方YAML:$CALICO_YAML_OFFICIAL" - if command -v wget &>/dev/null; then - wget -q -O "$LOCAL_CALICO_YAML" "$CALICO_YAML_OFFICIAL" || error "wget下载Calico YAML失败" - elif command -v curl &>/dev/null; then - curl -s -o "$LOCAL_CALICO_YAML" "$CALICO_YAML_OFFICIAL" || error "curl下载Calico YAML失败" - else - error "请先安装wget或curl" - fi - [[ -f "$LOCAL_CALICO_YAML" ]] || error "Calico YAML下载失败,文件不存在" - success "Calico YAML下载完成(本地路径:$LOCAL_CALICO_YAML)" - - # 替换YAML中的所有镜像地址为华为云SWR - info "替换镜像地址为华为云SWR:$SWR_CALICO_PREFIX" - sed -i "s#docker.io/calico/cni:v3.25.0#${SWR_CALICO_PREFIX}/cni:v3.25.0#g" "$LOCAL_CALICO_YAML" - sed -i "s#docker.io/calico/node:v3.25.0#${SWR_CALICO_PREFIX}/node:v3.25.0#g" "$LOCAL_CALICO_YAML" - sed -i "s#docker.io/calico/kube-controllers:v3.25.0#${SWR_CALICO_PREFIX}/kube-controllers:v3.25.0#g" "$LOCAL_CALICO_YAML" - - # 验证替换结果 - info "验证镜像替换结果(查看前10行含镜像的内容)" - grep -n "image:" "$LOCAL_CALICO_YAML" | head -n 10 || error "Calico YAML中未找到镜像配置,替换失败" - success "Calico镜像地址替换完成" - - # 将修改后的YAML上传到Master1节点 - local remote_calico_yaml="/opt/k8s-install-conf/calico-v3.25-swr.yaml" - info "上传修改后的YAML到Master1:$remote_calico_yaml" - scp -o StrictHostKeyChecking=no "$LOCAL_CALICO_YAML" "$REMOTE_USER@$MASTER1_IP:$remote_calico_yaml" || error "YAML上传失败" - EXECUTED_COMMANDS+=("上传Calico配置到 $MASTER1_IP: scp $LOCAL_CALICO_YAML $REMOTE_USER@$MASTER1_IP:$remote_calico_yaml") - success "YAML上传完成" - - # 在Master1执行Calico部署 - local deploy_cmd="kubectl apply -f 
$remote_calico_yaml" - remote_exec "$MASTER1_IP" "$deploy_cmd" "执行Calico部署" - - # 等待Calico Pod启动 - info "等待Calico Pod启动(最多5分钟)..." - local wait_time=0 - while true; do - local pod_status - pod_status=$(ssh "$REMOTE_USER@$MASTER1_IP" "kubectl get pods -n calico-system -o jsonpath='{.items[*].status.phase}' 2>/dev/null") - if [[ "$pod_status" =~ ^(Running\ )*Running$ && -n "$pod_status" ]]; then - break - fi - if [[ $wait_time -ge 300 ]]; then - error "Calico Pod启动超时(5分钟),请手动检查:kubectl get pods -n calico-system" - fi - sleep 10 - wait_time=$((wait_time + 10)) - info "已等待${wait_time}秒,Calico Pod状态:$pod_status" - done - - # 验证Calico状态 - remote_exec "$MASTER1_IP" "kubectl get pods -n calico-system" "验证Calico状态" - remote_exec "$MASTER1_IP" "kubectl get nodes -o wide | grep -E 'STATUS|Ready'" "验证节点网络状态" - success "Calico v3.25(华为云SWR)部署完成,集群网络已就绪" - - # 清理本地临时YAML文件 - info "清理本地临时文件:$LOCAL_CALICO_YAML" - rm -f "$LOCAL_CALICO_YAML" || info "本地文件清理失败,可手动删除" -} - - -# ========================== 步骤7:展示执行的所有命令 ========================== -show_executed_commands() { - step "Step 7:执行命令汇总" - echo -e "${YELLOW}以下是部署过程中执行的关键命令:${NC}" - echo -e "----------------------------------------------------------------------" - local count=1 - for cmd in "${EXECUTED_COMMANDS[@]}"; do - echo -e "${count}. ${cmd}" - ((count++)) - done - echo -e "----------------------------------------------------------------------" -} - - -# ========================== 主流程 ========================== -main() { - # 记录开始时间 - local start_time=$(date +%s) - - # 1. 解析参数 - parse_args "$@" - - # 2. 检查SSH免密登录 - check_ssh_access - - # 3. 执行核心步骤 - init_master1 - config_master1_kubeconfig - extract_join_commands - join_other_masters - join_nodes - deploy_calico - - # 4. 展示执行的命令 - show_executed_commands - - # 5. 集群部署完成总结 - step "集群部署完成总结" - local end_time=$(date +%s) - local duration=$((end_time - start_time)) - echo -e "${GREEN}========================================" - echo -e "K8s集群部署全流程完成!" 
- echo -e "部署耗时:$((duration/60))分$((duration%60))秒" - echo -e "关键信息:" - echo -e " 1. Calico版本:v3.25(镜像源:华为云SWR)" - echo -e " 2. 集群节点状态:ssh $REMOTE_USER@$MASTER1_IP 'kubectl get nodes'" - echo -e " 3. Calico状态:ssh $REMOTE_USER@$MASTER1_IP 'kubectl get pods -n calico-system'" - echo -e " 4. 初始化结果:$INIT_RESULT_FILE" - echo -e " 5. 部署日志:$LOG_FILE" - echo -e "========================================" -} - -# 启动主流程 -main "$@" +# -------------------------- 第六步:执行结果提示 -------------------------- +echo -e "\n=== 所有节点处理完成 ===" +echo "验证集群节点状态命令:kubectl get nodes" +echo "验证控制平面组件状态命令:kubectl get pods -n kube-system -l 'component in (kube-apiserver,kube-controller-manager,kube-scheduler)'"