ansible-devops/scripts/k8s-cluster-deploy.sh

404 lines
15 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
set -euo pipefail
# ========================== Global configuration and helper functions ==========================
# ANSI color escape sequences; stored as literal backslash text and expanded
# later by `echo -e` in the print helpers. NC resets the color.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
# Record of executed commands (shown in the summary at the end)
declare -a EXECUTED_COMMANDS=()
# Log file for this run; the name embeds the timestamp at which the script started
LOG_FILE="/var/log/k8s-deploy-$(date +%Y%m%d-%H%M%S).log"
# Print helpers.
#
# info MESSAGE
#   Writes MESSAGE, prefixed with a yellow [INFO] tag, to stdout and appends the
#   same line to $LOG_FILE. Backslash escapes inside MESSAGE are expanded.
info() {
  local line="${YELLOW}[INFO]${NC} $1"
  printf '%b\n' "$line" | tee -a "$LOG_FILE"
}
# success MESSAGE
#   Writes MESSAGE, prefixed with a green [SUCCESS] tag, to stdout and appends
#   the same line to $LOG_FILE. Backslash escapes inside MESSAGE are expanded.
success() {
  local line="${GREEN}[SUCCESS]${NC} $1"
  printf '%b\n' "$line" | tee -a "$LOG_FILE"
}
# error MESSAGE
#   Prints MESSAGE, prefixed with a red [ERROR] tag, to stderr, appends the same
#   line to $LOG_FILE, and terminates the script with exit status 1.
error() {
  # Diagnostics belong on stderr so they never mix into captured stdout.
  # `|| true`: an unwritable log file must not interfere with the exit path.
  echo -e "${RED}[ERROR]${NC} ${1:-}" | tee -a "$LOG_FILE" >&2 || true
  exit 1
}
# step TITLE
#   Writes a blank line followed by a blue "===== TITLE =====" banner to stdout
#   and appends the same output to $LOG_FILE.
step() {
  local banner="${BLUE}===== $1 =====${NC}"
  printf '\n%b\n' "$banner" | tee -a "$LOG_FILE"
}
# Run a command on a remote host over ssh and record it.
#
# Arguments:
#   $1 - IP/host to connect to (as $REMOTE_USER)
#   $2 - command line executed by the remote shell
#   $3 - optional description used in the log line (default: "执行命令")
# Globals:
#   EXECUTED_COMMANDS (appended), LOG_FILE (remote stdout/stderr appended),
#   REMOTE_USER, YELLOW, NC (read)
# Calls error (which terminates the script) when ssh returns non-zero.
remote_exec() {
  local host=$1 command_line=$2 label=${3:-"执行命令"}
  local -a ssh_opts=(-o StrictHostKeyChecking=no -o ConnectTimeout=10)

  info "${host}${label}: ${YELLOW}${command_line}${NC}"
  EXECUTED_COMMANDS+=("${host} 上: ${command_line}")

  # Remote output goes to the log file only; success ends the function here.
  ssh "${ssh_opts[@]}" "${REMOTE_USER}@${host}" "$command_line" >> "$LOG_FILE" 2>&1 && return 0
  error "${host} 上执行命令失败: ${command_line},请查看日志: ${LOG_FILE}"
}
# Fixed paths (adjust to match the actual environment)
KUBEADM_CONF="/opt/k8s-install-conf/kubeadm-conf.yaml" # kubeadm config file on the master nodes
INIT_RESULT_FILE="/opt/k8s-install-conf/kubeadm-init-result.txt" # file the kubeadm init output is saved to
# URL the Calico manifest is downloaded from (see deploy_calico)
CALICO_YAML_OFFICIAL="https://docs.projectcalico.org/v3.25/manifests/calico.yaml"
# Local path the downloaded manifest is written to and edited in place
LOCAL_CALICO_YAML="/tmp/calico-v3.25-modified.yaml"
# Registry prefix substituted for the docker.io/calico image references
SWR_CALICO_PREFIX="swr.cn-north-4.myhuaweicloud.com/ddn-k8s/docker.io/calico"
REMOTE_USER="root" # login user on the remote nodes
# ========================== Argument parsing ==========================
MASTER_IPS=() # the 3 master node IPs, in order: Master1, Master2, Master3
NODE_IPS=() # list of worker (Node) IPs
# Parse the command-line options and populate the node lists.
#
# Options:
#   --master-ips IP1,IP2,IP3   required; exactly three comma-separated IPs
#   --node-ips   IP1,IP2,...   optional; comma-separated worker IPs
#   --help                     print usage and exit 0
# Globals written: MASTER_IPS, NODE_IPS; exports MASTER1_IP, MASTER2_IP, MASTER3_IP.
# Any invalid input is reported through error (which terminates the script).
parse_args() {
  local -a ips=()
  local ip
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --master-ips)
        # Check for the value explicitly: under `set -u` a bare "$2" would
        # abort with a cryptic "unbound variable" message instead.
        [[ -n "${2:-}" ]] || error "$1 缺少参数值 (执行 $0 --help 查看用法)"
        IFS=',' read -ra ips <<< "$2"
        if [[ ${#ips[@]} -ne 3 ]]; then
          error "--master-ips必须包含3个IP逗号分隔当前提供${#ips[@]}"
        fi
        for ip in "${ips[@]}"; do
          [[ -n "$ip" ]] || error "--master-ips 中存在空的IP: $2"
        done
        MASTER_IPS=("${ips[@]}")
        shift 2
        ;;
      --node-ips)
        [[ $# -ge 2 ]] || error "$1 缺少参数值 (执行 $0 --help 查看用法)"
        IFS=',' read -ra ips <<< "$2"
        # ${ips[@]+...} keeps an empty list safe under `set -u` on bash < 4.4.
        NODE_IPS=(${ips[@]+"${ips[@]}"})
        shift 2
        ;;
      --help)
        echo "用法:$0 --master-ips <IP1,IP2,IP3> [--node-ips <IP1,IP2,...>]"
        echo "功能部署K8s集群包含3个Master节点和可选的Node节点"
        echo "参数说明:"
        echo " --master-ips 必选3个Master节点IP顺序Master1(初始化节点), Master2, Master3"
        echo " --node-ips 可选Node节点IP列表逗号分隔如不指定则仅部署Master集群"
        echo "示例:"
        echo " $0 --master-ips 192.168.61.10,192.168.61.11,192.168.61.12 --node-ips 192.168.61.20,192.168.61.21"
        exit 0
        ;;
      *)
        error "未知参数:$1(执行$0 --help查看用法"
        ;;
    esac
  done

  # Validate the required option
  if [[ ${#MASTER_IPS[@]} -eq 0 ]]; then
    error "缺少必选参数--master-ips需指定3个Master节点IP"
  fi

  # Expose the individual master IPs
  export MASTER1_IP="${MASTER_IPS[0]}"
  export MASTER2_IP="${MASTER_IPS[1]}"
  export MASTER3_IP="${MASTER_IPS[2]}"

  info "已识别节点:"
  info "Master1初始化$MASTER1_IP"
  info "Master2$MASTER2_IP"
  info "Master3$MASTER3_IP"
  # A real if/else: with `A && B || C`, C would also run whenever B failed.
  if [[ ${#NODE_IPS[@]} -gt 0 ]]; then
    info "Node节点${NODE_IPS[*]}"
  else
    info "未指定Node节点仅部署Master集群"
  fi
}
# ========================== Passwordless SSH check ==========================
# Verify that every master and worker node accepts key-based (BatchMode) ssh
# logins as $REMOTE_USER.
#
# All nodes are probed before failing, so one run reports every host that
# still needs `ssh-copy-id`; error (which terminates the script) is called
# once with the full list of failing IPs.
check_ssh_access() {
  step "前置检查SSH免密登录验证"
  local -a all_ips=("${MASTER_IPS[@]}")
  # Appended separately so an empty NODE_IPS is safe under `set -u` on bash < 4.4.
  if [[ ${#NODE_IPS[@]} -gt 0 ]]; then
    all_ips+=("${NODE_IPS[@]}")
  fi
  local -a failed_ips=()
  local ip
  for ip in "${all_ips[@]}"; do
    info "验证SSH免密登录$REMOTE_USER@$ip"
    if ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o BatchMode=yes \
      "$REMOTE_USER@$ip" "echo 'SSH连通正常'" >/dev/null 2>&1; then
      info "$ip SSH免密登录验证通过"
    else
      # Only record the failure here; calling error would exit on the first
      # bad host and the aggregated report below would never be reached.
      info "$ip SSH免密登录验证失败请先执行: ssh-copy-id $REMOTE_USER@$ip 配置免密"
      failed_ips+=("$ip")
    fi
  done
  if [[ ${#failed_ips[@]} -gt 0 ]]; then
    error "以下节点SSH免密登录配置失败: ${failed_ips[*]}"
  fi
  success "所有节点SSH免密登录验证通过"
}
# ========================== Step 1: initialize Master1 ==========================
# Run `kubeadm init` on Master1 over ssh and save its output locally.
#
# Globals read: REMOTE_USER, MASTER1_IP, KUBEADM_CONF, INIT_RESULT_FILE.
# Globals written: EXECUTED_COMMANDS (appended).
# The combined stdout/stderr of kubeadm is streamed to the terminal and written
# to $INIT_RESULT_FILE on the machine running this script. Calls error (which
# terminates the script) if the config file is missing on Master1, if kubeadm
# fails, or if the output cannot be saved.
init_master1() {
  step "Step 1Master1节点初始化$MASTER1_IP"

  # Make sure the kubeadm config file exists on Master1
  if ! ssh "$REMOTE_USER@$MASTER1_IP" "test -f $KUBEADM_CONF"; then
    error "Master1的kubeadm配置文件不存在$MASTER1_IP:$KUBEADM_CONF(请先执行配置分发脚本)"
  fi

  # The result file is written locally, so its directory must exist here.
  mkdir -p "$(dirname "$INIT_RESULT_FILE")" \
    || error "无法创建初始化结果目录: $(dirname "$INIT_RESULT_FILE")"

  local init_cmd="kubeadm init --upload-certs --config $KUBEADM_CONF 2>&1"
  info "执行初始化命令:$init_cmd"
  info "初始化结果将保存到:$INIT_RESULT_FILE"

  # Capture the status of both pipeline stages. The `{ ...; } || true` wrapper
  # suspends errexit for the group: with `set -e -o pipefail` a failing ssh
  # would otherwise abort the script right at the pipeline, before the
  # explicit checks (and their friendly messages) below could run.
  local -a pipe_status=(0 0)
  {
    ssh "$REMOTE_USER@$MASTER1_IP" "$init_cmd" | tee "$INIT_RESULT_FILE"
    pipe_status=("${PIPESTATUS[@]}")
  } || true

  if [[ ${pipe_status[0]} -ne 0 ]]; then
    error "Master1初始化失败查看详情cat $INIT_RESULT_FILE"
  fi
  if [[ ${pipe_status[1]:-0} -ne 0 ]]; then
    error "初始化结果保存失败: $INIT_RESULT_FILE"
  fi
  # The output contains the bootstrap token and certificate key: owner-only.
  chmod 600 "$INIT_RESULT_FILE" || true

  EXECUTED_COMMANDS+=("$MASTER1_IP 上初始化: $init_cmd")
  success "Master1初始化完成结果已保存到$INIT_RESULT_FILE"
  info "初始化输出预览前20行"
  head -n 20 "$INIT_RESULT_FILE"
}
# ========================== Step 2: configure kubeconfig on Master1 ==========================
# Install /etc/kubernetes/admin.conf as the remote user's ~/.kube/config on
# Master1, persist KUBECONFIG in ~/.bash_profile, and verify with
# `kubectl get nodes`. Every command goes through remote_exec, i.e. each one
# runs in its own ssh session.
config_master1_kubeconfig() {
  step "Step 2配置Master1的kubeconfig"

  local profile_line='export KUBECONFIG=/etc/kubernetes/admin.conf'
  local -a kube_cmds=(
    "mkdir -p \$HOME/.kube"
    # -f instead of -i: an interactive overwrite prompt cannot be answered in
    # a non-interactive ssh session, so re-runs must overwrite silently.
    "cp -f /etc/kubernetes/admin.conf \$HOME/.kube/config"
    "chown \$(id -u):\$(id -g) \$HOME/.kube/config"
    # Append the export only once so repeated runs do not grow the profile.
    "grep -qxF -- '$profile_line' ~/.bash_profile 2>/dev/null || echo '$profile_line' >> ~/.bash_profile"
    # No `source ~/.bash_profile`: it would only affect its own ssh session.
  )
  local cmd
  for cmd in "${kube_cmds[@]}"; do
    remote_exec "$MASTER1_IP" "$cmd" "配置kubeconfig"
  done

  # Verify that kubectl works
  remote_exec "$MASTER1_IP" "kubectl get nodes" "验证kubectl"
  success "Master1 kubeconfig配置完成"
}
# ========================== Step 3: extract the join commands ==========================
# _extract_join_cmd FILE ROLE
#   Prints, on a single line, the first complete `kubeadm join ...` command in
#   FILE that matches ROLE: "control-plane" selects a command containing
#   --control-plane, any other ROLE selects one without it. Backslash line
#   continuations are joined and runs of blanks collapsed to one space.
#   Prints nothing when no matching command is found.
_extract_join_cmd() {
  local file=$1 role=$2
  awk -v role="$role" '
    /^[ \t]*kubeadm join / { buf = ""; active = 1 }
    active {
      line = $0
      more = (line ~ /\\[ \t]*$/)          # line ends with a continuation "\"
      sub(/[ \t]*\\[ \t]*$/, "", line)     # drop the continuation marker
      buf = buf " " line
      if (!more) {
        active = 0
        gsub(/[ \t]+/, " ", buf)
        sub(/^ /, "", buf)
        sub(/ $/, "", buf)
        is_cp = (buf ~ /--control-plane/)
        if ((role == "control-plane" && is_cp) || (role != "control-plane" && !is_cp)) {
          print buf
          exit
        }
      }
    }
  ' "$file"
}

# Read the saved `kubeadm init` output ($INIT_RESULT_FILE) and export the two
# join commands:
#   MASTER_JOIN_CMD - control-plane join command with " --config $KUBEADM_CONF" appended
#   NODE_JOIN_CMD   - worker join command
# Both are also appended to EXECUTED_COMMANDS and echoed to stdout. Calls
# error (which terminates the script) when the file or a command is missing.
extract_join_commands() {
  step "Step 3从初始化结果中提取join命令"

  # The init result file must exist
  [[ -f "$INIT_RESULT_FILE" ]] || error "初始化结果文件不存在:$INIT_RESULT_FILE"

  # --- control-plane join command ---
  info "提取Master节点join命令"
  local master_join_cmd
  # `|| ...=""` keeps errexit from aborting before the explicit check below.
  master_join_cmd=$(_extract_join_cmd "$INIT_RESULT_FILE" control-plane) || master_join_cmd=""
  if [[ -z "$master_join_cmd" ]]; then
    error "未从初始化结果中提取到Master join命令"
  fi
  # Append the --config parameter.
  # NOTE(review): kubeadm may refuse --config combined with --token /
  # --control-plane style flags; confirm against the kubeadm version in use.
  master_join_cmd="$master_join_cmd --config $KUBEADM_CONF"
  export MASTER_JOIN_CMD="$master_join_cmd"
  EXECUTED_COMMANDS+=("Master节点加入命令: $MASTER_JOIN_CMD")
  success "Master join命令提取完成"
  echo -e "${YELLOW}${MASTER_JOIN_CMD}${NC}"

  # --- worker join command ---
  info "提取Node节点join命令"
  local node_join_cmd
  node_join_cmd=$(_extract_join_cmd "$INIT_RESULT_FILE" worker) || node_join_cmd=""
  if [[ -z "$node_join_cmd" ]]; then
    error "未从初始化结果中提取到Node join命令"
  fi
  export NODE_JOIN_CMD="$node_join_cmd"
  EXECUTED_COMMANDS+=("Node节点加入命令: $NODE_JOIN_CMD")
  success "Node join命令提取完成"
  echo -e "${YELLOW}${NODE_JOIN_CMD}${NC}"
}
# ========================== Step 4: join the remaining masters ==========================
# Join Master2 and Master3 to the cluster using $MASTER_JOIN_CMD, install the
# admin kubeconfig for the remote user on each, then list the control-plane
# nodes from Master1. Calls error (which terminates the script) when a
# master lacks $KUBEADM_CONF; remote_exec handles remote command failures.
join_other_masters() {
  step "Step 4远程Master2$MASTER2_IP和Master3$MASTER3_IP)加入集群"

  # Masters that still have to join
  local -a other_masters=("$MASTER2_IP" "$MASTER3_IP")
  local -a kube_config_cmds=(
    "mkdir -p \$HOME/.kube"
    # -f instead of -i: an overwrite prompt cannot be answered over a
    # non-interactive ssh session.
    "cp -f /etc/kubernetes/admin.conf \$HOME/.kube/config"
    "chown \$(id -u):\$(id -g) \$HOME/.kube/config"
  )
  local master_ip cmd
  for master_ip in "${other_masters[@]}"; do
    info "处理Master节点$master_ip"

    # The kubeadm config file must exist on this master
    if ! ssh "$REMOTE_USER@$master_ip" "test -f $KUBEADM_CONF"; then
      error "Master $master_ip的配置文件不存在$KUBEADM_CONF"
    fi

    # Run the join command remotely
    remote_exec "$master_ip" "$MASTER_JOIN_CMD" "执行加入集群命令"

    # Configure kubeconfig on this master
    for cmd in "${kube_config_cmds[@]}"; do
      remote_exec "$master_ip" "$cmd" "配置kubeconfig"
    done
    success "Master $master_ip加入集群并配置完成"
  done

  # Verify the master nodes. Matching "control-plane" as well as "master"
  # keeps the check from failing when the ROLES column only shows
  # "control-plane" and the host names do not contain "master".
  remote_exec "$MASTER1_IP" "kubectl get nodes | grep -E 'master|control-plane'" "验证Master节点状态"
  success "所有Master节点加入完成"
}
# ========================== Step 5: join the worker nodes ==========================
# Join every IP in NODE_IPS to the cluster with $NODE_JOIN_CMD.
#
# Does nothing (beyond an info line) when NODE_IPS is empty. For each node it
# first checks over ssh that kubeadm and kubelet are on PATH and containerd is
# active, calling error (which terminates the script) otherwise. After all
# nodes joined, `kubectl get nodes` is run on Master1.
join_nodes() {
  local total=${#NODE_IPS[@]}
  if (( total == 0 )); then
    info "未指定Node节点跳过Node加入步骤"
    return
  fi

  step "Step 5远程Node节点加入集群${total}个)"

  # Prerequisite probe executed on each node before joining
  local precheck="command -v kubeadm &>/dev/null && command -v kubelet &>/dev/null && systemctl is-active --quiet containerd"
  local target
  for target in "${NODE_IPS[@]}"; do
    info "处理Node节点${target}"
    ssh "${REMOTE_USER}@${target}" "$precheck" \
      || error "Node ${target}未安装基础组件需先安装kubeadm、kubelet、containerd"
    remote_exec "$target" "$NODE_JOIN_CMD" "执行加入集群命令"
    success "Node ${target}加入集群完成"
  done

  # Show the state of all nodes
  remote_exec "$MASTER1_IP" "kubectl get nodes" "验证所有节点状态"
  success "所有Node节点加入完成"
}
# ========================== Step 6: deploy the Calico network plugin ==========================
# Download the Calico v3.25 manifest, point its images at the Huawei Cloud SWR
# mirror, upload it to Master1, apply it, and wait (up to 5 minutes) for the
# Calico pods to reach the Running phase.
#
# Globals read: CALICO_YAML_OFFICIAL, LOCAL_CALICO_YAML, SWR_CALICO_PREFIX,
#               REMOTE_USER, MASTER1_IP.
# Globals written: EXECUTED_COMMANDS (appended).
# Calls error (which terminates the script) on download, upload or timeout
# failures. The local manifest copy is removed on success.
deploy_calico() {
  step "Step 6部署Calico v3.25华为云SWR镜像"

  # --- download the manifest locally ---
  info "下载Calico v3.25官方YAML$CALICO_YAML_OFFICIAL"
  if command -v wget &>/dev/null; then
    wget -q -O "$LOCAL_CALICO_YAML" "$CALICO_YAML_OFFICIAL" || error "wget下载Calico YAML失败"
  elif command -v curl &>/dev/null; then
    # -f: fail on HTTP errors instead of saving the error page as "YAML";
    # -L: follow redirects (wget does so by default).
    curl -fsSL -o "$LOCAL_CALICO_YAML" "$CALICO_YAML_OFFICIAL" || error "curl下载Calico YAML失败"
  else
    error "请先安装wget或curl"
  fi
  # -s: an empty download counts as a failure as well
  [[ -s "$LOCAL_CALICO_YAML" ]] || error "Calico YAML下载失败文件不存在"
  success "Calico YAML下载完成本地路径$LOCAL_CALICO_YAML"

  # --- rewrite every image reference to the SWR mirror ---
  info "替换镜像地址为华为云SWR$SWR_CALICO_PREFIX"
  sed -i \
    -e "s#docker.io/calico/cni:v3.25.0#${SWR_CALICO_PREFIX}/cni:v3.25.0#g" \
    -e "s#docker.io/calico/node:v3.25.0#${SWR_CALICO_PREFIX}/node:v3.25.0#g" \
    -e "s#docker.io/calico/kube-controllers:v3.25.0#${SWR_CALICO_PREFIX}/kube-controllers:v3.25.0#g" \
    "$LOCAL_CALICO_YAML"

  # --- verify the result ---
  info "验证镜像替换结果查看前10行含镜像的内容"
  grep -n "image:" "$LOCAL_CALICO_YAML" | head -n 10 || error "Calico YAML中未找到镜像配置替换失败"
  # Warn (without aborting) when an image still points straight at docker.io,
  # e.g. because the manifest carries a different patch tag than v3.25.0.
  if grep -Eq 'image:[[:space:]]*docker\.io/calico/' "$LOCAL_CALICO_YAML"; then
    info "警告: YAML中仍存在未替换的docker.io/calico镜像, 请确认镜像版本是否为v3.25.0"
  fi
  success "Calico镜像地址替换完成"

  # --- upload the modified manifest to Master1 ---
  local remote_calico_yaml="/opt/k8s-install-conf/calico-v3.25-swr.yaml"
  info "上传修改后的YAML到Master1$remote_calico_yaml"
  scp -o StrictHostKeyChecking=no "$LOCAL_CALICO_YAML" "$REMOTE_USER@$MASTER1_IP:$remote_calico_yaml" || error "YAML上传失败"
  EXECUTED_COMMANDS+=("上传Calico配置到 $MASTER1_IP: scp $LOCAL_CALICO_YAML $REMOTE_USER@$MASTER1_IP:$remote_calico_yaml")
  success "YAML上传完成"

  # --- apply it on Master1 ---
  local deploy_cmd="kubectl apply -f $remote_calico_yaml"
  remote_exec "$MASTER1_IP" "$deploy_cmd" "执行Calico部署"

  # --- wait for the Calico pods ---
  # The calico.yaml manifest installs into kube-system (calico-system is only
  # used by the operator-based install), so select the pods there by label.
  local pods_query="kubectl get pods -n kube-system -l 'k8s-app in (calico-node,calico-kube-controllers)'"
  local status_cmd="$pods_query -o jsonpath='{.items[*].status.phase}' 2>/dev/null"
  local all_running_re='^(Running )*Running$'
  local wait_time=0 pod_status
  info "等待Calico Pod启动最多5分钟..."
  while true; do
    # A transient ssh/kubectl failure means "not ready yet", not a fatal error.
    pod_status=$(ssh "$REMOTE_USER@$MASTER1_IP" "$status_cmd") || pod_status=""
    if [[ "$pod_status" =~ $all_running_re ]]; then
      break
    fi
    if [[ $wait_time -ge 300 ]]; then
      error "Calico Pod启动超时5分钟请手动检查kubectl get pods -n kube-system"
    fi
    sleep 10
    wait_time=$((wait_time + 10))
    info "已等待${wait_time}秒, Calico Pod状态: $pod_status"
  done

  # --- final verification ---
  remote_exec "$MASTER1_IP" "$pods_query" "验证Calico状态"
  remote_exec "$MASTER1_IP" "kubectl get nodes -o wide | grep -E 'STATUS|Ready'" "验证节点网络状态"
  success "Calico v3.25华为云SWR部署完成集群网络已就绪"

  # --- remove the local temporary manifest ---
  info "清理本地临时文件:$LOCAL_CALICO_YAML"
  rm -f "$LOCAL_CALICO_YAML" || info "本地文件清理失败,可手动删除"
}
# ========================== Step 7: show all executed commands ==========================
# Print a numbered list (starting at 1) of every entry in EXECUTED_COMMANDS,
# framed by two separator lines. Backslash escapes in the entries are expanded.
show_executed_commands() {
  local rule="----------------------------------------------------------------------"
  local entry
  local seq=0

  step "Step 7执行命令汇总"
  printf '%b\n' "${YELLOW}以下是部署过程中执行的关键命令:${NC}"
  printf '%b\n' "$rule"
  for entry in "${EXECUTED_COMMANDS[@]}"; do
    seq=$((seq + 1))
    printf '%b\n' "${seq}. ${entry}"
  done
  printf '%b\n' "$rule"
}
# ========================== Main flow ==========================
# Run the whole deployment: parse the arguments, check ssh access, run the
# deployment steps in order, list the executed commands and print a summary
# with the elapsed time.
#
# Arguments: the script's command-line arguments (forwarded to parse_args).
main() {
  # Declared separately from the assignments so a failing `date` is not
  # masked by the exit status of `local`.
  local start_time end_time duration
  start_time=$(date +%s)

  # 1. Parse the arguments
  parse_args "$@"

  # 2. Check passwordless ssh access
  check_ssh_access

  # 3. Core steps
  init_master1
  config_master1_kubeconfig
  extract_join_commands
  join_other_masters
  join_nodes
  deploy_calico

  # 4. Show the executed commands
  show_executed_commands

  # 5. Summary
  step "集群部署完成总结"
  end_time=$(date +%s)
  duration=$((end_time - start_time))
  echo -e "${GREEN}========================================"
  echo -e "K8s集群部署全流程完成"
  # Units are required: "1" and "5" printed back to back would read as "15".
  echo -e "部署耗时:$((duration / 60))分$((duration % 60))秒"
  echo -e "关键信息:"
  echo -e " 1. Calico版本v3.25镜像源华为云SWR"
  echo -e " 2. 集群节点状态ssh $REMOTE_USER@$MASTER1_IP 'kubectl get nodes'"
  # The calico.yaml manifest deploys its pods into kube-system.
  echo -e " 3. Calico状态ssh $REMOTE_USER@$MASTER1_IP 'kubectl get pods -n kube-system | grep calico'"
  echo -e " 4. 初始化结果:$INIT_RESULT_FILE"
  echo -e " 5. 部署日志:$LOG_FILE"
  # Reset the color so the terminal does not stay green after the summary.
  echo -e "========================================${NC}"
}
# Entry point: start the main flow with the script's command-line arguments
main "$@"