ansible-devops/scripts/k8s-cluster-deploy.sh

341 lines
15 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
set -euo pipefail
# ========================== Global configuration and helper functions ==========================
# ANSI color escape sequences; they are interpreted by `echo -e` in the helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # reset to the terminal default

# Logging helpers. Each takes the message as $1 (a missing argument is treated
# as an empty message instead of tripping `set -u`).
#   info / success / step : write to stdout
#   error                 : writes to stderr, then exits the script with status 1
info() { echo -e "${YELLOW}[INFO]${NC} ${1-}"; }
success() { echo -e "${GREEN}[SUCCESS]${NC} ${1-}"; }
# Diagnostics belong on stderr so they are not swallowed when stdout is
# redirected or captured.
error() {
  echo -e "${RED}[ERROR]${NC} ${1-}" >&2
  exit 1
}
step() { echo -e "\n${BLUE}===== ${1-} =====${NC}"; }
# Deployment settings. Every value can be overridden from the environment
# (e.g. `REMOTE_USER=ops ./k8s-cluster-deploy.sh ...`); the defaults below are
# used when the variable is unset or empty.

# kubeadm configuration file; the same path is checked on every master node.
KUBEADM_CONF="${KUBEADM_CONF:-/opt/k8s-install-conf/kubeadm-conf.yaml}"
# Local file that receives the output of `kubeadm init`.
INIT_RESULT_FILE="${INIT_RESULT_FILE:-/opt/k8s-install-conf/kubeadm-init-result.txt}"
# Calico v3.25 manifest URL.
CALICO_YAML_OFFICIAL="${CALICO_YAML_OFFICIAL:-https://docs.projectcalico.org/v3.25/manifests/calico.yaml}"
# Local scratch copy of the manifest, used while rewriting image references.
LOCAL_CALICO_YAML="${LOCAL_CALICO_YAML:-/tmp/calico-v3.25-modified.yaml}"
# Huawei Cloud SWR registry prefix that replaces docker.io/calico.
SWR_CALICO_PREFIX="${SWR_CALICO_PREFIX:-swr.cn-north-4.myhuaweicloud.com/ddn-k8s/docker.io/calico}"
# Login user for all remote nodes.
REMOTE_USER="${REMOTE_USER:-root}"
# ========================== Argument parsing: Master / Node IPs ==========================
# Filled in by parse_args.
MASTER_IPS=() # exactly 3 master IPs, in order: Master1, Master2, Master3
NODE_IPS=()   # worker node IPs; optional, may stay empty

#######################################
# Parse the command line.
# Globals:
#   MASTER_IPS, NODE_IPS (written)
#   MASTER1_IP, MASTER2_IP, MASTER3_IP (written and exported)
# Arguments:
#   --master-ips <IP1,IP2,IP3>   required, comma separated, exactly 3 entries
#   --node-ips <IP1,IP2,...>     optional, comma separated
#   --help                       print usage and exit 0
# Exits (via error) on an unknown option, an option without a value, an empty
# list entry, a wrong number of master IPs, or a missing --master-ips.
#######################################
parse_args() {
  local -a ips=()
  local ip
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --master-ips | --node-ips)
        # Without this guard a trailing option hits an unbound "$2" and a
        # failing `shift 2`, which gives no usable message.
        [[ $# -ge 2 ]] || error "${1}缺少参数值(执行$0 --help查看用法)"
        ips=()
        IFS=',' read -ra ips <<< "$2"
        # "a,,b" yields an empty element; reject it before it reaches ssh.
        for ip in ${ips[@]+"${ips[@]}"}; do
          [[ -n "$ip" ]] || error "${1}包含空的IP项:$2"
        done
        if [[ "$1" == "--master-ips" ]]; then
          # Expected format: "192.168.61.10,192.168.61.11,192.168.61.12"
          if [[ ${#ips[@]} -ne 3 ]]; then
            error "--master-ips必须包含3个IP逗号分隔当前提供${#ips[@]}"
          fi
          MASTER_IPS=("${ips[@]}")
        else
          # Expected format: "192.168.61.20,192.168.61.21"
          NODE_IPS=(${ips[@]+"${ips[@]}"})
        fi
        shift 2
        ;;
      --help)
        echo "用法:$0 --master-ips <IP1,IP2,IP3> [--node-ips <IP1,IP2,...>]"
        echo "功能Master1初始化→远程Master/Node加入→Calico(v3.25+华为云SWR)部署"
        echo "参数说明:"
        echo " --master-ips 必选3个Master节点IP顺序Master1(初始化节点), Master2, Master3"
        echo " --node-ips 可选Node节点IP列表逗号分隔如不指定则仅部署Master集群"
        echo "示例:"
        echo " $0 --master-ips 192.168.61.10,192.168.61.11,192.168.61.12 --node-ips 192.168.61.20,192.168.61.21"
        exit 0
        ;;
      *)
        error "未知参数:$1(执行$0 --help查看用法"
        ;;
    esac
  done

  # --master-ips is mandatory.
  if [[ ${#MASTER_IPS[@]} -eq 0 ]]; then
    error "缺少必选参数--master-ips需指定3个Master节点IP"
  fi

  # Master1 is the node on which `kubeadm init` runs.
  export MASTER1_IP="${MASTER_IPS[0]}"
  export MASTER2_IP="${MASTER_IPS[1]}"
  export MASTER3_IP="${MASTER_IPS[2]}"

  info "已识别节点:"
  info "Master1初始化${MASTER1_IP}"
  info "Master2${MASTER2_IP}"
  info "Master3${MASTER3_IP}"
  if [[ ${#NODE_IPS[@]} -gt 0 ]]; then
    info "Node节点${NODE_IPS[*]}"
  else
    info "未指定Node节点仅部署Master集群"
  fi
}
# ========================== Step 1: initialize Master1 ==========================
#######################################
# Run `kubeadm init --upload-certs` on Master1 over ssh and keep a local copy
# of its output in INIT_RESULT_FILE.
# Globals:
#   REMOTE_USER, MASTER1_IP, KUBEADM_CONF, INIT_RESULT_FILE (read)
# Outputs:
#   streams the remote output to stdout and prints the first 20 lines again
# Exits (via error) when the remote config file is missing, the local result
# file cannot be prepared, or the init pipeline fails.
#######################################
init_master1() {
  step "Step 1Master1节点初始化${MASTER1_IP}"

  # The kubeadm config must already be present on Master1.
  if ! ssh "$REMOTE_USER@$MASTER1_IP" "test -f $KUBEADM_CONF"; then
    error "Master1的kubeadm配置文件不存在$MASTER1_IP:$KUBEADM_CONF(请先执行配置分发脚本)"
  fi

  # The result file is written on THIS host by tee, so its directory has to
  # exist here. The output contains the join token and certificate key, so the
  # file is created owner-only before anything is written to it.
  local result_dir
  result_dir=$(dirname -- "$INIT_RESULT_FILE")
  mkdir -p -- "$result_dir" || error "无法创建初始化结果目录:$result_dir"
  if ! { : >"$INIT_RESULT_FILE" && chmod 600 "$INIT_RESULT_FILE"; }; then
    error "无法写入初始化结果文件:$INIT_RESULT_FILE"
  fi

  info "执行初始化命令kubeadm init --upload-certs --config $KUBEADM_CONF"
  info "初始化结果将保存到:$INIT_RESULT_FILE"

  # Capture the status with `||` so that `set -e` does not kill the script
  # before the message below is printed. Detecting a failed ssh stage relies
  # on `set -o pipefail`, which the script enables at the top.
  local init_exit_code=0
  ssh "$REMOTE_USER@$MASTER1_IP" "kubeadm init --upload-certs --config $KUBEADM_CONF 2>&1" \
    | tee "$INIT_RESULT_FILE" || init_exit_code=$?
  if [[ $init_exit_code -ne 0 ]]; then
    error "Master1初始化失败查看详情cat $INIT_RESULT_FILE"
  fi

  success "Master1初始化完成结果已保存到$INIT_RESULT_FILE"
  info "初始化输出预览前20行"
  head -n 20 "$INIT_RESULT_FILE"
}
# ========================== Step 2: configure kubeconfig on Master1 ==========================
#######################################
# Install /etc/kubernetes/admin.conf as the remote user's ~/.kube/config on
# Master1, persist KUBECONFIG in ~/.bash_profile, and check that kubectl works.
# Globals:
#   REMOTE_USER, MASTER1_IP (read)
# Exits (via error) when any remote command fails.
#######################################
config_master1_kubeconfig() {
  step "Step 2配置Master1的kubeconfig"

  local cmd
  # Each entry runs in its own ssh session; \$ defers expansion to the remote
  # shell.
  #  - `cp -f` instead of `cp -i`: over non-interactive ssh the -i prompt
  #    cannot be answered, so an existing (stale) config could be kept.
  #  - the KUBECONFIG line is appended only when it is not already present, so
  #    re-running the script does not pile up duplicates.
  #  - the former `source ~/.bash_profile` step was dropped: it only affected
  #    its own short-lived ssh session and had no effect on later commands.
  local -a kube_cmds=(
    "mkdir -p \$HOME/.kube"
    "cp -f /etc/kubernetes/admin.conf \$HOME/.kube/config"
    "chown \$(id -u):\$(id -g) \$HOME/.kube/config"
    "grep -qxF 'export KUBECONFIG=/etc/kubernetes/admin.conf' ~/.bash_profile 2>/dev/null || echo 'export KUBECONFIG=/etc/kubernetes/admin.conf' >> ~/.bash_profile"
  )
  for cmd in "${kube_cmds[@]}"; do
    info "在Master1执行$cmd"
    ssh "$REMOTE_USER@$MASTER1_IP" "$cmd" || error "执行命令失败:$cmd"
  done

  # Smoke test: kubectl must be able to reach the API server.
  info "验证kubectl在Master1执行 kubectl get nodes"
  ssh "$REMOTE_USER@$MASTER1_IP" "kubectl get nodes" || error "kubectl配置失败无法获取节点信息"
  success "Master1 kubeconfig配置完成"
}
# ========================== Step 3: extract the join commands from the init output ==========================
#######################################
# Print every `kubeadm join ...` command found in a kubeadm output file, one
# per line, with backslash line continuations joined and whitespace squeezed.
# Arguments:
#   $1 - path to the saved `kubeadm init` output
#######################################
_collect_kubeadm_join_cmds() {
  awk '
    function emit() {
      gsub(/[ \t]+/, " ", cmd)
      sub(/^ /, "", cmd)
      sub(/ $/, "", cmd)
      print cmd
      collecting = 0
    }
    !collecting && /^[ \t]*kubeadm join / { collecting = 1; cmd = "" }
    collecting {
      line = $0
      sub(/\r$/, "", line)
      continued = (line ~ /\\[ \t]*$/)
      sub(/[ \t]*\\[ \t]*$/, "", line)
      cmd = cmd " " line
      if (!continued) emit()
    }
    END { if (collecting) emit() }
  ' "$1"
}

#######################################
# Derive the control-plane and worker join commands from INIT_RESULT_FILE.
# The commands are located by content (`kubeadm join` with or without
# --control-plane) rather than by the surrounding English sentence or a fixed
# line count, so blank lines and wording changes in the kubeadm output do not
# truncate them.
# Globals:
#   INIT_RESULT_FILE, KUBEADM_CONF (read)
#   MASTER_JOIN_CMD, NODE_JOIN_CMD (written and exported)
# Outputs:
#   echoes both commands (they contain the token and certificate key)
# Exits (via error) when the file is missing or a command cannot be found.
#######################################
extract_join_commands() {
  step "Step 3从初始化结果中提取join命令"
  [[ -f "$INIT_RESULT_FILE" ]] || error "初始化结果文件不存在:$INIT_RESULT_FILE"

  local joined_cmds
  joined_cmds=$(_collect_kubeadm_join_cmds "$INIT_RESULT_FILE") || joined_cmds=""

  # 1. Control-plane join command (carries --control-plane and --certificate-key).
  info "提取Master节点join命令"
  local master_join_cmd
  # `|| true`: a non-matching grep must reach the explicit check below instead
  # of aborting the script through `set -e`.
  master_join_cmd=$(grep -m1 -E -e ' --control-plane( |$)' <<< "$joined_cmds" || true)
  [[ -z "$master_join_cmd" ]] && error "未从初始化结果中提取到Master join命令"
  # Each additional master is pointed at its own config file.
  # NOTE(review): kubeadm documents that --config cannot be mixed with flags
  # such as --token / --discovery-token-ca-cert-hash; confirm this combination
  # is accepted by the kubeadm version in use.
  master_join_cmd="$master_join_cmd --config $KUBEADM_CONF"
  export MASTER_JOIN_CMD="$master_join_cmd"
  success "Master join命令提取完成"
  echo -e "${YELLOW}${MASTER_JOIN_CMD}${NC}"

  # 2. Worker join command (the join command without --control-plane).
  info "提取Node节点join命令"
  local node_join_cmd
  node_join_cmd=$(grep -v -m1 -E -e ' --control-plane( |$)' <<< "$joined_cmds" || true)
  [[ -z "$node_join_cmd" ]] && error "未从初始化结果中提取到Node join命令"
  export NODE_JOIN_CMD="$node_join_cmd"
  success "Node join命令提取完成"
  echo -e "${YELLOW}${NODE_JOIN_CMD}${NC}"
}
# ========================== Step 4: join the remaining master nodes ==========================
#######################################
# Run the control-plane join command on Master2 and Master3 over ssh, install
# their kubeconfig, then list the control-plane nodes from Master1.
# Globals:
#   REMOTE_USER, MASTER1_IP, MASTER2_IP, MASTER3_IP, KUBEADM_CONF,
#   MASTER_JOIN_CMD (read)
# Exits (via error) when a config file is missing or any remote step fails.
#######################################
join_other_masters() {
  step "Step 4远程Master2${MASTER2_IP}和Master3${MASTER3_IP})加入集群"

  local master_ip
  # Master1 is already part of the cluster.
  local -a other_masters=("$MASTER2_IP" "$MASTER3_IP")
  for master_ip in "${other_masters[@]}"; do
    info "处理Master节点${master_ip}"

    # The join command references KUBEADM_CONF, so it must exist on this node.
    # (The message names the kubeadm config, which is what is being checked.)
    if ! ssh "$REMOTE_USER@$master_ip" "test -f $KUBEADM_CONF"; then
      error "Master ${master_ip}的kubeadm配置文件不存在$KUBEADM_CONF"
    fi

    info "${master_ip}执行join命令$MASTER_JOIN_CMD"
    ssh "$REMOTE_USER@$master_ip" "$MASTER_JOIN_CMD" || error "Master ${master_ip}加入失败"

    # Install the kubeconfig so kubectl works on this master as well.
    # `cp -f`: the -i prompt cannot be answered over non-interactive ssh.
    info "配置${master_ip}的kubeconfig"
    ssh "$REMOTE_USER@$master_ip" \
      "mkdir -p \$HOME/.kube && cp -f /etc/kubernetes/admin.conf \$HOME/.kube/config && chown \$(id -u):\$(id -g) \$HOME/.kube/config" \
      || error "Master ${master_ip}的kubeconfig配置失败"
    success "Master ${master_ip}加入集群并配置完成"
  done

  # Kubernetes >= 1.24 reports the role as "control-plane" and no longer as
  # "master"; accept both so the check works across versions.
  info "验证所有Master节点状态从Master1执行"
  ssh "$REMOTE_USER@$MASTER1_IP" "kubectl get nodes | grep -E 'master|control-plane'" \
    || error "Master节点状态验证失败"
  success "所有Master节点加入完成"
}
# ========================== Step 5: join the worker nodes (optional) ==========================
#######################################
# Join every host in NODE_IPS to the cluster as a worker, then list all nodes
# from Master1. Does nothing except log when NODE_IPS is empty.
# Globals:
#   NODE_IPS, REMOTE_USER, MASTER1_IP, NODE_JOIN_CMD (read)
# Exits (via error) when a node lacks the base components, a join fails, or
# the final node listing fails.
#######################################
join_nodes() {
  local node_count=${#NODE_IPS[@]}
  if ((node_count == 0)); then
    info "未指定Node节点跳过Node加入步骤"
    return
  fi

  step "Step 5远程Node节点加入集群${node_count}个)"

  # Remote pre-flight: kubeadm and kubelet on PATH, containerd service active.
  local precheck="command -v kubeadm &>/dev/null && command -v kubelet &>/dev/null && systemctl is-active --quiet containerd"
  local target
  for node_ip in "${NODE_IPS[@]}"; do
    target="${REMOTE_USER}@${node_ip}"
    info "处理Node节点${node_ip}"

    ssh "$target" "$precheck" \
      || error "Node ${node_ip}未安装基础组件需先安装kubeadm、kubelet、containerd"

    info "${node_ip}执行join命令${NODE_JOIN_CMD}"
    if ! ssh "$target" "$NODE_JOIN_CMD"; then
      error "Node ${node_ip}加入失败"
    fi
    success "Node ${node_ip}加入集群完成"
  done

  # Show the resulting node list from Master1.
  info "验证所有节点状态从Master1执行"
  if ! ssh "${REMOTE_USER}@${MASTER1_IP}" "kubectl get nodes"; then
    error "节点状态验证失败"
  fi
  success "所有Node节点加入完成"
}
# ========================== Step 6: deploy the Calico network plugin (v3.25 + Huawei Cloud SWR) ==========================
#######################################
# Download the Calico manifest, point its images at the SWR mirror, upload it
# to Master1, apply it, and wait for the Calico pods and the nodes to come up.
# Globals:
#   CALICO_YAML_OFFICIAL, LOCAL_CALICO_YAML, SWR_CALICO_PREFIX,
#   REMOTE_USER, MASTER1_IP (read)
# Exits (via error) on download, rewrite, upload, apply, or readiness failure.
#######################################
deploy_calico() {
  step "Step 6部署Calico v3.25华为云SWR镜像"

  # 1) Download the manifest to the local scratch path.
  info "下载Calico v3.25官方YAML$CALICO_YAML_OFFICIAL"
  if command -v wget &>/dev/null; then
    wget -q -O "$LOCAL_CALICO_YAML" "$CALICO_YAML_OFFICIAL" || error "wget下载Calico YAML失败"
  elif command -v curl &>/dev/null; then
    # -f: fail on HTTP errors instead of saving the error page;
    # -L: follow redirects, as wget does by default.
    curl -fsSL -o "$LOCAL_CALICO_YAML" "$CALICO_YAML_OFFICIAL" || error "curl下载Calico YAML失败"
  else
    error "请先安装wget或curl"
  fi
  # -s: an empty file is as useless as a missing one.
  [[ -s "$LOCAL_CALICO_YAML" ]] || error "Calico YAML下载失败文件不存在"
  success "Calico YAML下载完成本地路径$LOCAL_CALICO_YAML"

  # 2) Rewrite every docker.io/calico image reference to the SWR mirror.
  #    One rule covers cni, node and kube-controllers whatever their tag.
  #    SWR_CALICO_PREFIX must not contain '#' or '&' (sed delimiter/metachar).
  info "替换镜像地址为华为云SWR$SWR_CALICO_PREFIX"
  sed -i "s#\(image:[[:space:]]*\)docker\.io/calico/#\1${SWR_CALICO_PREFIX}/#g" "$LOCAL_CALICO_YAML" \
    || error "Calico YAML中未找到镜像配置替换失败"

  # Verify the rewrite actually happened: no original reference may remain and
  # at least one mirrored reference must exist.
  info "验证镜像替换结果查看前10行含镜像的内容"
  if grep -Eq 'image:[[:space:]]*docker\.io/calico/' "$LOCAL_CALICO_YAML" \
    || ! grep -qF -- "${SWR_CALICO_PREFIX}/" "$LOCAL_CALICO_YAML"; then
    error "Calico YAML中未找到镜像配置替换失败"
  fi
  # Display only; `|| true` keeps a SIGPIPE from head from failing the script.
  grep -n "image:" "$LOCAL_CALICO_YAML" | head -n 10 || true
  success "Calico镜像地址替换完成"

  # 3) Upload the rewritten manifest to Master1.
  local remote_calico_yaml="/opt/k8s-install-conf/calico-v3.25-swr.yaml"
  info "上传修改后的YAML到Master1$remote_calico_yaml"
  scp -o StrictHostKeyChecking=no "$LOCAL_CALICO_YAML" "$REMOTE_USER@$MASTER1_IP:$remote_calico_yaml" \
    || error "YAML上传失败"
  success "YAML上传完成"

  # 4) Apply it on Master1.
  info "执行部署命令kubectl apply -f $remote_calico_yaml"
  ssh "$REMOTE_USER@$MASTER1_IP" "kubectl apply -f $remote_calico_yaml" || error "Calico部署失败"

  # 5) Poll until all Calico pods are Running, for at most 5 minutes.
  #    The calico.yaml manifest installs into kube-system (calico-system is
  #    only used by the operator-based install), so the pods are selected
  #    there by their k8s-app label.
  info "等待Calico Pod启动最多5分钟..."
  local calico_ns="kube-system"
  local calico_selector="k8s-app in (calico-node,calico-kube-controllers)"
  local all_running_re='^(Running )*Running$'
  local wait_time=0
  local pod_status=""
  while true; do
    # A failed ssh/kubectl call counts as "not ready yet" rather than aborting
    # the script through `set -e`.
    pod_status=$(ssh "$REMOTE_USER@$MASTER1_IP" \
      "kubectl get pods -n $calico_ns -l '$calico_selector' -o jsonpath='{.items[*].status.phase}' 2>/dev/null") \
      || pod_status=""
    if [[ "$pod_status" =~ $all_running_re ]]; then
      break
    fi
    if [[ $wait_time -ge 300 ]]; then # 5 minutes
      error "Calico Pod启动超时5分钟请手动检查kubectl get pods -n $calico_ns"
    fi
    sleep 10
    wait_time=$((wait_time + 10))
    info "已等待${wait_time}Calico Pod状态$pod_status"
  done

  # 6) Show the Calico pods and require every node to report Ready.
  info "验证Calico状态从Master1执行"
  ssh "$REMOTE_USER@$MASTER1_IP" "kubectl get pods -n $calico_ns -l '$calico_selector'" \
    || error "Calico状态验证失败"
  # `kubectl wait` fails on NotReady nodes; grepping for 'STATUS|Ready' would
  # always match the header line and also match "NotReady".
  ssh "$REMOTE_USER@$MASTER1_IP" "kubectl wait --for=condition=Ready nodes --all --timeout=120s" \
    || error "节点网络未就绪"
  ssh "$REMOTE_USER@$MASTER1_IP" "kubectl get nodes -o wide" || true
  success "Calico v3.25华为云SWR部署完成集群网络已就绪"

  # 7) Remove the local scratch copy (best effort).
  info "清理本地临时文件:$LOCAL_CALICO_YAML"
  rm -f -- "$LOCAL_CALICO_YAML" || info "本地文件清理失败,可手动删除"
}
# ========================== Main flow: run all steps in order ==========================
#######################################
# Parse the arguments, verify key-based SSH access to every node, run the six
# deployment steps in order, and print a summary.
# Globals:
#   MASTER_IPS, NODE_IPS, MASTER1_IP, REMOTE_USER, INIT_RESULT_FILE (read)
# Arguments:
#   the script's command-line arguments ("$@")
#######################################
main() {
  # 1. Parse arguments.
  parse_args "$@"

  # 2. Pre-flight: key-based SSH must work for every node.
  step "前置检查SSH免密登录验证"
  local ip
  # ${arr[@]+...} keeps an empty NODE_IPS from tripping `set -u` on bash < 4.4.
  local -a all_ips=("${MASTER_IPS[@]}" ${NODE_IPS[@]+"${NODE_IPS[@]}"})
  for ip in "${all_ips[@]}"; do
    info "验证SSH免密登录$REMOTE_USER@$ip"
    # BatchMode=yes makes ssh fail instead of prompting for a password, so the
    # check really tests passwordless login; ConnectTimeout bounds dead hosts.
    if ! ssh -o BatchMode=yes -o ConnectTimeout=10 -o StrictHostKeyChecking=no \
      "$REMOTE_USER@$ip" "echo 'SSH连通正常' >/dev/null"; then
      error "SSH免密登录失败${ip}(请先执行 ssh-copy-id $REMOTE_USER@$ip 配置免密)"
    fi
  done
  success "所有节点SSH免密登录验证通过"

  # 3. Core steps.
  init_master1              # kubeadm init on Master1
  config_master1_kubeconfig # kubeconfig on Master1
  extract_join_commands     # derive the join commands
  join_other_masters        # Master2 / Master3 join
  join_nodes                # worker nodes join (optional)
  deploy_calico             # Calico v3.25 via SWR images

  # 4. Summary.
  step "集群部署完成总结"
  echo -e "${GREEN}========================================"
  echo -e "K8s集群部署全流程完成"
  echo -e "关键信息:"
  echo -e " 1. Calico版本v3.25镜像源华为云SWR"
  echo -e " 2. 集群节点状态ssh $REMOTE_USER@$MASTER1_IP 'kubectl get nodes'"
  # The calico.yaml manifest installs its pods into kube-system.
  echo -e " 3. Calico状态ssh $REMOTE_USER@$MASTER1_IP 'kubectl get pods -n kube-system -l k8s-app=calico-node'"
  echo -e " 4. 初始化结果:$INIT_RESULT_FILE"
  # Reset the colour so the terminal does not stay green afterwards.
  echo -e "========================================${NC}"
}
# Entry point.
main "$@"

# Usage notes (run these from a shell; they are not part of the script).
# The chmod line used to be an executable statement here, which ran after the
# deployment and failed under `set -e` whenever the script was not located in
# the current directory.
#   chmod +x k8s-cluster-deploy.sh
#   # Deploy 3 masters + 2 nodes (replace with your own IPs):
#   bash k8s-cluster-deploy.sh --master-ips 192.168.61.10,192.168.61.11,192.168.61.12 --node-ips 192.168.61.20,192.168.61.21