Kubernetes 1.23.17 离线部署(shell)
1.部署k8s准备
1.1 服务器信息
| IP地址 | 系统 | 内核 | 系统配置 | 部署方案 | 数据基础目录 | 备注 | 
| 172.16.4.85 | CentOS7.8 | 5.4.278-1.el7.elrepo.x86_64 | 8核16G/100G | master1 | /data/ | 目前先只做一主两从,85主,86、87从 | 
| 172.16.4.89 | CentOS7.8 | 5.4.278-1.el7.elrepo.x86_64 | 8核16G/100G | master2 | /data/ | 缺少 | 
| 172.16.4.86 | CentOS7.8 | 5.4.278-1.el7.elrepo.x86_64 | 8核16G/100G | node1 | /data/ | |
| 172.16.4.87 | CentOS7.8 | 5.4.278-1.el7.elrepo.x86_64 | 8核16G/100G | node2 | /data/ | |
| 172.16.4.88 | CentOS7.8 | 5.4.278-1.el7.elrepo.x86_64 | 8核16G/100G | node3 | /data/ | 缺少 | 
1.2 所有离线包压缩文件(所有k8s节点都要上传)
- 由于环境因素,此次部署均为离线部署,下边压缩包就包含了所有离线安装包(工具包、docker服务、内核rpm、k8s镜像、kubeadm工具等)
k8s12317_centos78_20250115_all.tar.gz
1.3 解压后的目录路径
/data/k8s12317_centos78_20250115_all
- 目录 k8s12317_centos78_20250115_all 下的文件
[root@harbor k8s12317_centos78_20250115_all]# ll -htr
总用量 783M
-rw-r--r-- 1 root root 168M 1月   1 01:37 k8s_tools_package_centos7.8.tar.gz   #k8s依赖工具包
-rw-r--r-- 1 root root  61M 1月   1 01:37 docker-20.10.9.tgz                   #docker服务
-rw-r--r-- 1 root root 420M 1月   3 00:54 k8s_1.23.17_images.tar.gz            #k8s1.23.17版本的镜像
-rwxr-xr-x 1 root root  812 1月   3 00:54 save_images.sh      
-rw-r--r-- 1 root root  67M 1月   3 00:54 kubeadmin_1.23.17.tar.gz             #kubeadmin工具包
-rwxr-xr-x 1 root root 6.4K 1月   3 01:06 init_server.sh                       #初始化脚本
-rw-r--r-- 1 root root  69M 1月  15 11:55 kernel-5.4.278-1.tar.gz              #内核5.4.278包
-rwxr-xr-x 1 root root 1.8K 1月  15 14:56 kernel_upgrade.sh                    #内核升级脚本
-rwxr-xr-x 1 root root 3.1K 1月  15 15:49 k8s_tools.sh                         #k8s依赖工具包安装脚本
2.系统初始化(所有k8s节点都要执行)
2.1 初始化系统(init_server.sh,包含安装k8s所需工具脚本 k8s_tools.sh )
- 用来初始化系统,安装docker、安装依赖的tools等
- 脚本有一处需要修改,定义主机名和IP的对应关系,需要手动修改IP地址和主机名,所有节点都添加
 declare -A nodes
 nodes=(
 ["172.16.4.85"]="master1"
 ["172.16.4.86"]="node1"
 ["172.16.4.87"]="node2"
 )
init_server.sh
 [root@localhost k8s12317_centos78_20250115_all]# cat init_server.sh 
#!/bin/bash
# Logging helper: print one line "<timestamp> [<level>] <message>" to stdout.
# Arguments: $1 - level label (e.g. INFO, ERROR); $2 - message text.
log_message() {
    local level=$1 text=$2
    printf '%s [%s] %s\n' "$(date "+%Y-%m-%d %H:%M:%S")" "$level" "$text"
}
# Mapping of node IP address -> hostname. Edit this table to match the
# cluster before running the script; the same table is used on every node.
declare -A nodes=(
    ["172.16.4.85"]="master1"
    ["172.16.4.86"]="node1"
    ["172.16.4.87"]="node2"
)
# NTP server used for time synchronisation. The value can be overridden from
# the environment (e.g. NTP_SERVER=10.0.0.1 ./init_server.sh), which is needed
# on isolated networks where the public default below is not reachable.
NTP_SERVER="${NTP_SERVER:-ntp.ntsc.ac.cn}"
# Print the first address listed by `hostname -I`.
get_current_ip() {
    local first_addr _others
    # read splits on whitespace: the first field is kept, the rest discarded.
    read -r first_addr _others <<< "$(hostname -I)"
    echo "$first_addr"
}
# Set this machine's hostname from the `nodes` table.
# Globals: nodes (read) - IP -> hostname map
# Calls:   get_current_ip, hostnamectl, log_message
# Exits:   1 when the current IP is empty or not listed in `nodes`, or when
#          hostnamectl fails.
configure_hostname() {
    # Declared separately from the assignment so `local` does not mask the
    # status of get_current_ip.
    local current_ip
    current_ip=$(get_current_ip)
    # An empty key is not a valid associative-array subscript, so only index
    # into `nodes` when an address was actually detected.
    local hostname=""
    if [ -n "$current_ip" ]; then
        hostname="${nodes[$current_ip]}"
    fi
    if [ -z "$hostname" ]; then
        log_message "ERROR" "Unknown IP $current_ip"
        exit 1
    fi
    # Explicit if/else: with `a && b || c` a failing log call would also run
    # the error branch.
    if hostnamectl set-hostname "$hostname"; then
        log_message "INFO" "Hostname set to $hostname"
    else
        log_message "ERROR" "Failed to set hostname"
        exit 1
    fi
}
# Make sure every "<ip>   <hostname>" pair from `nodes` is present in the
# hosts file. Safe to re-run: a pair that is already listed is not appended
# again.
# Globals: nodes (read); HOSTS_FILE (read, optional) - file to edit,
#          defaults to /etc/hosts
# Calls:   log_message
configure_hosts() {
    local hosts_file="${HOSTS_FILE:-/etc/hosts}"
    local ip name
    for ip in "${!nodes[@]}"; do
        name="${nodes[$ip]}"
        # Look for a line whose first field is the IP and that lists the
        # hostname among the remaining fields.
        if awk -v ip="$ip" -v name="$name" '
                $1 == ip { for (i = 2; i <= NF; i++) if ($i == name) found = 1 }
                END { exit !found }' "$hosts_file" 2>/dev/null; then
            log_message "INFO" "$name already present in $hosts_file"
            continue
        fi
        echo "$ip   $name" >> "$hosts_file"
        log_message "INFO" "Added $name to $hosts_file"
    done
}
# Turn SELinux off: `setenforce 0` for the running system, then rewrite
# SELINUX=enforcing / SELINUX=permissive to SELINUX=disabled in the config file.
disable_selinux() {
    local selinux_cfg=/etc/selinux/config
    setenforce 0
    sed -i \
        -e 's/^SELINUX=enforcing$/SELINUX=disabled/' \
        -e 's/^SELINUX=permissive$/SELINUX=disabled/' \
        "$selinux_cfg"
    log_message "INFO" "SELinux disabled."
}
# Stop firewalld now and disable it so it is not started again at boot.
disable_firewall() {
    local action
    for action in stop disable; do
        systemctl "$action" firewalld
    done
    log_message "INFO" "Firewall disabled."
}
# Turn swap off for the running system and comment out the swap entries in
# fstab so it stays off after a reboot.
# Globals: FSTAB_FILE (read, optional) - file to edit, defaults to /etc/fstab
# Calls:   swapoff, sed, log_message
disable_swap() {
    local fstab="${FSTAB_FILE:-/etc/fstab}"
    swapoff -a
    # Only touch lines that are not already comments and that carry "swap" as
    # a whitespace-delimited field; this keeps the edit idempotent (no "##..."
    # pile-up on re-runs) and leaves unrelated lines that merely mention the
    # word alone.
    sed -r -i '/^[[:space:]]*#/!s/^.*[[:space:]]swap([[:space:]].*)?$/#&/' "$fstab"
    log_message "INFO" "Swap disabled."
}
# Load br_netfilter and apply the bridge / forwarding sysctl settings written
# to /etc/sysctl.d/kubernetes.conf.
# Calls: modprobe, sysctl, log_message
configure_kernel() {
    modprobe br_netfilter
    # Persist the module through modules-load.d instead of appending a
    # `modprobe` line to /etc/profile: the profile approach added a duplicate
    # line on every run and only took effect when somebody logged in.
    echo "br_netfilter" > /etc/modules-load.d/br_netfilter.conf
    cat > /etc/sysctl.d/kubernetes.conf << EOF
net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-call-iptables = 1
net.ipv4.ip_forward = 1
EOF
    # Re-read all sysctl configuration files so the new values apply now.
    sysctl --system
    log_message "INFO" "Kernel parameters configured."
}
# Install the offline tool packages by running `k8s_tools.sh i` from the
# current directory.
# Calls: log_message
# Exits: 1 when k8s_tools.sh is missing or returns a non-zero status.
install_offline_tools() {
    if [[ ! -f "k8s_tools.sh" ]]; then
        log_message "ERROR" "k8s_tools.sh file not found!"
        exit 1
    fi
    # k8s_tools.sh is a bash script (arrays, [[ ]]), so run it with bash
    # rather than sh. Explicit if/else instead of `a && b || c`, which would
    # also run the error branch if the success log call failed.
    if bash k8s_tools.sh i; then
        log_message "INFO" "Offline tools installed."
    else
        log_message "ERROR" "Failed to install offline tools"
        exit 1
    fi
}
# Load the IPVS kernel modules and register them for loading at boot.
enable_ipvs() {
    # ip_vs core, ip_vs_ftp (FTP support), then the rr (round-robin),
    # wrr (weighted round-robin) and sh (hashing) schedulers.
    local -a ipvs_mods=(ip_vs ip_vs_ftp ip_vs_rr ip_vs_wrr ip_vs_sh)
    local mod
    for mod in "${ipvs_mods[@]}"; do
        modprobe "$mod"
    done
    # One module name per line in modules-load.d so they are loaded on boot.
    printf '%s\n' "${ipvs_mods[@]}" > /etc/modules-load.d/ipvs.conf
    log_message "INFO" "IPVS modules loaded."
}
# Install a cron entry that runs ntpdate against $NTP_SERVER every 10 minutes,
# then synchronise the clock once immediately.
# Globals: NTP_SERVER (read)
# Calls:   crontab, /usr/sbin/ntpdate, log_message
# Exits:   1 when the immediate ntpdate run fails.
configure_time_sync() {
    local cron_job="*/10 * * * * /usr/sbin/ntpdate $NTP_SERVER"
    local current_tab
    # stderr is dropped because `crontab -l` complains when the user has no
    # crontab yet; that case is simply treated as an empty table.
    current_tab=$(crontab -l 2>/dev/null)
    # -q: only test for presence; without it the matching line was echoed to
    # stdout on every re-run.
    if ! grep -qF -- "$cron_job" <<< "$current_tab"; then
        {
            if [ -n "$current_tab" ]; then
                printf '%s\n' "$current_tab"
            fi
            printf '%s\n' "$cron_job"
        } | crontab -
    fi
    log_message "INFO" "Time synchronization cron job added."
    # Run ntpdate once right away.
    if /usr/sbin/ntpdate "$NTP_SERVER"; then
        log_message "INFO" "Time synchronized to $NTP_SERVER."
    else
        log_message "ERROR" "Failed to synchronize time"
        exit 1
    fi
}
# Install the Docker static binaries from docker-20.10.9.tgz (expected in the
# current directory), write /etc/docker/daemon.json and a systemd unit, then
# start and enable the service.
# Calls: log_message, tar, systemctl
# Exits: 1 when the archive is missing, extraction or copying fails, the
#        service cannot be (re)started, or `docker` is not on PATH afterwards.
install_docker() {
    if [[ ! -f "docker-20.10.9.tgz" ]]; then
        log_message "ERROR" "Docker installation package not found!"
        exit 1
    fi
    # Unpack the archive (creates ./docker) and move the binaries to /usr/bin.
    # The extracted ./docker directory itself is left in place.
    if ! tar zxf docker-20.10.9.tgz || ! mv ./docker/* /usr/bin/; then
        log_message "ERROR" "Failed to extract or install Docker binaries."
        exit 1
    fi
    log_message "INFO" "Docker files extracted and moved."
    # Daemon configuration. -p so an existing /etc/docker (e.g. on a re-run)
    # is not an error.
    # NOTE(review): "graph" is the legacy spelling of "data-root" — confirm it
    # is still accepted before moving to a newer Docker release.
    mkdir -p /etc/docker
    cat > /etc/docker/daemon.json <<EOF
{
    "exec-opts": ["native.cgroupdriver=systemd"],
    "graph": "/data/docker_storage",
    "log-driver": "json-file",
    "log-opts": {
        "max-size": "100m"
    },
    "storage-driver": "overlay2",
    "storage-opts": [
        "overlay2.override_kernel_check=true"
    ],
    "insecure-registries" : ["172.16.4.17:8090", "152.136.254.160:8090"],
    "registry-mirrors": [
        "https://dockerpull.com",
        "https://dockerproxy.cn",
        "https://docker.anyhub.us.kg",
        "https://dockerhub.jobcher.com",
        "https://dockerhub.icu",
        "https://docker.hpcloud.cloud",
        "https://docker.1panel.live"
    ],
    "live-restore": true
}
EOF
    # systemd unit for dockerd. \$MAINPID is escaped so the literal text
    # "$MAINPID" ends up in the unit file instead of being expanded here.
    cat <<EOF > /lib/systemd/system/docker.service
[Unit]
Description=Docker Application Container Engine
Documentation=https://docs.docker.com
After=network-online.target firewalld.service
Wants=network-online.target
[Service]
Type=notify
EnvironmentFile=-/etc/sysconfig/docker
EnvironmentFile=-/etc/sysconfig/docker-storage
EnvironmentFile=-/etc/sysconfig/docker-network
Environment=GOTRACEBACK=crash
ExecStart=/usr/bin/dockerd -H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock
ExecReload=/bin/kill -s HUP \$MAINPID
LimitNOFILE=1048576
LimitNPROC=1048576
LimitCORE=infinity
# set delegate yes so that systemd does not reset the cgroups of docker containers
Delegate=yes
# kill only the docker process, not all processes in the cgroup
KillMode=process
[Install]
WantedBy=multi-user.target
EOF
    systemctl daemon-reload
    # A daemon that does not come up is a hard failure; previously this was
    # ignored and the script still reported success.
    if ! systemctl restart docker; then
        log_message "ERROR" "Failed to start Docker service."
        exit 1
    fi
    systemctl enable docker
    log_message "INFO" "Docker installed and configured."
    # Final sanity check: the docker client must be resolvable on PATH.
    if command -v docker &>/dev/null; then
        log_message "INFO" "Docker installed successfully."
    else
        log_message "ERROR" "Docker installation failed."
        exit 1
    fi
}
# Entry point: run every setup step in order. Each step is one of the
# functions defined above in this script.
main() {
    log_message "INFO" "Starting the setup process..."
    local step
    for step in \
        configure_hostname \
        configure_hosts \
        disable_selinux \
        disable_firewall \
        disable_swap \
        configure_kernel \
        install_offline_tools \
        enable_ipvs \
        configure_time_sync \
        install_docker; do
        "$step"
    done
    log_message "INFO" "Setup completed successfully."
}
# Run the entry point.
main
2.2 工具安装脚本
- init_server.sh已经包含执行,此处只做展示,此脚本用来安装k8s所需工具
k8s_tools.sh
 [root@node2 k8s12317_centos78_20250115_all]# cat k8s_tools.sh 
#!/bin/bash
# Append one "<timestamp> [<level>] <message>" line to the file named by
# $LOG_FILE.
# Arguments: $1 - level label; $2 - message text.
log_message() {
    local level="$1" text="$2" stamp
    stamp=$(date +'%Y-%m-%d %H:%M:%S')
    printf '%s [%s] %s\n' "$stamp" "$level" "$text" >> "$LOG_FILE"
}
# Directory that holds the downloaded / extracted RPMs, and the log file all
# log_message output goes to. Both keep their previous defaults but can now be
# overridden from the environment when the bundle lives somewhere other than
# /data/k8s12317_centos78_20250115_all.
DOWNLOAD_DIR="${DOWNLOAD_DIR:-/data/k8s12317_centos78_20250115_all/package}"
LOG_FILE="${LOG_FILE:-$DOWNLOAD_DIR/download_log.txt}"
# Make sure the directory exists before the log file is created in it.
mkdir -p "$DOWNLOAD_DIR"
# Start each run with an empty log file.
: > "$LOG_FILE"
# Packages to download / install. The order is unchanged from the original
# one-per-line list; entries are only grouped onto fewer lines.
PACKAGES=(
    yum-utils device-mapper-persistent-data lvm2
    wget net-tools nfs-utils lrzsz
    gcc gcc-c++ make cmake
    libxml2-devel openssl-devel curl curl-devel
    unzip ntp libaio-devel vim ncurses-devel
    autoconf automake zlib-devel python-devel
    epel-release openssh-server
    socat ipvsadm conntrack ntpdate
    nc telnet tcpdump pciutils iptraf
)
# Print the command-line help to stdout and terminate with status 1.
usage() {
    cat <<EOF
Usage: $0 {d|i}
  d: Download packages only
  i: Install packages only
EOF
    exit 1
}
# Validate the first argument: only "d" (download) and "i" (install) are
# accepted; anything else is logged and answered with the usage text.
case "$1" in
    d|i)
        ;;
    *)
        log_message "ERROR" "Invalid argument: $1. Only 'd' or 'i' are allowed."
        usage
        ;;
esac
# Download every package in PACKAGES (with dependencies) into its own
# sub-directory of DOWNLOAD_DIR, without installing anything.
# Globals: PACKAGES, DOWNLOAD_DIR, LOG_FILE (read)
# Calls:   curl, yum, log_message
# Returns: 0 when every package was downloaded, 1 when at least one failed.
download_packages() {
    local package package_dir failed=0
    log_message "INFO" "Download Alibaba Cloud YUM repository"
    # -f: do not save an HTTP error page as the repo file; a failure is
    # logged, and yum then runs with whatever repositories are configured.
    if ! curl -fsSL -o /etc/yum.repos.d/CentOS-Base.repo https://mirrors.aliyun.com/repo/Centos-7.repo; then
        log_message "ERROR" "Failed to download the Alibaba Cloud YUM repository file"
    fi
    log_message "INFO" "Starting package download at $(date)"
    for package in "${PACKAGES[@]}"; do
        package_dir="$DOWNLOAD_DIR/$package"
        mkdir -p "$package_dir"
        log_message "INFO" "Downloading $package to $package_dir..."
        # -y is required: yum's output goes to the log file, so its
        # confirmation prompt would be invisible and the script would hang
        # waiting for input. With --downloadonly nothing is installed.
        if yum install -y --downloadonly --downloaddir="$package_dir" "$package" &>> "$LOG_FILE"; then
            log_message "INFO" "Successfully downloaded: $package"
        else
            log_message "ERROR" "Failed to download: $package"
            failed=1
        fi
    done
    log_message "INFO" "Download process completed at $(date)"
    return "$failed"
}
# Extract the offline tool bundle and install every RPM found under
# DOWNLOAD_DIR with `yum localinstall`.
# Globals: DOWNLOAD_DIR, LOG_FILE (read)
# Calls:   tar, find, yum, log_message
# Returns: 0 on success; 1 when DOWNLOAD_DIR is missing, no RPM is found, or
#          yum fails. This becomes the script's exit status, so a caller that
#          tests it no longer sees success after a failed install.
install_packages() {
    local -a rpm_files=()
    local rpm rpm_list rc=0
    log_message "INFO" "Starting package installation at $(date)"

    # Unpack the bundle from the current directory. A failure is logged but
    # not fatal by itself: the checks below decide whether there is anything
    # to install (e.g. the bundle was already extracted earlier).
    if ! tar zxf k8s_tools_package_centos7.8.tar.gz; then
        log_message "ERROR" "Failed to extract k8s_tools_package_centos7.8.tar.gz"
    fi
    if [ ! -d "$DOWNLOAD_DIR" ]; then
        log_message "ERROR" "Download directory $DOWNLOAD_DIR does not exist. Skipping installation."
        return 1
    fi

    # Collect the RPM paths one per line into an array so paths containing
    # spaces stay intact. A here-string is used instead of process
    # substitution because the script may be started through `sh`.
    rpm_list=$(find "$DOWNLOAD_DIR" -type f -name "*.rpm")
    while IFS= read -r rpm; do
        if [ -n "$rpm" ]; then
            rpm_files+=("$rpm")
        fi
    done <<< "$rpm_list"
    if [ "${#rpm_files[@]}" -eq 0 ]; then
        log_message "ERROR" "No RPM files found under $DOWNLOAD_DIR."
        return 1
    fi

    log_message "INFO" "Installing RPM packages using yum localinstall"
    if yum localinstall -y "${rpm_files[@]}" &>> "$LOG_FILE"; then
        log_message "INFO" "Successfully installed RPM packages."
    else
        log_message "ERROR" "Failed to install RPM packages."
        rc=1
    fi

    log_message "INFO" "Installation process completed at $(date)"
    return "$rc"
}
# Run the action selected by the first argument: d = download, i = install.
case "$1" in
    d) download_packages ;;
    i) install_packages ;;
esac
2.3 内核升级
- 内核升级脚本,升级到5.4.278(长期支持版本)
kernel_upgrade.sh
 [root@harbor k8s12317_centos78_20250115_all]# cat kernel_upgrade.sh 
#!/bin/bash
# Logging helper: print "<timestamp> [<level>] - <message>" to stdout.
# Arguments: $1 - level label; $2 - message text.
log() {
    local level=$1 text=$2
    printf '%s [%s] - %s\n' "$(date "+%Y-%m-%d %H:%M:%S")" "$level" "$text"
}
# Run a command given as a single string and log the outcome.
# Arguments: $1 - command line; it is expanded unquoted, so it is word-split
#                 and any glob pattern in it is expanded before execution
#            $2 - message logged at INFO level when the command succeeds
#            $3 - message logged at ERROR level when it fails
# Exits:     1 when the command returns a non-zero status.
run_command() {
    local cmdline=$1
    local ok_text=$2
    local fail_text=$3

    if ! $cmdline; then
        log "ERROR" "$fail_text"
        exit 1
    fi
    log "INFO" "$ok_text"
}
# The script must be run as root.
if [ "$(id -u)" -ne 0 ]; then
    log "ERROR" "请以 root 用户运行此脚本。"
    exit 1
fi
# Remove the currently installed kernel tool packages.
run_command "yum remove -y kernel-tools kernel-headers kernel-tools-libs" "移除现有内核工具包..." "移除内核工具包失败。"
# Extract the kernel package archive; it must be in the current directory.
KERNEL_TAR="kernel-5.4.278-1.tar.gz"
if [ ! -f "$KERNEL_TAR" ]; then
    log "ERROR" "内核包 $KERNEL_TAR 不存在,请确保内核包在当前目录。"
    exit 1
fi
run_command "tar -zxf $KERNEL_TAR" "解压 $KERNEL_TAR ..." "解压内核包失败。"
# Enter the directory created by the extraction.
cd kernel || { log "ERROR" "无法进入内核源代码目录"; exit 1; }
# Install the kernel RPMs. The glob stays inside the string on purpose:
# run_command expands its command argument unquoted, so kernel-lt-*.rpm is
# matched against the files in the current directory at that point.
run_command "yum localinstall -y kernel-lt-*.rpm" "安装内核 RPM 包..." "安装内核 RPM 包失败。"
# Make the first menu entry the default, then regenerate the GRUB
# configuration. The location of grub.cfg depends on the firmware type:
# /sys/firmware/efi only exists when the system was booted through UEFI.
run_command "grub2-set-default 0" "设置默认启动内核..." "设置默认启动内核失败。"
if [ -d /sys/firmware/efi ]; then
    GRUB_CFG="/boot/efi/EFI/centos/grub.cfg"
else
    GRUB_CFG="/boot/grub2/grub.cfg"
fi
run_command "grub2-mkconfig -o $GRUB_CFG" "更新 GRUB 配置..." "更新 GRUB 配置失败。"
# Ask whether to reboot now. -r keeps backslashes in the answer literal.
read -r -p "内核安装完成,是否重启系统?(y/n): " REBOOT
if [[ "$REBOOT" =~ ^[Yy]$ ]]; then
    log "INFO" "正在重启系统..."
    reboot
else
    log "INFO" "重启操作已取消。"
fi
3.安装k8s组件(所有k8s节点都要执行)
- kubeadmin_1.23.17.tar.gz #k8s组件包
- kubeadm:是一个工具,它可以初始化集群、引导新的节点加入集群等。
- kubelet:是运行在集群中所有节点上的代理。它确保容器都在运行状态。
- kubectl:是 Kubernetes 的命令行工具。可以使用它来管理 Kubernetes 集群。
- kubeadm 和 kubelet 每个节点上都安装,而 kubectl 通常只安装在你打算执行管理命令的机器上。
#到此目录
cd /data/k8s12317_centos78_20250115_all
#解压
tar zxf kubeadmin_1.23.17.tar.gz
#进行安装
cd kubeadmin_1.23.17
#目录下rpm包
[root@master1 kubeadm_1.23.17]# ll -htr
total 67M
-rw-r--r-- 1 root root  24K Aug 11  2017 libnetfilter_queue-1.0.2-2.el7_2.x86_64.rpm
-rw-r--r-- 1 root root 290K Aug 11  2017 socat-1.7.3.2-2.el7.x86_64.rpm
-rw-r--r-- 1 root root 187K Apr  4  2020 conntrack-tools-1.4.4-7.el7.x86_64.rpm
-rw-r--r-- 1 root root  18K Apr  4  2020 libnetfilter_cthelper-1.0.0-11.el7.x86_64.rpm
-rw-r--r-- 1 root root  18K Apr  4  2020 libnetfilter_cttimeout-1.0.0-7.el7.x86_64.rpm
-rw-r--r-- 1 root root  17M Jan 19  2023 0f2a2afd740d476ad77c508847bad1f559afc2425816c1f2ce4432a62dfe0b9d-kubernetes-cni-1.2.0-0.x86_64.rpm
-rw-r--r-- 1 root root 9.4M Mar  1  2023 52c389a4598f61bdf251c5ebcdf2475c32254e3ea85027e204ccc0356e7a1be1-kubeadm-1.23.17-0.x86_64.rpm
-rw-r--r-- 1 root root 9.9M Mar  1  2023 cb2ed23fb25cc5b2f73ffc665c3b71e87bce012ec4cf7750e2246f1b48afd34e-kubectl-1.23.17-0.x86_64.rpm
-rw-r--r-- 1 root root  22M Mar  1  2023 552c4d4494c1de798baf4b52c0a27a3e9a63740900a76f30997210edbfcb7d99-kubelet-1.23.17-0.x86_64.rpm
-rw-r--r-- 1 root root 8.6M Sep 18  2023 3f5ba2b53701ac9102ea7c7ab2ca6616a8cd5966591a77577585fde1c434ef74-cri-tools-1.26.0-0.x86_64.rpm
#批量安装rpm,并自动处理依赖关系
yum localinstall ./*.rpm
#设置kubelet 为自启动
systemctl enable kubelet
4. 初始化k8s
- 方法一需要将离线镜像根据master和node需求导入所有节点,方法二则只需要导入master节点,在初始化的时候会自动从harbor私有仓库拉取
4.1 方法一
4.1.1 将镜像离线包导入(所有k8s节点,可以根据master或者node节点导入对应的镜像,如果分不清,就全部导入所有节点)
#解压
tar zxf k8s_1.23.17_images.tar.gz
#到目录中
cd k8s_1.23.17_images
#查看镜像包
[root@harbor k8s_1.23.17_images]# ls etcd-3.5.6-0.tar pause-3.6.tar registry.aliyuncs.com_google_containers_* 
etcd-3.5.6-0.tar                                                     registry.aliyuncs.com_google_containers_kube-controller-manager:v1.23.17.tar
pause-3.6.tar                                                        registry.aliyuncs.com_google_containers_kube-proxy:v1.23.17.tar
registry.aliyuncs.com_google_containers_coredns:v1.8.6.tar           registry.aliyuncs.com_google_containers_kube-scheduler:v1.23.17.tar             
registry.aliyuncs.com_google_containers_kube-apiserver:v1.23.17.tar
#批量load tar包到docker中,成为镜像
ls *.tar | xargs -I {} docker load -i {}
4.1.2 初始化(master节点)
kubeadm init \
  --apiserver-advertise-address=172.16.4.85 \
  --image-repository registry.aliyuncs.com/google_containers \
  --kubernetes-version v1.23.17 \
  --service-cidr=10.96.0.0/12 \
  --pod-network-cidr=10.244.0.0/16
4.2 方法二(只在master节点上执行,自动会从harbor私有仓库拉取对应镜像)
4.2.1 将下载镜像后,推送到私有仓库如:harbor,kubeadm初始化时会自动拉取镜像(我目前使用这种方法)
- 离线下载镜像并推送到私有仓库harbor,参考这篇:
https://www.cnblogs.com/Leonardo-li/p/18644699
4.2.2 初始化命令参数解释
1. --apiserver-advertise-address=172.16.4.85
作用:指定 Kubernetes 控制平面的 API Server 在网络中的可达地址。
用途:此地址是其他节点加入集群时所使用的地址,通常填写主节点的 IP。
建议:填写主节点的实际内网 IP,确保其他节点可以通过该地址访问。
2. --image-repository=172.16.4.177:8090/k8s12317/registry.aliyuncs.com/google_containers
作用:指定 Kubernetes 组件所需镜像的仓库地址。
用途:如果默认的官方仓库(k8s.gcr.io)不可用,使用此参数从指定的私有镜像仓库拉取镜像。
示例:
172.16.4.177:8090 是你的私有镜像仓库地址。
k8s12317/registry.aliyuncs.com/google_containers 是镜像存储的路径前缀。
3. --kubernetes-version=v1.23.17
作用:指定要安装的 Kubernetes 控制平面组件的版本。
用途:确保集群的版本符合预期,避免安装错误版本。
注意:
如果未指定,kubeadm 会尝试安装当前支持的最新版本。
确保此版本的镜像已经存在于私有镜像仓库中。
4. --service-cidr=10.96.0.0/12
作用:为 Kubernetes 服务分配一个虚拟 IP 地址段。
用途:该地址段用于分配 ClusterIP 类型的服务,服务内部的通信将通过这些虚拟 IP 进行。
默认值:10.96.0.0/12,可根据需要修改。
注意:
确保此地址段不会与集群中其他子网冲突。
地址段一旦确定,后续无法轻易修改。
5. --pod-network-cidr=10.244.0.0/16
作用:指定 Pod 网络的 CIDR(Classless Inter-Domain Routing)。
用途:配置 Pod 的 IP 地址分配范围,通常与所使用的网络插件相关。
示例:
如果使用 Flannel 插件,推荐使用 10.244.0.0/16。
如果使用其他插件(如 Calico),需查看插件的文档选择合适的 CIDR。
注意:确保该 CIDR 不与服务 CIDR 或主机网络 IP 段冲突。
6. 其他可能参数(未出现在命令中)
--control-plane-endpoint:
如果有负载均衡器或多主节点集群,指定访问控制平面的入口地址。
示例:--control-plane-endpoint=my-load-balancer.example.com:6443
--upload-certs:
将证书加密后分发到其他控制平面节点,适用于高可用集群。
--token:
手动设置一个用于加入集群的令牌。
命令执行后的输出
成功执行后,会生成以下信息:
集群初始化成功的确认消息。
kubeadm join 命令:供其他节点加入集群时使用,包括 token 和 ca-cert 信息。
4.2.3 初始化
kubeadm init \
  --apiserver-advertise-address=172.16.4.85 \
  --image-repository 172.16.4.177:8090/k8s12317/registry.aliyuncs.com/google_containers \
  --kubernetes-version v1.23.17 \
  --service-cidr=10.96.0.0/12 \
  --pod-network-cidr=10.244.0.0/16
4.2.4 初始化完成打印
kubeadm init
 [root@master1 k8s1231_centos78_20241231_all]# kubeadm init   --apiserver-advertise-address=172.16.4.85   --image-repository 172.16.4.17:8090/k8s12317/registry.aliyuncs.com/google_containers   --kubernetes-version v1.23.17   --service-cidr=10.96.0.0/12   --pod-network-cidr=10.244.0.0/16
[init] Using Kubernetes version: v1.23.17
[preflight] Running pre-flight checks
[preflight] Pulling images required for setting up a Kubernetes cluster
[preflight] This might take a minute or two, depending on the speed of your internet connection
[preflight] You can also perform this action in beforehand using 'kubeadm config images pull'
[certs] Using certificateDir folder "/etc/kubernetes/pki"
[certs] Generating "ca" certificate and key
[certs] Generating "apiserver" certificate and key
[certs] apiserver serving cert is signed for DNS names [kubernetes kubernetes.default kubernetes.default.svc kubernetes.default.svc.cluster.local master1] and IPs [10.96.0.1 172.16.4.85]
[certs] Generating "apiserver-kubelet-client" certificate and key
[certs] Generating "front-proxy-ca" certificate and key
[certs] Generating "front-proxy-client" certificate and key
[certs] Generating "etcd/ca" certificate and key
[certs] Generating "etcd/server" certificate and key
[certs] etcd/server serving cert is signed for DNS names [localhost master1] and IPs [172.16.4.85 127.0.0.1 ::1]
[certs] Generating "etcd/peer" certificate and key
[certs] etcd/peer serving cert is signed for DNS names [localhost master1] and IPs [172.16.4.85 127.0.0.1 ::1]
[certs] Generating "etcd/healthcheck-client" certificate and key
[certs] Generating "apiserver-etcd-client" certificate and key
[certs] Generating "sa" key and public key
[kubeconfig] Using kubeconfig folder "/etc/kubernetes"
[kubeconfig] Writing "admin.conf" kubeconfig file
[kubeconfig] Writing "kubelet.conf" kubeconfig file
[kubeconfig] Writing "controller-manager.conf" kubeconfig file
[kubeconfig] Writing "scheduler.conf" kubeconfig file
[kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env"
[kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml"
[kubelet-start] Starting the kubelet
[control-plane] Using manifest folder "/etc/kubernetes/manifests"
[control-plane] Creating static Pod manifest for "kube-apiserver"
[control-plane] Creating static Pod manifest for "kube-controller-manager"
[control-plane] Creating static Pod manifest for "kube-scheduler"
[etcd] Creating static Pod manifest for local etcd in "/etc/kubernetes/manifests"
[wait-control-plane] Waiting for the kubelet to boot up the control plane as static Pods from directory "/etc/kubernetes/manifests". This can take up to 4m0s
[apiclient] All control plane components are healthy after 6.502433 seconds
[upload-config] Storing the configuration used in ConfigMap "kubeadm-config" in the "kube-system" Namespace
[kubelet] Creating a ConfigMap "kubelet-config-1.23" in namespace kube-system with the configuration for the kubelets in the cluster
NOTE: The "kubelet-config-1.23" naming of the kubelet ConfigMap is deprecated. Once the UnversionedKubeletConfigMap feature gate graduates to Beta the default name will become just "kubelet-config". Kubeadm upgrade will handle this transition transparently.
[upload-certs] Skipping phase. Please see --upload-certs
[mark-control-plane] Marking the node master1 as control-plane by adding the labels: [node-role.kubernetes.io/master(deprecated) node-role.kubernetes.io/control-plane node.kubernetes.io/exclude-from-external-load-balancers]
[mark-control-plane] Marking the node master1 as control-plane by adding the taints [node-role.kubernetes.io/master:NoSchedule]
[bootstrap-token] Using token: 3x666d.rw9pi9y8hzrouwyl
[bootstrap-token] Configuring bootstrap tokens, cluster-info ConfigMap, RBAC Roles
[bootstrap-token] configured RBAC rules to allow Node Bootstrap tokens to get nodes
[bootstrap-token] configured RBAC rules to allow Node Bootstrap tokens to post CSRs in order for nodes to get long term certificate credentials
[bootstrap-token] configured RBAC rules to allow the csrapprover controller automatically approve CSRs from a Node Bootstrap Token
[bootstrap-token] configured RBAC rules to allow certificate rotation for all node client certificates in the cluster
[bootstrap-token] Creating the "cluster-info" ConfigMap in the "kube-public" namespace
[kubelet-finalize] Updating "/etc/kubernetes/kubelet.conf" to point to a rotatable kubelet client certificate and key
[addons] Applied essential addon: CoreDNS
[addons] Applied essential addon: kube-proxy
Your Kubernetes control-plane has initialized successfully!
To start using your cluster, you need to run the following as a regular user:
  mkdir -p $HOME/.kube
  sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
  sudo chown $(id -u):$(id -g) $HOME/.kube/config
Alternatively, if you are the root user, you can run:
  export KUBECONFIG=/etc/kubernetes/admin.conf
You should now deploy a pod network to the cluster.
Run "kubectl apply -f [podnetwork].yaml" with one of the options listed at:
  https://kubernetes.io/docs/concepts/cluster-administration/addons/
Then you can join any number of worker nodes by running the following on each as root:
kubeadm join 172.16.4.85:6443 --token 3x666d.rw9pi9y8hzrouwyl \
	--discovery-token-ca-cert-hash sha256:4fd5a431609cbe13041d9b80a845dcb40150c8427266fdd17602d16ed11cdd61
4.2.5 可以看到已经将私有仓库的镜像pull下来了
[root@master1 ~]# docker images 
REPOSITORY                                                                                  TAG        IMAGE ID       CREATED         SIZE
172.16.4.177:8090/k8s12317/registry.aliyuncs.com/google_containers/kube-apiserver            v1.23.17   62bc5d8258d6   23 months ago   130MB
172.16.4.177:8090/k8s12317/registry.aliyuncs.com/google_containers/kube-controller-manager   v1.23.17   1dab4fc7b6e0   23 months ago   120MB
172.16.4.177:8090/k8s12317/registry.aliyuncs.com/google_containers/kube-scheduler            v1.23.17   bc6794cb54ac   23 months ago   51.9MB
172.16.4.177:8090/k8s12317/registry.aliyuncs.com/google_containers/kube-proxy                v1.23.17   f21c8d21558c   23 months ago   111MB
172.16.4.177:8090/k8s12317/registry.aliyuncs.com/google_containers/etcd                      3.5.6-0    fce326961ae2   2 years ago     299MB
172.16.4.177:8090/k8s12317/registry.aliyuncs.com/google_containers/coredns                   v1.8.6     a4ca41631cc7   3 years ago     46.8MB
172.16.4.177:8090/k8s12317/registry.aliyuncs.com/google_containers/pause                     3.6        6270bb605e12   3 years ago     683kB
4.2.6 etcd和pause版本导致的kubeadm init初始化错误
- 最开始下载的etcd:3.5.1-0、pause:3.7版本的镜像,在初始化的时候报错,如下:
[root@master1 kubeadmin_1.23.17]# kubeadm init   --apiserver-advertise-address=172.16.4.85   --image-repository 172.16.4.177:8090/k8s12317/registry.aliyuncs.com/google_containers   --kubernetes-version v1.23.17   --service-cidr=10.96.0.0/12   --pod-network-cidr=10.244.0.0/16
[init] Using Kubernetes version: v1.23.17
[preflight] Running pre-flight checks
[preflight] Pulling images required for setting up a Kubernetes cluster
[preflight] This might take a minute or two, depending on the speed of your internet connection
[preflight] You can also perform this action in beforehand using 'kubeadm config images pull'
error execution phase preflight: [preflight] Some fatal errors occurred:
	[ERROR ImagePull]: failed to pull image 172.16.4.17:8090/k8s12317/registry.aliyuncs.com/google_containers/pause:3.6: output: Error response from daemon: unknown: artifact k8s12317/registry.aliyuncs.com/google_containers/pause:3.6 not found
, error: exit status 1
	[ERROR ImagePull]: failed to pull image 172.16.4.17:8090/k8s12317/registry.aliyuncs.com/google_containers/etcd:3.5.6-0: output: Error response from daemon: unknown: artifact k8s12317/registry.aliyuncs.com/google_containers/etcd:3.5.6-0 not found
, error: exit status 1
[preflight] If you know what you are doing, you can make a check non-fatal with --ignore-preflight-errors=...
To see the stack trace of this error execute with --v=5 or higher
- 解决方法:从错误信息可以看出,kubeadm init 时在私有仓库中没有找到 pause:3.6 和 etcd:3.5.6-0 版本的镜像,而我的harbor中有pause:3.7 和 etcd:3.5.1-0镜像,显然这两个版本与1.23.17版本的其他k8s镜像不匹配,需要下载pause:3.6 和 etcd:3.5.6-0 版本的镜像,所以,又重新下载了离线镜像,参考:https://www.cnblogs.com/Leonardo-li/p/18644699
4.2.7 拷贝k8s认证文件
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
4.2.8 查看k8s节点(验证)
- 由于网络插件还没有部署,还没有准备就绪 NotReady
[root@master1 ~]# kubectl get node
NAME      STATUS     ROLES                  AGE   VERSION
master1   NotReady   control-plane,master   86m   v1.23.17
5.往集群里添加node节点(所有node节点执行)
5.1 查看token有效期(有效期24小时),以及重新创建token
[root@master1 ~]# kubeadm token list
TOKEN                     TTL         EXPIRES                USAGES                   DESCRIPTION                                                EXTRA GROUPS
3x666d.rw9pi9y8hzrouwyl   22h         2025-01-16T10:09:32Z   authentication,signing   The default bootstrap token generated by 'kubeadm init'.   system:bootstrappers:kubeadm:default-node-token
[root@master1 ~]# kubeadm token create --print-join-command
kubeadm join 172.16.4.85:6443 --token h9g5rn.y07uajj3d9r3v5hh     --discovery-token-ca-cert-hash sha256:cfb734386ee0d27d4864900648c3eaf0e2f84b1e9f98d04b483ad9e702653c9e
5.2 向集群添加新节点
5.2.1 执行在 kubeadm 初始化时输出的"kubeadm join ..."命令
- node1节点
[root@node1 data]# kubeadm join 172.16.4.85:6443 --token 3x666d.rw9pi9y8hzrouwyl \
> --discovery-token-ca-cert-hash sha256:4fd5a431609cbe13041d9b80a845dcb40150c8427266fdd17602d16ed11cdd61
[preflight] Running pre-flight checks
[preflight] Reading configuration from the cluster...
[preflight] FYI: You can look at this config file with 'kubectl -n kube-system get cm kubeadm-config -o yaml'
[kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml"
[kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env"
[kubelet-start] Starting the kubelet
[kubelet-start] Waiting for the kubelet to perform the TLS Bootstrap...
This node has joined the cluster:
* Certificate signing request was sent to apiserver and a response was received.
* The Kubelet was informed of the new secure connection details.
Run 'kubectl get nodes' on the control-plane to see this node join the cluster.
- node2节点
[root@node2 data]# kubeadm join 172.16.4.85:6443 --token 3x666d.rw9pi9y8hzrouwyl \
> --discovery-token-ca-cert-hash sha256:4fd5a431609cbe13041d9b80a845dcb40150c8427266fdd17602d16ed11cdd61
[preflight] Running pre-flight checks
[preflight] Reading configuration from the cluster...
[preflight] FYI: You can look at this config file with 'kubectl -n kube-system get cm kubeadm-config -o yaml'
[kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml"
[kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env"
[kubelet-start] Starting the kubelet
[kubelet-start] Waiting for the kubelet to perform the TLS Bootstrap...
This node has joined the cluster:
* Certificate signing request was sent to apiserver and a response was received.
* The Kubelet was informed of the new secure connection details.
Run 'kubectl get nodes' on the control-plane to see this node join the cluster.
- node3节点。。。
5.2.2 查看节点信息
- 由于网络插件还没有部署,还没有准备就绪 NotReady
[root@master1 ~]# kubectl get node
NAME      STATUS     ROLES                  AGE     VERSION
master1   NotReady   control-plane,master   122m    v1.23.17
node1     NotReady   <none>                 4m57s   v1.23.17
node2     NotReady   <none>                 84s     v1.23.17
6.网络插件Calico
- 
根据 Calico 的发布记录, pod2daemon-flexvol组件在 Calico 3.24.0 版本中被移除。 该版本的发布说明提到,随着 Kubernetes 对 FlexVolume 插件的逐步弃用,Calico 相应地移除了对pod2daemon-flexvol的支持。 因此,从 3.24.0 版本开始,Calico 不再包含或使用pod2daemon-flexvol组件。如果正在使用 Calico 3.24.0 或更高版本,且Kubernetes 集群版本较新,默认情况下无需再配置或使用 pod2daemon-flexvol。 如果集群仍依赖于 FlexVolume 插件,建议在升级前仔细评估和测试,以确保网络功能的正常运行。
- 部署Calico 3.25.0版本时,calico.yaml文件中并没有pod2daemon-flexvol的相关信息以及镜像下载要求,我不太确定是否完全弃用,所以还是下载了。
- calico.yaml 文件对比: 3.20.6 VS 3.25.0
#3.20.6
grep image: calico.yaml 
          image: docker.io/calico/cni:v3.20.6
          image: docker.io/calico/cni:v3.20.6
          image: docker.io/calico/pod2daemon-flexvol:v3.20.6
          image: docker.io/calico/node:v3.20.6
          image: docker.io/calico/kube-controllers:v3.20.6
#3.25.0
grep "image:" calico.yaml 
          image: docker.io/calico/cni:v3.25.0
          image: docker.io/calico/cni:v3.25.0
          image: docker.io/calico/node:v3.25.0
          image: docker.io/calico/node:v3.25.0
          image: docker.io/calico/kube-controllers:v3.25.0
6.1 下载Calico.yaml文件(在有网的机器上下载)
wget --no-check-certificate https://docs.projectcalico.org/manifests/calico.yaml
6.2 查看Calico需要哪些版本的镜像
grep "image:" calico.yaml 
          image: docker.io/calico/cni:v3.25.0
          image: docker.io/calico/cni:v3.25.0
          image: docker.io/calico/node:v3.25.0
          image: docker.io/calico/node:v3.25.0
          image: docker.io/calico/kube-controllers:v3.25.0
6.3 下载Calico相关镜像(在有网的机器上,需要下载对应版本的镜像)
docker pull docker.io/calico/node:v3.25.0
docker pull docker.io/calico/cni:v3.25.0
docker pull docker.io/calico/kube-controllers:v3.25.0
docker pull docker.io/calico/pod2daemon-flexvol:v3.25.0
方法一:将下载的镜像打成tar包,传到所有节点,然后导入到docker中(docker load -i *),除了导入镜像方式外,其他操作都一样
[root@harbor k8s_1.23.17_images]# ls calico*
calico_cni:v3.25.0.tar  calico_kube-controllers:v3.25.0.tar  calico_node:v3.25.0.tar  calico_pod2daemon-flexvol:v3.25.0.tar
方法二:将下载的镜像修改tag,推送到私有仓库harbor,参考:https://www.cnblogs.com/Leonardo-li/p/18644699 (我目前使用的这种方法,主要是方便后边部署其他k8s)
- 私有仓库的镜像地址(172.16.4.177:8090/k8s12317,我的私有仓库)
172.16.4.177:8090/k8s12317/docker.io/calico/node:v3.25.0
172.16.4.177:8090/k8s12317/docker.io/calico/cni:v3.25.0
172.16.4.177:8090/k8s12317/docker.io/calico/kube-controllers:v3.25.0
172.16.4.177:8090/k8s12317/docker.io/calico/pod2daemon-flexvol:v3.25.0
6.4 编辑calico.yaml文件(master节点)
- 上传calico.yaml文件到master节点,将文件中pod所在网段更新为kubeadm init时选项--pod-network-cidr所指定的网段
#查看pod网段
[root@master1 ~]# cat /etc/kubernetes/manifests/kube-controller-manager.yaml | grep "cluster-cidr="
    - --cluster-cidr=10.244.0.0/16
- 指定网卡(必须要做,否则会报错)
[root@master1 ~]# vim calico.yaml
# 找到下面的内容进行修改
            # no effect. This should fall within `--cluster-cidr`.
            - name: CALICO_IPV4POOL_CIDR	# 去掉注释
              value: "10.244.0.0/16"		# 去掉注释,更新地址
            # Disable file logging so `kubectl logs` works.
            - name: CALICO_DISABLE_FILE_LOGGING
              value: "true"
# 指定网卡,不然创建pod时会有报错 
			
            # 找到这里
            - name: CLUSTER_TYPE
              value: "k8s,bgp"
            # 在下面添加
            - name: IP_AUTODETECTION_METHOD
              value: "interface=ens192"	# ens192为本地网卡名
6.5 安装Calico(master节点)
- 修改Calico中镜像地址为私有仓库镜像,
grep "image:" calico.yaml 
          image: docker.io/calico/cni:v3.25.0
          image: docker.io/calico/cni:v3.25.0
          image: docker.io/calico/node:v3.25.0
          image: docker.io/calico/node:v3.25.0
          image: docker.io/calico/kube-controllers:v3.25.0
#修改后: 
grep image: calico.yaml 
          image: 172.16.4.177:8090/k8s12317/docker.io/calico/cni:v3.25.0
          image: 172.16.4.177:8090/k8s12317/docker.io/calico/cni:v3.25.0
          image: 172.16.4.177:8090/k8s12317/docker.io/calico/node:v3.25.0
          image: 172.16.4.177:8090/k8s12317/docker.io/calico/node:v3.25.0
          image: 172.16.4.177:8090/k8s12317/docker.io/calico/kube-controllers:v3.25.0
- 部署Calico.yaml
[root@master1 file]# kubectl apply -f calico.yaml
- 查看节点
[root@master1 file]# kubectl get node 
NAME      STATUS   ROLES                  AGE   VERSION
master1   Ready    control-plane,master   17h   v1.23.17
node1     Ready    <none>                 15h   v1.23.17
node2     Ready    <none>                 15h   v1.23.17
- 验证Calico是否正常,Running表示都正常运行
[root@master1 file]# kubectl get pods -n kube-system -o wide | grep calico
calico-kube-controllers-54f6c69876-8zhgr   1/1     Running   0          30m   10.244.166.131   node1     <none>           <none>
calico-node-nv4sb                          1/1     Running   0          30m   172.16.4.87      node2     <none>           <none>
calico-node-qjr6n                          1/1     Running   0          30m   172.16.4.85      master1   <none>           <none>
calico-node-v452x                          1/1     Running   0          30m   172.16.4.86      node1     <none>           <none>
7.验证k8s集群是否正常
验证k8s集群状态
 1. 检查节点状态
使用以下命令检查集群中所有节点的状态,确保所有节点都处于 Ready 状态:
kubectl get nodes
输出示例:
NAME      STATUS   ROLES                  AGE   VERSION
master1   Ready    control-plane,master   18h   v1.23.17
node1     Ready    <none>                 16h   v1.23.17
node2     Ready    <none>                 16h   v1.23.17
Ready:表示节点正常运行。
如果状态是 NotReady 或 SchedulingDisabled,需要检查相关节点日志以找出原因。
2. 检查 Pod 状态
确保所有的关键组件(如 kube-apiserver、kube-controller-manager、kube-scheduler)都在 kube-system 命名空间内运行,并且状态是 Running:
kubectl get pods -n kube-system
输出示例:
[root@master1 file]# kubectl get pods -n kube-system
NAME                                       READY   STATUS    RESTARTS   AGE
calico-kube-controllers-54f6c69876-8zhgr   1/1     Running   0          46m
calico-node-nv4sb                          1/1     Running   0          46m
calico-node-qjr6n                          1/1     Running   0          46m
calico-node-v452x                          1/1     Running   0          46m
coredns-6765c788b6-6fvtd                   1/1     Running   0          18h
coredns-6765c788b6-7fqk9                   1/1     Running   0          18h
etcd-master1                               1/1     Running   0          18h
kube-apiserver-master1                     1/1     Running   0          18h
kube-controller-manager-master1            1/1     Running   0          18h
kube-proxy-m5kkx                           1/1     Running   0          16h
kube-proxy-t5wh5                           1/1     Running   0          16h
kube-proxy-xrv87                           1/1     Running   0          18h
kube-scheduler-master1                     1/1     Running   0          18h
确保关键组件的 Pod 如 kube-apiserver, kube-controller-manager, kube-scheduler 等都处于 Running 状态。
3. 检查核心服务
查看 Kubernetes 集群的核心服务是否正常,例如 API Server:
kubectl cluster-info
输出示例:
[root@master1 file]# kubectl cluster-info
Kubernetes control plane is running at https://172.16.4.85:6443
CoreDNS is running at https://172.16.4.85:6443/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy
如果返回相关服务的地址,表示 Kubernetes 集群的 API Server 和 DNS 服务是正常的。
4. 检查 kube-system 中的 Pod 状态
验证关键服务如 coredns(对应的 Service 名为 kube-dns)、calico-node、kube-proxy 等是否在 kube-system 命名空间正常运行:
kubectl get pods -n kube-system -o wide
输出示例
[root@master1 file]# kubectl get pods -n kube-system -o wide
NAME                                       READY   STATUS    RESTARTS   AGE   IP               NODE      NOMINATED NODE   READINESS GATES
calico-kube-controllers-54f6c69876-8zhgr   1/1     Running   0          48m   10.244.166.131   node1     <none>           <none>
calico-node-nv4sb                          1/1     Running   0          48m   172.16.4.87      node2     <none>           <none>
calico-node-qjr6n                          1/1     Running   0          48m   172.16.4.85      master1   <none>           <none>
calico-node-v452x                          1/1     Running   0          48m   172.16.4.86      node1     <none>           <none>
coredns-6765c788b6-6fvtd                   1/1     Running   0          18h   10.244.166.129   node1     <none>           <none>
coredns-6765c788b6-7fqk9                   1/1     Running   0          18h   10.244.166.130   node1     <none>           <none>
etcd-master1                               1/1     Running   0          18h   172.16.4.85      master1   <none>           <none>
kube-apiserver-master1                     1/1     Running   0          18h   172.16.4.85      master1   <none>           <none>
kube-controller-manager-master1            1/1     Running   0          18h   172.16.4.85      master1   <none>           <none>
kube-proxy-m5kkx                           1/1     Running   0          16h   172.16.4.86      node1     <none>           <none>
kube-proxy-t5wh5                           1/1     Running   0          16h   172.16.4.87      node2     <none>           <none>
kube-proxy-xrv87                           1/1     Running   0          18h   172.16.4.85      master1   <none>           <none>
kube-scheduler-master1                     1/1     Running   0          18h   172.16.4.85      master1   <none>           <none>
如果其中有任何 Pod 不是 Running 状态,说明可能存在问题。
5. 测试 kubectl 配置
尝试通过 kubectl 执行一个简单的命令,确认集群是否响应:
kubectl get pods -n kube-system
输出示例:
[root@master1 file]# kubectl get pods -n kube-system
NAME                                       READY   STATUS    RESTARTS   AGE
calico-kube-controllers-54f6c69876-8zhgr   1/1     Running   0          49m
calico-node-nv4sb                          1/1     Running   0          49m
calico-node-qjr6n                          1/1     Running   0          49m
calico-node-v452x                          1/1     Running   0          49m
coredns-6765c788b6-6fvtd                   1/1     Running   0          18h
coredns-6765c788b6-7fqk9                   1/1     Running   0          18h
etcd-master1                               1/1     Running   0          18h
kube-apiserver-master1                     1/1     Running   0          18h
kube-controller-manager-master1            1/1     Running   0          18h
kube-proxy-m5kkx                           1/1     Running   0          16h
kube-proxy-t5wh5                           1/1     Running   0          16h
kube-proxy-xrv87                           1/1     Running   0          18h
kube-scheduler-master1                     1/1     Running   0          18h
如果能成功列出 Pod,则说明集群控制平面和 kubectl 配置正常。
如果出现权限错误或连接失败,检查 kubeconfig 配置是否正确。
6. 检查 Pod 间通信
创建一个简单的测试 Pod,检查不同节点间的网络通信是否正常:
# nginx-test.yaml
apiVersion: v1
kind: Pod
metadata:
  name: nginx-test
  labels:
    app: nginx
spec:
  containers:
  - name: nginx
    image: nginx:1.21.6
    ports:
    - containerPort: 80
部署测试 Pod:
kubectl apply -f nginx-test.yaml
kubectl get pod nginx-test
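然后通过 kubectl exec 在该 Pod 内访问运行在其他节点上的 Pod 的 IP(可用 kubectl get pods -A -o wide 查看各 Pod 的 IP 和所在节点):
kubectl exec -it nginx-test -- ping <another-pod-ip>
注意:官方 nginx 镜像通常不自带 ping 命令,如果提示找不到该命令,可改用带 ping 的镜像(如 busybox)创建测试 Pod 再执行该测试。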
如果 Pod 间能够相互通信,表示网络正常。
7. 检查节点上的网络插件(如 Calico)
确保网络插件(如 Calico)正常运行,执行以下命令检查节点上的 Calico 状态:
calicoctl node status
如果输出中各 BGP 对等节点(peer)的 STATE 为 up、INFO 为 Established,说明网络插件已正常工作。
8. 验证集群 DNS 服务
测试集群 DNS 服务是否正常工作:
kubectl run -it --rm dns-test --image=busybox --restart=Never -- nslookup kubernetes.default
成功:表示 DNS 服务工作正常。
失败:可能是 DNS 配置或网络插件存在问题。
9. 检查集群的资源使用情况
确保集群资源(如 CPU、内存等)正常,可以使用以下命令查看资源使用情况(kubectl top 依赖 metrics-server,需要先在集群中部署该组件,否则命令会报错):
kubectl top nodes
kubectl top pods
如果出现资源瓶颈,可能需要优化集群资源配置或扩展节点。
10. 验证 kubectl 的访问权限
kubectl config view
确保您能够连接到正确的集群,并且集群权限配置正常。
8.参考文档
https://blog.csdn.net/qq_41210783/article/details/134311364
 
                    
                     
                    
                 
                    
                
 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号