NFS + Keepalived High-Availability Deployment

1. Prerequisites

# Run on all nodes
yum install -y nfs-utils keepalived rsync rpcbind inotify-tools
# Configure passwordless SSH between every pair of nodes (see the sketch below)
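
The rsync steps in the notify and sync scripts later in this guide log in as root over SSH, so every node needs key-based root access to its peers. A minimal sketch, assuming default key locations and the three node IPs used throughout this guide:

# Run on every node (skip ssh-keygen if a key already exists)
ssh-keygen -t rsa -N "" -f /root/.ssh/id_rsa
for peer in 192.168.189.155 192.168.189.163 192.168.189.164; do
    ssh-copy-id root@$peer
done
# The notify scripts also rsync against the VIP itself; pre-accept its host key
ssh-keyscan 192.168.189.156 >> /root/.ssh/known_hosts 2>/dev/null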

2. NFS Service Configuration

Identical on all nodes.

# 1. Create the shared directory and configure the NFS export rule
mkdir -p /data/nfs
cat > /etc/exports << EOF
/data/nfs 192.168.189.0/24(rw,sync,no_root_squash,no_all_squash,fsid=0,anonuid=0,anongid=0)
EOF
# 2. Start the NFS services and enable them at boot
systemctl enable --now rpcbind nfs-server
# 3. Reload the export table
exportfs -rv
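
To confirm the export is active, each node can query itself:

showmount -e localhost   # should list /data/nfs 192.168.189.0/24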

3. Keepalived Configuration

1. Master node (192.168.189.155)

vi /etc/keepalived/keepalived.conf
global_defs {
    router_id NFS_MASTER_155
    script_user root
    enable_script_security
}

# NFS health-check script
vrrp_script chk_nfs {
    script "/usr/local/bin/chk_nfs.sh"
    interval 2
    weight -50
    fall 2
    rise 2
}

vrrp_instance VI_NFS {
    state BACKUP
    interface ens3  # replace with the actual NIC name
    virtual_router_id 51
    priority 150
    advert_int 1
    nopreempt
    preempt_delay 300  # note: ignored while nopreempt is set
    garp_master_delay 1
    # Unicast peering instead of multicast; helps prevent split-brain
    unicast_src_ip 192.168.189.155
    unicast_peer {
        192.168.189.163
        192.168.189.164
    }

    authentication {
        auth_type PASS
        auth_pass NFS_HA_2026_Prod  # note: keepalived only uses the first 8 characters
    }

    virtual_ipaddress {
        192.168.189.156/24 dev ens3 noprefixroute  # the VIP
    }

    track_script {
        chk_nfs
    }

    # State-transition notify scripts
    notify_master "/usr/local/bin/vip_master.sh"
    notify_backup "/usr/local/bin/vip_backup.sh"
    notify_fault "/usr/local/bin/vip_fault.sh"
}

2. Backup node 1 (192.168.189.163)

vi /etc/keepalived/keepalived.conf
global_defs {
    router_id NFS_BACKUP_163
    script_user root
    enable_script_security
}

vrrp_script chk_nfs {
    script "/usr/local/bin/chk_nfs.sh"
    interval 2
    weight -50
    fall 2
    rise 2
}

vrrp_instance VI_NFS {
    state BACKUP
    interface ens3
    virtual_router_id 51
    priority 120
    advert_int 1
    nopreempt
    preempt_delay 300
    garp_master_delay 1
    # Unicast peering instead of multicast; helps prevent split-brain
    unicast_src_ip 192.168.189.163
    unicast_peer {
        192.168.189.155
        192.168.189.164
    }

    authentication {
        auth_type PASS
        auth_pass NFS_HA_2026_Prod
    }

    virtual_ipaddress {
        192.168.189.156/24 dev ens3 noprefixroute  # the VIP
    }

    track_script {
        chk_nfs
    }

    notify_master "/usr/local/bin/vip_master.sh"
    notify_backup "/usr/local/bin/vip_backup.sh"
    notify_fault "/usr/local/bin/vip_fault.sh"
}

3. Backup node 2 (192.168.189.164)

vi /etc/keepalived/keepalived.conf
global_defs {
    router_id NFS_BACKUP_164
    script_user root
    enable_script_security
}

vrrp_script chk_nfs {
    script "/usr/local/bin/chk_nfs.sh"
    interval 2
    weight -50
    fall 2
    rise 2
}

vrrp_instance VI_NFS {
    state BACKUP
    interface ens3
    virtual_router_id 51
    priority 100
    advert_int 1
    nopreempt
    preempt_delay 300
    garp_master_delay 1
    # Unicast peering instead of multicast; helps prevent split-brain
    unicast_src_ip 192.168.189.164
    unicast_peer {
        192.168.189.155
        192.168.189.163
    }

    authentication {
        auth_type PASS
        auth_pass NFS_HA_2026_Prod
    }

    virtual_ipaddress {
        192.168.189.156/24 dev ens3 noprefixroute  # the VIP
    }

    track_script {
        chk_nfs
    }

    notify_master "/usr/local/bin/vip_master.sh"
    notify_backup "/usr/local/bin/vip_backup.sh"
    notify_fault "/usr/local/bin/vip_fault.sh"
}
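
Recent keepalived releases can syntax-check a configuration before loading it; the -t/--config-test flag is assumed to be available here — on older builds, restart the service and inspect the journal instead:

keepalived -t -f /etc/keepalived/keepalived.conf && echo "config OK"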

4. Core Scripts

Identical on all nodes.

1. NFS health-check script (/usr/local/bin/chk_nfs.sh)

#!/bin/bash
# Check that the NFS service is running
systemctl is-active --quiet nfs-server || exit 1
# Check that the shared directory is writable
touch /data/nfs/.nfs_health_check 2>/dev/null || exit 1
rm -f /data/nfs/.nfs_health_check
# If this node holds the VIP, check that the VIP is reachable
if /usr/sbin/ip addr | grep -q 192.168.189.156; then
    ping -c 1 -W 1 192.168.189.156 >/dev/null 2>&1 || exit 1
fi
exit 0
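
The check can be exercised by hand; exit code 0 means healthy, anything else makes keepalived apply the -50 weight penalty:

/usr/local/bin/chk_nfs.sh; echo "exit code: $?"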

2. Master notify script (/usr/local/bin/vip_master.sh)

#!/bin/bash
LOG_FILE="/var/log/nfs_ha.log"
VIP="192.168.189.156"
DEV="ens3"
NFS_DIR="/data/nfs"

log() {
    echo "$(date +'%Y-%m-%d %H:%M:%S') [MASTER] - $1" >> "$LOG_FILE"
}

log "============ Transitioning to MASTER ============"

# Stop any running sync first to avoid conflicts
pkill -f "nfs_inotify_sync.sh" >/dev/null 2>&1
sleep 2
log "Step 1: pull the latest data from the current VIP so stale local data cannot win"
rsync -avz --delete --exclude ".nfs_*" root@$VIP:$NFS_DIR/ $NFS_DIR/ >> "$LOG_FILE" 2>&1

# Refresh ARP caches so clients learn the new MAC behind the VIP
arping -c 3 -I $DEV $VIP >/dev/null 2>&1
log "ARP refresh done"

# Restart NFS
systemctl restart nfs-server
systemctl is-active --quiet nfs-server && log "NFS OK" || log "NFS DOWN!"

# Start real-time sync
nohup /usr/local/bin/nfs_inotify_sync.sh >> /var/log/nfs_inotify.log 2>&1 &
log "inotify sync started"

log "Master transition complete"

3. Backup notify script (/usr/local/bin/vip_backup.sh)

#!/bin/bash
# Backup-state notify script (key rule: pull only, never push, so stale data cannot overwrite the master)
LOG_FILE="/var/log/nfs_ha.log"
VIP="192.168.189.156"
NFS_DIR="/data/nfs"

# Logging helper
log() {
    echo "$(date +'%Y-%m-%d %H:%M:%S') [BACKUP] - $1" >> "$LOG_FILE"
}

log "============ Transitioning to BACKUP ============"

# 1. Force-kill all sync processes (critical: a leftover inotify pusher on a backup would push stale data)
pkill -9 -f "nfs_inotify_sync.sh" >/dev/null 2>&1
pkill -9 -f "inotifywait" >/dev/null 2>&1
log "Force-stopped all inotify sync processes"

# 2. Wait 2 seconds so the processes exit completely
sleep 2

# 3. Pull the latest data from the VIP (excluding NFS temp files to avoid sync errors)
log "Pulling latest data from VIP ($VIP) to overwrite stale local data"
rsync -avz --delete \
  --exclude ".nfs_*" \
  --exclude ".nfs_health_check" \
  --timeout=30 \
  root@$VIP:$NFS_DIR/ $NFS_DIR/ >> "$LOG_FILE" 2>&1

# 4. Check the sync result
if [ $? -eq 0 ]; then
    log "✅ Sync from VIP succeeded; local data is now current"
else
    log "❌ Sync from VIP failed! Check network / passwordless SSH / VIP state"
    # Optional: send an alert (DingTalk / email)
    # curl -s -X POST <your DingTalk webhook URL> -d '{"msgtype":"text","text":{"content":"NFS backup sync failed, VIP: 192.168.189.156"}}'
fi

log "Backup transition complete"
exit 0

4. Fault notify script (/usr/local/bin/vip_fault.sh)

#!/bin/bash
LOG_FILE="/var/log/nfs_ha.log"
VIP="192.168.189.156"

log() {
    echo "$(date +'%Y-%m-%d %H:%M:%S') [FAULT] - $1" >> $LOG_FILE
}

log "本机故障,VIP $VIP 已漂移!!!"

5. Inotify real-time sync script (/usr/local/bin/nfs_inotify_sync.sh)

#!/bin/bash
# Core configuration
VIP="192.168.189.156"
NFS_DIR="/data/nfs"
NODE_LIST=("192.168.189.155" "192.168.189.163" "192.168.189.164")  # all node IPs
LOG_FILE="/var/log/nfs_inotify.log"
RETRY_COUNT=3  # retries per failed sync
RETRY_INTERVAL=2  # retry interval (seconds)
SYNC_TIMEOUT=10  # rsync timeout (seconds)

# Logging helper
log() {
    local LEVEL=$1
    local MSG=$2
    echo "$(date +'%Y-%m-%d %H:%M:%S') [$LEVEL] - $MSG" >> "$LOG_FILE"
}

# Verify this node holds the VIP (only the VIP holder may sync)
check_vip() {
    if ! ip addr | grep -q "$VIP"; then
        log "ERROR" "This node does not hold the VIP ($VIP); exiting sync process"
        exit 0
    fi
}

# Sync the whole tree to one target node, with retries
single_sync() {
    local TARGET_NODE=$1
    local FILE=$2
    local RETRY=0

    while [ $RETRY -lt $RETRY_COUNT ]; do
        rsync -avz --delete \
          --exclude ".nfs_*" \
          --times \
          --timeout=$SYNC_TIMEOUT \
          $NFS_DIR/ root@$TARGET_NODE:$NFS_DIR/ >> "$LOG_FILE" 2>&1

        if [ $? -eq 0 ]; then
            log "INFO" "Change to $FILE synced to $TARGET_NODE"
            return 0
        else
            RETRY=$((RETRY + 1))
            log "WARN" "Sync to $TARGET_NODE after change to $FILE failed; retry #$RETRY"
            sleep $RETRY_INTERVAL
        fi
    done

    log "ERROR" "Sync to $TARGET_NODE failed after $RETRY_COUNT retries (trigger: $FILE)"
    return 1
}

# Core sync loop (push data to the other nodes)
sync_data() {
    local CURRENT_IP=$(hostname -I | awk '{print $1}')
    # Re-check the VIP (it may have moved since the script started)
    check_vip
    log "INFO" "This node ($CURRENT_IP) holds the VIP; watching for changes and syncing to peers"

    # Watch for file changes (create/delete/modify/move)
    inotifywait -mrq --format '%w%f' -e create,delete,modify,move $NFS_DIR | while read FILE; do
        # Skip NFS temp files
        if [[ $FILE == *".nfs_"* ]]; then
            continue
        fi

        log "INFO" "Change detected: $FILE"
        # Push to every node except ourselves
        for node in "${NODE_LIST[@]}"; do
            if [ "$node" != "$CURRENT_IP" ]; then
                single_sync "$node" "$FILE"
            fi
        done

        # After each sync round, re-check the VIP (critical: stop pushing the moment the VIP moves)
        if ! ip addr | grep -q "$VIP"; then
            log "ERROR" "VIP has moved; this node is no longer master, exiting sync process"
            exit 0
        fi
    done
}

# Entry point (with pre-flight checks)
main() {
    # Make sure the NFS directory exists
    if [ ! -d $NFS_DIR ]; then
        log "ERROR" "NFS directory ($NFS_DIR) does not exist; exiting"
        exit 1
    fi

    # Make sure inotifywait is available
    if ! which inotifywait >/dev/null 2>&1; then
        log "ERROR" "inotifywait not found; please install inotify-tools"
        exit 1
    fi

    # Start syncing
    check_vip
    sync_data
}

# Run the main flow
main
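
Because the script exits unless the node holds the VIP, a manual smoke test is safe on any node; on a backup it should return immediately:

/usr/local/bin/nfs_inotify_sync.sh
tail -n 1 /var/log/nfs_inotify.log   # on a backup: "does not hold the VIP ... exiting"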

5. Make the Scripts Executable

chmod +x /usr/local/bin/chk_nfs.sh /usr/local/bin/vip_*.sh /usr/local/bin/nfs_inotify_sync.sh
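
Since the keepalived configs set enable_script_security, keepalived refuses to run scripts that non-root users can modify; keeping them root-owned with tight permissions avoids that (a suggested hardening step):

chown root:root /usr/local/bin/chk_nfs.sh /usr/local/bin/vip_*.sh /usr/local/bin/nfs_inotify_sync.sh
chmod 700 /usr/local/bin/chk_nfs.sh /usr/local/bin/vip_*.sh /usr/local/bin/nfs_inotify_sync.sh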

6. Service Startup and Verification (run on all nodes)

# 1. Clean up any leftover VIP and processes
pkill -9 keepalived
ip addr del 192.168.189.156/24 dev ens3 2>/dev/null

# 2. Start keepalived and enable it at boot
systemctl enable --now keepalived

# 3. Check the VIP (only the master should show it)
ip addr show ens3 | grep 192.168.189.156

# 4. Verify inotify sync (run on the master)
touch /data/nfs/test_prod.txt
# On a backup node:
ls /data/nfs/test_prod.txt  # the file should be there

# 5. Simulate a master failure (run on the master)
systemctl stop keepalived
# On backup 163, verify the VIP has moved; 163 should now hold it
ip addr show ens3 | grep 192.168.189.156
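
From an NFS client's perspective the failover should be transparent as long as it mounts through the VIP. A quick end-to-end check from any machine in the subnet (/mnt/nfs is an arbitrary mount point chosen for this example):

mkdir -p /mnt/nfs
mount -t nfs 192.168.189.156:/data/nfs /mnt/nfs
touch /mnt/nfs/client_write_test && rm /mnt/nfs/client_write_test && echo "client write OK"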

7. Configuring NFS Storage for K8s

Create the namespace: kubectl create ns shared

Create the StorageClass: kubectl apply -f storageClass.yaml

kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: nfs-storage
  # StorageClass is cluster-scoped, so it takes no namespace
  annotations:
    storageclass.beta.kubernetes.io/is-default-class: 'true'
    storageclass.kubernetes.io/is-default-class: 'true'
  labels:
    environment: test
provisioner: fuseim.pri/ifs   # must match PROVISIONER_NAME in the provisioner Deployment
reclaimPolicy: Retain
volumeBindingMode: Immediate
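
Since the annotations mark it as the cluster default, the class listing should reflect that:

kubectl get storageclass nfs-storage   # the NAME column should show "nfs-storage (default)"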

Create the RBAC objects: kubectl apply -f rbac.yaml

apiVersion: v1
kind: ServiceAccount
metadata:
  name: nfs-client-provisioner
  namespace: shared
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: nfs-client-provisioner-runner
rules:
  - apiGroups: [""]
    resources: ["persistentvolumes"]
    verbs: ["get", "list", "watch", "create", "delete"]
  - apiGroups: [""]
    resources: ["persistentvolumeclaims"]
    verbs: ["get", "list", "watch", "update"]
  - apiGroups: ["storage.k8s.io"]
    resources: ["storageclasses"]
    verbs: ["get", "list", "watch"]
  - apiGroups: [""]
    resources: ["events"]
    verbs: ["create", "update", "patch"]
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: run-nfs-client-provisioner
subjects:
  - kind: ServiceAccount
    name: nfs-client-provisioner
    namespace: shared
roleRef:
  kind: ClusterRole
  name: nfs-client-provisioner-runner
  apiGroup: rbac.authorization.k8s.io
---
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: leader-locking-nfs-client-provisioner
  namespace: shared
rules:
  - apiGroups: [""]
    resources: ["endpoints"]
    verbs: ["get", "list", "watch", "create", "update", "patch"]
---
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: leader-locking-nfs-client-provisioner
  namespace: shared
subjects:
  - kind: ServiceAccount
    name: nfs-client-provisioner
    namespace: shared
roleRef:
  kind: Role
  name: leader-locking-nfs-client-provisioner
  apiGroup: rbac.authorization.k8s.io
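
A quick impersonation check confirms the ClusterRoleBinding took effect (kubectl auth can-i supports acting as a service account):

kubectl auth can-i create persistentvolumes \
  --as=system:serviceaccount:shared:nfs-client-provisioner   # expect: yes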

Create the provisioner: kubectl apply -f provisioner.yaml

kind: Deployment
apiVersion: apps/v1
metadata:
  name: nfs-client-provisioner
  labels:
    app: nfs-client-provisioner
  namespace: shared
spec:
  replicas: 3
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: nfs-client-provisioner
  template:
    metadata:
      labels:
        app: nfs-client-provisioner
    spec:
      serviceAccountName: nfs-client-provisioner
      containers:
        - name: nfs-client-provisioner
          image: docker.1ms.run/eipwork/nfs-subdir-external-provisioner:v4.0.2
          imagePullPolicy: IfNotPresent
          volumeMounts:
            - name: nfs-client-root
              mountPath: /persistentvolumes
          env:
            - name: PROVISIONER_NAME
              value: fuseim.pri/ifs   # must match the provisioner field of the StorageClass
            - name: NFS_SERVER
              value: 192.168.189.156   # the NFS VIP
            - name: NFS_PATH
              value: /data/nfs        # the exported directory on the NFS servers
      volumes:
        - name: nfs-client-root   # must match the volumeMounts name above
          nfs:
            server: 192.168.189.156   # the NFS VIP
            path: /data/nfs          # the exported directory

kubectl get po -A | grep shared   # the three provisioner replicas should be Running
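
To confirm dynamic provisioning end to end, create a throwaway claim against the class; the PVC name test-nfs-pvc below is an arbitrary example, and storageClassName may be omitted since nfs-storage is the default:

kubectl apply -f - << 'EOF'
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: test-nfs-pvc
  namespace: shared
spec:
  accessModes: ["ReadWriteMany"]
  storageClassName: nfs-storage
  resources:
    requests:
      storage: 1Gi
EOF
# The claim should go Bound, and a matching subdirectory should appear under /data/nfs
kubectl get pvc -n shared test-nfs-pvc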
