Elasticsearch(ES)选主流程

实例化 Node 之后会依次调用各个模块的 start 方法;其中 discovery 模块的调用链为:startInitialJoin() -> startNewThreadIfNotRunning() -> innerJoinCluster()。

  

 

 

 

    /**
     * the main function of a join thread. This function is guaranteed to join the cluster
     * or spawn a new join thread upon failure to do so.
     *
     * <p>Flow as visible in this method: keep calling {@code findMaster()} until it returns a node
     * or this thread is no longer the active join thread. If the returned node is the local node,
     * wait for incoming joins through {@code nodeJoinController}; otherwise stop the election
     * context, send a join request to the elected node and then validate the master recorded in
     * the current cluster state.
     */
    private void innerJoinCluster() {
        DiscoveryNode masterNode = null;
        final Thread currentThread = Thread.currentThread();
        nodeJoinController.startElectionContext();
        while (masterNode == null && joinThreadControl.joinThreadActive(currentThread)) {// loop until findMaster() yields a (provisional) master or this join thread is no longer active
            masterNode = findMaster();
        }

        if (!joinThreadControl.joinThreadActive(currentThread)) {
            logger.trace("thread is no longer in currentJoinThread. Stopping.");
            return;
        }

        if (transportService.getLocalNode().equals(masterNode)) {// the elected master is the local node
            // we count as one, so one fewer join is required; the result is clamped to 0.
            // NOTE(review): the original note says minimumMasterNodes() is -1 when not configured - confirm
            final int requiredJoins = Math.max(0, electMaster.minimumMasterNodes() - 1); 
        
            logger.debug("elected as master, waiting for incoming joins ([{}] needed)", requiredJoins);
            // wait for other nodes to join this node, bounded by masterElectionWaitForJoinsTimeout.
            // NOTE(review): the original note says the default timeout is 30s and that a new cluster state
            // is published once enough joins arrived - confirm against NodeJoinController
            nodeJoinController.waitToBeElectedAsMaster(requiredJoins, masterElectionWaitForJoinsTimeout,
                    new NodeJoinController.ElectionCallback() {
                        @Override
                        public void onElectedAsMaster(ClusterState state) {
                            synchronized (stateMutex) {
                                joinThreadControl.markThreadAsDone(currentThread);
                            }
                        }

                        @Override
                        public void onFailure(Throwable t) {// on failure, finish this join thread and start a new one
                            logger.trace("failed while waiting for nodes to join, rejoining", t);
                            synchronized (stateMutex) {
                                joinThreadControl.markThreadAsDoneAndStartNew(currentThread);
                            }
                        }
                    }

            );
        } else {
            // process any incoming joins (they will fail because we are not the master)
            nodeJoinController.stopElectionContext(masterNode + " elected");  // stop the election context

            // send a join request to the node elected in this round, i.e. accept it as master
            final boolean success = joinElectedMaster(masterNode);

            synchronized (stateMutex) {
                if (success) {// the join request succeeded
                    DiscoveryNode currentMasterNode = this.clusterState().getNodes().getMasterNode();// re-read the master from the current cluster state
                    if (currentMasterNode == null) {// no master is set in the cluster state: start the join process again
                        logger.debug("no master node is set, despite of join request completing. retrying pings.");
                        joinThreadControl.markThreadAsDoneAndStartNew(currentThread);
                    } else if (currentMasterNode.equals(masterNode) == false) {
// the master in the cluster state differs from the node we just joined: rejoin.
// NOTE(review): the original note attributes this to nodes of a large cluster starting at different times
// and therefore electing different masters - confirm
                        joinThreadControl.stopRunningThreadAndRejoin("master_switched_while_finalizing_join");
                    }

                    joinThreadControl.markThreadAsDone(currentThread);// mark this join thread as done (also reached after either branch above)
                } else {// the join failed
                    // failed to join. Try again...
                    joinThreadControl.markThreadAsDoneAndStartNew(currentThread);
                }
            }
        }
    }




    //选举临时节点
    private DiscoveryNode findMaster() {
        logger.trace("starting to ping");
        List<ZenPing.PingResponse> fullPingResponses = pingAndWait(pingTimeout).toList();//找到除本节点外的其他节点  
        if (fullPingResponses == null) {
            logger.trace("No full ping responses");
            return null;
        }
        if (logger.isTraceEnabled()) {
            StringBuilder sb = new StringBuilder();
            if (fullPingResponses.size() == 0) {
                sb.append(" {none}");
            } else {
                for (ZenPing.PingResponse pingResponse : fullPingResponses) {
                    sb.append("\n\t--> ").append(pingResponse);
                }
            }
            logger.trace("full ping responses:{}", sb);
        }

        final DiscoveryNode localNode = transportService.getLocalNode();

        // add our selves
        assert fullPingResponses.stream().map(ZenPing.PingResponse::node)
            .filter(n -> n.equals(localNode)).findAny().isPresent() == false;

        fullPingResponses.add(new ZenPing.PingResponse(localNode, null, this.clusterState()));

        // filter responses  除去不做主节点选举的节点的ping请求
        final List<ZenPing.PingResponse> pingResponses = filterPingResponses(fullPingResponses, masterElectionIgnoreNonMasters, logger);

        List<DiscoveryNode> activeMasters = new ArrayList<>();// 出本节点外的其他活跃master节点列表
        for (ZenPing.PingResponse pingResponse : pingResponses) {
            // We can't include the local node in pingMasters list, otherwise we may up electing ourselves without
            // any check / verifications from other nodes in ZenDiscover#innerJoinCluster()
        //添加除本节点外的其他活跃master节点
            if (pingResponse.master() != null && !localNode.equals(pingResponse.master())) {
                activeMasters.add(pingResponse.master());
            }
        }

        // nodes discovered during pinging
        List<ElectMasterService.MasterCandidate> masterCandidates = new ArrayList<>();//候选列表 
        for (ZenPing.PingResponse pingResponse : pingResponses) {
            if (pingResponse.node().isMasterNode()) {//添加可以作为master的节点
                masterCandidates.add(new ElectMasterService.MasterCandidate(pingResponse.node(), pingResponse.getClusterStateVersion()));
            }
        }

        if (activeMasters.isEmpty()) {//如果没有活跃的master节点  这里活跃master节点并不包括本地节点   候选的选举节点中包含  默认自己不选自己
            if (electMaster.hasEnoughCandidates(masterCandidates)) { //discovery.zen.minimum_master_nodes 判断是否已经超过最小主节点数
               //选举出获胜节点  流程是先判断版本号  版本号 一致判断 是否本身可以是master节点(文件中配置) 最后是对比id ID 最大的获胜 返回列表返回的第一个为master
            final ElectMasterService.MasterCandidate winner = electMaster.electMaster(masterCandidates);
                logger.trace("candidate {} won election", winner);
                return winner.getNode();
            } else { //没有足够的最小组节点数返回null
                // if we don't have enough master nodes, we bail, because there are not enough master to elect from
                logger.warn("not enough master nodes discovered during pinging (found [{}], but needed [{}]), pinging again",
                            masterCandidates, electMaster.minimumMasterNodes());
                return null;
            }
        } else {
            assert !activeMasters.contains(localNode) : "local node should never be elected as master when other nodes indicate an active master";
            // lets tie break between discovered nodes
            return electMaster.tieBreakActiveMasters(activeMasters);//否则的话 从活跃的主节点中选出
        }
    }



    选举Master算法
        public static int compare(MasterCandidate c1, MasterCandidate c2) {
            // we explicitly swap c1 and c2 here. the code expects "better" is lower in a sorted
            // list, so if c2 has a higher cluster state version, it needs to come first.
            int ret = Long.compare(c2.clusterStateVersion, c1.clusterStateVersion);//版本高的在前
            if (ret == 0) {
                ret = compareNodes(c1.getNode(), c2.getNode());
            }
            return ret;
        }

    /** master nodes go before other nodes, with a secondary sort by id **/
     private static int compareNodes(DiscoveryNode o1, DiscoveryNode o2) {
        if (o1.isMasterNode() && !o2.isMasterNode()) {//可以成为master排前 配置文件控制
            return -1;
        }
        if (!o1.isMasterNode() && o2.isMasterNode()) {
            return 1;
        }
        return o1.getId().compareTo(o2.getId());
    }




    public ZenDiscovery(...) {// excerpt: the constructor registers the fault-detection listeners
        .....
        this.masterFD = new MasterFaultDetection(settings, threadPool, transportService, this::clusterState, masterService, clusterName);
        this.masterFD.addListener(new MasterNodeFailureListener());// listener for master node failures
        this.nodesFD = new NodesFaultDetection(settings, threadPool, transportService, clusterName);
        this.nodesFD.addListener(new NodeFaultDetectionListener());// listener for failures of the other (regular) nodes
        .....

   }


 普通节点失败的话
    private void handleNodeFailure(final DiscoveryNode node, final String reason) {
        //如果node 失败 移除node 移除是会判断是否够候选人数 够的话迁移移除的这个节点的分片到其他节点上  否则的话放弃master身份
        if (lifecycleState() != Lifecycle.State.STARTED) {
            // not started, ignore a node failure
            return;
        }
        if (!localNodeMaster()) {
            // nothing to do here...
            return;
        }
        removeNode(node, "zen-disco-node-failed", reason);
    }


    private void removeNode(final DiscoveryNode node, final String source, final String reason) {//
        masterService.submitStateUpdateTask(
                source + "(" + node + "), reason(" + reason + ")",
                new NodeRemovalClusterStateTaskExecutor.Task(node, reason),//定义失败的task
                ClusterStateTaskConfig.build(Priority.IMMEDIATE),
                nodeRemovalExecutor,
                nodeRemovalExecutor);
    }


  public ClusterTasksResult<Task> execute(final ClusterState currentState, final List<Task> tasks) throws Exception {
            final DiscoveryNodes.Builder remainingNodesBuilder = DiscoveryNodes.builder(currentState.nodes());
            boolean removed = false;
            for (final Task task : tasks) {
                if (currentState.nodes().nodeExists(task.node())) {
                    remainingNodesBuilder.remove(task.node());
                    removed = true;
                } else {
                    logger.debug("node [{}] does not exist in cluster state, ignoring", task);
                }
            }

            if (!removed) {
                // no nodes to remove, keep the current cluster state
                return ClusterTasksResult.<Task>builder().successes(tasks).build(currentState);
            }

            final ClusterState remainingNodesClusterState = remainingNodesClusterState(currentState, remainingNodesBuilder);

            final ClusterTasksResult.Builder<Task> resultBuilder = ClusterTasksResult.<Task>builder().successes(tasks);
            if (electMasterService.hasEnoughMasterNodes(remainingNodesClusterState.nodes()) == false) {//判断是否够法定人数
                final int masterNodes = electMasterService.countMasterNodes(remainingNodesClusterState.nodes());
                rejoin.accept(LoggerMessageFormat.format("not enough master nodes (has [{}], but needed [{}])",
                                                         masterNodes, electMasterService.minimumMasterNodes()));
                return resultBuilder.build(currentState);//放弃master身份  避免产生双主 
            } else {

            //足够的话 
                return resultBuilder.build(allocationService.deassociateDeadNodes(remainingNodesClusterState, true, describeTasks(tasks)));

        //如果一个节点失败 刚好也够法定人数迁移移除的这个节点的分片到其他节点上             }         } 主节点失败     private void handleMasterGone(final DiscoveryNode masterNode, final Throwable cause, final String reason) {//master离开重新选举         if (lifecycleState() != Lifecycle.State.STARTED) {             // not started, ignore a master failure             return;         }         if (localNodeMaster()) {             // we might get this on both a master telling us shutting down, and then the disconnect failure             return;         }         logger.info((Supplier<?>) () -> new ParameterizedMessage("master_left [{}], reason [{}]", masterNode, reason), cause);         synchronized (stateMutex) {//重新选举一个master             if (localNodeMaster() == false && masterNode.equals(committedState.get().nodes().getMasterNode())) {                 // flush any pending cluster states from old master, so it will not be set as master again                 pendingStatesQueue.failAllStatesAndClear(new ElasticsearchException("master left [{}]", reason));                 rejoin("master left (reason = " + reason + ")");//重新加入节点 重新选举            }         }     }

 

posted @ 2020-07-07 14:46  王南辉  阅读(658)  评论(0)    收藏  举报