ES 选主流程
实例化 Node 后调用各个模块的 start 方法;discovery 模块依次调用 startInitialJoin() -> startNewThreadIfNotRunning() -> innerJoinCluster()。
/**
 * the main function of a join thread. This function is guaranteed to join the cluster
 * or spawn a new join thread upon failure to do so.
 */
private void innerJoinCluster() {
    DiscoveryNode masterNode = null;
    final Thread currentThread = Thread.currentThread();
    nodeJoinController.startElectionContext();
    // loop until a provisional master has been elected (or this join thread is cancelled)
    while (masterNode == null && joinThreadControl.joinThreadActive(currentThread)) {
        masterNode = findMaster();
    }
    if (!joinThreadControl.joinThreadActive(currentThread)) {
        logger.trace("thread is no longer in currentJoinThread. Stopping.");
        return;
    }
    if (transportService.getLocalNode().equals(masterNode)) { // the provisional master is the local node
        // we count as one; minimumMasterNodes() is -1 when discovery.zen.minimum_master_nodes is unset
        final int requiredJoins = Math.max(0, electMaster.minimumMasterNodes() - 1);
        logger.debug("elected as master, waiting for incoming joins ([{}] needed)", requiredJoins);
        // wait for other nodes to join this node; if the quorum is not reached before the timeout
        // (default 30s) the election restarts, otherwise the new cluster state is published
        nodeJoinController.waitToBeElectedAsMaster(requiredJoins, masterElectionWaitForJoinsTimeout,
            new NodeJoinController.ElectionCallback() {
                @Override
                public void onElectedAsMaster(ClusterState state) {
                    synchronized (stateMutex) {
                        joinThreadControl.markThreadAsDone(currentThread);
                    }
                }

                @Override
                public void onFailure(Throwable t) { // election failed: start a new join/election round
                    logger.trace("failed while waiting for nodes to join, rejoining", t);
                    synchronized (stateMutex) {
                        joinThreadControl.markThreadAsDoneAndStartNew(currentThread);
                    }
                }
            }
        );
    } else {
        // process any incoming joins (they will fail because we are not the master)
        nodeJoinController.stopElectionContext(masterNode + " elected"); // stop the election context
        // send a join request to the elected master, i.e. acknowledge it as master
        final boolean success = joinElectedMaster(masterNode);
        synchronized (stateMutex) {
            if (success) { // joined the master successfully
                // re-read the master from the committed cluster state
                DiscoveryNode currentMasterNode = this.clusterState().getNodes().getMasterNode();
                if (currentMasterNode == null) { // no master set yet: restart the whole election
                    logger.debug("no master node is set, despite of join request completing. retrying pings.");
                    joinThreadControl.markThreadAsDoneAndStartNew(currentThread);
                } else if (currentMasterNode.equals(masterNode) == false) {
                    // the confirmed master differs from the provisional one — can happen when a large
                    // cluster's nodes start at different times; rejoin with the new master
                    joinThreadControl.stopRunningThreadAndRejoin("master_switched_while_finalizing_join");
                }
                joinThreadControl.markThreadAsDone(currentThread); // join finished
            } else { // failed to join the master: rejoin the cluster
                // failed to join. Try again...
                joinThreadControl.markThreadAsDoneAndStartNew(currentThread);
            }
        }
    }
}

// elect a provisional (temporary) master from the ping responses
private DiscoveryNode findMaster() {
    logger.trace("starting to ping");
    // ping the other known nodes (the local node is not in these responses)
    List<ZenPing.PingResponse> fullPingResponses = pingAndWait(pingTimeout).toList();
    if (fullPingResponses == null) {
        logger.trace("No full ping responses");
        return null;
    }
    if (logger.isTraceEnabled()) {
        StringBuilder sb = new StringBuilder();
        if (fullPingResponses.size() == 0) {
            sb.append(" {none}");
        } else {
            for (ZenPing.PingResponse pingResponse : fullPingResponses) {
                sb.append("\n\t--> ").append(pingResponse);
            }
        }
        logger.trace("full ping responses:{}", sb);
    }
    final DiscoveryNode localNode = transportService.getLocalNode();
    // add our selves
    assert fullPingResponses.stream().map(ZenPing.PingResponse::node)
        .filter(n -> n.equals(localNode)).findAny().isPresent() == false;
    fullPingResponses.add(new ZenPing.PingResponse(localNode, null, this.clusterState()));
    // filter responses: drop pings from nodes excluded from master election
    final List<ZenPing.PingResponse> pingResponses = filterPingResponses(fullPingResponses, masterElectionIgnoreNonMasters, logger);
    // active masters reported by other nodes (never includes the local node)
    List<DiscoveryNode> activeMasters = new ArrayList<>();
    for (ZenPing.PingResponse pingResponse : pingResponses) {
        // We can't include the local node in pingMasters list, otherwise we may up electing ourselves without
        // any check / verifications from other nodes in ZenDiscover#innerJoinCluster()
        if (pingResponse.master() != null && !localNode.equals(pingResponse.master())) {
            activeMasters.add(pingResponse.master());
        }
    }
    // nodes discovered during pinging
    List<ElectMasterService.MasterCandidate> masterCandidates = new ArrayList<>(); // master-eligible candidates
    for (ZenPing.PingResponse pingResponse : pingResponses) {
        if (pingResponse.node().isMasterNode()) { // only master-eligible nodes become candidates
            masterCandidates.add(new ElectMasterService.MasterCandidate(pingResponse.node(), pingResponse.getClusterStateVersion()));
        }
    }
    if (activeMasters.isEmpty()) {
        // no active master reported by anyone; the local node is in the candidate list but, by the
        // rule above, never votes for itself via activeMasters
        if (electMaster.hasEnoughCandidates(masterCandidates)) {
            // discovery.zen.minimum_master_nodes quorum is satisfied; pick the winner:
            // highest cluster state version first, then master-eligible nodes, then lowest node id
            final ElectMasterService.MasterCandidate winner = electMaster.electMaster(masterCandidates);
            logger.trace("candidate {} won election", winner);
            return winner.getNode();
        } else {
            // not enough candidates for a quorum: bail and ping again
            // if we don't have enough master nodes, we bail, because there are not enough master to elect from
            logger.warn("not enough master nodes discovered during pinging (found [{}], but needed [{}]), pinging again",
                masterCandidates, electMaster.minimumMasterNodes());
            return null;
        }
    } else {
        assert !activeMasters.contains(localNode) :
            "local node should never be elected as master when other nodes indicate an active master";
        // lets tie break between discovered nodes
        return electMaster.tieBreakActiveMasters(activeMasters); // pick among the already-active masters
    }
}

// ----- master election comparison (ElectMasterService.MasterCandidate) -----

public static int compare(MasterCandidate c1, MasterCandidate c2) {
    // we explicitly swap c1 and c2 here. the code expects "better" is lower in a sorted
    // list, so if c2 has a higher cluster state version, it needs to come first.
    int ret = Long.compare(c2.clusterStateVersion, c1.clusterStateVersion); // higher version sorts first
    if (ret == 0) {
        ret = compareNodes(c1.getNode(), c2.getNode());
    }
    return ret;
}

/** master nodes go before other nodes, with a secondary sort by id **/
private static int compareNodes(DiscoveryNode o1, DiscoveryNode o2) {
    if (o1.isMasterNode() && !o2.isMasterNode()) { // master-eligible (per node config) sorts first
        return -1;
    }
    if (!o1.isMasterNode() && o2.isMasterNode()) {
        return 1;
    }
    return o1.getId().compareTo(o2.getId());
}

// ----- fault detection: the ZenDiscovery constructor registers the failure listeners -----

public ZenDiscovery(...) { // constructor (excerpt): wire up fault-detection listeners
    .....
    this.masterFD = new MasterFaultDetection(settings, threadPool, transportService, this::clusterState, masterService, clusterName);
    this.masterFD.addListener(new MasterNodeFailureListener()); // detects failure of the master node
    this.nodesFD = new NodesFaultDetection(settings, threadPool, transportService, clusterName);
    this.nodesFD.addListener(new NodeFaultDetectionListener()); // detects failure of ordinary nodes
    .....
}

// ----- an ordinary (non-master) node fails -----

private void handleNodeFailure(final DiscoveryNode node, final String reason) {
    // when a node fails, remove it; the removal path checks the master-node quorum — with quorum,
    // the failed node's shards are reallocated to other nodes, otherwise this node gives up mastership
    if (lifecycleState() != Lifecycle.State.STARTED) {
        // not started, ignore a node failure
        return;
    }
    if (!localNodeMaster()) {
        // nothing to do here...
        return;
    }
    removeNode(node, "zen-disco-node-failed", reason);
}

private void removeNode(final DiscoveryNode node, final String source, final String reason) {
    masterService.submitStateUpdateTask(
        source + "(" + node + "), reason(" + reason + ")",
        new NodeRemovalClusterStateTaskExecutor.Task(node, reason), // the node-removal task
        ClusterStateTaskConfig.build(Priority.IMMEDIATE),
        nodeRemovalExecutor,
        nodeRemovalExecutor);
}

public ClusterTasksResult<Task> execute(final ClusterState currentState, final List<Task> tasks) throws Exception {
    final DiscoveryNodes.Builder remainingNodesBuilder = DiscoveryNodes.builder(currentState.nodes());
    boolean removed = false;
    for (final Task task : tasks) {
        if (currentState.nodes().nodeExists(task.node())) {
            remainingNodesBuilder.remove(task.node());
            removed = true;
        } else {
            logger.debug("node [{}] does not exist in cluster state, ignoring", task);
        }
    }
    if (!removed) {
        // no nodes to remove, keep the current cluster state
        return ClusterTasksResult.<Task>builder().successes(tasks).build(currentState);
    }
    final ClusterState remainingNodesClusterState = remainingNodesClusterState(currentState, remainingNodesBuilder);
    final ClusterTasksResult.Builder<Task> resultBuilder = ClusterTasksResult.<Task>builder().successes(tasks);
    if (electMasterService.hasEnoughMasterNodes(remainingNodesClusterState.nodes()) == false) { // quorum check
        final int masterNodes = electMasterService.countMasterNodes(remainingNodesClusterState.nodes());
        rejoin.accept(LoggerMessageFormat.format("not enough master nodes (has [{}], but needed [{}])", masterNodes,
            electMasterService.minimumMasterNodes()));
        return resultBuilder.build(currentState); // give up mastership to avoid a split brain (two masters)
    } else {
        // quorum still holds: disassociate the dead nodes so their shards move to the remaining nodes
        return resultBuilder.build(allocationService.deassociateDeadNodes(remainingNodesClusterState, true, describeTasks(tasks)));
    }
}

// ----- the master node fails -----

private void handleMasterGone(final DiscoveryNode masterNode, final Throwable cause, final String reason) { // master left: re-elect
    if (lifecycleState() != Lifecycle.State.STARTED) {
        // not started, ignore a master failure
        return;
    }
    if (localNodeMaster()) {
        // we might get this on both a master telling us shutting down, and then the disconnect failure
        return;
    }
    logger.info((Supplier<?>) () -> new ParameterizedMessage("master_left [{}], reason [{}]", masterNode, reason), cause);
    synchronized (stateMutex) { // start a new master election
        if (localNodeMaster() == false && masterNode.equals(committedState.get().nodes().getMasterNode())) {
            // flush any pending cluster states from old master, so it will not be set as master again
            pendingStatesQueue.failAllStatesAndClear(new ElasticsearchException("master left [{}]", reason));
            rejoin("master left (reason = " + reason + ")"); // rejoin the cluster and re-elect
        }
    }
}