Redis Source Code: Node Status Checking (Part 7)

Introduction

A Redis Cluster node's failure state falls into two categories, possible failure and failure:

  • Possible failure (pfail): when node A's communication with node B times out, node A marks B as possibly failing.
  • Failure (fail): when node A has collected messages from more than half of the cluster saying that node B is down, it marks B as failed.

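In the source these two states are just flag bits on clusterNode->flags. A minimal sketch of the relevant definitions from cluster.h follows; the exact numeric values are assumed here and may differ between Redis versions:

/* Sketch of the relevant flag bits (values assumed, check your cluster.h). */
#define CLUSTER_NODE_PFAIL 4   /* Failure? Need acknowledgement (possible failure). */
#define CLUSTER_NODE_FAIL  8   /* The node is believed to be failing. */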
Node status maintenance

clusterCron periodically checks node status

In cluster mode, clusterCron is called every 100ms. On each call it checks whether communication between the myself node and each of the other nodes has timed out; if so, it sets the corresponding node's state to CLUSTER_NODE_PFAIL.

void clusterCron(void) {
    ...
    di = dictGetSafeIterator(server.cluster->nodes);
    // Iterate over server.cluster->nodes and check the connection state
    // between myself and every other node.
    while((de = dictNext(di)) != NULL) {
        clusterNode *node = dictGetVal(de);
        // Time elapsed since the last ping was sent to this node.
        mstime_t ping_delay = now - node->ping_sent;
        // Time elapsed since we last received any data from this node.
        mstime_t data_delay = now - node->data_received;
        // Use the smaller of ping_delay and data_delay.
        mstime_t node_delay = (ping_delay < data_delay) ? ping_delay :
                                                          data_delay;
        // If node_delay exceeds server.cluster_node_timeout, mark the node
        // as possibly failing.
        if (node_delay > server.cluster_node_timeout) {
            /* Timeout reached. Set the node as possibly failing if it is
             * not already in this state. */
            // Only set the PFAIL flag if the node is not already PFAIL or FAIL.
            if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) {
                serverLog(LL_DEBUG,"*** NODE %.40s possibly failing",
                    node->name);
                node->flags |= CLUSTER_NODE_PFAIL;
                update_state = 1;
            }
        }
        ...
    }
}
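The timeout compared against above is the cluster-node-timeout setting from redis.conf, given in milliseconds. A typical configuration looks like this (the value shown is just an example):

cluster-enabled yes
cluster-node-timeout 15000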

Setting a node's status to FAIL

If node A receives a message from node B reporting that node C is in the PFAIL state, node A counts how many nodes have reported C as PFAIL. Once that count exceeds half of cluster->size, node A marks node C as FAIL.
cluster->size is the number of master nodes in the cluster that own slots; only masters have voting rights.
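For example, in a cluster where 5 masters own slots, cluster->size is 5 and the needed quorum is 5/2 + 1 = 3: node A marks C as FAIL once it holds failure reports from at least two other masters, because its own PFAIL observation counts as the third vote (see markNodeAsFailingIfNeeded below).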

// Handler for the gossip section of ping/pong/meet messages.
void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
    clusterMsgDataGossip *g = (clusterMsgDataGossip*) hdr->data.ping.gossip;
    clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender);
    ...
    // Flags and node described by the current gossip entry.
    uint16_t flags = ntohs(g->flags);
    clusterNode *node = clusterLookupNode(g->nodename);
    ...
        if (node) {
            /* We already know this node.
               Handle failure reports, only when the sender is a master. */
            // sender is the node that sent the message; node is the node the
            // gossip entry describes. Only a master sender's view counts, and
            // reports about ourselves are ignored.
            if (sender && nodeIsMaster(sender) && node != myself) {
                // The master sender reports node as FAIL or PFAIL.
                if (flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) {
                    // Record sender in node->fail_reports: we received a
                    // failure report about node from sender.
                    if (clusterNodeAddFailureReport(node,sender)) {
                        serverLog(LL_VERBOSE,
                            "Node %.40s reported node %.40s as not reachable.",
                            sender->name, node->name);
                    }
                    markNodeAsFailingIfNeeded(node);
                } else {
                    // The sender no longer considers node failing: drop its
                    // previous failure report, if any.
                    if (clusterNodeDelFailureReport(node,sender)) {
                        serverLog(LL_VERBOSE,
                            "Node %.40s reported node %.40s is back online.",
                            sender->name, node->name);
                    }
                }
            }
        }
    ...
}
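node->fail_reports is a list of per-sender report records. Roughly, following the definition in cluster.h (field layout may vary across versions), each entry remembers who reported the failure and when, so clusterNodeAddFailureReport can either refresh the timestamp of an existing report or append a new one:

/* Sketch of one entry of node->fail_reports (see cluster.h). */
typedef struct clusterNodeFailReport {
    struct clusterNode *node;  /* Node reporting the failure condition. */
    mstime_t time;             /* Time of the last report from this node. */
} clusterNodeFailReport;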
// If more than half of cluster->size agree, set the FAIL flag.
void markNodeAsFailingIfNeeded(clusterNode *node) {
    int failures;
    int needed_quorum = (server.cluster->size / 2) + 1;

    if (!nodeTimedOut(node)) return; /* We can reach it. */
    if (nodeFailed(node)) return; /* Already FAILing. */

    failures = clusterNodeFailureReportsCount(node);
    /* Also count myself as a voter if I'm a master. */
    if (nodeIsMaster(myself)) failures++;
    if (failures < needed_quorum) return; /* No weak agreement from masters. */

    serverLog(LL_NOTICE,
        "Marking node %.40s as failing (quorum reached).", node->name);

    /* Mark the node as failing. */
    node->flags &= ~CLUSTER_NODE_PFAIL;
    node->flags |= CLUSTER_NODE_FAIL;
    node->fail_time = mstime();

    /* Broadcast the failing node name to everybody, forcing all the other
     * reachable nodes to flag the node as FAIL.
     * We do that even if this node is a replica and not a master: anyway
     * the failing state is triggered collecting failure reports from masters,
     * so here the replica is only helping propagating this status. */
    clusterSendFail(node->name);
    clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
}
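One detail worth noting: before counting, clusterNodeFailureReportsCount discards stale reports, so old PFAIL gossip cannot accumulate forever. Below is a sketch of that cleanup, assuming the usual validity window of cluster_node_timeout multiplied by CLUSTER_FAIL_REPORT_VALIDITY_MULT; in the real source this lives in clusterNodeCleanupFailureReports and details may differ between versions:

/* Sketch: drop failure reports older than the validity window before
 * counting them (modeled on clusterNodeCleanupFailureReports). */
static void cleanupFailureReports(clusterNode *node) {
    mstime_t maxtime = server.cluster_node_timeout *
                       CLUSTER_FAIL_REPORT_VALIDITY_MULT;
    mstime_t now = mstime();
    listIter li;
    listNode *ln;

    listRewind(node->fail_reports,&li);
    while ((ln = listNext(&li)) != NULL) {
        clusterNodeFailReport *fr = ln->value;
        // Expired reports no longer count toward the FAIL quorum.
        if (now - fr->time > maxtime) listDelNode(node->fail_reports,ln);
    }
}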

Message exchange between nodes

Every second, node A picks a random node and sends it a ping message; it also pings nodes it has not communicated with for a long time. The message carries information about a few randomly chosen nodes plus every node currently flagged as pfail.

void clusterCron(void) {
    ...
    if (!(iteration % 10)) {
        int j;

        /* Check a few random nodes and ping the one with the oldest
         * pong_received time. */
        // Sample cluster->nodes 5 times and ping the candidate with the
        // smallest (oldest) pong_received. min_pong_node and min_pong are
        // declared earlier in the function (elided above).
        for (j = 0; j < 5; j++) {
            de = dictGetRandomKey(server.cluster->nodes);
            clusterNode *this = dictGetVal(de);

            /* Don't ping nodes disconnected or with a ping currently active. */
            // Skip nodes whose link is down or that already have a ping in flight.
            if (this->link == NULL || this->ping_sent != 0) continue;
            // Also skip ourselves and nodes still in handshake.
            if (this->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE))
                continue;
            if (min_pong_node == NULL || min_pong > this->pong_received) {
                min_pong_node = this;
                min_pong = this->pong_received;
            }
        }
        if (min_pong_node) {
            serverLog(LL_DEBUG,"Pinging node %.40s", min_pong_node->name);
            // Ping min_pong_node; the message carries gossip about a few random
            // nodes and about every node currently flagged PFAIL.
            clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING);
        }
    }
    ...

    di = dictGetSafeIterator(server.cluster->nodes);
    // Iterate over server.cluster->nodes and check the connection state
    // between myself and every other node.
    while((de = dictNext(di)) != NULL) {
        ...
        // If there is no ping in flight to this node and we have not received
        // a pong from it for too long, send it a ping now.
        if (node->link &&
            node->ping_sent == 0 &&
            (now - node->pong_received) > server.cluster_node_timeout/2)
        {
            clusterSendPing(node->link, CLUSTERMSG_TYPE_PING);
            continue;
        }
        ...
    }
}
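How many gossip entries each ping carries is decided earlier in clusterSendPing (elided in the listing below): roughly one tenth of the known nodes with a lower bound of three, plus one slot for every node currently in PFAIL. The following sketch of that computation is from memory of the source and is only indicative; names such as stats_pfail_nodes may differ between versions:

/* Sketch: sizing the gossip section of a ping (details vary by version). */
int freshnodes = dictSize(server.cluster->nodes) - 2; /* minus myself and the receiver */
int wanted = dictSize(server.cluster->nodes) / 10;    /* about 1/10 of the cluster */
if (wanted < 3) wanted = 3;
if (wanted > freshnodes) wanted = freshnodes;
/* All PFAIL nodes are appended on top of the random sample. */
int pfail_wanted = server.cluster->stats_pfail_nodes;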

// Send a ping to the node behind link; the message carries gossip about a few
// randomly chosen nodes plus every node flagged PFAIL.
void clusterSendPing(clusterLink *link, int type) {
    ...
    /* Populate the gossip fields */
    int maxiterations = wanted*3;
    // Randomly sample nodes: add at most `wanted` of them to hdr, trying at
    // most wanted*3 times.
    while(freshnodes > 0 && gossipcount < wanted && maxiterations--) {
        dictEntry *de = dictGetRandomKey(server.cluster->nodes);
        clusterNode *this = dictGetVal(de);

        /* Don't include this node: the whole packet header is about us
         * already, so we just gossip about other nodes. */
        if (this == myself) continue;

        /* PFAIL nodes will be added later. */
        if (this->flags & CLUSTER_NODE_PFAIL) continue;

        /* In the gossip section don't include:
         * 1) Nodes in HANDSHAKE state.
         * 3) Nodes with the NOADDR flag set.
         * 4) Disconnected nodes if they don't have configured slots.
         */
        // Skip nodes in handshake and nodes without a known address.
        if (this->flags & (CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_NOADDR) ||
            (this->link == NULL && this->numslots == 0))
        {
            freshnodes--; /* Technically not correct, but saves CPU. */
            continue;
        }

        /* Do not add a node we already have. */
        if (clusterNodeIsInGossipSection(hdr,gossipcount,this)) continue;

        /* Add it */
        clusterSetGossipEntry(hdr,gossipcount,this);
        freshnodes--;
        gossipcount++;
    }
    ...
    /* If there are PFAIL nodes, add them at the end. */
    if (pfail_wanted) {
        dictIterator *di;
        dictEntry *de;
        // Iterate over cluster->nodes and append every PFAIL node to hdr.
        di = dictGetSafeIterator(server.cluster->nodes);
        while((de = dictNext(di)) != NULL && pfail_wanted > 0) {
            clusterNode *node = dictGetVal(de);
            if (node->flags & CLUSTER_NODE_HANDSHAKE) continue;
            if (node->flags & CLUSTER_NODE_NOADDR) continue;
            if (!(node->flags & CLUSTER_NODE_PFAIL)) continue;
            clusterSetGossipEntry(hdr,gossipcount,node);
            freshnodes--;
            gossipcount++;
            /* We take the count of the slots we allocated, since the
             * PFAIL stats may not match perfectly with the current number
             * of PFAIL nodes. */
            pfail_wanted--;
        }
        dictReleaseIterator(di);
    }

}
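Each entry written by clusterSetGossipEntry is a fixed-size clusterMsgDataGossip record. A rough sketch of its layout, assuming the shape defined in cluster.h (field order and padding may differ between versions):

/* Sketch of one gossip entry in the packet (see cluster.h). */
typedef struct {
    char nodename[CLUSTER_NAMELEN];   /* node ID of the described node */
    uint32_t ping_sent;               /* last ping time, in seconds */
    uint32_t pong_received;           /* last pong time, in seconds */
    char ip[NET_IP_STR_LEN];          /* IP address last time it was seen */
    uint16_t port;                    /* client port */
    uint16_t cport;                   /* cluster bus port */
    uint16_t flags;                   /* node flags, including PFAIL/FAIL */
    uint32_t notused1;
} clusterMsgDataGossip;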