redis源码之节点状态检查(七)
简介
redis节点的下线状态分为可疑下线和下线:
- 可疑下线(pfail):当节点A和节点B通信超时,节点A就会认为节点B为可疑下线
- 下线(fail):当节点A收集到的故障报告表明集群中超过半数的主节点都认为节点B处于pfail或fail状态时,节点A就认为B是下线了
节点状态维护
clusterCron定时检查节点状态
集群模式下,函数clusterCron会每100ms调用一次,每次调用都要检查myself节点与其他节点是否存在通信超时,如果存在就设置对应节点状态为CLUSTER_NODE_PFAIL
/* Excerpt of clusterCron(): the loop that scans every known node and flags
 * as PFAIL (possibly failing) the ones that have been silent for longer than
 * server.cluster_node_timeout. Lines consisting of "..." mark code that was
 * left out of this excerpt. */
void clusterCron(void) {
...
di = dictGetSafeIterator(server.cluster->nodes);
// Walk server.cluster->nodes and check the link state between myself and each node.
// NOTE(review): `node` is not assigned in the visible lines; presumably it is
// taken from `de` on a line not shown in this excerpt.
while((de = dictNext(di)) != NULL) {
// Time elapsed since the ping was sent.
mstime_t ping_delay = now - node->ping_sent;
// Time elapsed since data was last received from the node.
mstime_t data_delay = now - node->data_received;
// Use the smaller of ping_delay and data_delay.
mstime_t node_delay = (ping_delay < data_delay) ? ping_delay :
data_delay;
// If node_delay exceeds server.cluster_node_timeout, mark the node as possibly failing.
if (node_delay > server.cluster_node_timeout) {
/* Timeout reached. Set the node as possibly failing if it is
* not already in this state. */
// Only a node that is neither PFAIL nor FAIL yet gets the PFAIL flag set.
if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) {
serverLog(LL_DEBUG,"*** NODE %.40s possibly failing",
node->name);
node->flags |= CLUSTER_NODE_PFAIL;
update_state = 1;
}
}
...
}
设置节点状态为FAIL
如果节点A收到主节点B发送的消息中说节点C处于pfail或fail状态,节点A会把B记入节点C的故障报告,并统计一共有多少个主节点报告了节点C不可达(如果A自己是主节点,也算一票)。当A自己也认为节点C通信超时,并且报告数达到cluster->size/2+1(即超过cluster->size的一半)时,节点A就标记节点C处于FAIL状态。
cluster->size是集群中存在slots的主节点的数量,只有主节点才有投票权。
// Message handler (excerpt): processes the gossip section of a received
// message, adding or removing the sender's failure report for the node the
// gossip entry describes. Lines consisting of "..." mark omitted code; the
// declarations of `node` and `flags` are not shown, and the function's closing
// brace falls outside this excerpt.
void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
clusterMsgDataGossip *g = (clusterMsgDataGossip*) hdr->data.ping.gossip;
clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender);
node = clusterLookupNode(g->nodename);
...
if (node) {
/* We already know this node.
Handle failure reports, only when the sender is a master. */
// `sender` is the node that sent the message; `node` is the node the gossip entry describes.
// Act only when a master `sender` reports on a node other than myself.
if (sender && nodeIsMaster(sender) && node != myself) {
if (flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) {
// Record sender in node's failure-report list (node->fail_reports): sender
// says node is PFAIL or FAIL. The log line is emitted only when the call
// returns non-zero.
if (clusterNodeAddFailureReport(node,sender)) {
serverLog(LL_VERBOSE,
"Node %.40s reported node %.40s as not reachable.",
sender->name, node->name);
}
// Re-evaluate whether enough reports exist to promote node to FAIL.
markNodeAsFailingIfNeeded(node);
} else {
// sender no longer flags node as failing: drop its failure report, if any.
if (clusterNodeDelFailureReport(node,sender)) {
serverLog(LL_VERBOSE,
"Node %.40s reported node %.40s is back online.",
sender->name, node->name);
}
}
}
}
/* Promote `node` from PFAIL to FAIL when a majority agrees it is unreachable.
 *
 * The node is flagged FAIL only if all of the following hold:
 *   - this instance itself sees the node as timed out (nodeTimedOut),
 *   - the node is not already flagged FAIL,
 *   - the number of failure reports collected for the node, plus one for
 *     myself when myself is a master, reaches (server.cluster->size / 2) + 1.
 *
 * On promotion the PFAIL flag is cleared, FAIL is set, fail_time is stamped,
 * the failure is broadcast with clusterSendFail(), and a state update plus a
 * config save are scheduled through clusterDoBeforeSleep(). */
void markNodeAsFailingIfNeeded(clusterNode *node) {
    /* We can still reach it, or it is already FAILing: nothing to do. */
    if (!nodeTimedOut(node) || nodeFailed(node)) return;

    /* Strict majority of server.cluster->size. */
    const int quorum = (server.cluster->size / 2) + 1;

    /* Reports received from other nodes, plus our own vote if we are a master. */
    int votes = clusterNodeFailureReportsCount(node);
    if (nodeIsMaster(myself)) votes += 1;

    /* No weak agreement from masters yet. */
    if (votes < quorum) return;

    serverLog(LL_NOTICE,
        "Marking node %.40s as failing (quorum reached).", node->name);

    /* Swap PFAIL for FAIL and remember when it happened. */
    node->flags = (node->flags & ~CLUSTER_NODE_PFAIL) | CLUSTER_NODE_FAIL;
    node->fail_time = mstime();

    /* Broadcast the failing node name to everybody, so that every other
     * reachable node flags it as FAIL too. This is done even when this
     * instance is a replica: the failure reports that triggered the FAIL
     * state come from masters, so a replica is merely helping to propagate
     * the status. */
    clusterSendFail(node->name);
    clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
}
节点间消息通信
节点A每秒都会随机抽取5次节点,向其中pong_received最早(即最久没有收到pong)的节点发送ping消息;此外,如果某个节点当前没有未完成的ping,并且距离上次收到它的pong已超过cluster_node_timeout/2,也会立即给它发送ping消息。ping消息里面包含随机抽取的几个节点和pfail的节点信息。
/* Excerpt of clusterCron(): the parts that decide which nodes get a PING.
 * Lines consisting of "..." mark code that was left out of this excerpt;
 * `iteration`, `min_pong_node`, `min_pong`, `now`, `di`, `de` and `node` are
 * declared or assigned in the omitted parts. */
void clusterCron(void) {
...
// Runs once every 10 iterations of this function.
if (!(iteration % 10)) {
int j;
/* Check a few random nodes and ping the one with the oldest
* pong_received time. */
// Draw a random node from cluster->nodes 5 times and ping the candidate with the smallest pong_received.
for (j = 0; j < 5; j++) {
de = dictGetRandomKey(server.cluster->nodes);
clusterNode *this = dictGetVal(de);
/* Don't ping nodes disconnected or with a ping currently active. */
// Skip when the link to `this` is down or a ping to it is already pending.
if (this->link == NULL || this->ping_sent != 0) continue;
// Skip myself and nodes still in handshake: neither is pinged here.
if (this->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE))
continue;
// Keep the candidate with the oldest pong_received seen so far.
if (min_pong_node == NULL || min_pong > this->pong_received) {
min_pong_node = this;
min_pong = this->pong_received;
}
}
if (min_pong_node) {
serverLog(LL_DEBUG,"Pinging node %.40s", min_pong_node->name);
// Ping min_pong_node; the ping carries info about a few random nodes and about the PFAIL nodes.
clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING);
}
}
...
di = dictGetSafeIterator(server.cluster->nodes);
// Walk server.cluster->nodes and check the link state between myself and each node.
while((de = dictNext(di)) != NULL) {
...
// If no ping to this node is pending and its last pong is older than
// half of cluster_node_timeout, send it a ping now.
if (node->link &&
node->ping_sent == 0 &&
(now - node->pong_received) > server.cluster_node_timeout/2)
{
clusterSendPing(node->link, CLUSTERMSG_TYPE_PING);
continue;
}
...
}
}
// Excerpt of clusterSendPing(): builds the gossip section of a ping message
// for the node behind `link`. The section holds a few randomly sampled nodes
// followed by nodes in PFAIL state. Lines consisting of "..." mark omitted
// code; `hdr`, `wanted`, `freshnodes`, `gossipcount` and `pfail_wanted` are
// set up in the omitted parts.
void clusterSendPing(clusterLink *link, int type) {
...
/* Populate the gossip fields */
int maxiterations = wanted*3;
// Sample random nodes: stop once `wanted` entries were added, once freshnodes
// reaches zero, or after wanted*3 attempts.
// Each accepted node is written into hdr as a gossip entry.
while(freshnodes > 0 && gossipcount < wanted && maxiterations--) {
dictEntry *de = dictGetRandomKey(server.cluster->nodes);
clusterNode *this = dictGetVal(de);
/* Don't include this node: the whole packet header is about us
* already, so we just gossip about other nodes. */
if (this == myself) continue;
/* PFAIL nodes will be added later. */
if (this->flags & CLUSTER_NODE_PFAIL) continue;
/* In the gossip section don't include:
* 1) Nodes in HANDSHAKE state.
* 2) Nodes with the NOADDR flag set.
* 3) Disconnected nodes if they don't have configured slots.
*/
// Skip handshake nodes, nodes without an address, and linkless nodes that own no slots.
if (this->flags & (CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_NOADDR) ||
(this->link == NULL && this->numslots == 0))
{
freshnodes--; /* Technically not correct, but saves CPU. */
continue;
}
/* Do not add a node we already have. */
if (clusterNodeIsInGossipSection(hdr,gossipcount,this)) continue;
/* Add it */
clusterSetGossipEntry(hdr,gossipcount,this);
freshnodes--;
gossipcount++;
}
...
/* If there are PFAIL nodes, add them at the end. */
if (pfail_wanted) {
dictIterator *di;
dictEntry *de;
// Walk cluster->nodes and append PFAIL nodes to hdr until pfail_wanted entries were added.
di = dictGetSafeIterator(server.cluster->nodes);
while((de = dictNext(di)) != NULL && pfail_wanted > 0) {
clusterNode *node = dictGetVal(de);
if (node->flags & CLUSTER_NODE_HANDSHAKE) continue;
if (node->flags & CLUSTER_NODE_NOADDR) continue;
if (!(node->flags & CLUSTER_NODE_PFAIL)) continue;
clusterSetGossipEntry(hdr,gossipcount,node);
freshnodes--;
gossipcount++;
/* We take the count of the slots we allocated, since the
* PFAIL stats may not match perfectly with the current number
* of PFAIL nodes. */
pfail_wanted--;
}
dictReleaseIterator(di);
}
}

浙公网安备 33010602011771号