redis源码之集群(六)
集群
集群初始化
/* cluster.c: initialize the cluster state of this node.
 *
 * Allocates server.cluster and resets its per-message-type counters and the
 * slot table, locks and loads the cluster config file (creating a brand new
 * node identity and saving it when no configuration can be loaded), starts
 * listening on the cluster bus port (client port + CLUSTER_PORT_INCR),
 * registers the accept handler and resets the manual failover state.
 *
 * The process exits if the config file cannot be locked, if the client port
 * is too high to derive a bus port from it, or if the bus port cannot be
 * bound. */
void clusterInit(void) {
    int saveconf = 0;

    server.cluster = zmalloc(sizeof(clusterState));
    /* NOTE: the "..." below marks initialization code omitted from this
     * excerpt. */
    ...
    for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) {
        server.cluster->stats_bus_messages_sent[i] = 0;
        server.cluster->stats_bus_messages_received[i] = 0;
    }
    server.cluster->stats_pfail_nodes = 0;
    memset(server.cluster->slots,0, sizeof(server.cluster->slots));
    clusterCloseAllSlots();

    /* Lock the cluster config file to make sure every node uses
     * its own nodes.conf. */
    server.cluster_config_file_lock_fd = -1;
    if (clusterLockConfig(server.cluster_configfile) == C_ERR)
        exit(1);

    /* Load or create a new nodes configuration.
     * Loading nodes.conf is what populates the name -> node and
     * slot -> node mappings. */
    if (clusterLoadConfig(server.cluster_configfile) == C_ERR) {
        /* No configuration found. We will just use the random name provided
         * by the createClusterNode() function. */
        myself = server.cluster->myself =
            createClusterNode(NULL,CLUSTER_NODE_MYSELF|CLUSTER_NODE_MASTER);
        serverLog(LL_NOTICE,"No cluster configuration found, I'm %.40s",
            myself->name);
        clusterAddNode(myself);
        saveconf = 1;
    }
    if (saveconf) clusterSaveConfigOrDie(1);

    /* We need a listening TCP port for our cluster messaging needs. */
    server.cfd.count = 0;

    /* Port sanity check II
     * The other handshake port check is triggered too late to stop
     * us from trying to use a too-high cluster port number. */
    int port = server.tls_cluster ? server.tls_port : server.port;
    if (port > (65535-CLUSTER_PORT_INCR)) {
        serverLog(LL_WARNING, "Redis port number too high. "
                   "Cluster communication port is 10,000 port "
                   "numbers higher than your Redis port. "
                   "Your Redis port number must be 55535 or less.");
        exit(1);
    }

    /* Start listening on the cluster bus port. The port is computed once so
     * that the error message below reports the port we actually tried to
     * bind, not the client port it was derived from. */
    int cport = port + CLUSTER_PORT_INCR;
    if (listenToPort(cport, &server.cfd) == C_ERR) {
        /* Note: the following log text is matched by the test suite. */
        serverLog(LL_WARNING, "Failed listening on port %u (cluster), aborting.", (unsigned)cport);
        exit(1);
    }

    /* Register the handler that accepts incoming cluster bus connections. */
    if (createSocketAcceptHandler(&server.cfd, clusterAcceptHandler) != C_OK) {
        serverPanic("Unrecoverable error creating Redis Cluster socket accept handler.");
    }

    /* The slots -> keys map is a radix tree. Initialize it here. */
    server.cluster->slots_to_keys = raxNew();
    memset(server.cluster->slots_keys_count,0,
           sizeof(server.cluster->slots_keys_count));

    /* Set myself->port/cport/pport to my listening ports, we'll just need to
     * discover the IP address via MEET messages. */
    deriveAnnouncedPorts(&myself->port, &myself->pport, &myself->cport);

    server.cluster->mf_end = 0;
    server.cluster->mf_slave = NULL;
    resetManualFailover();
    clusterUpdateMyselfFlags();
}
定时任务
/* Periodic cluster housekeeping for the local node (myself).
 *
 * In order, each call:
 *  1) syncs myself->ip with the cluster-announce-ip option;
 *  2) refreshes myself's flags;
 *  3) walks all known nodes, dropping handshake nodes that timed out and
 *     creating a link towards every node that has none;
 *  4) every 10th call, pings the node with the oldest pong among 5 random
 *     samples;
 *  5) walks all nodes again to collect replica-migration statistics, drop
 *     stale links, send overdue pings and flag unresponsive nodes as PFAIL;
 *  6) if myself is a slave, sets up replication when needed and handles
 *     manual failover, automatic failover and replica migration;
 *  7) refreshes the cluster state when a node was newly flagged PFAIL or
 *     the cluster is currently in CLUSTER_FAIL state.
 *
 * NOTE(review): the "one random node every second" comment below implies
 * this function is called about 10 times per second -- confirm at the
 * call site, which is not part of this excerpt. */
void clusterCron(void) {
dictIterator *di;
dictEntry *de;
int update_state = 0;
int orphaned_masters; /* How many masters there are without ok slaves. */
int max_slaves; /* Max number of ok slaves for a single master. */
int this_slaves; /* Number of ok slaves for our master (if we are slave). */
mstime_t min_pong = 0, now = mstime();
clusterNode *min_pong_node = NULL;
static unsigned long long iteration = 0;
mstime_t handshake_timeout;
iteration++; /* Number of times this function was called so far. */
/* We want to take myself->ip in sync with the cluster-announce-ip option.
* The option can be set at runtime via CONFIG SET, so we periodically check
* if the option changed to reflect this into myself->ip. */
// CONFIG SET may change settings at runtime, so they are re-checked here.
// First: did the announced IP change?
{
// Static: our private copy of server.cluster_announce_ip as seen by the
// previous change.
static char *prev_ip = NULL;
// Current value of cluster_announce_ip (it can be modified at runtime
// through CONFIG SET).
char *curr_ip = server.cluster_announce_ip;
int changed = 0;
if (prev_ip == NULL && curr_ip != NULL) changed = 1;
else if (prev_ip != NULL && curr_ip == NULL) changed = 1;
else if (prev_ip && curr_ip && strcmp(prev_ip,curr_ip)) changed = 1;
// The announced IP changed.
if (changed) {
if (prev_ip) zfree(prev_ip);
prev_ip = curr_ip;
// If server.cluster_announce_ip is set, copy it into myself->ip.
// myself->ip is only refreshed here, so it lags behind
// server.cluster_announce_ip until this function runs again.
if (curr_ip) {
/* We always take a copy of the previous IP address, by
* duplicating the string. This way later we can check if
* the address really changed. */
prev_ip = zstrdup(prev_ip);
strncpy(myself->ip,server.cluster_announce_ip,NET_IP_STR_LEN);
myself->ip[NET_IP_STR_LEN-1] = '\0';
} else {
myself->ip[0] = '\0'; /* Force autodetection. */
}
}
}
/* The handshake timeout is the time after which a handshake node that was
* not turned into a normal node is removed from the nodes. Usually it is
* just the NODE_TIMEOUT value, but when NODE_TIMEOUT is too small we use
* the value of 1 second. */
// The handshake timeout is never less than 1 second (1000 ms).
handshake_timeout = server.cluster_node_timeout;
if (handshake_timeout < 1000) handshake_timeout = 1000;
/* Update myself flags. */
// NOTE(review): presumably this picks up a runtime change of the
// nofailover setting and schedules a cluster state update / config save
// when it changed -- the helper is not shown in this excerpt, confirm.
clusterUpdateMyselfFlags();
/* Check if we have disconnected nodes and re-establish the connection.
* Also update a few stats while we are here, that can be used to make
* better decisions in other part of the code. */
di = dictGetSafeIterator(server.cluster->nodes);
server.cluster->stats_pfail_nodes = 0;
// Walk cluster->nodes and create the link from the local node to every
// node that has none; the link is stored in node->link.
while((de = dictNext(di)) != NULL) {
clusterNode *node = dictGetVal(de);
/* Not interested in reconnecting the link with myself or nodes
* for which we have no address. */
// Skip myself and nodes whose address is unknown.
if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR)) continue;
// Count the nodes currently flagged as possibly failing (PFAIL).
if (node->flags & CLUSTER_NODE_PFAIL)
server.cluster->stats_pfail_nodes++;
/* A Node in HANDSHAKE state has a limited lifespan equal to the
* configured node timeout. */
// The node is still in handshake state and more than handshake_timeout
// elapsed since node->ctime.
if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) {
// Remove the node from the cluster.
clusterDelNode(node);
continue;
}
if (node->link == NULL) {
clusterLink *link = createClusterLink(node);
// Create the (not yet connected) connection object.
link->conn = server.tls_cluster ? connCreateTLS() : connCreateSocket();
connSetPrivateData(link->conn, link);
// Start connecting to the node's cluster port.
if (connConnect(link->conn, node->ip, node->cport, NET_FIRST_BIND_ADDR,
clusterLinkConnectHandler) == -1) {
/* We got a synchronous error from connect before
* clusterSendPing() had a chance to be called.
* If node->ping_sent is zero, failure detection can't work,
* so we claim we actually sent a ping now (that will
* be really sent as soon as the link is obtained). */
// The connect attempt failed: the link is freed and, because
// node->link stays NULL, a new attempt is made on a later call.
if (node->ping_sent == 0) node->ping_sent = mstime();
serverLog(LL_DEBUG, "Unable to connect to "
"Cluster Node [%s]:%d -> %s", node->ip,
node->cport, server.neterr);
freeClusterLink(link);
continue;
}
// Remember the link from the local node to this node.
node->link = link;
}
}
dictReleaseIterator(di);
/* Ping some random node 1 time every 10 iterations, so that we usually ping
* one random node every second. */
// This block runs once every 10 calls: sample 5 random nodes and ping only
// the one whose pong_received is the oldest.
if (!(iteration % 10)) {
int j;
/* Check a few random nodes and ping the one with the oldest
* pong_received time. */
for (j = 0; j < 5; j++) {
de = dictGetRandomKey(server.cluster->nodes);
clusterNode *this = dictGetVal(de);
/* Don't ping nodes disconnected or with a ping currently active. */
// Skip the node if its link is down or a ping to it is still pending.
if (this->link == NULL || this->ping_sent != 0) continue;
// Never ping ourselves, nor nodes that are still in handshake.
if (this->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE))
continue;
if (min_pong_node == NULL || min_pong > this->pong_received) {
min_pong_node = this;
min_pong = this->pong_received;
}
}
if (min_pong_node) {
serverLog(LL_DEBUG,"Pinging node %.40s", min_pong_node->name);
// Send a PING to the selected node.
clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING);
}
}
/* Iterate nodes to check if we need to flag something as failing.
* This loop is also responsible to:
* 1) Check if there are orphaned masters (masters without non failing
* slaves).
* 2) Count the max number of non failing slaves for a single master.
* 3) Count the number of slaves for our master, if we are a slave. */
orphaned_masters = 0;
max_slaves = 0;
this_slaves = 0;
di = dictGetSafeIterator(server.cluster->nodes);
// Walk server.cluster->nodes and check the state of the link between
// myself and each node.
while((de = dictNext(di)) != NULL) {
clusterNode *node = dictGetVal(de);
now = mstime(); /* Use an updated time at every iteration. */
// Skip myself, nodes without an address and nodes still in handshake.
if (node->flags &
(CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE))
continue;
/* Orphaned master check, useful only if the current instance
* is a slave that may migrate to another master. */
// Only when myself is a slave, node is a master and node is not FAIL.
if (nodeIsSlave(myself) && nodeIsMaster(node) && !nodeFailed(node)) {
// Number of node's slaves that are not failing.
int okslaves = clusterCountNonFailingSlaves(node);
/* A master is orphaned if it is serving a non-zero number of
* slots, have no working slaves, but used to have at least one
* slave, or failed over a master that used to have slaves. */
// node serves at least one slot, has no working slave left, and
// carries the MIGRATE_TO flag: count it as an orphaned master.
if (okslaves == 0 && node->numslots > 0 &&
node->flags & CLUSTER_NODE_MIGRATE_TO)
{
orphaned_masters++;
}
if (okslaves > max_slaves) max_slaves = okslaves;
// If node is myself's master, this_slaves records how many working
// slaves myself's master has.
if (nodeIsSlave(myself) && myself->slaveof == node)
this_slaves = okslaves;
}
/* If we are not receiving any data for more than half the cluster
* timeout, reconnect the link: maybe there is a connection
* issue even if the node is alive. */
// Time elapsed since the pending ping was sent.
mstime_t ping_delay = now - node->ping_sent;
// Time elapsed since any data was last received from the node.
mstime_t data_delay = now - node->data_received;
// Drop the link when the pong is overdue: the link must be older than
// cluster_node_timeout, a ping must be pending, and neither a pong nor
// any other data arrived for more than half the timeout.
if (node->link && /* is connected */
now - node->link->ctime >
server.cluster_node_timeout && /* was not already reconnected */
node->ping_sent && /* we already sent a ping */
/* and we are waiting for the pong more than timeout/2 */
ping_delay > server.cluster_node_timeout/2 &&
/* and in such interval we are not seeing any traffic at all. */
data_delay > server.cluster_node_timeout/2)
{
/* Disconnect the link, it will be reconnected automatically. */
freeClusterLink(node->link);
}
/* If we have currently no active ping in this instance, and the
* received PONG is older than half the cluster timeout, send
* a new ping now, to ensure all the nodes are pinged without
* a too big delay. */
// No ping to this node is pending and its last pong is older than half
// the node timeout: send it a ping now.
if (node->link &&node->ping_sent == 0 &&(now - node->pong_received) > server.cluster_node_timeout/2)
{
clusterSendPing(node->link, CLUSTERMSG_TYPE_PING);
continue;
}
/* If we are a master and one of the slaves requested a manual
* failover, ping it continuously. */
// myself is a master, a manual failover is in progress and node is the
// slave performing it: keep pinging that slave.
if (server.cluster->mf_end &&nodeIsMaster(myself) &&server.cluster->mf_slave == node &&node->link)
{
clusterSendPing(node->link, CLUSTERMSG_TYPE_PING);
continue;
}
/* Check only if we have an active ping for this instance. */
// No ping pending for this node: move on to the next one.
if (node->ping_sent == 0) continue;
// A ping is pending: check whether the node must be flagged as PFAIL.
/* Check if this node looks unreachable.
* Note that if we already received the PONG, then node->ping_sent
* is zero, so can't reach this code at all, so we don't risk of
* checking for a PONG delay if we didn't sent the PING.
*
* We also consider every incoming data as proof of liveness, since
* our cluster bus link is also used for data: under heavy data
* load pong delays are possible. */
// Use the smaller of ping_delay and data_delay.
mstime_t node_delay = (ping_delay < data_delay) ? ping_delay :
data_delay;
// If node_delay exceeds server.cluster_node_timeout the node is
// considered possibly failing.
if (node_delay > server.cluster_node_timeout) {
/* Timeout reached. Set the node as possibly failing if it is
* not already in this state. */
// Set PFAIL only when neither PFAIL nor FAIL is already set.
if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) {
serverLog(LL_DEBUG,"*** NODE %.40s possibly failing",
node->name);
node->flags |= CLUSTER_NODE_PFAIL;
update_state = 1;
}
}
}
dictReleaseIterator(di);
/* If we are a slave node but the replication is still turned off,
* enable it if we know the address of our master and it appears to
* be up. */
// myself has a master with a known address but replication is not set up
// yet (server.masterhost == NULL): configure replication towards it.
if (nodeIsSlave(myself) &&
server.masterhost == NULL &&
myself->slaveof &&
nodeHasAddr(myself->slaveof))
{
replicationSetMaster(myself->slaveof->ip, myself->slaveof->port);
}
/* Abort a manual failover if the timeout is reached. */
manualFailoverCheckTimeout();
if (nodeIsSlave(myself)) {
// NOTE(review): presumably, when a manual failover is in progress, this
// checks the replication progress and sets todo_before_sleep -- the
// helper is not shown in this excerpt, confirm.
clusterHandleManualFailover();
if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER))
clusterHandleSlaveFailover();
/* If there are orphaned slaves, and we are a slave among the masters
* with the max number of non-failing slaves, consider migrating to
* the orphaned masters. Note that it does not make sense to try
* a migration if there is no master with at least *two* working
* slaves. */
// Conditions: there is at least one orphaned master, some master has
// at least 2 working slaves, myself's master is one of the masters
// with max_slaves working slaves, and replica migration is allowed.
if (orphaned_masters && max_slaves >= 2 && this_slaves == max_slaves &&
server.cluster_allow_replica_migration)
clusterHandleSlaveMigration(max_slaves);
}
if (update_state || server.cluster->state == CLUSTER_FAIL)
clusterUpdateState();
}
clusterCron的主要功能就是检查myself与其他node的连接,节点是否需要进行主备切换,备节点迁移。
- 因为可以通过config set动态调整服务器参数,有些参数需要随时检查:ip, nofailover
- 遍历所有节点:握手(handshake)超时的节点会被从集群中删除;没有连接的节点就创建连接,连接失败时只释放本次创建的link(不会删除节点),等下一次定时任务再重试
- 每执行10次(约每秒一次)从集群节点中随机抽取5个节点,只给其中最长时间没有收到pong的那一个节点发送ping
- 遍历节点:检查是否存在孤立主节点;检查是否有ping超时的节点,超时就设置节点状态为pfail;是否有太长时间没有发送ping的节点,是就发送ping;
- 如果存在孤立的主节点,而且myself是从节点,满足切换条件,就切换当前节点为孤立主节点的从节点
- 如果有节点新进入pfail状态,或者当前集群状态为CLUSTER_FAIL,就更新集群状态
集群通信
节点间通信消息类型
/* Message types.
*
* Note that the PING, PONG and MEET messages are actually the same exact
* kind of packet. PONG is the reply to ping, in the exact format as a PING,
* while MEET is a special PING that forces the receiver to add the sender
* as a node (if it is not already in the list). */
#define CLUSTERMSG_TYPE_PING 0 /* Ping */
#define CLUSTERMSG_TYPE_PONG 1 /* Pong (reply to Ping) */
#define CLUSTERMSG_TYPE_MEET 2 /* Meet "let's join" message */
#define CLUSTERMSG_TYPE_FAIL 3 /* Mark node xxx as failing */
#define CLUSTERMSG_TYPE_PUBLISH 4 /* Pub/Sub Publish propagation */
#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */
#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you have my vote */
#define CLUSTERMSG_TYPE_UPDATE 7 /* Another node slots configuration */
#define CLUSTERMSG_TYPE_MFSTART 8 /* Pause clients for manual failover */
#define CLUSTERMSG_TYPE_MODULE 9 /* Module cluster API message. */
/* CLUSTERMSG_TYPE_COUNT is one past the highest message type value: it is
 * the length of the per-type statistics arrays in clusterState
 * (stats_bus_messages_sent / stats_bus_messages_received) and the bound of
 * the loop that resets them in clusterInit(). */
#define CLUSTERMSG_TYPE_COUNT 10 /* Total number of message types. */
查看不同类型消息的计数
ip:port> cluster info
cluster_stats_messages_ping_sent:615811
cluster_stats_messages_pong_sent:580837
cluster_stats_messages_auth-req_sent:5
cluster_stats_messages_mfstart_sent:1
cluster_stats_messages_sent:1196654
cluster_stats_messages_ping_received:580832
cluster_stats_messages_pong_received:615811
cluster_stats_messages_auth-ack_received:3
cluster_stats_messages_received:1196646
集群状态
/* Cluster-wide state as seen by the local node: the table of known nodes,
 * the slot assignment, election / manual failover bookkeeping and the
 * cluster bus statistics. */
typedef struct clusterState {
// The local node.
clusterNode *myself; /* This node */
uint64_t currentEpoch;
int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */
int size; /* Num of master nodes with at least one slot */
// Maps node name -> clusterNode.
dict *nodes; /* Hash table of name -> clusterNode structures */
dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */
// Slot migration state, indexed by slot number.
// NOTE(review): presumably the node a slot is being migrated to, and the
// node a slot is being imported from -- confirm against the slot
// migration code, which is not part of this excerpt.
clusterNode *migrating_slots_to[CLUSTER_SLOTS];
clusterNode *importing_slots_from[CLUSTER_SLOTS];
// slots[n] is the node that slot n is assigned to.
clusterNode *slots[CLUSTER_SLOTS];
uint64_t slots_keys_count[CLUSTER_SLOTS];
rax *slots_to_keys;
/* The following fields are used to take the slave state on elections. */
mstime_t failover_auth_time; /* Time of previous or next election. */
int failover_auth_count; /* Number of votes received so far. */
int failover_auth_sent; /* True if we already asked for votes. */
int failover_auth_rank; /* This slave rank for current auth request. */
uint64_t failover_auth_epoch; /* Epoch of the current election. */
int cant_failover_reason; /* Why a slave is currently not able to
failover. See the CANT_FAILOVER_* macros. */
/* Manual failover state in common. */
mstime_t mf_end; /* Manual failover time limit (ms unixtime).
It is zero if there is no MF in progress. */
// Manual failover state.
/* Manual failover state of master. */
// The slave (replica) that is performing the manual failover.
clusterNode *mf_slave; /* Slave performing the manual failover. */
/* Manual failover state of slave. */
long long mf_master_offset; /* Master offset the slave needs to start MF
or zero if still not received. */
// Whether the manual failover is allowed to start.
int mf_can_start; /* If non-zero signal that the manual failover
can start requesting masters vote. */
/* The following fields are used by masters to take state on elections. */
uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */
int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */
/* Messages received and sent by type. */
long long stats_bus_messages_sent[CLUSTERMSG_TYPE_COUNT];
long long stats_bus_messages_received[CLUSTERMSG_TYPE_COUNT];
long long stats_pfail_nodes; /* Number of nodes in PFAIL status,
excluding nodes without address. */
} clusterState;
节点
/* A cluster node as known by the local node: identity, role/state flags,
 * owned slots, master/slave relations, failure detection timestamps,
 * network address and the link used to talk to it. */
typedef struct clusterNode {
mstime_t ctime; /* Node object creation time. */
// Node name: a randomly generated 40 character string.
char name[CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */
// Role and state of the node, as a bitwise OR of CLUSTER_NODE_* flags.
int flags; /* CLUSTER_NODE_... */
uint64_t configEpoch; /* Last configEpoch observed for this node */
unsigned char slots[CLUSTER_SLOTS/8]; /* slots handled by this node */
sds slots_info; /* Slots info represented by string. */
int numslots; /* Number of slots handled by this node */
int numslaves; /* Number of slave nodes, if this is a master */
struct clusterNode **slaves; /* pointers to slave nodes */
struct clusterNode *slaveof; /* pointer to the master node. Note that it
may be NULL even if the node is a slave
if we don't have the master node in our
tables. */
mstime_t ping_sent; /* Unix time we sent latest ping */
mstime_t pong_received; /* Unix time we received the pong */
mstime_t data_received; /* Unix time we received any data */
mstime_t fail_time; /* Unix time when FAIL flag was set */
mstime_t voted_time; /* Last time we voted for a slave of this master */
mstime_t repl_offset_time; /* Unix time we received offset for this node */
mstime_t orphaned_time; /* Starting time of orphaned master condition */
long long repl_offset; /* Last known repl offset for this node. */
char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */
// Service port of the node: the port clients connect to.
int port; /* Latest known clients port (TLS or plain). */
int pport; /* Latest known clients plaintext port. Only used
if the main clients port is for TLS. */
// Cluster bus port of the node: by default the client port plus
// CLUSTER_PORT_INCR (10000); it cannot be greater than 65535.
int cport; /* Latest known cluster port of this node. */
clusterLink *link; /* TCP/IP link with this node */
list *fail_reports; /* List of nodes signaling this as failing */
} clusterNode;
cluster.h定义了节点flags的值和表示的意思:
/* Node flags, stored in clusterNode.flags. Every value is a distinct power
 * of two, so several flags can be combined with bitwise OR and tested with
 * bitwise AND, e.g.
 * node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR). */
#define CLUSTER_NODE_MASTER 1 /* The node is a master */
#define CLUSTER_NODE_SLAVE 2 /* The node is a slave */
#define CLUSTER_NODE_PFAIL 4 /* Failure? Need acknowledge */
#define CLUSTER_NODE_FAIL 8 /* The node is believed to be malfunctioning */
#define CLUSTER_NODE_MYSELF 16 /* This node is myself */
#define CLUSTER_NODE_HANDSHAKE 32 /* We have still to exchange the first ping */
#define CLUSTER_NODE_NOADDR 64 /* We don't know the address of this node */
#define CLUSTER_NODE_MEET 128 /* Send a MEET message to this node */
#define CLUSTER_NODE_MIGRATE_TO 256 /* Master eligible for replica migration. */
#define CLUSTER_NODE_NOFAILOVER 512 /* Slave will not try to failover. */
集群配置文件
nodes.conf定义了集群节点的信息, 该文件的信息会实时更新
name ip:port@cport master,nofailover masternodename ping_sent pong_rec epoch connected 5461-10922
- 0 name: 表示当前行是哪个节点的信息
- 1 ip:port@cport:ip表示节点的ip, port表示对外服务端口,cport表示集群间通信端口。集群间通信端口实际监听在port+10000(见clusterInit);对外宣告的集群通信端口可以通过配置项cluster-announce-bus-port xxx修改(cluster-announce-port配置的是对外宣告的服务端口),不配置就默认为port+10000。
- 2 节点角色和配置相关,master表示是主节点,slave表示是从节点,myself表示是当前节点。fail表示客观下线,fail?表示主观下线。
- 3 masternodename:对于从节点,该节点记录了主节点的名称;对于主节点,该值为-
- 4,5 ping_sent: 如果不为0,就设置节点的clusterNode.ping_sent=mstime(); pong_rec: 如果不为0,就设置节点的clusterNode.pong_received=mstime(); 也就是记录上次发送ping和接收到pong的时间
- 6 epoch: clusterNode.configEpoch
- 7 节点是否正常连接
- 8-... 记录了slots的信息,有几种格式
a 表示slot a属于节点
a-b 表示从a到b的slots都属于节点
[a-<-nodename] 表示要将slots a 从nodename迁移到本节点
[a->-nodename] 表示要将slots a 从本节点迁移到nodename
查看集群信息
cluster help
cluster info
cluster nodes
cluster slots

浙公网安备 33010602011771号