redis源码之集群主备切换(八)
主从切换简介
redis的主从切换分为自动切换和手动切换:
- 自动切换的触发机制是集群会定时检查集群中节点状态,当存在主节点是FAIL的时候触发自动切换
- 手动切换是用户连接到从节点,执行切换命令,触发主从切换。
切换主要的两个处理函数是:
- clusterHandleManualFailover:手动切换的时候需要主从节点同步到一致的状态,该函数就是检查是否达到一致,如果一致就设置server.cluster->mf_can_start=1
- clusterHandleSlaveFailover:手动切换和自动切换都需要调用该函数,检查是否达到切换条件,达到就执行切换。
集群主备切换源码
主从切换状态
/* Cluster state as kept by this node. For failover the relevant parts are
 * the failover_auth_* fields (slave-side election state) and the mf_* fields
 * (manual failover state). */
typedef struct clusterState {
clusterNode *myself; /* This node */
uint64_t currentEpoch;
int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */
int size; /* Num of master nodes with at least one slot */
dict *nodes; /* Hash table of name -> clusterNode structures */
dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */
clusterNode *migrating_slots_to[CLUSTER_SLOTS];
clusterNode *importing_slots_from[CLUSTER_SLOTS];
clusterNode *slots[CLUSTER_SLOTS];
uint64_t slots_keys_count[CLUSTER_SLOTS];
rax *slots_to_keys;
/* The following fields are used to take the slave state on elections. */
// Scheduled election time: that of the previous election, or of the next one
mstime_t failover_auth_time; /* Time of previous or next election. */
// Votes collected so far; compared against the quorum when deciding to promote
int failover_auth_count; /* Number of votes received so far. */
// Set to 1 once this node has asked for votes
int failover_auth_sent; /* True if we already asked for votes. */
// Rank among the slaves of the same master; a lower rank presumably means
// a smaller replication-offset gap from the master
int failover_auth_rank; /* This slave rank for current auth request. */
// Epoch under which the current election is run
uint64_t failover_auth_epoch; /* Epoch of the current election. */
// Reason why the failover cannot currently proceed
int cant_failover_reason; /* Why a slave is currently not able to
failover. See the CANT_FAILOVER_* macros. */
/* Manual failover state in common. */
// Manual failover deadline; 0 when no manual failover is in progress
mstime_t mf_end; /* Manual failover time limit (ms unixtime).
It is zero if there is no MF in progress. */
/* Manual failover state of master. */
// On the master: the slave that is performing the manual failover
clusterNode *mf_slave; /* Slave performing the manual failover. */
/* Manual failover state of slave. */
// Master replication offset this slave must reach; -1 until it is received
long long mf_master_offset; /* Master offset the slave needs to start MF
or -1 if still not received. */
// Non-zero once the manual failover is allowed to start
int mf_can_start; /* If non-zero signal that the manual failover
can start requesting masters vote. */
/* The following fields are used by masters to take state on elections. */
uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */
int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */
/* Messages received and sent by type. */
long long stats_bus_messages_sent[CLUSTERMSG_TYPE_COUNT];
long long stats_bus_messages_received[CLUSTERMSG_TYPE_COUNT];
long long stats_pfail_nodes; /* Number of nodes in PFAIL status,
excluding nodes without address. */
} clusterState;
主从切换触发条件
自动切换就是检查发现主节点是fail状态,就尝试切换
手动切换就是连接到想要进行主从切换的从节点,执行cluster failover命令
/* Excerpt of the CLUSTER command handler: only the FAILOVER subcommand is
 * shown. It validates the optional FORCE/TAKEOVER argument, checks that this
 * node is a replica with a known master, resets any previous manual failover
 * state, sets the manual failover deadline, and then starts one of the three
 * failover variants. */
void clusterCommand(client *c) {
...
else if (!strcasecmp(c->argv[1]->ptr,"failover") &&
(c->argc == 2 || c->argc == 3))
{
/* CLUSTER FAILOVER [FORCE|TAKEOVER] */
int force = 0, takeover = 0;
if (c->argc == 3) {
if (!strcasecmp(c->argv[2]->ptr,"force")) {
force = 1;
} else if (!strcasecmp(c->argv[2]->ptr,"takeover")) {
takeover = 1;
force = 1; /* Takeover also implies force. */
} else {
addReplyErrorObject(c,shared.syntaxerr);
return;
}
}
/* Check preconditions. */
if (nodeIsMaster(myself)) {
addReplyError(c,"You should send CLUSTER FAILOVER to a replica");
return;
} else if (myself->slaveof == NULL) {
addReplyError(c,"I'm a replica but my master is unknown to me");
return;
// Without FORCE, a master that is flagged as failed or has no link is
// rejected: the caller must use FORCE in that case
} else if (!force &&
(nodeFailed(myself->slaveof) ||
myself->slaveof->link == NULL))
{
addReplyError(c,"Master is down or failed, "
"please use CLUSTER FAILOVER FORCE");
return;
}
resetManualFailover();
server.cluster->mf_end = mstime() + CLUSTER_MF_TIMEOUT;
if (takeover) {
/* A takeover does not perform any initial check. It just
* generates a new configuration epoch for this node without
* consensus, claims the master's slots, and broadcast the new
* configuration. */
serverLog(LL_WARNING,"Taking over the master (user request).");
clusterBumpConfigEpochWithoutConsensus();
clusterFailoverReplaceYourMaster();
} else if (force) {
/* If this is a forced failover, we don't need to talk with our
* master to agree about the offset. We just failover taking over
* it without coordination. */
// Mark the failover as startable right away, regardless of how far
// replication has progressed
serverLog(LL_WARNING,"Forced failover user request accepted.");
server.cluster->mf_can_start = 1;
} else {
serverLog(LL_WARNING,"Manual failover user request accepted.");
// Notify the master that a manual failover is starting
// (per the article, a CLUSTERMSG_TYPE_MFSTART message)
clusterSendMFStart(myself->slaveof);
}
addReply(c,shared.ok);
...
}
手动切换也分为3种情况:
- cluster failover: 给主节点发送CLUSTERMSG_TYPE_MFSTART消息,等待主从一致才执行切换。
- cluster failover force: 如果主节点是fail状态,就要使用force强制执行切换,不用等待主从一致。
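- cluster failover takeover: 不做任何前置检查,也不需要其他主节点投票;直接调用clusterBumpConfigEpochWithoutConsensus生成新的configEpoch,然后调用clusterFailoverReplaceYourMaster接管主节点的slots并广播新的配置。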
执行主从切换
/* Advance the manual failover state machine. Called from the cluster cron
 * function (and from clusterBeforeSleep() when a re-check was requested).
 *
 * Nothing is done when no manual failover is in progress, when the failover
 * was already allowed to start, or while the master offset is still unknown.
 * Otherwise the local replication offset is compared with the offset
 * announced by the master:
 *   - equal:     mf_can_start is set and CLUSTER_TODO_HANDLE_FAILOVER is
 *                scheduled, so clusterHandleSlaveFailover() takes over;
 *   - different: CLUSTER_TODO_HANDLE_MANUALFAILOVER is scheduled so that
 *                the check is repeated before the next sleep. */
void clusterHandleManualFailover(void) {
    /* Bail out early: no manual failover in progress, failover already
     * triggered (the next steps belong to clusterHandleSlaveFailover()),
     * or the master offset has not been received yet. */
    if (server.cluster->mf_end == 0 ||
        server.cluster->mf_can_start ||
        server.cluster->mf_master_offset == -1)
    {
        return;
    }

    if (server.cluster->mf_master_offset != replicationGetSlaveOffset()) {
        /* Still behind the master: ask to be called again soon. */
        clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER);
    } else {
        /* Our replication offset matches the one the master announced
         * after pausing clients, so the failover can begin. */
        server.cluster->mf_can_start = 1;
        serverLog(LL_WARNING,
            "All master replication stream processed, "
            "manual failover can start.");
        clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
    }
}
clusterHandleSlaveFailover
/* Drive a slave's failover, for both automatic and manual failovers.
 *
 * In order, the function: checks the preconditions, checks how stale the
 * replicated data is (automatic failover only), schedules a new election
 * when the previous one is older than the retry time, waits for the
 * scheduled election time, sends the vote request once, and finally
 * replaces the master when enough votes have been collected. Each early
 * return leaves the state so that a later call resumes from where this one
 * stopped. */
void clusterHandleSlaveFailover(void) {
mstime_t data_age;
// Milliseconds elapsed since failover_auth_time, the scheduled election
// time (negative while that time is still in the future)
mstime_t auth_age = mstime() - server.cluster->failover_auth_time;
int needed_quorum = (server.cluster->size / 2) + 1;
// A manual failover is ready when mf_end is set and mf_can_start is non-zero
int manual_failover = server.cluster->mf_end != 0 &&
server.cluster->mf_can_start;
mstime_t auth_timeout, auth_retry_time;
server.cluster->todo_before_sleep &= ~CLUSTER_TODO_HANDLE_FAILOVER;
/* Compute the failover timeout (the max time we have to send votes
* and wait for replies), and the failover retry time (the time to wait
* before trying to get voted again).
*
* Timeout is MAX(NODE_TIMEOUT*2,2000) milliseconds.
* Retry is two times the Timeout.
*/
// Election timeout: cluster_node_timeout*2, with a floor of 2000 ms
auth_timeout = server.cluster_node_timeout*2;
if (auth_timeout < 2000) auth_timeout = 2000;
auth_retry_time = auth_timeout*2;
/* Pre conditions to run the function, that must be met both in case
* of an automatic or manual failover:
* 1) We are a slave.
* 2) Our master is flagged as FAIL, or this is a manual failover.
* 3) We don't have the no failover configuration set, and this is
* not a manual failover.
* 4) It is serving slots. */
// This node must be a slave with a known master that serves at least one slot.
// For an automatic failover the master must additionally be flagged as FAIL
// and cluster_slave_no_failover must not be set.
if (nodeIsMaster(myself) ||
myself->slaveof == NULL ||
(!nodeFailed(myself->slaveof) && !manual_failover) ||
(server.cluster_slave_no_failover && !manual_failover) ||
myself->slaveof->numslots == 0)
{
/* There are no reasons to failover, so we set the reason why we
* are returning without failing over to NONE. */
server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE;
return;
}
// From here on: myself is a slave, its master is known and serves slots, and
// either this is a manual failover or the master is FAIL with failover allowed.
// So the code below only runs for a manual or an automatic failover.
// A manual failover ignores the master state; an automatic one requires FAIL.
/* Set data_age to the number of milliseconds we are disconnected from
* the master. */
// Measure how long we have been out of touch with the master; when it is
// too long an automatic failover is refused (see the check below)
if (server.repl_state == REPL_STATE_CONNECTED) {
data_age = (mstime_t)(server.unixtime - server.master->lastinteraction)
* 1000;
} else {
data_age = (mstime_t)(server.unixtime - server.repl_down_since) * 1000;
}
/* Remove the node timeout from the data age as it is fine that we are
* disconnected from our master at least for the time it was down to be
* flagged as FAIL, that's the baseline. */
if (data_age > server.cluster_node_timeout)
data_age -= server.cluster_node_timeout;
/* Check if our data is recent enough according to the slave validity
* factor configured by the user.
*
* Check bypassed for manual failovers. */
if (server.cluster_slave_validity_factor &&
data_age >
(((mstime_t)server.repl_ping_slave_period * 1000) +
(server.cluster_node_timeout * server.cluster_slave_validity_factor)))
{
if (!manual_failover) {
clusterLogCantFailover(CLUSTER_CANT_FAILOVER_DATA_AGE);
return;
}
}
/* If the previous failover attempt timeout and the retry time has
* elapsed, we can setup a new one. */
// The previous attempt is older than the retry time: reset the election
// state and schedule a new election
if (auth_age > auth_retry_time) {
server.cluster->failover_auth_time = mstime() +
500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */
random() % 500; /* Random delay between 0 and 500 milliseconds. */
server.cluster->failover_auth_count = 0;
server.cluster->failover_auth_sent = 0;
// Compute this slave's rank; each rank step adds one second of delay below
server.cluster->failover_auth_rank = clusterGetSlaveRank();
/* We add another delay that is proportional to the slave rank.
* Specifically 1 second * rank. This way slaves that have a probably
* less updated replication offset, are penalized. */
server.cluster->failover_auth_time +=
server.cluster->failover_auth_rank * 1000;
/* However if this is a manual failover, no delay is needed. */
if (server.cluster->mf_end) {
server.cluster->failover_auth_time = mstime();
server.cluster->failover_auth_rank = 0;
clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
}
serverLog(LL_WARNING,
"Start of election delayed for %lld milliseconds "
"(rank #%d, offset %lld).",
server.cluster->failover_auth_time - mstime(),
server.cluster->failover_auth_rank,
replicationGetSlaveOffset());
/* Now that we have a scheduled election, broadcast our offset
* to all the other slaves so that they'll updated their offsets
* if our offset is better. */
clusterBroadcastPong(CLUSTER_BROADCAST_LOCAL_SLAVES);
return;
}
/* It is possible that we received more updated offsets from other
* slaves for the same master since we computed our election delay.
* Update the delay if our rank changed.
*
* Not performed if this is a manual failover. */
// Automatic failover only, before votes are requested: refresh the rank
// and push the election time back if the rank got worse
if (server.cluster->failover_auth_sent == 0 &&
server.cluster->mf_end == 0)
{
int newrank = clusterGetSlaveRank();
if (newrank > server.cluster->failover_auth_rank) {
long long added_delay =
(newrank - server.cluster->failover_auth_rank) * 1000;
server.cluster->failover_auth_time += added_delay;
server.cluster->failover_auth_rank = newrank;
serverLog(LL_WARNING,
"Replica rank updated to #%d, added %lld milliseconds of delay.",
newrank, added_delay);
}
}
/* Return ASAP if we can't still start the election. */
// The scheduled election time has not been reached yet
if (mstime() < server.cluster->failover_auth_time) {
clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_DELAY);
return;
}
/* Return ASAP if the election is too old to be valid. */
// The election has exceeded its timeout
if (auth_age > auth_timeout) {
clusterLogCantFailover(CLUSTER_CANT_FAILOVER_EXPIRED);
return;
}
/* Ask for votes if needed. */
// Votes not requested yet: bump currentEpoch, record it as the epoch of
// this election, send the vote request, and return to wait for replies
if (server.cluster->failover_auth_sent == 0) {
server.cluster->currentEpoch++;
server.cluster->failover_auth_epoch = server.cluster->currentEpoch;
serverLog(LL_WARNING,"Starting a failover election for epoch %llu.",
(unsigned long long) server.cluster->currentEpoch);
clusterRequestFailoverAuth();
server.cluster->failover_auth_sent = 1;
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
CLUSTER_TODO_UPDATE_STATE|
CLUSTER_TODO_FSYNC_CONFIG);
return; /* Wait for replies. */
}
/* Check if we reached the quorum. */
// Enough votes collected (at least size/2 + 1): perform the failover
if (server.cluster->failover_auth_count >= needed_quorum) {
/* We have the quorum, we can finally failover the master. */
serverLog(LL_WARNING,
"Failover election won: I'm the new master.");
/* Update my configEpoch to the epoch of the election. */
if (myself->configEpoch < server.cluster->failover_auth_epoch) {
myself->configEpoch = server.cluster->failover_auth_epoch;
serverLog(LL_WARNING,
"configEpoch set to %llu after successful failover",
(unsigned long long) myself->configEpoch);
}
/* Take responsibility for the cluster slots. */
clusterFailoverReplaceYourMaster();
} else {
// Not enough votes yet
clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_VOTES);
}
}
- 自动切换需要主节点是fail状态,而且没有开启cluster-replica-no-failover(即该配置保持为no);手动切换不受这两个条件限制
/* Defined in cluster.c, invoked from server.c.
 *
 * Runs just before the event loop goes back to sleep and executes the work
 * queued through clusterDoBeforeSleep(): failover handling, cluster state
 * update, and saving the cluster configuration. Such work must be done as
 * soon as possible after an event, but is either unsafe to run inside the
 * event handler itself or expensive enough that it should be done only once
 * before replying to clients. */
void clusterBeforeSleep(void) {
    /* Snapshot the pending work, then clear it. Clearing is not strictly
     * required since each handler is expected to clear its own flag. */
    const int pending = server.cluster->todo_before_sleep;
    server.cluster->todo_before_sleep = 0;

    if (pending & CLUSTER_TODO_HANDLE_MANUALFAILOVER) {
        /* Advance the manual failover right away instead of waiting for
         * the next clusterCron() run (up to 100ms later). */
        if (nodeIsSlave(myself)) {
            clusterHandleManualFailover();
            if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER)) {
                clusterHandleSlaveFailover();
            }
        }
    } else if (pending & CLUSTER_TODO_HANDLE_FAILOVER) {
        /* React quickly when the quorum from the masters is likely to be
         * available already. */
        clusterHandleSlaveFailover();
    }

    /* Re-evaluate the cluster state. */
    if (pending & CLUSTER_TODO_UPDATE_STATE) {
        clusterUpdateState();
    }

    /* Persist the configuration; the argument is non-zero when an fsync
     * was requested as well. */
    if (pending & CLUSTER_TODO_SAVE_CONFIG) {
        clusterSaveConfigOrDie(pending & CLUSTER_TODO_FSYNC_CONFIG);
    }
}
/* Queue work for clusterBeforeSleep(): the given CLUSTER_TODO_* bits are
 * merged into the pending set. Nothing is executed here; the actual work
 * happens in clusterBeforeSleep(). */
void clusterDoBeforeSleep(int flags) {
    server.cluster->todo_before_sleep =
        server.cluster->todo_before_sleep | flags;
}
// cluster.c: periodic cluster task (per the article, run every 0.1 seconds).
// Only the slave-side failover/migration part is shown in this excerpt.
void clusterCron(void) {
...
// The checks below apply only when this node is a slave
if (nodeIsSlave(myself)) {
// If a manual failover is in progress, check whether our replication
// offset has caught up with the master and record the result
clusterHandleManualFailover();
if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER))
// Check the failover state and perform the failover when possible.
// A manual failover needs mf_can_start to be set (offsets match, or FORCE).
// An automatic failover needs the master to be flagged as FAIL, among the
// other preconditions checked inside clusterHandleSlaveFailover().
clusterHandleSlaveFailover();
/* If there are orphaned slaves, and we are a slave among the masters
* with the max number of non-failing slaves, consider migrating to
* the orphaned masters. Note that it does not make sense to try
* a migration if there is no master with at least *two* working
* slaves. */
// When there are orphaned masters, max_slaves is at least 2, our master
// has max_slaves slaves, and replica migration is allowed, consider
// migrating this node to an orphaned master
if (orphaned_masters && max_slaves >= 2 && this_slaves == max_slaves &&
server.cluster_allow_replica_migration)
clusterHandleSlaveMigration(max_slaves);
}
...
}

浙公网安备 33010602011771号