redis源码之主从同步(十)
主从同步原理
主从第一次同步的时候,主节点先创建rdb文件,然后传输给从节点,从节点加载rdb文件。在这个期间主节点的数据变更命令会被写入到内存缓冲区replication buff,从节点加载完rdb文件后,再通过读取主节点内存缓冲区replication buff中的命令进行增量同步。
如果主节点和从节点断开连接后重新连上,这时候就会通过数据同步的偏移量和主节点replication buff中的数据判断是否可以只同步replication buff的数据给从节点,这种称为增量同步。如果不行就得重新生成rdb文件并传输给从节点,并且同步replication buff中的内容给从节点,这种称为全量同步。
源码分析
主从连接和同步状态
从节点同步数据时的几个状态:
/* Replication state of a replica. Stored in server.repl_state so that the
 * replica remembers which step it has to perform next.
 *
 * The numeric values are spelled out explicitly; they are the same values
 * the enumerators get from implicit numbering. The handshake states must
 * keep their relative order. */
typedef enum {
    /* No active replication. */
    REPL_STATE_NONE = 0,
    /* Must connect to master. Entered after server.masterhost and the
     * related settings have been configured. */
    REPL_STATE_CONNECT = 1,
    /* Connecting to master: the TCP connection is being established from
     * server.masterhost and the related settings. */
    REPL_STATE_CONNECTING = 2,

    /* --- Handshake states, must be ordered --- */

    /* PING has been sent; waiting for its reply. */
    REPL_STATE_RECEIVE_PING_REPLY = 3,
    /* Send handshake sequence to master. */
    REPL_STATE_SEND_HANDSHAKE = 4,
    /* Wait for AUTH reply. */
    REPL_STATE_RECEIVE_AUTH_REPLY = 5,
    /* Wait for REPLCONF (listening-port) reply. */
    REPL_STATE_RECEIVE_PORT_REPLY = 6,
    /* Wait for REPLCONF (ip-address) reply. */
    REPL_STATE_RECEIVE_IP_REPLY = 7,
    /* Wait for REPLCONF (capa) reply. */
    REPL_STATE_RECEIVE_CAPA_REPLY = 8,
    /* Send PSYNC (attempt a partial resynchronization). */
    REPL_STATE_SEND_PSYNC = 9,
    /* Wait for PSYNC reply. */
    REPL_STATE_RECEIVE_PSYNC_REPLY = 10,

    /* --- End of handshake states --- */

    /* Receiving .rdb from master; the connection's read handler is
     * readSyncBulkPayload while in this state. */
    REPL_STATE_TRANSFER = 11,
    /* Connected to master: the RDB transfer is complete and the data has
     * been loaded into memory. */
    REPL_STATE_CONNECTED = 12,
} repl_state;
主节点上同步数据时记录的从节点状态:
// Set when a full resynchronization is needed: the replica is waiting for
// the master to start the child process that saves the RDB file.
#define SLAVE_STATE_WAIT_BGSAVE_START 6 /* We need to produce a new RDB file. */
// After the master has started the RDB child process it sends the FULLRESYNC
// message to the replica and then moves the replica into this state.
#define SLAVE_STATE_WAIT_BGSAVE_END 7 /* Waiting RDB file creation to finish. */
// The master has finished generating the RDB file and starts sending it to
// the replica.
#define SLAVE_STATE_SEND_BULK 8 /* Sending RDB file to slave. */
// The RDB file has been transmitted; from now on only the replication buffer
// content needs to be sent to the replica.
#define SLAVE_STATE_ONLINE 9 /* RDB file transmitted, sending just updates. */
主节点上记录与主从复制相关信息的结构体成员:
/* Excerpt of struct redisServer showing only the members related to
 * master-side replication; unrelated members are elided with "...". */
struct redisServer {
...
char replid[CONFIG_RUN_ID_SIZE+1]; /* My current replication ID. */
char replid2[CONFIG_RUN_ID_SIZE+1]; /* replid inherited from master*/
long long master_repl_offset; /* My current replication offset */
long long second_replid_offset; /* Accept offsets up to this for replid2. */
int slaveseldb; /* Last SELECTed DB in replication output */
int repl_ping_slave_period; /* Master pings the slave every N seconds */
// Replication backlog: a circular buffer that records write commands.
char *repl_backlog; /* Replication backlog for partial syncs */
// Size of the circular buffer.
long long repl_backlog_size; /* Backlog circular buffer size */
// Amount of data currently held in the buffer.
long long repl_backlog_histlen; /* Backlog actual data length */
// End of the current data in the buffer; the next write starts here.
long long repl_backlog_idx; /* Backlog circular buffer current offset,
that is the next byte we'll write to.*/
long long repl_backlog_off; /* Replication "master offset" of first
byte in the replication backlog buffer.*/
time_t repl_backlog_time_limit; /* Time without slaves after the backlog
gets released. */
time_t repl_no_slaves_since; /* We have no slaves since that time.
Only valid if server.slaves len is 0. */
int repl_min_slaves_to_write; /* Min number of slaves to write. */
int repl_min_slaves_max_lag; /* Max lag of <count> slaves to write. */
int repl_good_slaves_count; /* Number of slaves with lag <= max_lag. */
int repl_diskless_sync; /* Master send RDB to slaves sockets directly. */
// When enabled, the replica does not have to store the RDB file on disk
// first while receiving it (the exact modes are the REPL_DISKLESS_LOAD_*
// values, which are not shown in this excerpt).
int repl_diskless_load; /* Slave parse RDB directly from the socket.
* see REPL_DISKLESS_LOAD_* enum */
int repl_diskless_sync_delay; /* Delay to start a diskless repl BGSAVE. */
...
}
主从建立连接的主要函数调用过程
// 从节点连接到主节点,并尝试和主节点进行数据同步
void replicaofCommand(client *c) replication.c
// 设置server.masterhost等信息,复制状态进入REPL_STATE_CONNECT
void replicationSetMaster(char *ip, int port) replication.c
// 建立到主节点的tcp连接,设置tcp连接的读函数为syncWithMaster
int connectWithMaster(void) replication.c
// 发送认证信息给主节点,尝试进行增量同步,不行就全量同步
void syncWithMaster(connection *conn) replication.c
// 发送当前节点的偏移量,尝试增量同步通信
// 注意如果可以增量同步,这里会修改conn的读函数为readQueryFromClient
int slaveTryPartialResynchronization(connection *conn, int read_reply) replication.c
// 全量同步,从conn读取数据并放入数据库
void readSyncBulkPayload(connection *conn) replication.c
// 从io流读取rdb数据并加载到db
int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) rdb.c
// 从文件读取rdb数据并加载到db
int rdbLoad(char *filename, rdbSaveInfo *rsi, int rdbflags) rdb.c
// 主节点收到从节点发来的同步指令后,启动子进程创建rdb文件
void syncCommand(client *c) replication.c
// 如果从节点发送的是psync replid offset, 如果offset在replication buff缓冲区内,就可以进行增量同步
int masterTryPartialResynchronization(client *c, long long psync_offset)
// 给从节点发送replication buff中的内容
long long addReplyReplicationBacklog(client *c, long long offset)
// 只能进行全量同步,主节点就启动一个进程执行rdb文件的创建
int startBgsaveForReplication(int mincapa)
int rdbSaveToSlavesSockets(rdbSaveInfo *rsi)
int rdbSaveBackground(char *filename, rdbSaveInfo *rsi)
int rdbSave(char *filename, rdbSaveInfo *rsi) rdb.c
// 主节点定时检查发现存在rdb子进程创建好rdb文件后退出,就给从节点发送rdb文件
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) server.c
// 检查是否有子进程退出了
void checkChildrenDone(void) server.c
// 处理rdb子进程退出
void backgroundSaveDoneHandler(int exitcode, int bysignal)
static void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal)
// 设置所有处于SLAVE_STATE_WAIT_BGSAVE_END状态的从节点的连接的写处理函数为sendBulkToSlave
void updateSlavesWaitingBgsave(int bgsaveerr, int type)
// 读取rdb文件的内容,发送给从节点
void sendBulkToSlave(connection *conn)
从节点会设置函数syncWithMaster为与主节点连接的消息处理函数,该函数主要是给主节点发送认证、端口、ip等握手消息,然后发送PSYNC尝试增量同步;如果只能全量同步,就设置与主节点的连接的读取函数为readSyncBulkPayload,该函数负责读取主节点发送的数据,在非diskless模式下先保存到rdb文件,然后从节点从该文件加载数据到数据库。
/* Replica-side handshake state machine for the connection to the master.
 *
 * The function installs itself as the read handler of `conn` and is invoked
 * again for each event on the connection; server.repl_state records which
 * step comes next. The sequence is:
 *
 *   1. Send PING and validate the reply.
 *   2. Send AUTH (only if server.masterauth is set), REPLCONF listening-port,
 *      REPLCONF ip-address (only if server.slave_announce_ip is set) and
 *      REPLCONF capa in one go, then consume the replies one per invocation.
 *   3. Send PSYNC and read its reply.
 *   4. On PSYNC_CONTINUE nothing else is done here. Otherwise prepare a
 *      full transfer: create the temp RDB file (unless diskless load is
 *      used), install readSyncBulkPayload as read handler and enter
 *      REPL_STATE_TRANSFER.
 *
 * On any failure the connection is closed, the transfer-related fields of
 * `server` are reset and the state goes back to REPL_STATE_CONNECT.
 *
 * conn: the connection to the master. */
void syncWithMaster(connection *conn) {
    char tmpfile[256], *err = NULL;
    int dfd = -1, maxtries = 5;
    int psync_result;

    /* If this event fired after the user turned the instance into a master
     * with SLAVEOF NO ONE we must just return ASAP. */
    if (server.repl_state == REPL_STATE_NONE) {
        connClose(conn);
        return;
    }

    /* Check for errors in the socket: after a non blocking connect() we
     * may find that the socket is in error state. */
    if (connGetState(conn) != CONN_STATE_CONNECTED) {
        serverLog(LL_WARNING,"Error condition on socket for SYNC: %s",
                connGetLastError(conn));
        goto error;
    }

    /* Send a PING to check the master is able to reply without errors.
     * This is done when we are still in the "connecting" state. */
    if (server.repl_state == REPL_STATE_CONNECTING) {
        serverLog(LL_NOTICE,"Non blocking connect for SYNC fired the event.");
        /* Delete the writable event so that the readable event remains
         * registered and we can wait for the PONG reply. */
        connSetReadHandler(conn, syncWithMaster);
        connSetWriteHandler(conn, NULL);
        /* From now on we are waiting for the reply to PING. */
        server.repl_state = REPL_STATE_RECEIVE_PING_REPLY;
        /* Send the PING, don't check for errors at all, we have the timeout
         * that will take care about this. */
        err = sendCommand(conn,"PING",NULL);
        if (err) goto write_error;
        return;
    }

    /* Receive the PONG command: read the reply from conn and check that it
     * has one of the accepted forms. */
    if (server.repl_state == REPL_STATE_RECEIVE_PING_REPLY) {
        /* Reads from conn and returns what was read. */
        err = receiveSynchronousResponse(conn);

        /* We accept only two replies as valid, a positive +PONG reply
         * (we just check for "+") or an authentication error.
         * Note that older versions of Redis replied with "operation not
         * permitted" instead of using a proper error code, so we test
         * both. */
        if (err[0] != '+' &&
            strncmp(err,"-NOAUTH",7) != 0 &&
            strncmp(err,"-NOPERM",7) != 0 &&
            strncmp(err,"-ERR operation not permitted",28) != 0)
        {
            serverLog(LL_WARNING,"Error reply to PING from master: '%s'",err);
            sdsfree(err);
            goto error;
        } else {
            serverLog(LL_NOTICE,
                "Master replied to PING, replication can continue...");
        }
        sdsfree(err);
        err = NULL;
        /* The reply is acceptable: move on to sending the handshake. */
        server.repl_state = REPL_STATE_SEND_HANDSHAKE;
    }

    /* Handshake: send credentials, port, IP and capabilities to the master. */
    if (server.repl_state == REPL_STATE_SEND_HANDSHAKE) {
        /* AUTH with the master if required, i.e. if a password (and
         * optionally a user name) is configured for the master. */
        if (server.masterauth) {
            char *args[3] = {"AUTH",NULL,NULL};
            size_t lens[3] = {4,0,0};
            int argc = 1;
            /* If a user name is configured, add it to the arguments. */
            if (server.masteruser) {
                args[argc] = server.masteruser;
                lens[argc] = strlen(server.masteruser);
                argc++;
            }
            /* Append the password. */
            args[argc] = server.masterauth;
            lens[argc] = sdslen(server.masterauth);
            argc++;
            /* Either "AUTH <user> <password>" or "AUTH <password>". */
            err = sendCommandArgv(conn, argc, args, lens);
            if (err) goto write_error;
        }

        /* Set the slave port, so that Master's INFO command can list the
         * slave listening port correctly. */
        {
            int port;
            if (server.slave_announce_port)
                port = server.slave_announce_port;
            else if (server.tls_replication && server.tls_port)
                port = server.tls_port;
            else
                port = server.port;
            sds portstr = sdsfromlonglong(port);
            /* Tell the master which port this replica listens on. */
            err = sendCommand(conn,"REPLCONF",
                    "listening-port",portstr, NULL);
            sdsfree(portstr);
            if (err) goto write_error;
        }

        /* Set the slave ip, so that Master's INFO command can list the
         * slave IP address port correctly in case of port forwarding or NAT.
         * Skip REPLCONF ip-address if there is no slave-announce-ip option set. */
        if (server.slave_announce_ip) {
            err = sendCommand(conn,"REPLCONF",
                    "ip-address",server.slave_announce_ip, NULL);
            if (err) goto write_error;
        }

        /* Inform the master of our (slave) capabilities.
         *
         * EOF: supports EOF-style RDB transfer for diskless replication.
         * PSYNC2: supports PSYNC v2, so understands +CONTINUE <new repl ID>.
         *
         * The master will ignore capabilities it does not understand. */
        err = sendCommand(conn,"REPLCONF",
                "capa","eof","capa","psync2",NULL);
        if (err) goto write_error;

        /* Everything has been sent: start waiting for the replies. */
        server.repl_state = REPL_STATE_RECEIVE_AUTH_REPLY;
        return;
    }

    /* No AUTH was sent if no credentials are configured, so there is no
     * AUTH reply to wait for: skip straight to the port reply. */
    if (server.repl_state == REPL_STATE_RECEIVE_AUTH_REPLY && !server.masterauth)
        server.repl_state = REPL_STATE_RECEIVE_PORT_REPLY;

    /* Receive AUTH reply and check its outcome. */
    if (server.repl_state == REPL_STATE_RECEIVE_AUTH_REPLY) {
        err = receiveSynchronousResponse(conn);
        if (err[0] == '-') {
            serverLog(LL_WARNING,"Unable to AUTH to MASTER: %s",err);
            sdsfree(err);
            goto error;
        }
        sdsfree(err);
        err = NULL;
        /* Authentication succeeded. */
        server.repl_state = REPL_STATE_RECEIVE_PORT_REPLY;
        return;
    }

    /* Receive REPLCONF listening-port reply. */
    if (server.repl_state == REPL_STATE_RECEIVE_PORT_REPLY) {
        err = receiveSynchronousResponse(conn);
        /* Ignore the error if any, not all the Redis versions support
         * REPLCONF listening-port. */
        if (err[0] == '-') {
            serverLog(LL_NOTICE,"(Non critical) Master does not understand "
                                "REPLCONF listening-port: %s", err);
        }
        sdsfree(err);
        /* Port reply consumed. */
        server.repl_state = REPL_STATE_RECEIVE_IP_REPLY;
        return;
    }

    /* REPLCONF ip-address was only sent if slave_announce_ip is set; if it
     * is not, there is no reply to wait for. */
    if (server.repl_state == REPL_STATE_RECEIVE_IP_REPLY && !server.slave_announce_ip)
        server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY;

    /* Receive REPLCONF ip-address reply. */
    if (server.repl_state == REPL_STATE_RECEIVE_IP_REPLY) {
        err = receiveSynchronousResponse(conn);
        /* Ignore the error if any, not all the Redis versions support
         * REPLCONF ip-address. */
        if (err[0] == '-') {
            serverLog(LL_NOTICE,"(Non critical) Master does not understand "
                                "REPLCONF ip-address: %s", err);
        }
        sdsfree(err);
        /* IP reply consumed. */
        server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY;
        return;
    }

    /* Receive CAPA reply. */
    if (server.repl_state == REPL_STATE_RECEIVE_CAPA_REPLY) {
        err = receiveSynchronousResponse(conn);
        /* Ignore the error if any, not all the Redis versions support
         * REPLCONF capa. */
        if (err[0] == '-') {
            serverLog(LL_NOTICE,"(Non critical) Master does not understand "
                                  "REPLCONF capa: %s", err);
        }
        sdsfree(err);
        err = NULL;
        /* Capa reply consumed; fall through and send PSYNC right away. */
        server.repl_state = REPL_STATE_SEND_PSYNC;
    }

    /* Try a partial resynchonization. If we don't have a cached master
     * slaveTryPartialResynchronization() will at least try to use PSYNC
     * to start a full resynchronization so that we get the master replid
     * and the global offset, to try a partial resync at the next
     * reconnection attempt. */
    if (server.repl_state == REPL_STATE_SEND_PSYNC) {
        /* Send "PSYNC <replid> <offset>" to the master. The reply is not
         * awaited here: we return and read it on the next invocation. */
        if (slaveTryPartialResynchronization(conn,0) == PSYNC_WRITE_ERROR) {
            err = sdsnew("Write error sending the PSYNC command.");
            abortFailover("Write error to failover target");
            goto write_error;
        }
        server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY;
        return;
    }

    /* If reached this point, we should be in
     * REPL_STATE_RECEIVE_PSYNC_REPLY. */
    if (server.repl_state != REPL_STATE_RECEIVE_PSYNC_REPLY) {
        serverLog(LL_WARNING,"syncWithMaster(): state machine error, "
                             "state should be RECEIVE_PSYNC but is %d",
                             server.repl_state);
        goto error;
    }

    /* Read the master's reply to PSYNC. */
    psync_result = slaveTryPartialResynchronization(conn,1);
    if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */

    /* Check the status of the planned failover. We expect PSYNC_CONTINUE,
     * but there is nothing technically wrong with a full resync which
     * could happen in edge cases. */
    if (server.failover_state == FAILOVER_IN_PROGRESS) {
        if (psync_result == PSYNC_CONTINUE || psync_result == PSYNC_FULLRESYNC) {
            clearFailoverState();
        } else {
            abortFailover("Failover target rejected psync request");
            return;
        }
    }

    /* If the master is in an transient error, we should try to PSYNC
     * from scratch later, so go to the error path. This happens when
     * the server is loading the dataset or is not connected with its
     * master and so forth. */
    if (psync_result == PSYNC_TRY_LATER) goto error;

    /* Note: if PSYNC does not return WAIT_REPLY, it will take care of
     * uninstalling the read handler from the file descriptor. */

    /* The master accepted a partial resynchronization. */
    if (psync_result == PSYNC_CONTINUE) {
        serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Master accepted a Partial Resynchronization.");
        if (server.supervised_mode == SUPERVISED_SYSTEMD) {
            redisCommunicateSystemd("STATUS=MASTER <-> REPLICA sync: Partial Resynchronization accepted. Ready to accept connections in read-write mode.\n");
        }
        return;
    }

    /* PSYNC failed or is not supported: we want our slaves to resync with us
     * as well, if we have any sub-slaves. The master may transfer us an
     * entirely different data set and we have no way to incrementally feed
     * our slaves after that. */
    disconnectSlaves(); /* Force our slaves to resync with us as well. */
    freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */

    /* Fall back to SYNC if needed. Otherwise psync_result == PSYNC_FULLRESYNC
     * and the server.master_replid and master_initial_offset are
     * already populated. */
    if (psync_result == PSYNC_NOT_SUPPORTED) {
        serverLog(LL_NOTICE,"Retrying with SYNC...");
        if (connSyncWrite(conn,"SYNC\r\n",6,server.repl_syncio_timeout*1000) == -1) {
            serverLog(LL_WARNING,"I/O error writing to MASTER: %s",
                strerror(errno));
            goto error;
        }
    }

    /* Prepare a suitable temp file for bulk transfer. The file is only
     * needed when diskless load is not in use. */
    if (!useDisklessLoad()) {
        while(maxtries--) {
            snprintf(tmpfile,256,
                "temp-%d.%ld.rdb",(int)server.unixtime,(long int)getpid());
            dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
            if (dfd != -1) break;
            sleep(1);
        }
        if (dfd == -1) {
            serverLog(LL_WARNING,"Opening the temp file needed for MASTER <-> REPLICA synchronization: %s",strerror(errno));
            goto error;
        }
        /* Remember the file name and descriptor in the server state. */
        server.repl_transfer_tmpfile = zstrdup(tmpfile);
        server.repl_transfer_fd = dfd;
        /* Ownership of the descriptor now belongs to
         * server.repl_transfer_fd. Forget the local copy, otherwise the
         * error path below would close the same descriptor twice. */
        dfd = -1;
    }

    /* Setup the non blocking download of the bulk file. */
    if (connSetReadHandler(conn, readSyncBulkPayload)
            == C_ERR)
    {
        char conninfo[CONN_INFO_LEN];
        serverLog(LL_WARNING,
            "Can't create readable event for SYNC: %s (%s)",
            strerror(errno), connGetInfo(conn, conninfo, sizeof(conninfo)));
        goto error;
    }

    server.repl_state = REPL_STATE_TRANSFER;
    server.repl_transfer_size = -1;
    server.repl_transfer_read = 0;
    server.repl_transfer_last_fsync_off = 0;
    server.repl_transfer_lastio = server.unixtime;
    return;

error:
    /* dfd is only non-negative here if a descriptor was opened but never
     * handed over to server.repl_transfer_fd. */
    if (dfd != -1) close(dfd);
    connClose(conn);
    server.repl_transfer_s = NULL;
    if (server.repl_transfer_fd != -1)
        close(server.repl_transfer_fd);
    /* NOTE(review): the temp file itself is not removed from disk on this
     * path, only its name is freed -- confirm whether that is intended. */
    if (server.repl_transfer_tmpfile)
        zfree(server.repl_transfer_tmpfile);
    server.repl_transfer_tmpfile = NULL;
    server.repl_transfer_fd = -1;
    server.repl_state = REPL_STATE_CONNECT;
    return;

write_error: /* Handle sendCommand() errors. */
    serverLog(LL_WARNING,"Sending command to master in replication handshake: %s", err);
    sdsfree(err);
    goto error;
}
该函数主要功能:
- 给主节点发送认证信息,ip,端口等信息;
- 尝试进行增量同步,如果可以就进行增量同步;如果不行就进行全量同步,使用readSyncBulkPayload读取主节点发送的rdb文件。
增量同步
// 处理客户端网络包
void readQueryFromClient(connection *conn)
static int connSocketRead(connection *conn, void *buf, size_t buf_len)
void processInputBuffer(client *c)
// 从c->querybuf取一行数据,放入c->argv
int processInlineBuffer(client *c)
// 处理客户端命令
int processCommandAndResetClient(client *c)
// 进行一些检查后,执行命令
int processCommand(client *c)
// 调用命令处理函数c->cmd->proc
void call(client *c, int flags)
// 将命令写入到replication buff或者aof缓冲区
void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc,
int flags)
// 将命令写入server.aof_buf
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc)
// 将命令写入server.repl_backlog
void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc)
- 对于写命令,propagate函数会根据配置将命令写入replication buff和发送给从节点,或者写入aof缓冲区。

浙公网安备 33010602011771号