redis源码之主从同步(十)

主从同步原理

主从第一次同步的时候,主节点先创建rdb文件,然后传输给从节点,从节点加载rdb文件。在这个期间主节点的数据变更命令会被写入到内存缓冲区replication buff,从节点加载完rdb文件后,再通过读取主节点内存缓冲区replication buff中的命令进行增量同步。
如果主节点和从节点断开连接后重新连上,这时候就会通过数据同步的偏移和主节点replication buff中的数据判断是否可以只同步replication buff的数据给从节点,这种称为增量同步。如果不行就得重新生成rdb文件并传输给从节点,并且同步replication buff中的内容给从节点,这种称为全量同步。

源码分析

主从连接和同步状态

从节点同步数据时的几个状态:

/* Slave replication state. Used in server.repl_state for slaves to remember
 * what to do next. */
typedef enum {
    REPL_STATE_NONE = 0,            /* No active replication */
    // Entered after server.masterhost and the related master info are set.
    REPL_STATE_CONNECT,             /* Must connect to master */
    // The TCP connection to the master is being established using
    // server.masterhost and the related master info.
    REPL_STATE_CONNECTING,          /* Connecting to master */
    /* --- Handshake states, must be ordered --- */
    // PING has been sent; waiting for the master's reply to it.
    REPL_STATE_RECEIVE_PING_REPLY,  /* Wait for PING reply */
    REPL_STATE_SEND_HANDSHAKE,      /* Send handshake sequence to master */
    REPL_STATE_RECEIVE_AUTH_REPLY,  /* Wait for AUTH reply */
    REPL_STATE_RECEIVE_PORT_REPLY,  /* Wait for REPLCONF reply */
    REPL_STATE_RECEIVE_IP_REPLY,    /* Wait for REPLCONF reply */
    REPL_STATE_RECEIVE_CAPA_REPLY,  /* Wait for REPLCONF reply */
    // PSYNC (the partial resynchronization attempt) is about to be sent,
    // then its reply is awaited in the next state.
    REPL_STATE_SEND_PSYNC,          /* Send PSYNC */
    REPL_STATE_RECEIVE_PSYNC_REPLY, /* Wait for PSYNC reply */
    /* --- End of handshake states --- */
    // RDB transfer has started; the replica's read handler for the master
    // connection is readSyncBulkPayload.
    REPL_STATE_TRANSFER,        /* Receiving .rdb from master */
    // RDB transfer finished and the data set has been loaded into memory.
    REPL_STATE_CONNECTED,       /* Connected to master */
} repl_state;

主节点上同步数据时记录的从节点状态:

// Set when a full resynchronization is needed: the replica is waiting for
// the master to start a child process that saves the RDB file.
#define SLAVE_STATE_WAIT_BGSAVE_START 6 /* We need to produce a new RDB file. */
// After the master has started the RDB child process it sends FULLRESYNC to
// the replica and moves the replica into this state.
#define SLAVE_STATE_WAIT_BGSAVE_END 7 /* Waiting RDB file creation to finish. */
// The master finished generating the RDB file and is sending it to the
// replica.
#define SLAVE_STATE_SEND_BULK 8 /* Sending RDB file to slave. */
// The RDB transfer is complete; only the replication buffer contents still
// need to be sent to the replica.
#define SLAVE_STATE_ONLINE 9 /* RDB file transmitted, sending just updates. */

主节点上记录与主从复制相关信息的结构体成员:

struct redisServer {
  ...
    char replid[CONFIG_RUN_ID_SIZE+1];  /* My current replication ID. */
    char replid2[CONFIG_RUN_ID_SIZE+1]; /* replid inherited from master*/
    long long master_repl_offset;   /* My current replication offset */
    long long second_replid_offset; /* Accept offsets up to this for replid2. */
    int slaveseldb;                 /* Last SELECTed DB in replication output */
    int repl_ping_slave_period;     /* Master pings the slave every N seconds */
    // Replication buffer: a circular buffer recording the modifying commands.
    char *repl_backlog;             /* Replication backlog for partial syncs */
    // Total size of the circular buffer.
	   long long repl_backlog_size;    /* Backlog circular buffer size */
    // Amount of data currently held in the buffer.
	   long long repl_backlog_histlen; /* Backlog actual data length */
    // End of the current data in the buffer; the next write starts here.
	   long long repl_backlog_idx;     /* Backlog circular buffer current offset,
                                       that is the next byte we'll write to.*/
    long long repl_backlog_off;     /* Replication "master offset" of first
                                       byte in the replication backlog buffer.*/
    time_t repl_backlog_time_limit; /* Time without slaves after the backlog
                                       gets released. */
    time_t repl_no_slaves_since;    /* We have no slaves since that time.
                                       Only valid if server.slaves len is 0. */
    int repl_min_slaves_to_write;   /* Min number of slaves to write. */
    int repl_min_slaves_max_lag;    /* Max lag of <count> slaves to write. */
    int repl_good_slaves_count;     /* Number of slaves with lag <= max_lag. */
    int repl_diskless_sync;         /* Master send RDB to slaves sockets directly. */
    // When enabled, the replica does not need to store the received RDB file
    // on disk first. NOTE(review): the valid values are the
    // REPL_DISKLESS_LOAD_* enum, which is not shown here -- confirm.
    int repl_diskless_load;         /* Slave parse RDB directly from the socket.
                                     * see REPL_DISKLESS_LOAD_* enum */
    int repl_diskless_sync_delay;   /* Delay to start a diskless repl BGSAVE. */
  ...
}

主从建立连接的主要函数调用过程

// 从节点连接到主节点,并尝试和主节点进行数据同步
void replicaofCommand(client *c)   replication.c
  // 设置server.masterhost等信息,复制状态进入REPL_STATE_CONNECT
  void replicationSetMaster(char *ip, int port)  replication.c
  		// 建立到主节点的tcp连接,设置tcp连接的读函数为syncWithMaster
    int connectWithMaster(void)  replication.c
	     // 发送认证信息给主节点,尝试进行增量同步,不行就全量同步
	     void syncWithMaster(connection *conn) replication.c
		     // 发送当前节点的偏移量,尝试增量同步通信
        // 注意如果可以增量同步,这里会修改conn的读函数为readQueryFromClient 
		     int slaveTryPartialResynchronization(connection *conn, int read_reply) replication.c
			   // 全量同步,从conn读取数据并放入数据库
        void readSyncBulkPayload(connection *conn)  replication.c
	          // 从io流读取rdb数据并加载到db
		        int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) rdb.c
				     //  从文件读取rdb数据并加载到db
					    int rdbLoad(char *filename, rdbSaveInfo *rsi, int rdbflags) rdb.c
			 
// 主节点收到从节点发来的同步指令后,启动子进程创建rdb文件
void syncCommand(client *c) replication.c
  // 如果从节点发送的是psync replid offset, 如果offset在replication buff缓冲区内,就可以进行增量同步
  int masterTryPartialResynchronization(client *c, long long psync_offset) 
    // 给从节点发送replication buff中的内容
    long long addReplyReplicationBacklog(client *c, long long offset) 
	 // 只能进行全量同步,主节点就启动一个进程执行rdb文件的创建
  int startBgsaveForReplication(int mincapa)
    int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) 
	   int rdbSaveBackground(char *filename, rdbSaveInfo *rsi) 
		    int rdbSave(char *filename, rdbSaveInfo *rsi)  rdb.c

// 主节点定时检查发现存在rdb子进程创建好rdb文件后退出,就给从节点发送rdb文件
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData)  server.c
  // 检查是否有子进程退出了
  void checkChildrenDone(void) server.c
    // 处理rdb子进程退出
    void backgroundSaveDoneHandler(int exitcode, int bysignal)
	     static void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal)
		 		  // 设置所有处于SLAVE_STATE_WAIT_BGSAVE_END状态的从节点的连接的写处理函数为sendBulkToSlave
		     void updateSlavesWaitingBgsave(int bgsaveerr, int type) 
			 			// 读取rdb文件的内容,发送给从节点
			 		  void sendBulkToSlave(connection *conn) 
  

从节点会设置函数syncWithMaster为与主节点连接的消息处理函数,该函数主要是给主节点发送认证消息并完成握手,然后尝试增量同步;如果只能进行全量同步,就设置与主节点的连接的读取函数为readSyncBulkPayload。readSyncBulkPayload负责读取主节点发送的数据,并保存到rdb文件,然后从节点从该文件加载数据到数据库。

/* Connection event handler that drives the replica side of the replication
 * handshake with the master.
 *
 * The function is a state machine keyed on server.repl_state. Each call
 * performs the step that matches the current state and either returns to
 * wait for the next event on 'conn' or falls through to the next step:
 *
 *   CONNECTING           -> send PING, install itself as the read handler
 *   RECEIVE_PING_REPLY   -> validate the PING reply
 *   SEND_HANDSHAKE       -> send AUTH (optional), REPLCONF listening-port,
 *                           REPLCONF ip-address (optional), REPLCONF capa
 *   RECEIVE_*_REPLY      -> consume the reply to each handshake command
 *   SEND_PSYNC           -> send PSYNC without waiting for the answer
 *   RECEIVE_PSYNC_REPLY  -> read the PSYNC answer: on PSYNC_CONTINUE return
 *                           (partial resync accepted); otherwise prepare a
 *                           full resync and install readSyncBulkPayload as
 *                           the read handler.
 *
 * On any failure the connection is closed, the transfer-related fields of
 * 'server' are reset and server.repl_state goes back to REPL_STATE_CONNECT.
 *
 * conn: the connection to the master. */
void syncWithMaster(connection *conn) {
    char tmpfile[256], *err = NULL;
    int dfd = -1, maxtries = 5;
    int psync_result;

    /* If this event fired after the user turned the instance into a master
     * with SLAVEOF NO ONE we must just return ASAP. */
    if (server.repl_state == REPL_STATE_NONE) {
        connClose(conn);
        return;
    }

    /* Check for errors in the socket: after a non blocking connect() we
     * may find that the socket is in error state. */
    if (connGetState(conn) != CONN_STATE_CONNECTED) {
        serverLog(LL_WARNING,"Error condition on socket for SYNC: %s",
                connGetLastError(conn));
        goto error;
    }

    /* Send a PING to check the master is able to reply without errors. */
    if (server.repl_state == REPL_STATE_CONNECTING) {
        serverLog(LL_NOTICE,"Non blocking connect for SYNC fired the event.");
        /* Delete the writable event so that the readable event remains
         * registered and we can wait for the PONG reply. */
        connSetReadHandler(conn, syncWithMaster);
        connSetWriteHandler(conn, NULL);
        /* From now on we are waiting for the reply to PING. */
        server.repl_state = REPL_STATE_RECEIVE_PING_REPLY;
        /* Send the PING; a missing reply is handled by the replication
         * timeout, only a failure to send is handled here. */
        err = sendCommand(conn,"PING",NULL);
        if (err) goto write_error;
        return;
    }

    /* Receive the PONG command: read the reply from the connection and
     * check that it has one of the accepted forms. */
    if (server.repl_state == REPL_STATE_RECEIVE_PING_REPLY) {
        err = receiveSynchronousResponse(conn);

        /* We accept only two replies as valid, a positive +PONG reply
         * (we just check for "+") or an authentication error.
         * Note that older versions of Redis replied with "operation not
         * permitted" instead of using a proper error code, so we test
         * both. */
        if (err[0] != '+' &&
            strncmp(err,"-NOAUTH",7) != 0 &&
            strncmp(err,"-NOPERM",7) != 0 &&
            strncmp(err,"-ERR operation not permitted",28) != 0)
        {
            serverLog(LL_WARNING,"Error reply to PING from master: '%s'",err);
            sdsfree(err);
            goto error;
        } else {
            serverLog(LL_NOTICE,
                "Master replied to PING, replication can continue...");
        }
        sdsfree(err);
        err = NULL;
        /* The reply was acceptable: move on to sending the handshake. */
        server.repl_state = REPL_STATE_SEND_HANDSHAKE;
    }

    /* Handshake: send credentials, listening port, announced IP and
     * capabilities to the master in one go. The replies are consumed one
     * per state in the RECEIVE_*_REPLY steps below. */
    if (server.repl_state == REPL_STATE_SEND_HANDSHAKE) {
        /* AUTH with the master if required. */
        if (server.masterauth) {
            char *args[3] = {"AUTH",NULL,NULL};
            size_t lens[3] = {4,0,0};
            int argc = 1;
            /* A configured user name goes before the password. */
            if (server.masteruser) {
                args[argc] = server.masteruser;
                lens[argc] = strlen(server.masteruser);
                argc++;
            }
            /* Append the password. */
            args[argc] = server.masterauth;
            lens[argc] = sdslen(server.masterauth);
            argc++;
            /* Resulting command: "AUTH <user> <password>" or
             * "AUTH <password>". */
            err = sendCommandArgv(conn, argc, args, lens);
            if (err) goto write_error;
        }

        /* Set the slave port, so that Master's INFO command can list the
         * slave listening port correctly. */
        {
            int port;
            if (server.slave_announce_port)
                port = server.slave_announce_port;
            else if (server.tls_replication && server.tls_port)
                port = server.tls_port;
            else
                port = server.port;
            sds portstr = sdsfromlonglong(port);
            err = sendCommand(conn,"REPLCONF",
                    "listening-port",portstr, NULL);
            sdsfree(portstr);
            if (err) goto write_error;
        }

        /* Set the slave ip, so that Master's INFO command can list the
         * slave IP address port correctly in case of port forwarding or NAT.
         * Skip REPLCONF ip-address if there is no slave-announce-ip option set. */
        if (server.slave_announce_ip) {
            err = sendCommand(conn,"REPLCONF",
                    "ip-address",server.slave_announce_ip, NULL);
            if (err) goto write_error;
        }

        /* Inform the master of our (slave) capabilities.
         *
         * EOF: supports EOF-style RDB transfer for diskless replication.
         * PSYNC2: supports PSYNC v2, so understands +CONTINUE <new repl ID>.
         *
         * The master will ignore capabilities it does not understand. */
        err = sendCommand(conn,"REPLCONF",
                "capa","eof","capa","psync2",NULL);
        if (err) goto write_error;
        /* Everything was sent: start waiting for the replies. */
        server.repl_state = REPL_STATE_RECEIVE_AUTH_REPLY;
        return;
    }

    /* No AUTH was sent when masterauth is not configured, so there is no
     * AUTH reply to wait for: skip straight to the port reply. */
    if (server.repl_state == REPL_STATE_RECEIVE_AUTH_REPLY && !server.masterauth)
        server.repl_state = REPL_STATE_RECEIVE_PORT_REPLY;

    /* Receive AUTH reply. An error reply aborts the handshake. */
    if (server.repl_state == REPL_STATE_RECEIVE_AUTH_REPLY) {
        err = receiveSynchronousResponse(conn);
        if (err[0] == '-') {
            serverLog(LL_WARNING,"Unable to AUTH to MASTER: %s",err);
            sdsfree(err);
            goto error;
        }
        sdsfree(err);
        err = NULL;
        server.repl_state = REPL_STATE_RECEIVE_PORT_REPLY;
        return;
    }

    /* Receive REPLCONF listening-port reply. */
    if (server.repl_state == REPL_STATE_RECEIVE_PORT_REPLY) {
        err = receiveSynchronousResponse(conn);
        /* Ignore the error if any, not all the Redis versions support
         * REPLCONF listening-port. */
        if (err[0] == '-') {
            serverLog(LL_NOTICE,"(Non critical) Master does not understand "
                                "REPLCONF listening-port: %s", err);
        }
        sdsfree(err);
        server.repl_state = REPL_STATE_RECEIVE_IP_REPLY;
        return;
    }

    /* REPLCONF ip-address is only sent when slave_announce_ip is set, so
     * without it there is no reply to wait for. */
    if (server.repl_state == REPL_STATE_RECEIVE_IP_REPLY && !server.slave_announce_ip)
        server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY;

    /* Receive REPLCONF ip-address reply. */
    if (server.repl_state == REPL_STATE_RECEIVE_IP_REPLY) {
        err = receiveSynchronousResponse(conn);
        /* Ignore the error if any, not all the Redis versions support
         * REPLCONF ip-address. */
        if (err[0] == '-') {
            serverLog(LL_NOTICE,"(Non critical) Master does not understand "
                                "REPLCONF ip-address: %s", err);
        }
        sdsfree(err);
        server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY;
        return;
    }

    /* Receive CAPA reply. */
    if (server.repl_state == REPL_STATE_RECEIVE_CAPA_REPLY) {
        err = receiveSynchronousResponse(conn);
        /* Ignore the error if any, not all the Redis versions support
         * REPLCONF capa. */
        if (err[0] == '-') {
            serverLog(LL_NOTICE,"(Non critical) Master does not understand "
                                  "REPLCONF capa: %s", err);
        }
        sdsfree(err);
        err = NULL;
        server.repl_state = REPL_STATE_SEND_PSYNC;
    }

    /* Try a partial resynchonization. If we don't have a cached master
     * slaveTryPartialResynchronization() will at least try to use PSYNC
     * to start a full resynchronization so that we get the master replid
     * and the global offset, to try a partial resync at the next
     * reconnection attempt. */
    if (server.repl_state == REPL_STATE_SEND_PSYNC) {
        /* Only send the PSYNC command here; the reply is read on a later
         * invocation, in the RECEIVE_PSYNC_REPLY state. */
        if (slaveTryPartialResynchronization(conn,0) == PSYNC_WRITE_ERROR) {
            err = sdsnew("Write error sending the PSYNC command.");
            abortFailover("Write error to failover target");
            goto write_error;
        }
        server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY;
        return;
    }

    /* If reached this point, we should be in
     * REPL_STATE_RECEIVE_PSYNC_REPLY. */
    if (server.repl_state != REPL_STATE_RECEIVE_PSYNC_REPLY) {
        serverLog(LL_WARNING,"syncWithMaster(): state machine error, "
                             "state should be RECEIVE_PSYNC but is %d",
                             server.repl_state);
        goto error;
    }

    /* Read the master's reply to PSYNC. */
    psync_result = slaveTryPartialResynchronization(conn,1);
    if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */

    /* Check the status of the planned failover. We expect PSYNC_CONTINUE,
     * but there is nothing technically wrong with a full resync which
     * could happen in edge cases. */
    if (server.failover_state == FAILOVER_IN_PROGRESS) {
        if (psync_result == PSYNC_CONTINUE || psync_result == PSYNC_FULLRESYNC) {
            clearFailoverState();
        } else {
            abortFailover("Failover target rejected psync request");
            return;
        }
    }

    /* If the master is in an transient error, we should try to PSYNC
     * from scratch later, so go to the error path. This happens when
     * the server is loading the dataset or is not connected with its
     * master and so forth. */
    if (psync_result == PSYNC_TRY_LATER) goto error;

    /* Note: if PSYNC does not return WAIT_REPLY, it will take care of
     * uninstalling the read handler from the file descriptor. */
    /* Partial resynchronization accepted: nothing more to do here. */
    if (psync_result == PSYNC_CONTINUE) {
        serverLog(LL_NOTICE, "MASTER <-> REPLICA sync: Master accepted a Partial Resynchronization.");
        if (server.supervised_mode == SUPERVISED_SYSTEMD) {
            redisCommunicateSystemd("STATUS=MASTER <-> REPLICA sync: Partial Resynchronization accepted. Ready to accept connections in read-write mode.\n");
        }
        return;
    }

    /* PSYNC failed or is not supported: we want our slaves to resync with us
     * as well, if we have any sub-slaves. The master may transfer us an
     * entirely different data set and we have no way to incrementally feed
     * our slaves after that. */
    disconnectSlaves(); /* Force our slaves to resync with us as well. */
    freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */

    /* Fall back to SYNC if needed. Otherwise psync_result == PSYNC_FULLRESYNC
     * and the server.master_replid and master_initial_offset are
     * already populated. */
    if (psync_result == PSYNC_NOT_SUPPORTED) {
        serverLog(LL_NOTICE,"Retrying with SYNC...");
        if (connSyncWrite(conn,"SYNC\r\n",6,server.repl_syncio_timeout*1000) == -1) {
            serverLog(LL_WARNING,"I/O error writing to MASTER: %s",
                strerror(errno));
            goto error;
        }
    }

    /* Prepare a suitable temp file for bulk transfer. This is skipped when
     * diskless load is in use. */
    if (!useDisklessLoad()) {
        while(maxtries--) {
            snprintf(tmpfile,256,
                "temp-%d.%ld.rdb",(int)server.unixtime,(long int)getpid());
            dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
            if (dfd != -1) break;
            sleep(1);
        }
        if (dfd == -1) {
            serverLog(LL_WARNING,"Opening the temp file needed for MASTER <-> REPLICA synchronization: %s",strerror(errno));
            goto error;
        }
        /* Remember the file name and descriptor for the transfer. */
        server.repl_transfer_tmpfile = zstrdup(tmpfile);
        server.repl_transfer_fd = dfd;
        /* The descriptor is now owned by server.repl_transfer_fd. Clear the
         * local copy so that the error path below closes it exactly once
         * instead of calling close() twice on the same descriptor. */
        dfd = -1;
    }

    /* Setup the non blocking download of the bulk file. */
    if (connSetReadHandler(conn, readSyncBulkPayload)
            == C_ERR)
    {
        char conninfo[CONN_INFO_LEN];
        serverLog(LL_WARNING,
            "Can't create readable event for SYNC: %s (%s)",
            strerror(errno), connGetInfo(conn, conninfo, sizeof(conninfo)));
        goto error;
    }

    server.repl_state = REPL_STATE_TRANSFER;
    server.repl_transfer_size = -1;
    server.repl_transfer_read = 0;
    server.repl_transfer_last_fsync_off = 0;
    server.repl_transfer_lastio = server.unixtime;
    return;

error:
    /* Release everything acquired so far and go back to the state in which
     * a new connection to the master has to be made. */
    if (dfd != -1) close(dfd);
    connClose(conn);
    server.repl_transfer_s = NULL;
    if (server.repl_transfer_fd != -1)
        close(server.repl_transfer_fd);
    if (server.repl_transfer_tmpfile)
        zfree(server.repl_transfer_tmpfile);
    server.repl_transfer_tmpfile = NULL;
    server.repl_transfer_fd = -1;
    server.repl_state = REPL_STATE_CONNECT;
    return;

write_error: /* Handle sendCommand() errors. */
    serverLog(LL_WARNING,"Sending command to master in replication handshake: %s", err);
    sdsfree(err);
    goto error;
}

该函数主要功能:

  • 给主节点发送认证信息,ip,端口等信息;
  • 尝试进行增量同步,如果可以就进行增量同步;如果不行就进行全量同步,使用readSyncBulkPayload读取主节点发送的rdb文件。

增量同步

// 处理客户端网络包
void readQueryFromClient(connection *conn)
  static int connSocketRead(connection *conn, void *buf, size_t buf_len) 
  void processInputBuffer(client *c) 
    // 从c->querybuf取一行数据,放入c->argv
    int processInlineBuffer(client *c)
    // 处理客户端命令
	   int processCommandAndResetClient(client *c)
	     // 进行一些检查后,执行命令
	     int processCommand(client *c)
		      // 调用命令处理函数c->cmd->proc
		      void call(client *c, int flags)
			      // 将命令写入到replication buff或者aof缓冲区
				     void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc,
               int flags)
			          // 将命令写入server.aof_buf 
              void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc)   
			          // 将命令写入server.repl_backlog
			          void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) 
  • 对于写命令,propagate函数会根据配置将命令写入replication buff和发送给从节点,或者写入aof缓冲区。
posted @ 2024-09-27 17:14  董少奇  阅读(62)  评论(0)    收藏  举报