memcached set命令的大致处理逻辑笔记

这次记录状态机的主要逻辑,跟踪set命令的执行流程,暂不涉及到内存申请这一块,下面内容基本都是代码注释

首先补充了解一下:从客户端发送数据,到数据被处理并返回结果的整个过程中,conn 会经历的各种状态及其含义:

/* States a connection (conn) moves through inside drive_machine. */
enum conn_states {
conn_listening, /** only the listening socket is ever in this state */
conn_new_cmd, /** waiting for the next command; a newly created client connection also starts in this state */
conn_waiting, /** waiting for data to read */
conn_read, /** reading command data (the initial read) */
conn_parse_cmd, /** trying to parse a command out of the buffer that was read */
conn_write, /** writing out a simple response */
conn_nread, /** reading / about to read a fixed n bytes, i.e. the amount is already known; used to read the value of a set command */
conn_swallow, /** swallowing unnecessary bytes w/o storing; process_update_command selects it through write_and_go when item allocation fails */
conn_closing, /** closing the connection */
conn_mwrite, /** writing out multiple items sequentially */
conn_closed, /** marks a connection that has already been closed */
conn_max_state /**< Max state value (used for assertion) */
};

下面从状态机开始看,具体是从 libevent 通知有数据可读开始,此时连接进入 conn_read 状态:

/*
 * Connection state machine (annotated excerpt that follows a "set" command).
 * Loops over c->state until one of the cases sets stop; the function then
 * returns so the connection can wait for its next event.
 * The conn_listening case and a few other parts are elided ("...") in this
 * excerpt, so the text below is not compilable as-is.
 */
static void drive_machine(conn *c) {
bool stop = false;
int sfd;
socklen_t addrlen;
struct sockaddr_storage addr;
int nreqs = settings.reqs_per_event;
int res;
const char *str;

while (!stop) {

switch(c->state) {
case conn_listening:
... /** connection-accepting logic, omitted in these notes */
break;

case conn_waiting:
/** in this state all that can be done is to register for the readable event with libevent */
if (!update_event(c, EV_READ | EV_PERSIST)) {
if (settings.verbose > 0)
fprintf(stderr, "Couldn't update event\n");
conn_set_state(c, conn_closing);
break;
}
/** once registered, the next state is conn_read */
conn_set_state(c, conn_read);
stop = true; /** leave the state machine and wait for the event; execution resumes in conn_read */
break;

/** notified that data has arrived; reading starts here */
case conn_read:
/** these notes follow the TCP path: see the annotations on try_read_network */
res = IS_UDP(c->transport) ? try_read_udp(c) : try_read_network(c);

switch (res) {
case READ_NO_DATA_RECEIVED: /** nothing was read; wait for the next readable event */ conn_set_state(c, conn_waiting); break; case READ_DATA_RECEIVED: /** data was read */ conn_set_state(c, conn_parse_cmd); /** go on to the command-parsing state */ break; case READ_ERROR: /** the read failed; close the connection */ conn_set_state(c, conn_closing); break; case READ_MEMORY_ERROR: /* growing rbuf failed: Failed to allocate more memory */ /* State already set by try_read_network */ break; } /** stop is not set here, so the loop simply continues with the state chosen above */ break; /** reached after a successful read above */ case conn_parse_cmd : /** hand over to the command parser */ if (try_read_command(c) == 0) { /* no complete command could be parsed yet: wee need more data! */ conn_set_state(c, conn_waiting); /** go and register for the read event */ } /** again, stop is not set here */ /** for set, a successfully parsed command leaves the connection in conn_nread (set by process_update_command) to read the value bytes */ break; case conn_new_cmd: /* Only process nreqs at a time to avoid starving other connections */ /** at most nreqs commands are handled per drive_machine call, so other client connections are not starved */ --nreqs; /** one unit of budget per new command */ if (nreqs >= 0) { /** budget left */ reset_cmd_handler(c); /* loop again; per the note's author a freshly accepted connection has no buffered data yet, so the next state is conn_waiting (reset_cmd_handler is not shown here) */ } else { pthread_mutex_lock(&c->thread->stats.mutex); c->thread->stats.conn_yields++; pthread_mutex_unlock(&c->thread->stats.mutex); if (c->rbytes > 0) { /* We have already read in data into the input buffer, so libevent will most likely not signal read events on the socket (unless more data is available. 
As a hack we should just put in a request to write data, because that should be possible ;-) */ /** * Note author's reading: data is already buffered but this call's budget is used up, and libevent * may not report the socket as readable again for bytes that were already read (the original note * attributes this to Linux epoll ET - TODO confirm). Registering for EV_WRITE is the trick that * gets the connection called back so the buffered data is processed. */ if (!update_event(c, EV_WRITE | EV_PERSIST)) { if (settings.verbose > 0) fprintf(stderr, "Couldn't update event\n"); conn_set_state(c, conn_closing); break; } } stop = true; /** nreqs is re-initialised from settings.reqs_per_event at the top of the next drive_machine call (the original note says the default is 20; not shown here) */ } break; /** reached after a set command line was parsed successfully */ case conn_nread: if (c->rlbytes == 0) { complete_nread(c); /** nothing left to read: finish in complete_nread; per the note's author out_string() then moves c on towards conn_write (complete_nread is not shown here) */ break; /** out_string() is annotated further down in these notes */ } /* sanity check: Check if rbytes < 0, to prevent crash (the code actually tests rlbytes) */ if (c->rlbytes < 0) { if (settings.verbose) { fprintf(stderr, "Invalid rlbytes to read: len %d\n", c->rlbytes); } conn_set_state(c, conn_closing); break; } /* first check if we have leftovers in the conn_read buffer */ /** more value bytes are needed; first see whether unparsed bytes are already buffered */ if (c->rbytes > 0) { int tocopy = c->rbytes > c->rlbytes ? c->rlbytes : c->rbytes; /** min(rbytes, rlbytes) */ if (c->ritem != c->rcurr) { /** copy tocopy unparsed bytes to c->ritem; * c->ritem points at the data area of the allocated item (set from ITEM_data(it) in process_update_command), * so the value is placed in the item directly 
*/ memmove(c->ritem, c->rcurr, tocopy); } c->ritem += tocopy; c->rlbytes -= tocopy; c->rcurr += tocopy; c->rbytes -= tocopy; if (c->rlbytes == 0) { break; /** value complete: leave the switch; the loop re-enters conn_nread and the rlbytes == 0 branch above calls complete_nread() */ } } /** the buffered bytes were not enough; read the rest from the socket */ /* now try reading from the socket */ /** read straight into c->ritem */ res = read(c->sfd, c->ritem, c->rlbytes); if (res > 0) { pthread_mutex_lock(&c->thread->stats.mutex); c->thread->stats.bytes_read += res; pthread_mutex_unlock(&c->thread->stats.mutex); if (c->rcurr == c->ritem) { c->rcurr += res; } c->ritem += res; c->rlbytes -= res; /** keep looping until rlbytes == 0 */ break; } if (res == 0) { /* end of stream */ conn_set_state(c, conn_closing); break; } if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) { if (!update_event(c, EV_READ | EV_PERSIST)) { if (settings.verbose > 0) fprintf(stderr, "Couldn't update event\n"); conn_set_state(c, conn_closing); break; } stop = true; break; } /* otherwise we have a real error, on which we close the connection */ if (settings.verbose > 0) { fprintf(stderr, "Failed to read, and not due to blocking:\n" "errno: %d %s \n" "rcurr=%lx ritem=%lx rbuf=%lx rlbytes=%d rsize=%d\n", errno, strerror(errno), (long)c->rcurr, (long)c->ritem, (long)c->rbuf, (int)c->rlbytes, (int)c->rsize); } conn_set_state(c, conn_closing); break; case conn_swallow: /* we are reading sbytes and throwing them away */ if (c->sbytes == 0) { conn_set_state(c, conn_new_cmd); break; } /* first check if we have leftovers in the conn_read buffer */ if (c->rbytes > 0) { int tocopy = c->rbytes > c->sbytes ? c->sbytes : c->rbytes; c->sbytes -= tocopy; c->rcurr += tocopy; c->rbytes -= tocopy; break; } /* now try reading from the socket */ res = read(c->sfd, c->rbuf, c->rsize > c->sbytes ? 
c->sbytes : c->rsize); if (res > 0) { pthread_mutex_lock(&c->thread->stats.mutex); c->thread->stats.bytes_read += res; pthread_mutex_unlock(&c->thread->stats.mutex); c->sbytes -= res; break; } if (res == 0) { /* end of stream */ conn_set_state(c, conn_closing); break; } if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) { if (!update_event(c, EV_READ | EV_PERSIST)) { if (settings.verbose > 0) fprintf(stderr, "Couldn't update event\n"); conn_set_state(c, conn_closing); break; } stop = true; break; } /* otherwise we have a real error, on which we close the connection */ if (settings.verbose > 0) fprintf(stderr, "Failed to read, and not due to blocking\n"); conn_set_state(c, conn_closing); break; /** responses queued by out_string() end up here (out_string sets the state to conn_write) */ case conn_write: /* * We want to write out a simple response. If we haven't already, * assemble it into a msgbuf list (this will be a single-entry * list for TCP or a two-entry list for UDP). */ if (c->iovused == 0 || (IS_UDP(c->transport) && c->iovused == 1)) { if (add_iov(c, c->wcurr, c->wbytes) != 0) { if (settings.verbose > 0) fprintf(stderr, "Couldn't build response\n"); conn_set_state(c, conn_closing); break; } } /* fall through... 
*/ /** no break: execution continues into the conn_mwrite case below */ case conn_mwrite: if (IS_UDP(c->transport) && c->msgcurr == 0 && build_udp_headers(c) != 0) { if (settings.verbose > 0) fprintf(stderr, "Failed to build UDP headers\n"); conn_set_state(c, conn_closing); break; } /** transmit() does the actual sending; its result selects the next step */ switch (transmit(c)) { case TRANSMIT_COMPLETE: if (c->state == conn_mwrite) { conn_release_items(c); /* XXX: I don't know why this wasn't the general case */ if(c->protocol == binary_prot) { conn_set_state(c, c->write_and_go); } else { conn_set_state(c, conn_new_cmd); } } else if (c->state == conn_write) { if (c->write_and_free) { free(c->write_and_free); c->write_and_free = 0; } conn_set_state(c, c->write_and_go); /** switch to the state chosen for after the write */ } else { if (settings.verbose > 0) fprintf(stderr, "Unexpected state %d\n", c->state); conn_set_state(c, conn_closing); } break; case TRANSMIT_INCOMPLETE: case TRANSMIT_HARD_ERROR: break; /* Continue in state machine. */ case TRANSMIT_SOFT_ERROR: stop = true; break; } break; case conn_closing: if (IS_UDP(c->transport)) conn_cleanup(c); else conn_close(c); stop = true; break; case conn_closed: /* This only happens if dormando is an idiot. */ abort(); break; case conn_max_state: assert(false); break; } } return; } /* * read from network as much as we can, handle buffer overflow and connection * close. * before reading, move the remaining incomplete fragment of a command * (if any) to the beginning of the buffer. * * To protect us from someone flooding a connection with bogus data causing * the connection to eat up all available memory, break out and start looking * at the data I've got after a number of reallocs... 
* * @return enum try_read_result */ /** * Annotator's summary: read as much as possible from the socket, growing rbuf whenever it is full. * Before reading, any unparsed remainder is moved to the FRONT of rbuf. * The number of reallocs per call is capped (num_allocs) so that bogus input cannot make a single * call grow the buffer without bound. */ static enum try_read_result try_read_network(conn *c) { enum try_read_result gotdata = READ_NO_DATA_RECEIVED; /** start out as "no data received" */ int res; int num_allocs = 0; /** number of times rbuf has been reallocated during this call */ assert(c != NULL); /** rcurr has advanced past the start of rbuf */ if (c->rcurr != c->rbuf) { if (c->rbytes != 0) /* move any unparsed remainder to the front of rbuf; otherwise there's nothing to copy */ memmove(c->rbuf, c->rcurr, c->rbytes); c->rcurr = c->rbuf; /** rcurr points at rbuf again, so the unparsed bytes start at the front of the buffer */ } /** now read */ while (1) { if (c->rbytes >= c->rsize) { /** buffer is full (rbytes >= rsize): rbuf has to grow */ if (num_allocs == 4) { /** cap reached: num_allocs counts up from 0, so at most 4 reallocs happen per call; return whatever has been read so far */ return gotdata; /** each realloc doubles rsize, so one call grows rbuf by at most 16x (the original note's "5 reallocs, 2k up to 64k" figure does not match this check; the initial size is not shown here) */ } ++num_allocs; char *new_rbuf = realloc(c->rbuf, c->rsize * 2); /** double the buffer */ if (!new_rbuf) { /** realloc failed */ STATS_LOCK(); stats.malloc_fails++; STATS_UNLOCK(); if (settings.verbose > 0) { fprintf(stderr, "Couldn't realloc input buffer\n"); } c->rbytes = 0; /* ignore what we read */ out_of_memory(c, "SERVER_ERROR out of memory reading request"); c->write_and_go = conn_closing; return READ_MEMORY_ERROR; } /** realloc succeeded */ c->rcurr = c->rbuf = new_rbuf; c->rsize *= 2; /** record the new size */ } int avail = c->rsize - c->rbytes; /** free space = total size - unparsed bytes */ res = read(c->sfd, c->rbuf + c->rbytes, avail); if (res > 0) { pthread_mutex_lock(&c->thread->stats.mutex); c->thread->stats.bytes_read += res; /** per-thread counter of bytes read */ pthread_mutex_unlock(&c->thread->stats.mutex); gotdata = READ_DATA_RECEIVED; c->rbytes += res; /** the unparsed length grows by what was read */ if (res == avail) {/** the buffer was filled exactly: loop again, which grows rbuf before the next read */ continue; } else { break; /** short read: done, READ_DATA_RECEIVED is returned */ } } if (res == 0) { /** read() returned 0: end of stream (the peer closed the connection), reported as READ_ERROR */ return READ_ERROR; } if (res == -1) {/** read() failed: EAGAIN / EWOULDBLOCK only means no more data right now, anything else is a real error */ if (errno == EAGAIN || errno == EWOULDBLOCK) { break; } return READ_ERROR; } } return gotdata; } /* * if we have a complete line in the buffer, process it. 
*/ /** * In the visible ASCII branch: returns 0 when more data is needed and 1 otherwise; * drive_machine moves to conn_waiting when 0 is returned. */ static int try_read_command(conn *c) { assert(c != NULL); assert(c->rcurr <= (c->rbuf + c->rsize)); assert(c->rbytes > 0); /** skipped in these notes */ if (c->protocol == negotiating_prot || c->transport == udp_transport) { ... } /** skipped in these notes */ if (c->protocol == binary_prot) { ... } else { /** the notes follow this remaining case (c->protocol == ascii_prot, per the note's author) */ char *el, *cont; if (c->rbytes == 0) /** nothing to parse */ return 0; /** e.g. "set bico 0 0 5\r\nhello\r\n": the first '\n' separates the command line from the value */ el = memchr(c->rcurr, '\n', c->rbytes); /** look for the '\n' that ends the command line */ if (!el) { /** not found */ if (c->rbytes > 1024) { /* * We didn't have a '\n' in the first k. This _has_ to be a * large multiget, if not we should just nuke the connection. */ char *ptr = c->rcurr; while (*ptr == ' ') { /* ignore leading whitespaces */ ++ptr; } if (ptr - c->rcurr > 100 || (strncmp(ptr, "get ", 4) && strncmp(ptr, "gets ", 5))) { conn_set_state(c, conn_closing); return 1; } } return 0; } /** command terminator found */ cont = el + 1; /** cont points just past the '\n' */ if ((el - c->rcurr) > 1 && *(el - 1) == '\r') { el--; /** el now points at the '\r' */ } /** * Overwrite the terminator ('\r' if present, otherwise '\n') with '\0' so that rcurr is a C string holding * only the command line, e.g. for "set bico 0 0 5\r\nhello\r\n" it reads "set bico 0 0 5". */ *el = '\0'; assert(cont <= (c->rcurr + c->rbytes)); c->last_cmd_time = current_time; /** dispatch the command line, e.g. "set bico 0 0 5" */ process_command(c, c->rcurr); /** see process_command() */ c->rbytes -= (cont - c->rcurr); /** consume the command line from the unparsed count */ c->rcurr = cont; /** rcurr now points at what follows the command line (the value, for set) */ assert(c->rcurr <= (c->rbuf + c->rsize)); } return 1; } /** * process_command tokenizes the command line and dispatches to the matching handler. */ static void process_command(conn *c, char *command) { token_t tokens[MAX_TOKENS]; /** filled by tokenize_command below */ size_t ntokens; int comm; /* * for commands set/add/replace, we build an item and read the data * directly into it, then continue in nread_complete(). 
*/ c->msgcurr = 0; c->msgused = 0; c->iovused = 0; if (add_msghdr(c) != 0) { out_of_memory(c, "SERVER_ERROR out of memory preparing response"); return; } /** * Per the note's author, tokenize_command splits command on spaces into the tokens array * (MAX_TOKENS is passed as the limit; the original note says 8). * e.g. "set bico 0 0 5": set is the command, bico the key, 0 the flags, 0 the expiry, 5 the value length, * giving tokens = {"set", "bico", "0", "0", "5", NULL}. */ ntokens = tokenize_command(command, tokens, MAX_TOKENS); if (ntokens >= 3 && ((strcmp(tokens[COMMAND_TOKEN].value, "get") == 0) || (strcmp(tokens[COMMAND_TOKEN].value, "bget") == 0))) { /** get command handling */ process_get_command(c, tokens, ntokens, false); } else if ((ntokens == 6 || ntokens == 7) && (strcmp(tokens[COMMAND_TOKEN].value, "set") == 0 && (comm = NREAD_SET)) || ... (strcmp(tokens[COMMAND_TOKEN].value, "append") == 0 && (comm = NREAD_APPEND)) )) { /** a set command lands here and goes into process_update_command */ process_update_command(c, tokens, ntokens, comm, false); } else ... } return; } /** * Handles the storage commands (set and friends): validates the command line, allocates the item and * switches the connection to conn_nread to receive the value. Parts are elided ("...") in this excerpt. */ static void process_update_command(conn *c, token_t *tokens, const size_t ntokens, int comm, bool handle_cas) { char *key; size_t nkey; unsigned int flags; int32_t exptime_int = 0; time_t exptime; int vlen; /** reject keys longer than KEY_MAX_LENGTH (250 bytes per the original note; the value is not shown here) */ if (tokens[KEY_TOKEN].length > KEY_MAX_LENGTH) { out_string(c, "CLIENT_ERROR bad command line format"); return; } /** the key and its length */ key = tokens[KEY_TOKEN].value; nkey = tokens[KEY_TOKEN].length; /** parse the remaining fields: flags, exptime, value length */ if (! (safe_strtoul(tokens[2].value, (uint32_t *)&flags) && safe_strtol(tokens[3].value, &exptime_int) && safe_strtol(tokens[4].value, (int32_t *)&vlen))) { out_string(c, "CLIENT_ERROR bad command line format"); return; } /* Ubuntu 8.04 breaks when I pass exptime to safe_strtol */ exptime = exptime_int; /* Negative exptimes can underflow and end up immortal. realtime() will immediately expire values that are greater than REALTIME_MAXDELTA, but less than process_started, so lets aim for that. 
*/ if (exptime < 0) exptime = REALTIME_MAXDELTA + 1; // does cas value exist? if (handle_cas) { if (!safe_strtoull(tokens[5].value, &req_cas_id)) { out_string(c, "CLIENT_ERROR bad command line format"); return; } } vlen += 2; /** +2 for the trailing '\r\n'. NOTE(review): vlen is an int parsed from client input; adding 2 before the range check can overflow for values near INT_MAX - checking the range first would be safer */ if (vlen < 0 || vlen - 2 < 0) { out_string(c, "CLIENT_ERROR bad command line format"); return; } ... /** allocate an item; this is where memory management comes in, which these notes leave for a later post */ it = item_alloc(key, nkey, flags, realtime(exptime), vlen); if (it == 0) { /** allocation failed */ if (! item_size_ok(nkey, flags, vlen)) out_string(c, "SERVER_ERROR object too large for cache"); else out_of_memory(c, "SERVER_ERROR out of memory storing object"); /* swallow the data line */ c->write_and_go = conn_swallow; c->sbytes = vlen; /* Avoid stale data persisting in cache because we failed alloc. * Unacceptable for SET. Anywhere else too? */ if (comm == NREAD_SET) {
it = item_get(key, nkey); if (it) { item_unlink(it); item_remove(it); } } return; } /** allocation succeeded */ ITEM_set_cas(it, req_cas_id); c->item = it; c->ritem = ITEM_data(it); /** ritem points at the item's data area, so conn_nread can read the value straight into the item */ c->rlbytes = it->nbytes; /** bytes still to read: it->nbytes (the value length per the note's author; the item layout is left for a later post) */ c->cmd = comm; /** the command currently being processed */ conn_set_state(c, conn_nread); /** next state: read the value bytes in conn_nread; control then returns to process_command */ } /** Queue a one-line response for the client: copies str plus "\r\n" into wbuf and switches to conn_write. */ static void out_string(conn *c, const char *str) { size_t len; ... /* Nuke a partial output... */ c->msgcurr = 0; c->msgused = 0; c->iovused = 0; add_msghdr(c); len = strlen(str); if ((len + 2) > c->wsize) { /* ought to be always enough. just fail for simplicity */ str = "SERVER_ERROR output line too long"; len = strlen(str); } memcpy(c->wbuf, str, len); /** copy the response text into wbuf */ memcpy(c->wbuf + len, "\r\n", 2); c->wbytes = len + 2; c->wcurr = c->wbuf; conn_set_state(c, conn_write); /** the state machine continues in the conn_write case */ c->write_and_go = conn_new_cmd; /** state to enter once the write has completed */ return; }

以上注释主要围绕 set 这条比较简单的命令展开,这样比较容易理清整个处理流程。

下一次随笔主要是关注内存管理这一块。

才疏学浅,理解上可能会有偏差出错,欢迎指出,谢谢!

posted @ 2014-07-02 22:58  Bico 笔记  阅读(991)  评论(0)  编辑  收藏  举报