Larbin源代码分析[9]NAMEDSITE站点的抽象类

一 类的主要功能

         NamedSite 该类为一个主站点的抽象类,主站点类似www.baidu.com.

二 类的主要成员变量

    /* name of the site */

    (1)char name[maxSiteSize];               //主站点的名称

    (2)uint16_t port;                         //主站点的端口号

    (3)uint16_t nburls;                //内存中该站点的所有url的数目 

    (4)url *fifo[maxUrlsBySite];                //内存中该站点的url缓冲区队列

    (5)uint8_t inFifo;                 //内存中循环队列的队尾(写入端):新url插入的位置

    (6)uint8_t outFifo;                 //内存中循环队列的队首(读取端):下一个被取出的url的位置;队列长度为 (inFifo - outFifo + maxUrlsBySite) % maxUrlsBySite

    (7)bool isInFifo;                 //该站点是否在dnsSites中,即已经遍历完毕

    (8)char dnsState;                             //dns的状态

    (9)/** internet addr of this server */

        char dnsState;                         //dns解析的状态(与上面(8)是同一个成员,此处为重复列出)

        struct in_addr addr;                         //ipv4地址

        uint ipHash;                         //iphash值

    (10) time_t dnsTimeout;                            /* Date of expiration of dns call and robots.txt fetch */ //dns解析结果以及robots.txt抓取结果的过期时间

    (11) Vector<char> forbidden;                  /** test if a file can be fetched thanks to the robots.txt */ //robots.txt文件中禁止访问的url

三 成员函数

    (1)NamedSite构造函数

                   实质是将上述各个成员变量进行初始化操作。

    (2)putInFifo(url *u)  将u插入进队列fifo中,队列为循环队列。

    (3)url *NamedSite::getInFifo() 取出并返回fifo队头的url

    (4)int fifoLength() {(inFifo - outFifo + maxUrlsBySite) % maxUrlsBySite ;}

    (5)putGenericUrl(url *u, int limit, bool prio)     //将获取的url插入进入站点的fifo队列中

                   //当当前的nburls的数量大于限制的时候,需要根据优先级将其存储在URLsPriorityWait队列或者URLsDiskWait队列中。        

                   函数实现方法如下:

                   if (nburls > maxUrlsBySite-limit) {                           //当前内存中的nburls数目,已经超过该限制了,下面就判断是否需要忽略该URL

                            if (!strcmp(name, u->getHost())) {                             //首先判断当前的站点名称和url的host名称是否一致

                                     if (dnsState == errorDns) {                   //如果当前的dns请求状态是 errorDns,则进行忽略操作

                                               nburls++;

                    forgetUrl(u, noDNS);

                    return;

                    }

                if (dnsState == noConnDns) {                             //当当前的dns状态为无连接,则进行URL忽略

                                               nburls++;

                    forgetUrl(u, noConnection);

                    return;

                }

                if (u->getPort() == port                           //如果当前的url在对应网站的forbid列表中,则将对应的url加入忽略

                                                        && dnsState == doneDns && !testRobots(u->getFile())) {

                                                        nburls++;

                                                        forgetUrl(u, forbiddenRobots);

                                                        return;

                    }

                }          

            //执行到此处,说明该URL不能被直接忽略(URL的host名称与当前站点名称不同,或者名称相同但未命中上面任何一个忽略条件),则需要记录(此时仍然无法放入fifo队列中,需要放入等待队列中)

                            refUrl();

                            global::inter->getOne(); //实际上inter表示全部站点中,fifo队列的数量

                            if (prio) {

                                     global::URLsPriorityWait->put(u);

                            } else {

                                     global::URLsDiskWait->put(u);

                            }

        }

                   //此处没有超出fifo队列的大小限制

                   else {

                            nburls++;

                            if (dnsState == waitDns

                                     || strcmp(name, u->getHost())

                                     || port != u->getPort()

                                     || global::now > dnsTimeout) {

                            // dns not done or other site

                            putInFifo(u);  //将此url放入进fifo中

                            addNamedUrl(); //将namesites中的url数目增加

                            // Put Site in fifo if not yet in

                            if (!isInFifo) { //如果当前该NamedSite还没在dnsSites中,则进行添加操作

                                     isInFifo = true;

                                     global::dnsSites->put(this);

                            }

                   } else switch (dnsState) {

                            case doneDns:

                                     transfer(u);

                                     break;

                            case errorDns:

                                     forgetUrl(u, noDNS);

                                     break;

                            default: // noConnDns

                                     forgetUrl(u, noConnection);

                            }

                   }

  

  (6)void NamedSite::newQuery ()                             //该函数,主要用于发起一个新的dns请求

                   若请求成功,则调用dnsOK()。                      //该函数中,主要有处理代理地址,若没有解析到dns,则需要调用adns包中的请求函数。

  (7)void NamedSite::dnsAns (adns_answer *ans)  未看明白

 

(8)void NamedSite::dnsOK () {                             //请求各个站点的robots.txt文件

                   Connexion *conn = global::freeConns->get();         //从全局变量中获取空闲链接

                   char res = getFds(conn, &addr, port);                     //根据addr建立连接

                   if (res != emptyC) {

                            conn->timeout = timeoutPage;

                   if (global::proxyAddr != NULL) {

                            // use a proxy

                            conn->request.addString("GET http://");

                            conn->request.addString(name);

                            char tmp[15];

                            sprintf(tmp, ":%u", port);

                            conn->request.addString(tmp);

                            conn->request.addString("/robots.txt HTTP/1.0\r\nHost: ");

                   } else {

                            // direct connection

                            conn->request.addString("GET /robots.txt HTTP/1.0\r\nHost: ");

                   }

                   conn->request.addString(name);

                   conn->request.addString(global::headersRobots);  //具体请求报文,在global.cc中

                   conn->parser = new robots(this, conn);  //应该在file文件中,构建一个解析类robots

                   conn->pos = 0;

                   conn->err = success;

                   conn->state = res;

         } else {

                   // Unable to get a socket

                   global::freeConns->put(conn);

                   dnsState = noConnDns;

                   dnsErr();

         }

}

(9)bool NamedSite::testRobots(char *file)      //判断该文件是否在robots.txt中的屏蔽列表中

(10)NamedSite::robotsResult (FetchError res)         //获取robots.txt ,然后扫描fifo队列,

         将其中与namesite类的站点名称以及端口号相同的transfer到 Ipsite中,需要计算其Iphash

(11)NamedSite::transfer (url *u)将站点地址copy到url中,然后判断是否在robots.txt的列表中

         若在则执行forgetUrl,否则添加到global::IPSiteList[ipHash].putUrl(u);中。

 

四 综上:

    该类主要是站点类的抽象,涉及到了 请求DNS ,请求robots.txt ,打开connexion中的链接。

posted on 2011-10-24 17:35  zhoulinhu  阅读(384)  评论(0)  编辑  收藏  举报

导航