Larbin Code Analysis

To decide whether a URL has already been crawled, larbin uses what people online usually call a Bloom filter. It looks quite different, though, from the algorithm described in 《数学之美系列二十一 - 布隆过滤器(Bloom Filter)》 [The Beauty of Mathematics #21: Bloom Filters]: larbin applies just one hash function, which makes it more like a plain bitmap. My grasp of these algorithms is limited, so take what follows as one reader's opinion.
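To make the contrast concrete, here is a minimal sketch of both schemes in standalone C++ (my own illustration, not larbin code; the double-hashing trick in the Bloom variant is an assumption chosen for brevity). larbin's hashTable behaves like the Bitmap class below: one hash function, one bit per URL. A genuine Bloom filter derives k bit positions per key, which buys a much lower false-positive rate for the same table size.

#include <functional>
#include <string>
#include <vector>

// One hash, one bit: essentially larbin's scheme.
class Bitmap {
    std::vector<unsigned char> table;
public:
    explicit Bitmap(std::size_t nbits) : table(nbits / 8 + 1, 0) {}
    bool testSet(const std::string &key) {
        std::size_t h = std::hash<std::string>()(key) % (table.size() * 8);
        unsigned char bit = 1 << (h % 8);
        bool seen = table[h / 8] & bit;
        table[h / 8] |= bit;
        return !seen; // true if the key was new
    }
};

// k hashes per key: a real Bloom filter.
class BloomFilter {
    std::vector<unsigned char> table;
    int k; // number of hash functions
public:
    BloomFilter(std::size_t nbits, int k) : table(nbits / 8 + 1, 0), k(k) {}
    bool testSet(const std::string &key) {
        // derive k indices by double hashing (an illustrative choice)
        std::size_t h1 = std::hash<std::string>()(key);
        std::size_t h2 = h1 * 0x9e3779b9u + 1;
        bool seen = true;
        for (int i = 0; i < k; i++) {
            std::size_t h = (h1 + i * h2) % (table.size() * 8);
            unsigned char bit = 1 << (h % 8);
            if (!(table[h / 8] & bit)) seen = false;
            table[h / 8] |= bit;
        }
        return !seen;
    }
};

Either way the structure can report "seen" for a URL that was never added (a collision), but it can never report "new" for one that was.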

First, the hashTable class definition:

class hashTable {
private:
    ssize_t size;
    char *table;

public:
    /* constructor */
    hashTable(bool create);

    /* destructor */
    ~hashTable();

    /* save the hashTable in a file */
    void save();

    /* test if this url is allready in the hashtable
     * return true if it has been added
     * return false if it has allready been seen
     */
    bool test(url *U);

    /* set a url as present in the hashtable
     */
    void set(url *U);

    /* add a new url in the hashtable
     * return true if it has been added
     * return false if it has allready been seen
     */
    bool testSet(url *U);
};

It has just two member variables: its size and its contents. save writes the hashTable out to a file; test asks whether a url is already present in the hashtable; set marks a url as seen; testSet reports whether the url had been added before, adding it in the process.

/* constructor */
hashTable::hashTable(bool create) {
    ssize_t total = hashSize / 8;
    table = new char[total];
    if (create) {
        for (ssize_t i = 0; i < hashSize / 8; i++) {
            table[i] = 0;
        }
    } else {
        int fds = open("hashtable.bak", O_RDONLY);
        if (fds < 0) {
            cerr << "Cannot find hashtable.bak, restart from scratch\n";
            for (ssize_t i = 0; i < hashSize / 8; i++) {
                table[i] = 0;
            }
        } else {
            ssize_t sr = 0;
            while (sr < total) {
                ssize_t tmp = read(fds, table + sr, total - sr);
                if (tmp <= 0) {
                    cerr << "Cannot read hashtable.bak : "
                         << strerror(errno) << endl;
                    exit(1);
                } else {
                    sr += tmp;
                }
            }
            close(fds);
        }
    }
}

hashSize is defined in types.h as 64,000,000, so the table occupies 64,000,000 / 8 = 8 MB. When creating a fresh hashtable, every byte of table is zeroed; otherwise the contents are read back from hashtable.bak.

/* save the hashTable in a file */
void hashTable::save() {
    rename("hashtable.bak", "hashtable.old");
    int fds = creat("hashtable.bak", 00600);
    if (fds >= 0) {
        ecrireBuff(fds, table, hashSize / 8);
        close(fds);
    }
    unlink("hashtable.old");
}

écrire is French for "write": ecrireBuff writes a char* buffer out to a file. unlink decrements the file's link count, and the file is only really deleted once the count reaches 0 and no process still has it open, whereas remove deletes the file outright.

/* test if this url is allready in the hashtable
 * return true if it has been added
 * return false if it has allready been seen
 */
bool hashTable::test(url *U) {
    unsigned int code = U->hashCode();
    unsigned int pos = code / 8;
    unsigned int bits = 1 << (code % 8);
    return table[pos] & bits;
}

This computes the URL's hash code, then which byte it falls into (pos), then which bit inside that byte (bits), and finally whether that bit has already been set to 1.
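For example, if hashCode() returns 12345, then pos = 12345 / 8 = 1543 and bits = 1 << (12345 % 8) = 1 << 1 = 2, so test examines bit 1 of byte 1543. The hash code itself comes from url::hashCode: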

/* return a hashcode for this url */
uint url::hashCode() {
    unsigned int h = port;
    unsigned int i = 0;
    while (host[i] != 0) {
        h = 31 * h + host[i];
        i++;
    }
    i = 0;
    while (file[i] != 0) {
        h = 31 * h + file[i];
        i++;
    }
    return h % hashSize;
}

The hash mixes the host name first and the file part after it. Nutch computes this kind of hash on the reversed form, so that URLs from the same host end up with more widely scattered hash codes. I wonder whether that works better.

/* set a url as present in the hashtable
 */
void hashTable::set(url *U) {
    unsigned int code = U->hashCode();
    unsigned int pos = code / 8;
    unsigned int bits = 1 << (code % 8);
    table[pos] |= bits;
}

This is almost identical to test; the only difference is the final OR, which sets the bit.
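The post never quotes testSet, which is the variant check actually calls. Judging from test and set, it presumably folds both into a single bit operation, along these lines (my reconstruction, not verified against the source):

bool hashTable::testSet(url *U) {
    unsigned int code = U->hashCode();
    unsigned int pos = code / 8;
    unsigned int bits = 1 << (code % 8);
    int res = table[pos] & bits; // was the bit already set?
    table[pos] |= bits;          // mark it seen either way
    return !res;                 // true only for a brand-new url
}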

checker.h declares two functions:

/** check if an url is already known
 * if not send it
 * @param u the url to check
 */
void check(url *u);

/** Check the extension of an url
 * @return true if it might be interesting, false otherwise
 */
bool filter1(char *host, char *file);

check is implemented as follows:

void check(url *u) {
    if (global::seen->testSet(u)) {
        hashUrls(); // stat
        // where should this link go ?
#ifdef SPECIFICSEARCH
        if (privilegedExts[0] != NULL
                && matchPrivExt(u->getFile())) {
            interestingExtension();
            global::URLsPriority->put(u);
        } else {
            global::URLsDisk->put(u);
        }
#else // not a SPECIFICSEARCH
        global::URLsDisk->put(u);
#endif
    } else {
        // This url has already been seen
        answers(urlDup);
        delete u;
    }
}

global::seen is a hashTable object; it is what decides whether a URL has been seen before.

Here SPECIFICSEARCH decides whether the url matches the formats we are after: matching urls go through URLsPriority, everything else through URLsDisk.

filter1 is implemented as follows:

/** Check the extension of an url
 * @return true if it might be interesting, false otherwise
 */
bool filter1(char *host, char *file) {
    int i = 0;
    if (global::domains != NULL) {
        bool ok = false;
        while ((*global::domains)[i] != NULL) {
            ok = ok || endWith((*global::domains)[i], host);
            i++;
        }
        if (!ok) {
            return false;
        }
    }
    i = 0;
    int l = strlen(file);
    if (endWithIgnoreCase("html", file, l) || file[l - 1] == '/'
            || endWithIgnoreCase("htm", file, l)) {
        return true;
    }
    while (global::forbExt[i] != NULL) {
        if (endWithIgnoreCase(global::forbExt[i], file, l)) {
            return false;
        }
        i++;
    }
    return true;
}

The first half checks that the host ends in one of the configured domains; the second half rejects extensions the configuration forbids. .html, .htm, and paths ending in '/' always pass.
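For example, with limitToDomain .fr .dk .uk enabled and the forbiddenExtensions list shown below (host names made up for illustration): filter1("www.example.fr", "/index.html") returns true, since the domain matches and .html always passes; filter1("www.example.com", "/index.html") fails the domain check; and filter1("www.example.fr", "/song.mp3") returns false because .mp3 is a forbidden extension.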

The configuration lives in larbin.conf:

# Do you want to limit your search to a specific domain ?
# if yes, uncomment the following line
#limitToDomain .fr .dk .uk end

# What are the extensions you surely don't want
# never forbid .html, .htm and so on : larbin needs them
forbiddenExtensions
.tar .gz .tgz .zip .Z .rpm .deb
.ps .dvi .pdf
.png .jpg .jpeg .bmp .smi .tiff .gif
.mov .avi .mpeg .mpg .mp3 .qt .wav .ram .rm
.jar .java .class .diff
.doc .xls .ppt .mdb .rtf .exe .pps .so .psd
end

 

Right at the start of main (in main.cc), global's constructor runs. One line in it calls parseFile, which parses the larbin.conf configuration file. The options it understands are:

"UserAgent": the User-Agent string
"From": the operator's e-mail address
"startUrl": the URL(s) to start crawling from
"waitDuration": the interval between two visits to the same server
"proxy": proxy server settings
"pagesConnexions": maximum number of parallel page connections
"dnsConnexions": maximum number of parallel DNS connections
"httpPort": port of the built-in status server for watching the crawl
"inputPort": telnet port for feeding URLs and other input into larbin
"depthInSite": how deep to crawl within a site
"limitToDomain": restrict crawling to the listed domains
"forbiddenExtensions": extensions that must not be crawled
"noExternalLinks": do not follow URLs pointing off the current site

The handling of startUrl in particular looks like this:

else if (!strcasecmp(tok, "startUrl")) {
    tok = nextToken(&posParse);
    url *u = new url(tok, global::depthInSite, (url *) NULL);
    if (u->isValid()) {
        check(u);
    }
}

isValid checks that host and file are not NULL and that the URL does not exceed the length limit. We have already been through check, except for global::URLsDisk->put(u). URLsDisk is initialized in global's constructor:

URLsDisk = new PersistentFifo(reload, fifoFile);

reload comes in as a parameter and says whether to pick up the previous, unfinished crawl; fifoFile is defined in types.h and is the base file name, "fifo". For simplicity, here is the PersistentFifo constructor with part of its code removed (the reload branch is elided):

PersistentFifo::PersistentFifo(bool reload, char *baseName) {
    fileNameLength = strlen(baseName) + 5;
    fileName = new char[fileNameLength + 2];
    strcpy(fileName, baseName);
    mypthread_mutex_init(&lock, NULL);
    if (reload) {
        // pick up the fifo files left over by the previous run
        // (this branch is the part the excerpt omits)
    } else {
        // Delete old fifos
        DIR *dir = opendir(".");
        struct dirent *name;
        name = readdir(dir);
        while (name != NULL) {
            if (startWith(fileName, name->d_name)) {
                unlink(name->d_name);
            }
            name = readdir(dir);
        }
        closedir(dir);

        makeName(0);
        wfds = creat(fileName, S_IRUSR | S_IWUSR);
        rfds = open(fileName, O_RDONLY);
    }
}

When not reloading, every file in the current directory whose name starts with the fifo base name is deleted. makeName then builds fileName, the file URLs will be written into, whose name ends in a number.
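makeName itself is not quoted in the post. Given that fileNameLength is strlen(baseName) + 5 and the files are said to be numbered, it plausibly overwrites a zero-padded counter after the base name; this is a hypothetical sketch, not the actual larbin code:

void PersistentFifo::makeName(int nb) {
    // hypothetical: write a 5-digit counter after the base name,
    // yielding names like "fifo00000", "fifo00001", ...
    for (int i = fileNameLength - 1; i >= fileNameLength - 5; i--) {
        fileName[i] = '0' + (nb % 10);
        nb /= 10;
    }
    fileName[fileNameLength] = 0;
}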

Here is the put function:

/** Put something in the fifo
 * The objet is then deleted
 */
void PersistentFifo::put(url *obj) {
    mypthread_mutex_lock(&lock);
    char *s = obj->serialize(); // statically allocated string
    writeUrl(s);
    in++;
    updateWrite();
    mypthread_mutex_unlock(&lock);
    delete obj;
}

Apart from the pthread locking, this serializes the url, writes (or rather buffers) it with writeUrl, and calls updateWrite, which decides whether enough URLs have accumulated to roll over to a new file.

// write an url in the out file (buffered write)
void PersistentFifo::writeUrl(char *s) {
    size_t len = strlen(s);
    assert(len < maxUrlSize + 40 + maxCookieSize);
    if (outbufPos + len < BUF_SIZE) {
        memcpy(outbuf + outbufPos, s, len);
        outbufPos += len;
    } else {
        // The buffer is full
        flushOut();
        memcpy(outbuf + outbufPos, s, len);
        outbufPos = len;
    }
}

This checks whether appending the url would overflow the buffer (BUF_SIZE); if it would, the buffered content is flushed first and the url starts the fresh buffer.
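flushOut is not quoted either. From its call sites, and from writeUrl setting outbufPos = len right after flushing, it presumably just drains the buffer into the current file and rewinds the position; a sketch under that assumption:

void PersistentFifo::flushOut() {
    ecrireBuff(wfds, outbuf, outbufPos); // write the whole buffer out
    outbufPos = 0;                       // refill from the beginning
}

updateWrite then takes care of rolling over to a new file: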

void PersistentFifo::updateWrite() {
    if ((in % urlByFile) == 0) {
        flushOut();
        close(wfds);
        makeName(++fin);
        wfds = creat(fileName, S_IRUSR | S_IWUSR);
    }
}

urlByFile is 10,000: after every 10,000 URLs written, output rolls over to a fresh file (makeName(++fin)).

The input function called from main is a tiny server through which the user can inspect the crawl and feed in parameters; skipping that for now, the next interesting call is sequencer, implemented in sequencer.cc:

/** start the sequencer
 */
void sequencer() {
    bool testPriority = true;
    if (space == 0) {
        space = global::inter->putAll();
    }
    int still = space;
    if (still > maxPerCall)
        still = maxPerCall;
    while (still) {
        if (canGetUrl(&testPriority)) {
            space--;
            still--;
        } else {
            still = 0;
        }
    }
}

At most maxPerCall URLs are handled per call. canGetUrl looks like this:

/* Get the next url
 * here is defined how priorities are handled
 */
static bool canGetUrl(bool *testPriority) {
    url *u;
    if (global::readPriorityWait) {
        global::readPriorityWait--;
        u = global::URLsPriorityWait->get();
        global::namedSiteList[u->hostHashCode()].putPriorityUrlWait(u);
        return true;
    } else if (*testPriority
            && (u = global::URLsPriority->tryGet()) != NULL) {
        // We've got one url (priority)
        global::namedSiteList[u->hostHashCode()].putPriorityUrl(u);
        return true;
    } else {
        *testPriority = false;
        // Try to get an ordinary url
        if (global::readWait) {
            global::readWait--;
            u = global::URLsDiskWait->get();
            global::namedSiteList[u->hostHashCode()].putUrlWait(u);
            return true;
        } else {
            u = global::URLsDisk->tryGet();
            if (u != NULL) {
                global::namedSiteList[u->hostHashCode()].putUrl(u);
                return true;
            } else {
                return false;
            }
        }
    }
}

The blog of "执着的小雨" has a good description of this function; quoting it (translated):

"The disk and priority queues come in pairs because every site can be thought of as having a small queue of its own inside namedSiteList, and the number of URLs held per site is capped. Once a site exceeds the cap, further URLs for it cannot go in, but they must not be dropped either, so they go into a wait queue. Larbin then alternates: for a while it takes URLs from the disk queue, for a while from diskWait. disk and priority differ only in priority. namedSiteList can be regarded as implementing a DNS cache, while IPSiteList enforces polite access."

readPriorityWait is set in the cron function in main.cc:

// see if we should read again urls in fifowait
if ((global::now % 300) == 0) {
    global::readPriorityWait = global::URLsPriorityWait->getLength();
    global::readWait = global::URLsDiskWait->getLength();
}
if ((global::now % 300) == 150) {
    global::readPriorityWait = 0;
    global::readWait = 0;
}

cron runs once per second with global::now tracking the clock, so global::now % 300 hits 0, and then 150, once every 300 seconds. Every 300 seconds, then, readPriorityWait is set to the length of URLsPriorityWait (i.e. the number of urls in it) and readWait likewise for URLsDiskWait; 150 seconds later both are reset to 0. The crawler thus alternates, in 150-second halves, between serving the wait queues and the ordinary ones.
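Concretely: if the snapshot at now = 300 records readWait = 40, canGetUrl serves up to those 40 urls from URLsDiskWait before touching URLsDisk; at now = 450 any unused part of the quota is cancelled, and at now = 600 a fresh snapshot is taken.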

Back in canGetUrl, each url is routed to its site's queue. putPriorityUrlWait, putPriorityUrl, putUrlWait, and putUrl are defined in site.h:

/** Put an url in the fifo
 * If there are too much, put it back in UrlsInternal
 * Never fill totally the fifo => call at least with 1 */
void putGenericUrl(url *u, int limit, bool prio);

inline void putUrl(url *u) {
    putGenericUrl(u, 15, false);
}

inline void putUrlWait(url *u) {
    putGenericUrl(u, 10, false);
}

inline void putPriorityUrl(url *u) {
    putGenericUrl(u, 5, true);
}

inline void putPriorityUrlWait(url *u) {
    putGenericUrl(u, 1, true);
}

All four end up in putGenericUrl:

/* Put an url in the fifo if their are not too many */
void NamedSite::putGenericUrl(url *u, int limit, bool prio) {
    if (nburls > maxUrlsBySite - limit) {
        // Already enough Urls in memory for this Site
        // first check if it can already be forgotten
        if (!strcmp(name, u->getHost())) {
            if (dnsState == errorDns) {
                nburls++;
                forgetUrl(u, noDNS);
                return;
            }
            if (dnsState == noConnDns) {
                nburls++;
                forgetUrl(u, noConnection);
                return;
            }
            if (u->getPort() == port && dnsState == doneDns
                    && !testRobots(u->getFile())) {
                nburls++;
                forgetUrl(u, forbiddenRobots);
                return;
            }
        }
        // else put it back in URLsDisk
        refUrl();
        global::inter->getOne();
        if (prio) {
            global::URLsPriorityWait->put(u);
        } else {
            global::URLsDiskWait->put(u);
        }
    }

When enough URLs for this site already sit in memory, the branch above runs. strcmp(name, u->getHost()) checks whether this host has already been through the DNS verdict; a site is judged once and the result reused afterwards. With dnsState == errorDns the url is forgotten as noDNS, with noConnDns as noConnection, and if robots.txt forbids the file as forbiddenRobots. Otherwise the url is pushed back into URLsDiskWait (or URLsPriorityWait for priority urls, despite the comment saying URLsDisk).

else {
    nburls++;
    if (dnsState == waitDns || strcmp(name, u->getHost())
            || port != u->getPort() || global::now > dnsTimeout) {
        // dns not done or other site
        putInFifo(u);
        addNamedUrl();
        // Put Site in fifo if not yet in
        if (!isInFifo) {
            isInFifo = true;
            global::dnsSites->put(this);
        }
    } else
        switch (dnsState) {
        case doneDns:
            transfer(u);
            break;
        case errorDns:
            forgetUrl(u, noDNS);
            break;
        default: // noConnDns
            forgetUrl(u, noConnection);
        }
}

If DNS has not been settled for this site (or the host/port changed, or the cached result timed out), the url is queued and the site is pushed onto dnsSites, which fetchDns services. If instead there is still room and dnsState is doneDns, meaning the name resolves, transfer is called:

void NamedSite::transfer(url *u) {
    if (testRobots(u->getFile())) {
        if (global::proxyAddr == NULL) {
            memcpy(&u->addr, &addr, sizeof(struct in_addr));
        }
        global::IPSiteList[ipHash].putUrl(u);
    } else {
        forgetUrl(u, forbiddenRobots);
    }
}

This drops the url into the IPSiteList bucket selected by ipHash.

The next function main calls is fetchDns; split it into two parts:

while (global::nbDnsCalls < global::dnsConn
        && global::freeConns->isNonEmpty()
        && global::IPUrl < maxIPUrls) { // try to avoid too many dns calls
    NamedSite *site = global::dnsSites->tryGet();
    if (site == NULL) {
        break;
    } else {
        site->newQuery();
    }
}

This pulls a site needing DNS resolution from dnsSites and issues the request through newQuery:

void NamedSite::newQuery() {
    // Update our stats
    newId();
    if (global::proxyAddr != NULL) {
        // omitted
    } else if (isdigit(name[0])) {
        // the name already in numbers-and-dots notation
        siteSeen();
        if (inet_aton(name, &addr)) {
            // Yes, it is in numbers-and-dots notation
            siteDNS();
            // Get the robots.txt
            dnsOK();
        } else {
            // No, it isn't : this site is a non sense
            dnsState = errorDns;
            dnsErr();
        }
    } else {
        // submit an adns query
        global::nbDnsCalls++;
        adns_query quer = NULL;
        adns_submit(global::ads, name, (adns_rrtype) adns_r_addr,
                (adns_queryflags) 0, this, &quer);
    }
}

The proxy branch (when larbin.conf specifies a proxy address) is skipped here. Next, if the name starts with a digit it may already be in numbers-and-dots notation, i.e. an IP address that needs no resolution. In every other case an adns query is submitted.

// Read available answers
while (global::nbDnsCalls && global::freeConns->isNonEmpty()) {
    NamedSite *site;
    adns_query quer = NULL;
    adns_answer *ans;
    int res = adns_check(global::ads, &quer, &ans, (void **) &site);
    if (res == ESRCH || res == EAGAIN) {
        // No more query or no more answers
        break;
    }
    global::nbDnsCalls--;
    site->dnsAns(ans);
    free(ans); // ans has been allocated with malloc
}

This second part collects whatever answers are ready and hands each one to dnsAns:

void NamedSite::dnsAns(adns_answer *ans) {
    if (ans->status == adns_s_prohibitedcname) {
        // omitted
    } else {
        if (cname != NULL) {
            // omitted
        }
        if (ans->status != adns_s_ok) {
            // omitted
        } else {
            // compute the new addr
            memcpy(&addr, &ans->rrs.addr->addr.inet.sin_addr,
                    sizeof(struct in_addr));
            // Get the robots.txt
            dnsOK();
        }
    }
}

The prohibited-CNAME (alias) case and the error cases are skipped here; in the normal case the new address is stored and dnsOK is called:

void NamedSite::dnsOK() {
    Connexion *conn = global::freeConns->get();
    char res = getFds(conn, &addr, port);
    if (res != emptyC) {
        conn->timeout = timeoutPage;
        if (global::proxyAddr != NULL) {
            // use a proxy
            conn->request.addString("GET http://");
            conn->request.addString(name);
            char tmp[15];
            sprintf(tmp, ":%u", port);
            conn->request.addString(tmp);
            conn->request.addString("/robots.txt HTTP/1.0\r\nHost: ");
        } else {
            // direct connection
            conn->request.addString("GET /robots.txt HTTP/1.0\r\nHost: ");
        }
        conn->request.addString(name);
        conn->request.addString(global::headersRobots);
        conn->parser = new robots(this, conn);
        conn->pos = 0;
        conn->err = success;
        conn->state = res;
    }
}

This assembles the request asking for robots.txt.
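For a direct (non-proxy) connection to a host, say www.example.com (a made-up name for illustration), the string accumulated in conn->request would read roughly:

GET /robots.txt HTTP/1.0
Host: www.example.com
...contents of global::headersRobots (User-Agent and so on)...

with CRLF line endings, as the \r\n in the code indicates.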

The next function called in main is fetchOpen:

void fetchOpen() {
    static time_t next_call = 0;
    if (global::now < next_call) { // too early to come back
        return;
    }
    int cont = 1;
    while (cont && global::freeConns->isNonEmpty()) {
        IPSite *s = global::okSites->tryGet();
        if (s == NULL) {
            cont = 0;
        } else {
            next_call = s->fetch();
            cont = (next_call == 0);
        }
    }
}

It takes a site and calls fetch on it:

int IPSite::fetch() {
    if (tab.isEmpty()) {
        // omitted
    } else {
        int next_call = lastAccess + global::waitDuration;
        if (next_call > global::now) {
            global::okSites->rePut(this);
            return next_call;
        } else {
            Connexion *conn = global::freeConns->get();
            url *u = getUrl();
            // We're allowed to fetch this one
            // open the socket and write the request
            char res = getFds(conn, &(u->addr), u->getPort());
            if (res != emptyC) {
                lastAccess = global::now;
                conn->timeout = timeoutPage;
                conn->request.addString("GET ");
                if (global::proxyAddr != NULL) {
                    char *tmp = u->getUrl();
                    conn->request.addString(tmp);
                } else {
                    conn->request.addString(u->getFile());
                }
                conn->request.addString(" HTTP/1.0\r\nHost: ");
                conn->request.addString(u->getHost());
                conn->request.addString(global::headers);
                conn->parser = new html(u, conn);
                conn->pos = 0;
                conn->err = success;
                conn->state = res;
                if (tab.isEmpty()) {
                    isInFifo = false;
                } else {
                    global::okSites->put(this);
                }
                return 0;
            }
        }
    }
}

To stay polite, larbin must not crawl one site continuously: next_call is the time of the last access plus the configured interval. If that time has not arrived yet, the site is re-queued and the next permissible time is returned; if fetching is allowed, conn->request is assembled much as dnsOK did for robots.txt.
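A worked example: with waitDuration = 60 and a site last fetched at now = 1000, next_call is 1060; an attempt at now = 1030 just re-queues the site and returns 1060. Note that fetchOpen stores this return value in its static next_call and bails out early until global::now reaches it, so the deadline throttles fetchOpen as a whole, not just that one site.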

The last important function in main is checkAll, again in two parts:

// read and write what can be
for (uint i = 0; i < global::nb_conn; i++) {
    Connexion *conn = global::connexions + i;
    switch (conn->state) {
    case connectingC:
    case writeC:
        if (global::ansPoll[conn->socket]) {
            // trying to finish the connection
            pipeWrite(conn);
        }
        break;
    case openC:
        if (global::ansPoll[conn->socket]) {
            // The socket is open, let's try to read it
            pipeRead(conn);
        }
        break;
    }
}

For connections in the connectingC or writeC state this tries to finish the connect or push the request out; for connections in openC it tries to read the response.

The second half of checkAll is:

// update fd_set for the next select
for (uint i = 0; i < global::nb_conn; i++) {
    int n = (global::connexions + i)->socket;
    switch ((global::connexions + i)->state) {
    case connectingC:
    case writeC:
        global::setPoll(n, POLLOUT);
        break;
    case openC:
        global::setPoll(n, POLLIN);
        break;
    }
}

This registers each socket with the event it is waiting for ahead of the next poll; setPoll is:

/** set this fds for next poll */
#define setPoll(fds, event) \
    global::pollfds[global::posPoll].fd = fds; \
    global::pollfds[global::posPoll].events = event; \
    global::posPoll++

Here is pipeWrite:

/** The socket is finally open !
 * Make sure it's all right, and write the request
 */
static void pipeWrite(Connexion *conn) {
    int res = 0;
    int wrtn, len;
    socklen_t size = sizeof(int);
    switch (conn->state) {
    case connectingC:
        // not connected yet
        getsockopt(conn->socket, SOL_SOCKET, SO_ERROR, &res, &size);
        if (res) {
            // Unable to connect
            conn->err = noConnection;
            endOfFile(conn);
            return;
        }
        // Connection succesfull
        conn->state = writeC;
        // no break
    case writeC:
        // writing the first string
        len = strlen(conn->request.getString());
        wrtn = write(conn->socket, conn->request.getString() + conn->pos,
                len - conn->pos);
        if (wrtn >= 0) {
            addWrite(wrtn);
            conn->pos += wrtn;
            if (conn->pos < len) {
                // Some chars of this string are not written yet
                return;
            }
        } else {
            if (errno == EAGAIN || errno == EINTR || errno == ENOTCONN) {
                // little error, come back soon
                return;
            } else {
                // unrecoverable error, forget it
                conn->err = earlyStop;
                endOfFile(conn);
                return;
            }
        }
        // All the request has been written
        conn->state = openC;
    }
}

In the connectingC case, getsockopt retrieves the result of the connect; on success the state becomes writeC and execution deliberately falls through. In the writeC case, write sends (possibly part of) the request, and once everything has been written the state becomes openC.

/** Is there something to read on this socket
 * (which is open)
 */
static void pipeRead(Connexion *conn) {
    int p = conn->parser->pos;
    int size = read(conn->socket, conn->buffer + p, maxPageSize - p - 1);
    switch (size) {
    case 0:
        // End of file
        if (conn->parser->endInput())
            conn->err = (FetchError) errno;
        endOfFile(conn);
        break;
    case -1:
        // omitted
        break;
    default:
        // omitted
        break;
    }
}

If nothing went wrong and the input is exhausted (read returns 0), endOfFile is called:

static void endOfFile(Connexion *conn) {
    crash("End of file");
    conn->state = emptyC;
    close(conn->socket);
    if (conn->parser->isRobots) {
        // That was a robots.txt
        robots *r = ((robots *) conn->parser);
        r->parse(conn->err != success);
        r->server->robotsResult(conn->err);
        conn->recycle();
        global::freeConns->put(conn);
    } else {
        // that was an html page
        manageHtml();
    }
}

conn's state becomes emptyC, i.e. the connection resource is released. If the response was a robots.txt, the conn's parser parses it; if it was an html page, manageHtml takes over:

#ifdef THREAD_OUTPUT
#define manageHtml() global::userConns->put(conn)
#else // THREAD_OUTPUT
#define manageHtml() \
    endOfLoad((html *) conn->parser, conn->err); \
    conn->recycle(); \
    global::freeConns->put(conn)
#endif // THREAD_OUTPUT

endOfLoad is where the fetched page gets saved.
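Putting the pieces together, a connection's life cycle is connectingC (waiting for connect() to complete), then writeC (request being written), then openC (response being read), and finally emptyC, set by endOfFile, which also recycles the Connexion into freeConns.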

 

We still have not seen how URLs are pulled out of a page and added to the set to crawl. Page parsing starts here:

/* parse an html page */
void html::parseHtml() {
    while ((posParse = strchr(posParse, '<')) != NULL) {
        if (posParse[1] == '!') {
            if (posParse[2] == '-' && posParse[3] == '-') {
                posParse += 4;
                parseComment();
            } else {
                // nothing...
                posParse += 2;
            }
        } else {
            posParse++;
            parseTag();
        }
    }
}

This scans for '<', the character that opens every HTML tag. A "<!" start may mean a comment (<!--XXXXXXXX-->); anything else is treated as a possible tag. First parseComment:

void html::parseComment() {
    while ((posParse = strchr(posParse, '-')) != NULL) {
        if (posParse[1] == '-' && posParse[2] == '>') {
            posParse += 3;
            return;
        } else {
            posParse++;
        }
    }
    posParse = buffer + pos;
}

This looks for the closing "-->": if found, everything inside the comment is skipped. If the terminator is not in the buffer yet, posParse is set to buffer + pos, the end of the input read so far. Next, the first part of parseTag:

void html::parseTag() {
    skipSpace();
    char *param = NULL; // what parameter are we looking for
    int action = -1;
    // read the name of the tag
    if (thisCharIs(0, 'a')) { // a href
        param = "href";
        action = LINK;
        posParse++;
    } else if (thisCharIs(0, 'l')) { // link href
        isTag(thisCharIs(1, 'i') && thisCharIs(2, 'n')
                && thisCharIs(3, 'k'), "href", LINK, 4);
    } else if (thisCharIs(0, 'b')) { // base href
        isTag(thisCharIs(1, 'a') && thisCharIs(2, 's')
                && thisCharIs(3, 'e'), "href", BASE, 4);
    } else if (thisCharIs(0, 'f')) { // frame src
        isTag(thisCharIs(1, 'r') && thisCharIs(2, 'a')
                && thisCharIs(3, 'm') && thisCharIs(4, 'e'),
                "src", LINK, 5);
#ifdef IMAGES
    } else if (thisCharIs(0, 'i')) { // img src
        isTag(thisCharIs(1, 'm') && thisCharIs(2, 'g'), "src", LINK, 3);
#endif // IMAGES
    } else {
        return;
    }

skipSpace is a macro that skips spaces, newlines, carriage returns, and tabs:

#define skipSpace() \
    while (*posParse == ' ' || *posParse == '\n' \
            || *posParse == '\r' || *posParse == '\t') { \
        posParse++; \
    }

The tags tested for, in order, are "a", "link", "base", "frame", and "img". The isTag macro is simple:

#define isTag(t, p, a, i) if (t) { \
        param = p; \
        action = a; \
        posParse += i; \
    } else { \
        posParse++; \
        return; \
    }

Taking link as the example, t is thisCharIs(1, 'i') && thisCharIs(2, 'n') && thisCharIs(3, 'k'), i.e. it checks that the three characters after the l are "ink". If not, the macro returns; if so, param = "href", action = LINK, and i = 4.

The second half of parseTag:

    // now find the parameter
    assert(param != NULL);
    skipSpace();
    for (;;) {
        int i = 0;
        while (param[i] != 0 && thisCharIs(i, param[i]))
            i++;
        posParse += i;
        if (posParse[i] == '>' || posParse[i] == 0)
            return;
        if (param[i] == 0) {
            parseContent(action);
            return;
        } else {
            // not the good parameter
            nextWord();
        }
    }
}

This matches the upcoming characters against param, i.e. we only care about that one attribute. If the tag ends (or the buffer does), it returns; if the current word is some other attribute, nextWord skips it. When the wanted attribute is found, parseContent runs; its first half is:

    posParse++;
    while (*posParse == ' ' || *posParse == '=')
        posParse++;
    if (*posParse == '\"' || *posParse == '\'')
        posParse++;
    area = posParse;
    char *endItem = area + maxUrlSize;
    if (endItem > buffer + pos)
        endItem = buffer + pos;
    while (posParse < endItem && *posParse != '\"' && *posParse != '\''
            && *posParse != '\n' && *posParse != ' ' && *posParse != '>'
            && *posParse != '\r' && *posParse != '\t'
            && notCgiChar(*posParse)) {
        if (*posParse == '\\')
            *posParse = '/'; // Bye Bye DOS !
        posParse++;
    }

This skips spaces and the '=' that follows the attribute name, then the opening double or single quote before the value we want. area marks the start of the value, whose length is capped at maxUrlSize (and at the end of the buffer); the while loop then advances until a character that would terminate or invalidate the value, converting any '\' into '/' along the way.

    if (posParse == buffer + pos) {
        // end of file => content may be truncated => forget it
        return;
    } else if (posParse < endItem && notCgiChar(*posParse)) {
        // compute this url (not too long and not cgi)
        char oldchar = *posParse;
        *posParse = 0;
        switch (action) {
        case LINK:
            // try to understand this new link
            manageUrl(new url(area, here->getDepth() - 1, base), false);
            break;
        case BASE:
            // This page has a BASE HREF tag
        {
            uint end = posParse - area - 1;
            while (end > 7 && area[end] != '/')
                end--; // 7 because http://
            if (end > 7) { // this base looks good
                end++;
                char tmp = area[end];
                area[end] = 0;
                url *tmpbase = new url(area, 0, (url *) NULL);
                area[end] = tmp;
                delete base;
                if (tmpbase->isValid()) {
                    base = tmpbase;
                } else {
                    delete tmpbase;
                    base = NULL;
                }
            }
        }
            break;
        default:
            assert(false);
        }
        *posParse = oldchar;
    }
    posParse++;

The first if catches a value truncated by the end of the file. The BASE case extracts the base address, which is straightforward; the LINK case calls manageUrl:

void html::manageUrl(url *nouv, bool isRedir) {
    if (nouv->isValid() && filter1(nouv->getHost(), nouv->getFile())
            && (global::externalLinks || isRedir
                || !strcmp(nouv->getHost(), this->here->getHost()))) {
        // The extension is not stupid (gz, pdf...)
#ifdef LINKS_INFO
        links.addElement(nouv->giveUrl());
#endif // LINKS_INFO
        if (nouv->initOK(here)) {
            check(nouv);
        } else {
            // this url is forbidden for errno reason (set by initOK)
            answers(errno);
            delete nouv;
        }
    } else {
        // The extension is stupid
        delete nouv;
    }
}

We already met filter1 alongside check: it filters by file format, while initOK checks that the url itself is well formed. If both pass, check is called, and a never-seen url lands in URLsDisk or URLsPriority according to its extension. This is the point where new URLs join the crawl set.

One more function:

int html::verifType() {
    if (startWithIgnoreCase("content-type: ", area)) {
        // Let's read the type of this doc
        if (!startWithIgnoreCase("text/html", area + 14)) {
#ifdef SPECIFICSEARCH
            if (matchContentType(area + 14)) {
                interestingSeen();
                isInteresting = true;
            } else {
                checkType();
            }
#else // SPECIFICSEARCH
            checkType();
#endif // SPECIFICSEARCH
        }
    }
    return 0;
}

This checks whether the document's Content-Type is one we are after. With SPECIFICSEARCH defined, matchContentType is:

bool matchContentType(char *ct) {
    int i = 0;
    while (contentTypes[i] != NULL) {
        if (startWithIgnoreCase(contentTypes[i], ct))
            return true;
        i++;
    }
    return false;
}
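For reference, a specific search is configured at compile time in options.h. The mp3 example that ships with larbin looks roughly like this; I am quoting from memory, so treat the exact macro syntax as approximate:

#define SPECIFICSEARCH
#define contentTypes ((char *[]) { "audio/mpeg", NULL })
#define privilegedExts ((char *[]) { ".mp3", NULL })

With these set, check routes urls ending in .mp3 through URLsPriority, and verifType marks fetched documents whose Content-Type starts with audio/mpeg as interesting.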

 
