DocIndex

#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include "Md5.h"
#include "Url.h"
#include "Document.h"
  7 
  8 using namespace std;
  9 
 10 int main(int argc, char* argv[]) {/*
 11  * DocIndex.cpp
 12  * Created on: 2011-11-9
 13  *   function:
 14  *   将一个原始网页库进行索引,
 15  *   生成网页索引文件Doc.idx
 16  *   和URL索引文件Url.idx
 17  */
 18 
 19     ifstream ifs("Tianwang.raw.1078930288");
 20     if (!ifs) {
 21         cout << "不能打开原始网页库<Tianwang.raw.******>" << endl;
 22         return -1;
 23     }
 24 
 25     ofstream ofsUrl("Url.idx", ios::in | ios::out | ios::trunc | ios::binary);
 26     if (!ofsUrl) {
 27         cout << "不能打开或者创建URL索引文件<Url.idx>" << endl;
 28         cout << "error open file " << endl;
 29     }
 30 
 31     ofstream ofsDoc("Doc.idx", ios::in | ios::out | ios::trunc | ios::binary);
 32     if (!ofsDoc) {
 33         cout << "error open file " << endl;
 34     }
 35 
 36     ofstream ofsDocId2Url("DocId2Url.idx",
 37             ios::in | ios::out | ios::trunc | ios::binary);
 38     if (!ofsDocId2Url) {
 39         cout << "error open file " << endl;
 40     }
 41 
 42     int cnt = 0;
 43     string strLine, strPage;
 44     CUrl iUrl;
 45     CDocument iDocument;
 46     CMD5 iMD5;
 47 
 48     int nOffset = ifs.tellg(); //得到文件读指针距该文件头的字节数
 49     cout << "tellg() is:" << nOffset << endl;
 50     while (getline(ifs, strLine)) {
 51         if (strLine[0] == '\0' || strLine[0] == '#' || strLine[0] == '\n') {
 52             nOffset = ifs.tellg();
 53             continue;
 54         }
 55 
 56         if (!strncmp(strLine.c_str(), "version: 1.0", 12)) {
 57             if (!getline(ifs, strLine))
 58                 break;
 59 
 60             if (!strncmp(strLine.c_str(), "url: ", 4)) {
 61                 iUrl.m_sUrl = strLine.substr(5);//保存url
 62                 iMD5.GenerateMD5((unsigned char*) iUrl.m_sUrl.c_str(),
 63                         iUrl.m_sUrl.size());
 64                 iUrl.m_sChecksum = iMD5.ToString();
 65 
 66             } else {
 67                 continue;
 68             }
 69 
 70             while (getline(ifs, strLine)) {//保存文件长度信息
 71                 if (!strncmp(strLine.c_str(), "length: ", 8)) {
 72                     sscanf(strLine.substr(8).c_str(), "%d",
 73                             &(iDocument.m_nLength));
 74                     break;
 75                 }
 76             }
 77 
 78             getline(ifs, strLine);
 79 
 80             iDocument.m_nDocId = cnt;
 81             iDocument.m_nPos = nOffset;
 82             char *pContent = new char[iDocument.m_nLength + 1];
 83 
 84             memset(pContent, 0, iDocument.m_nLength + 1);
 85             ifs.read(pContent, iDocument.m_nLength);
 86             iMD5.GenerateMD5((unsigned char*) pContent, iDocument.m_nLength);
 87             iDocument.m_sChecksum = iMD5.ToString();
 88 
 89             delete[] pContent;
 90 
 91             ofsUrl << iUrl.m_sChecksum;//MD5值
 92             ofsUrl << "\t" << iDocument.m_nDocId << endl;//doc的Id
 93 
 94             ofsDoc << iDocument.m_nDocId;//文件偏移位置到MD5的映射
 95             ofsDoc << "\t" << iDocument.m_nPos;
 96             //ofsDoc << "\t" << iDocument.m_nLength ;
 97             ofsDoc << "\t" << iDocument.m_sChecksum << endl;
 98 
 99             ofsDocId2Url << iDocument.m_nDocId;//文件编号到url的映射
100             ofsDocId2Url << "\t" << iUrl.m_sUrl << endl;
101 
102             cnt++;
103         }
104 
105         nOffset = ifs.tellg();
106 
107     }
108 
109     ofsDoc << cnt;
110     ofsDoc << "\t" << nOffset << endl;
111 
112     return (0);
113 }

posted on 2012-07-08 10:56  kakamilan  阅读(219)  评论(0)  编辑  收藏  举报

导航