#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include "Md5.h"
#include "Url.h"
#include "Document.h"
7
8 using namespace std;
9
10 int main(int argc, char* argv[]) {/*
11 * DocIndex.cpp
12 * Created on: 2011-11-9
13 * function:
14 * 将一个原始网页库进行索引,
15 * 生成网页索引文件Doc.idx
16 * 和URL索引文件Url.idx
17 */
18
19 ifstream ifs("Tianwang.raw.1078930288");
20 if (!ifs) {
21 cout << "不能打开原始网页库<Tianwang.raw.******>" << endl;
22 return -1;
23 }
24
25 ofstream ofsUrl("Url.idx", ios::in | ios::out | ios::trunc | ios::binary);
26 if (!ofsUrl) {
27 cout << "不能打开或者创建URL索引文件<Url.idx>" << endl;
28 cout << "error open file " << endl;
29 }
30
31 ofstream ofsDoc("Doc.idx", ios::in | ios::out | ios::trunc | ios::binary);
32 if (!ofsDoc) {
33 cout << "error open file " << endl;
34 }
35
36 ofstream ofsDocId2Url("DocId2Url.idx",
37 ios::in | ios::out | ios::trunc | ios::binary);
38 if (!ofsDocId2Url) {
39 cout << "error open file " << endl;
40 }
41
42 int cnt = 0;
43 string strLine, strPage;
44 CUrl iUrl;
45 CDocument iDocument;
46 CMD5 iMD5;
47
48 int nOffset = ifs.tellg(); //得到文件读指针距该文件头的字节数
49 cout << "tellg() is:" << nOffset << endl;
50 while (getline(ifs, strLine)) {
51 if (strLine[0] == '\0' || strLine[0] == '#' || strLine[0] == '\n') {
52 nOffset = ifs.tellg();
53 continue;
54 }
55
56 if (!strncmp(strLine.c_str(), "version: 1.0", 12)) {
57 if (!getline(ifs, strLine))
58 break;
59
60 if (!strncmp(strLine.c_str(), "url: ", 4)) {
61 iUrl.m_sUrl = strLine.substr(5);//保存url
62 iMD5.GenerateMD5((unsigned char*) iUrl.m_sUrl.c_str(),
63 iUrl.m_sUrl.size());
64 iUrl.m_sChecksum = iMD5.ToString();
65
66 } else {
67 continue;
68 }
69
70 while (getline(ifs, strLine)) {//保存文件长度信息
71 if (!strncmp(strLine.c_str(), "length: ", 8)) {
72 sscanf(strLine.substr(8).c_str(), "%d",
73 &(iDocument.m_nLength));
74 break;
75 }
76 }
77
78 getline(ifs, strLine);
79
80 iDocument.m_nDocId = cnt;
81 iDocument.m_nPos = nOffset;
82 char *pContent = new char[iDocument.m_nLength + 1];
83
84 memset(pContent, 0, iDocument.m_nLength + 1);
85 ifs.read(pContent, iDocument.m_nLength);
86 iMD5.GenerateMD5((unsigned char*) pContent, iDocument.m_nLength);
87 iDocument.m_sChecksum = iMD5.ToString();
88
89 delete[] pContent;
90
91 ofsUrl << iUrl.m_sChecksum;//MD5值
92 ofsUrl << "\t" << iDocument.m_nDocId << endl;//doc的Id
93
94 ofsDoc << iDocument.m_nDocId;//文件偏移位置到MD5的映射
95 ofsDoc << "\t" << iDocument.m_nPos;
96 //ofsDoc << "\t" << iDocument.m_nLength ;
97 ofsDoc << "\t" << iDocument.m_sChecksum << endl;
98
99 ofsDocId2Url << iDocument.m_nDocId;//文件编号到url的映射
100 ofsDocId2Url << "\t" << iUrl.m_sUrl << endl;
101
102 cnt++;
103 }
104
105 nOffset = ifs.tellg();
106
107 }
108
109 ofsDoc << cnt;
110 ofsDoc << "\t" << nOffset << endl;
111
112 return (0);
113 }