1 #ifndef _Document_H_040410_
2 #define _Document_H_040410_
3
4 #include <string>
5
/**
 * Index entry pairing a document id with an offset.
 *
 * Declared as a named struct (rather than a typedef of an unnamed one) so
 * that other headers can forward-declare it. It stays a plain aggregate of
 * two ints, so brace initialisation and raw copying keep working.
 */
struct DocIdx {
    int docid;   // id of the document this entry refers to
    int offset;  // presumably the position of that document's record in a data file -- TODO confirm against callers
};
10
11 using namespace std;
12
/**
 * Holds the fields of a single document record: its id, position/length
 * bookkeeping, checksum, URL and the record text together with its parts.
 *
 * All data members are public and are read and written directly.
 * Member types are spelled with an explicit std:: qualifier so the class
 * does not rely on a using-directive being in effect.
 */
class CDocument
{
public:
    int m_nDocId;               // document id; the constructor sets it to -1
    int m_nPos;                 // the constructor sets it to -1; presumably the record's position in its source -- TODO confirm
    int m_nLength;              // the constructor sets it to 0; presumably the record's length -- TODO confirm
    std::string m_sChecksum;    // checksum text (algorithm not visible in this file)

    std::string m_sUrl;         // URL of the document
    std::string m_sRecord;      // a record including a HEAD, a header and body
    std::string m_sHead;        // presumably the HEAD part of m_sRecord -- TODO confirm
    std::string m_sHeader;      // presumably the header part of m_sRecord -- TODO confirm
    std::string m_sBody;        // presumably the body part of m_sRecord -- TODO confirm

    std::string m_sBodyNoTags;  // presumably m_sBody with markup tags stripped -- TODO confirm

    CDocument();
    ~CDocument();

    // Currently a stub in the implementation file: always returns true.
    bool ParseRecord(std::string &content) const;
    // Currently a stub in the implementation file: always returns true.
    bool CleanBody(std::string &body) const;

    // Strips <...> tags from the NUL-terminated buffer `s`, in place.
    void RemoveTags(char *s);
};
39
40 #endif /* _Document_H_040410_ */
1 /*Document handling
2 */
3
4 #include "Document.h"
5
6 CDocument::CDocument()
7 {
8 m_nDocId = -1;
9 m_nPos = -1;
10 m_nLength = 0;
11 m_sChecksum = "";
12
13 m_sUrl = "";
14 }
15
16 CDocument::~CDocument()
17 {
18 }
19
20 bool CDocument::ParseRecord(string &content) const
21 {
22 return true;
23 }
24
25 bool CDocument::CleanBody(string &body) const
26 {
27 return true;
28 }
29
30 //把 <...> 删掉
31 void CDocument::RemoveTags(char *s)
32 {
33 int intag;
34 char *p, *q;
35
36 if (!s || !*s) return;
37
38 for (p=q=s, intag=0; *q; q++) {
39 switch (*q){
40 case '<':
41 intag = 1;
42 *p++ = ' ';
43 break;
44 case '>':
45 intag = 0;
46 break;
47 default:
48 if (!intag) {
49 *p++ = *q;
50 }
51 break;
52 }
53 }
54
55 *p = '\0';
56
57 /* second method
58 char *d = s;
59 while (*s) {
60 if (*s == '<') {
61 while (*s && *s!='>') s++;
62 if( *s == '\0') break;
63 s++;
64 continue;
65 }
66
67 *d++ = *s++;
68 }
69 *d = 0;
70 */
71 }