1 #ifndef _HZSEG_H_040415_
2 #define _HZSEG_H_040415_
3
4 #include <iostream>
5 #include <string>
6 #include <cstring>
7 #include <cstdlib>
8 #include <fstream>
9 #include "Dict.h"
10
11 using namespace std;
12
13 class CHzSeg
14 {
15 public:
16 CHzSeg();
17 ~CHzSeg();
18
19 string SegmentSentenceMM (CDict&, string) const;//只保留下中文
20 string SegmentHzStrMM (CDict&, string) const;//切词
21 string SegmentURL(CDict&, string) const;
22
23 // process a sentence before segmentation
24 void Translate(char* SourceStr) const;
25 };
26
27 #endif /* _HZSEG_H_040415_ */
1 // HzSeg handling
2
3 #include "HzSeg.h"
4 #include "Dict.h"
5
// Longest dictionary candidate, in bytes, that SegmentHzStrMM tries first
// (four two-byte Chinese characters).
const unsigned int MAX_WORD_LENGTH(8);

// Delimiter written after every token the segmenter emits.
const string SEPARATOR = "/ ";
8
9 CHzSeg::CHzSeg() {
10 }
11
12 CHzSeg::~CHzSeg() {
13 }
14
15 // Using Max Matching method to segment a character string.
16 string CHzSeg::SegmentHzStrMM(CDict &dict, string s1) const {
17 string s2 = ""; //保存句子s1的分词结果
18 while (!s1.empty()) {
19 unsigned int len = s1.size();
20
21 if (len > MAX_WORD_LENGTH)
22 len = MAX_WORD_LENGTH;
23 //如果待切分的句子大于最大切分单元
24 //len=最大切分单元,否则len=句子的长度
25
26 string w = s1.substr(0, len); //取s1句子最左边长度len为的子句子
27 bool isw = dict.IsWord(w); //判断刚刚取出来的子句子是不是一个词
28
29 while (len > 2 && isw == false) { //当w中至少有2个中文字&&不能构成字的时候,减去最右边的一个中文字符
30 len -= 2; // cut a word
31 w = w.substr(0, len);
32 isw = dict.IsWord(w);
33 }
34 s2 += w + SEPARATOR;
35
36 s1 = s1.substr(w.size());
37 }
38
39 return s2;
40 }
41
42 // process a sentence before segmentation
43 string CHzSeg::SegmentSentenceMM(CDict &dict, string s1) const {
44 string s2 = "";
45 unsigned int i, len;
46 cout << endl << "I'm in SegmentSentenceMM" << endl;
47 cout << s1 << endl;
48 while (!s1.empty()) {
49 unsigned char ch = (unsigned char) s1[0];
50 if (ch < 128) { //吃掉一行中所有换行符以外的英文字符
51 i = 1;
52 len = s1.size();
53 while (i < len && ((unsigned char) s1[i] < 128) && (s1[i] != 10)
54 && (s1[i] != 13)) { // LF, CR
55 i++; //不是回车换行
56 }
57
58 if ((ch != 32) && (ch != 10) && (ch != 13)) { // SP, LF, CR
59 s2 += s1.substr(0, i) + SEPARATOR;
60 } else {
61 if (ch == 10 || ch == 13) {
62 s2 += s1.substr(0, i);
63 cout << "当前s2:" << s2 << endl;
64 }
65 }
66
67 if (i <= s1.size()) {
68 s1 = s1.substr(i); //获得删去部分英文字符后的数据
69 } else
70 break; // 处理完英文字符
71
72 continue;
73
74 } else {
75 if (ch < 176) { //中文标点等非汉字字符128<=ch<176
76 i = 0;
77 len = s1.length();
78
79 while (i < len && ((unsigned char) s1[i] < 176)
80 && ((unsigned char) s1[i] >= 161)
81 && (!((unsigned char) s1[i] == 161
82 && ((unsigned char) s1[i + 1] >= 162
83 && (unsigned char) s1[i + 1] <= 168)))
84 && (!((unsigned char) s1[i] == 161
85 && ((unsigned char) s1[i + 1] >= 171
86 && (unsigned char) s1[i + 1] <= 191)))
87 && (!((unsigned char) s1[i] == 163
88 && ((unsigned char) s1[i + 1] == 172
89 || (unsigned char) s1[i + 1] == 161)
90 || (unsigned char) s1[i + 1] == 168
91 || (unsigned char) s1[i + 1] == 169
92 || (unsigned char) s1[i + 1] == 186
93 || (unsigned char) s1[i + 1] == 187
94 || (unsigned char) s1[i + 1] == 191))) {
95 i = i + 2; //假定没有半个汉字
96 }
97
98 if (i == 0)
99 i = i + 2;
100
101 if (!(ch == 161 && (unsigned char) s1[1] == 161)) { // 不处理中文空格
102 if (i <= s1.size()) // yhf
103 s2 += s1.substr(0, i) + SEPARATOR; // 其他的非汉字双字节字符可能连续输出
104 else
105 break; // yhf
106 }
107
108 if (i <= s1.size()) {
109 s1 = s1.substr(i); //取s1从下标i开始的子字符串
110
111 } else
112 break; //yhf
113
114 continue;
115 }
116 }
117
118 i = 2;
119 len = s1.length();
120 while (i < len && (unsigned char) s1[i] >= 176)
121 // while(i<len && (unsigned char)s1[i]>=128 && (unsigned char)s1[i]!=161)
122 i += 2;
123
124 s2 += SegmentHzStrMM(dict, s1.substr(0, i));
125
126 if (i <= len) // yhf
127 s1 = s1.substr(i);
128 else
129 break; // yhf
130 }
131
132 return s2;
133 }
134
135 // translate the encoded URL(%xx) to actual chars
136 void CHzSeg::Translate(char* SourceStr) const {
137 int i = 0;
138 int j = 0;
139 char *tempstr, tempchar1, tempchar2;
140
141 tempstr = (char*) malloc(strlen(SourceStr) + 1);
142 if (tempstr == NULL) {
143 return;
144 }
145
146 while (SourceStr[j]) {
147 if ((tempstr[i] = SourceStr[j]) == '%') {
148 if (SourceStr[j + 1] >= 'A')
149 tempchar1 = ((SourceStr[j + 1] & 0xdf) - 'A') + 10;
150 else
151 tempchar1 = (SourceStr[j + 1] - '0');
152 if (SourceStr[j + 2] >= 'A')
153 tempchar2 = ((SourceStr[j + 2] & 0xdf) - 'A') + 10;
154 else
155 tempchar2 = (SourceStr[j + 2] - '0');
156 tempstr[i] = tempchar1 * 16 + tempchar2;
157 j = j + 2;
158 }
159 i++;
160 j++;
161 }
162 tempstr[i] = '\0';
163 strcpy(SourceStr, tempstr);
164
165 if (tempstr)
166 free(tempstr);
167 }
168
169 /*
170 * segment the image URL by '/'
171 * omit the domain name
172 */
173 string CHzSeg::SegmentURL(CDict &dict, string url) const {
174 string::size_type idx, nidx;
175 char *curl = (char *) url.c_str();
176 this->Translate(curl);
177 url = curl;
178 if ((idx = url.find("http://", 0)) != string::npos) {
179 if ((nidx = url.find("/", 7)) != string::npos) {
180 url = url.substr(nidx + 1); // cut the part of sitename
181 }
182 }
183 idx = 0;
184 while ((idx = url.find("/", idx)) != string::npos) {
185 url.replace(idx, 1, SEPARATOR); // replace "/" with SEPARATOR "/ "
186 idx += 3;
187 }
188 if ((idx = url.rfind(".")) != string::npos) {
189 url = url.erase(idx); // erase the file extension
190 }
191
192 url += "/ ";
193
194 // segment the string whose length is greater than 8 (4 HZ_chars)
195 idx = 0;
196 nidx = 0;
197 bool isover = false;
198 string stmp;
199 while (!isover) {
200 if ((nidx = url.find(SEPARATOR, idx)) == string::npos)
201 isover = true;
202 if (nidx - idx > 0) {
203 stmp = url.substr(idx, nidx - idx);
204 stmp = SegmentSentenceMM(dict, stmp);
205 if (stmp.size() >= 3)
206 stmp.erase(stmp.length() - 3); // erase the tail "/ "
207 url = url.replace(idx, nidx - idx, stmp);
208 idx += stmp.length() + 3;
209 } else if (nidx == string::npos && idx < url.length()) {
210 stmp = url.substr(idx);
211 stmp = SegmentSentenceMM(dict, stmp);
212 stmp.erase(stmp.length() - 3);
213 url = url.substr(0, idx) + stmp;
214 } else
215 idx = nidx + 3;
216 }
217
218 return url;
219
220 }