CHzSeg

 1 #ifndef _HZSEG_H_040415_
 2 #define _HZSEG_H_040415_
 3 
 4 #include <iostream>
 5 #include <string>
 6 #include <cstring>
 7 #include <cstdlib>
 8 #include <fstream>
 9 #include "Dict.h"
10 
11 using namespace std;
12 
13 class CHzSeg
14 {
15 public:
16     CHzSeg();
17     ~CHzSeg();
18 
19     string SegmentSentenceMM (CDict&, string) const;//只保留下中文
20     string SegmentHzStrMM (CDict&, string) const;//切词
21     string SegmentURL(CDict&, string) const;
22 
23     // process a sentence before segmentation
24     void Translate(char* SourceStr) const;
25 };
26     
27 #endif /* _HZSEG_H_040415_ */

  1 // HzSeg handling
  2 
  3 #include "HzSeg.h"
  4 #include "Dict.h"
  5 
  6 const unsigned int MAX_WORD_LENGTH = 8;
  7 const string SEPARATOR("/  "); // delimiter between words
  8 
  9 CHzSeg::CHzSeg() {
 10 }
 11 
 12 CHzSeg::~CHzSeg() {
 13 }
 14 
 15 // Using Max Matching method to segment a character string.
 16 string CHzSeg::SegmentHzStrMM(CDict &dict, string s1) const {
 17     string s2 = ""; //保存句子s1的分词结果
 18     while (!s1.empty()) {
 19         unsigned int len = s1.size();
 20 
 21         if (len > MAX_WORD_LENGTH)
 22             len = MAX_WORD_LENGTH;
 23         //如果待切分的句子大于最大切分单元
 24         //len=最大切分单元,否则len=句子的长度
 25 
 26         string w = s1.substr(0, len); //取s1句子最左边长度len为的子句子
 27         bool isw = dict.IsWord(w); //判断刚刚取出来的子句子是不是一个词
 28 
 29         while (len > 2 && isw == false) { //当w中至少有2个中文字&&不能构成字的时候,减去最右边的一个中文字符
 30             len -= 2; // cut a word
 31             w = w.substr(0, len);
 32             isw = dict.IsWord(w);
 33         }
 34         s2 += w + SEPARATOR;
 35 
 36         s1 = s1.substr(w.size());
 37     }
 38 
 39     return s2;
 40 }
 41 
 42 // process a sentence before segmentation
 43 string CHzSeg::SegmentSentenceMM(CDict &dict, string s1) const {
 44     string s2 = "";
 45     unsigned int i, len;
 46     cout << endl << "I'm in SegmentSentenceMM" << endl;
 47     cout << s1 << endl;
 48     while (!s1.empty()) {
 49         unsigned char ch = (unsigned char) s1[0];
 50         if (ch < 128) { //吃掉一行中所有换行符以外的英文字符
 51             i = 1;
 52             len = s1.size();
 53             while (i < len && ((unsigned char) s1[i] < 128) && (s1[i] != 10)
 54                     && (s1[i] != 13)) { // LF, CR
 55                 i++; //不是回车换行
 56             }
 57 
 58             if ((ch != 32) && (ch != 10) && (ch != 13)) { // SP, LF, CR
 59                 s2 += s1.substr(0, i) + SEPARATOR;
 60             } else {
 61                 if (ch == 10 || ch == 13) {
 62                     s2 += s1.substr(0, i);
 63                     cout << "当前s2:" << s2 << endl;
 64                 }
 65             }
 66 
 67             if (i <= s1.size()) {
 68                 s1 = s1.substr(i); //获得删去部分英文字符后的数据
 69             } else
 70                 break; // 处理完英文字符
 71 
 72             continue;
 73 
 74         } else {
 75             if (ch < 176) { //中文标点等非汉字字符128<=ch<176
 76                 i = 0;
 77                 len = s1.length();
 78 
 79                 while (i < len && ((unsigned char) s1[i] < 176)
 80                         && ((unsigned char) s1[i] >= 161)
 81                         && (!((unsigned char) s1[i] == 161
 82                                 && ((unsigned char) s1[i + 1] >= 162
 83                                         && (unsigned char) s1[i + 1] <= 168)))
 84                         && (!((unsigned char) s1[i] == 161
 85                                 && ((unsigned char) s1[i + 1] >= 171
 86                                         && (unsigned char) s1[i + 1] <= 191)))
 87                         && (!((unsigned char) s1[i] == 163
 88                                 && ((unsigned char) s1[i + 1] == 172
 89                                         || (unsigned char) s1[i + 1] == 161)
 90                                 || (unsigned char) s1[i + 1] == 168
 91                                 || (unsigned char) s1[i + 1] == 169
 92                                 || (unsigned char) s1[i + 1] == 186
 93                                 || (unsigned char) s1[i + 1] == 187
 94                                 || (unsigned char) s1[i + 1] == 191))) {
 95                     i = i + 2; //假定没有半个汉字
 96                 }
 97 
 98                 if (i == 0)
 99                     i = i + 2;
100 
101                 if (!(ch == 161 && (unsigned char) s1[1] == 161)) { // 不处理中文空格
102                     if (i <= s1.size()) // yhf
103                         s2 += s1.substr(0, i) + SEPARATOR; // 其他的非汉字双字节字符可能连续输出
104                     else
105                         break; // yhf
106                 }
107 
108                 if (i <= s1.size()) {
109                     s1 = s1.substr(i); //取s1从下标i开始的子字符串
110 
111                 } else
112                     break; //yhf
113 
114                 continue;
115             }
116         }
117 
118         i = 2;
119         len = s1.length();
120         while (i < len && (unsigned char) s1[i] >= 176)
121 //    while(i<len && (unsigned char)s1[i]>=128 && (unsigned char)s1[i]!=161)
122             i += 2;
123 
124         s2 += SegmentHzStrMM(dict, s1.substr(0, i));
125 
126         if (i <= len) // yhf
127             s1 = s1.substr(i);
128         else
129             break; // yhf
130     }
131 
132     return s2;
133 }
134 
135 // translate the encoded URL(%xx) to actual chars
136 void CHzSeg::Translate(char* SourceStr) const {
137     int i = 0;
138     int j = 0;
139     char *tempstr, tempchar1, tempchar2;
140 
141     tempstr = (char*) malloc(strlen(SourceStr) + 1);
142     if (tempstr == NULL) {
143         return;
144     }
145 
146     while (SourceStr[j]) {
147         if ((tempstr[i] = SourceStr[j]) == '%') {
148             if (SourceStr[j + 1] >= 'A')
149                 tempchar1 = ((SourceStr[j + 1] & 0xdf) - 'A') + 10;
150             else
151                 tempchar1 = (SourceStr[j + 1] - '0');
152             if (SourceStr[j + 2] >= 'A')
153                 tempchar2 = ((SourceStr[j + 2] & 0xdf) - 'A') + 10;
154             else
155                 tempchar2 = (SourceStr[j + 2] - '0');
156             tempstr[i] = tempchar1 * 16 + tempchar2;
157             j = j + 2;
158         }
159         i++;
160         j++;
161     }
162     tempstr[i] = '\0';
163     strcpy(SourceStr, tempstr);
164 
165     if (tempstr)
166         free(tempstr);
167 }
168 
169 /*
170  * segment the image URL by '/'
171  * omit the domain name
172  */
173 string CHzSeg::SegmentURL(CDict &dict, string url) const {
174     string::size_type idx, nidx;
175     char *curl = (char *) url.c_str();
176     this->Translate(curl);
177     url = curl;
178     if ((idx = url.find("http://", 0)) != string::npos) {
179         if ((nidx = url.find("/", 7)) != string::npos) {
180             url = url.substr(nidx + 1); // cut the part of sitename
181         }
182     }
183     idx = 0;
184     while ((idx = url.find("/", idx)) != string::npos) {
185         url.replace(idx, 1, SEPARATOR); // replace "/" with SEPARATOR "/  "
186         idx += 3;
187     }
188     if ((idx = url.rfind(".")) != string::npos) {
189         url = url.erase(idx); // erase the file extension
190     }
191 
192     url += "/  ";
193 
194     // segment the string whose length is greater than 8 (4 HZ_chars)
195     idx = 0;
196     nidx = 0;
197     bool isover = false;
198     string stmp;
199     while (!isover) {
200         if ((nidx = url.find(SEPARATOR, idx)) == string::npos)
201             isover = true;
202         if (nidx - idx > 0) {
203             stmp = url.substr(idx, nidx - idx);
204             stmp = SegmentSentenceMM(dict, stmp);
205             if (stmp.size() >= 3)
206                 stmp.erase(stmp.length() - 3); // erase the tail "/  "
207             url = url.replace(idx, nidx - idx, stmp);
208             idx += stmp.length() + 3;
209         } else if (nidx == string::npos && idx < url.length()) {
210             stmp = url.substr(idx);
211             stmp = SegmentSentenceMM(dict, stmp);
212             stmp.erase(stmp.length() - 3);
213             url = url.substr(0, idx) + stmp;
214         } else
215             idx = nidx + 3;
216     }
217 
218     return url;
219 
220 }

posted on 2012-07-14 20:59 kakamilan 阅读(219) 评论(0) 收藏举报

刷新页面返回顶部

kakamilan

CHzSeg

导航

公告