1 #ifndef ALGORITHM_WUMANBER_H
2 #define ALGORITHM_WUMANBER_H
3
#include <stdint.h>

#include <set>
#include <string>
#include <utility>
#include <vector>
7
8 typedef std::set<std::string> ResultSetType;
9 typedef std::vector<unsigned int> MatchPosVector;
10 typedef std::pair<unsigned int, int> PrefixIdPairType;
11 typedef std::vector<PrefixIdPairType> PrefixTableType;
12
13 class WuManber
14 {
15 public:
16 WuManber();
17 ~WuManber();
18 /**
19 * Init Function
20 *
21 * @param patterns pattern list to be matched
22 */
23 bool Init(const std::vector<std::string>& patterns);
24
25 /**
26 * @param text raw text
27 * @param textLength length of text
28 * @param res string set containing matched patterns
29 *
30 * @return value 0: no pattern matchs, n: n patterns matched(n>0)
31 */
32 int Search( const char* text, const int textLength, ResultSetType& res);
33
34 /**
35 * @param str raw text
36 * @param res string set containing matched patterns
37 *
38 * @return value 0: no pattern matchs, n: n patterns matched(n>0)
39 */
40 int Search(const std::string& str, ResultSetType& res);
41
42 /**
43 * @brief Search text
44 *
45 * @return value 0: no pattern matchs, n: n patterns matched(n>0)
46 */
47 int Search(const char* text, const int textLength);
48
49 /**
50 * @param str raw text
51 * param matchPosVector vector containing matched patterns postion
52 * @return value 0: no pattern matchs, n: n patterns matched(n>0)
53 */
54 int Search(const char* text, const int textLength, MatchPosVector &matchPosVector);
55
56 /**
57 * param matchPosVector vector containing matched patterns postion
58 * @return value 0: no pattern matchs, n: n patterns matched(n>0)
59 */
60 int Search(const std::string& str, MatchPosVector &matchPosVector);
61
62 /**
63 * @brief Search text
64 *
65 * @return value 0: no pattern matchs, n: n patterns matched(n>0)
66 */
67 int Search(const std::string& str);
68
69 private:
70 // minmum length of patterns
71 int32_t mMin;
72 // SHIFT table
73 std::vector<int32_t> mShiftTable;
74 // a combination of HASH and PREFIX table
75 std::vector<PrefixTableType> mHashTable;
76 // patterns
77 std::vector<std::string> mPatterns;
78 // size of SHIFT and HASH table
79 int32_t mTableSize;
80 // size of block
81 int32_t mBlock;
82 };
83
84 #endif
1 #include <cmath>
2 #include <iostream>
3 #include "wumanber.h"
4
5 using namespace std;
6
/**
 * @brief String hash function.
 *
 * Folds at most @p len characters of @p str into an unsigned value with the
 * recurrence hash = c + (hash << 6) + (hash << 16) - hash, stopping early at
 * a NUL character. The top bit of the result is cleared before returning.
 *
 * @param str the string needed to be hashed
 * @param len maximum number of characters to hash
 *
 * @return hash code (always below 0x80000000)
 */
unsigned int HashCode(const char* str, int len)
{
    unsigned int hash = 0;
    // The remaining count is tested BEFORE the character is read, so the
    // function never touches str[len]. Callers pass blocks that end exactly
    // at the end of a buffer that need not be NUL-terminated.
    for (; len > 0 && *str != '\0'; ++str, --len)
    {
        hash = (*str) + (hash << 6) + (hash << 16) - hash;
    }
    return (hash & 0x7FFFFFFF);
}
25
26 /**
27 * @brief constructor
28 */
29 WuManber::WuManber():mMin(0), mTableSize(0), mBlock(3)
30 {
31 //VOID
32 }
33
34 /**
35 * @brief Init
36 */
37 bool WuManber::Init(const vector<string>& patterns)
38 {
39 int patternSize = patterns.size();
40
41 //check if no pattern specified
42 if (patternSize == 0)
43 {
44 //cerr << "Error: wumanber init failed because no pattern specified." << endl;
45 return false;
46 }
47
48 //caculate the minmum pattern length
49 mMin = patterns[0].length();
50 int32_t lenPattern = 0;
51 for (int i = 0; i < patternSize; ++i)
52 {
53 lenPattern = patterns[i].length();
54 if (lenPattern < mMin)
55 {
56 mMin = lenPattern;
57 }
58 }
59
60 //check if mBlock larger than mMin
61 if (mBlock > mMin)
62 {
63 //cerr << "Warning: mBlock is larger than minmum pattern length, reset mBlock to minmum, but it will seriously affect the effiency." << endl;
64 mBlock = mMin;
65 }
66
67 //choose a suitable mTableSize for SHIFT, HASH table
68 int32_t primes[6] = {1003, 10007, 100003, 1000003, 10000019, 100000007};
69 vector<int32_t> primeList(&primes[0], &primes[6]);
70
71 int32_t threshold = 10 * mMin;
72 for (size_t i = 0; i < primeList.size(); ++i)
73 {
74 if (primeList[i] > patternSize && primeList[i] / patternSize > threshold)
75 {
76 mTableSize = primeList[i];
77 break;
78 }
79 }
80 cout << mTableSize << " " << mBlock << " " << mMin << endl;
81 //if size of patternList is huge.
82 if (0 == mTableSize)
83 {
84 //cerr << "Warning: amount of pattern is very large, will cost a great amount of memory." << endl;
85 mTableSize = primeList[5];
86 }
87
88 //construct ShiftTable and HashTable, and set default value for SHIFT table
89 mPatterns = patterns;
90 mHashTable.resize(mTableSize);
91 // default value is m-mBlock+1 for shift
92 int32_t defaultValue = mMin - mBlock + 1;
93 mShiftTable.resize(mTableSize, defaultValue);
94
95 //loop through patterns
96 for (int id = 0; id < patternSize; ++id)
97 {
98 // loop through each pattern from right to left
99 for (int index = mMin; index >= mBlock; --index)
100 {
101 unsigned int hash = HashCode(patterns[id].c_str() + index - mBlock, mBlock) % mTableSize;
102 if (mShiftTable[hash] > (mMin - index))
103 {
104 mShiftTable[hash] = mMin - index;
105 }
106 if (index == mMin)
107 {
108 unsigned int prefixHash = HashCode(patterns[id].c_str(), mBlock);
109 mHashTable[hash].push_back(make_pair(prefixHash, id));
110 }
111 }
112 }
113 cout << "Term number : " << mPatterns.size() << endl;
114 return true;
115 }
116
117 /**
118 * @brief destructor
119 */
120 WuManber::~WuManber()
121 {
122 //VOID
123 }
124
125
126 /**
127 * @public
128 * @brief search multiple pattern in text at one time
129 */
130 int WuManber::Search(const char* text, const int textLength, ResultSetType& res)
131 {
132 //hit count: value to be returned
133 int hits = 0;
134 int32_t index = mMin - 1; // start off by matching end of largest common pattern
135
136 int32_t blockMaxIndex = mBlock - 1;
137 int32_t windowMaxIndex = mMin - 1;
138
139 while (index < textLength)
140 {
141 unsigned int blockHash = HashCode(text + index - blockMaxIndex, mBlock);
142 blockHash = blockHash % mTableSize;
143 int shift = mShiftTable[blockHash];
144 if (shift > 0)
145 {
146 index += shift;
147 }
148 else
149 {
150 // we have a potential match when shift is 0
151 unsigned int prefixHash = HashCode(text + index - windowMaxIndex, mBlock);
152 PrefixTableType &element = mHashTable[blockHash];
153 PrefixTableType::iterator iter = element.begin();
154
155 while (element.end() != iter)
156 {
157 if (prefixHash == iter->first)
158 {
159 // since prefindex matches, compare target substring with pattern
160 // we know first two characters already match
161 const char* indexTarget = text + index - windowMaxIndex; //+mBlock
162 const char* indexPattern = mPatterns[iter->second].c_str(); //+mBlock
163
164 while (('\0' != *indexTarget) && ('\0' != *indexPattern))
165 {
166 // match until we reach end of either string
167 if (*indexTarget == *indexPattern)
168 {
169 // match against chosen case sensitivity
170 ++indexTarget;
171 ++indexPattern;
172 }
173 else
174 break;
175 }
176 // match succeed since we reach the end of the pattern.
177 if ('\0' == *indexPattern)
178 {
179 res.insert(string(mPatterns[iter->second]));
180 ++hits;
181 }
182 }//end if
183 ++iter;
184 }//end while
185 ++index;
186 }//end else
187 }//end while
188
189 return hits;
190 }
191
192 /**
193 * Search
194 */
195 int WuManber::Search(const string& str, ResultSetType& res)
196 {
197 return Search(str.c_str(), str.length(), res);
198 }
199
200 /**
201 * Search
202 */
203 int WuManber::Search(const char* text, const int textLength)
204 {
205 //hit count: value to be returned
206 int hits = 0;
207 int index = mMin - 1; // start off by matching end of largest common pattern
208
209 uint32_t blockMaxIndex = mBlock - 1;
210 uint32_t windowMaxIndex = mMin - 1;
211
212 while (index < textLength)
213 {
214 unsigned int blockHash = HashCode(text + index - blockMaxIndex, mBlock);
215 blockHash = blockHash % mTableSize;
216 int shift = mShiftTable[blockHash];
217 if (shift > 0)
218 {
219 index += shift;
220 }
221 else
222 {
223 // we have a potential match when shift is 0
224 unsigned int prefixHash = HashCode(text + index - windowMaxIndex, mBlock);
225 //prefixHash = prefixHash % mTableSize;
226 PrefixTableType &element = mHashTable[blockHash];
227 PrefixTableType::iterator iter = element.begin();
228
229 while (element.end() != iter)
230 {
231 if (prefixHash == iter->first)
232 {
233 // since prefindex matches, compare target substring with pattern
234 // we know first two characters already match
235 const char* indexTarget = text + index - windowMaxIndex; //+mBlock
236 const char* indexPattern = mPatterns[iter->second].c_str(); //+mBlock
237
238 while (('\0' != *indexTarget) && ('\0' != *indexPattern))
239 {
240 // match until we reach end of either string
241 if (*indexTarget == *indexPattern)
242 {
243 // match against chosen case sensitivity
244 ++indexTarget;
245 ++indexPattern;
246 }
247 else
248 break;
249 }
250 // match succeed since we reach the end of the pattern.
251 if ('\0' == *indexPattern)
252 {
253 ++hits;
254 }
255 }//end if
256 ++iter;
257 }//end while
258 ++index;
259 }//end else
260 }//end while
261
262 return hits;
263 }
264
265 int WuManber::Search(const char* text, const int textLength, MatchPosVector &matchPosVector)
266 {
267 //hit count: value to be returned
268 int hits = 0;
269 int index = mMin - 1; // start off by matching end of largest common pattern
270
271 uint32_t blockMaxIndex = mBlock - 1;
272 uint32_t windowMaxIndex = mMin - 1;
273
274 while (index < textLength)
275 {
276 unsigned int blockHash = HashCode(text + index - blockMaxIndex, mBlock);
277 blockHash = blockHash % mTableSize;
278 int shift = mShiftTable[blockHash];
279 if (shift > 0)
280 {
281 index += shift;
282 }
283 else
284 {
285 // we have a potential match when shift is 0
286 unsigned int prefixHash = HashCode(text + index - windowMaxIndex, mBlock);
287 //prefixHash = prefixHash % mTableSize;
288 PrefixTableType &element = mHashTable[blockHash];
289 PrefixTableType::iterator iter = element.begin();
290
291 while (element.end() != iter)
292 {
293 if (prefixHash == iter->first)
294 {
295 // since prefindex matches, compare target substring with pattern
296 // we know first two characters already match
297 const char* indexTarget = text + index - windowMaxIndex; //+mBlock
298 const char* indexPattern = mPatterns[iter->second].c_str(); //+mBlock
299
300 while (('\0' != *indexTarget) && ('\0' != *indexPattern))
301 {
302 // match until we reach end of either string
303 if (*indexTarget == *indexPattern)
304 {
305 // match against chosen case sensitivity
306 ++indexTarget;
307 ++indexPattern;
308 }
309 else
310 break;
311 }
312 // match succeed since we reach the end of the pattern.
313 if ('\0' == *indexPattern)
314 {
315 ++hits;
316 matchPosVector.push_back(index);
317 }
318 }//end if
319 ++iter;
320 }//end while
321 ++index;
322 }//end else
323 }//end while
324
325 return hits;
326 }
327
328 int WuManber::Search(const string& str, MatchPosVector &matchPosVector)
329 {
330 return Search(str.c_str(), str.length(), matchPosVector);
331 }
332
333 int WuManber::Search(const string& str)
334 {
335 return Search(str.c_str(), str.length());
336 }
1 #include <iostream>
2 #include <fstream>
3 #include <string.h>
4 #include <vector>
5 #include <algorithm>
6 //#include "wumanber.h"
7
8 using namespace std;
9
10
11 //WuManber search;
12
13
/**
 * Test driver: reads "test_wumanber.dat" line by line, keeps every line that
 * starts with the marker byte (with the marker removed) as a pattern, and
 * prints the collected patterns one per line.
 *
 * @return 0 on success, 1 when the data file cannot be opened
 */
int main()
{
    // NOTE(review): the marker is the byte value 1, not the digit '1'.
    // Confirm this against the format of the data file.
    const char kPatternMarker = 1;

    std::ifstream readfile("test_wumanber.dat", std::ios::in);
    if (!readfile)
    {
        std::cerr << "Error: cannot open test_wumanber.dat" << std::endl;
        return 1;
    }

    std::vector<std::string> pattern;
    std::string line;
    while (std::getline(readfile, line))
    {
        // empty lines carry no marker; check before looking at line[0]
        if (!line.empty() && line[0] == kPatternMarker)
        {
            pattern.push_back(line.substr(1));
        }
    }

    for (std::vector<std::string>::const_iterator it = pattern.begin();
         it != pattern.end(); ++it)
    {
        std::cout << *it << '\n';
    }

    // The matcher calls below are disabled in this driver:
    //search.Init(pattern);
    //ResultSetType res;
    //cout << search.Search(target, strlen(target), pos) << endl;
    return 0;
}