二元分词(Lucene CJK Analyzer).Net版

Posted on 2006-11-24 20:08 REMING 阅读(2695) 评论(1) 编辑收藏举报

由于工作的需要，最近一直在研究Lucene.Net。在测试中我发现，当索引库达到5GB左右的时候，搜索速度会变得奇慢。在网上查找了一些资料，说分词器会影响搜索速度，但又苦于没有好的免费分词器，于是只好改写Java版的CJKAnalyzer，现在把它共享给大家。虽然我很久以前就申请了这个Blog，但是一直没有写什么东西，这篇文章也算是我的处女作，希望今后能够和大家多多交流。

 1
 2/**//**
 3 * Copyright 2004-2005 The Apache Software Foundation
 4 *
 5 * Licensed under the Apache License, Version 2.0 (the "License");
 6 * you may not use this file except in compliance with the License.
 7 * You may obtain a copy of the License at
 8 *
 9 *     http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17using System;
18using System.Collections;
19using System.IO;
20
21using Lucene.Net.Analysis;
22
23namespace NSharp.SearchEngine.Lucene.Analysis.Cjk
24{
25    /**//**
26     * Filters CJKTokenizer with StopFilter.
27     *
28     * @author Che, Dong
29     */
30    public class CJKAnalyzer:Analyzer 
31    {
32        //~ Static fields/initializers ---------------------------------------------
33
34        /**//**
35         * An array containing some common English words that are not usually
36         * useful for searching and some double-byte interpunctions.
37         */
38        public  static string[] STOP_WORDS = {
39                                                 "a", "and", "are", "as", "at", "be",
40                                                 "but", "by", "for", "if", "in",
41                                                 "into", "is", "it", "no", "not",
42                                                 "of", "on", "or", "s", "such", "t",
43                                                 "that", "the", "their", "then",
44                                                 "there", "these", "they", "this",
45                                                 "to", "was", "will", "with", "",
46                                                 "www"
47                                             };
48
49        //~ Instance fields --------------------------------------------------------
50
51        /**//**
52         * stop word list
53         */
54        private Hashtable stopTable;
55
56        //~ Constructors -----------------------------------------------------------
57
58        /**//**
59         * Builds an analyzer which removes words in {@link #STOP_WORDS}.
60         */
61        public CJKAnalyzer() 
62        {
63            stopTable = StopFilter.MakeStopSet(STOP_WORDS);
64        }
65
66        /**//**
67         * Builds an analyzer which removes words in the provided array.
68         *
69         * @param stopWords stop word array
70         */
71        public CJKAnalyzer(string[] stopWords) 
72        {
73            stopTable = StopFilter.MakeStopSet(stopWords);
74        }
75
76        //~ Methods ----------------------------------------------------------------
77
78        /**//**
79         * get token stream from input
80         *
81         * @param fieldName lucene field name
82         * @param reader    input reader
83         * @return TokenStream
84         */
85        public override TokenStream TokenStream(string fieldName, TextReader reader) 
86        {
87            TokenStream ts=new CJKTokenizer(reader);
88            return new StopFilter(ts, stopTable);
89            //return new StopFilter(new CJKTokenizer(reader), stopTable);
90        }
91    }
92}

  1
  2
  3/**//**
  4 * Copyright 2004-2005 The Apache Software Foundation
  5 *
  6 * Licensed under the Apache License, Version 2.0 (the "License");
  7 * you may not use this file except in compliance with the License.
  8 * You may obtain a copy of the License at
  9 *
 10 *     http://www.apache.org/licenses/LICENSE-2.0
 11 *
 12 * Unless required by applicable law or agreed to in writing, software
 13 * distributed under the License is distributed on an "AS IS" BASIS,
 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 * See the License for the specific language governing permissions and
 16 * limitations under the License.
 17 */
 18
 19using System;
 20using System.Collections;
 21using System.IO;
 22
 23using Lucene.Net.Analysis;
 24
 25/**//**
 26 * CJKTokenizer was modified from StopTokenizer which does a decent job for
 27 * most European languages. It performs other token methods for double-byte
 28 * characters: a token is returned for every two characters, with overlap.<br>
 29 * Example: "java C1C2C3C4" will be segmented into: "java" "C1C2" "C2C3" "C3C4"; it
 30 * also needs to filter out zero-length tokens ""<br>
 31 * for Digit: digit, '+', '#' will token as letter<br>
 32 * for more info on Asia language(Chinese Japanese Korean) text segmentation:
 33 * please search  <a
 34 * href="http://www.google.com/search?q=word+chinese+segment">google</a>
 35 *
 36 * @author Che, Dong
 37 */
 38namespace NSharp.SearchEngine.Lucene.Analysis.Cjk
 39{
 40    public  class CJKTokenizer:Tokenizer 
 41    {
 42        //~ Static fields/initializers ---------------------------------------------
 43
 44        /**//** Max word length */
 45        private static int MAX_WORD_LEN = 255;
 46
 47        /**//** buffer size: */
 48        private static int IO_BUFFER_SIZE = 256;
 49
 50        //~ Instance fields --------------------------------------------------------
 51
 52        /**//** word offset, used to imply which character(in ) is parsed */
 53        private int offset = 0;
 54
 55        /**//** the index used only for ioBuffer */
 56        private int bufferIndex = 0;
 57
 58        /**//** data length */
 59        private int dataLen = 0;
 60
 61        /**//**
 62         * character buffer, store the characters which are used to compose <br>
 63         * the returned Token
 64         */
 65        private  char[] buffer = new char[MAX_WORD_LEN];
 66
 67        /**//**
 68         * I/O buffer, used to store the content of the input(one of the <br>
 69         * members of Tokenizer)
 70         */
 71        private  char[] ioBuffer = new char[IO_BUFFER_SIZE];
 72
 73        /**//** word type: single=>ASCII  double=>non-ASCII word=>default */
 74        private string tokenType = "word";
 75
 76        /**//**
 77         * tag: previous character is a cached double-byte character  "C1C2C3C4"
 78         * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
 79         * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
 80         */
 81        private bool preIsTokened = false;
 82
 83        //~ Constructors -----------------------------------------------------------
 84
 85        /**//**
 86         * Construct a token stream processing the given input.
 87         *
 88         * @param in I/O reader
 89         */
 90        public CJKTokenizer(TextReader reader) 
 91        {
 92            input = reader;
 93        }
 94
 95        //~ Methods ----------------------------------------------------------------
 96
 97        /**//**
 98         * Returns the next token in the stream, or null at EOS.
 99         * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
100         * for detail.
101         *
102         * @return Token
103         *
104         * @throws java.io.IOException - throw IOException when read error <br>
105         *         hanppened in the InputStream
106         *
107         */
108        public override Token Next()
109        {
110            /**//** how many character(s) has been stored in buffer */
111            int length = 0;
112
113            /**//** the position used to create Token */
114            int start = offset;
115
116            while (true) 
117            {
118                /**//** current charactor */
119                char c;
120
121            
122                offset++;
123
124                /**//*
125                 if (bufferIndex >= dataLen) 
126                 {
127                        dataLen = input.read(ioBuffer); //Java中read读到最后不会出错，但.Net会，
128                        bufferIndex = 0;
129                 }
130                 */
131
132                if (bufferIndex >= dataLen ) 
133                {
134                    if (dataLen==0 || dataLen>=ioBuffer.Length)//Java中read读到最后不会出错，但.Net会，所以此处是为了拦截异常
135                    {
136                        dataLen = input.Read(ioBuffer,0,ioBuffer.Length);
137                        bufferIndex = 0;
138                    }
139                    else
140                    {
141                        dataLen=0;
142                    }
143                }
144
145                if (dataLen ==0) 
146                {
147                    if (length > 0) 
148                    {
149                        if (preIsTokened == true) 
150                        {
151                            length = 0;
152                            preIsTokened = false;
153                        }
154
155                        break;
156                    } 
157                    else 
158                    {
159                        return null;
160                    }
161                } 
162                else 
163                {
164                    //get current character
165                    c = ioBuffer[bufferIndex++];
166                }
167
168                //if the current character is ASCII or Extend ASCII
169                if (IsAscii(c) || IsHALFWIDTH_AND_FULLWIDTH_FORMS(c))
170                {
171                    if (IsHALFWIDTH_AND_FULLWIDTH_FORMS(c)) 
172                    {
173                        /**//** convert  HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
174                        int i = (int) c;
175                        i = i - 65248;
176                        c = (char) i;
177                    }
178                    if the current character is a letter or "_" "+" "##region if the current character is a letter or "_" "+" "#
179
180                    // if the current character is a letter or "_" "+" "#"
181                    if (char.IsLetterOrDigit(c) || ((c == '_') || (c == '+') || (c == '#'))) 
182                    {
183
184                        if (length == 0) 
185                        {
186                            // "javaC1C2C3C4linux" <br>
187                            //      ^--: the current character begin to token the ASCII
188                            // letter
189                            start = offset - 1;
190                        } 
191                        else if (tokenType == "double") 
192                        {
193                            // "javaC1C2C3C4linux" <br>
194                            //              ^--: the previous non-ASCII
195                            // : the current character
196                            offset--;
197                            bufferIndex--;
198                            tokenType = "single";
199
200                            if (preIsTokened == true) 
201                            {
202                                // there is only one non-ASCII has been stored
203                                length = 0;
204                                preIsTokened = false;
205
206                                break;
207                            } 
208                            else 
209                            {
210                                break;
211                            }
212                        }
213
214                        // store the LowerCase(c) in the buffer
215                        buffer[length++] = char.ToLower(c);
216                        tokenType = "single";
217                        // break the procedure if buffer overflowed!
218                        if (length == MAX_WORD_LEN) 
219                        {
220                            break;
221                        }
222                    } 
223                    else if (length > 0) 
224                    {
225                        if (preIsTokened == true) 
226                        {
227                            length = 0;
228                            preIsTokened = false;
229                        } 
230                        else 
231                        {
232                            break;
233                        }
234                    }
235                    #endregion
236
237                } 
238                else 
239                {
240                    // non-ASCII letter, eg."C1C2C3C4"#region // non-ASCII letter, eg."C1C2C3C4"
241
242                    // non-ASCII letter, eg."C1C2C3C4"
243                    if (char.IsLetter(c)) 
244                    {
245                        if (length == 0) 
246                        {
247                            start = offset - 1;
248                            buffer[length++] = c;
249                            tokenType = "double";
250                        } 
251                        else 
252                        {
253                            if (tokenType == "single") 
254                            {
255                                offset--;
256                                bufferIndex--;
257
258                                //return the previous ASCII characters
259                                break;
260                            } 
261                            else 
262                            {
263                                buffer[length++] = c;
264                                tokenType = "double";
265
266                                if (length == 2) 
267                                {
268                                    offset--;
269                                    bufferIndex--;
270                                    preIsTokened = true;
271
272                                    break;
273                                }
274                            }
275                        }
276                    } 
277                    else if (length > 0) 
278                    {
279                        if (preIsTokened == true) 
280                        {
281                            // empty the buffer
282                            length = 0;
283                            preIsTokened = false;
284                        } 
285                        else 
286                        {
287                            break;
288                        }
289                    }
290                    #endregion
291                }
292            }
293
294            return new Token(new String(buffer, 0, length), start, start + length,
295                tokenType
296                );
297        }
298
299        public bool     IsAscii(char c)
300        {
301            return c<256 && c>=0;
302        }
303        
304        public bool IsHALFWIDTH_AND_FULLWIDTH_FORMS(char c)
305        {
306            return c<=0xFFEF && c>=0xFF00;
307        }
308    }
309}

会员力量，点亮园子希望

刷新页面返回顶部

导航

公告

二元分词(Lucene CJK Analyzer).Net版