导航

二元分词(Lucene CJK Analyzer).Net版

Posted on 2006-11-24 20:08  REMING  阅读(2695)  评论(1)  编辑  收藏  举报

由于工作的需要,最近一直在研究Lucene.Net。在测试中我发现,当索引库达到5GB左右的时候,搜索速度会变得奇慢。在网上查找了一些资料,说分词器会影响搜索速度,但又苦于没有好的免费分词器,于是只好改写Java版的CJKAnalyzer,现在把它共享给大家。虽然我很久以前就申请了这个Blog,但是一直没有写什么东西,这篇文章也算是我的处女作,希望今后能够和大家多多交流。

 1
 2/**
 3 * Copyright 2004-2005 The Apache Software Foundation
 4 *
 5 * Licensed under the Apache License, Version 2.0 (the "License");
 6 * you may not use this file except in compliance with the License.
 7 * You may obtain a copy of the License at
 8 *
 9 *     http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */

17using System;
18using System.Collections;
19using System.IO;
20
21using Lucene.Net.Analysis;
22
namespace NSharp.SearchEngine.Lucene.Analysis.Cjk
{
    /**
     * Analyzer for CJK text: tokenizes the input with CJKTokenizer and then
     * removes stop words from the resulting stream with StopFilter.
     *
     * @author Che, Dong
     */
    public class CJKAnalyzer : Analyzer
    {
        //~ Static fields/initializers ---------------------------------------------

        /**
         * Default stop word list: common English words that are not usually
         * useful for searching, plus the empty string so that zero-length
         * tokens are dropped by the stop filter.
         */
        public static string[] STOP_WORDS = {
            "a", "and", "are", "as", "at", "be",
            "but", "by", "for", "if", "in",
            "into", "is", "it", "no", "not",
            "of", "on", "or", "s", "such", "t",
            "that", "the", "their", "then",
            "there", "these", "they", "this",
            "to", "was", "will", "with", "",
            "www"
        };

        //~ Instance fields --------------------------------------------------------

        /**
         * Stop word set handed to every StopFilter this analyzer creates.
         * Built once in the constructor and never replaced afterwards.
         */
        private readonly Hashtable stopTable;

        //~ Constructors -----------------------------------------------------------

        /**
         * Builds an analyzer which removes the words in {@link #STOP_WORDS}.
         */
        public CJKAnalyzer() : this(STOP_WORDS)
        {
        }

        /**
         * Builds an analyzer which removes the words in the provided array.
         *
         * @param stopWords stop word array
         */
        public CJKAnalyzer(string[] stopWords)
        {
            stopTable = StopFilter.MakeStopSet(stopWords);
        }

        //~ Methods ----------------------------------------------------------------

        /**
         * Creates the token stream for a field: a CJKTokenizer over the
         * reader, wrapped in a StopFilter that uses this analyzer's stop set.
         *
         * @param fieldName lucene field name (not used by this analyzer)
         * @param reader    input reader
         * @return the stop-filtered token stream
         */
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            return new StopFilter(new CJKTokenizer(reader), stopTable);
        }
    }
}

  1
  2
  3/**
  4 * Copyright 2004-2005 The Apache Software Foundation
  5 *
  6 * Licensed under the Apache License, Version 2.0 (the "License");
  7 * you may not use this file except in compliance with the License.
  8 * You may obtain a copy of the License at
  9 *
 10 *     http://www.apache.org/licenses/LICENSE-2.0
 11 *
 12 * Unless required by applicable law or agreed to in writing, software
 13 * distributed under the License is distributed on an "AS IS" BASIS,
 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 * See the License for the specific language governing permissions and
 16 * limitations under the License.
 17 */

 18
 19using System;
 20using System.Collections;
 21using System.IO;
 22
 23using Lucene.Net.Analysis;
 24
/**
 * CJKTokenizer was modified from StopTokenizer, which does a decent job for
 * most European languages. It tokenizes double-byte characters differently:
 * a token is returned for every two characters, with overlapping matches.<br>
 * Example: "java C1C2C3C4" will be segmented into: "java" "C1C2" "C2C3" "C3C4";
 * zero-length tokens ("") also need to be filtered out.<br>
 * Digits, '+' and '#' are tokenized as letters.<br>
 * For more information on Asian-language (Chinese, Japanese, Korean) text
 * segmentation, please search <a
 * href="http://www.google.com/search?q=word+chinese+segment">google</a>
 *
 * @author Che, Dong
 */

 38namespace NSharp.SearchEngine.Lucene.Analysis.Cjk
 39{
 40    public  class CJKTokenizer:Tokenizer 
 41    {
 42        //~ Static fields/initializers ---------------------------------------------
 43
 44        /** Max word length */
 45        private static int MAX_WORD_LEN = 255;
 46
 47        /** buffer size: */
 48        private static int IO_BUFFER_SIZE = 256;
 49
 50        //~ Instance fields --------------------------------------------------------
 51
 52        /** word offset, used to imply which character(in ) is parsed */
 53        private int offset = 0;
 54
 55        /** the index used only for ioBuffer */
 56        private int bufferIndex = 0;
 57
 58        /** data length */
 59        private int dataLen = 0;
 60
 61        /**
 62         * character buffer, store the characters which are used to compose <br>
 63         * the returned Token
 64         */

 65        private  char[] buffer = new char[MAX_WORD_LEN];
 66
 67        /**
 68         * I/O buffer, used to store the content of the input(one of the <br>
 69         * members of Tokenizer)
 70         */

 71        private  char[] ioBuffer = new char[IO_BUFFER_SIZE];
 72
 73        /** word type: single=>ASCII  double=>non-ASCII word=>default */
 74        private string tokenType = "word";
 75
 76        /**
 77         * tag: previous character is a cached double-byte character  "C1C2C3C4"
 78         * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
 79         * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
 80         */

 81        private bool preIsTokened = false;
 82
 83        //~ Constructors -----------------------------------------------------------
 84
 85        /**
 86         * Construct a token stream processing the given input.
 87         *
 88         * @param in I/O reader
 89         */

 90        public CJKTokenizer(TextReader reader) 
 91        {
 92            input = reader;
 93        }

 94
 95        //~ Methods ----------------------------------------------------------------
 96
 97        /**
            * Returns the next token in the stream, or null once the input is
            * exhausted and no characters are pending in the token buffer.
            *
            * Characters are pulled from the reader in chunks of ioBuffer.Length.
            * Each character is routed to one of two branches: the single-byte /
            * fullwidth branch (IsAscii or IsHALFWIDTH_AND_FULLWIDTH_FORMS) or the
            * non-ASCII branch. The bodies of both branches are collapsed in this
            * listing, so the exact token-boundary rules are not visible here.
            *
            * The returned token carries the text in buffer[0..length), the
            * offsets start and start + length, and the current tokenType. When
            * input ends while preIsTokened is set, length is reset to 0 first, so
            * a zero-length token can be returned.
            *
            * @return the next Token, or null at end of stream
            *
            * NOTE(review): the original Java documentation listed
            * java.io.IOException here; any exception raised by input.Read simply
            * propagates to the caller.
            */

108        public override Token Next()
109        {
110            /** number of characters stored in buffer so far for this token */
111            int length = 0;
112
113            /** start offset used when the Token is created */
114            int start = offset;
115
116            while (true
117            {
118                /** current character */
119                char c;
120
121            
122                offset++;
123
124                /*
                    Original refill logic, as ported directly from Java:

                    if (bufferIndex >= dataLen)
                    {
                           dataLen = input.read(ioBuffer);
                           bufferIndex = 0;
                    }

                    Original author's note: Java's read does not fail when it
                    reaches the end of the input, but .NET's does.
                 */

131
                // Refill when the current chunk is used up. A new read is issued
                // only before the first read (dataLen == 0) or after a completely
                // full chunk; a chunk shorter than ioBuffer.Length is taken to be
                // the last one, and dataLen is forced to 0 to signal end of input.
                // NOTE(review): TextReader.Read(char[], int, int) is documented to
                // return 0 at end of stream rather than throw, and it may return
                // fewer characters than requested before the end. With such a
                // reader this heuristic reports end of input early (a token is cut
                // at the chunk boundary, or null is returned while data remains).
                // Confirm against the readers actually used.
132                if (bufferIndex >= dataLen ) 
133                {
134                    if (dataLen==0 || dataLen>=ioBuffer.Length)//Original author's note: Java's read does not fail at the end of the input but .NET's does, so this check is here to intercept the exception
135                    {
136                        dataLen = input.Read(ioBuffer,0,ioBuffer.Length);
137                        bufferIndex = 0;
138                    }

139                    else
140                    {
141                        dataLen=0;
142                    }

143                }

144
                // dataLen == 0 means end of input: flush what is pending, or
                // return null when nothing is pending.
145                if (dataLen ==0
146                {
147                    if (length > 0
148                    {
                            // The pending character was already emitted as part of
                            // the previous token, so discard it; the token built
                            // after the loop then has zero length.
149                        if (preIsTokened == true
150                        {
151                            length = 0;
152                            preIsTokened = false;
153                        }

154
155                        break;
156                    }
 
157                    else 
158                    {
159                        return null;
160                    }

161                }
 
162                else 
163                {
164                    //fetch the current character and advance within the chunk
165                    c = ioBuffer[bufferIndex++];
166                }

167
168                //single-byte characters (code below 256) and the U+FF00..U+FFEF block share this branch
169                if (IsAscii(c) || IsHALFWIDTH_AND_FULLWIDTH_FORMS(c))
170                {
171                    if (IsHALFWIDTH_AND_FULLWIDTH_FORMS(c)) 
172                    {
173                        /** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN by subtracting 65248 (0xFEE0) */
                            // NOTE(review): the subtraction is applied to the whole
                            // U+FF00..U+FFEF range. Only the fullwidth ASCII variants
                            // U+FF01..U+FF5E land on U+0021..U+007E; other code points
                            // in the block (for example halfwidth katakana) map to
                            // unrelated characters. Confirm this is intended.
174                        int i = (int) c;
175                        i = i - 65248;
176                        c = (char) i;
177                    }

178                    if the current character is a letter or "_" "+" "#
236
237                }
 
238                else 
239                {
240                    // non-ASCII letter, eg."C1C2C3C4"
291                }

292            }

293
294            return new Token(new String(buffer, 0, length), start, start + length,
295                tokenType
296                );
297        }

298
299        public bool     IsAscii(char c)
300        {
301            return c<256 && c>=0;
302        }

303        
304        public bool IsHALFWIDTH_AND_FULLWIDTH_FORMS(char c)
305        {
306            return c<=0xFFEF && c>=0xFF00;
307        }

308    }

309}