基于Lucene 4.x的ik-analyzer

1 import java.io.Reader;
2 import org.apache.lucene.analysis.Analyzer;
3 import org.apache.lucene.analysis.Tokenizer;
4
5 /**
6  * 实现Lucene Analyzer 基于IKTokenizer的中文分词器
7  *
8  * @author 林良益
9  *
10  */
11 public final class IKAnalyzer extends Analyzer {
12
13     private boolean isMaxWordLength = false;
14
15     /**
16      * IK分词器Lucene Analyzer接口实现类默认最细粒度切分算法
17      */
18     public IKAnalyzer() {
19         this(false);
20     }
21
22     /**
23      * IK分词器Lucene Analyzer接口实现类
24      *
25      * @param isMaxWordLength
26      *            当为true时，分词器进行最大词长切分
27      */
28     public IKAnalyzer(boolean isMaxWordLength) {
29         super();
30         this.setMaxWordLength(isMaxWordLength);
31     }
32
33     @Override
34     public TokenStreamComponents createComponents(String fieldName,
35             Reader reader) {
36         Tokenizer tokenizer = new IKTokenizer(reader, isMaxWordLength());
37         return new TokenStreamComponents(tokenizer, null);
38     }
39
40     public void setMaxWordLength(boolean isMaxWordLength) {
41         this.isMaxWordLength = isMaxWordLength;
42     }
43
44     public boolean isMaxWordLength() {
45         return isMaxWordLength;
46     }
47
48 }

1 import java.io.IOException;
2 import java.io.Reader;
3
4 import org.apache.lucene.analysis.Tokenizer;
5 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
6 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
7 import org.wltea.analyzer.IKSegmentation;
8 import org.wltea.analyzer.Lexeme;
9
10 /**
11  * IK Analyzer v3.2 Lucene4.x Tokenizer适配器类它封装了IKSegmentation实现
12  *
13  * @author 林良益
14  *
15  */
16 public final class IKTokenizer extends Tokenizer {
17     // IK分词器实现
18     private IKSegmentation _IKImplement;
19     // 词元文本属性
20     private CharTermAttribute termAtt;
21     // 词元位移属性
22     private OffsetAttribute offsetAtt;
23     // 记录最后一个词元的结束位置
24     private int finalOffset;
25
26     /**
27      * Lucene Tokenizer适配器类构造函数
28      *
29      * @param in
30      * @param isMaxWordLength
31      *            当为true时，分词器进行最大词长切分；当为false是，采用最细粒度切分
32      */
33     public IKTokenizer(Reader in, boolean isMaxWordLength) {
34         super(in);
35         offsetAtt = addAttribute(OffsetAttribute.class);
36         termAtt = addAttribute(CharTermAttribute.class);
37         _IKImplement = new IKSegmentation(in, isMaxWordLength);
38     }
39
40     @Override
41     public final boolean incrementToken() throws IOException {
42         // 清除所有的词元属性
43         clearAttributes();
44         Lexeme nextLexeme = _IKImplement.next();
45         if (nextLexeme != null) {
46             // 将Lexeme转成Attributes
47             // 设置词元文本
48             termAtt.setEmpty().append(nextLexeme.getLexemeText());
49             // 设置词元位移
50             offsetAtt.setOffset(nextLexeme.getBeginPosition(),
51                     nextLexeme.getEndPosition());
52             offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));
53             finalOffset = nextLexeme.getEndPosition();
54             // 返会true告知还有下个词元
55             return true;
56         }
57         // 返会false告知词元输出完毕
58         return false;
59     }
60
61     /*
62      * (non-Javadoc)
63      *
64      * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
65      */
66     public void reset() throws IOException {
67         super.reset();
68         _IKImplement.reset(input);
69     }
70
71     @Override
72     public final void end() {
73         offsetAtt.setOffset(finalOffset, finalOffset);
74     }

75 }

1 import java.io.Reader;
2 import java.util.Map;
3
4 import org.apache.lucene.analysis.Tokenizer;
5 import org.apache.lucene.analysis.util.TokenizerFactory;
6 import org.wltea.analyzer.lucene.IKTokenizer;
7
8 /**
9  * 实现Solr4.x分词器接口
10  * 基于IKTokenizer的实现
11  *
12  * @author 林良益、李良杰
13  *
14  */
15 public final class IKTokenizerFactory extends TokenizerFactory{
16
17     private boolean isMaxWordLength = false;
18
19     /**
20      * IK分词器Solr TokenizerFactory接口实现类
21      * 默认最细粒度切分算法
22      */
23     public IKTokenizerFactory(){
24     }
25
26     /*
27      * (non-Javadoc)
28      * @see org.apache.solr.analysis.BaseTokenizerFactory#init(java.util.Map)
29      */
30     public void init(Map<String,String> args){
31         String _arg = args.get("isMaxWordLength");
32         isMaxWordLength = Boolean.parseBoolean(_arg);
33     }
34
35     /*
36      * (non-Javadoc)
37      * @see org.apache.solr.analysis.TokenizerFactory#create(java.io.Reader)
38      */
39     public Tokenizer create(Reader reader) {
40         return new IKTokenizer(reader , isMaxWordLength());
41     }
42
43     public void setMaxWordLength(boolean isMaxWordLength) {
44         this.isMaxWordLength = isMaxWordLength;
45     }
46
47     public boolean isMaxWordLength() {
48         return isMaxWordLength;
49     }
50 }

公告