1 package com.lucene.util;
2
3 import java.io.Reader;
4
5 import org.apache.lucene.analysis.Analyzer;
6 import org.apache.lucene.analysis.TokenStream;
7
8 import com.chenlb.mmseg4j.Dictionary;
9 import com.chenlb.mmseg4j.MaxWordSeg;
10 import com.chenlb.mmseg4j.analysis.MMSegTokenizer;
11
12 public class MySameworkAnalyzer extends Analyzer {
13
14 @Override
15 public TokenStream tokenStream(String str, Reader reader) {
16 //获取中文分词器的字段,我这里使用的是MMSeg4j的中文分词器
17 Dictionary dic=Dictionary.getInstance("F:\\官方包\\lucene-3.5.0\\mmseg4j-1.8.5\\data");
18 return new MySameworkFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
19 }
20
21 }
1 @Test
2 public void test05(){
3 try {
4 Analyzer a1=new MySameworkAnalyzer();
5 String str="我来自中国,我的名字叫什么";
6 AnalyzerUtil.displayToken(str, a1);
7 Directory directory=new RAMDirectory();
8 IndexWriter indexWriter=new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, a1));
9 Document document=new Document();
10 document.add(new Field("content", str,Field.Store.YES,Field.Index.ANALYZED));
11 indexWriter.addDocument(document);
12 indexWriter.close();
13 IndexReader indexReader=IndexReader.open(directory);
14 IndexSearcher searcher=new IndexSearcher(indexReader);
15 TopDocs tds=searcher.search(new TermQuery(new Term("content", "大陆")), 10);
16 ScoreDoc[] docs=tds.scoreDocs;
17 Document doc=searcher.doc(docs[0].doc);
18 System.out.println(doc.get("content"));
19 searcher.close();
20 indexReader.close();
21 } catch (CorruptIndexException e) {
22 e.printStackTrace();
23 } catch (LockObtainFailedException e) {
24 e.printStackTrace();
25 } catch (IOException e) {
26 e.printStackTrace();
27 }
28 }
1 package com.lucene.util;
2
3 import java.io.IOException;
4 import java.util.HashMap;
5 import java.util.Map;
6 import java.util.Stack;
7
8 import org.apache.lucene.analysis.TokenFilter;
9 import org.apache.lucene.analysis.TokenStream;
10 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
11 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
12 import org.apache.lucene.util.AttributeSource;
13
14 public class MySameworkFilter extends TokenFilter {
15
16 //保存相应的词汇
17 private CharTermAttribute cta=null;
18 //保存词与词之间的位置增量
19 private PositionIncrementAttribute pia=null;
20 //定义一个状态
21 private AttributeSource.State current=null;
22 //用栈保存同义词集合
23 private Stack<String> sames=null;
24 protected MySameworkFilter(TokenStream input) {
25 super(input);
26 cta=this.addAttribute(CharTermAttribute.class);
27 pia=this.addAttribute(PositionIncrementAttribute.class);
28 sames=new Stack<String>();
29 }
30
31
32 @Override
33 public boolean incrementToken() throws IOException {
34 if(sames.size()>0){
35 //将元素出栈,并获取同义词
36 String str=sames.pop();
37 //还原状态
38 restoreState(current);
39 //先清空,再添加
40 cta.setEmpty();
41 cta.append(str);
42 //设置位置为0,表示同义词
43 pia.setPositionIncrement(0);
44 return true;
45 }
46
47 if(!this.input.incrementToken())
48 return false;
49
50 //如果改词中有同义词,捕获当前状态
51 if(this.getSamewords(cta.toString())){
52 current=captureState();
53 }
54
55 return true;
56 }
57
58 //定义同义词字典,并判断如果有同义词就返回true
59 private boolean getSamewords(String key){
60 Map<String, String[]> maps=new HashMap<String, String[]>();
61 maps.put("我", new String[]{"咱","俺"});
62 maps.put("中国", new String[]{"大陆","天朝"});
63
64 if(maps.get(key)!=null){
65 for(String s:maps.get(key)){
66 sames.push(s);
67 }
68 }
69
70 if(sames.size()>0){
71 return true;
72 }
73 return false;
74 }
75
76 }