6.lucene入门-分词器

 1 package com.home.utils;
 2 
 3 import java.io.StringReader;
 4 
 5 import org.apache.lucene.analysis.Analyzer;
 6 import org.apache.lucene.analysis.TokenStream;
 7 import org.apache.lucene.analysis.cjk.CJKAnalyzer;
 8 import org.apache.lucene.analysis.cn.ChineseAnalyzer;
 9 import org.apache.lucene.analysis.standard.StandardAnalyzer;
10 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
11 import org.apache.lucene.util.Version;
12 import org.junit.Test;
13 /**
14  * 主要針對汉语
15  * 英语
16  *   
17  * 汉语
18  * @author Administrator
19  *
20  */
21 public class AnalyzerTest {
22     
23     /**
24       * 经过该方法可以把分词后的结果输出
25       * @param analyzer
26       * @param text
27       * @throws Exception
28       */
29      private void testAnalyzer(Analyzer analyzer,String text)throws Exception{
30          //分词器做好处理之后得到的一个流,这个流中存储了分词的各种信息.可以通过TokenStream有效的获取到分词单元
31          TokenStream tokenStream = analyzer.tokenStream("content",new StringReader(text));
32         //获取每个单词信息
33          tokenStream.addAttribute(TermAttribute.class);
34          //对语汇单元进行处理过滤，是进行一个incrementToken判断，这是相当于对语汇单元的（if语句）一次处理
35          while(tokenStream.incrementToken()){
36             // 表示token的字符串信息。比如"I'm"
37              TermAttribute termAttribute =  tokenStream.getAttribute(TermAttribute.class);
38              System.out.println(termAttribute.term());
39          }
40      }
41      
42      @Test
43      public void testEn() throws Exception{
44          /**
45           * Creates a searcher searching the index in the named directory
46           */
47           /**
48           * 1、切分关键词
49           * 2、去掉停用词
50           * 3、把大写转化成小写
51           */
52          
53           String text = "Creates a searcher searching the index in the named directory";
54           Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
55           
56           this.testAnalyzer(analyzer, text);
57      }
58      
59      @Test
60      public void testZH() throws Exception{
61       /**
62       * 单字分词
63       */
64          Analyzer analyzer = new ChineseAnalyzer();
65           String text = "传智播客的黎活明是UFO";
66           this.testAnalyzer(analyzer, text);
67      }
68      
69      @Test
70      public void testZH2() throws Exception{
71       /**
72       * 单字分词
73       */
74          
75          Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
76           String text = "传智播客的黎活明是UFO";
77           this.testAnalyzer(analyzer, text);
78      }
79      
80 //     @Test
81 //     public void testZH3() throws Exception{
82 //      Analyzer analyzer = new IKAnalyzer();
83 //      String text = "北京美女";
84 //      this.testAnalyzer(analyzer, text);
85 //     }
86      
87      
88      
89 }
posted on 2016-10-24 12:09 Sharpest 阅读(783) 评论(0) 收藏举报