1.
1 package com.home.utils; 2 3 import java.io.StringReader; 4 5 import org.apache.lucene.analysis.Analyzer; 6 import org.apache.lucene.analysis.TokenStream; 7 import org.apache.lucene.analysis.cjk.CJKAnalyzer; 8 import org.apache.lucene.analysis.cn.ChineseAnalyzer; 9 import org.apache.lucene.analysis.standard.StandardAnalyzer; 10 import org.apache.lucene.analysis.tokenattributes.TermAttribute; 11 import org.apache.lucene.util.Version; 12 import org.junit.Test; 13 /** 14 * 主要針對汉语 15 * 英语 16 * 17 * 汉语 18 * @author Administrator 19 * 20 */ 21 public class AnalyzerTest { 22 23 /** 24 * 经过该方法可以把分词后的结果输出 25 * @param analyzer 26 * @param text 27 * @throws Exception 28 */ 29 private void testAnalyzer(Analyzer analyzer,String text)throws Exception{ 30 //分词器做好处理之后得到的一个流,这个流中存储了分词的各种信息.可以通过TokenStream有效的获取到分词单元 31 TokenStream tokenStream = analyzer.tokenStream("content",new StringReader(text)); 32 //获取每个单词信息 33 tokenStream.addAttribute(TermAttribute.class); 34 //对语汇单元进行处理过滤,是进行一个incrementToken判断,这是相当于对语汇单元的(if语句)一次处理 35 while(tokenStream.incrementToken()){ 36 // 表示token的字符串信息。比如"I'm" 37 TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class); 38 System.out.println(termAttribute.term()); 39 } 40 } 41 42 @Test 43 public void testEn() throws Exception{ 44 /** 45 * Creates a searcher searching the index in the named directory 46 */ 47 /** 48 * 1、切分关键词 49 * 2、去掉停用词 50 * 3、把大写转化成小写 51 */ 52 53 String text = "Creates a searcher searching the index in the named directory"; 54 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); 55 56 this.testAnalyzer(analyzer, text); 57 } 58 59 @Test 60 public void testZH() throws Exception{ 61 /** 62 * 单字分词 63 */ 64 Analyzer analyzer = new ChineseAnalyzer(); 65 String text = "传智播客的黎活明是UFO"; 66 this.testAnalyzer(analyzer, text); 67 } 68 69 @Test 70 public void testZH2() throws Exception{ 71 /** 72 * 单字分词 73 */ 74 75 Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30); 76 String text = "传智播客的黎活明是UFO"; 77 this.testAnalyzer(analyzer, text); 78 } 79 80 // @Test 81 // public void testZH3() throws Exception{ 82 // Analyzer analyzer = new IKAnalyzer(); 83 // String text = "北京美女"; 84 // this.testAnalyzer(analyzer, text); 85 // } 86 87 88 89 }