Lucene笔记
概述
Lucene最主要做两件事:建立索引和进行搜索。以下介绍Lucene常用的类或接口:
IndexWriter:Lucene中负责将文档加入索引的类,同时控制索引过程中使用的一些参数;
Analyzer:分析器,主要用于分析搜索引擎遇到的各种文本。常用的有StandardAnalyzer分析器,StopAnalyzer分析器,WhitespaceAnalyzer分析器等;
Directory:索引存放的位置;lucene提供了两种索引存放的位置,一种是磁盘,一种是内存。一般情况将索引放在磁盘上;相应地lucene提供了FSDirectory和RAMDirectory两个类;
Document:文档;Document相当于一个要进行索引的单元,任何想要被索引的文件都必须先转化为Document对象才能进行索引。
Field:字段。一个Document由若干个Field组成,每个Field包含字段名和字段值(如下文示例中的StringField、TextField)。
IndexSearcher:是lucene中最基本的检索工具,所有的检索都会用到IndexSearcher工具;
Query:查询。Lucene支持模糊查询、语义查询、短语查询、组合查询等多种查询方式,对应的类有TermQuery、BooleanQuery、RangeQuery、WildcardQuery等。
QueryParser: 是一个解析用户输入的工具,可以通过扫描用户输入的字符串,生成Query对象。
Hits:在搜索完成之后,需要把搜索结果返回并显示给用户,只有这样才算是完成搜索的目的。在Lucene中,搜索结果的集合是用Hits类的实例来表示的(下文示例中使用的是TopDocs)。
具体使用
1.构建索引
1.1添加文档
/** * 生成索引 * @throws Exception */ @Test public void index() throws Exception { directory = FSDirectory.open(Paths.get("D:\\AllFiles\\lucene01")); IndexWriter writer = getWriter(); for (int i = 0; i < ids.length; i++) { Document document = new Document(); document.add(new StringField("id", ids[i], Field.Store.YES)); document.add(new StringField("author", authors[i], Field.Store.YES)); document.add(new StringField("position", positions[i], Field.Store.YES)); //加权操作 TextField field = new TextField("title", titles[i],Field.Store.YES); if("boss".equals(positions[i])){ field.setBoost(1.5f); } document.add(field); document.add(new TextField("content", contents[i],Field.Store.NO)); writer.addDocument(document); } writer.close(); }
1.2删除文档
/** * 测试删除在合并前 * @throws Exception */ @Test public void testDeleteBeforeMerge()throws Exception{ IndexWriter writer=getWriter(); System.out.println("删除前:"+writer.numDocs()); writer.deleteDocuments(new Term("id","1")); writer.commit(); System.out.println("writer.maxDoc():"+writer.maxDoc()); System.out.println("writer.numDocs():"+writer.numDocs()); writer.close(); } /** * 测试删除在合并后 * @throws Exception */ @Test public void testDeleteAfterMerge()throws Exception{ IndexWriter writer=getWriter(); System.out.println("删除前:"+writer.numDocs()); writer.deleteDocuments(new Term("id","1")); writer.forceMergeDeletes(); // 强制删除 writer.commit(); System.out.println("writer.maxDoc():"+writer.maxDoc()); System.out.println("writer.numDocs():"+writer.numDocs()); writer.close(); }
1.3修改文档
/**
 * Replaces the documents matching term {@code id:1} with a freshly built document
 * (stored {@code id} and {@code city}, unstored {@code desc}) via {@code updateDocument}.
 *
 * @throws Exception if the writer cannot be obtained or the update fails
 */
@Test
public void testUpdate() throws Exception {
    // try-with-resources closes the writer even if the update fails.
    try (IndexWriter writer = getWriter()) {
        Document doc = new Document();
        doc.add(new StringField("id", "1", Field.Store.YES));
        doc.add(new StringField("city", "qingdao", Field.Store.YES));
        doc.add(new TextField("desc", "dsss is a city.", Field.Store.NO));
        writer.updateDocument(new Term("id", "1"), doc);
    }
}
2.搜索功能
2.1对特定项搜索
/**
 * Searches for a single exact term: looks up {@code particular} in the {@code content}
 * field, prints the total hit count and then the stored {@code fullPath} of each of the
 * top 10 matches.
 *
 * @throws Exception if the search or document retrieval fails
 */
@Test
public void testTermQuery() throws Exception {
    final String keyword = "particular";
    Query termQuery = new TermQuery(new Term("content", keyword));

    TopDocs topDocs = is.search(termQuery, 10);
    System.out.println("匹配 '" + keyword + "',总共查询到" + topDocs.totalHits + "个文档");

    ScoreDoc[] matches = topDocs.scoreDocs;
    for (int i = 0; i < matches.length; i++) {
        Document matched = is.doc(matches[i].doc);
        System.out.println(matched.get("fullPath"));
    }
}
2.2查询表达式:QueryParser
/**
 * Parses a query expression with {@code QueryParser} (default field {@code contents},
 * {@code StandardAnalyzer}), runs it for the top 100 hits, and prints the hit count
 * followed by the stored {@code fullPath} of every match.
 *
 * @throws Exception if the expression cannot be parsed or the search fails
 */
@Test
public void testQueryParser() throws Exception {
    final String expression = "particular and Areek";
    // NOTE(review): the classic QueryParser only treats upper-case AND/OR/NOT as operators;
    // confirm whether the lower-case "and" here is intentional.
    QueryParser queryParser = new QueryParser("contents", new StandardAnalyzer());
    Query parsed = queryParser.parse(expression);

    TopDocs topDocs = is.search(parsed, 100);
    System.out.println("匹配 " + expression + "查询到" + topDocs.totalHits + "个记录");

    ScoreDoc[] matches = topDocs.scoreDocs;
    for (int i = 0; i < matches.length; i++) {
        Document matched = is.doc(matches[i].doc);
        System.out.println(matched.get("fullPath"));
    }
}
2.3其他查询方式
/**
 * Term range search: matches documents whose {@code desc} term lies between
 * {@code "b"} and {@code "c"}, both bounds inclusive.
 *
 * @throws Exception if the search or document retrieval fails
 */
@Test
public void testTermRangeQuery() throws Exception {
    // BytesRef(CharSequence) encodes as UTF-8, unlike String.getBytes(), which depends on
    // the platform default charset.
    TermRangeQuery query = new TermRangeQuery("desc", new BytesRef("b"), new BytesRef("c"), true, true);
    printIdCityDesc(is.search(query, 10));
}

/**
 * Numeric range search: matches documents whose int {@code id} is between 1 and 2,
 * both bounds inclusive.
 *
 * @throws Exception if the search or document retrieval fails
 */
@Test
public void testNumericRangeQuery() throws Exception {
    NumericRangeQuery<Integer> query = NumericRangeQuery.newIntRange("id", 1, 2, true, true);
    printIdCityDesc(is.search(query, 10));
}

/**
 * Prefix search: matches documents whose {@code city} term starts with {@code "a"}.
 *
 * @throws Exception if the search or document retrieval fails
 */
@Test
public void testPrefixQuery() throws Exception {
    PrefixQuery query = new PrefixQuery(new Term("city", "a"));
    printIdCityDesc(is.search(query, 10));
}

/**
 * Multi-condition search: combines the numeric range on {@code id} and the prefix on
 * {@code city}; both clauses are {@code MUST}, so a document has to satisfy both.
 *
 * @throws Exception if the search or document retrieval fails
 */
@Test
public void testBooleanQuery() throws Exception {
    NumericRangeQuery<Integer> query1 = NumericRangeQuery.newIntRange("id", 1, 2, true, true);
    PrefixQuery query2 = new PrefixQuery(new Term("city", "a"));

    BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();
    booleanQuery.add(query1, BooleanClause.Occur.MUST);
    booleanQuery.add(query2, BooleanClause.Occur.MUST);

    printIdCityDesc(is.search(booleanQuery.build(), 10));
}

/**
 * Prints the stored {@code id}, {@code city} and {@code desc} values of every hit,
 * one value per line, in the order the hits are returned.
 */
private void printIdCityDesc(TopDocs hits) throws Exception {
    for (ScoreDoc scoreDoc : hits.scoreDocs) {
        Document doc = is.doc(scoreDoc.doc);
        System.out.println(doc.get("id"));
        System.out.println(doc.get("city"));
        System.out.println(doc.get("desc"));
    }
}
3.扩展
3.1中文分词smartcn以及检索结果高亮显示实现
public class Indexer { private Integer ids[]={1,2,3}; private String citys[]={"青岛","南京","上海"}; private String descs[]={ "青岛是一个美丽的城市。", "南京是一个有文化的城市。南京是一个文化的城市南京,简称宁,是江苏省会,地处中国东部地区,长江下游,濒江近海。全市下辖11个区,总面积6597平方公里,2013年建成区面积752.83平方公里,常住人口818.78万,其中城镇人口659.1万人。[1-4] "江南佳丽地,金陵帝王州",南京拥有着6000多年文明史、近2600年建城史和近500年的建都史,是中国四大古都之一,有"六朝古都"、"十朝都会"之称,是中华文明的重要发祥地,历史上曾数次庇佑华夏之正朔,长期是中国南方的政治、经济、文化中心,拥有厚重的文化底蕴和丰富的历史遗存。[5-7] 南京是国家重要的科教中心,自古以来就是一座崇文重教的城市,有"天下文枢"、"东南第一学"的美誉。截至2013年,南京有高等院校75所,其中211高校8所,仅次于北京上海;国家重点实验室25所、国家重点学科169个、两院院士83人,均居中国第三。[8-10] 。", "上海是一个繁华的城市。" }; private Directory directory; /** * 获取IndexWriter实例 * @return * @throws Exception */ private IndexWriter getWriter() throws Exception{ //Analyzer analyzer = new StandardAnalyzer(); SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); IndexWriter writer = new IndexWriter(directory, iwc); return writer; } /** * 生成索引 * @param indexDir * @throws Exception */ private void index(String indexDir)throws Exception{ directory = FSDirectory.open(Paths.get(indexDir)); IndexWriter writer = getWriter(); for(int i=0;i<ids.length;i++){ Document doc=new Document(); doc.add(new IntField("id", ids[i], Field.Store.YES)); doc.add(new StringField("city",citys[i],Field.Store.YES)); doc.add(new TextField("desc", descs[i], Field.Store.YES)); writer.addDocument(doc); // 添加文档 } writer.close(); } public static void main(String[] args) throws Exception { String indexDir = "D:\\AllFiles\\lucene"; new Indexer().index(indexDir); } }
public class Searcher { public static void search(String indexDir,String q)throws Exception{ Directory dir=FSDirectory.open(Paths.get(indexDir)); IndexReader reader=DirectoryReader.open(dir); IndexSearcher is=new IndexSearcher(reader); // Analyzer analyzer=new StandardAnalyzer(); // 标准分词器 SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer(); QueryParser parser=new QueryParser("desc", analyzer); Query query=parser.parse(q); long start=System.currentTimeMillis(); TopDocs hits=is.search(query, 10); long end=System.currentTimeMillis(); System.out.println("匹配 "+q+" ,总共花费"+(end-start)+"毫秒"+"查询到"+hits.totalHits+"个记录"); QueryScorer scorer=new QueryScorer(query); Fragmenter fragmenter=new SimpleSpanFragmenter(scorer); SimpleHTMLFormatter simpleHTMLFormatter=new SimpleHTMLFormatter("<b><font color='red'>","</font></b>"); Highlighter highlighter=new Highlighter(simpleHTMLFormatter, scorer); highlighter.setTextFragmenter(fragmenter); for(ScoreDoc scoreDoc:hits.scoreDocs){ Document doc=is.doc(scoreDoc.doc); System.out.println(doc.get("city")); System.out.println(doc.get("desc")); String desc=doc.get("desc"); if(desc!=null){ TokenStream tokenStream=analyzer.tokenStream("desc", new StringReader(desc)); System.out.println(highlighter.getBestFragment(tokenStream, desc)); } } reader.close(); } public static void main(String[] args) { String indexDir="D:\\AllFiles\\lucene"; String q="南京城市"; try { search(indexDir,q); } catch (Exception e) { e.printStackTrace(); } } }
将高亮结果输出到网页时,匹配的关键词会被加上HTML标签(如上例中的<b><font color='red'>…</font></b>),从而在页面上以红色粗体显示。