Lucene索引,查询及高亮显示
本文通过示例代码展示了如何采用 TermQuery 和 FuzzyLikeThisQuery 进行索引查询,并展示了如何在查询结果中高亮显示匹配的关键字(这在实际使用中是一个很有用的功能)。
1 public class Indexer 2 { 3 4 /** 5 * @param args 6 * @throws IOException 7 * @throws LockObtainFailedException 8 * @throws CorruptIndexException 9 * @throws InvalidTokenOffsetsException 10 */ 11 public static void main(String[] args) throws CorruptIndexException, 12 LockObtainFailedException, IOException, InvalidTokenOffsetsException 13 { 14 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36); 15 16 IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer); 17 config.setOpenMode(OpenMode.CREATE_OR_APPEND); 18 19 Directory indexDir = new RAMDirectory(); 20 21 /** 22 * 1. Indexing... 23 */ 24 IndexWriter writer = new IndexWriter(indexDir, config); 25 26 File docs = new File("D:\\files"); 27 28 if (docs.exists() && docs.isDirectory()) 29 { 30 File[] files = docs.listFiles(); 31 32 if (files != null && files.length > 0) 33 { 34 for (File file : files) 35 { 36 // •Field.Index.NO 不索引,如果存储选项为YES,一般用于只保存不搜索的字段; 37 // •Field.Index.ANALYZED 分词建索引; 38 // •Field.Index.NOT_ANALYZED 建索引但不分词,字段虽然被索引但是没有任何分析器对字段进行分析,只能整词精确搜索,可保存唯一性字段(例如ID)并用于更新索引 39 Document doc = new Document(); 40 doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES, Field.Index.NO)); 41 doc.add(new Field("id", file.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED)); 42 doc.add(new Field("name", file.getName(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); 43 44 doc.add(new Field("size", file.getTotalSpace() + "b", Field.Store.YES, Field.Index.NO)); 45 46 writer.addDocument(doc); 47 } 48 49 writer.commit(); 50 } 51 } 52 53 writer.close(true); 54 55 56 /** 57 * 2. List indexed files ... 
58 */ 59 IndexReader reader = IndexReader.open(indexDir); 60 IndexSearcher searcher = new IndexSearcher(reader); 61 62 System.out.println("Max doc:" + searcher.maxDoc()); 63 System.out.println("List files below...."); 64 65 Document doc = null; 66 for (int i = 0; i < searcher.maxDoc(); i++) 67 { 68 doc = searcher.doc(i); 69 System.out.println("Doc " + i + " Name: " + doc.get("name") + ", Path: " + doc.get("path") + ", Size: " + doc.get("size")); 70 } 71 System.out.println("==================================================================================="); 72 73 74 /** 75 * 3.Searching... 76 */ 77 String id = "we"; 78 // 此处若改为Query queryId = new TermQuery(new Term("id", id));则无法搜索出结果,除非id = "We are young.txt"; 79 Query queryId = new TermQuery(new Term("name", id)); 80 TopDocs hitsForId = searcher.search(queryId, null, 100); 81 if (hitsForId != null && hitsForId.totalHits > 0) 82 { 83 System.out.println("Searched " + hitsForId.totalHits + " docs for id " + id + "..."); 84 85 for (int j = 0; j < hitsForId.scoreDocs.length; j++) 86 { 87 System.out.println("Score doc for id " + j + " is " + hitsForId.scoreDocs[j].toString()); 88 } 89 } 90 System.out.println("==================================================================================="); 91 92 String keyword = "we are yy"; 93 FuzzyLikeThisQuery fuzzyLikeThisQuery = new FuzzyLikeThisQuery(100, analyzer); 94 fuzzyLikeThisQuery.addTerms(keyword, "name", 0.8F, 0); 95 96 // FuzzyLikeThisQuery不是lucene core自带的查询类,属于contrib的query模块 97 // 默认情况下QueryScorer的私有成员WeightedSpanTermExtractor无法识别它,getBestFragment将返回null 98 // 因此此处调用rewrite生成一个WeightedSpanTermExtractor可以识别的query对象,用于匹配内容关键字 99 Query query = fuzzyLikeThisQuery.rewrite(reader); 100 101 // 高亮显示关键字,如果内容中本来就有<span></span>,可能导致显示错乱 102 SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span>", "</span>"); 103 Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query)); 104 105 TopDocs hits = 
searcher.search(fuzzyLikeThisQuery, null, 100); 106 107 if (hits != null && hits.totalHits > 0) 108 { 109 System.out.println("Searched " + hits.totalHits + "docs for keyword " + keyword + "..."); 110 111 ScoreDoc[] sDocs = hits.scoreDocs; 112 113 Document docMatched = null; 114 for (int j = 0; j < sDocs.length; j++) 115 { 116 System.out.println("Score doc " + j + " is " + sDocs[j].toString()); 117 118 docMatched = searcher.doc(sDocs[j].doc); 119 120 TokenStream tokenStream = analyzer.tokenStream("name", new StringReader(docMatched.get("name"))); 121 String str = highlighter.getBestFragment(tokenStream, docMatched.get("name")); 122 123 System.out.println("Score doc " + j + " hightlight to: " + str); 124 125 } 126 } 127 128 reader.close(); 129 indexDir.close(); 130 } 131 }
输出如下:
Max doc:13
List files below....
Doc 0 Name: ab.txt, Path: D:\files\ab.txt, Size: 104857595904b
Doc 1 Name: abc.txt, Path: D:\files\abc.txt, Size: 104857595904b
Doc 2 Name: M_1.txt, Path: D:\files\M_1.txt, Size: 104857595904b
Doc 3 Name: M_11.txt, Path: D:\files\M_11.txt, Size: 104857595904b
Doc 4 Name: We are young.txt, Path: D:\files\We are young.txt, Size: 104857595904b
Doc 5 Name: 什么是微博.txt, Path: D:\files\什么是微博.txt, Size: 104857595904b
Doc 6 Name: 喝水不忘挖井人.txt, Path: D:\files\喝水不忘挖井人.txt, Size: 104857595904b
Doc 7 Name: 天苍苍野茫茫.txt, Path: D:\files\天苍苍野茫茫.txt, Size: 104857595904b
Doc 8 Name: 怎么使用lucene.txt, Path: D:\files\怎么使用lucene.txt, Size: 104857595904b
Doc 9 Name: 神马是一种马吗.txt, Path: D:\files\神马是一种马吗.txt, Size: 104857595904b
Doc 10 Name: 苍井.txt, Path: D:\files\苍井.txt, Size: 104857595904b
Doc 11 Name: 苍白 - 副本.txt, Path: D:\files\苍白 - 副本.txt, Size: 104857595904b
Doc 12 Name: 苍白.txt, Path: D:\files\苍白.txt, Size: 104857595904b
===================================================================================
Searched 1 docs for id we...
Score doc for id 0 is doc=4 score=1.7948763 shardIndex=-1
===================================================================================
Searched 1docs for keyword we are yy...
Score doc 0 is doc=4 score=0.625 shardIndex=-1
Score doc 0 hightlight to: <span>We</span> are young.txt