lucene
一,什么是lucene
lucene是一个全文检索工具,原理类似于字典
二,原理
  全文检索算法(倒排索引算法)
        描述:把数据库中的所有内容都查询出来,然后进行切分词, 把切分出的词组成索引(目录),把内容放到文档对象中,索引与文档组成索引库; 检索时,先查询到索引,索引与文档之间有联系,通过联系可以快速确定文档的位置,返回文档,这就是倒排索引算法.
        缺点:空间换时间
        优点:查询效率高,不会随着数据的大量增长而效率明显降低
        举例:字典:把所有字的偏旁部首都取出来,组成目录,目录与后面的内容有联系, 通过目录能快速地找到字的详细内容
三,代码
(1)引入依赖
<dependencies> <dependency> <groupId>mysql</groupId> <artifactId>mysql-connector-java</artifactId> <version>5.1.35</version> </dependency> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>4.12</version> </dependency> <!--lucene分词器--> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-analyzers-common</artifactId> <version>4.10.3</version> </dependency> <!--lucene核心--> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-core</artifactId> <version>4.10.3</version> </dependency> <!--查询解析对象--> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-queryparser</artifactId> <version>4.10.2</version> </dependency> <!--ik中文分词器--> <dependency> <groupId>com.janeluo</groupId> <artifactId>ikanalyzer</artifactId> <version>2012_u6</version> </dependency> </dependencies>
(2)创建索引库
package com.myjava; import com.myjava.dao.BookDao; import com.myjava.domain.Book; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.wltea.analyzer.lucene.IKAnalyzer; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; public class Test { /** * 创建索引库 * @throws Exception */ @org.junit.Test public void test() throws Exception { //1.查询所有数据 List<Book> bookList = new BookDao().findAll(); //2.把内容放到文档对象中 List<Document> docList = new ArrayList<Document>(); for (Book book : bookList) { Document document = new Document(); //对象每一列对应文档中的域 TextField idField = new TextField("id",book.getId(), Field.Store.YES); TextField nameField = new TextField("name",book.getName(), Field.Store.YES); TextField priceField = new TextField("price",String.valueOf(book.getPrice()), Field.Store.YES); TextField picField = new TextField("pic",book.getPic(), Field.Store.YES); TextField descriptionField = new TextField("description",book.getDescription(), Field.Store.YES); document.add(idField); document.add(nameField); document.add(priceField); document.add(picField); document.add(descriptionField); //把文档对象添加到文档集合 docList.add(document); } //3.创建索引库的位置 FSDirectory directory = FSDirectory.open(new File("d:/dic")); //4.创建分词器对象 Analyzer analyzer = new IKAnalyzer(); //5.把文档存入索引库(5.1获取索引输出流对象) IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_3,analyzer); //重点!!!!!!!! IndexWriter writer = new IndexWriter(directory,config); //5.2用输出流对象把文档对象写入到索引库 for (Document document : docList) { writer.addDocument(document); } writer.commit(); writer.close(); } }
(3.0)查询索引
package com.myjava; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.junit.Test; import java.io.File; import java.io.IOException; public class SearchIndexTest { @Test public void test() throws Exception { //查询索引对象 FSDirectory fsDirectory = FSDirectory.open(new File("d:/dic")); IndexReader reader = IndexReader.open(fsDirectory); //重点 IndexSearcher indexSearcher = new IndexSearcher(reader); //查询关键字对象 Analyzer analyzer = new StandardAnalyzer(); QueryParser queryParser = new QueryParser("name", analyzer); Query query = queryParser.parse("description:java"); TopDocs topDocs = indexSearcher.search(query, 5); //分数文档对象数组 ScoreDoc[] scoreDocs = topDocs.scoreDocs; for (ScoreDoc scoreDoc : scoreDocs) { int docId = scoreDoc.doc; //here Document document = indexSearcher.doc(docId); //then System.out.println("id:"+document.get("id")); System.out.println("name:"+document.get("name")); System.out.println("pic:"+document.get("pic")); System.out.println("price:"+document.get("price")); System.out.println("description:"+document.get("description")); } } }
(3.1)删除索引
package com.myjava; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.Query; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.junit.Test; import java.io.File; import java.io.IOException; public class DeleteIndexTest { @Test public void test() throws Exception { //索引输出流 FSDirectory directory = FSDirectory.open(new File("d:/dic")); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_3,new StandardAnalyzer()); //重点!!!!! IndexWriter indexWriter = new IndexWriter(directory,config); QueryParser queryParser = new QueryParser("name",new StandardAnalyzer()); Query query = queryParser.parse("id:2"); /* Term term = new Term("id","1"); */ //删除文档而不删除索引 indexWriter.deleteDocuments(query); //删除全部文档和索引 indexWriter.deleteAll(); indexWriter.commit(); indexWriter.close(); } }
(3.2)更新索引
package com.myjava; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.junit.Test; import java.io.File; import java.io.IOException; public class UpdateIndexTest { //update是删除原本索引的文档然创建新的索引和文档(注意:保留原索引) @Test public void test() throws Exception { FSDirectory fsDirectory = FSDirectory.open(new File("d:/dic")); Analyzer analyzer = new StandardAnalyzer(); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_3,analyzer); //重点!!!!!!!!! IndexWriter indexWriter = new IndexWriter(fsDirectory,config); Term term = new Term("id","2"); Document document = new Document(); TextField idField = new TextField("id","6", Field.Store.YES); TextField nameField = new TextField("name","MDZZ", Field.Store.YES); TextField priceField = new TextField("price","998", Field.Store.YES); TextField picField = new TextField("pic","daushd.jpg", Field.Store.YES); TextField descriptionField = new TextField("description","hhhhhhhhhh", Field.Store.YES); document.add(idField); document.add(nameField); document.add(priceField); document.add(picField); document.add(descriptionField); indexWriter.updateDocument(term,document); indexWriter.commit(); indexWriter.close(); } }
(4)ik中文分词器对象
Analyzer analyzer = new IKAnalyzer();
注意:使用ik中文分词器需要导入配置文件
四,域对象的选择
域对象的选择主要取决于三个问题
1.是否分词:分词的目的就是索引,分词后是否有意义,如果有意义,则分词,无意义,则不分词
2.是否索引:查询该对象时是否需要索引来查询
3.是否存储: 是否存储到索引库中, 在查询页面需要展示就需要存储,不需要展示则不需要存储
注意:如果在分词时需要区间(范围)检索,则必须分词,必须索引,必须存储,这是lucene的底层规则
    
 
                    
                
                
            
        
浙公网安备 33010602011771号