代码改变世界

lucene代码阅读指南、测试范例

2013-03-18 14:08  zhenjing  阅读(2088)  评论(1)  编辑  收藏  举报

阅读指南

Lucene 原理与代码分析完整版  -- 力荐

Lucene介绍及源码剖析: http://javenstudio.org/blog/annotated-lucene  -- 核心IndexWriter

下载:Annotated+Lucene+.pdf: http://ishare.iask.sina.com.cn/f/24103589.html

阅读步骤:

1、了解检索的基本原理和概念

2、了解lucene的基本概念

3、熟悉lucene的索引文件格式 -- 关键

4、熟悉lucene的索引流程:具体代码的类层次较多,且引入不必要的设计模式致使代码阅读相对困难。基本思路:controller + model 封装索引链,实现多线程并发处理(数据不共享)。

5、熟悉lucene的搜索流程

6、了解lucene搜索语法解析器 和 熟悉分词

 

推荐资料深入剖析lucene的源码,非常有价值。光看文档,不够形象,大体看过文档后,建议结合源码理解文档内容。代码能让读者有大体的基本概念,但文档对源码细节的解释容易让读者“只见枝叶不见森林”,理解困难。根据文档作者提供的大体思路,结合实际源码,读起来更容易。

测试

测试对于了解lucene的工作原理、代码执行流程极有帮助,是阅读代码的重要辅助手段。

IndexerExample.java

/*
 * Compile : javac -classpath .:../lucene-core-2.9.1.jar:../../ChineseSegmenter/chineseSegmenter.jar  IndexerExample.java
 * Exec    : java  -classpath .:../lucene-core-2.9.1.jar:../../ChineseSegmenter/chineseSegmenter.jar  IndexerExample
 *
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;


/**
 * Minimal Lucene 2.9 indexing demo.
 *
 * <p>Builds a one-document index under {@code /tmp/testindex} from
 * {@code /tmp/test.txt}. Run without arguments to index with
 * {@link StandardAnalyzer}; run with any argument to index the "name" field
 * with {@link ChineseAnalyzer} (all other fields use
 * {@link WhitespaceAnalyzer}).
 */
public class IndexerExample {

    /** Directory that receives the index; the writer is opened with create == true. */
    private static final String INDEX_DIR = "/tmp/testindex";

    /** Sample file whose path, modification time and contents are indexed. */
    private static final String SOURCE_FILE = "/tmp/test.txt";

    /** English sample text stored in the "fieldname" field of every document. */
    private static final String SAMPLE_TEXT = "This is the text to be indexed.";

    /**
     * Indexes the sample document with the standard analyzer; the "name" field
     * carries the same English text as "fieldname".
     *
     * @throws Exception if the sample file cannot be read or indexing fails
     */
    private static void EnExample() throws Exception {
        indexSampleFile(new StandardAnalyzer(), SAMPLE_TEXT);
    }

    /**
     * Indexes the sample document with a per-field analyzer: the "name" field
     * (a Chinese product title) goes through {@link ChineseAnalyzer}, every
     * other field through {@link WhitespaceAnalyzer}.
     *
     * @throws Exception if the sample file cannot be read or indexing fails
     */
    private static void CnExample() throws Exception {
        PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer());
        wrapper.addAnalyzer("name", new ChineseAnalyzer());
        indexSampleFile(wrapper, "2013春装新款女气质修身风衣大翻领双层大摆长款外套 系腰带");
    }

    /**
     * Shared indexing routine for both examples: creates a fresh index in
     * {@link #INDEX_DIR} containing a single document describing
     * {@link #SOURCE_FILE}.
     *
     * @param analyzer analyzer handed to the {@link IndexWriter}
     * @param name     value stored and tokenized in the "name" field
     * @throws Exception if the sample file cannot be opened or indexing fails
     */
    private static void indexSampleFile(Analyzer analyzer, String name) throws Exception {
        File f = new File(SOURCE_FILE);

        // Open the input before the writer: a missing sample file then fails
        // here, before the writer (create == true) has touched the index.
        // Note that FileReader expects the file to be in the system's default
        // encoding. If that's not the case searching for special characters will fail.
        FileReader contents = new FileReader(f);
        try {
            // Store the index on disk
            Directory directory = FSDirectory.getDirectory(INDEX_DIR);
            IndexWriter iwriter = new IndexWriter(directory, analyzer, true);
            try {
                iwriter.setMaxFieldLength(25000);

                // make a new, empty document
                Document doc = new Document();

                // "path": indexed (i.e. searchable) but not tokenized into words.
                doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED));

                doc.add(new Field("fieldname", SAMPLE_TEXT, Field.Store.YES, Field.Index.TOKENIZED));
                doc.add(new Field("name", name, Field.Store.YES, Field.Index.TOKENIZED));

                // "modified": last-modified time at minute resolution, indexed
                // but not tokenized.
                doc.add(new Field("modified",
                            DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
                            Field.Store.YES, Field.Index.UN_TOKENIZED));

                // "contents": supplied as a Reader, so the text of the file is
                // tokenized and indexed, but not stored.
                doc.add(new Field("contents", contents));

                iwriter.addDocument(doc);
                iwriter.optimize();
            } finally {
                // Always close the writer, even when indexing fails, so the
                // index is not left open.
                iwriter.close();
            }
        } finally {
            // Make sure the file handle is released even if addDocument never
            // consumed the reader.
            contents.close();
        }
    }

    /**
     * Entry point.
     *
     * @param args no arguments selects the English example; one or more
     *             arguments selects the Chinese example
     * @throws Exception if indexing fails
     */
    public static void main(String[] args) throws Exception {
        System.out.println("Start test: ");

        if (args.length > 0) {
            CnExample();
        } else {
            EnExample();
        }

        System.out.println("Index dir: " + INDEX_DIR);
    }
}

SearcherExample.java

/*
 * Compile : javac -classpath .:../lucene-core-2.9.1.jar:../../ChineseSegmenter/chineseSegmenter.jar  SearcherExample.java
 * Exec    : java  -classpath .:../lucene-core-2.9.1.jar:../../ChineseSegmenter/chineseSegmenter.jar  SearcherExample
 * 
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.queryParser.QueryParser;


/**
 * Minimal Lucene 2.9 search demo: parses a query against the "name" field of
 * an existing index and prints the stored "name" value of every hit.
 */
public class SearcherExample {

    /**
     * Entry point.
     *
     * @param args {@code <index dir> <query> [cn]}; the presence of a third
     *             argument switches query analysis to {@link ChineseAnalyzer}
     * @throws Exception if the arguments are invalid, the index directory is
     *                   missing, or the search fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            // Name this class (not Lucene's Searcher) in the usage line and keep
            // a space between the class name and the first placeholder.
            throw new Exception("Usage: java " + SearcherExample.class.getName()
                    + " <index dir> <query> [cn]");
        }
        File indexDir = new File(args[0]);
        String q = args[1];
        boolean bCn = args.length > 2;

        if (!indexDir.exists() || !indexDir.isDirectory()) {
            throw new Exception(indexDir +
                    " does not exist or is not a directory.");
        }
        search(indexDir, q, bCn);
    }

    /**
     * Runs {@code q} against the index in {@code indexDir} and prints the
     * results. The parsed query and the hits go to standard output; the
     * summary line (hit count and elapsed time) goes to standard error.
     *
     * @param indexDir directory holding the Lucene index
     * @param q        query string in QueryParser syntax; bare terms are
     *                 searched in the "name" field
     * @param bCn      {@code true} to analyze the query with
     *                 {@link ChineseAnalyzer}, {@code false} for
     *                 {@link StandardAnalyzer}
     * @throws Exception if the index cannot be opened or the query cannot be
     *                   parsed or executed
     */
    public static void search(File indexDir, String q, boolean bCn)
        throws Exception {
        Directory fsDir = FSDirectory.getDirectory(indexDir, false);
        IndexSearcher is = new IndexSearcher(fsDir);
        try {
            Analyzer analyzer;
            if (bCn) {
                analyzer = new ChineseAnalyzer();
            } else {
                analyzer = new StandardAnalyzer();
            }

            QueryParser parser = new QueryParser("name", analyzer);
            Query query = parser.parse(q);

            System.out.println("Query: " + query.toString());
            long start = new Date().getTime();
            Hits hits = is.search(query);
            long end = new Date().getTime();

            System.err.println("Found " + hits.length() +
                    " document(s) (in " + (end - start) +
                    " milliseconds) that matched query '" +
                    q + "'");

            for (int i = 0; i < hits.length(); i++) {
                Document doc = hits.doc(i);
                System.out.println("HIT " + i + " :" + doc.get("name"));
            }
        } finally {
            // Release the searcher even when parsing or searching throws.
            is.close();
        }
    }
}

中文分词可采用lucene自带的库(但效果不好),也可以自行封装;自行封装的核心就是封装分词Tokenizer。

package org.apache.lucene.analysis.cn;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

public class SnippetTermTokenizer extends Tokenizer {
        // Text buffer; not referenced by the methods shown here
        // (presumably filled by readContent() -- confirm against the omitted code).
        private StringBuffer buffer = new StringBuffer();
        // Buffered wrapper around the Reader passed to the constructor; closed by close().
        private BufferedReader inputBuffer;
        private JNISelecter selecter;     // core class for Chinese word segmentation
        // Not used by the methods shown here; presumably populated by segment() -- confirm.
        private List<Token> tokenList = null;
        // Not used by the methods shown here; purpose defined in the omitted segmentation code.
        private List<String> phraseTokenList = null;
        // Iterator that next() drains; null until the input has been segmented
        // (assigned outside the code shown here).
        private Iterator<Token> tokenIter = null;

        /**
         * Creates a tokenizer over the given input.
         *
         * @param reader source of the text to tokenize; wrapped in a
         *               BufferedReader with a 2048-character buffer
         * @param s      segmenter instance kept for use by this tokenizer
         */
        public SnippetTermTokenizer(Reader reader, JNISelecter s) {
                this.selecter = s;
                this.inputBuffer = new BufferedReader(reader, 2048);
        }

        /**
         * Returns the next token, or null when there are no more.
         *
         * The first call (while no token iterator exists yet) reads the input
         * via readContent() and runs segment(); later calls only drain the
         * iterator.
         *
         * @return the next token, or null if segmentation failed or the
         *         iterator is exhausted
         * @throws IOException declared for reading the underlying input
         */
        public Token next() throws IOException {
                if (tokenIter == null) {
                        // Nothing segmented yet: load the content and segment it.
                        readContent();
                        if (!segment()) {
                                return null;
                        }
                        // Segmentation succeeded; hand out the first token.
                        return tokenIter.next();
                }
                // Input already segmented: keep draining, null once exhausted.
                return tokenIter.hasNext() ? tokenIter.next() : null;
        }

        /**
         * Closes the buffered reader that wraps the input.
         *
         * @throws IOException if closing the reader fails
         */
        public void close() throws IOException {
                this.inputBuffer.close();
        }
       
        // 分词相关略