[web] lucene 搜索入门

来一段关于lucene的维基百科介绍:

Lucene是一套用于全文检索和搜寻的开放源码程式库,由Apache软件基金会支持和提供。Lucene提供了一个简单却强大的应用程序界面,能够做全文索引和搜寻;在Java开发环境里,Lucene是一个成熟的免费开放源代码工具。就其本身而论,Lucene是目前以及最近几年最受欢迎的免费Java资讯检索程式库。

这个demo是基于本地文件的搜索。搞清楚原理之后,其他扩展就不太困难了。此处demo使用Apache Lucene 5.3.0(写作时刚刚下载的最新版本)。

1.由本地文件生成索引文件:

package com.wa.xwolf.eap.search;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.*;
import java.nio.file.Paths;

/**
 * Demo based on Lucene 5.3.0: builds a Lucene index from a single file or from the
 * files directly inside a directory.
 *
 * <p>All state lives in static fields that the constructor (re)initialises, so an
 * instance must be constructed before {@link #writeIndex(String)} is called, and only
 * one index can be written at a time.
 *
 * <p>NOTE: this class has the same simple name as Lucene's own
 * {@code org.apache.lucene.index.IndexWriter}, which is why the Lucene type is always
 * referenced by its fully-qualified name below.
 *
 * Created by Administrator on 2015/9/10.
 */
public class IndexWriter {

    private static org.apache.lucene.index.IndexWriter indexWriter;
    private static Directory directory;
    private static IndexWriterConfig indexWriterConfig;
    private static Analyzer analyzer;

    /**
     * Opens (and, because of {@code OpenMode.CREATE}, overwrites) the index stored in the
     * given directory and prepares the shared writer.
     *
     * @param souceFile path of the directory in which the index files are stored
     * @throws IllegalStateException if the index directory or writer cannot be opened;
     *         failing here avoids an unrelated NullPointerException on first use
     */
    public IndexWriter(String souceFile) {
        try {
            // Directory that holds the index files.
            directory = FSDirectory.open(Paths.get(souceFile));
            // Standard analyzer used to tokenize the indexed text.
            analyzer = new StandardAnalyzer();
            indexWriterConfig = new IndexWriterConfig(analyzer);
            // CREATE replaces any index that already exists in the directory.
            indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

            indexWriter = new org.apache.lucene.index.IndexWriter(directory, indexWriterConfig);
        } catch (IOException e) {
            throw new IllegalStateException("Cannot open index directory: " + souceFile, e);
        }
    }

    /**
     * Converts a file into a {@link Document} with three fields: {@code file_path}
     * (stored, absolute path), {@code size} (stored, length in bytes) and {@code text}
     * (the file content supplied through a reader, not stored).
     *
     * <p>The content is decoded with the platform default charset.
     *
     * @param file the file to index
     * @return the document, or {@code null} if {@code file} is null, is not a regular
     *         file, or cannot be opened
     */
    public static Document getDocuments(File file) {
        if (file == null || !file.isFile()) {
            // Directories and missing files cannot be opened as a stream.
            return null;
        }
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(file);
            // NOTE(review): no charset is given, so the platform default is used - confirm
            // this matches the encoding of the files being indexed.
            Reader reader = new BufferedReader(new InputStreamReader(fis));

            Document document = new Document();
            // Field arguments: name, value, whether the value is stored in the index.
            document.add(new StringField("file_path", file.getAbsolutePath(), Field.Store.YES));
            document.add(new LongField("size", file.length(), Field.Store.YES));
            // The reader is handed to the field unread, so the stream has to stay open
            // until the document is indexed; it is deliberately not closed on success.
            // NOTE(review): presumably Lucene closes the reader after consuming it - confirm.
            document.add(new TextField("text", reader));
            return document;
        } catch (Exception e) {
            e.printStackTrace();
            if (fis != null) {
                try {
                    fis.close();
                } catch (IOException ignored) {
                    // Nothing more can be done; the original failure was already reported.
                }
            }
        }
        return null;
    }

    /**
     * Indexes the given path. If it is a directory, every regular file directly inside it
     * is indexed (no recursion); entries that cannot be turned into a document, such as
     * sub-directories, are skipped. Otherwise the path itself is indexed as a single file.
     *
     * @param srcFile path of the file or directory to index
     * @throws IOException if the directory cannot be listed, the single file cannot be
     *         read, or the index writer fails
     * @throws IllegalStateException if no {@code IndexWriter} instance has been constructed yet
     */
    public static void writeIndex(String srcFile) throws IOException {
        if (indexWriter == null) {
            throw new IllegalStateException("IndexWriter has not been initialised");
        }
        File dirFile = new File(srcFile);
        if (dirFile.isDirectory()) {
            String[] files = dirFile.list();
            if (files == null) {
                // File.list() returns null when the directory cannot be read.
                throw new IOException("Cannot list directory: " + dirFile);
            }
            // Walk the entries of the directory.
            for (String name : files) {
                File file = new File(dirFile, name);
                Document doc = getDocuments(file);
                if (doc == null) {
                    System.out.println(dirFile + "目录跳过无法索引的文件 : " + file);
                    continue;
                }
                System.out.println(dirFile+"目录正在创建索引 : " + file + "");
                indexWriter.addDocument(doc);
            }
        } else {
            Document doc = getDocuments(dirFile);
            if (doc == null) {
                throw new FileNotFoundException("Cannot read file to index: " + dirFile);
            }
            System.out.println("文件正在创建索引 : " + dirFile + "");
            indexWriter.addDocument(doc);
        }
    }

    /**
     * Builds the index at {@code Constants.LUCENE_INDEX_STORE} from the files at
     * {@code Constants.LUCENE_FILE_STORE}.
     */
    public static void main(String[] args) throws Exception {
        // The constructor initialises the static writer used by writeIndex.
        new IndexWriter(Constants.LUCENE_INDEX_STORE);
        try {
            writeIndex(Constants.LUCENE_FILE_STORE);
        } finally {
            // The writer must be closed, otherwise the segments_* file is not created;
            // doing it in finally also releases the writer when indexing fails.
            try {
                indexWriter.close();
            } finally {
                directory.close();
            }
        }
    }

}

2. 根据生成的索引进行搜索

package com.wa.xwolf.eap.search;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.flexible.standard.QueryParserUtil;
import org.apache.lucene.search.*;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Paths;

/**
 * Demo that searches the Lucene index produced by the indexing demo and prints the
 * {@code file_path} of every hit.
 *
 * <p>All state lives in static fields that the constructor (re)initialises, so an
 * instance must be constructed before a search is run.
 *
 * Created by Administrator on 2015/9/10.
 */
public class Search {
    private static IndexSearcher indexSearcher;

    private static Query query;

    private static IndexReader indexReader;

    /**
     * Opens the index stored in the given directory and prepares the shared searcher.
     *
     * @param path path of the directory that contains the index files
     * @throws IllegalStateException if the index cannot be opened; failing here avoids an
     *         unrelated NullPointerException on the first search
     */
    public Search(String path) {
        try {
            indexReader = DirectoryReader.open(FSDirectory.open(Paths.get(path)));
            indexSearcher = new IndexSearcher(indexReader);
        } catch (IOException e) {
            throw new IllegalStateException("Cannot open index at: " + path, e);
        }
    }

    /**
     * Runs a multi-field query and returns at most 100 hits.
     *
     * <p>{@code keys[i]} is searched in {@code fields[i]}, so both arrays must have the
     * same length.
     *
     * @param keys   the keywords to search for
     * @param fields the field each keyword is searched in
     * @return the top hits, or {@code null} if the query could not be built or executed
     *         (the failure is printed to standard error)
     */
    private static TopDocs getResult(String[] keys, String[] fields) {
        // The analyzer is only needed while the query is parsed, so it is closed afterwards.
        try (Analyzer analyzer = new StandardAnalyzer()) {
            // Single-field alternative using the classic parser (field, analyzer):
            //   QueryParser queryParser = new QueryParser(fields[0], analyzer);
            //   query = queryParser.parse(keys[0]);
            // The utility method below accepts the parallel arrays directly.
            query = QueryParserUtil.parse(keys, fields, analyzer);

            return indexSearcher.search(query, 100);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Searches the index at {@code Constants.LUCENE_INDEX_STORE} for a fixed pair of
     * keywords and prints the path of each matching file.
     */
    public static void main(String[] args) {
        // Author's note: searching for Chinese text does not work with this setup.

        // The constructor initialises the static reader and searcher.
        new Search(Constants.LUCENE_INDEX_STORE);
        try {
            // Keywords.
            String[] keys = {"lucene","F"};
            // Fields to query; keys[i] is matched against fields[i].
            // NOTE(review): file_path is indexed as a StringField (one untokenized term),
            // so the analyzed keyword "F" presumably never matches it - confirm.
            String[] fields = {"text","file_path"};

            TopDocs docs = getResult(keys, fields);
            if (docs == null) {
                // getResult already printed the cause; avoid dereferencing null below.
                System.out.println("查询失败...");
                return;
            }

            ScoreDoc[] scoreDocs = docs.scoreDocs;
            if (scoreDocs.length == 0) {
                System.out.println("没有找到内容...");
                return;
            }

            for (int i = 0; i < scoreDocs.length; i++) {
                try {
                    Document doc = indexSearcher.doc(scoreDocs[i].doc);
                    // Ordinals shown to the user start at 1, not 0.
                    System.out.print("这是第" + (i + 1) + "个检索到的结果,文件名为");
                    System.out.println(doc.get("file_path"));
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        } finally {
            // Release the files held by the index reader.
            try {
                indexReader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}

3.用到的两个文件目录:

package com.wa.xwolf.eap.search;

/**
 * Filesystem locations shared by the indexing and search demos.
 *
 * Created by Administrator on 2015/9/10.
 */
public class Constants {

    /** Directory in which the Lucene index files are stored. */
    public static final String LUCENE_INDEX_STORE = "F:\\lucene\\index";

    /** File or directory whose contents are indexed. */
    public static final String LUCENE_FILE_STORE = "F:\\lucene\\file";
}

 

中文分词不支持:本例使用的StandardAnalyzer不做中文词语级的分词,而且读取文件时没有指定字符编码,中文内容的检索效果不理想,待深入学习...

遇到的几个异常:

1.调用QueryParserUtil.parse(keys, fields, analyzer)时传入两个数组,一个key,一个field,两个数组长度要相等,否则会抛出IllegalArgumentException。

参考资料:

Lucene:基于Java的全文检索引擎简介

http://www.chedong.com/tech/lucene.html#learn

 

Apache: Lucene Project
http://jakarta.apache.org/lucene/
Lucene开发/用户邮件列表归档
Lucene-dev@jakarta.apache.org
Lucene-user@jakarta.apache.org

The Lucene search engine: Powerful, flexible, and free
http://www.javaworld.com/javaworld/jw-09-2000/jw-0915-Lucene_p.html

Lucene Tutorial
http://www.darksleep.com/puff/lucene/lucene.html

Notes on distributed searching with Lucene
http://home.clara.net/markharwood/lucene/

中文语言的切分词
http://www.google.com/search?sourceid=navclient&hl=zh-CN&q=chinese+word+segment

搜索引擎工具介绍
http://searchtools.com/

Lucene作者Cutting的几篇论文和专利
http://lucene.sourceforge.net/publications.html 

Lucene的.NET实现:dotLucene
http://sourceforge.net/projects/dotlucene/

Lucene作者Cutting的另外一个项目:基于Java的搜索引擎Nutch
http://www.nutch.org/   http://sourceforge.net/projects/nutch/

关于基于词表和N-Gram的切分词比较
http://china.nikkeibp.co.jp/cgi-bin/china/news/int/int200302100112.html

2005-01-08 Cutting在Pisa大学做的关于Lucene的讲座:非常详细的Lucene架构解说

posted @ 2015-09-10 12:58  snow__wolf  阅读(182)  评论(0)    收藏  举报