lucene4.4 索引的增删改查

由于本人的英文不好,好多的注释不准确。请谅解!

package com.lucene.test;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.Version;

/**
 * Lucene 4.4 CRUD demo: builds, queries, updates and deletes a filesystem
 * index under D:/lucene/index from the files in D:\lucene\example.
 *
 * <p>Not thread-safe: methods open short-lived writers/readers over the one
 * shared {@link Directory} and must not run concurrently.
 */
public class IndexUtil {
    private static final Logger LOGGER = Logger.getLogger(IndexUtil.class);

    // Filesystem-backed index directory, opened once in the private constructor.
    private Directory directory = null;

    // Singleton; field name kept as-is for existing callers.
    public static final IndexUtil Instance = new IndexUtil();

    private IndexUtil() {
        try {
            directory = FSDirectory.open(new File("D:/lucene/index"));
        } catch (IOException e) {
            // Route the failure to log4j instead of printStackTrace().
            LOGGER.error("Failed to open index directory D:/lucene/index", e);
        }
    }

    /**
     * Creates a fresh writer configuration for each IndexWriter.
     * Lucene 4.x forbids sharing one IndexWriterConfig instance across
     * IndexWriter instances, so the original single shared config field made
     * every writer-opening call after the first one fail.
     */
    private IndexWriterConfig newWriterConfig() {
        return new IndexWriterConfig(Version.LUCENE_44,
                new StandardAnalyzer(Version.LUCENE_44));
    }

    /**
     * Adds one document per file under D:\lucene\example with stored fields
     * "name", "id", "path" and the tokenized file content as "context".
     *
     * @throws IOException if the source directory is unreadable or the index
     *         cannot be written
     */
    public void index() throws IOException {
        File exampleDir = new File("D:\\lucene\\example");
        File[] files = exampleDir.listFiles();
        if (files == null) {
            // listFiles() returns null (not an empty array) when the path is
            // missing or not a directory; fail loudly instead of with an NPE.
            throw new IOException("Not a readable directory: " + exampleDir.getAbsolutePath());
        }
        long start = new Date().getTime();
        LOGGER.info("添加索引…………………………");
        IndexWriter writer = new IndexWriter(directory, newWriterConfig());
        try {
            int id = 0;
            for (File f : files) {
                Document document = new Document();
                document.add(new StringField("name", f.getName(), Store.YES));
                document.add(new IntField("id", id++, Store.YES));
                document.add(new StringField("path", f.getAbsolutePath(), Store.YES));
                // Decode the file explicitly as UTF-8; the original FileReader
                // silently used the platform default charset. Lucene closes
                // the Reader once the field has been consumed.
                document.add(new TextField("context",
                        new InputStreamReader(new FileInputStream(f), "UTF-8")));
                writer.addDocument(document);
            }
        } finally {
            // Guarantees the writer (and the index write lock) is released
            // even when an individual file fails to index.
            writer.close();
        }
        long end = new Date().getTime();
        LOGGER.info("添加索引完成,用时:" + (end - start) / 1000.0 + "s…………………………");
    }

    /**
     * Runs a "lucene" query against the "context" field and logs hit
     * statistics plus the stored fields of each matching document.
     *
     * @throws IOException    on index access failure
     * @throws ParseException if the query string cannot be parsed
     */
    public void search() throws IOException, ParseException {
        DirectoryReader reader = DirectoryReader.open(directory);
        try {
            QueryParser parser = new QueryParser(Version.LUCENE_44, "context",
                    new StandardAnalyzer(Version.LUCENE_44));
            Query query = parser.parse("lucene");
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs docs = searcher.search(query, 100);
            /*
             * reader.maxDoc()         — total document count, live + deleted
             * reader.numDocs()        — live documents only
             * reader.numDeletedDocs() — deletions not yet merged away
             */
            LOGGER.info("总记录:" + docs.totalHits + " 命中文档数:" + docs.scoreDocs.length
                    + " 最大的文档数maxDoc:" + reader.maxDoc() + " 删除文件数numDeletedDocs:"
                    + reader.numDeletedDocs() + " numDocs" + reader.numDocs());
            for (ScoreDoc doc : docs.scoreDocs) {
                Document document = reader.document(doc.doc);
                LOGGER.info("id:" + document.get("id") + " name:"
                        + document.get("name") + " path:" + document.get("path"));
            }
        } finally {
            // Close in finally so a failed search cannot leak the reader.
            reader.close();
        }
    }

    /**
     * Replaces the document whose numeric "id" is 2 with a new document.
     *
     * @throws IOException on index write failure
     */
    public void update() throws IOException {
        IndexWriter writer = new IndexWriter(directory, newWriterConfig());
        try {
            Document document = new Document();
            document.add(new StringField("name", "新文件", Store.YES));
            document.add(new IntField("id", 12, Store.YES));
            document.add(new StringField("path", "D:\\lucene\\example\\新文件.txt", Store.YES));
            // "id" was indexed as an IntField, whose terms are trie/prefix
            // coded — the original new Term("id", "2") could never match, so
            // the stale document stayed and a duplicate was silently added.
            // Encode the value exactly as IntField does so the delete half of
            // updateDocument() actually finds the old document.
            BytesRef idBytes = new BytesRef();
            NumericUtils.intToPrefixCoded(2, 0, idBytes);
            writer.updateDocument(new Term("id", idBytes), document);
            writer.commit();
        } finally {
            writer.close();
        }
    }

    /**
     * Deletes documents whose "name" field equals "11.txt". Deletions are only
     * marked (kept in .del files — a recycle bin of sorts) until segments are
     * merged or {@link #forceMergeDeletes()} runs.
     *
     * @throws IOException on index write failure
     */
    public void delete() throws IOException {
        IndexWriter writer = new IndexWriter(directory, newWriterConfig());
        try {
            writer.deleteDocuments(new Term("name", "11.txt"));
        } finally {
            writer.close();
        }
    }

    /**
     * Marks every document in the index as deleted (same recycle-bin
     * semantics as {@link #delete()}).
     *
     * @throws IOException on index write failure
     */
    public void deleteAll() throws IOException {
        IndexWriter writer = new IndexWriter(directory, newWriterConfig());
        try {
            writer.deleteAll();
        } finally {
            writer.close();
        }
    }

    /**
     * Physically expunges previously marked deletions — the counterpart of the
     * delete methods above ("empties the recycle bin").
     *
     * @throws IOException on index write failure
     */
    public void forceMergeDeletes() throws IOException {
        IndexWriter writer = new IndexWriter(directory, newWriterConfig());
        try {
            writer.forceMergeDeletes();
        } finally {
            writer.close();
        }
    }

    /**
     * Dumps every indexed field name, then walks all terms of the "context"
     * field printing, per term: text, document frequency, total frequency,
     * and for each posting the doc id, stored "name", positions, offsets and
     * payloads.
     *
     * @throws IOException on index access failure
     */
    public void showIndex() throws IOException {
        DirectoryReader reader = DirectoryReader.open(directory);
        try {
            // All field names present in the index.
            Fields fields = MultiFields.getFields(reader);
            if (fields == null) {
                return; // index holds no postings at all
            }
            for (String field : fields) {
                LOGGER.info(field);
            }
            // All terms of the analyzed "context" field.
            Terms terms = fields.terms("context");
            if (terms == null) {
                return; // guard: field absent — the original would NPE here
            }
            TermsEnum termsEnum = terms.iterator(null);
            BytesRef term;
            while ((term = termsEnum.next()) != null) {
                System.out.print(term.utf8ToString() + "\t");       // term text
                System.out.print(termsEnum.docFreq() + "\t");       // docs containing the term
                System.out.print(termsEnum.totalTermFreq() + "\t"); // total occurrences
                DocsAndPositionsEnum positions = termsEnum.docsAndPositions(null, null);
                // Null when the field was indexed without positions — skip it.
                if (positions == null) {
                    continue;
                }
                int docId;
                while ((docId = positions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    Document document = reader.document(docId);
                    System.out.print(docId + "\t");
                    System.out.print(document.get("name") + "\t"); // stored field value
                    int freq = positions.freq(); // occurrences of this term in this doc
                    for (int i = 0; i < freq; i++) {
                        System.out.print(positions.nextPosition() + ":");     // token position
                        System.out.print("[" + positions.startOffset() + ""); // start offset
                        System.out.print(positions.endOffset() + "],");       // end offset
                        System.out.print(positions.getPayload() + "\t");      // payload, if any
                    }
                }
                System.out.println();
            }
        } finally {
            reader.close();
        }
    }

}

 

 

posted on 2013-08-26 17:46  无与伦比的卒子  阅读(1711)  评论(0)    收藏  举报