X-man

导航

lucene 建立索引的不同方式

1.创建一个简单的索引:

package lia.meetlucene;

import java.io.File;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

/**
 * Minimal Lucene indexing example: builds an index that contains a single
 * document with a "title" field and a "content" field.
 */
public class BasicIndexer {
    /**
     * Creates the index and adds one document to it.
     *
     * @param args ignored
     * @throws java.io.IOException if the index directory cannot be opened or
     *         the index cannot be written
     */
    public static void main(String[] args) throws java.io.IOException {
        String indexDir = "C:/Users/Administrator/Desktop/xdj";

        Directory dir = FSDirectory.open(new File(indexDir));
        try {
            // The boolean "create" argument is true: a new index is started
            // in the directory (see the IndexWriter constructor documentation).
            IndexWriter writer = new IndexWriter(dir,
                    new StandardAnalyzer(Version.LUCENE_30),
                    true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            try {
                Document doc = new Document();

                // Both fields are stored and analyzed (tokenized) for search.
                doc.add(new Field("title", "i love china", Field.Store.YES,
                        Field.Index.ANALYZED));
                doc.add(new Field("content", "i love you, my mother land! ",
                        Field.Store.YES, Field.Index.ANALYZED));

                writer.addDocument(doc);
            } finally {
                // Always release the writer (and its write lock), even when
                // building or adding the document fails.
                writer.close();
            }
        } finally {
            dir.close();
        }

        System.out.println("Index Created!");
    }
}
View Code

2.创建一个复杂点的索引:

package lia.meetlucene;

import java.io.File;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

/**
 * Lucene indexing example that adds three documents, each with a "title",
 * a "content" and a "time" field, to a newly created index.
 */
public class BasicIndexer {
    /**
     * Creates the index and adds the three sample documents.
     *
     * @param args ignored
     * @throws java.io.IOException if the index directory cannot be opened or
     *         the index cannot be written
     */
    public static void main(String[] args) throws java.io.IOException {
        String indexDir = "C:/Users/Administrator/Desktop/xdj";

        Directory dir = FSDirectory.open(new File(indexDir));
        try {
            // The boolean "create" argument is true: a new index is started
            // in the directory (see the IndexWriter constructor documentation).
            IndexWriter writer = new IndexWriter(dir,
                    new StandardAnalyzer(Version.LUCENE_30),
                    true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            try {
                writer.addDocument(newDocument("i love china",
                        "i love you, my mother land! ", "2007-05-31"));
                writer.addDocument(newDocument("i love mom",
                        "i love you, my mother! ", "2007-05-31"));
                writer.addDocument(newDocument("i love xiaoyue",
                        "i love you, my wife! ", "2007-05-31"));
            } finally {
                // Always release the writer, even when adding a document fails.
                writer.close();
            }
        } finally {
            dir.close();
        }

        System.out.println("Index Three Created!");
    }

    /**
     * Builds one sample document.
     *
     * @param title   value of the "title" field (stored, indexed without analysis)
     * @param content value of the "content" field (stored, indexed without analysis)
     * @param time    value of the "time" field (stored only, not indexed)
     * @return the populated document
     */
    private static Document newDocument(String title, String content,
            String time) {
        Document doc = new Document();
        doc.add(new Field("title", title, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field("content", content, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field("time", time, Field.Store.YES, Field.Index.NO));
        return doc;
    }
}
View Code

3.为文件创建索引

package lia.meetlucene;

import java.io.File;
import java.io.FileReader;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

/**
 * Lucene indexing example that indexes two files from disk. Each document
 * gets a "name" field (file name), a "content" field (read from the file)
 * and a "path" field (file path).
 */
public class BasicIndexer {
    /**
     * Creates the index and adds one document per sample file.
     *
     * @param args ignored
     * @throws java.io.IOException if the index directory or a sample file
     *         cannot be opened, or the index cannot be written
     */
    public static void main(String[] args) throws java.io.IOException {
        String indexDir = "C:/Users/Administrator/Desktop/xdj";

        Directory dir = FSDirectory.open(new File(indexDir));
        try {
            // The boolean "create" argument is true: a new index is started
            // in the directory (see the IndexWriter constructor documentation).
            IndexWriter writer = new IndexWriter(dir,
                    new StandardAnalyzer(Version.LUCENE_30),
                    true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            try {
                indexFile(writer, new File(
                        "E:/xdj/tengxun/a_______________mm/2014-02-19 06.59.53.xml"));
                indexFile(writer, new File(
                        "E:/xdj/tengxun/a_______________mm/2014-02-04 11.43.01.xml"));
            } finally {
                // Always release the writer, even when indexing a file fails.
                writer.close();
            }
        } finally {
            dir.close();
        }

        System.out.println("File Index Created!");
    }

    /**
     * Adds a single file to the index as one document.
     *
     * @param writer the open index writer to add the document to
     * @param f      the file to index
     * @throws java.io.IOException if the file cannot be opened or the
     *         document cannot be added
     */
    private static void indexFile(IndexWriter writer, File f)
            throws java.io.IOException {
        FileReader reader = new FileReader(f);
        try {
            Document doc = new Document();
            // File name: stored and indexed as-is (not tokenized).
            doc.add(new Field("name", f.getName(), Field.Store.YES,
                    Field.Index.NOT_ANALYZED));
            // File content: supplied through a Reader.
            doc.add(new Field("content", reader));
            // File path: stored only, not indexed.
            doc.add(new Field("path", f.getPath(), Field.Store.YES,
                    Field.Index.NO));
            writer.addDocument(doc);
        } finally {
            // Make sure the file handle is released even if addDocument fails;
            // closing a Reader that is already closed has no effect.
            reader.close();
        }
    }
}
View Code

4.为某个文件夹下的所有文件创建索引

package lia.meetlucene;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.FileReader;

// From chapter 1

/**
 * This code was originally written for Erik's Lucene intro java.net article
 */
public class Indexer {

    /** Index directory used when no command-line arguments are given. */
    private static final String DEFAULT_INDEX_DIR =
            "C:/Users/Administrator/Desktop/xdj/suoyin";

    /** Data directory used when no command-line arguments are given. */
    private static final String DEFAULT_DATA_DIR =
            "C:/Users/Administrator/Desktop/xdj/data";

    /**
     * Indexes every readable .txt file directly inside the data directory and
     * prints how many documents were indexed and how long it took.
     *
     * @param args either empty (the default directories are used) or exactly
     *             {@code <index dir> <data dir>}
     * @throws IllegalArgumentException if any other number of arguments is given
     * @throws Exception if indexing fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 0 && args.length != 2) {
            throw new IllegalArgumentException("Usage: java "
                    + Indexer.class.getName() + " <index dir> <data dir>");
        }
        boolean useArgs = args.length == 2;
        String indexDir = useArgs ? args[0] : DEFAULT_INDEX_DIR; // 1
        String dataDir = useArgs ? args[1] : DEFAULT_DATA_DIR; // 2

        long start = System.currentTimeMillis();
        Indexer indexer = new Indexer(indexDir);
        int numIndexed;
        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());
        } finally {
            indexer.close();
        }
        long end = System.currentTimeMillis();

        System.out.println("Indexing " + numIndexed + " files took "
                + (end - start) + " milliseconds");
    }

    private IndexWriter writer;

    /**
     * Opens an index writer on the given directory.
     *
     * @param indexDir path of the directory that holds the index
     * @throws IOException if the directory or the writer cannot be opened
     */
    public Indexer(String indexDir) throws IOException {
        Directory dir = FSDirectory.open(new File(indexDir));

        writer = new IndexWriter(dir, // 3 create the Lucene IndexWriter
                new SmartChineseAnalyzer(Version.LUCENE_20),// 3
                true, // 3
                IndexWriter.MaxFieldLength.UNLIMITED); // 3
    }

    /**
     * Closes the underlying index writer.
     *
     * @throws IOException if closing the writer fails
     */
    public void close() throws IOException {
        writer.close(); // 4 close the Lucene IndexWriter
    }

    /**
     * Indexes the files directly inside {@code dataDir} (no recursion) that
     * are regular, visible, readable and accepted by {@code filter}.
     *
     * @param dataDir directory whose files are indexed
     * @param filter  selects the files to index; {@code null} accepts all
     * @return the number of documents reported by the writer
     * @throws IOException if {@code dataDir} cannot be listed (for example it
     *         does not exist or is not a directory)
     * @throws Exception if reading or indexing a file fails
     */
    public int index(String dataDir, FileFilter filter) throws Exception {

        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null instead of throwing when the path is
            // not a directory or an I/O error occurs.
            throw new IOException("Cannot list files in data directory: "
                    + dataDir);
        }

        for (File f : files) {
            if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead()
                    && (filter == null || filter.accept(f))) {
                indexFile(f);
            }
        }

        return writer.numDocs(); // 5 return the number of indexed documents
    }

    /** Accepts only files whose name ends with ".txt" (case-insensitive). */
    private static class TextFilesFilter implements FileFilter {
        public boolean accept(File path) {
            return path.getName().toLowerCase() // 6 index .txt files only
                    .endsWith(".txt"); // 6
        }
    }

    /**
     * Builds the Lucene document for one file.
     *
     * @param f the file to describe
     * @return a document with "contents", "filename" and "fullpath" fields
     * @throws Exception if the file cannot be opened or its path resolved
     */
    protected Document getDocument(File f) throws Exception {
        Document doc = new Document();
        doc.add(new Field("contents", new FileReader(f))); // 7 index file content
        doc.add(new Field("filename", f.getName(), // 8 index file name
                Field.Store.YES, Field.Index.NOT_ANALYZED));// 8
        doc.add(new Field("fullpath", f.getCanonicalPath(), // 9 index full path
                Field.Store.YES, Field.Index.NOT_ANALYZED));// 9
        return doc;
    }

    // Field.Store: whether the original value is stored in the index (YES / NO;
    // the author's note also mentions a "compress" option that stores the
    // value compressed).
    // Field.Index: how the value is indexed - ANALYZED tokenizes the value and
    // indexes the tokens, NOT_ANALYZED indexes the value as-is without
    // tokenizing, NO does not index the value at all.

    /**
     * Logs the file being processed and adds its document to the index.
     *
     * @param f the file to index
     * @throws Exception if the document cannot be built or added
     */
    private void indexFile(File f) throws Exception {
        System.out.println("Indexing " + f.getCanonicalPath());
        Document doc = getDocument(f);
        writer.addDocument(doc); // 10 add the document to the Lucene index
    }
}

/*
 * #1 Create index in this directory #2 Index *.txt files from this directory #3
 * Create Lucene IndexWriter #4 Close IndexWriter #5 Return number of documents
 * indexed #6 Index .txt files only, using FileFilter #7 Index file content #8
 * Index file name #9 Index file full path #10 Add document to Lucene index
 */
View Code

5.<Lucene in action>第二版索引demo

package lia.meetlucene;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.FileReader;

// From chapter 1

/**
 * This code was originally written for Erik's Lucene intro java.net article
 */
public class Indexer {

    /** Index directory used when no command-line arguments are given. */
    private static final String DEFAULT_INDEX_DIR =
            "C:/Users/Administrator/Desktop/xdj/suoyin";

    /** Data directory used when no command-line arguments are given. */
    private static final String DEFAULT_DATA_DIR =
            "C:/Users/Administrator/Desktop/xdj/tengxun/A__Vae";

    /**
     * Indexes every readable .xml file directly inside the data directory and
     * prints how many documents were indexed and how long it took.
     *
     * @param args either empty (the default directories are used) or exactly
     *             {@code <index dir> <data dir>}
     * @throws IllegalArgumentException if any other number of arguments is given
     * @throws Exception if indexing fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 0 && args.length != 2) {
            throw new IllegalArgumentException("Usage: java "
                    + Indexer.class.getName() + " <index dir> <data dir>");
        }
        boolean useArgs = args.length == 2;
        String indexDir = useArgs ? args[0] : DEFAULT_INDEX_DIR; // 1
        String dataDir = useArgs ? args[1] : DEFAULT_DATA_DIR; // 2

        long start = System.currentTimeMillis();
        Indexer indexer = new Indexer(indexDir);
        int numIndexed;
        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());
        } finally {
            indexer.close();
        }
        long end = System.currentTimeMillis();

        System.out.println("Indexing " + numIndexed + " files took "
                + (end - start) + " milliseconds");
    }

    private IndexWriter writer;

    /**
     * Opens an index writer on the given directory.
     *
     * @param indexDir path of the directory that holds the index
     * @throws IOException if the directory or the writer cannot be opened
     */
    public Indexer(String indexDir) throws IOException {
        Directory dir = FSDirectory.open(new File(indexDir));

        writer = new IndexWriter(dir, // 3 create the Lucene IndexWriter
                new SmartChineseAnalyzer(Version.LUCENE_20),// 3
                true, // 3
                IndexWriter.MaxFieldLength.UNLIMITED); // 3
    }

    /**
     * Closes the underlying index writer.
     *
     * @throws IOException if closing the writer fails
     */
    public void close() throws IOException {
        writer.close(); // 4 close the Lucene IndexWriter
    }

    /**
     * Indexes the files directly inside {@code dataDir} (no recursion) that
     * are regular, visible, readable and accepted by {@code filter}.
     *
     * @param dataDir directory whose files are indexed
     * @param filter  selects the files to index; {@code null} accepts all
     * @return the number of documents reported by the writer
     * @throws IOException if {@code dataDir} cannot be listed (for example it
     *         does not exist or is not a directory)
     * @throws Exception if reading or indexing a file fails
     */
    public int index(String dataDir, FileFilter filter) throws Exception {

        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null instead of throwing when the path is
            // not a directory or an I/O error occurs.
            throw new IOException("Cannot list files in data directory: "
                    + dataDir);
        }

        for (File f : files) {
            if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead()
                    && (filter == null || filter.accept(f))) {
                indexFile(f);
            }
        }

        return writer.numDocs(); // 5 return the number of indexed documents
    }

    /** Accepts only files whose name ends with ".xml" (case-insensitive). */
    private static class TextFilesFilter implements FileFilter {
        public boolean accept(File path) {
            return path.getName().toLowerCase() // 6 index .xml files only
                    .endsWith(".xml"); // 6
        }
    }

    /**
     * Builds the Lucene document for one file.
     *
     * @param f the file to describe
     * @return a document with "contents", "filename" and "fullpath" fields
     * @throws Exception if the file cannot be opened or its path resolved
     */
    protected Document getDocument(File f) throws Exception {
        Document doc = new Document();
        doc.add(new Field("contents", new FileReader(f))); // 7 index file content
        doc.add(new Field("filename", f.getName(), // 8 index file name
                Field.Store.YES, Field.Index.NOT_ANALYZED));// 8
        doc.add(new Field("fullpath", f.getCanonicalPath(), // 9 index full path
                Field.Store.YES, Field.Index.NOT_ANALYZED));// 9
        return doc;
    }

    // Field.Store: whether the original value is stored in the index (YES / NO;
    // the author's note also mentions a "compress" option that stores the
    // value compressed).
    // Field.Index: how the value is indexed - ANALYZED tokenizes the value and
    // indexes the tokens, NOT_ANALYZED indexes the value as-is without
    // tokenizing, NO does not index the value at all.

    /**
     * Logs the file being processed and adds its document to the index.
     *
     * @param f the file to index
     * @throws Exception if the document cannot be built or added
     */
    private void indexFile(File f) throws Exception {
        System.out.println("Indexing " + f.getCanonicalPath());
        Document doc = getDocument(f);
        writer.addDocument(doc); // 10 add the document to the Lucene index
    }
}

/*
 * #1 Create index in this directory #2 Index *.xml files from this directory #3
 * Create Lucene IndexWriter #4 Close IndexWriter #5 Return number of documents
 * indexed #6 Index .xml files only, using FileFilter #7 Index file content #8
 * Index file name #9 Index file full path #10 Add document to Lucene index
 */
View Code

 

posted on 2015-04-16 20:44  雨钝风轻  阅读(251)  评论(0)  编辑  收藏  举报