Lucene笔记

概述

Lucene最主要就是做两件事:建立索引和进行搜索。以下介绍Lucene常用的类或者接口:

IndexWriter:lucene中用于将文档加入索引的类,同时控制索引过程中使用的一些参数;

Analyzer:分析器,主要用于分析搜索引擎遇到的各种文本。常用的有StandardAnalyzer分析器,StopAnalyzer分析器,WhitespaceAnalyzer分析器等;

Directory:索引存放的位置;lucene提供了两种索引存放的位置,一种是磁盘,一种是内存。一般情况将索引放在磁盘上;相应地lucene提供了FSDirectory和RAMDirectory两个类;

Document:文档;Document相当于一个要进行索引的单元,任何想要被索引的文件都必须先转化为Document对象才能进行索引。

Field:字段。

IndexSearcher:是lucene中最基本的检索工具,所有的检索都会用到IndexSearcher工具;

Query:查询,lucene中支持模糊查询,语义查询,短语查询,组合查询等等,如有TermQuery,BooleanQuery,RangeQuery,WildcardQuery等一些类。

QueryParser: 是一个解析用户输入的工具,可以通过扫描用户输入的字符串,生成Query对象。

Hits:在搜索完成之后,需要把搜索结果返回并显示给用户,只有这样才算是完成搜索的目的。在lucene中,搜索结果的集合是用Hits类的实例来表示的。(注:较新版本的Lucene已移除Hits,下文示例中使用TopDocs和ScoreDoc来表示搜索结果。)

具体使用

1.构建索引

1.1添加文档

/** 

     * 生成索引

     * @throws Exception 

     */ 

    @Test 

    public void index() throws Exception { 

        directory = FSDirectory.open(Paths.get("D:\\AllFiles\\lucene01")); 

        IndexWriter writer = getWriter(); 

        for (int i = 0; i < ids.length; i++) { 

            Document document = new Document(); 

            document.add(new StringField("id", ids[i], Field.Store.YES)); 

            document.add(new StringField("author", authors[i], Field.Store.YES)); 

            document.add(new StringField("position", positions[i], Field.Store.YES)); 

            //加权操作

            TextField field = new TextField("title", titles[i],Field.Store.YES); 

            if("boss".equals(positions[i])){ 

                field.setBoost(1.5f); 

            } 

            document.add(field); 

            document.add(new TextField("content", contents[i],Field.Store.NO)); 

            writer.addDocument(document); 

        } 
        writer.close(); 
    }  

1.2删除文档

/**
     * Deletes the documents matching the term {@code id:1}, commits without
     * forcing a merge, and prints {@code maxDoc()} and {@code numDocs()} so the
     * two counters can be compared.
     *
     * <p>The writer is managed by try-with-resources so that it is closed even
     * if the delete or commit throws.
     *
     * @throws Exception if the index cannot be opened or modified
     */
    @Test
    public void testDeleteBeforeMerge() throws Exception {
        try (IndexWriter writer = getWriter()) {
            System.out.println("删除前:" + writer.numDocs());
            writer.deleteDocuments(new Term("id", "1"));
            writer.commit();
            // NOTE(review): presumably maxDoc still counts the deleted document
            // here because no merge has run yet — confirm against the Lucene docs.
            System.out.println("writer.maxDoc():" + writer.maxDoc());
            System.out.println("writer.numDocs():" + writer.numDocs());
        }
    }
    /**
     * Deletes the documents matching the term {@code id:1}, calls
     * {@code forceMergeDeletes()} before committing, and prints
     * {@code maxDoc()} and {@code numDocs()} so the two counters can be compared.
     *
     * <p>The writer is managed by try-with-resources so that it is closed even
     * if the delete, merge or commit throws.
     *
     * @throws Exception if the index cannot be opened or modified
     */
    @Test
    public void testDeleteAfterMerge() throws Exception {
        try (IndexWriter writer = getWriter()) {
            System.out.println("删除前:" + writer.numDocs());
            writer.deleteDocuments(new Term("id", "1"));
            // Force a merge of segments holding deletions before committing.
            writer.forceMergeDeletes();
            writer.commit();
            System.out.println("writer.maxDoc():" + writer.maxDoc());
            System.out.println("writer.numDocs():" + writer.numDocs());
        }
    }

1.3修改文档

/**
     * Updates the index entry for the term {@code id:1} with a freshly built
     * document containing {@code id}, {@code city} and {@code desc} fields.
     *
     * <p>The writer is managed by try-with-resources so that it is closed even
     * if the update throws.
     *
     * @throws Exception if the index cannot be opened or modified
     */
    @Test
    public void testUpdate() throws Exception {
        try (IndexWriter writer = getWriter()) {
            Document doc = new Document();
            doc.add(new StringField("id", "1", Field.Store.YES));
            doc.add(new StringField("city", "qingdao", Field.Store.YES));
            doc.add(new TextField("desc", "dsss is a city.", Field.Store.NO));
            writer.updateDocument(new Term("id", "1"), doc);
        }
    }

2.搜索功能

2.1对特定项搜索

/**
     * Runs a {@code TermQuery} for the term "particular" in the "content" field
     * and prints the hit count followed by the "fullPath" value of each of the
     * top 10 hits.
     *
     * @throws Exception if the search or document retrieval fails
     */
    @Test
    public void testTermQuery() throws Exception {
        final String keyword = "particular";
        Query termQuery = new TermQuery(new Term("content", keyword));

        TopDocs topDocs = is.search(termQuery, 10);
        System.out.println("匹配 '" + keyword + "',总共查询到" + topDocs.totalHits + "个文档");

        ScoreDoc[] matches = topDocs.scoreDocs;
        for (int i = 0; i < matches.length; i++) {
            Document hit = is.doc(matches[i].doc);
            System.out.println(hit.get("fullPath"));
        }
    }

2.2查询表达式:QueryParser

/**
     * Parses the expression "particular and Areek" against the "contents" field
     * with a {@code QueryParser} backed by a {@code StandardAnalyzer}, then
     * prints the hit count and the "fullPath" value of each of the top 100 hits.
     *
     * @throws Exception if parsing, searching or document retrieval fails
     */
    @Test
    public void testQueryParser() throws Exception {
        final String expression = "particular and Areek";

        Analyzer standardAnalyzer = new StandardAnalyzer();
        // NOTE(review): a lowercase "and" is presumably not treated as the AND
        // operator by QueryParser — confirm the intended semantics.
        Query parsed = new QueryParser("contents", standardAnalyzer).parse(expression);

        TopDocs topDocs = is.search(parsed, 100);
        System.out.println("匹配 " + expression + "查询到" + topDocs.totalHits + "个记录");

        ScoreDoc[] matches = topDocs.scoreDocs;
        for (int i = 0; i < matches.length; i++) {
            Document hit = is.doc(matches[i].doc);
            System.out.println(hit.get("fullPath"));
        }
    }

2.3其他查询方式

/**
     * Searches the "desc" field for terms in the inclusive range ["b", "c"] and
     * prints the "id", "city" and "desc" values of each of the top 10 hits.
     *
     * <p>The range is built with {@code TermRangeQuery.newStringRange} instead of
     * {@code String.getBytes()}, so the bounds no longer depend on the platform
     * default charset.
     *
     * @throws Exception if the search or document retrieval fails
     */
    @Test
    public void testTermRangeQuery() throws Exception {
        TermRangeQuery query = TermRangeQuery.newStringRange("desc", "b", "c", true, true);
        TopDocs hits = is.search(query, 10);
        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            Document doc = is.doc(scoreDoc.doc);
            System.out.println(doc.get("id"));
            System.out.println(doc.get("city"));
            System.out.println(doc.get("desc"));
        }
    }
    /**
     * Searches for documents whose integer "id" lies in the inclusive range
     * [1, 2] and prints the "id", "city" and "desc" values of each of the top
     * 10 hits.
     *
     * @throws Exception if the search or document retrieval fails
     */
    @Test
    public void testNumericRangeQuery() throws Exception {
        NumericRangeQuery<Integer> idRange =
                NumericRangeQuery.newIntRange("id", 1, 2, true, true);
        ScoreDoc[] matches = is.search(idRange, 10).scoreDocs;

        final String[] printedFields = {"id", "city", "desc"};
        for (int i = 0; i < matches.length; i++) {
            Document hit = is.doc(matches[i].doc);
            for (String fieldName : printedFields) {
                System.out.println(hit.get(fieldName));
            }
        }
    }
    /**
     * Searches for documents whose "city" term starts with "a" and prints the
     * "id", "city" and "desc" values of each of the top 10 hits.
     *
     * @throws Exception if the search or document retrieval fails
     */
    @Test
    public void testPrefixQuery() throws Exception {
        Term cityPrefix = new Term("city", "a");
        TopDocs topDocs = is.search(new PrefixQuery(cityPrefix), 10);

        int index = 0;
        while (index < topDocs.scoreDocs.length) {
            Document hit = is.doc(topDocs.scoreDocs[index].doc);
            System.out.println(hit.get("id"));
            System.out.println(hit.get("city"));
            System.out.println(hit.get("desc"));
            index++;
        }
    }
    /**
     * Combines two clauses that must both match — integer "id" in the inclusive
     * range [1, 2] and a "city" term starting with "a" — and prints the "id",
     * "city" and "desc" values of each of the top 10 hits.
     *
     * @throws Exception if the search or document retrieval fails
     */
    @Test
    public void testBooleanQuery() throws Exception {
        NumericRangeQuery<Integer> idRange =
                NumericRangeQuery.newIntRange("id", 1, 2, true, true);
        PrefixQuery cityPrefix = new PrefixQuery(new Term("city", "a"));

        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        builder.add(idRange, BooleanClause.Occur.MUST);
        builder.add(cityPrefix, BooleanClause.Occur.MUST);
        Query combined = builder.build();

        ScoreDoc[] matches = is.search(combined, 10).scoreDocs;
        final String[] printedFields = {"id", "city", "desc"};
        for (ScoreDoc match : matches) {
            Document hit = is.doc(match.doc);
            for (String fieldName : printedFields) {
                System.out.println(hit.get(fieldName));
            }
        }
    }

3.扩展

3.1中文分词smartcn以及检索结果高亮显示实现

public class Indexer { 

    private Integer ids[]={1,2,3}; 

    private String citys[]={"青岛","南京","上海"}; 

    private String descs[]={ 

            "青岛是一个美丽的城市。", 

            "南京是一个有文化的城市。南京是一个文化的城市南京,简称宁,是江苏省会,地处中国东部地区,长江下游,濒江近海。全市下辖11个区,总面积6597平方公里,2013年建成区面积752.83平方公里,常住人口818.78万,其中城镇人口659.1万人。[1-4] "江南佳丽地,金陵帝王州",南京拥有着6000多年文明史、近2600年建城史和近500年的建都史,是中国四大古都之一,有"六朝古都"、"十朝都会"之称,是中华文明的重要发祥地,历史上曾数次庇佑华夏之正朔,长期是中国南方的政治、经济、文化中心,拥有厚重的文化底蕴和丰富的历史遗存。[5-7] 南京是国家重要的科教中心,自古以来就是一座崇文重教的城市,有"天下文枢"、"东南第一学"的美誉。截至2013年,南京有高等院校75所,其中211高校8所,仅次于北京上海;国家重点实验室25所、国家重点学科169个、两院院士83人,均居中国第三。[8-10] 。", 

            "上海是一个繁华的城市。" 

    }; 

    private Directory directory; 

    /** 

     * 获取IndexWriter实例

     * @return 

     * @throws Exception 

     */ 

    private IndexWriter getWriter() throws Exception{ 

        //Analyzer analyzer = new StandardAnalyzer(); 

        SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(); 

        IndexWriterConfig iwc = new IndexWriterConfig(analyzer); 

        IndexWriter writer = new IndexWriter(directory, iwc); 

        return writer; 
    } 

    /** 

     * 生成索引

     * @param indexDir 

     * @throws Exception 

     */ 

    private void index(String indexDir)throws Exception{ 

        directory = FSDirectory.open(Paths.get(indexDir)); 

        IndexWriter writer = getWriter(); 

        for(int i=0;i<ids.length;i++){ 

            Document doc=new Document(); 

            doc.add(new IntField("id", ids[i], Field.Store.YES)); 

            doc.add(new StringField("city",citys[i],Field.Store.YES)); 

            doc.add(new TextField("desc", descs[i], Field.Store.YES)); 

            writer.addDocument(doc); // 添加文档

        } 

        writer.close(); 
    } 
    public static void main(String[] args) throws Exception { 

        String indexDir = "D:\\AllFiles\\lucene"; 

        new Indexer().index(indexDir); 
    } 
} 
/**
 * Searches the "desc" field of an index with a {@code SmartChineseAnalyzer}
 * and prints each hit together with a highlighted fragment.
 */
public class Searcher {

    /**
     * Parses {@code q} against the "desc" field, runs the query for the top 10
     * hits, and prints for each hit its "city", its "desc", and the best
     * highlighted fragment of "desc" wrapped in
     * {@code <b><font color='red'>…</font></b>}.
     *
     * <p>The directory, reader and analyzer are managed by try-with-resources
     * so they are released even when parsing, searching or highlighting throws.
     *
     * @param indexDir file-system path of the index directory
     * @param q        query expression to parse
     * @throws Exception if the index cannot be opened, the query cannot be
     *                   parsed, or searching/highlighting fails
     */
    public static void search(String indexDir, String q) throws Exception {
        try (Directory dir = FSDirectory.open(Paths.get(indexDir));
             IndexReader reader = DirectoryReader.open(dir);
             SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer()) {

            IndexSearcher is = new IndexSearcher(reader);
            QueryParser parser = new QueryParser("desc", analyzer);
            Query query = parser.parse(q);

            // Time only the search call itself.
            long start = System.currentTimeMillis();
            TopDocs hits = is.search(query, 10);
            long end = System.currentTimeMillis();
            System.out.println("匹配 "+q+" ,总共花费"+(end-start)+"毫秒"+"查询到"+hits.totalHits+"个记录");

            // Highlighter setup: score fragments against the query and wrap
            // matched terms in the HTML tags given to the formatter.
            QueryScorer scorer = new QueryScorer(query);
            Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
            SimpleHTMLFormatter simpleHTMLFormatter =
                    new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>");
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, scorer);
            highlighter.setTextFragmenter(fragmenter);

            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = is.doc(scoreDoc.doc);
                String desc = doc.get("desc");
                System.out.println(doc.get("city"));
                System.out.println(desc);
                if (desc != null) {
                    TokenStream tokenStream = analyzer.tokenStream("desc", new StringReader(desc));
                    System.out.println(highlighter.getBestFragment(tokenStream, desc));
                }
            }
        }
    }

    /**
     * Runs a sample search for "南京城市" against the index under
     * {@code D:\AllFiles\lucene}, printing the stack trace on failure.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String indexDir = "D:\\AllFiles\\lucene";
        String q = "南京城市";
        try {
            search(indexDir, q);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

输出到网页时加上html标签

posted @ 2016-03-21 15:03  Aaron殇醉月  阅读(140)  评论(0)  编辑  收藏  举报