转:lucene index 包分析

关键字:   lucene index    
Index包分析

转载自http://www.gamvan.com/club/clubPage.jsp?ccStyle=0&tID=10633&ccID=37

Lucene索引中有几个最基础的概念,索引(index),文档(document),域(field),和项(或者译为语词term)

其中Index为Document的序列   Document为Field的序列  Field为Term的序列

Term就是一个子串.存在于不同的Field中的同一个子串被认为是不同的Term.因此Term实际上是用一对子串表示的,
第一个子串为Field的name,第二个为Field中的子串.既然Term这么重要,我们先来认识一下Term.

认识Term最好的方法就是看其源码表示.
/**
 * Illustrative reconstruction of Lucene's Term class: an immutable-ish
 * pair of (field name, text). Two Terms are equal only when BOTH the
 * field name and the text match, so the same substring appearing in two
 * different fields yields two distinct Terms.
 */
public final class Term implements Comparable, java.io.Serializable {
  String field;
  String text;

  /** Public constructor: a term is a (field, text) pair. */
  public Term(String fld, String txt) { this(fld, txt, true); }

  // Package-private constructor; the boolean historically controlled
  // String interning of the field name -- NOTE(review): reconstructed,
  // confirm against the actual Lucene source.
  Term(String fld, String txt, boolean intern) {
    field = fld;
    text = txt;
  }

  public final String field() { return field; }
  public final String text() { return text; }

  // overrides equals(): equal iff both field and text are equal
  public final boolean equals(Object o) {
    if (!(o instanceof Term)) return false;
    Term other = (Term) o;
    return field.equals(other.field) && text.equals(other.text);
  }

  // overrides hashCode(): consistent with equals()
  public final int hashCode() {
    return field.hashCode() + text.hashCode();
  }

  public int compareTo(Object other) {
    return compareTo((Term) other);
  }

  /** Orders terms first by field name, then by text. */
  public final int compareTo(Term other) {
    if (field.equals(other.field))
      return text.compareTo(other.text);
    return field.compareTo(other.field);
  }

  /** Package-private mutator used internally to reuse a Term instance. */
  final void set(String fld, String txt) {
    field = fld;
    text = txt;
  }

  public final String toString() {
    return field + ":" + text;
  }

  private void readObject(java.io.ObjectInputStream in)
      throws java.io.IOException, ClassNotFoundException {
    in.defaultReadObject();
  }
}
从代码中我们可以大体看出Term其实是一个二元组<field, text>。

下边来编制一个程序来结束本章的讨论。
程序代码:
package org.apache.lucene.index;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.store.*;
import org.apache.lucene.document.*;
import org.apache.lucene.demo.*;
import org.apache.lucene.search.*;
import java.io.*;
/**在使用此程序时,会尽量用到Lucene Index中的每一个类,尽量将其展示个大家
*使用的Index包中类有
*document.riter(提供给用用户使用的为IndexWriter)
*FieldInfo(和FieldInfos)
* SegmentDocs(扩展自TermDocs)
*SegmentReader(扩展自IndexReader,提供给用户使用的是IndexReader)
*SegmentMerger
*segmentTermEnum(扩展自TermEnum)
*segmentTermPositions(扩展自TermPositions)
*segmentTermVector(扩展自TermFreqVector)
*/

public class TestIndexpackage{
//用于将document.入索引
public static void indexdocument.String segment,String fileName)
throws Exception
{
//第二个参数用来控制,如果获得不了目录是否创建
Directory directory = FSDirectory.getDirectory("testIndexpackage",false);
Analyzer analyzer = new SimpleAnalyzer();
//第三个参数为每一个Field最多拥有的Token个数
document.riter writer = new document.riter(directory,analyzer,Similarity.getDefault(),1000);
File file = new File(fileName);
//由于使用Filedocument.file包装成了Docuement,会在document.创建三个field(path,modified,contents)
document.nbsp;doc = Filedocument.document.file);
writer.adddocument.segment,doc);
directory.close();
}
//将多个segment进行合并
public static void merge(String segment1,String segment2,String segmentMerged)throws Exception {
Directory directory = FSDirectory.getDirectory("testIndexpackage",false);
SegmentReader segmentReader1=new SegmentReader(new SegmentInfo(segment1,1,directory));
SegmentReader segmentReader2=new SegmentReader(new SegmentInfo(segment2,1,directory));
//第三个参数为是否创建.cfs文件
SegmentMerger segmentMerger =new SegmentMerger(directory,segmentMerged,false);
segmentMerger.add(segmentReader1);
segmentMerger.add(segmentReader2);
segmentMerger.merge();
segmentMerger.closeReaders();
directory.close();
}
//将segment即Index的子索引的所有内容展示给你看。
public static void printSegment(String segment) throws Exception
{
Directory directory =FSDirectory.getDirectory("testIndexpackage",false);
SegmentReader segmentReader = new SegmentReader(new SegmentInfo(segment,1,directory));
//display document.
for(int i=0;i<segmentReader.numDocs();i++)
System.out.println(segmentReader.document.i));
TermEnum termEnum = segmentReader.terms();//此处实际为SegmentTermEnum
//display term and term positions,termDocs
while(termEnum.next()){
System.out.print(termEnum.term().toString2());
System.out.println(" document.requency=" + termEnum.docFreq());
TermPositions termPositions= segmentReader.termPositions(termEnum.term());
int i=0;
while(termPositions.next()) {
System.out.println((i++)+"->"+termPositions);
}
TermDocs termDocs=segmentReader.termDocs(termEnum.term());//实际为segmentDocs
while (termDocs.next())
{
System.out.println((i++)+"->"+termDocs);
}
}
//display field info
FieldInfos fieldInfos= segmentReader.fieldInfos;
FieldInfo pathFieldInfo = fieldInfos.fieldInfo("path");
FieldInfo modifiedFieldInfo = fieldInfos.fieldInfo("modified");
FieldInfo contentsFieldInfo =fieldInfos.fieldInfo("contents");
System.out.println(pathFieldInfo);
System.out.println(modifiedFieldInfo);
System.out.println(contentsFieldInfo);
//display TermFreqVector
for(int i=0;i<segmentReader.numDocs();i++){
//对contents的token之后的term存于了TermFreqVector
TermFreqVector termFreqVector=segmentReader.getTermFreqVector(i,"contents");
System.out.println(termFreqVector);
}
}

public static void main(String [] args){
try{
Directory directory = FSDirectory.getDirectory("testIndexpackage",true);
directory.close();
indexdocument."segmentOne","e:\\lucene\\test.txt");
//printSegment("segmentOne");
indexdocument."segmentTwo","e:\\lucene\\test2.txt");
// printSegment("segmentTwo");
merge("segmentOne","segmentTwo","merge");
printSegment("merge");
}catch(Exception e){
System.out.println("caught a "+e.getCause()+"\n with message:"+e.getMessage());
e.printStackTrace();
}
}
}

看看其结果如下:
程序代码:
Document<Text<path:e:\lucene\test.txt> Keyword<modified:0eg4e221c>>
Document<Text<path:e:\lucene\test2.txt> Keyword<modified:0eg4ee8b4>>
<Term:FieldName,text>=<contents,china> DocumentFrequency=1
0-><doc,TermFrequency,Pos>:< doc=0, TermFrequency=1 Pos=2>
1-><docNumber,freq>=<0,1>
<Term:FieldName,text>=<contents,i> DocumentFrequency=2
0-><doc,TermFrequency,Pos>:< doc=0, TermFrequency=2 Pos=0,3>
1-><doc,TermFrequency,Pos>:< doc=1, TermFrequency=1 Pos=0>
2-><docNumber,freq>=<0,2>
3-><docNumber,freq>=<1,1>
<Term:FieldName,text>=<contents,love> DocumentFrequency=2
0-><doc,TermFrequency,Pos>:< doc=0, TermFrequency=2 Pos=1,4>
1-><doc,TermFrequency,Pos>:< doc=1, TermFrequency=1 Pos=1>
2-><docNumber,freq>=<0,2>
3-><docNumber,freq>=<1,1>
<Term:FieldName,text>=<contents,nankai> DocumentFrequency=1
0-><doc,TermFrequency,Pos>:< doc=1, TermFrequency=1 Pos=2>
1-><docNumber,freq>=<1,1>
<Term:FieldName,text>=<contents,tianjin> DocumentFrequency=1
0-><doc,TermFrequency,Pos>:< doc=0, TermFrequency=1 Pos=5>
1-><docNumber,freq>=<0,1>
<Term:FieldName,text>=<modified,0eg4e221c> DocumentFrequency=1
0-><doc,TermFrequency,Pos>:< doc=0, TermFrequency=1 Pos=0>
1-><docNumber,freq>=<0,1>
<Term:FieldName,text>=<modified,0eg4ee8b4> DocumentFrequency=1
0-><doc,TermFrequency,Pos>:< doc=1, TermFrequency=1 Pos=0>
1-><docNumber,freq>=<1,1>
<Term:FieldName,text>=<path,e> DocumentFrequency=2
0-><doc,TermFrequency,Pos>:< doc=0, TermFrequency=1 Pos=0>
1-><doc,TermFrequency,Pos>:< doc=1, TermFrequency=1 Pos=0>
2-><docNumber,freq>=<0,1>
3-><docNumber,freq>=<1,1>
<Term:FieldName,text>=<path,lucene> DocumentFrequency=2
0-><doc,TermFrequency,Pos>:< doc=0, TermFrequency=1 Pos=1>
1-><doc,TermFrequency,Pos>:< doc=1, TermFrequency=1 Pos=1>
2-><docNumber,freq>=<0,1>
3-><docNumber,freq>=<1,1>
<Term:FieldName,text>=<path,test> DocumentFrequency=2
0-><doc,TermFrequency,Pos>:< doc=0, TermFrequency=1 Pos=2>
1-><doc,TermFrequency,Pos>:< doc=1, TermFrequency=1 Pos=2>
2-><docNumber,freq>=<0,1>
3-><docNumber,freq>=<1,1>
<Term:FieldName,text>=<path,txt> DocumentFrequency=2
0-><doc,TermFrequency,Pos>:< doc=0, TermFrequency=1 Pos=3>
1-><doc,TermFrequency,Pos>:< doc=1, TermFrequency=1 Pos=3>
2-><docNumber,freq>=<0,1>
3-><docNumber,freq>=<1,1>

<fieldName,isIndexed,fieldNumber,storeTermVector>=<path,true,3,false>
<fieldName,isIndexed,fieldNumber,storeTermVector>=<modified,true,2,false>
<fieldName,isIndexed,fieldNumber,storeTermVector>=<contents,true,1,true>
{contents: china/1, i/2, love/2, tianjin/1}
{contents: i/1, love/1, nankai/1}

认真审视其结果,你就会更加明白Lucene底层的索引结构如何。

posted @ 2007-09-21 13:38  harry.guo  阅读(582)  评论(0编辑  收藏  举报