Lucene5学习之Group分组统计
Group即分组,类似SQL里的group by功能,Lucene中分组是通过内置的几种Collector结果集收集器实现的,有关group的结果集收集器都在org.apache.lucene.search.grouping包及其子包下, 
包含group关键字的Collector都是有关Group分组的结果收集器,如果你只需要统计如下这些分组信息:
/** 所有组的数量 */
int totalGroupCount = 0;
/** 所有满足条件的记录数 */
int totalHitCount = 0;
/** 所有组内的满足条件的记录数(通常该值与totalHitCount是一致的) */
int totalGroupedHitCount = -1;
则直接使用FirstPassGroupingCollector收集器即可,如果你需要统计每个分组内部的命中总数以及命中索引文档的评分等信息,则需要使用SecondPassGroupingCollector,为了提高第二次查询的效率,可以使用CacheCollector来缓存第一次查询结果,这样第二次就直接从缓存中获取第一次查询结果,为了统计总的分组数量,你可能还需要使用AllGroupsCollector结果收集器。
下面是一个Group分组使用示例,具体详细说明请看代码里面的注释:
package com.yida.framework.lucene5.group;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.queries.function.valuesource.BytesRefFieldSource;
import org.apache.lucene.search.CachingCollector;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.grouping.AbstractAllGroupsCollector;
import org.apache.lucene.search.grouping.AbstractFirstPassGroupingCollector;
import org.apache.lucene.search.grouping.AbstractSecondPassGroupingCollector;
import org.apache.lucene.search.grouping.GroupDocs;
import org.apache.lucene.search.grouping.SearchGroup;
import org.apache.lucene.search.grouping.TopGroups;
import org.apache.lucene.search.grouping.function.FunctionAllGroupsCollector;
import org.apache.lucene.search.grouping.function.FunctionFirstPassGroupingCollector;
import org.apache.lucene.search.grouping.function.FunctionSecondPassGroupingCollector;
import org.apache.lucene.search.grouping.term.TermAllGroupsCollector;
import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector;
import org.apache.lucene.search.grouping.term.TermSecondPassGroupingCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.mutable.MutableValue;
import org.apache.lucene.util.mutable.MutableValueStr;
import com.yida.framework.lucene5.util.Tools;
/**
 * Demonstrates Lucene 5 grouping (the SQL GROUP BY analogue) using the
 * two-pass grouping collectors from org.apache.lucene.search.grouping:
 * a first pass collects the top groups, an optional cached second pass
 * collects the top documents inside each group, and TermAllGroupsCollector
 * counts the total number of distinct groups.
 */
public class GroupTest {

    /** Index directory. */
    private static final String indexDir = "C:/group-index";

    /** Analyzer used for both indexing and searching. */
    private static Analyzer analyzer = new StandardAnalyzer();

    /** Name of the field documents are grouped by. */
    private static String groupField = "author";

    public static void main(String[] args) throws Exception {
        // Build the test index first (uncomment on the first run).
        // createIndex();
        Directory directory = FSDirectory.open(Paths.get(indexDir));
        IndexReader reader = DirectoryReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);
        Query query = new TermQuery(new Term("content", "random"));
        /* Sort order applied inside each group. */
        Sort groupSort = Sort.RELEVANCE;
        groupBy(searcher, query, groupSort);
        // groupSearch(searcher);
    }

    /**
     * Runs a full two-pass grouped search and prints the group statistics:
     * pass 1 (cached) collects the top groups, pass 2 collects the top
     * documents inside each of those groups.
     *
     * @param searcher  searcher over the test index
     * @param query     query whose hits are grouped
     * @param groupSort sort order used to rank the groups
     * @throws IOException on index access failure
     */
    public static void groupBy(IndexSearcher searcher, Query query, Sort groupSort) throws IOException {
        /* Number of top groups to keep from the first pass. */
        int topNGroups = 10;
        /* Offset of the first group to return (group-level paging). */
        int groupOffset = 0;
        /* Whether SearchGroup.sortValues should be filled. */
        boolean fillFields = true;
        /* groupSort orders the groups themselves, docSort orders documents
           inside a group; they are often, but not necessarily, the same. */
        Sort docSort = groupSort;
        /* Offset of the first document inside each group (in-group paging). */
        int docOffset = 0;
        /* Number of documents to return per group. */
        int docsPerGroup = 2;
        /* Whether the total number of distinct groups should be computed. */
        boolean requiredTotalGroupCount = true;
        /* Whether scores should be cached for the second pass. */
        boolean cacheScores = true;
        // Use the shared groupField constant instead of a duplicated literal.
        TermFirstPassGroupingCollector c1 = new TermFirstPassGroupingCollector(groupField, groupSort, groupOffset + topNGroups);
        // Cache budget for the first-pass results: 16 MB.
        double maxCacheRAMMB = 16.0;
        /* Wrap the first-pass collector in a CachingCollector so the second
           pass can replay the cached hits instead of re-running the query. */
        CachingCollector cachedCollector = CachingCollector.create(c1, cacheScores, maxCacheRAMMB);
        // First pass: collect the top groups.
        searcher.search(query, cachedCollector);
        /* The first pass yields only the group values (plus sort values);
           the documents inside each group require the second pass. */
        Collection<SearchGroup<BytesRef>> topGroups = c1.getTopGroups(groupOffset, fillFields);
        if (topGroups == null) {
            System.out.println("No groups matched ");
            return;
        }
        Collector secondPassCollector = null;
        // Whether per-document scores inside each group are computed.
        boolean getScores = true;
        // Whether the maximum score per group is computed.
        boolean getMaxScores = true;
        // Subclass TermSecondPassGroupingCollector if scores need adjusting.
        TermSecondPassGroupingCollector c2 = new TermSecondPassGroupingCollector(groupField, topGroups, groupSort, docSort, docOffset + docsPerGroup, getScores, getMaxScores, fillFields);
        // TermAllGroupsCollector counts the total number of distinct groups;
        // wrap it with the second-pass collector when that count is wanted.
        TermAllGroupsCollector allGroupsCollector = null;
        if (requiredTotalGroupCount) {
            allGroupsCollector = new TermAllGroupsCollector(groupField);
            secondPassCollector = MultiCollector.wrap(c2, allGroupsCollector);
        } else {
            secondPassCollector = c2;
        }
        /* Serve the second pass from the cache when the first pass fit. */
        if (cachedCollector.isCached()) {
            // Replay cached hits; no second index scan needed.
            cachedCollector.replay(secondPassCollector);
        } else {
            // Cache overflowed: re-run the query for the second pass.
            searcher.search(query, secondPassCollector);
        }
        /* Total number of groups. */
        int totalGroupCount = 0;
        /* Total number of matching documents. */
        int totalHitCount = 0;
        /* Matching documents that belong to some group (usually == totalHitCount). */
        int totalGroupedHitCount = -1;
        if (requiredTotalGroupCount) {
            totalGroupCount = allGroupsCollector.getGroupCount();
        }
        // Print the total number of groups.
        System.out.println("groupCount: " + totalGroupCount);
        TopGroups<BytesRef> groupsResult = c2.getTopGroups(docOffset);
        // The next values summarise the first-pass statistics.
        totalHitCount = groupsResult.totalHitCount;
        totalGroupedHitCount = groupsResult.totalGroupedHitCount;
        System.out.println("groupsResult.totalHitCount:" + totalHitCount);
        System.out.println("groupsResult.totalGroupedHitCount:" + totalGroupedHitCount);
        System.out.println("///////////////////////////////////////////////");
        int groupIdx = 0;
        // Second-pass details follow; skip the second pass entirely when only
        // the aggregate statistics above are needed.
        for (GroupDocs<BytesRef> groupDocs : groupsResult.groups) {
            groupIdx++;
            // utf8ToString() honours the BytesRef offset/length, unlike
            // new String(bytes) which may read stale buffer bytes.
            String groupVL = groupDocs.groupValue == null ? "分组域的域值为空" : groupDocs.groupValue.utf8ToString();
            // Group value; groupIdx is the 1-based group index.
            System.out.println("group[" + groupIdx + "].groupFieldValue:" + groupVL);
            // Total hits inside this group.
            System.out.println("group[" + groupIdx + "].totalHits:" + groupDocs.totalHits);
            int docIdx = 0;
            // Iterate the documents inside the group.
            for (ScoreDoc scoreDoc : groupDocs.scoreDocs) {
                docIdx++;
                // Document id and score of each in-group hit.
                System.out.println("group[" + groupIdx + "][" + docIdx + "]{docID:Score}:" + scoreDoc.doc + "/" + scoreDoc.score);
                // searcher.doc(docID) returns the stored fields only; note
                // that docValues fields are NOT part of the returned Document.
                Document doc = searcher.doc(scoreDoc.doc);
                // Label fixed: this line prints the id and content fields.
                System.out.println("group[" + groupIdx + "][" + docIdx + "]{id:content}:" + doc.get("id") + ":" + doc.get("content"));
            }
            System.out.println("******************华丽且拉轰的分割线***********************");
        }
    }

    /**
     * Demonstrates the first-pass collector on its own: only group values and
     * their sort values are available, no per-group documents.
     *
     * @param indexSearcher searcher over the test index
     * @throws IOException on index access failure
     */
    public static void groupSearch(IndexSearcher indexSearcher) throws IOException {
        Sort groupSort = Sort.RELEVANCE;
        /* First pass keeps only the top N groups. */
        final AbstractFirstPassGroupingCollector<?> c1 = createRandomFirstPassCollector(groupField, groupSort, 10);
        indexSearcher.search(new TermQuery(new Term("content", "random")), c1);
        /*
         * final AbstractSecondPassGroupingCollector<?> c2 =
         * createSecondPassCollector( c1, groupField, groupSort, null, 0, 5,
         * true, true, true); indexSearcher.search(new TermQuery(new
         * Term("content", "random")), c2);
         */
        /* First argument is the offset: groups [offset, offset+topN] are returned. */
        Collection<?> groups = c1.getTopGroups(0, true);
        System.out.println("group.size:" + groups.size());
        for (Object object : groups) {
            SearchGroup<?> searchGroup = (SearchGroup<?>) object;
            if (searchGroup.groupValue != null) {
                if (searchGroup.groupValue.getClass().isAssignableFrom(BytesRef.class)) {
                    // utf8ToString() respects the BytesRef offset/length.
                    String groupVL = ((BytesRef) searchGroup.groupValue).utf8ToString();
                    if (groupVL.equals("")) {
                        System.out.println("该分组不包含分组域");
                    } else {
                        System.out.println(groupVL);
                    }
                } else if (searchGroup.groupValue.getClass().isAssignableFrom(
                        MutableValueStr.class)) {
                    if (searchGroup.groupValue.toString().endsWith("(null)")) {
                        System.out.println("该分组不包含分组域");
                    } else {
                        // BytesRefBuilder.get() yields a BytesRef with the
                        // correct length; bytes() exposes the raw (possibly
                        // over-allocated) buffer.
                        System.out.println(((MutableValueStr) searchGroup.groupValue).value.get().utf8ToString());
                    }
                }
            } else {
                System.out.println("该分组不包含分组域");
            }
            for (int i = 0; i < searchGroup.sortValues.length; i++) {
                System.out.println("searchGroup.sortValues:" + searchGroup.sortValues[i]);
            }
        }
        /*
         * System.out.println("groups.maxScore:" + groups.maxScore);
         * System.out.println("groups.totalHitCount:" + groups.totalHitCount);
         * System.out.println("groups.totalGroupedHitCount:" +
         * groups.totalGroupedHitCount); System.out.println("groups.length:" +
         * groups.groups.length); System.out.println("");
         *
         * GroupDocs<?> group = groups.groups[0]; compareGroupValue("author3",
         * group); System.out.println(group.scoreDocs.length);
         */
    }

    /**
     * Creates the test index documents.
     *
     * @throws IOException on index write failure
     */
    public static void createIndex() throws IOException {
        Directory dir = FSDirectory.open(Paths.get(indexDir));
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
        indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        IndexWriter writer = new IndexWriter(dir, indexWriterConfig);
        // addDocuments commits and closes the writer.
        addDocuments(groupField, writer);
    }

    /**
     * Adds the sample documents, then commits and closes the writer.
     *
     * @param groupField name of the grouping field
     * @param writer     writer for the test index
     * @throws IOException on index write failure
     */
    public static void addDocuments(String groupField, IndexWriter writer) throws IOException {
        // 0
        Document doc = new Document();
        addGroupField(doc, groupField, "author1");
        doc.add(new TextField("content", "random text", Field.Store.YES));
        doc.add(new Field("id", "1", Store.YES, Index.NOT_ANALYZED));
        writer.addDocument(doc);
        // 1
        doc = new Document();
        addGroupField(doc, groupField, "author1");
        doc.add(new TextField("content", "some more random text", Field.Store.YES));
        doc.add(new Field("id", "2", Store.YES, Index.NOT_ANALYZED));
        writer.addDocument(doc);
        // 2
        doc = new Document();
        addGroupField(doc, groupField, "author1");
        doc.add(new TextField("content", "some more random textual data", Field.Store.YES));
        doc.add(new Field("id", "3", Store.YES, Index.NOT_ANALYZED));
        writer.addDocument(doc);
        // 3
        doc = new Document();
        addGroupField(doc, groupField, "author2");
        doc.add(new TextField("content", "some random text", Field.Store.YES));
        doc.add(new Field("id", "4", Store.YES, Index.NOT_ANALYZED));
        writer.addDocument(doc);
        // 4
        doc = new Document();
        addGroupField(doc, groupField, "author3");
        doc.add(new TextField("content", "some more random text", Field.Store.YES));
        doc.add(new Field("id", "5", Store.YES, Index.NOT_ANALYZED));
        writer.addDocument(doc);
        // 5
        doc = new Document();
        addGroupField(doc, groupField, "author3");
        doc.add(new TextField("content", "random", Field.Store.YES));
        doc.add(new Field("id", "6", Store.YES, Index.NOT_ANALYZED));
        writer.addDocument(doc);
        // 6 -- no author field
        // NOTE(review): id "6" duplicates the previous document's id —
        // possibly intended to be "7"; kept as-is to preserve the sample data.
        doc = new Document();
        doc.add(new TextField("content", "random word stuck in alot of other text", Field.Store.YES));
        doc.add(new Field("id", "6", Store.YES, Index.NOT_ANALYZED));
        writer.addDocument(doc);
        writer.commit();
        writer.close();
    }

    /**
     * Compares the expected value against the group's value and prints whether
     * they are equal.
     *
     * @param expected expected group value (may be null)
     * @param group    group whose value is checked
     */
    private static void compareGroupValue(String expected, GroupDocs<?> group) {
        if (expected == null) {
            if (group.groupValue == null) {
                return;
            } else if (group.groupValue.getClass().isAssignableFrom(MutableValueStr.class)) {
                return;
            } else if (((BytesRef) group.groupValue).length == 0) {
                return;
            }
        }
        if (group.groupValue.getClass().isAssignableFrom(BytesRef.class)) {
            // BUGFIX: the original wrote `"..." + new BytesRef(expected) ==
            // group.groupValue` — '+' binds tighter than '==', so the whole
            // concatenated String was reference-compared (always false).
            // Compare values with equals() and parenthesize the comparison.
            System.out.println("expected == groupValue?" + new BytesRef(expected).equals(group.groupValue));
        } else if (group.groupValue.getClass().isAssignableFrom(MutableValueStr.class)) {
            MutableValueStr v = new MutableValueStr();
            v.value.copyChars(expected);
            // Same precedence/reference-comparison fix as above.
            System.out.println("expected == groupValue?" + v.equals(group.groupValue));
        } else {
            // Unsupported group value type: nothing to compare.
        }
    }

    /**
     * Creates a first-pass collector of the "other" flavour than the one
     * passed in.
     *
     * NOTE(review): the branches look swapped — a Term collector input yields
     * a Function collector and vice versa. This mirrors Lucene's test
     * utilities; confirm the intent before relying on it.
     *
     * @param groupField                 name of the grouping field
     * @param groupSort                  sort order for the groups
     * @param topDocs                    number of top groups to keep
     * @param firstPassGroupingCollector collector whose type selects the flavour
     * @return a first-pass grouping collector
     * @throws IOException on index access failure
     */
    private AbstractFirstPassGroupingCollector<?> createFirstPassCollector(String groupField, Sort groupSort, int topDocs, AbstractFirstPassGroupingCollector<?> firstPassGroupingCollector) throws IOException {
        if (TermFirstPassGroupingCollector.class.isAssignableFrom(firstPassGroupingCollector.getClass())) {
            ValueSource vs = new BytesRefFieldSource(groupField);
            return new FunctionFirstPassGroupingCollector(vs, new HashMap<>(), groupSort, topDocs);
        }
        return new TermFirstPassGroupingCollector(groupField, groupSort, topDocs);
    }

    /**
     * Creates a first-pass collector; the Function-based variant is currently
     * disabled (dead `if (false)` branch kept for documentation purposes).
     *
     * @param groupField name of the grouping field
     * @param groupSort  sort order for the groups
     * @param topDocs    number of top groups to keep
     * @return a first-pass grouping collector
     * @throws IOException on index access failure
     */
    private static AbstractFirstPassGroupingCollector<?> createRandomFirstPassCollector(String groupField, Sort groupSort, int topDocs) throws IOException {
        AbstractFirstPassGroupingCollector<?> selected;
        // boolean flag = new Random().nextBoolean();
        if (false) {
            ValueSource vs = new BytesRefFieldSource(groupField);
            // FunctionFirstPassGroupingCollector stores group values as
            // MutableValueStr, whose BytesRefBuilder grows its byte[] on
            // demand, whereas BytesRef is a fixed-length buffer.
            selected = new FunctionFirstPassGroupingCollector(vs, new HashMap<>(), groupSort, topDocs);
        } else {
            // TermFirstPassGroupingCollector works with a docValues group field.
            selected = new TermFirstPassGroupingCollector(groupField, groupSort, topDocs);
        }
        return selected;
    }

    /**
     * Creates a second-pass collector matching the flavour of the given
     * first-pass collector.
     *
     * @param firstPassGroupingCollector first-pass collector whose groups feed the second pass
     * @param groupField      name of the grouping field
     * @param groupSort       sort order for the groups
     * @param sortWithinGroup sort order inside each group
     * @param groupOffset     offset of the first group
     * @param maxDocsPerGroup maximum documents per group
     * @param getScores       whether per-document scores are computed
     * @param getMaxScores    whether the per-group maximum score is computed
     * @param fillSortFields  whether sort values are filled
     * @return a second-pass grouping collector
     * @throws IOException on index access failure
     */
    private static <T> AbstractSecondPassGroupingCollector<T> createSecondPassCollector(
            AbstractFirstPassGroupingCollector firstPassGroupingCollector,
            String groupField, Sort groupSort, Sort sortWithinGroup,
            int groupOffset, int maxDocsPerGroup, boolean getScores,
            boolean getMaxScores, boolean fillSortFields) throws IOException {
        if (TermFirstPassGroupingCollector.class.isAssignableFrom(firstPassGroupingCollector.getClass())) {
            Collection<SearchGroup<BytesRef>> searchGroups = firstPassGroupingCollector.getTopGroups(groupOffset, fillSortFields);
            return (AbstractSecondPassGroupingCollector) new TermSecondPassGroupingCollector(
                    groupField, searchGroups, groupSort, sortWithinGroup,
                    maxDocsPerGroup, getScores, getMaxScores, fillSortFields);
        } else {
            ValueSource vs = new BytesRefFieldSource(groupField);
            Collection<SearchGroup<MutableValue>> searchGroups = firstPassGroupingCollector.getTopGroups(groupOffset, fillSortFields);
            return (AbstractSecondPassGroupingCollector) new FunctionSecondPassGroupingCollector(
                    searchGroups, groupSort, sortWithinGroup, maxDocsPerGroup,
                    getScores, getMaxScores, fillSortFields, vs, new HashMap<>());
        }
    }

    // Basically converts searchGroups from MutableValue to BytesRef if grouping
    // by ValueSource
    @SuppressWarnings("unchecked")
    private AbstractSecondPassGroupingCollector<?> createSecondPassCollector(AbstractFirstPassGroupingCollector<?> firstPassGroupingCollector,
            String groupField, Collection<SearchGroup<BytesRef>> searchGroups, Sort groupSort, Sort sortWithinGroup, int maxDocsPerGroup, boolean getScores, boolean getMaxScores, boolean fillSortFields) throws IOException {
        if (firstPassGroupingCollector.getClass().isAssignableFrom(TermFirstPassGroupingCollector.class)) {
            return new TermSecondPassGroupingCollector(groupField, searchGroups, groupSort, sortWithinGroup, maxDocsPerGroup, getScores, getMaxScores, fillSortFields);
        } else {
            ValueSource vs = new BytesRefFieldSource(groupField);
            // Convert each BytesRef group value to a MutableValueStr group value.
            List<SearchGroup<MutableValue>> mvalSearchGroups = new ArrayList<SearchGroup<MutableValue>>(searchGroups.size());
            for (SearchGroup<BytesRef> mergedTopGroup : searchGroups) {
                SearchGroup<MutableValue> sg = new SearchGroup<>();
                MutableValueStr groupValue = new MutableValueStr();
                if (mergedTopGroup.groupValue != null) {
                    groupValue.value.copyBytes(mergedTopGroup.groupValue);
                } else {
                    groupValue.exists = false;
                }
                sg.groupValue = groupValue;
                sg.sortValues = mergedTopGroup.sortValues;
                mvalSearchGroups.add(sg);
            }
            return new FunctionSecondPassGroupingCollector(mvalSearchGroups, groupSort, sortWithinGroup, maxDocsPerGroup, getScores, getMaxScores, fillSortFields, vs, new HashMap<>());
        }
    }

    /**
     * Creates an all-groups collector (used to count the total number of
     * groups) matching the flavour of the given first-pass collector.
     *
     * @param firstPassGroupingCollector collector whose type selects the flavour
     * @param groupField                 name of the grouping field
     * @return an all-groups collector
     */
    private AbstractAllGroupsCollector<?> createAllGroupsCollector(AbstractFirstPassGroupingCollector<?> firstPassGroupingCollector, String groupField) {
        if (firstPassGroupingCollector.getClass().isAssignableFrom(TermFirstPassGroupingCollector.class)) {
            return new TermAllGroupsCollector(groupField);
        } else {
            ValueSource vs = new BytesRefFieldSource(groupField);
            return new FunctionAllGroupsCollector(vs, new HashMap<>());
        }
    }

    /**
     * Adds the grouping field to a document as a SortedDocValuesField.
     *
     * @param doc        document to add the field to
     * @param groupField name of the grouping field
     * @param value      field value
     */
    private static void addGroupField(Document doc, String groupField, String value) {
        doc.add(new SortedDocValuesField(groupField, new BytesRef(value)));
    }
}
源码:https://files.cnblogs.com/files/benwu/lucene5-Group.zip
【实例2】
http://www.hankcs.com/program/java/lucene-classification-statistical-sample.html
package com.lxbg.datapro.lucene;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;

import org.apache.commons.collections.MapUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.CachingCollector;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.grouping.GroupDocs;
import org.apache.lucene.search.grouping.SearchGroup;
import org.apache.lucene.search.grouping.TopGroups;
import org.apache.lucene.search.grouping.term.TermAllGroupsCollector;
import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector;
import org.apache.lucene.search.grouping.term.TermSecondPassGroupingCollector;
import org.apache.lucene.spatial.SpatialStrategy;
import org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy;
import org.apache.lucene.spatial.prefix.tree.GeohashPrefixTree;
import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree;
import org.apache.lucene.spatial.query.SpatialArgs;
import org.apache.lucene.spatial.query.SpatialOperation;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.BytesRef;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.lxbg.base.MapUtil;
import com.lxbg.base.Tool;
import com.lxbg.base.json.JsonUtil;
import com.lxbg.base.model.Parts;
import com.lxbg.datapro.model.SearchedUser;
import com.spatial4j.core.context.SpatialContext;
import com.spatial4j.core.distance.DistanceUtils;
import com.spatial4j.core.shape.Point;
import com.spatial4j.core.shape.Shape;

/**
 * Geo-spatial search combined with Lucene grouping: documents are matched
 * within a distance radius, grouped by technician (DocField.USER), and each
 * group's best hit is returned sorted by distance.
 */
public class SpatialSearch {

    private static Logger logger = LogManager.getLogger(SpatialSearch.class);

    private int topNGroups = 10; // number of groups per page
    private int groupOffset = 0; // first group to return
    private boolean fillFields = true;
    private int docOffset = 0; // in-group paging: first record
    private int docsPerGroup = 1; // records returned per group
    private boolean requiredTotalGroupCount = true; // compute total group count?

    // NOTE(review): static searcher/writer shared across instances and
    // reassigned per search() call — not thread-safe; confirm single-threaded use.
    private static IndexWriter indexWriter;
    private static IndexReader indexReader;
    private static IndexSearcher searcher;
    private SpatialContext ctx;
    private SpatialStrategy strategy;

    private String indexPath;

    // Use a key to update the index (or delete then re-add), otherwise
    // duplicate documents accumulate.

    public SpatialSearch(String indexPath) {
        this.indexPath = indexPath;
        this.ctx = SpatialContext.GEO;
        // Geohash prefix tree with 11 levels of precision.
        SpatialPrefixTree grid = new GeohashPrefixTree(ctx, 11);
        this.strategy = new RecursivePrefixTreeStrategy(grid, "location");
    }

    /**
     * Returns the (lazily created) index writer. Only one thread writes, so
     * no synchronization is used.
     *
     * @return the shared index writer, or null if creation failed earlier
     */
    private IndexWriter getWriter() {
        if (indexWriter == null) {
            IndexWriterConfig iwc = new IndexWriterConfig(new IKAnalyzer());
            try {
                Path path = Paths.get(indexPath);
                Directory directory = new MMapDirectory(path);
                indexWriter = new IndexWriter(directory, iwc);
                indexWriter.commit();
            } catch (IOException e) {
                logger.error("【SpatialSearch Constructor】" + e);
            }
        }
        return indexWriter;
    }

    /**
     * Returns a searcher over the latest index state, reopening the reader
     * when the index has changed (near-real-time search).
     *
     * @return a fresh searcher, or null on I/O failure
     */
    private IndexSearcher getSearcher() {
        try {
            if (indexReader == null) {
                indexReader = DirectoryReader.open(new MMapDirectory(Paths.get(indexPath)));
            } else {
                // Pick up newly committed segments to satisfy NRT needs.
                IndexReader newReader = DirectoryReader.openIfChanged((DirectoryReader) indexReader);
                if (newReader != null) {
                    indexReader.close();
                    indexReader = newReader;
                }
            }
            return new IndexSearcher(indexReader);
        } catch (IOException e) {
            logger.error("SpatialSearch->getSearcher:" + e);
        }
        return null;
    }

    /**
     * Distance-sorted, grouped search around a point.
     *
     * NOTE(review): spatial4j's makePoint(x, y) expects (longitude, latitude),
     * but this sample passes (lat, lng) consistently at both index and query
     * time, so the results are internally consistent — confirm before reuse.
     *
     * @param sitemCode   service item code
     * @param lat         latitude
     * @param lng         longitude
     * @param maxDistance maximum search distance (km)
     * @param inputText   user-entered query text
     * @param tags        input tags, comma separated
     * @return one best-matching document per technician group
     * @throws Exception on query parsing or index failure
     */
    public List<SearchedUser> search(String sitemCode, Double lat, Double lng, int maxDistance, String inputText, String tags) throws Exception {
        searcher = getSearcher();
        if (searcher == null) {
            return new ArrayList<SearchedUser>();
        }

        Point p = ctx.makePoint(lat, lng);
        // Restrict hits to a circle of maxDistance km around the point.
        SpatialArgs args = new SpatialArgs(SpatialOperation.Intersects,
                ctx.makeCircle(lat, lng, DistanceUtils.dist2Degrees(maxDistance, DistanceUtils.EARTH_MEAN_RADIUS_KM)));

        // Sort by distance from the query point.
        ValueSource valueSource = strategy.makeDistanceValueSource(p);
        Sort distSort = new Sort(valueSource.getSortField(false)).rewrite(searcher);

        Query query = buildQuery(sitemCode, inputText, tags);

        // First pass: group hits by technician, ordered by distance.
        TermFirstPassGroupingCollector c1 = new TermFirstPassGroupingCollector(DocField.USER, distSort, topNGroups);
        boolean cacheScores = true;
        double maxCacheRAMMB = 16.0;
        CachingCollector cachedCollector = CachingCollector.create(c1, cacheScores, maxCacheRAMMB);

        searcher.search(query, cachedCollector);

        Collection<SearchGroup<BytesRef>> topGroups = c1.getTopGroups(groupOffset, fillFields);

        if (topGroups == null) {
            return new ArrayList<>();
        }

        Collector secondPassCollector = null;

        boolean getScores = true;
        boolean getMaxScores = true;
        // Subclass TermSecondPassGroupingCollector if scores need adjusting.
        TermSecondPassGroupingCollector c2 = new TermSecondPassGroupingCollector(DocField.USER, topGroups, distSort, distSort, docsPerGroup, getScores,
                getMaxScores, fillFields);

        // Optionally count the total number of groups.
        TermAllGroupsCollector allGroupsCollector = null;
        if (requiredTotalGroupCount) {
            allGroupsCollector = new TermAllGroupsCollector(DocField.USER);
            secondPassCollector = MultiCollector.wrap(c2, allGroupsCollector);
        } else {
            secondPassCollector = c2;
        }

        if (cachedCollector.isCached()) {
            // Cached: replay the first pass instead of querying again.
            cachedCollector.replay(secondPassCollector);
        } else {
            // Cache overflowed: run the query a second time.
            searcher.search(query, secondPassCollector);
        }

        TopGroups<BytesRef> groupsResult = c2.getTopGroups(docOffset);
        double x = 0;
        double y = 0;

        List<SearchedUser> result = new ArrayList<>();

        // Iterate the groups.
        for (GroupDocs<BytesRef> groupDocs : groupsResult.groups) {

            // Iterate the records inside each group.
            for (ScoreDoc scoreDoc : groupDocs.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);

                System.out.println("doc:" + doc);

                // Stored location, e.g. "12.955800 77.620979".
                String[] toXY = doc.get("location").split(" ");
                x = Double.valueOf(toXY[0]);
                y = Double.valueOf(toXY[1]);

                double docDistDEG = ctx.getDistCalc().distance(args.getShape().getCenter(), x, y);
                // Convert degrees to metres.
                double distance = DistanceUtils.degrees2Dist(docDistDEG, DistanceUtils.EARTH_EQUATORIAL_RADIUS_KM * 1000);

                /*
                 * Part (product) information.
                 */
                long id = Long.valueOf(doc.get("id"));
                String name = doc.get("name");
                String images = doc.get("images");
                String prices = doc.get("prices");
                String info = doc.get("info");
                String ptags = doc.get("tags");

                String code = doc.get("code"); // technician code
                String scode = doc.get("scode");
                String stype = doc.get("stype");

                SearchedUser searchedUser = new SearchedUser(doc.get(DocField.USER), Math.round(distance), x, y);
                Parts parts = new Parts(code, scode, null, null, name, images, prices, info, ptags);
                parts.setId(id);
                searchedUser.setParts(parts);
                result.add(searchedUser);
            }
        }

        return result;
    }

    /**
     * Builds the multi-field query: service item code AND (name OR info OR
     * tags match), with name weighted above tags above info.
     *
     * @param sitemCode service item code (must match)
     * @param inputText user text; blank means "match everything in range"
     * @param tags      optional comma-separated tags
     * @return the composed query
     * @throws Exception on query parse failure
     */
    private Query buildQuery(String sitemCode, String inputText, String tags) throws Exception {

        // No keywords: return every document inside the radius.
        if (StringUtils.isBlank(inputText)) {
            return new MatchAllDocsQuery();
        }

        IKAnalyzer ik = new IKAnalyzer();

        // sitem == sitemCode AND (name OR info OR tags)
        // weight: name > tags > info

        BooleanQuery booleanQuery = new BooleanQuery();
        Query sitmeQuery = new TermQuery(new Term(DocField.SITEM, sitemCode));
        booleanQuery.add(sitmeQuery, Occur.MUST);

        BooleanQuery orQuery = new BooleanQuery();

        QueryParser nameParser = new QueryParser(DocField.NAME, ik);
        Query nameQuery = nameParser.parse(inputText);
        orQuery.add(nameQuery, Occur.SHOULD);

        QueryParser infoParser = new QueryParser(DocField.INFO, ik);
        Query infoQuery = infoParser.parse(inputText);
        orQuery.add(infoQuery, Occur.SHOULD);

        QueryParser labsParser = new QueryParser(DocField.TAGS, ik);
        Query labsQuery = labsParser.parse(inputText);
        orQuery.add(labsQuery, Occur.SHOULD);

        if (StringUtils.isNotBlank(tags)) {
            QueryParser p1 = new QueryParser(DocField.NAME, ik);
            Query query1 = p1.parse(tags);
            orQuery.add(query1, Occur.SHOULD);

            QueryParser p2 = new QueryParser(DocField.INFO, ik);
            Query query2 = p2.parse(tags);
            orQuery.add(query2, Occur.SHOULD);

            QueryParser p3 = new QueryParser(DocField.TAGS, ik);
            Query query3 = p3.parse(tags);
            orQuery.add(query3, Occur.SHOULD);
        }

        booleanQuery.add(orQuery, Occur.MUST);

        return booleanQuery;
    }

    // ######### index building #######

    /**
     * Deletes the documents with the given ids and commits.
     *
     * @param ids document ids to delete
     */
    public void deleteIndex(final Collection<String> ids) {
        try {
            IndexWriter writer = getWriter();
            for (String id : ids) {
                writer.deleteDocuments(new Term(DocField.ID, id)); // delete first
            }
            writer.commit();
        } catch (Exception e) {
            logger.error("【deleteIndex】error:" + e);
        }
    }

    /**
     * Indexes part information, first removing any changed/deleted ids.
     *
     * @param parts part records to index
     * @param ids   ids that changed (including deletions)
     * @throws IOException on index write failure
     */
    public void indexParts(final List<Map<String, String>> parts, final Collection<String> ids) throws IOException {

        deleteIndex(ids);

        Tool.sleep(0.5f);

        IndexWriter writer = getWriter();

        double x;
        double y;

        for (Map<?, ?> map : parts) {
            x = MapUtils.getDoubleValue(map, "x");
            y = MapUtils.getDoubleValue(map, "y");

            writer.addDocument(createDoc(MapUtils.getLongValue(map, "id"), ctx.makePoint(x, y), MapUtil.getString(map, "name"), MapUtil.getString(map, "info"),
                    MapUtil.getString(map, "ptag"), MapUtil.getString(map, "scode"), MapUtil.getString(map, "stype"), MapUtil.getString(map, "code"),
                    MapUtil.getString(map, "images"), MapUtil.getString(map, "prices")));
        }

        writer.commit(); // NOTE: committing per batch — watch performance

        // indexWriter.close(); // kept open to avoid re-initialisation
    }

    /**
     * Builds a geo document.
     *
     * @param id       product id
     * @param shape    location shape
     * @param name     product name
     * @param info     product description
     * @param tags     product tags
     * @param sitem    service item the product belongs to
     * @param stype    service section the product belongs to
     * @param userCode owning technician code
     * @param images   product images
     * @param prices   product prices
     * @return the document ready for indexing
     */
    private Document createDoc(Long id, Shape shape, String name, String info, String tags, String sitem, String stype, String userCode, String images,
            String prices) {

        FieldType ft = new FieldType();
        ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        ft.setStored(true);

        Document doc = new Document();

        FieldType idFieldType = new FieldType();
        idFieldType.setIndexOptions(IndexOptions.DOCS); // index mode
        idFieldType.setStored(true); // stored
        idFieldType.setTokenized(false); // not tokenized

        doc.add(new Field(DocField.ID, String.valueOf(id), idFieldType));

        doc.add(new StringField(DocField.SITEM, sitem, Store.YES));
        doc.add(new StringField(DocField.STYPE, stype, Store.YES));
        doc.add(new StringField(DocField.USER, userCode, Store.YES));
        doc.add(new StringField(DocField.IMAGES, images, Store.YES));
        doc.add(new StringField(DocField.PRICES, prices, Store.YES));

        // Grouping field (docValues).
        doc.add(new SortedDocValuesField(DocField.USER, new BytesRef(userCode)));

        Field nameField = new Field(DocField.NAME, name, ft);
        nameField.setBoost(3);
        doc.add(nameField);

        Field labsField = new Field(DocField.TAGS, tags, ft);
        labsField.setBoost(2);
        doc.add(labsField);

        doc.add(new Field(DocField.INFO, info, ft));

        // === key statements: index the spatial shape ===
        for (IndexableField f : strategy.createIndexableFields(shape)) {
            doc.add(f);
        }

        doc.add(new StoredField(strategy.getFieldName(), ctx.toString(shape)));

        return doc;
    }

    // ########################## test code below ##########################

    /** Indexes a fixed set of sample documents, then commits and closes. */
    public void indexDocuments() throws IOException {

        indexWriter.addDocument(createDoc((long) 1, ctx.makePoint(12.9558, 77.620979), "电脑", "主要是戴尔品牌电脑、显示器、鼠标键盘、配件", "tag1,tag2,tag3", "c104_104000", "c104",
                "ai301", "images", "111-22"));
        indexWriter.addDocument(createDoc((long) 2, ctx.makePoint(12.974045, 77.591995), "软件", "各类办公应用软件 如金蝶 用友财务系统", "tag1,tag2,tag3", "c104_104001", "c104",
                "ai301", "images", "111-22"));
        indexWriter.addDocument(createDoc((long) 3, ctx.makePoint(12.959365, 77.573792), "网络", "网络硬件产品,如路由器 交换机 网线等等", "tag1,tag2,tag3", "c104_104002", "c104",
                "ai301", "images", "111-22"));
        indexWriter.addDocument(createDoc((long) 4, ctx.makePoint(12.998095, 77.592041), "电脑", "国产各品牌电脑 联想 华硕 宏基 ", "tag1,tag2,tag3", "c104_104000", "c104",
                "ai302", "images", "111-22"));
        indexWriter.addDocument(createDoc((long) 5, ctx.makePoint(12.97018, 77.61219), "软件", "各类办公应用软件 主营CRM 分销系统 主营服装鞋帽类软件", "tag1,tag2,tag3", "c104_104001",
                "c104", "ai302", "images", "111-22"));
        indexWriter.addDocument(createDoc((long) 6, ctx.makePoint(12.992189, 80.2348618), "监控网络", "各类监控设备 摄像头 路由器 交换机 网线等等", "tag1,tag2,tag3", "c104_104002",
                "c104", "ai302", "images", "111-22"));
        indexWriter.addDocument(createDoc((long) 7, ctx.makePoint(12.998095, 77.592041), "电脑", "国产各品牌电脑 联想 华硕 宏基 ", "tag1,tag2,tag3", "c104_104000", "c104",
                "ai303", "images", "111-22"));
        indexWriter.addDocument(createDoc((long) 8, ctx.makePoint(12.998095, 77.592041), "电脑", "国产各品牌电脑 联想 华硕 宏基 ", "tag1,tag2,tag3", "c104_104000", "c104",
                "ai305*", "images", "111-22"));

        indexWriter.commit();

        indexWriter.close();
    }

    /** Deletes a technician's documents entirely (rebuild on next index). */
    public void delindex(String id) throws IOException {
        indexWriter.deleteDocuments(new Term(DocField.ID, id)); // delete first
        indexWriter.forceMergeDeletes();
        indexWriter.commit();
    }

    //
    // Document doc1 = new Document();
    // doc1.add(new TextField("filename", "text1", Store.YES));
    // doc1.add(new TextField("content", text1, Store.YES));
    //
    // indexWriter.updateDocument(new Term("filename","text1"), doc1);
    // === search setup below ===================================================//

    /** Opens a reader/searcher over the given index path (test helper). */
    public void setSearchIndexPath(String indexPath) throws IOException {
        // These fields are static; assigned directly rather than via `this`.
        indexReader = DirectoryReader.open(new MMapDirectory(Paths.get(indexPath)));
        searcher = new IndexSearcher(indexReader);
    }

    /**
     * Demo entry point: indexes the sample documents, then runs a few
     * grouped spatial searches around Cubbon Park (Bangalore).
     *
     * @param args unused
     * @throws Exception on any index/search failure
     */
    public static void main(String[] args) throws Exception {

        // String indexPath = "/usr/local/lucene";
        String indexPath = ".";
        SpatialSearch s = new SpatialSearch(indexPath);

        indexWriter = s.getWriter();
        // Indexes sample documents
        s.indexDocuments();

        // Search
        s.setSearchIndexPath(indexPath);

        // Get Places Within 4 kilometers from cubbon park.
        System.out.println("--------------\t 电脑");
        s.search("c104_104000", 12.974045, 77.591995, 4, "电脑", "");
        System.out.println("--------------\t路由器");
        s.search("c104_104000", 12.974045, 77.591995, 4, "路由器", "");
        System.out.println("--------------\tCRM");
        List<SearchedUser> list = s.search("c104_104001", 12.974045, 77.591995, 4, "CRM", "");

        for (SearchedUser item : list) {
            System.out.println("ptags: " + item.getParts().getPtag());
        }

        System.out.println("END...");

        // Set<String> users = Redis.getUserJedis().zrange("id:", 0, -1);
        //
        // for (String userCode : users) {
        //     System.out.println(userCode);
        // }

        Parts parts = new Parts();
        parts.setName("one parts");
        parts.setCode("one code");
        parts.setInfo("info");
        parts.setPtag("tag1, tag2, tag3");

        System.out.println(JsonUtil.toJson(parts));
    }
}

浙公网安备 33010602011771号