Lucene5学习之Group分组统计

Group即分组，类似SQL里的group by功能，Lucene中分组是通过内置的几种Collector结果集收集器实现的，有关group的结果集收集器都在org.apache.lucene.search.grouping包及其子包下，

包含group关键字的Collector都是有关Group分组的结果收集器，如果你只需要统计如下这些分组信息:

/** 所有组的数量 */
int totalGroupCount = 0;
/** 所有满足条件的记录数 */
int totalHitCount = 0;
/** 所有组内的满足条件的记录数(通常该值与totalHitCount是一致的) */
int totalGroupedHitCount = -1;

则直接使用FirstPassGroupingCollector收集器即可，如果你需要统计每个分组内部的命中总数以及命中索引文档的评分等信息，则需要使用SecondPassGroupingCollector，为了提高第二次查询的效率，可以使用CacheCollector来缓存第一次查询结果，这样第二次就直接从缓存中获取第一次查询结果，为了统计总的分组数量，你可能还需要使用AllGroupsCollector结果收集器。

下面是一个Group分组使用示例，具体详细说明请看代码里面的注释：

package com.yida.framework.lucene5.group;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Random;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.queries.function.valuesource.BytesRefFieldSource;
import org.apache.lucene.search.CachingCollector;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.grouping.AbstractAllGroupsCollector;
import org.apache.lucene.search.grouping.AbstractFirstPassGroupingCollector;
import org.apache.lucene.search.grouping.AbstractSecondPassGroupingCollector;
import org.apache.lucene.search.grouping.GroupDocs;
import org.apache.lucene.search.grouping.SearchGroup;
import org.apache.lucene.search.grouping.TopGroups;
import org.apache.lucene.search.grouping.function.FunctionAllGroupsCollector;
import org.apache.lucene.search.grouping.function.FunctionFirstPassGroupingCollector;
import org.apache.lucene.search.grouping.function.FunctionSecondPassGroupingCollector;
import org.apache.lucene.search.grouping.term.TermAllGroupsCollector;
import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector;
import org.apache.lucene.search.grouping.term.TermSecondPassGroupingCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.mutable.MutableValue;
import org.apache.lucene.util.mutable.MutableValueStr;

import com.yida.framework.lucene5.util.Tools;

public class GroupTest {
	/** 索引目录 */
	private static final String indexDir = "C:/group-index";
	/** 分词器 */
	private static Analyzer analyzer = new StandardAnalyzer();
	/** 分组域 */
	private static String groupField = "author";


	public static void main(String[] args) throws Exception {
		// 创建测试索引
		// createIndex();
		Directory directory = FSDirectory.open(Paths.get(indexDir));
		IndexReader reader = DirectoryReader.open(directory);
		IndexSearcher searcher = new IndexSearcher(reader);
		Query query = new TermQuery(new Term("content", "random"));
		/**每个分组内部的排序规则*/
		Sort groupSort = Sort.RELEVANCE;
		groupBy(searcher, query, groupSort);
		//groupSearch(searcher);
	}

	public static void groupBy(IndexSearcher searcher, Query query, Sort groupSort) throws IOException {
		/** 前N条中分组 */
		int topNGroups = 10;
		/** 分组起始偏移量 */
		int groupOffset = 0;
		/** 是否填充SearchGroup的sortValues */
		boolean fillFields = true;
		/** groupSort用于对组进行排序，docSort用于对组内记录进行排序，多数情况下两者是相同的，但也可不同 */
		Sort docSort = groupSort;
		/** 用于组内分页，起始偏移量 */
		int docOffset = 0;
		/** 每组返回多少条结果 */
		int docsPerGroup = 2;
		/** 是否需要计算总的分组数量 */
		boolean requiredTotalGroupCount = true;
		/** 是否需要缓存评分 */
		boolean cacheScores = true;

		TermFirstPassGroupingCollector c1 = new TermFirstPassGroupingCollector("author", groupSort, groupOffset + topNGroups);
		//第一次查询缓存容量的大小：设置为16M
		double maxCacheRAMMB = 16.0;
		/** 将TermFirstPassGroupingCollector包装成CachingCollector，为第一次查询加缓存，CachingCollector就是用来为结果收集器添加缓存功能的 */
		CachingCollector cachedCollector = CachingCollector.create(c1, cacheScores, maxCacheRAMMB);
		// 开始第一次分组统计
		searcher.search(query, cachedCollector);

		/**第一次查询返回的结果集TopGroups中只有分组域值以及每组总的评分，至于每个分组里有几条，分别哪些索引文档，则需要进行第二次查询获取*/
		Collection<SearchGroup<BytesRef>> topGroups = c1.getTopGroups(groupOffset, fillFields);

		if (topGroups == null) {
			System.out.println("No groups matched ");
			return;
		}
		
		Collector secondPassCollector = null;
		
		// 是否获取每个分组内部每个索引的评分
		boolean getScores = true;
		// 是否计算最大评分
		boolean getMaxScores = true;
		// 如果需要对Lucene的score进行修正，则需要重载TermSecondPassGroupingCollector
		TermSecondPassGroupingCollector c2 = new TermSecondPassGroupingCollector("author", topGroups, groupSort, docSort, docOffset + docsPerGroup, getScores, getMaxScores, fillFields);

		// 如果需要计算总的分组数量，则需要把TermSecondPassGroupingCollector包装成TermAllGroupsCollector
		// TermAllGroupsCollector就是用来收集总分组数量的
		TermAllGroupsCollector allGroupsCollector = null;
		//若需要统计总的分组数量
		if (requiredTotalGroupCount) {
			allGroupsCollector = new TermAllGroupsCollector("author");
			secondPassCollector = MultiCollector.wrap(c2, allGroupsCollector);
		} else {
			secondPassCollector = c2;
		}

		/**如果第一次查询已经加了缓存，则直接从缓存中取*/
		if (cachedCollector.isCached()) {
			// 第二次查询直接从缓存中取
			cachedCollector.replay(secondPassCollector);
		} else {
			// 开始第二次分组查询
			searcher.search(query, secondPassCollector);
		}

		/** 所有组的数量 */
		int totalGroupCount = 0;
		/** 所有满足条件的记录数 */
		int totalHitCount = 0;
		/** 所有组内的满足条件的记录数(通常该值与totalHitCount是一致的) */
		int totalGroupedHitCount = -1;
		if (requiredTotalGroupCount) {
			totalGroupCount = allGroupsCollector.getGroupCount();
		}
		//打印总的分组数量
		System.out.println("groupCount: " + totalGroupCount);

		TopGroups<BytesRef> groupsResult = c2.getTopGroups(docOffset);
		//这里打印的3项信息就是第一次查询的统计结果
		totalHitCount = groupsResult.totalHitCount;
		totalGroupedHitCount = groupsResult.totalGroupedHitCount;
		System.out.println("groupsResult.totalHitCount:" + totalHitCount);
		System.out.println("groupsResult.totalGroupedHitCount:" + totalGroupedHitCount);
		System.out.println("///////////////////////////////////////////////");
		int groupIdx = 0;
		
		//下面打印的是第二次查询的统计结果，如果你仅仅值需要第一次查询的统计结果信息，不需要每个分组内部的详细信息，则不需要进行第二次查询
		// 迭代组
		for (GroupDocs<BytesRef> groupDocs : groupsResult.groups) {
			groupIdx++;
			String groupVL = groupDocs.groupValue == null ? "分组域的域值为空" : new String(groupDocs.groupValue.bytes);
			// 分组域的域值，groupIdx表示组的索引即第几组
			System.out.println("group[" + groupIdx + "].groupFieldValue:" + groupVL);
			// 当前分组内命中的总记录数
			System.out.println("group[" + groupIdx + "].totalHits:" + groupDocs.totalHits);
			int docIdx = 0;
			// 迭代组内的记录
			for (ScoreDoc scoreDoc : groupDocs.scoreDocs) {
				docIdx++;
				// 打印分组内部每条记录的索引文档ID及其评分
				System.out.println("group[" + groupIdx + "][" + docIdx + "]{docID:Score}:" + scoreDoc.doc + "/" + scoreDoc.score);
				//根据docID可以获取到整个Document对象，通过doc.get(fieldName)可以获取某个存储域的域值
				//注意searcher.doc根据docID返回的document对象中不包含docValuesField域的域值，只包含非docValuesField域的域值，请知晓
				Document doc = searcher.doc(scoreDoc.doc);
				System.out.println("group[" + groupIdx + "][" + docIdx + "]{docID:author}:" + doc.get("id") + ":" + doc.get("content"));
			}
			System.out.println("******************华丽且拉轰的分割线***********************");
		}
	}

	public static void groupSearch(IndexSearcher indexSearcher)  throws IOException {

		Sort groupSort = Sort.RELEVANCE;

		/** 第一次查询只有Top N条记录进行分组统计 */
		final AbstractFirstPassGroupingCollector<?> c1 = createRandomFirstPassCollector( groupField, groupSort, 10);
		indexSearcher.search(new TermQuery(new Term("content", "random")), c1);

		/*
		 * final AbstractSecondPassGroupingCollector<?> c2 =
		 * createSecondPassCollector( c1, groupField, groupSort, null, 0, 5,
		 * true, true, true); indexSearcher.search(new TermQuery(new
		 * Term("content", "random")), c2);
		 */

		/** 第一个参数表示截取偏移量offset，截取[offset, offset+topN]范围内的组 */
		Collection<?> groups = c1.getTopGroups(0, true);
		System.out.println("group.size:" + groups.size());
		for (Object object : groups) {
			SearchGroup searchGroup = (SearchGroup) object;

			if (searchGroup.groupValue != null) {
				if (searchGroup.groupValue.getClass().isAssignableFrom(BytesRef.class)) {
					String groupVL = new String(	(((BytesRef) searchGroup.groupValue)).bytes);
					if (groupVL.equals("")) {
						System.out.println("该分组不包含分组域");
					} else {
						System.out.println(groupVL);
					}
				} else if (searchGroup.groupValue.getClass().isAssignableFrom(
						MutableValueStr.class)) {
					if (searchGroup.groupValue.toString().endsWith("(null)")) {
						System.out.println("该分组不包含分组域");
					} else {
						System.out.println(new String((((MutableValueStr) searchGroup.groupValue)).value.bytes()));
					}
				}
			} else {
				System.out.println("该分组不包含分组域");
			}
			for (int i = 0; i < searchGroup.sortValues.length; i++) {
				System.out.println("searchGroup.sortValues:"+ searchGroup.sortValues[i]);
			}
		}

		/*
		 * System.out.println("groups.maxScore：" + groups.maxScore);
		 * System.out.println("groups.totalHitCount：" + groups.totalHitCount);
		 * System.out.println("groups.totalGroupedHitCount：" +
		 * groups.totalGroupedHitCount); System.out.println("groups.length：" +
		 * groups.groups.length); System.out.println("");
		 * 
		 * GroupDocs<?> group = groups.groups[0]; compareGroupValue("author3",
		 * group); System.out.println(group.scoreDocs.length);
		 */

	}

	/**
	 * 创建测试用的索引文档
	 * 
	 * @throws IOException
	 */
	public static void createIndex() throws IOException {
		Directory dir = FSDirectory.open(Paths.get(indexDir));
		IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
		indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
		IndexWriter writer = new IndexWriter(dir, indexWriterConfig);
		addDocuments(groupField, writer);
	}

	/**
	 * 添加索引文档
	 * 
	 * @param groupField
	 * @param writer
	 * @throws IOException
	 */
	public static void addDocuments(String groupField, IndexWriter writer)	throws IOException {
		// 0
		Document doc = new Document();
		addGroupField(doc, groupField, "author1");
		doc.add(new TextField("content", "random text", Field.Store.YES));
		doc.add(new Field("id", "1", Store.YES, Index.NOT_ANALYZED));
		writer.addDocument(doc);

		// 1
		doc = new Document();
		addGroupField(doc, groupField, "author1");
		doc.add(new TextField("content", "some more random text", 	Field.Store.YES));
		doc.add(new Field("id", "2", Store.YES, Index.NOT_ANALYZED));
		writer.addDocument(doc);

		// 2
		doc = new Document();
		addGroupField(doc, groupField, "author1");
		doc.add(new TextField("content", "some more random textual data",	Field.Store.YES));
		doc.add(new Field("id", "3", Store.YES, Index.NOT_ANALYZED));
		writer.addDocument(doc);

		// 3
		doc = new Document();
		addGroupField(doc, groupField, "author2");
		doc.add(new TextField("content", "some random text", Field.Store.YES));
		doc.add(new Field("id", "4", Store.YES, Index.NOT_ANALYZED));
		writer.addDocument(doc);

		// 4
		doc = new Document();
		addGroupField(doc, groupField, "author3");
		doc.add(new TextField("content", "some more random text",	Field.Store.YES));
		doc.add(new Field("id", "5", Store.YES, Index.NOT_ANALYZED));
		writer.addDocument(doc);

		// 5
		doc = new Document();
		addGroupField(doc, groupField, "author3");
		doc.add(new TextField("content", "random", Field.Store.YES));
		doc.add(new Field("id", "6", Store.YES, Index.NOT_ANALYZED));
		writer.addDocument(doc);

		// 6 -- no author field
		doc = new Document();
		doc.add(new TextField("content", "random word stuck in alot of other text", Field.Store.YES));
		doc.add(new Field("id", "6", Store.YES, Index.NOT_ANALYZED));
		writer.addDocument(doc);
		writer.commit();
		writer.close();
	}

	/**
	 * 判断域值是否与分组域值相等
	 * 
	 * @param expected
	 * @param group
	 */
	private static void compareGroupValue(String expected, GroupDocs<?> group) {
		if (expected == null) {
			if (group.groupValue == null) {
				return;
			} else if (group.groupValue.getClass().isAssignableFrom(MutableValueStr.class)) {
				return;
			} else if (((BytesRef) group.groupValue).length == 0) {
				return;
			}
		}

		if (group.groupValue.getClass().isAssignableFrom(BytesRef.class)) {
			System.out.println("expected == groupValue?"  + new BytesRef(expected) == group.groupValue);
		} else if (group.groupValue.getClass().isAssignableFrom(MutableValueStr.class)) {
			MutableValueStr v = new MutableValueStr();
			v.value.copyChars(expected);
			System.out.println("expected == groupValue?" + v == group.groupValue);
		} else {
		}
	}

	/**
	 * 创建FirstPassCollector首次检索
	 * 
	 * @param groupField
	 * @param groupSort
	 * @param topDocs
	 * @param firstPassGroupingCollector
	 * @return
	 * @throws IOException
	 */
	private AbstractFirstPassGroupingCollector<?> createFirstPassCollector(String groupField, Sort groupSort, int topDocs,	AbstractFirstPassGroupingCollector<?> firstPassGroupingCollector)  throws IOException {
		if (TermFirstPassGroupingCollector.class.isAssignableFrom(firstPassGroupingCollector.getClass())) {
			ValueSource vs = new BytesRefFieldSource(groupField);
			return new FunctionFirstPassGroupingCollector(vs, new HashMap(),groupSort, topDocs);
		}
		return new TermFirstPassGroupingCollector(groupField, groupSort,topDocs);
	}

	private static AbstractFirstPassGroupingCollector<?> createRandomFirstPassCollector(String groupField, Sort groupSort, int topDocs) throws IOException {
		AbstractFirstPassGroupingCollector<?> selected;
		// boolean flag = new Random().nextBoolean();
		if (false) {
			ValueSource vs = new BytesRefFieldSource(groupField);
			// FunctionFirstPassGroupingCollector区别是对于分组域的值采用MutableValueStr进行存储，
			// MutableValueStr内部维护的是一个BytesRefBuilder，BytesRefBuilder内部有一个grow函数，会自动
			// 扩充内部byte[]容量，而BytesRef是定长的buffer
			selected = new FunctionFirstPassGroupingCollector(vs,	new HashMap(), groupSort, topDocs);
		} else {
			// TermFirstPassGroupingCollector适用于你的分组域是一个非DocValuesField
			selected = new TermFirstPassGroupingCollector(groupField,groupSort, topDocs);
		}
		return selected;
	}

	private static <T> AbstractSecondPassGroupingCollector<T> createSecondPassCollector(
			AbstractFirstPassGroupingCollector firstPassGroupingCollector,
			String groupField, Sort groupSort, Sort sortWithinGroup,
			int groupOffset, int maxDocsPerGroup, boolean getScores,
			boolean getMaxScores, boolean fillSortFields) throws IOException {

		if (TermFirstPassGroupingCollector.class.isAssignableFrom(firstPassGroupingCollector.getClass())) {
			Collection<SearchGroup<BytesRef>> searchGroups = firstPassGroupingCollector.getTopGroups(groupOffset, fillSortFields);
			return (AbstractSecondPassGroupingCollector) new TermSecondPassGroupingCollector(
					groupField, searchGroups, groupSort, sortWithinGroup,
					maxDocsPerGroup, getScores, getMaxScores, fillSortFields);
		} else {
			ValueSource vs = new BytesRefFieldSource(groupField);
			Collection<SearchGroup<MutableValue>> searchGroups = firstPassGroupingCollector	.getTopGroups(groupOffset, fillSortFields);
			return (AbstractSecondPassGroupingCollector) new FunctionSecondPassGroupingCollector(
					searchGroups, groupSort, sortWithinGroup, maxDocsPerGroup,
					getScores, getMaxScores, fillSortFields, vs, new HashMap());
		}
	}

	// Basically converts searchGroups from MutableValue to BytesRef if grouping
	// by ValueSource
	@SuppressWarnings("unchecked")
	private AbstractSecondPassGroupingCollector<?> createSecondPassCollector(AbstractFirstPassGroupingCollector<?> firstPassGroupingCollector,
			String groupField, Collection<SearchGroup<BytesRef>> searchGroups,Sort groupSort, Sort sortWithinGroup, int maxDocsPerGroup,boolean getScores, boolean getMaxScores, boolean fillSortFields) throws IOException {
		if (firstPassGroupingCollector.getClass().isAssignableFrom(TermFirstPassGroupingCollector.class)) {
			return new TermSecondPassGroupingCollector(groupField,	searchGroups, groupSort, sortWithinGroup, maxDocsPerGroup,	getScores, getMaxScores, fillSortFields);
		} else {
			ValueSource vs = new BytesRefFieldSource(groupField);
			List<SearchGroup<MutableValue>> mvalSearchGroups = new ArrayList<SearchGroup<MutableValue>>(searchGroups.size());
			for (SearchGroup<BytesRef> mergedTopGroup : searchGroups) {
				SearchGroup<MutableValue> sg = new SearchGroup();
				MutableValueStr groupValue = new MutableValueStr();
				if (mergedTopGroup.groupValue != null) {
					groupValue.value.copyBytes(mergedTopGroup.groupValue);
				} else {
					groupValue.exists = false;
				}
				sg.groupValue = groupValue;
				sg.sortValues = mergedTopGroup.sortValues;
				mvalSearchGroups.add(sg);
			}

			return new FunctionSecondPassGroupingCollector(mvalSearchGroups,groupSort, sortWithinGroup, maxDocsPerGroup, getScores,getMaxScores, fillSortFields, vs, new HashMap());
		}
	}

	private AbstractAllGroupsCollector<?> createAllGroupsCollector(AbstractFirstPassGroupingCollector<?> firstPassGroupingCollector,String groupField) {
		if (firstPassGroupingCollector.getClass().isAssignableFrom(TermFirstPassGroupingCollector.class)) {
			return new TermAllGroupsCollector(groupField);
		} else {
			ValueSource vs = new BytesRefFieldSource(groupField);
			return new FunctionAllGroupsCollector(vs, new HashMap());
		}
	}

	/**
	 * 添加分组域
	 * 
	 * @param doc
	 *            索引文档
	 * @param groupField
	 *            需要分组的域名称
	 * @param value
	 *            域值
	 */
	private static void addGroupField(Document doc, String groupField,String value) {
		doc.add(new SortedDocValuesField(groupField, new BytesRef(value)));
	}
}

源码：https://files.cnblogs.com/files/benwu/lucene5-Group.zip

【实例2】
http://www.hankcs.com/program/java/lucene-classification-statistical-sample.html

  1 package com.lxbg.datapro.lucene;
  2 
  3 import java.io.IOException;
  4 import java.nio.file.Path;
  5 import java.nio.file.Paths;
  6 import java.util.ArrayList;
  7 import java.util.Collection;
  8 import java.util.List;
  9 import java.util.Map;
 10 
 11 import org.apache.commons.collections.MapUtils;
 12 import org.apache.commons.lang3.StringUtils;
 13 import org.apache.log4j.LogManager;
 14 import org.apache.log4j.Logger;
 15 import org.apache.lucene.document.Document;
 16 import org.apache.lucene.document.Field;
 17 import org.apache.lucene.document.Field.Store;
 18 import org.apache.lucene.document.FieldType;
 19 import org.apache.lucene.document.SortedDocValuesField;
 20 import org.apache.lucene.document.StoredField;
 21 import org.apache.lucene.document.StringField;
 22 import org.apache.lucene.index.DirectoryReader;
 23 import org.apache.lucene.index.IndexOptions;
 24 import org.apache.lucene.index.IndexReader;
 25 import org.apache.lucene.index.IndexWriter;
 26 import org.apache.lucene.index.IndexWriterConfig;
 27 import org.apache.lucene.index.IndexableField;
 28 import org.apache.lucene.index.Term;
 29 import org.apache.lucene.queries.function.ValueSource;
 30 import org.apache.lucene.queryparser.classic.QueryParser;
 31 import org.apache.lucene.search.BooleanClause.Occur;
 32 import org.apache.lucene.search.BooleanQuery;
 33 import org.apache.lucene.search.CachingCollector;
 34 import org.apache.lucene.search.Collector;
 35 import org.apache.lucene.search.IndexSearcher;
 36 import org.apache.lucene.search.MatchAllDocsQuery;
 37 import org.apache.lucene.search.MultiCollector;
 38 import org.apache.lucene.search.Query;
 39 import org.apache.lucene.search.ScoreDoc;
 40 import org.apache.lucene.search.Sort;
 41 import org.apache.lucene.search.TermQuery;
 42 import org.apache.lucene.search.grouping.GroupDocs;
 43 import org.apache.lucene.search.grouping.SearchGroup;
 44 import org.apache.lucene.search.grouping.TopGroups;
 45 import org.apache.lucene.search.grouping.term.TermAllGroupsCollector;
 46 import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector;
 47 import org.apache.lucene.search.grouping.term.TermSecondPassGroupingCollector;
 48 import org.apache.lucene.spatial.SpatialStrategy;
 49 import org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy;
 50 import org.apache.lucene.spatial.prefix.tree.GeohashPrefixTree;
 51 import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree;
 52 import org.apache.lucene.spatial.query.SpatialArgs;
 53 import org.apache.lucene.spatial.query.SpatialOperation;
 54 import org.apache.lucene.store.Directory;
 55 import org.apache.lucene.store.MMapDirectory;
 56 import org.apache.lucene.util.BytesRef;
 57 import org.wltea.analyzer.lucene.IKAnalyzer;
 58 
 59 import com.lxbg.base.MapUtil;
 60 import com.lxbg.base.Tool;
 61 import com.lxbg.base.json.JsonUtil;
 62 import com.lxbg.base.model.Parts;
 63 import com.lxbg.datapro.model.SearchedUser;
 64 import com.spatial4j.core.context.SpatialContext;
 65 import com.spatial4j.core.distance.DistanceUtils;
 66 import com.spatial4j.core.shape.Point;
 67 import com.spatial4j.core.shape.Shape;
 68 
 69 public class SpatialSearch {
 70 
 71     private static Logger logger = LogManager.getLogger(SpatialSearch.class);
 72 
 73     private int topNGroups = 10; // 每页需要多少个组
 74     private int groupOffset = 0; // 起始的组
 75     private boolean fillFields = true;
 76     private int docOffset = 0; // 用于组内分页，起始的记录
 77     private int docsPerGroup = 1;// 每组返回多少条结果
 78     private boolean requiredTotalGroupCount = true; // 是否需要计算总的组的数量
 79 
 80     /**
 81      * 搜索的最少个数
 82      */
 83     private static IndexWriter indexWriter;
 84     private static IndexReader indexReader;
 85     private static IndexSearcher searcher;
 86     private SpatialContext ctx;
 87     private SpatialStrategy strategy;
 88 
 89     private String indexPath;
 90 
 91     // 使用key更新索引，或者先删除再增加，不然会重复多条
 92 
 93     public SpatialSearch(String indexPath) {
 94 
 95         this.indexPath = indexPath;
 96         this.ctx = SpatialContext.GEO;
 97         SpatialPrefixTree grid = new GeohashPrefixTree(ctx, 11);
 98         this.strategy = new RecursivePrefixTreeStrategy(grid, "location");
 99     }
100 
101     /**
102      * 获取索引写入器(只有一个线程，不需要同步)
103      *
104      * @return
105      */
106     private IndexWriter getWriter() {
107 
108         if (indexWriter == null) {
109             IndexWriterConfig iwc = new IndexWriterConfig(new IKAnalyzer());
110 
111             try {
112                 Path path = Paths.get(indexPath);
113                 Directory directory = new MMapDirectory(path);
114 
115                 indexWriter = new IndexWriter(directory, iwc);
116                 indexWriter.commit();
117             } catch (IOException e) {
118                 logger.error("【SpatialSearch Constructor】" + e);
119             }
120         }
121 
122         return indexWriter;
123     }
124 
125     /**
126      * 获取搜索器(可能同时有多个用户调用，要同步)
127      *
128      * @return
129      */
130     private IndexSearcher getSearcher() {
131 
132         try {
133             if (indexReader == null) {
134                 indexReader = DirectoryReader.open(new MMapDirectory(Paths.get(indexPath)));
135             } else {
136                 IndexReader newReader = DirectoryReader.openIfChanged((DirectoryReader) indexReader);// 读入新增加的增量索引内容，满足实时索引需求
137                 if (newReader != null) {
138                     indexReader.close();
139                     indexReader = newReader;
140                 }
141             }
142             return new IndexSearcher(indexReader);
143         } catch (IOException e) {
144             logger.error("SpatialSearch->getSearcher:" + e);
145         }
146 
147         return null;
148     }
149 
150     /**
151      * @param sitemCode
152      *            服务项目编号
153      * @param lat
154      *            经度
155      * @param lng
156      *            纬度
157      * @param maxDistance
158      *            搜索最多距离
159      * @param inputText
160      *            用户输入信息
161      * @param tags
162      *            输入的标签，多个用 ,号隔开
163      * @throws Exception
164      */
165     public List<SearchedUser> search(String sitemCode, Double lat, Double lng, int maxDistance, String inputText, String tags) throws Exception {
166 
167         searcher = getSearcher();
168         if (searcher == null) {
169             return new ArrayList<SearchedUser>();
170         }
171 
172         Point p = ctx.makePoint(lat, lng);
173         SpatialArgs args = new SpatialArgs(SpatialOperation.Intersects,
174                 ctx.makeCircle(lat, lng, DistanceUtils.dist2Degrees(maxDistance, DistanceUtils.EARTH_MEAN_RADIUS_KM)));
175 
176         ValueSource valueSource = strategy.makeDistanceValueSource(p);
177         Sort distSort = new Sort(valueSource.getSortField(false)).rewrite(searcher);
178 
179         Query query = buildQuery(sitemCode, inputText, tags);
180 
181         TermFirstPassGroupingCollector c1 = new TermFirstPassGroupingCollector(DocField.USER, distSort, topNGroups);
182         boolean cacheScores = true;
183         double maxCacheRAMMB = 16.0;
184         CachingCollector cachedCollector = CachingCollector.create(c1, cacheScores, maxCacheRAMMB);
185 
186         searcher.search(query, cachedCollector);
187 
188         Collection<SearchGroup<BytesRef>> topGroups = c1.getTopGroups(groupOffset, fillFields);
189 
190         if (topGroups == null) {
191             return new ArrayList<>();
192         }
193 
194         Collector secondPassCollector = null;
195 
196         boolean getScores = true;
197         boolean getMaxScores = true;
198         // 如果需要对Lucene的score进行修正，则需要重载TermSecondPassGroupingCollector
199         TermSecondPassGroupingCollector c2 = new TermSecondPassGroupingCollector(DocField.USER, topGroups, distSort, distSort, docsPerGroup, getScores,
200                 getMaxScores, fillFields);
201 
202         // 是否需要计算一共有多少个分类，这一步是可选的
203         TermAllGroupsCollector allGroupsCollector = null;
204         if (requiredTotalGroupCount) {
205             allGroupsCollector = new TermAllGroupsCollector(DocField.USER);
206             secondPassCollector = MultiCollector.wrap(c2, allGroupsCollector);
207         } else {
208             secondPassCollector = c2;
209         }
210 
211         if (cachedCollector.isCached()) {
212             // 被缓存的话，就用缓存
213             cachedCollector.replay(secondPassCollector);
214         } else {
215             // 超出缓存大小，重新执行一次查询
216             searcher.search(query, secondPassCollector);
217         }
218 
219         TopGroups<BytesRef> groupsResult = c2.getTopGroups(docOffset);
220         double x = 0;
221         double y = 0;
222 
223         List<SearchedUser> result = new ArrayList<>();
224 
225         // 迭代组
226         for (GroupDocs<BytesRef> groupDocs : groupsResult.groups) {
227 
228             // 迭代组内的记录
229             for (ScoreDoc scoreDoc : groupDocs.scoreDocs) {
230                 Document doc = searcher.doc(scoreDoc.doc);
231 
232                 System.out.println("doc:" + doc);
233 
234                 String[] toXY = doc.get("location").split(" "); // 目标的 x y ,如： 12.955800 77.620979
235                 x = Double.valueOf(toXY[0]);
236                 y = Double.valueOf(toXY[1]);
237 
238                 double docDistDEG = ctx.getDistCalc().distance(args.getShape().getCenter(), x, y);
239                 double distance = DistanceUtils.degrees2Dist(docDistDEG, DistanceUtils.EARTH_EQUATORIAL_RADIUS_KM * 1000); // 转化为： 米
240 
241                 /**
242                  * 配件信息
243                  */
244                 long id = Long.valueOf(doc.get("id"));
245                 String name = doc.get("name");
246                 String images = doc.get("images");
247                 String prices = doc.get("prices");
248                 String info = doc.get("info");
249                 String ptags = doc.get("tags");
250 
251                 String code = doc.get("code"); // 技工编号
252                 String scode = doc.get("scode");
253                 String stype = doc.get("stype");
254 
255                 SearchedUser searchedUser = new SearchedUser(doc.get(DocField.USER), Math.round(distance), x, y);
256                 Parts parts = new Parts(code, scode, null, null, name, images, prices, info, ptags);
257                 parts.setId(id);
258                 searchedUser.setParts(parts);
259                 result.add(searchedUser);
260             }
261         }
262 
263         return result;
264 
265     }
266 
267     // 多域查询
268     private Query buildQuery(String sitemCode, String inputText, String tags) throws Exception {
269 
270         // 如果没有指定关键字，则返回范围内的所有结果
271         if (StringUtils.isBlank(inputText)) {
272             return new MatchAllDocsQuery();
273         }
274 
275         IKAnalyzer ik = new IKAnalyzer();
276 
277         // 等于服务项目编号 and（name包括 or info包括 + tags包括）
278         // 权重： name包括 > labs包括 > info包括
279 
280         BooleanQuery booleanQuery = new BooleanQuery();
281         Query sitmeQuery = new TermQuery(new Term(DocField.SITEM, sitemCode));
282         booleanQuery.add(sitmeQuery, Occur.MUST);
283 
284         BooleanQuery orQuery = new BooleanQuery();
285 
286         QueryParser nameParser = new QueryParser(DocField.NAME, ik);
287         Query nameQuery = nameParser.parse(inputText);
288         orQuery.add(nameQuery, Occur.SHOULD);
289 
290         QueryParser infoParser = new QueryParser(DocField.INFO, ik);
291         Query infoQuery = infoParser.parse(inputText);
292         orQuery.add(infoQuery, Occur.SHOULD);
293 
294         QueryParser labsParser = new QueryParser(DocField.TAGS, ik);
295         Query labsQuery = labsParser.parse(inputText);
296         orQuery.add(labsQuery, Occur.SHOULD);
297 
298         if (StringUtils.isNotBlank(tags)) {
299             QueryParser p1 = new QueryParser(DocField.NAME, ik);
300             Query query1 = p1.parse(tags);
301             orQuery.add(query1, Occur.SHOULD);
302 
303             QueryParser p2 = new QueryParser(DocField.INFO, ik);
304             Query query2 = p2.parse(tags);
305             orQuery.add(query2, Occur.SHOULD);
306 
307             QueryParser p3 = new QueryParser(DocField.TAGS, ik);
308             Query query3 = p3.parse(tags);
309             orQuery.add(query3, Occur.SHOULD);
310         }
311 
312         booleanQuery.add(orQuery, Occur.MUST);
313 
314         return booleanQuery;
315     }
316 
317     // ######### 构建索引 #######
318 
319     public void deleteIndex(final Collection<String> ids) {
320 
321         try {
322             IndexWriter writer = getWriter();
323             for (String id : ids) {
324                 writer.deleteDocuments(new Term(DocField.ID, id)); // 先删除
325             }
326             writer.commit();
327         } catch (Exception e) {
328             logger.error("【deleteIndex】error:" + e);
329         }
330 
331     }
332 
333     /**
334      * 配件信息索引
335      *
336      * @param parts
337      *            配件信息
338      * @param ids
339      *            有变动的配件编号(包括删除的)
340      * @throws IOException
341      */
342     public void indexParts(final List<Map<String, String>> parts, final Collection<String> ids) throws IOException {
343 
344         deleteIndex(ids);
345 
346         Tool.sleep(0.5f);
347 
348         IndexWriter writer = getWriter();
349 
350         double x;
351         double y;
352 
353         for (Map<?, ?> map : parts) {
354             x = MapUtils.getDoubleValue(map, "x");
355             y = MapUtils.getDoubleValue(map, "y");
356 
357             writer.addDocument(createDoc(MapUtils.getLongValue(map, "id"), ctx.makePoint(x, y), MapUtil.getString(map, "name"), MapUtil.getString(map, "info"),
358                     MapUtil.getString(map, "ptag"), MapUtil.getString(map, "scode"), MapUtil.getString(map, "stype"), MapUtil.getString(map, "code"),
359                     MapUtil.getString(map, "images"), MapUtil.getString(map, "prices")));
360 
361         }
362 
363         writer.commit(); // 注意；性能问题
364 
365         // indexWriter.close(); // 不要关闭，不然还要初始化
366     }
367 
368     /**
369      * 创建Geo Document
370      *
371      * @param id
372      *            产品ID
373      * @param shape
374      *            位置
375      * @param name
376      *            产品名称
377      * @param info
378      *            产品描述
379      * @param tags
380      *            产品标签
381      * @param sitem
382      *            所属服务项目
383      * @param stype
384      *            所属服务板块
385      * @param user
386      *            所属技工code
387      * @param images
388      *            产品图像
389      * @param prices
390      *            产品价格
391      * @return
392      */
393     private Document createDoc(Long id, Shape shape, String name, String info, String tags, String sitem, String stype, String userCode, String images,
394             String prices) {
395 
396         FieldType ft = new FieldType();
397         ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
398         ft.setStored(true);
399 
400         Document doc = new Document();
401 
402         FieldType idFieldType = new FieldType();
403         idFieldType.setIndexOptions(IndexOptions.DOCS); // 索引方式
404         idFieldType.setStored(true); // 是否存储
405         idFieldType.setTokenized(false); // 是否分类
406 
407         doc.add(new Field(DocField.ID, String.valueOf(id), idFieldType));
408 
409         doc.add(new StringField(DocField.SITEM, sitem, Store.YES));
410         doc.add(new StringField(DocField.STYPE, stype, Store.YES));
411         doc.add(new StringField(DocField.USER, userCode, Store.YES));
412         doc.add(new StringField(DocField.IMAGES, images, Store.YES));
413         doc.add(new StringField(DocField.PRICES, prices, Store.YES));
414 
415         // 分组
416         doc.add(new SortedDocValuesField(DocField.USER, new BytesRef(userCode)));
417 
418         Field nameField = new Field(DocField.NAME, name, ft);
419         nameField.setBoost(3);
420         doc.add(nameField);
421 
422         Field labsField = new Field(DocField.TAGS, tags, ft);
423         labsField.setBoost(2);
424         doc.add(labsField);
425 
426         doc.add(new Field(DocField.INFO, info, ft));
427 
428         // ===关键语句===
429         for (IndexableField f : strategy.createIndexableFields(shape)) {
430             doc.add(f);
431         }
432 
433         doc.add(new StoredField(strategy.getFieldName(), ctx.toString(shape)));
434 
435         return doc;
436     }
437 
438     // ##########################以下为测试代码##########################
439 
440     public void indexDocuments() throws IOException {
441 
442         indexWriter.addDocument(createDoc((long) 1, ctx.makePoint(12.9558, 77.620979), "电脑", "主要是戴尔品牌电脑、显示器、鼠标键盘、配件", "tag1,tag2,tag3", "c104_104000", "c104",
443                 "ai301", "images", "111-22"));
444         indexWriter.addDocument(createDoc((long) 2, ctx.makePoint(12.974045, 77.591995), "软件", "各类办公应用软件 如金蝶 用友财务系统", "tag1,tag2,tag3", "c104_104001", "c104",
445                 "ai301", "images", "111-22"));
446         indexWriter.addDocument(createDoc((long) 3, ctx.makePoint(12.959365, 77.573792), "网络", "网络硬件产品，如路由器 交换机 网线等等", "tag1,tag2,tag3", "c104_104002", "c104",
447                 "ai301", "images", "111-22"));
448         indexWriter.addDocument(createDoc((long) 4, ctx.makePoint(12.998095, 77.592041), "电脑", "国产各品牌电脑  联想 华硕 宏基 ", "tag1,tag2,tag3", "c104_104000", "c104",
449                 "ai302", "images", "111-22"));
450         indexWriter.addDocument(createDoc((long) 5, ctx.makePoint(12.97018, 77.61219), "软件", "各类办公应用软件 主营CRM 分销系统 主营服装鞋帽类软件", "tag1,tag2,tag3", "c104_104001",
451                 "c104", "ai302", "images", "111-22"));
452         indexWriter.addDocument(createDoc((long) 6, ctx.makePoint(12.992189, 80.2348618), "监控网络", "各类监控设备 摄像头 路由器 交换机 网线等等", "tag1,tag2,tag3", "c104_104002",
453                 "c104", "ai302", "images", "111-22"));
454         indexWriter.addDocument(createDoc((long) 7, ctx.makePoint(12.998095, 77.592041), "电脑", "国产各品牌电脑  联想 华硕 宏基 ", "tag1,tag2,tag3", "c104_104000", "c104",
455                 "ai303", "images", "111-22"));
456         indexWriter.addDocument(createDoc((long) 8, ctx.makePoint(12.998095, 77.592041), "电脑", "国产各品牌电脑  联想 华硕 宏基 ", "tag1,tag2,tag3", "c104_104000", "c104",
457                 "ai305*", "images", "111-22"));
458 
459         indexWriter.commit();
460 
461         indexWriter.close();
462     }
463 
464     // 删除时，把某个技工的全部删掉重建
465     public void delindex(String id) throws IOException {
466         indexWriter.deleteDocuments(new Term(DocField.ID, id)); // 先删除
467         indexWriter.forceMergeDeletes();
468         indexWriter.commit();
469     }
470 
471     //
472     // Document doc1 = new Document();
473     // doc1.add(new TextField("filename", "text1", Store.YES));
474     // doc1.add(new TextField("content", text1, Store.YES));
475     //
476     // indexWriter.updateDocument(new Term("filename","text1"), doc1);
477     // ===以下是搜索=============================================================//
478 
479     public void setSearchIndexPath(String indexPath) throws IOException {
480 
481         this.indexReader = DirectoryReader.open(new MMapDirectory(Paths.get(indexPath)));
482         this.searcher = new IndexSearcher(indexReader);
483     }
484 
485     /**
486      * @param args
487      * @throws Exception
488      */
489     public static void main(String[] args) throws Exception {
490         // TODO Auto-generated method stub
491 
492         // String indexPath = "/usr/local/lucene";
493 
494         String indexPath = ".";
495         SpatialSearch s = new SpatialSearch(indexPath);
496 
497         indexWriter = s.getWriter();
498         // Indexes sample documents
499         s.indexDocuments();
500 
501         // 搜索
502         s.setSearchIndexPath(indexPath);
503 
504         // Get Places Within 4 kilometers from cubbon park.
505         System.out.println("--------------\t 电脑");
506         s.search("c104_104000", 12.974045, 77.591995, 4, "电脑", "");
507         System.out.println("--------------\t路由器");
508         s.search("c104_104000", 12.974045, 77.591995, 4, "路由器", "");
509         System.out.println("--------------\tCRM");
510         List<SearchedUser> list = s.search("c104_104001", 12.974045, 77.591995, 4, "CRM", "");
511 
512         for (SearchedUser item : list) {
513             System.out.println("ptags: " + item.getParts().getPtag());
514         }
515 
516         System.out.println("END...");
517 
518         // Set<String> users = Redis.getUserJedis().zrange("id:", 0, -1);
519         //
520         // for(String userCode: users){
521         // System.out.println(userCode);
522         //
523         // }
524 
525         Parts parts = new Parts();
526         parts.setName("one parts");
527         parts.setCode("one code");
528         parts.setInfo("info");
529         parts.setPtag("tag1, tag2, tag3");
530 
531         System.out.println(JsonUtil.toJson(parts));
532 
533     }
534 }

View Code

posted @ 2016-08-15 15:34 南极山阅读(715) 评论(0) 收藏举报

刷新页面返回顶部

南极山

努力学习新技术....

Lucene5学习之Group分组统计

公告