Chapter 10 - Big Data Integration
10.1 Big Data Scenarios Overview
10.1.1 Positioning geometry-api-java for Big Data
geometry-api-java was designed from the start for big data spatial processing. It is particularly well suited to:
- Hadoop MapReduce: distributed spatial analysis
- Apache Spark: in-memory spatial processing
- Apache Hive: spatial SQL queries (via UDFs)
- Apache Storm: real-time spatial stream processing
- Distributed databases: spatial extensions for HBase and Cassandra
10.1.2 Challenges in Big Data Spatial Processing
Challenge                  geometry-api-java solution
─────────────────────────────────────────────────
Large data volumes      → streaming with GeometryCursor (sketch below)
Limited memory          → lightweight object design
Serialization overhead  → efficient WKB format
Distributed computing   → stateless operator design
Fault tolerance         → serializable geometry objects
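As a concrete example of the streaming row above, OperatorUnion can consume a GeometryCursor and merge inputs incrementally rather than requiring a materialized Geometry[]. A minimal sketch (the two WKT polygons are illustrative):
public class CursorUnionDemo {
    public static void main(String[] args) {
        SpatialReference sr = SpatialReference.create(4326);
        Geometry a = GeometryEngine.geometryFromWkt(
            "POLYGON ((0 0, 2 0, 2 2, 0 2, 0 0))", 0, Geometry.Type.Polygon);
        Geometry b = GeometryEngine.geometryFromWkt(
            "POLYGON ((1 1, 3 1, 3 3, 1 3, 1 1))", 0, Geometry.Type.Polygon);
        // The cursor hands inputs to the operator one at a time
        GeometryCursor inputs = new SimpleGeometryCursor(new Geometry[] { a, b });
        GeometryCursor result = OperatorUnion.local().execute(inputs, sr, null);
        System.out.println(GeometryEngine.geometryToWkt(result.next(), 0));
    }
}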
10.2 Hadoop MapReduce Integration
10.2.1 A Spatial Data InputFormat
/**
 * InputFormat for reading line-delimited GeoJSON files
 */
public class GeoJsonInputFormat extends FileInputFormat<LongWritable, Text> {
@Override
public RecordReader<LongWritable, Text> createRecordReader(
InputSplit split, TaskAttemptContext context) {
return new GeoJsonRecordReader();
}
}
public class GeoJsonRecordReader extends RecordReader<LongWritable, Text> {
private LineRecordReader lineReader;
private LongWritable key = new LongWritable();
private Text value = new Text();
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException {
lineReader = new LineRecordReader();
lineReader.initialize(split, context);
}
@Override
public boolean nextKeyValue() throws IOException {
if (!lineReader.nextKeyValue()) {
return false;
}
key.set(lineReader.getCurrentKey().get());
value.set(lineReader.getCurrentValue());
return true;
}
@Override
public LongWritable getCurrentKey() {
return key;
}
@Override
public Text getCurrentValue() {
return value;
}
@Override
public float getProgress() throws IOException {
return lineReader.getProgress();
}
@Override
public void close() throws IOException {
lineReader.close();
}
}
10.2.2 A Spatial Filter Mapper
/**
 * Spatial filter Mapper: emits only geometries inside a given region.
 * Values are emitted as WKB to match the job configuration and the
 * union reducer below.
 */
public class SpatialFilterMapper
extends Mapper<LongWritable, Text, Text, BytesWritable> {
private Polygon filterPolygon;
private SpatialReference sr;
@Override
protected void setup(Context context) {
// Read the filter region from the job configuration
String wkt = context.getConfiguration().get("filter.polygon.wkt");
filterPolygon = (Polygon) GeometryEngine.geometryFromWkt(
wkt, 0, Geometry.Type.Polygon);
sr = SpatialReference.create(4326);
// Accelerate the filter polygon for repeated containment tests
OperatorContains.local().accelerateGeometry(
filterPolygon, sr,
Geometry.GeometryAccelerationDegree.enumMedium);
}
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
try {
// Parse the GeoJSON record
MapGeometry mapGeom = GeometryEngine.geoJsonToGeometry(
value.toString(), 0, Geometry.Type.Unknown);
Geometry geom = mapGeom.getGeometry();
// Spatial filter: keep only geometries inside the region
if (GeometryEngine.contains(filterPolygon, geom, sr)) {
// Emit WKB; a grid-cell or region id as the key would let the
// union reducer aggregate meaningfully
ByteBuffer wkb = OperatorExportToWkb.local().execute(0, geom, null);
context.write(new Text(key.toString()), new BytesWritable(wkb.array()));
}
} catch (Exception e) {
context.getCounter("Geometry", "ParseErrors").increment(1);
}
}
@Override
protected void cleanup(Context context) {
Operator.deaccelerateGeometry(filterPolygon);
}
}
10.2.3 A Spatial Aggregation Reducer
/**
 * Spatial union Reducer: merges all geometries that share a key
 */
public class SpatialUnionReducer
extends Reducer<Text, BytesWritable, Text, BytesWritable> {
private SpatialReference sr;
@Override
protected void setup(Context context) {
sr = SpatialReference.create(4326);
}
@Override
protected void reduce(Text key, Iterable<BytesWritable> values,
Context context) throws IOException, InterruptedException {
List<Geometry> geometries = new ArrayList<>();
for (BytesWritable value : values) {
byte[] wkb = value.getBytes();
Geometry geom = OperatorImportFromWkb.local().execute(
0, Geometry.Type.Unknown,
ByteBuffer.wrap(wkb, 0, value.getLength()), null);
geometries.add(geom);
}
// Union all geometries for this key
Geometry[] geomArray = geometries.toArray(new Geometry[0]);
Geometry union = GeometryEngine.union(geomArray, sr);
// Emit the result as WKB
ByteBuffer wkb = OperatorExportToWkb.local().execute(0, union, null);
context.write(key, new BytesWritable(wkb.array()));
}
}
10.2.4 The Complete MapReduce Job
public class SpatialFilterJob {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
// Set the filter polygon (WGS84 longitude/latitude)
conf.set("filter.polygon.wkt",
"POLYGON ((116 39, 117 39, 117 40, 116 40, 116 39))");
Job job = Job.getInstance(conf, "Spatial Filter");
job.setJarByClass(SpatialFilterJob.class);
job.setInputFormatClass(GeoJsonInputFormat.class); // the reader from 10.2.1
job.setMapperClass(SpatialFilterMapper.class);
job.setReducerClass(SpatialUnionReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(BytesWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(BytesWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
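A hedged run sketch in the comment style used elsewhere in this chapter; the jar name and HDFS paths are placeholders:
// hadoop jar spatial-jobs.jar SpatialFilterJob /data/geojson /data/filtered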
10.3 Apache Spark Integration
10.3.1 Spark Spatial RDDs
/**
 * Spatial processing with the Spark Java API
 */
public class SparkSpatialAnalysis {
public static void main(String[] args) {
SparkConf conf = new SparkConf()
.setAppName("Spatial Analysis")
.setMaster("local[*]");
JavaSparkContext sc = new JavaSparkContext(conf);
// Read GeoJSON data (one feature per line)
JavaRDD<String> geoJsonRDD = sc.textFile("hdfs://path/to/geojson");
// Parse into a geometry RDD, dropping records that fail to parse
JavaRDD<Geometry> geometryRDD = geoJsonRDD.map(json -> {
try {
MapGeometry mg = GeometryEngine.geoJsonToGeometry(
json, 0, Geometry.Type.Unknown);
return mg.getGeometry();
} catch (Exception e) {
return null;
}
}).filter(g -> g != null);
// Buffer each geometry (0.01 degrees, since the data is in WGS84)
SpatialReference sr = SpatialReference.create(4326);
JavaRDD<Geometry> bufferedRDD = geometryRDD.map(geom ->
GeometryEngine.buffer(geom, sr, 0.01));
// Export as WKT
JavaRDD<String> wktRDD = bufferedRDD.map(geom ->
GeometryEngine.geometryToWkt(geom, 0));
wktRDD.saveAsTextFile("hdfs://path/to/output");
sc.close();
}
}
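When per-record setup cost matters, mapPartitions lets a single operator instance serve a whole partition instead of going through GeometryEngine for each record. A sketch under the same assumptions as above (the 0.01 buffer distance is illustrative):
// Buffer per partition, reusing one operator instance
JavaRDD<Geometry> bufferedPerPartition = geometryRDD.mapPartitions(iter -> {
    SpatialReference wgs84 = SpatialReference.create(4326);
    OperatorBuffer op = OperatorBuffer.local();
    List<Geometry> out = new ArrayList<>();
    while (iter.hasNext()) {
        out.add(op.execute(iter.next(), wgs84, 0.01, null));
    }
    return out.iterator();
});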
10.3.2 Spatial Joins
/**
 * A Spark spatial join implementation
 */
public class SparkSpatialJoin {
public static JavaPairRDD<Long, Tuple2<Geometry, Geometry>> spatialJoin(
JavaPairRDD<Long, Geometry> left,
JavaPairRDD<Long, Geometry> right,
SpatialReference sr) {
// Simple nested-loop join (fine for small datasets); large datasets
// should use a spatial index (see the QuadTree sketch below)
// Collect the right side (assumed to be the smaller one) and broadcast it
Map<Long, Geometry> rightMap = right.collectAsMap();
Broadcast<Map<Long, Geometry>> rightBroadcast =
left.context().broadcast(rightMap);
return left.flatMapToPair(tuple -> {
List<Tuple2<Long, Tuple2<Geometry, Geometry>>> results =
new ArrayList<>();
Geometry leftGeom = tuple._2();
Envelope2D leftEnv = new Envelope2D();
leftGeom.queryEnvelope2D(leftEnv);
for (Map.Entry<Long, Geometry> entry :
rightBroadcast.value().entrySet()) {
Geometry rightGeom = entry.getValue();
Envelope2D rightEnv = new Envelope2D();
rightGeom.queryEnvelope2D(rightEnv);
// Bounding-box pre-filter
if (!leftEnv.isIntersecting(rightEnv)) {
continue;
}
// Exact test: not disjoint means the geometries intersect
if (!GeometryEngine.disjoint(leftGeom, rightGeom, sr)) {
results.add(new Tuple2<>(tuple._1(),
new Tuple2<>(leftGeom, rightGeom)));
}
}
return results.iterator();
});
}
}
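The inner loop above can be replaced with a lookup against the library's QuadTree so that only envelope candidates are tested exactly. A sketch under the same broadcast-join assumptions (the WGS84 extent, tree height 8, and tolerance 0.0 are illustrative choices):
// Build the index once over the broadcast geometries
static QuadTree buildIndex(List<Geometry> geoms) {
    QuadTree index = new QuadTree(new Envelope2D(-180, -90, 180, 90), 8);
    Envelope2D bounds = new Envelope2D();
    for (int i = 0; i < geoms.size(); i++) {
        geoms.get(i).queryEnvelope2D(bounds);
        index.insert(i, bounds); // store the list position as the element
    }
    return index;
}
// Probe: visit only candidates whose envelopes intersect the query envelope
static List<Geometry> probe(QuadTree index, List<Geometry> geoms,
        Geometry queryGeom, SpatialReference sr) {
    Envelope2D queryEnv = new Envelope2D();
    queryGeom.queryEnvelope2D(queryEnv);
    List<Geometry> hits = new ArrayList<>();
    QuadTree.QuadTreeIterator it = index.getIterator(queryEnv, 0.0);
    for (int h = it.next(); h != -1; h = it.next()) {
        Geometry candidate = geoms.get(index.getElement(h));
        if (!GeometryEngine.disjoint(queryGeom, candidate, sr)) {
            hits.add(candidate);
        }
    }
    return hits;
}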
10.3.3 Spark SQL Spatial UDFs
/**
 * Spark SQL spatial UDF definitions
 */
public class SparkSpatialUDFs {
public static void registerUDFs(SparkSession spark) {
// Create a geometry from WKT (parse and re-serialize to validate)
spark.udf().register("ST_GeomFromText",
(String wkt) -> {
try {
Geometry geom = GeometryEngine.geometryFromWkt(
wkt, 0, Geometry.Type.Unknown);
return GeometryEngine.geometryToWkt(geom, 0);
} catch (Exception e) {
return null;
}
}, DataTypes.StringType);
// Compute a buffer
spark.udf().register("ST_Buffer",
(String wkt, Double distance) -> {
try {
Geometry geom = GeometryEngine.geometryFromWkt(
wkt, 0, Geometry.Type.Unknown);
Geometry buffer = GeometryEngine.buffer(
geom, SpatialReference.create(4326), distance);
return GeometryEngine.geometryToWkt(buffer, 0);
} catch (Exception e) {
return null;
}
}, DataTypes.StringType);
// Containment test
spark.udf().register("ST_Contains",
(String wkt1, String wkt2) -> {
try {
Geometry g1 = GeometryEngine.geometryFromWkt(
wkt1, 0, Geometry.Type.Unknown);
Geometry g2 = GeometryEngine.geometryFromWkt(
wkt2, 0, Geometry.Type.Unknown);
return GeometryEngine.contains(
g1, g2, SpatialReference.create(4326));
} catch (Exception e) {
return false;
}
}, DataTypes.BooleanType);
// Compute distance
spark.udf().register("ST_Distance",
(String wkt1, String wkt2) -> {
try {
Geometry g1 = GeometryEngine.geometryFromWkt(
wkt1, 0, Geometry.Type.Unknown);
Geometry g2 = GeometryEngine.geometryFromWkt(
wkt2, 0, Geometry.Type.Unknown);
return GeometryEngine.distance(
g1, g2, SpatialReference.create(4326));
} catch (Exception e) {
return Double.NaN;
}
}, DataTypes.DoubleType);
}
}
// Usage examples:
// SELECT * FROM points WHERE ST_Contains(region_wkt, point_wkt)
// SELECT id, ST_Buffer(geom_wkt, 0.01) AS buffer FROM features
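A minimal registration-and-query sketch; the table and column names ('features', geom_wkt, id) are assumptions:
SparkSession spark = SparkSession.builder()
    .appName("Spatial SQL")
    .master("local[*]")
    .getOrCreate();
SparkSpatialUDFs.registerUDFs(spark);
// Assumes a registered table 'features' with a WKT column geom_wkt
spark.sql("SELECT id, ST_Buffer(geom_wkt, 0.01) AS buffer FROM features").show();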
10.4 Hive Spatial UDFs
10.4.1 Hive UDF Basics
/**
 * Hive spatial UDF: compute area
 */
public class ST_Area extends GenericUDF {
private ObjectInspectorConverters.Converter inputConverter;
@Override
public ObjectInspector initialize(ObjectInspector[] arguments)
throws UDFArgumentException {
if (arguments.length != 1) {
throw new UDFArgumentException("ST_Area takes 1 argument");
}
inputConverter = ObjectInspectorConverters.getConverter(
arguments[0],
PrimitiveObjectInspectorFactory.writableStringObjectInspector);
return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
}
@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
Text wktText = (Text) inputConverter.convert(arguments[0].get());
if (wktText == null) {
return null;
}
try {
Geometry geom = GeometryEngine.geometryFromWkt(
wktText.toString(), 0, Geometry.Type.Unknown);
return new DoubleWritable(geom.calculateArea2D());
} catch (Exception e) {
return null;
}
}
@Override
public String getDisplayString(String[] children) {
return "ST_Area(" + children[0] + ")";
}
}
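A hedged deployment sketch in the comment style used above; the JAR path and package name are placeholders:
// ADD JAR /path/to/spatial-udfs.jar;
// CREATE TEMPORARY FUNCTION ST_Area AS 'com.example.hive.ST_Area';
// SELECT ST_Area(geom_wkt) FROM parcels;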
10.4.2 A Hive Spatial UDAF
/**
 * Hive spatial aggregate function: union geometries
 */
public class ST_Union_Aggr extends AbstractGenericUDAFResolver {
@Override
public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
throws SemanticException {
return new ST_Union_Aggr_Evaluator();
}
public static class ST_Union_Aggr_Evaluator extends GenericUDAFEvaluator {
private PrimitiveObjectInspector inputOI;
private SpatialReference sr = SpatialReference.create(4326);
@Override
public ObjectInspector init(Mode m, ObjectInspector[] parameters)
throws HiveException {
super.init(m, parameters);
inputOI = (PrimitiveObjectInspector) parameters[0];
return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
}
@Override
public AggregationBuffer getNewAggregationBuffer() throws HiveException {
return new UnionBuffer();
}
@Override
public void reset(AggregationBuffer agg) throws HiveException {
((UnionBuffer) agg).geometries.clear();
}
@Override
public void iterate(AggregationBuffer agg, Object[] parameters)
throws HiveException {
String wkt = PrimitiveObjectInspectorUtils.getString(
parameters[0], inputOI);
if (wkt != null) {
try {
Geometry geom = GeometryEngine.geometryFromWkt(
wkt, 0, Geometry.Type.Unknown);
((UnionBuffer) agg).geometries.add(geom);
} catch (Exception e) {
// Skip records that fail to parse
}
}
}
@Override
public Object terminatePartial(AggregationBuffer agg) throws HiveException {
return terminate(agg);
}
@Override
public void merge(AggregationBuffer agg, Object partial)
throws HiveException {
if (partial != null) {
String wkt = ((Text) partial).toString();
Geometry geom = GeometryEngine.geometryFromWkt(
wkt, 0, Geometry.Type.Unknown);
((UnionBuffer) agg).geometries.add(geom);
}
}
@Override
public Object terminate(AggregationBuffer agg) throws HiveException {
List<Geometry> geoms = ((UnionBuffer) agg).geometries;
if (geoms.isEmpty()) {
return null;
}
Geometry[] geomArray = geoms.toArray(new Geometry[0]);
Geometry union = GeometryEngine.union(geomArray, sr);
return new Text(GeometryEngine.geometryToWkt(union, 0));
}
static class UnionBuffer implements AggregationBuffer {
List<Geometry> geometries = new ArrayList<>();
}
}
}
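Usage follows the same pattern as the scalar UDF; the package, table, and column names below are placeholders:
// CREATE TEMPORARY FUNCTION ST_Union_Aggr AS 'com.example.hive.ST_Union_Aggr';
// SELECT region_id, ST_Union_Aggr(geom_wkt) FROM zones GROUP BY region_id;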
10.5 Database Integration
10.5.1 PostGIS Integration
/**
 * Read/write helper for PostGIS
 */
public class PostGISHelper {
/**
 * Read geometries from PostGIS
 */
public List<Geometry> readGeometries(Connection conn, String tableName,
String geomColumn) throws SQLException {
List<Geometry> geometries = new ArrayList<>();
String sql = String.format(
"SELECT ST_AsBinary(%s) AS geom FROM %s",
geomColumn, tableName);
try (Statement stmt = conn.createStatement();
ResultSet rs = stmt.executeQuery(sql)) {
while (rs.next()) {
byte[] wkb = rs.getBytes("geom");
if (wkb != null) {
Geometry geom = OperatorImportFromWkb.local().execute(
0, Geometry.Type.Unknown,
ByteBuffer.wrap(wkb), null);
geometries.add(geom);
}
}
}
return geometries;
}
/**
 * Write a geometry to PostGIS
 */
public void writeGeometry(Connection conn, String tableName,
String geomColumn, Geometry geometry, int srid) throws SQLException {
String sql = String.format(
"INSERT INTO %s (%s) VALUES (ST_SetSRID(ST_GeomFromWKB(?), ?))",
tableName, geomColumn);
ByteBuffer wkb = OperatorExportToWkb.local().execute(0, geometry, null);
try (PreparedStatement pstmt = conn.prepareStatement(sql)) {
pstmt.setBytes(1, wkb.array());
pstmt.setInt(2, srid);
pstmt.executeUpdate();
}
}
/**
 * Run a spatial query
 */
public List<Geometry> spatialQuery(Connection conn, String tableName,
String geomColumn, Geometry queryGeom, int srid) throws SQLException {
String sql = String.format(
"SELECT ST_AsBinary(%s) AS geom FROM %s " +
"WHERE ST_Intersects(%s, ST_SetSRID(ST_GeomFromWKB(?), ?))",
geomColumn, tableName, geomColumn);
ByteBuffer wkb = OperatorExportToWkb.local().execute(0, queryGeom, null);
List<Geometry> results = new ArrayList<>();
try (PreparedStatement pstmt = conn.prepareStatement(sql)) {
pstmt.setBytes(1, wkb.array());
pstmt.setInt(2, srid);
try (ResultSet rs = pstmt.executeQuery()) {
while (rs.next()) {
byte[] resultWkb = rs.getBytes("geom");
Geometry geom = OperatorImportFromWkb.local().execute(
0, Geometry.Type.Unknown,
ByteBuffer.wrap(resultWkb), null);
results.add(geom);
}
}
}
return results;
}
}
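A minimal usage sketch; the JDBC URL, credentials, and table/column names are placeholders, and the PostgreSQL driver must be on the classpath:
// Open a standard JDBC connection, then use the helper
Connection conn = DriverManager.getConnection(
    "jdbc:postgresql://localhost:5432/gis", "user", "password");
PostGISHelper helper = new PostGISHelper();
List<Geometry> all = helper.readGeometries(conn, "parks", "geom");
// Spatial query with a WKT polygon
Geometry query = GeometryEngine.geometryFromWkt(
    "POLYGON ((116 39, 117 39, 117 40, 116 40, 116 39))",
    0, Geometry.Type.Polygon);
List<Geometry> hits = helper.spatialQuery(conn, "parks", "geom", query, 4326);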
10.5.2 MongoDB Spatial Integration
/**
 * MongoDB spatial data operations
 */
public class MongoSpatialHelper {
private final MongoCollection<Document> collection;
public MongoSpatialHelper(MongoCollection<Document> collection) {
this.collection = collection;
}
/**
 * Insert a geometry stored as GeoJSON (the queries below require a
 * 2dsphere index on the "geometry" field; see the sketch at the end)
 */
public void insertGeometry(String id, Geometry geometry) {
String geoJson = GeometryEngine.geometryToGeoJson(geometry);
Document doc = Document.parse(geoJson);
Document record = new Document()
.append("_id", id)
.append("geometry", doc);
collection.insertOne(record);
}
/**
 * Spatial query: documents within a polygon
 */
public List<Geometry> findWithin(Polygon queryPolygon) {
String geoJson = GeometryEngine.geometryToGeoJson(queryPolygon);
Document queryGeom = Document.parse(geoJson);
Bson filter = Filters.geoWithin("geometry", queryGeom);
List<Geometry> results = new ArrayList<>();
for (Document doc : collection.find(filter)) {
Document geomDoc = doc.get("geometry", Document.class);
MapGeometry mg = GeometryEngine.geoJsonToGeometry(
geomDoc.toJson(), 0, Geometry.Type.Unknown);
results.add(mg.getGeometry());
}
return results;
}
/**
 * Spatial query: documents within a distance
 */
public List<Geometry> findNear(Point center, double maxDistanceMeters) {
Point2D pt = center.getXY();
// The legacy-coordinate nearSphere takes radians: divide meters by the
// Earth's radius; minDistance is left unset (null)
Bson filter = Filters.nearSphere("geometry", pt.x, pt.y,
maxDistanceMeters / 6378137.0, null);
List<Geometry> results = new ArrayList<>();
for (Document doc : collection.find(filter)) {
Document geomDoc = doc.get("geometry", Document.class);
MapGeometry mg = GeometryEngine.geoJsonToGeometry(
geomDoc.toJson(), 0, Geometry.Type.Unknown);
results.add(mg.getGeometry());
}
return results;
}
}
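The geoWithin and nearSphere filters above require a geospatial index on the GeoJSON field; a minimal setup sketch:
// Create a 2dsphere index once, e.g. at deployment time
collection.createIndex(Indexes.geo2dsphere("geometry"));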
10.6 Chapter Summary
This chapter covered integrating geometry-api-java with big data systems:
- Hadoop MapReduce: spatial processing in Mappers and Reducers
- Apache Spark: RDD and SQL spatial analysis
- Hive UDFs: spatial functions and aggregates
- Database integration: PostGIS and MongoDB
Key design principles:
- Use WKB for efficient serialization
- Use geometry acceleration to speed up repeated predicates
- Pre-filter with bounding boxes to avoid unnecessary exact computation
- Stateless operator design supports distributed processing
