第10章 - 大数据集成

第10章 - 大数据集成

10.1 大数据场景概述

10.1.1 geometry-api-java 的大数据定位

geometry-api-java 最初就是为大数据空间处理设计的。它特别适合:

  • Hadoop MapReduce:分布式空间分析
  • Apache Spark:内存计算空间处理
  • Apache Hive:SQL 空间查询(通过 UDF)
  • Apache Storm:实时空间流处理
  • 分布式数据库:HBase、Cassandra 的空间扩展

10.1.2 大数据空间处理挑战

挑战                    geometry-api-java 解决方案
─────────────────────────────────────────────────
数据规模大              → GeometryCursor 流式处理
内存限制                → 轻量级对象设计
序列化开销              → 高效的 WKB 格式
分布式计算              → 无状态算子设计
容错处理                → 可序列化的几何对象

10.2 Hadoop MapReduce 集成

10.2.1 空间数据 InputFormat

/**
 * {@link FileInputFormat} for GeoJSON input. Each split is handed to a
 * {@link GeoJsonRecordReader}, which produces {@code LongWritable}/{@code Text}
 * records.
 */
public class GeoJsonInputFormat extends FileInputFormat<LongWritable, Text> {

    /**
     * Creates the record reader for one input split.
     *
     * @param split   the split to be read (not inspected here)
     * @param context the task attempt context (not inspected here)
     * @return a freshly constructed {@link GeoJsonRecordReader}
     */
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(
            InputSplit split, TaskAttemptContext context) {
        final RecordReader<LongWritable, Text> reader = new GeoJsonRecordReader();
        return reader;
    }
}

/**
 * Record reader that delegates to a {@link LineRecordReader} and copies the
 * delegate's current key and value into its own reusable holders.
 */
public class GeoJsonRecordReader extends RecordReader<LongWritable, Text> {
    private LineRecordReader lineReader;
    private LongWritable key = new LongWritable();
    private Text value = new Text();

    /**
     * Creates the underlying line reader and initializes it for the split.
     *
     * @throws IOException if the underlying line reader fails to initialize
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException {
        this.lineReader = new LineRecordReader();
        this.lineReader.initialize(split, context);
    }

    /**
     * Advances to the next line, if any, and refreshes the key/value holders.
     *
     * @return {@code true} when a record was read, {@code false} at end of split
     */
    @Override
    public boolean nextKeyValue() throws IOException {
        final boolean advanced = lineReader.nextKeyValue();
        if (advanced) {
            key.set(lineReader.getCurrentKey().get());
            value.set(lineReader.getCurrentValue());
        }
        return advanced;
    }

    // ... remaining method implementations
}

10.2.2 空间过滤 Mapper

/**
 * Spatial filter mapper: emits only the input geometries contained in a
 * filter polygon taken from the job configuration.
 *
 * <p>Input values are GeoJSON strings. A matching record is written unchanged,
 * keyed by the input key rendered as text. Records that cannot be parsed or
 * evaluated are skipped and counted under {@code Geometry/ParseErrors}.
 */
public class SpatialFilterMapper
        extends Mapper<LongWritable, Text, Text, Text> {

    /** Configuration key that holds the filter polygon as WKT. */
    private static final String FILTER_WKT_KEY = "filter.polygon.wkt";

    private Polygon filterPolygon;
    private SpatialReference sr;

    /**
     * Reads the filter polygon from the configuration and accelerates it.
     *
     * @throws IllegalArgumentException if the filter polygon is not configured
     */
    @Override
    protected void setup(Context context) {
        String wkt = context.getConfiguration().get(FILTER_WKT_KEY);
        if (wkt == null || wkt.trim().isEmpty()) {
            // Fail fast with a clear message instead of an obscure parser error
            throw new IllegalArgumentException(
                "Missing required configuration: " + FILTER_WKT_KEY);
        }
        filterPolygon = (Polygon) GeometryEngine.geometryFromWkt(
            wkt, 0, Geometry.Type.Polygon);
        sr = SpatialReference.create(4326);

        // The same polygon is tested against every record, so accelerate it once
        OperatorContains.local().accelerateGeometry(
            filterPolygon, sr,
            Geometry.GeometryAccelerationDegree.enumMedium);
    }

    /**
     * Parses one GeoJSON record and writes it out when the filter polygon
     * contains it.
     *
     * @throws IOException          if writing the output record fails
     * @throws InterruptedException if writing the output record is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        boolean inside;
        try {
            MapGeometry mapGeom = GeometryEngine.geoJsonToGeometry(
                value.toString(), 0, Geometry.Type.Unknown);
            inside = GeometryEngine.contains(
                filterPolygon, mapGeom.getGeometry(), sr);
        } catch (Exception e) {
            // Bad input record: count it and move on
            context.getCounter("Geometry", "ParseErrors").increment(1);
            return;
        }

        // The write sits outside the try block so that output failures
        // propagate instead of being miscounted as parse errors
        if (inside) {
            context.write(new Text(key.toString()), value);
        }
    }

    /** Releases the acceleration structures attached to the filter polygon. */
    @Override
    protected void cleanup(Context context) {
        if (filterPolygon != null) {
            Operator.deaccelerateGeometry(filterPolygon);
        }
    }
}

10.2.3 空间聚合 Reducer

/**
 * Spatial union reducer: decodes every WKB value received for a key, unions
 * the decoded geometries, and writes the union back as WKB under the same key.
 */
public class SpatialUnionReducer
        extends Reducer<Text, BytesWritable, Text, BytesWritable> {

    private SpatialReference sr;

    /** Creates the spatial reference (WKID 4326) used for the union. */
    @Override
    protected void setup(Context context) {
        this.sr = SpatialReference.create(4326);
    }

    /**
     * Unions all geometries of one key.
     *
     * @param key     the grouping key, passed through to the output
     * @param values  WKB-encoded geometries for this key
     * @param context used to write the WKB-encoded union
     */
    @Override
    protected void reduce(Text key, Iterable<BytesWritable> values,
            Context context) throws IOException, InterruptedException {

        List<Geometry> decoded = new ArrayList<>();
        for (BytesWritable encoded : values) {
            byte[] backing = encoded.getBytes();
            // Wrap only the first getLength() bytes of the backing array
            ByteBuffer payload =
                ByteBuffer.wrap(backing, 0, encoded.getLength());
            decoded.add(OperatorImportFromWkb.local().execute(
                0, Geometry.Type.Unknown, payload, null));
        }

        // Union everything collected for this key in a single call
        Geometry merged = GeometryEngine.union(
            decoded.toArray(new Geometry[decoded.size()]), sr);

        // Re-encode the result as WKB for the output value
        ByteBuffer mergedWkb =
            OperatorExportToWkb.local().execute(0, merged, null);
        BytesWritable outValue = new BytesWritable(mergedWkb.array());
        context.write(key, outValue);
    }
}

10.2.4 完整的 MapReduce 作业

/**
 * Driver for the spatial filter job: a map-only job that runs
 * {@link SpatialFilterMapper} over the input path and writes the matching
 * records to the output path.
 */
public class SpatialFilterJob {

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println(
                "Usage: SpatialFilterJob <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();

        // Filter polygon read by SpatialFilterMapper.setup()
        conf.set("filter.polygon.wkt",
            "POLYGON ((116 39, 117 39, 117 40, 116 40, 116 39))");

        Job job = Job.getInstance(conf, "Spatial Filter");
        job.setJarByClass(SpatialFilterJob.class);

        job.setMapperClass(SpatialFilterMapper.class);
        // SpatialFilterMapper emits Text/Text records, which do not match the
        // BytesWritable input of SpatialUnionReducer, and its keys are per-line
        // offsets, so there is nothing to group. Run the filter map-only.
        job.setNumReduceTasks(0);

        // With zero reducers the mapper output is the job output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

10.3 Apache Spark 集成

10.3.1 Spark 空间 RDD

/**
 * Spatial processing with the Spark Java API: reads GeoJSON lines, buffers
 * each geometry, and saves the results as WKT.
 */
public class SparkSpatialAnalysis {

    public static void main(String[] args) {
        // Fall back to local mode only when no master was supplied (for
        // example by spark-submit); a hard-coded setMaster would override it.
        SparkConf conf = new SparkConf()
            .setAppName("Spatial Analysis")
            .setIfMissing("spark.master", "local[*]");

        // try-with-resources stops the context even if a stage fails
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {

            // Read GeoJSON data, one geometry per line
            JavaRDD<String> geoJsonRDD = sc.textFile("hdfs://path/to/geojson");

            // Parse into geometries; lines that fail to parse are dropped
            JavaRDD<Geometry> geometryRDD = geoJsonRDD.map(json -> {
                try {
                    MapGeometry mg = GeometryEngine.geoJsonToGeometry(
                        json, 0, Geometry.Type.Unknown);
                    return mg.getGeometry();
                } catch (Exception e) {
                    return null;
                }
            }).filter(g -> g != null);

            // Buffer every geometry by 0.01 in the units of the spatial reference
            SpatialReference sr = SpatialReference.create(4326);
            JavaRDD<Geometry> bufferedRDD = geometryRDD.map(geom ->
                GeometryEngine.buffer(geom, sr, 0.01));

            // Export as WKT
            JavaRDD<String> wktRDD = bufferedRDD.map(geom ->
                GeometryEngine.geometryToWkt(geom, 0));

            wktRDD.saveAsTextFile("hdfs://path/to/output");
        }
    }
}

10.3.2 空间连接

/**
 * Broadcast-based spatial join for Spark.
 */
public class SparkSpatialJoin {

    /**
     * Joins every left geometry with every right geometry it is not disjoint
     * from.
     *
     * <p>This is a nested-loop join: the right side is collected to the driver
     * and broadcast, so it must be small. Large inputs should use a spatial
     * index instead.
     *
     * @param left  left-side geometries keyed by id
     * @param right right-side geometries keyed by id (collected and broadcast)
     * @param sr    spatial reference used for the exact relation test
     * @return pairs of (left geometry, right geometry), keyed by the left id
     */
    public static JavaPairRDD<Long, Tuple2<Geometry, Geometry>> spatialJoin(
            JavaPairRDD<Long, Geometry> left,
            JavaPairRDD<Long, Geometry> right,
            SpatialReference sr) {

        // Collect the right side (assumed to be the smaller one)
        Map<Long, Geometry> rightMap = right.collectAsMap();

        // left.context() is the Scala SparkContext, whose broadcast() needs a
        // ClassTag argument when called from Java. Go through the Java wrapper.
        Broadcast<Map<Long, Geometry>> rightBroadcast =
            org.apache.spark.api.java.JavaSparkContext
                .fromSparkContext(left.context())
                .broadcast(rightMap);

        return left.flatMapToPair(tuple -> {
            List<Tuple2<Long, Tuple2<Geometry, Geometry>>> results =
                new ArrayList<>();

            Geometry leftGeom = tuple._2();
            Envelope2D leftEnv = new Envelope2D();
            leftGeom.queryEnvelope2D(leftEnv);

            Map<Long, Geometry> rightSide = rightBroadcast.value();
            for (Map.Entry<Long, Geometry> entry : rightSide.entrySet()) {

                Geometry rightGeom = entry.getValue();
                Envelope2D rightEnv = new Envelope2D();
                rightGeom.queryEnvelope2D(rightEnv);

                // Cheap bounding-box rejection before the exact test
                if (!leftEnv.isIntersecting(rightEnv)) {
                    continue;
                }

                // Exact test: keep the pair unless the geometries are disjoint
                if (!GeometryEngine.disjoint(leftGeom, rightGeom, sr)) {
                    results.add(new Tuple2<>(tuple._1(),
                        new Tuple2<>(leftGeom, rightGeom)));
                }
            }

            return results.iterator();
        });
    }
}

10.3.3 Spark SQL 空间 UDF

/**
 * Spark SQL spatial UDF definitions. Geometries are exchanged as WKT strings.
 */
public class SparkSpatialUDFs {

    /**
     * Registers {@code ST_GeomFromText}, {@code ST_Buffer},
     * {@code ST_Contains} and {@code ST_Distance} on the given session.
     *
     * <p>Each function catches every exception raised while evaluating a row
     * and returns a fallback instead: {@code null} for the WKT-returning
     * functions, {@code false} for {@code ST_Contains}, and {@code NaN} for
     * {@code ST_Distance}.
     *
     * @param spark the session whose UDF registry receives the functions
     */
    public static void registerUDFs(SparkSession spark) {
        // ST_GeomFromText: parse WKT and re-export it as WKT
        spark.udf().register("ST_GeomFromText",
            (String wkt) -> {
                try {
                    return GeometryEngine.geometryToWkt(parseWkt(wkt), 0);
                } catch (Exception e) {
                    return null;
                }
            }, DataTypes.StringType);

        // ST_Buffer: buffer a geometry by the given distance
        spark.udf().register("ST_Buffer",
            (String wkt, Double distance) -> {
                try {
                    Geometry source = parseWkt(wkt);
                    SpatialReference wgs84 = SpatialReference.create(4326);
                    Geometry buffered =
                        GeometryEngine.buffer(source, wgs84, distance);
                    return GeometryEngine.geometryToWkt(buffered, 0);
                } catch (Exception e) {
                    return null;
                }
            }, DataTypes.StringType);

        // ST_Contains: does the first geometry contain the second?
        spark.udf().register("ST_Contains",
            (String wkt1, String wkt2) -> {
                try {
                    Geometry container = parseWkt(wkt1);
                    Geometry candidate = parseWkt(wkt2);
                    return GeometryEngine.contains(
                        container, candidate, SpatialReference.create(4326));
                } catch (Exception e) {
                    return false;
                }
            }, DataTypes.BooleanType);

        // ST_Distance: distance between two geometries
        spark.udf().register("ST_Distance",
            (String wkt1, String wkt2) -> {
                try {
                    Geometry first = parseWkt(wkt1);
                    Geometry second = parseWkt(wkt2);
                    return GeometryEngine.distance(
                        first, second, SpatialReference.create(4326));
                } catch (Exception e) {
                    return Double.NaN;
                }
            }, DataTypes.DoubleType);
    }

    /** Parses WKT of any geometry type with default import flags. */
    private static Geometry parseWkt(String wkt) {
        return GeometryEngine.geometryFromWkt(wkt, 0, Geometry.Type.Unknown);
    }
}

// 使用示例
// SELECT * FROM points WHERE ST_Contains(region_wkt, point_wkt)
// SELECT id, ST_Buffer(geom_wkt, 0.01) AS buffer FROM features

10.4 Hive 空间 UDF

10.4.1 Hive UDF 基础

/**
 * Hive spatial UDF that computes the 2D area of a geometry given as WKT.
 */
public class ST_Area extends GenericUDF {

    private ObjectInspectorConverters.Converter inputConverter;

    /**
     * Checks the argument count and prepares a converter from the argument's
     * inspector to a writable string.
     *
     * @return the writable double inspector describing the result
     * @throws UDFArgumentException unless exactly one argument is supplied
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments)
            throws UDFArgumentException {

        if (arguments.length == 1) {
            inputConverter = ObjectInspectorConverters.getConverter(
                arguments[0],
                PrimitiveObjectInspectorFactory.writableStringObjectInspector);

            return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
        }

        throw new UDFArgumentException("ST_Area takes 1 argument");
    }

    /**
     * Evaluates the area for one row.
     *
     * @return the area as a {@code DoubleWritable}, or {@code null} when the
     *         converted input is null or the geometry cannot be processed
     */
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        Object raw = arguments[0].get();
        Text wktText = (Text) inputConverter.convert(raw);
        if (wktText == null) {
            return null;
        }

        Object area;
        try {
            Geometry parsed = GeometryEngine.geometryFromWkt(
                wktText.toString(), 0, Geometry.Type.Unknown);
            area = new DoubleWritable(parsed.calculateArea2D());
        } catch (Exception e) {
            // Any parse or computation failure yields a null result
            area = null;
        }
        return area;
    }

    @Override
    public String getDisplayString(String[] children) {
        return "ST_Area(" + children[0] + ")";
    }
}

10.4.2 Hive 空间 UDAF

/**
 * Hive spatial aggregate function: unions the geometries of a group.
 * Geometries are passed in, exchanged between stages, and returned as WKT.
 */
public class ST_Union_Aggr extends AbstractGenericUDAFResolver {

    /**
     * @throws SemanticException unless exactly one argument is supplied
     */
    @Override
    public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
            throws SemanticException {
        if (parameters.length != 1) {
            throw new SemanticException(
                "ST_Union_Aggr takes exactly 1 argument");
        }
        return new ST_Union_Aggr_Evaluator();
    }

    public static class ST_Union_Aggr_Evaluator extends GenericUDAFEvaluator {

        // Inspector for parameters[0] as passed to init(); it is used to read
        // both raw rows (iterate) and partial results (merge)
        private PrimitiveObjectInspector inputOI;
        private SpatialReference sr = SpatialReference.create(4326);

        @Override
        public ObjectInspector init(Mode m, ObjectInspector[] parameters)
                throws HiveException {
            super.init(m, parameters);
            inputOI = (PrimitiveObjectInspector) parameters[0];
            // Partial and final results are both WKT strings
            return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
        }

        @Override
        public AggregationBuffer getNewAggregationBuffer() throws HiveException {
            return new UnionBuffer();
        }

        @Override
        public void reset(AggregationBuffer agg) throws HiveException {
            ((UnionBuffer) agg).geometries.clear();
        }

        /** Parses one input row and adds it to the buffer. */
        @Override
        public void iterate(AggregationBuffer agg, Object[] parameters)
                throws HiveException {
            String wkt = PrimitiveObjectInspectorUtils.getString(
                parameters[0], inputOI);
            if (wkt != null) {
                try {
                    Geometry geom = GeometryEngine.geometryFromWkt(
                        wkt, 0, Geometry.Type.Unknown);
                    ((UnionBuffer) agg).geometries.add(geom);
                } catch (Exception ignored) {
                    // Best effort: rows with unparsable WKT are skipped
                }
            }
        }

        /** Emits the union of the buffered geometries as the partial result. */
        @Override
        public Object terminatePartial(AggregationBuffer agg) throws HiveException {
            return terminate(agg);
        }

        /** Adds a partial result (a WKT string) to the buffer. */
        @Override
        public void merge(AggregationBuffer agg, Object partial)
                throws HiveException {
            if (partial == null) {
                return;
            }
            // Read the partial through the inspector rather than casting to
            // Text: its concrete class depends on how it was deserialized
            String wkt = PrimitiveObjectInspectorUtils.getString(
                partial, inputOI);
            if (wkt == null) {
                return;
            }
            Geometry geom = GeometryEngine.geometryFromWkt(
                wkt, 0, Geometry.Type.Unknown);
            ((UnionBuffer) agg).geometries.add(geom);
        }

        /**
         * @return the union of all buffered geometries as WKT, or {@code null}
         *         when nothing was buffered
         */
        @Override
        public Object terminate(AggregationBuffer agg) throws HiveException {
            List<Geometry> geoms = ((UnionBuffer) agg).geometries;
            if (geoms.isEmpty()) {
                return null;
            }

            Geometry[] geomArray = geoms.toArray(new Geometry[0]);
            Geometry union = GeometryEngine.union(geomArray, sr);
            return new Text(GeometryEngine.geometryToWkt(union, 0));
        }

        /** Aggregation state: the geometries collected so far. */
        static class UnionBuffer implements AggregationBuffer {
            List<Geometry> geometries = new ArrayList<>();
        }
    }
}

10.5 数据库集成

10.5.1 PostGIS 集成

/**
 * Helper for reading and writing geometries in PostGIS, using WKB as the
 * exchange format.
 *
 * <p>NOTE(review): table and column names are interpolated into the SQL text
 * because JDBC cannot bind identifiers. Callers must pass only trusted
 * identifiers and never user input.
 */
public class PostGISHelper {

    /**
     * Reads every non-null geometry of a column.
     *
     * @param conn       open connection; not closed by this method
     * @param tableName  trusted table identifier
     * @param geomColumn trusted geometry column identifier
     * @return the decoded geometries; rows with a NULL geometry are skipped
     * @throws SQLException if the query fails
     */
    public List<Geometry> readGeometries(Connection conn, String tableName,
            String geomColumn) throws SQLException {

        List<Geometry> geometries = new ArrayList<>();

        String sql = String.format(
            "SELECT ST_AsBinary(%s) AS geom FROM %s",
            geomColumn, tableName);

        try (Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery(sql)) {
            collectGeometries(rs, geometries);
        }

        return geometries;
    }

    /**
     * Inserts one geometry into PostGIS.
     *
     * @param conn       open connection; not closed by this method
     * @param tableName  trusted table identifier
     * @param geomColumn trusted geometry column identifier
     * @param geometry   geometry to store
     * @param srid       SRID assigned to the stored geometry
     * @throws SQLException if the insert fails
     */
    public void writeGeometry(Connection conn, String tableName,
            String geomColumn, Geometry geometry, int srid) throws SQLException {

        String sql = String.format(
            "INSERT INTO %s (%s) VALUES (ST_SetSRID(ST_GeomFromWKB(?), ?))",
            tableName, geomColumn);

        ByteBuffer wkb = OperatorExportToWkb.local().execute(0, geometry, null);

        try (PreparedStatement pstmt = conn.prepareStatement(sql)) {
            pstmt.setBytes(1, wkb.array());
            pstmt.setInt(2, srid);
            pstmt.executeUpdate();
        }
    }

    /**
     * Returns the geometries of a column that intersect a query geometry.
     *
     * @param conn       open connection; not closed by this method
     * @param tableName  trusted table identifier
     * @param geomColumn trusted geometry column identifier
     * @param queryGeom  geometry to intersect with
     * @param srid       SRID assigned to the query geometry
     * @return the decoded matching geometries; rows with a NULL geometry are skipped
     * @throws SQLException if the query fails
     */
    public List<Geometry> spatialQuery(Connection conn, String tableName,
            String geomColumn, Geometry queryGeom, int srid) throws SQLException {

        String sql = String.format(
            "SELECT ST_AsBinary(%s) AS geom FROM %s " +
            "WHERE ST_Intersects(%s, ST_SetSRID(ST_GeomFromWKB(?), ?))",
            geomColumn, tableName, geomColumn);

        ByteBuffer wkb = OperatorExportToWkb.local().execute(0, queryGeom, null);

        List<Geometry> results = new ArrayList<>();

        try (PreparedStatement pstmt = conn.prepareStatement(sql)) {
            pstmt.setBytes(1, wkb.array());
            pstmt.setInt(2, srid);

            try (ResultSet rs = pstmt.executeQuery()) {
                collectGeometries(rs, results);
            }
        }

        return results;
    }

    /**
     * Decodes the {@code geom} WKB column of every remaining row into
     * {@code target}, skipping rows where the column is NULL.
     */
    private static void collectGeometries(ResultSet rs, List<Geometry> target)
            throws SQLException {
        while (rs.next()) {
            byte[] wkb = rs.getBytes("geom");
            if (wkb == null) {
                // A NULL geometry would otherwise fail in ByteBuffer.wrap
                continue;
            }
            target.add(OperatorImportFromWkb.local().execute(
                0, Geometry.Type.Unknown,
                ByteBuffer.wrap(wkb), null));
        }
    }
}

10.5.2 MongoDB 空间集成

/**
 * MongoDB spatial data operations. Geometries are stored as GeoJSON under the
 * {@code geometry} field of each document.
 */
public class MongoSpatialHelper {

    private final MongoCollection<Document> collection;

    /**
     * @param collection the collection to read from and write to
     * @throws IllegalArgumentException if {@code collection} is null
     */
    public MongoSpatialHelper(MongoCollection<Document> collection) {
        if (collection == null) {
            throw new IllegalArgumentException("collection must not be null");
        }
        this.collection = collection;
    }

    /**
     * Inserts a geometry as a GeoJSON sub-document.
     *
     * <p>This method does not create a geospatial index; the spatial queries
     * in this class presumably rely on one existing on {@code geometry}.
     *
     * @param id       value stored as the document {@code _id}
     * @param geometry geometry to store
     */
    public void insertGeometry(String id, Geometry geometry) {
        String geoJson = GeometryEngine.geometryToGeoJson(geometry);
        Document doc = Document.parse(geoJson);

        Document record = new Document()
            .append("_id", id)
            .append("geometry", doc);

        collection.insertOne(record);
    }

    /**
     * Spatial query: geometries within a polygon.
     *
     * @param queryPolygon polygon to search in
     * @return the stored geometries matched by {@code $geoWithin}
     */
    public List<Geometry> findWithin(Polygon queryPolygon) {
        // Use the same export overload as insertGeometry so the query
        // document has the same shape as the stored ones
        String geoJson = GeometryEngine.geometryToGeoJson(queryPolygon);
        Document queryGeom = Document.parse(geoJson);

        Bson filter = Filters.geoWithin("geometry", queryGeom);

        return decodeGeometries(collection.find(filter));
    }

    /**
     * Spatial query: geometries near a point.
     *
     * @param center            centre of the search
     * @param maxDistanceMeters maximum distance in metres
     * @return the stored geometries matched by {@code $nearSphere}
     */
    public List<Geometry> findNear(Point center, double maxDistanceMeters) {
        Point2D pt = center.getXY();

        // The coordinate-pair overload takes both a max and a min distance.
        // The max distance is converted from metres to radians by dividing by
        // the Earth radius; no minimum distance is applied.
        Bson filter = Filters.nearSphere("geometry", pt.x, pt.y,
            maxDistanceMeters / 6378137.0, null);

        return decodeGeometries(collection.find(filter));
    }

    /** Decodes the {@code geometry} sub-document of each result document. */
    private static List<Geometry> decodeGeometries(Iterable<Document> docs) {
        List<Geometry> results = new ArrayList<>();
        for (Document doc : docs) {
            Document geomDoc = doc.get("geometry", Document.class);
            MapGeometry mg = GeometryEngine.geoJsonToGeometry(
                geomDoc.toJson(), 0, Geometry.Type.Unknown);
            results.add(mg.getGeometry());
        }
        return results;
    }
}

10.6 本章小结

本章介绍了 geometry-api-java 的大数据集成:

  1. Hadoop MapReduce:Mapper/Reducer 空间处理
  2. Apache Spark:RDD/SQL 空间分析
  3. Hive UDF:空间函数和聚合
  4. 数据库集成:PostGIS、MongoDB

关键设计原则

  • 使用 WKB 格式进行高效序列化
  • 利用几何加速提升查询性能
  • 使用包围盒预过滤减少精确计算
  • 无状态算子设计支持分布式处理

← 上一章:性能优化与加速 | 下一章:开发实战案例 →

posted @ 2025-12-03 15:15  我才是银古  阅读(6)  评论(0)    收藏  举报