Raster Series - Raster to Vector

@ 20240723 & lth

Raster data source: the WeChat official account 小白说遥感

  Goal: extract the vector boundary (footprint) of a raster
  Logic:
      1. Read the raster's basic information;
      2. Decide whether multiprocessing / chunked processing is needed;
      3. Build a mask from the raster data;
      4. Polygonize the mask;
      5. Keep only the valid polygons (those with DN = 1, i.e. the areas that contain data).
  Code: https://github.com/Lth1121/lth/blob/main/raster_to_vector.py

Simple example
from raster_to_vector import RasterToVector
import os

input_path = r"E:\code_git\gis_learning\raster_process\rasterfootprint\LC09_L2SP_138039_20230101_20230315_02_T1_SR_B2_rgb_clip.tif"
output_path = os.path.join(os.path.dirname(input_path), "output.shp")

if __name__ == '__main__':
    processor = RasterToVector()
    processor.process_raster(input_path, output_path)
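
process_raster also accepts an output format and an explicit nodata value. A minimal sketch of a GeoJSON variant, to be added inside the same __main__ block (the output file name and the nodata value of 0 here are hypothetical):

geojson_path = os.path.join(os.path.dirname(input_path), "output.geojson")
processor.process_raster(input_path, geojson_path, output_format='GeoJSON', nodata_value=0)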

Code implementation

Class definition

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Raster-to-vector boundary extraction tool
# Handles holes (nodata areas) inside the raster; outputs Shapefile or GeoJSON
# Supports chunked processing and speed optimizations for very large rasters

import os
import sys
import argparse
import time
import gc
from multiprocessing import Pool, cpu_count
from osgeo import gdal, ogr, osr
import numpy as np


class RasterToVector:
    """Raster-to-vector converter"""

    def __init__(self, chunk_size=512, max_memory_mb=256, use_multiprocessing=True):
        """
        Initialize the raster-to-vector processor
        """
        gdal.UseExceptions()  # enable GDAL exceptions
        self.chunk_size = chunk_size  # chunk size in pixels
        self.max_memory_mb = max_memory_mb  # maximum memory budget (MB)
        self.use_multiprocessing = use_multiprocessing  # whether to use multiprocessing (reserved; not used in the code shown here)
        self.cpu_cores = cpu_count()
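
A minimal sketch of constructing the processor with custom limits (the values here are arbitrary examples):

processor = RasterToVector(chunk_size=1024, max_memory_mb=512, use_multiprocessing=False)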

Getting basic raster information


    def get_raster_info(self, raster_path):
        """
        Get basic information about a raster

        Args:
            raster_path (str): path to the raster file

        Returns:
            dict: dictionary of raster properties
        """
        dataset = gdal.Open(raster_path, gdal.GA_ReadOnly)
        if dataset is None:
            raise ValueError(f"Cannot open raster file: {raster_path}")

        info = {
            'width': dataset.RasterXSize,
            'height': dataset.RasterYSize,
            'bands': dataset.RasterCount,
            'geotransform': dataset.GetGeoTransform(),
            'projection': dataset.GetProjection(),
            'datatype': dataset.GetRasterBand(1).DataType,
            'nodata': dataset.GetRasterBand(1).GetNoDataValue()
        }

        # Estimate the uncompressed in-memory size of one band (MB)
        pixel_count = info['width'] * info['height']
        bytes_per_pixel = gdal.GetDataTypeSize(info['datatype']) // 8
        estimated_size_mb = (pixel_count * bytes_per_pixel) / (1024 * 1024)
        info['estimated_size_mb'] = estimated_size_mb

        dataset = None
        return info

Deciding whether to process in chunks

    def should_use_chunked_processing(self, raster_info):
        """
        Decide whether chunked processing is needed

        Args:
            raster_info (dict): raster information

        Returns:
            bool: True if chunked processing should be used
        """
        # Use chunked processing if the estimated memory exceeds the budget,
        # or if either raster dimension is larger than four chunk sizes
        print(f"Estimated memory requirement: {raster_info['estimated_size_mb']:.1f} MB")
        return (raster_info['estimated_size_mb'] > self.max_memory_mb or
                raster_info['width'] > self.chunk_size * 4 or
                raster_info['height'] > self.chunk_size * 4)
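
To make the decision concrete, here is a minimal sketch with hypothetical dimensions for a single-band Float32 scene, assuming the class defaults (max_memory_mb=256, chunk_size=512):

width, height, bytes_per_pixel = 7821, 7951, 4                 # Float32 = 4 bytes per pixel (example numbers)
estimated_size_mb = width * height * bytes_per_pixel / (1024 * 1024)
print(f"{estimated_size_mb:.1f} MB")                           # ~237 MB, below the 256 MB budget
print(width > 512 * 4 or height > 512 * 4)                     # True, so chunked processing is used anyway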

Creating the mask in a single pass

    def create_mask_from_raster(self, raster_path, nodata_value=None):
        """
        Create a mask from the raster data

        Args:
            raster_path (str): path to the raster file
            nodata_value (float): nodata value; if None it is read from the raster file

        Returns:
            tuple: (mask_array, geotransform, projection, width, height)
        """
        # Open the raster file
        dataset = gdal.Open(raster_path, gdal.GA_ReadOnly)
        if dataset is None:
            raise ValueError(f"Cannot open raster file: {raster_path}")

        # Read raster metadata
        width = dataset.RasterXSize
        height = dataset.RasterYSize
        geotransform = dataset.GetGeoTransform()
        projection = dataset.GetProjection()
        print(f"Raster size: {width} x {height}, projection: {projection}")

        # Use the first band
        band = dataset.GetRasterBand(1)

        # Determine the nodata value
        if nodata_value is None:
            nodata_value = band.GetNoDataValue()

        # Read the pixel data
        data = band.ReadAsArray()

        # Build the mask: 1 for valid pixels, 0 for nodata pixels
        if nodata_value is not None:
            mask = np.where(np.isclose(data, nodata_value, equal_nan=True), 0, 1)
        else:
            # No nodata value defined: treat NaN as nodata
            mask = np.where(np.isnan(data), 0, 1)

        # Convert to uint8
        mask = mask.astype(np.uint8)

        dataset = None  # close the dataset

        return mask, geotransform, projection, width, height
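
As a quick sanity check, the returned mask can be inspected directly; a minimal sketch, reusing the processor and input_path from the simple example above:

mask, gt, proj, w, h = processor.create_mask_from_raster(input_path)
valid_ratio = mask.sum() / mask.size   # share of pixels flagged as valid (value 1)
print(f"valid pixels: {valid_ratio:.1%}")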

Creating the mask in chunks

    def create_mask_chunked(self, raster_path, nodata_value=None):
        """
        Create the mask chunk by chunk (for very large rasters)

        Args:
            raster_path (str): path to the raster file
            nodata_value (float): nodata value

        Returns:
            tuple: (mask_array, geotransform, projection, width, height)
        """
        print("Using chunked processing mode...")
        start_time = time.time()

        # Open the raster file
        dataset = gdal.Open(raster_path, gdal.GA_ReadOnly)
        if dataset is None:
            raise ValueError(f"Cannot open raster file: {raster_path}")

        # Read raster metadata
        width = dataset.RasterXSize
        height = dataset.RasterYSize
        geotransform = dataset.GetGeoTransform()
        projection = dataset.GetProjection()
        print(f"Raster size: {width} x {height}, projection: {projection}")

        # Use the first band
        band = dataset.GetRasterBand(1)

        # Determine the nodata value
        if nodata_value is None:
            nodata_value = band.GetNoDataValue()

        # Allocate the output mask array
        mask = np.zeros((height, width), dtype=np.uint8)

        # Compute the chunk grid
        x_chunks = (width + self.chunk_size - 1) // self.chunk_size
        y_chunks = (height + self.chunk_size - 1) // self.chunk_size
        total_chunks = x_chunks * y_chunks

        print(f"Chunked processing: {x_chunks} x {y_chunks} = {total_chunks} chunks")

        # Process chunk by chunk
        processed_chunks = 0
        for y_chunk in range(y_chunks):
            for x_chunk in range(x_chunks):
                # Window of the current chunk
                x_start = x_chunk * self.chunk_size
                y_start = y_chunk * self.chunk_size
                x_size = min(self.chunk_size, width - x_start)
                y_size = min(self.chunk_size, height - y_start)

                # Read the data chunk
                data_chunk = band.ReadAsArray(x_start, y_start, x_size, y_size)

                # Build the mask chunk
                if nodata_value is not None:
                    mask_chunk = np.where(np.isclose(data_chunk, nodata_value, equal_nan=True), 0, 1)
                else:
                    mask_chunk = np.where(np.isnan(data_chunk), 0, 1)

                # Write it into the full mask
                mask[y_start:y_start+y_size, x_start:x_start+x_size] = mask_chunk.astype(np.uint8)

                processed_chunks += 1
                if processed_chunks % 100 == 0 or processed_chunks == total_chunks:
                    progress = (processed_chunks / total_chunks) * 100
                    elapsed = time.time() - start_time
                    print(f"Progress: {processed_chunks}/{total_chunks} ({progress:.1f}%), elapsed: {elapsed:.1f}s")

                # Free the chunk buffers and collect garbage periodically
                del data_chunk, mask_chunk
                if processed_chunks % 50 == 0:
                    gc.collect()

        dataset = None  # close the dataset

        print(f"Chunked processing finished, total time: {time.time() - start_time:.1f}s")
        return mask, geotransform, projection, width, height

Polygonizing the mask

    def polygonize_mask(self, mask, geotransform, projection, output_path, output_format='ESRI Shapefile'):
        """
        Polygonize the mask

        Args:
            mask (numpy.ndarray): mask array
            geotransform (tuple): geotransform parameters
            projection (str): projection (WKT)
            output_path (str): output file path
            output_format (str): output format ('ESRI Shapefile' or 'GeoJSON')
        """
        # Create an in-memory raster dataset
        mem_driver = gdal.GetDriverByName('MEM')
        mem_dataset = mem_driver.Create('', mask.shape[1], mask.shape[0], 1, gdal.GDT_Byte)
        mem_dataset.SetGeoTransform(geotransform)
        mem_dataset.SetProjection(projection)

        # Write the mask data
        mem_band = mem_dataset.GetRasterBand(1)
        mem_band.WriteArray(mask)
        mem_band.SetNoDataValue(0)

        # Choose the output vector driver
        if output_format == 'GeoJSON':
            driver_name = 'GeoJSON'
            if not output_path.endswith('.geojson'):
                output_path += '.geojson'
        else:
            driver_name = 'ESRI Shapefile'
            if not output_path.endswith('.shp'):
                output_path += '.shp'

        # Delete an existing output file
        if os.path.exists(output_path):
            vector_driver = ogr.GetDriverByName(driver_name)
            vector_driver.DeleteDataSource(output_path)

        # Create the vector data source
        vector_driver = ogr.GetDriverByName(driver_name)
        vector_dataset = vector_driver.CreateDataSource(output_path)

        # Create the spatial reference system
        srs = osr.SpatialReference()
        srs.ImportFromWkt(projection)

        # Create the layer
        layer = vector_dataset.CreateLayer('footprint', srs, ogr.wkbPolygon)

        # Add the DN field
        field_def = ogr.FieldDefn('DN', ogr.OFTInteger)
        layer.CreateField(field_def)

        # Polygonize; mem_band is passed twice, as the source band and as the
        # mask band, so pixels with value 0 (nodata) are skipped
        gdal.Polygonize(mem_band, mem_band, layer, 0, [], callback=None)

        # Clean up
        mem_dataset = None
        vector_dataset = None

        print(f"Polygonization finished, output file: {output_path}")

Polygonizing very large masks

    def polygonize_mask_optimized(self, mask, geotransform, projection, output_path, output_format='ESRI Shapefile'):
        """
        Optimized polygonization for large masks

        Args:
            mask (numpy.ndarray): mask array
            geotransform (tuple): geotransform parameters
            projection (str): projection (WKT)
            output_path (str): output file path
            output_format (str): output format
        """
        print("Starting optimized polygonization...")
        start_time = time.time()

        # Report the mask size
        mask_size_mb = mask.nbytes / (1024 * 1024)
        print(f"Mask size: {mask_size_mb:.1f} MB")

        # Create an in-memory raster dataset
        mem_driver = gdal.GetDriverByName('MEM')
        mem_dataset = mem_driver.Create('', mask.shape[1], mask.shape[0], 1, gdal.GDT_Byte)
        mem_dataset.SetGeoTransform(geotransform)
        mem_dataset.SetProjection(projection)

        # Write the mask data
        mem_band = mem_dataset.GetRasterBand(1)
        mem_band.WriteArray(mask)
        mem_band.SetNoDataValue(0)

        # Raise the GDAL block cache to twice the mask size, capped at 512 MB
        gdal.SetCacheMax(min(512 * 1024 * 1024, int(mask_size_mb * 2 * 1024 * 1024)))

        # Choose the output vector driver
        if output_format == 'GeoJSON':
            driver_name = 'GeoJSON'
            if not output_path.endswith('.geojson'):
                output_path += '.geojson'
        else:
            driver_name = 'ESRI Shapefile'
            if not output_path.endswith('.shp'):
                output_path += '.shp'

        # Delete an existing output file
        if os.path.exists(output_path):
            vector_driver = ogr.GetDriverByName(driver_name)
            vector_driver.DeleteDataSource(output_path)

        # Create the vector data source
        vector_driver = ogr.GetDriverByName(driver_name)
        vector_dataset = vector_driver.CreateDataSource(output_path)

        # Create the spatial reference system
        srs = osr.SpatialReference()
        srs.ImportFromWkt(projection)

        # Create the layer
        layer = vector_dataset.CreateLayer('footprint', srs, ogr.wkbPolygon)

        # Add the DN field
        field_def = ogr.FieldDefn('DN', ogr.OFTInteger)
        layer.CreateField(field_def)

        # Polygonize with a progress callback
        def progress_callback(complete, message, data):
            if complete % 0.1 < 0.01:  # report roughly every 10%
                elapsed = time.time() - start_time
                print(f"Polygonization progress: {complete*100:.1f}%, elapsed: {elapsed:.1f}s")
            return 1  # keep going

        gdal.Polygonize(mem_band, mem_band, layer, 0, [], callback=progress_callback)

        # Clean up
        mem_dataset = None
        vector_dataset = None

        elapsed = time.time() - start_time
        print(f"Optimized polygonization finished in {elapsed:.1f}s, output file: {output_path}")

Filtering valid polygons

    def filter_valid_polygons(self, input_path, output_path, output_format='ESRI Shapefile'):
        """
        Keep only the valid polygons (DN = 1, i.e. areas that contain data)

        Args:
            input_path (str): input vector file path
            output_path (str): output vector file path
            output_format (str): output format
        """
        # Open the input data source
        input_dataset = ogr.Open(input_path, 0)
        input_layer = input_dataset.GetLayer()

        # Choose the output driver
        if output_format == 'GeoJSON':
            driver_name = 'GeoJSON'
            if not output_path.endswith('.geojson'):
                output_path += '.geojson'
        else:
            driver_name = 'ESRI Shapefile'
            if not output_path.endswith('.shp'):
                output_path += '.shp'

        # Delete an existing output file
        if os.path.exists(output_path):
            output_driver = ogr.GetDriverByName(driver_name)
            output_driver.DeleteDataSource(output_path)

        # Create the output data source
        output_driver = ogr.GetDriverByName(driver_name)
        output_dataset = output_driver.CreateDataSource(output_path)

        # Create the output layer
        output_layer = output_dataset.CreateLayer(
            'footprint',
            input_layer.GetSpatialRef(),
            ogr.wkbPolygon
        )

        # Copy the field definitions
        input_layer_defn = input_layer.GetLayerDefn()
        for i in range(input_layer_defn.GetFieldCount()):
            field_defn = input_layer_defn.GetFieldDefn(i)
            output_layer.CreateField(field_defn)

        # Filter and copy features, rebuilding each feature against the output layer definition
        valid_count = 0
        output_defn = output_layer.GetLayerDefn()
        for feature in input_layer:
            dn_value = feature.GetField('DN')
            if dn_value == 1:  # keep only polygons covering data
                out_feature = ogr.Feature(output_defn)
                out_feature.SetGeometry(feature.GetGeometryRef())
                out_feature.SetField('DN', dn_value)
                output_layer.CreateFeature(out_feature)
                out_feature = None
                valid_count += 1

        # Clean up
        input_dataset = None
        output_dataset = None

        print(f"Filtering finished, kept {valid_count} valid polygons")

Adaptive processing strategy


    def process_raster(self, input_raster, output_vector, output_format='ESRI Shapefile', nodata_value=None):
        """
        Process a raster file and produce its vector boundary (choosing the strategy automatically)

        Args:
            input_raster (str): input raster file path
            output_vector (str): output vector file path
            output_format (str): output format ('ESRI Shapefile' or 'GeoJSON')
            nodata_value (float): nodata value
        """
        try:
            print(f"Processing raster file: {input_raster}")
            start_time = time.time()

            # 0. Read raster info and choose the processing strategy
            print("Analyzing raster file...")
            raster_info = self.get_raster_info(input_raster)
            print(f"Raster size: {raster_info['width']} x {raster_info['height']}")
            print(f"Estimated memory requirement: {raster_info['estimated_size_mb']:.1f} MB")

            use_chunked = self.should_use_chunked_processing(raster_info)
            if use_chunked:
                print("Large raster detected, using chunked processing mode")
            else:
                print("Using standard processing mode")

            # 1. Create the mask
            print("Creating mask...")
            if use_chunked:
                mask, geotransform, projection, width, height = self.create_mask_chunked(
                    input_raster, nodata_value
                )
            else:
                mask, geotransform, projection, width, height = self.create_mask_from_raster(
                    input_raster, nodata_value
                )

            # 2. Polygonize the mask
            print("Polygonizing mask...")
            temp_output = output_vector + '_temp'

            # Choose the polygonization method based on the mask size
            mask_size_mb = mask.nbytes / (1024 * 1024)
            if mask_size_mb > 100:  # use the optimized method above 100 MB
                self.polygonize_mask_optimized(mask, geotransform, projection, temp_output, output_format)
            else:
                self.polygonize_mask(mask, geotransform, projection, temp_output, output_format)

            # 3. Filter the valid polygons
            print("Filtering valid polygons...")
            if output_format == 'GeoJSON':
                temp_output += '.geojson'
            else:
                temp_output += '.shp'

            self.filter_valid_polygons(temp_output, output_vector, output_format)

            # 4. Remove the temporary files
            if output_format == 'ESRI Shapefile':
                # Remove all shapefile side-car files
                for ext in ['.shp', '.shx', '.dbf', '.prj']:
                    temp_file = os.path.splitext(temp_output)[0] + ext
                    if os.path.exists(temp_file):
                        os.remove(temp_file)
            else:
                if os.path.exists(temp_output):
                    os.remove(temp_output)

            # 5. Release memory
            del mask
            gc.collect()

            total_time = time.time() - start_time
            print(f"Done! Total time: {total_time:.1f}s")

        except Exception as e:
            import traceback
            error_details = traceback.format_exc()
            print(f"An error occurred during processing: {str(e)}")
            print("Detailed traceback:")
            print(error_details)
            raise

Main function (command-line entry point)

def main():
    """Command-line entry point"""
    parser = argparse.ArgumentParser(description='Raster-to-vector boundary tool (optimized for very large rasters)')
    parser.add_argument('input', help='input raster file path (.tif)')
    parser.add_argument('output', help='output vector file path (.shp or .geojson)')
    parser.add_argument('--format', choices=['shapefile', 'geojson'], default='shapefile',
                        help='output format (default: shapefile)')
    parser.add_argument('--nodata', type=float, help='nodata value (read from the raster file if not given)')
    parser.add_argument('--chunk-size', type=int, default=2048,
                        help='chunk size in pixels for large rasters (default: 2048)')
    parser.add_argument('--max-memory', type=int, default=1024,
                        help='maximum memory usage in MB (default: 1024)')
    parser.add_argument('--no-multiprocessing', action='store_true',
                        help='disable multiprocessing')

    args = parser.parse_args()

    # Check that the input file exists
    if not os.path.exists(args.input):
        print(f"Error: input file does not exist: {args.input}")
        sys.exit(1)

    # Determine the output format
    output_format = 'GeoJSON' if args.format == 'geojson' else 'ESRI Shapefile'

    # Create the processor and run it
    processor = RasterToVector(
        chunk_size=args.chunk_size,
        max_memory_mb=args.max_memory,
        use_multiprocessing=not args.no_multiprocessing
    )

    print("Processor configuration:")
    print(f"  Chunk size: {args.chunk_size} pixels")
    print(f"  Maximum memory: {args.max_memory} MB")
    print(f"  Multiprocessing: {'enabled' if not args.no_multiprocessing else 'disabled'}")
    print(f"  CPU cores: {processor.cpu_cores}")
    print()

    processor.process_raster(args.input, args.output, output_format, args.nodata)


if __name__ == '__main__':
    main()
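
Assuming the script is saved as raster_to_vector.py (the filename used in the repository link above), a command-line run might look like this; the file names are placeholders:

python raster_to_vector.py scene.tif footprint.shp
python raster_to_vector.py scene.tif footprint.geojson --format geojson --nodata 0 --chunk-size 4096 --max-memory 2048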