从阿里云DATAV GeoAtlas接口抽取行政区划数据

阿里云提供的地理信息接口

https://datav.aliyun.com/tools/atlas/

有两个接口, 一个是[行政编码].json, 一个是[行政编码]_full.json, 从接口中可以提取到区县一级的行政区划信息. 提取的过程中遇到的一些问题:

  • 从[行政编码].json中读取的信息中, 可能parent = null, 出现这种情况的大都是一些撤县改区的节点, 这时要将其parent设为上一级节点的行政编码
  • 从[行政编码].json中读到的parent的adcode, 可能与[父节点行政编码]_full.json中读到的parent的adcode不一致, 例如从110000_full.json中得到的节点列表, 其parent都是110000, 但是在取其子节点110101.json时会发现, parent变成了110100, 这时候要使用110100这个行政编码
  • 因为从上至下遍历时, 是不会遇到110100这个节点的, 所以在遍历的过程中, 要检查是否出现了未知的行政编码, 如果有, 需要额外读取并入库
  • 有部分节点, 其json无法读取(不存在), 例如密云110118.json, 延庆110119.json, 这时候要用前一步得到的信息入库

使用生成的行政区划数据时, 对于香港澳门的数据, 因为没有level=city的这一级, 所以需要特殊处理一下, 例如在读取province这一级的子节点时, 如果发现没有level=city的节点, 那么就返回一个虚拟的节点, 这个节点各字段值和自己一样, 但是level=city.

#!/usr/bin/python3
# -*- coding: UTF-8 -*-

import json
import traceback
import rbcommon

# Base URL of the DataV boundary endpoint; '<adcode>.json' describes one node,
# '<adcode>_full.json' lists its direct children.
_DATAV_BOUND_URL = 'https://geo.datav.aliyun.com/areas/bound/'

# Numeric level stored for each textual level found in the payload.
_LEVEL_CODES = {'country': 0, 'province': 1, 'city': 2, 'district': 3}


def _fetch_geojson(url):
    """Request *url* and return the decoded JSON, or None if it is unavailable.

    A None response is reported as a failed request; a body starting with
    '<?' (an XML document instead of JSON) is reported as "not found".
    """
    # NOTE(review): the trailing 20 and 10 are passed through unchanged;
    # presumably timeout/retry settings of rbcommon.requestGet - confirm.
    echo = rbcommon.requestGet(url, 'UTF-8', 20, 10)
    if echo is None:
        print('URL request failed: ' + url)
        return None
    if echo.startswith('<?'):
        print('Not found: ' + url)
        return None
    return json.loads(echo)


def _parent_adcode(properties):
    """Return the parent adcode of a feature as a string, or None if absent."""
    parent = properties.get('parent')
    if parent is None:
        return None
    code = parent.get('adcode')
    return None if code is None else str(code)


def readRegion(adcode, parent_code = None):
    """Read one region and its direct children from the DataV GeoAtlas API.

    Args:
        adcode: administrative code of the region to read (str or int).
        parent_code: adcode to use as the parent when the region's own JSON
            carries no usable parent.

    Returns:
        None when '<adcode>.json' cannot be read. Otherwise a dict with the
        keys 'name', 'adcode', 'telecode', 'level', 'parent' and 'children'.
        'children' is a list of dicts with the same keys except 'children',
        and is empty when '<adcode>_full.json' cannot be read. All adcodes
        are strings; 'level' is 0-3, or None for an unrecognised level.
    """
    # Normalise to str: the payload is not consistent about int vs. str
    # adcodes, and the value is concatenated into URLs and compared below.
    adcode = str(adcode)

    # https://geo.datav.aliyun.com/areas/bound/140000.json
    url = _DATAV_BOUND_URL + adcode + '.json'
    print(url)
    json_obj = _fetch_geojson(url)
    if json_obj is None:
        return None

    properties = json_obj['features'][0]['properties']
    own_parent = _parent_adcode(properties)
    if own_parent is None and parent_code is not None:
        # The node does not name its parent; fall back to the caller's value.
        own_parent = str(parent_code)

    sub_regions = []
    region = {
        'name': properties['name'],
        'adcode': str(properties['adcode']),
        'telecode': properties.get('telecode'),
        'level': _LEVEL_CODES.get(properties.get('level')),
        'parent': own_parent,
        'children': sub_regions,
    }

    # read sub regions
    # https://geo.datav.aliyun.com/areas/bound/140000_full.json
    url = _DATAV_BOUND_URL + adcode + '_full.json'
    print(url)
    json_obj = _fetch_geojson(url)
    if json_obj is None:
        return region

    for feature in json_obj['features']:
        sub_properties = feature['properties']
        sub_adcode = str(sub_properties['adcode'])
        if sub_adcode == region['adcode']:
            # The listing may contain the node itself; it is not its own child.
            continue
        sub_regions.append({
            'adcode': sub_adcode,
            'name': sub_properties['name'],
            'telecode': None,
            'level': _LEVEL_CODES.get(sub_properties.get('level')),
            'parent': adcode,
        })

    # further check if the parent adcode is correct: the first child's own
    # JSON may name a different parent than the node whose listing it is in.
    if sub_regions:
        url = _DATAV_BOUND_URL + sub_regions[0]['adcode'] + '.json'
        json_obj = _fetch_geojson(url)
        if json_obj is not None:
            actual_parent = _parent_adcode(json_obj['features'][0]['properties'])
            listed_parent = sub_regions[0]['parent']
            if actual_parent is not None and actual_parent != listed_parent:
                print('Update parent from {} to {}'.format(listed_parent, actual_parent))
                for sub_region in sub_regions:
                    sub_region['parent'] = actual_parent

    return region

def readAllRegion(parent_region):
    """Recursively read and store a region and everything below it.

    Args:
        parent_region: dict with at least 'adcode' and 'parent'. When the
            region cannot be re-read, this dict itself is stored.

    The module-level set `regions` records every adcode that was handed to
    insert(). If the freshly read region names a parent that is not yet in
    that set, the parent is read and stored first.
    """
    fetched = readRegion(parent_region['adcode'], parent_region['parent'])

    if fetched is None:
        # Nothing could be read for this node: store what the caller gave us.
        regions.add(parent_region['adcode'])
        insert(parent_region)
        return

    parent_id = fetched['parent']
    if parent_id is not None and parent_id not in regions:
        # Parent was never visited by the traversal; read and store it now.
        missing_parent = readRegion(parent_id, parent_region['parent'])
        if missing_parent is not None:
            regions.add(missing_parent['adcode'])
            insert(missing_parent)

    regions.add(fetched['adcode'])
    insert(fetched)

    for child in fetched['children']:
        readAllRegion(child)

def insert(region):
    """Write one region dict to the `s_region` table.

    Uses INSERT IGNORE and commits right after the statement. Any exception
    is not propagated: the region is printed as JSON followed by the
    traceback.

    Args:
        region: dict with 'adcode', 'level', 'name', 'telecode' and an
            optional 'parent'.
    """
    statement = (
        'INSERT IGNORE INTO `s_region` (`id`, `parent_id`, `level`, `name`, `tele_code`, `short_name`, '
        '`full_name`) VALUES (%s, %s, %s, %s, %s, %s, %s)'
    )
    try:
        with rbcommon.mysqlclient.cursor() as cursor:
            parent_id = region['parent'] if 'parent' in region else None
            values = (
                region['adcode'],
                parent_id,
                region['level'],
                region['name'],
                region['telecode'],
                region['name'],  # short_name receives the plain name
                '{}',            # full_name is stored as the literal '{}'
            )
            cursor.execute(statement, values)
            rbcommon.mysqlclient.commit()
    except Exception:
        print(json.dumps(region))
        traceback.print_exc()

### MAIN ###
# Adcodes already handed to insert(); read and updated by readAllRegion().
regions = set()
# 100000 is the adcode the top-down traversal starts from.
region = readRegion('100000')
if region is None:
    # readRegion() returns None when the request fails; without this guard
    # readAllRegion(None) would raise a TypeError.
    print('Root region 100000 could not be read, nothing to import')
else:
    readAllRegion(region)

其中rbcommon.mysqlclient的初始化方法如下:

# Connection object used by the script above as rbcommon.mysqlclient.
# All connection settings are read from the 'mysql' section of cfg.
# NOTE(review): cfg and the pymysql import are defined outside this snippet.
mysqlclient = pymysql.connect(
    host=cfg['mysql']['host'],
    port=cfg['mysql']['port'],
    user=cfg['mysql']['user'],
    password=cfg['mysql']['password'],
    db=cfg['mysql']['db'],
    charset=cfg['mysql']['charset'],
    # DictCursor: cursors created from this connection return rows as dicts
    cursorclass=pymysql.cursors.DictCursor)

  

posted on 2019-05-31 15:33  Milton  阅读(15704)  评论(0)    收藏  举报

导航