结合GIS与Python,爬取百度商业POI
一、总体思路
1-利用arcgis创建渔网,商业网点稀疏用大矩形,商业网点密集用小矩形
2-求出矩形左下角和右上角坐标点,读取矩形表得到坐标串
maxX= !shape.extent.xmax!
minX= !shape.extent.xmin!
3-采用百度API矩形区域检索,发起GET请求,并解析json数据
4-本地存储到Excel
5-转为shp,gis可视化
二、代码实现
1、利用arcgis创建渔网,商业网点稀疏用大矩形,商业网点密集用小矩形
由于百度API限制,一个矩形范围内最多只能下载400条记录(400个POI,即每页20条、最多20页)。为了减少请求次数、提高效率,需要对目标区域进行处理:商业网点稀疏的地方用大矩形,商业网点密集的地方用小矩形。这里利用ArcGIS渔网工具划分,最小网格为2KM*2KM,效果类似下图:

2、求出矩形左下角和右上角坐标点,读取矩形表得到坐标串
1-新建左下角X,左下角Y,右上角X,右上角Y,bsm字段
2-利用arcmap字段计算器工具计算每个矩形的左下角和右上角坐标点,表达式为:
左下角X,Y:minX = !shape.extent.xmin!,minY = !shape.extent.ymin!
右上角X,Y:maxX = !shape.extent.xmax!,maxY = !shape.extent.ymax!

3-代码读取坐标串
import pandas as pd


def get_bounds(self, path, sheet_name):
    """Read the rectangle table and build one bounds string per rectangle.

    This is an excerpt of a class method, so ``self`` is accepted but not
    used.

    Args:
        path: Path of the Excel workbook holding the rectangle table.
        sheet_name: Name of the sheet to read.

    Returns:
        dict mapping each rectangle id (``bsm`` column) to the string
        ``"ymin,xmin,ymax,xmax"``, i.e. lat,lng of the lower-left corner
        followed by lat,lng of the upper-right corner. Keying by id makes it
        easy to check afterwards which rectangles have been downloaded.

    Raises:
        KeyError: If one of the required columns is missing from the sheet.
    """
    columns = ['bsm', '左下角Y', '左下角X', '右上角Y', '右上角X']
    df = pd.read_excel(io=path, sheet_name=sheet_name)
    # Blank or half-filled rows would otherwise turn into 'nan,nan,nan,nan'
    # bounds (keyed by a NaN id) and waste API requests, so drop them.
    df = df.dropna(subset=columns)

    bounds = {}
    for bsm, ymin, xmin, ymax, xmax in zip(*(df[col] for col in columns)):
        bounds[bsm] = ','.join(str(value) for value in (ymin, xmin, ymax, xmax))
    return bounds
3、采用矩形区域检索,发起GET请求,并解析json数据
4、本地存储到Excel
百度API文档示例:http://api.map.baidu.com/place/v2/search?query=银行&bounds=39.915,116.404,39.975,116.414&output=json&ak={您的密钥} //GET请求
百度POI分类:http://lbsyun.baidu.com/index.php?title=lbscloud/poitags
def get_poi(self, query, bound, bsm, path, ak='你的ak', timeout=10):
    """Download every POI page of one rectangle and append it to the workbook.

    The workbook is saved only after the whole rectangle has been fetched, so
    a failure part-way through leaves the file on disk untouched for this
    rectangle.

    Args:
        query: Search keyword(s) passed to the API as ``query``.
        bound: Rectangle as ``"lat,lng,lat,lng"`` (lower-left corner, then
            upper-right corner).
        bsm: Identifier of the rectangle; stored next to every POI row and in
            the Summary sheet.
        path: Path of an existing workbook containing the sheets ``POI`` and
            ``Summary``.
        ak: Baidu API key. The default is only a placeholder.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        RuntimeError: If the API answers with a non-zero ``status``.
    """
    page_count = 20  # 20 pages x 20 records = the 400-record cap per rectangle
    page_size = 20
    url = 'http://api.map.baidu.com/place/v2/search'

    wb = openpyxl.load_workbook(path)
    ws1 = wb['POI']      # receives one row per POI
    ws2 = wb['Summary']  # receives the reported total, to spot rectangles
                         # that hit the cap and must be split further

    total = 0  # stays 0 when the rectangle contains no POI at all
    for page_num in range(page_count):
        print('正在爬取矩形框bsm为{0}的第{1}页数据...'.format(bsm, page_num + 1))
        params = {
            'query': query,
            'bounds': bound,
            'output': 'json',
            'coord_type': '1',  # 1 = wgs84ll: the bounds are given in GPS coordinates
            'page_size': str(page_size),
            'page_num': page_num,  # 0 is the first page
            'ak': ak,
        }
        response = requests.get(url=url, params=params,
                                headers=self.headers, timeout=timeout)
        detail_page = response.json()

        # Fail with the API's own status instead of a KeyError on a missing
        # 'results' key.
        if detail_page.get('status') != 0:
            raise RuntimeError('百度API请求失败: status={0}, message={1}'.format(
                detail_page.get('status'), detail_page.get('message')))

        results = detail_page.get('results') or []
        if not results:
            # An empty page means there is nothing further to fetch.
            print('请求成功,但是没有记录,停止爬取该矩形框。')
            break

        total = detail_page.get('total', total)
        for dic_result in results:
            name = dic_result.get('name', '')
            address = dic_result.get('address', '')
            location = dic_result.get('location')
            if location:
                # The API returns Baidu coordinates; convert them to GPS.
                lng, lat = cc.bd09_to_wgs84(location['lng'], location['lat'])
            else:
                lng, lat = '', ''
            ws1.append([name, lng, lat, address, bsm])

        print('矩形框bsm为{0}的第{1}页数据下载成功!'.format(bsm, page_num + 1))
        time.sleep(0.5)  # throttle so the server does not reject the requests

    # Exactly one summary row per rectangle.
    ws2.append([bsm, total])
    wb.save(path)
    print('-------矩形框bsm为{0}数据下载完成-------'.format(bsm))
5、GIS可视化
利用GIS的核密度工具,生成商业网点热力图。population字段是权重字段,可用来体现不同商业网点的规模差异;这里不做设置,仅体现网点数量的密集程度。也可以结合ArcScene做3D效果。

附完整代码
这里其实没必要用类来写,只是现学现用;直接写成几个普通函数即可。
import os
import time

import openpyxl
import pandas as pd
import requests

import coordconversion as cc


class BaiduSpider():
    """Crawl Baidu POIs rectangle by rectangle and store them in Excel."""

    def __init__(self):
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'
        }

    def get_bounds(self, path, sheet_name):
        """Read the rectangle table and build one bounds string per rectangle.

        Args:
            path: Path of the Excel workbook holding the rectangle table.
            sheet_name: Name of the sheet to read.

        Returns:
            dict mapping each rectangle id (``bsm`` column) to the string
            ``"ymin,xmin,ymax,xmax"``, i.e. lat,lng of the lower-left corner
            followed by lat,lng of the upper-right corner.

        Raises:
            KeyError: If one of the required columns is missing.
        """
        columns = ['bsm', '左下角Y', '左下角X', '右上角Y', '右上角X']
        df = pd.read_excel(io=path, sheet_name=sheet_name)
        # Blank or half-filled rows would otherwise turn into
        # 'nan,nan,nan,nan' bounds and waste API requests, so drop them.
        df = df.dropna(subset=columns)

        bounds = {}
        for bsm, ymin, xmin, ymax, xmax in zip(*(df[col] for col in columns)):
            bounds[bsm] = ','.join(
                str(value) for value in (ymin, xmin, ymax, xmax))
        return bounds

    def get_poi(self, query, bound, bsm, path, ak='你的ak', timeout=10):
        """Download every POI page of one rectangle into the workbook.

        The workbook is saved only after the whole rectangle has been
        fetched, so a failure part-way through leaves the file on disk
        untouched for this rectangle.

        Args:
            query: Search keyword(s) passed to the API as ``query``.
            bound: Rectangle as ``"lat,lng,lat,lng"`` (lower-left corner,
                then upper-right corner).
            bsm: Identifier of the rectangle; stored next to every POI row
                and in the Summary sheet.
            path: Path of an existing workbook containing the sheets
                ``POI`` and ``Summary``.
            ak: Baidu API key. The default is only a placeholder.
            timeout: Seconds to wait for each HTTP request.

        Raises:
            RuntimeError: If the API answers with a non-zero ``status``.
        """
        page_count = 20  # 20 pages x 20 records = the 400-record cap
        page_size = 20
        url = 'http://api.map.baidu.com/place/v2/search'

        wb = openpyxl.load_workbook(path)
        ws1 = wb['POI']      # receives one row per POI
        ws2 = wb['Summary']  # receives the reported total, to spot
                             # rectangles that hit the cap and must be split

        total = 0  # stays 0 when the rectangle contains no POI at all
        for page_num in range(page_count):
            print('正在爬取矩形框bsm为{0}的第{1}页数据...'.format(bsm, page_num + 1))
            params = {
                'query': query,
                'bounds': bound,
                'output': 'json',
                'coord_type': '1',  # 1 = wgs84ll: bounds are GPS coordinates
                'page_size': str(page_size),
                'page_num': page_num,  # 0 is the first page
                'ak': ak,
            }
            response = requests.get(url=url, params=params,
                                    headers=self.headers, timeout=timeout)
            detail_page = response.json()

            # Fail with the API's own status instead of a KeyError on a
            # missing 'results' key.
            if detail_page.get('status') != 0:
                raise RuntimeError('百度API请求失败: status={0}, message={1}'.format(
                    detail_page.get('status'), detail_page.get('message')))

            results = detail_page.get('results') or []
            if not results:
                # An empty page means there is nothing further to fetch.
                print('请求成功,但是没有记录,停止爬取该矩形框。')
                break

            total = detail_page.get('total', total)
            for dic_result in results:
                name = dic_result.get('name', '')
                address = dic_result.get('address', '')
                location = dic_result.get('location')
                if location:
                    # The API returns Baidu coordinates; convert to GPS.
                    lng, lat = cc.bd09_to_wgs84(location['lng'], location['lat'])
                else:
                    lng, lat = '', ''
                ws1.append([name, lng, lat, address, bsm])

            print('矩形框bsm为{0}的第{1}页数据下载成功!'.format(bsm, page_num + 1))
            time.sleep(0.5)  # throttle so the server does not reject us

        # Exactly one summary row per rectangle.
        ws2.append([bsm, total])
        wb.save(path)
        print('-------矩形框bsm为{0}数据下载完成-------'.format(bsm))

    def creat_excel(self, path):
        """Create the output workbook with its two header rows.

        Does nothing when ``path`` already exists, so earlier results are
        never overwritten.

        Args:
            path: Path of the workbook to create.
        """
        if os.path.exists(path):
            return
        wb = openpyxl.Workbook()
        # Reuse the sheet every new workbook starts with instead of leaving
        # an empty default sheet next to the real ones.
        ws1 = wb.active
        ws1.title = 'POI'
        ws1.append(['name', 'lng', 'lat', 'address', 'bsm'])
        ws2 = wb.create_sheet('Summary')
        ws2.append(['bsm', 'total'])
        wb.save(path)


if __name__ == '__main__':
    query = '休闲娱乐'
    output_path = query + '商业网点POI.xlsx'

    bs = BaiduSpider()
    bs.creat_excel(path=output_path)
    # Read the rectangle table.
    bounds = bs.get_bounds(path=r'F:\商业网点规划\矩形表.xlsx', sheet_name='Sheet1')
    # One crawl per rectangle; bsm records which rectangle each row came from.
    for bsm, bound in bounds.items():
        bs.get_poi(query=query, bound=bound, bsm=bsm, path=output_path)
    print('---------------程序结束!--------------')
posted on 2020-10-22 22:37 GIS与Python 阅读(620) 评论(0) 收藏 举报
浙公网安备 33010602011771号