百度高德批量爬取经纬度并计算距离
声明:代码仅作学习交流用途,代码分享者与创作者不承担任何由他人恶意运行而导致的责任。请勿擅自修改限制频率的参数,勿恶意攻击网页;请学习浏览者遵守社会公德与法律秩序。爬虫导致的网页崩溃等损失,由计算机操作者负全部责任;造成严重后果的,需要承担刑事责任。
爬虫代写:邮箱 leon_leon@yeah.net
#百度爬取经纬度,耗费网络资源
import requests
from fake_useragent import UserAgent
import pandas as pd
from urllib.parse import quote
import re
from time import sleep
from random import randint
import random
# File-reading / URL-management section: load the place-name table once at module level.
# NOTE(review): hard-coded local path and GBK encoding — confirm against the actual data file.
place_name = pd.read_csv(r'C:\name.csv',encoding='GBK')
hangshu = place_name.shape[0]  # row count (number of place pairs)
leishu = place_name.shape[1]  # column count
class Url_Mnger:
    """Builds the list of Baidu geocoder request URLs from the module-level place-name table."""

    def Url_join(self, hangshu, leishu):
        """Return one geocoder URL per place name, row by row over columns 1 and 2.

        ``leishu`` is accepted for interface compatibility but is not used.
        """
        base = 'http://api.map.baidu.com/geocoder?address={}'
        return [
            base.format(quote(place_name.iloc[row, col]))
            for row in range(hangshu)   # rows (length)
            for col in (1, 2)           # the two place-name columns (width)
        ]
#Request-sending class
class Response_Cast(object):
    def Get_response(self,url):
        """GET ``url`` with a randomized Chrome User-Agent and return the body text.

        Fix: the original call had no ``timeout``, so one stalled connection could
        hang the whole crawl forever; network exceptions now propagate to the caller.
        """
        headers={
            'User-Agent':UserAgent().chrome
        }
        response = requests.get(url=url,headers = headers,timeout=15)
        return response.text
#Data-management class
class Info_Manger:
    """Parses geocoder replies and assembles/saves the paired-coordinate table."""

    def Parse_html(self, info_text):
        """Extract ``(latitude, longitude)`` strings from a Baidu geocoder XML reply.

        Raises IndexError when the reply contains no <lat>/<lng> tags.
        """
        lat_hits = re.findall(r'<lat>(.+)</lat>', info_text)
        lng_hits = re.findall('<lng>(.+)</lng>', info_text)
        return lat_hits[0], lng_hits[0]

    def Make_dataform(self, lat, longi):
        """Turn the interleaved lat/longi lists into an A/B-pair DataFrame.

        Even indices belong to point A, odd indices to point B.
        """
        a_lat, b_lat = lat[::2], lat[1::2]
        a_longi, b_longi = longi[::2], longi[1::2]
        df = pd.DataFrame({'a_point':place_name.iloc[:,1],'a_lat':a_lat,'a_longi':a_longi,'b_point':place_name.iloc[:,2],'b_lat':b_lat,'b_longi':b_longi})
        return df

    def Savedata(self, df):
        """Persist the assembled table as a GBK-encoded CSV."""
        df.to_csv('helf2_geo_data.csv',encoding='GBK')
class Run_Scrapy:
    """Drives the Baidu crawl: builds URLs, fetches each, parses coords, saves the CSV."""

    def __init__(self):
        url_manger = Url_Mnger()
        url_list = url_manger.Url_join(hangshu,leishu)
        url_list_length = len(url_list)
        response_cast = Response_Cast()
        info_manger = Info_Manger()
        lat = []
        longi = []
        print(url_list)
        # Fix: both branches of the original if/else ran the identical
        # fetch/parse/append sequence — only the rate-limit pause differed —
        # so the duplicated code is collapsed into a single body.
        for j, url in enumerate(url_list):
            print(j,'/',url_list_length)
            if j % 130 == 0:
                # Long pause every 130 requests (including before the first)
                # to limit request frequency.
                sleep(random.uniform(3, 10))
            response_info = response_cast.Get_response(url)
            info_latitude, info_longitude = info_manger.Parse_html(response_info)
            lat.append(info_latitude)
            longi.append(info_longitude)
        make_dataform = info_manger.Make_dataform(lat,longi)
        info_manger.Savedata(make_dataform)
if __name__ == '__main__':
    Run_Scrapy()
#高德爬取,与百度爬取一样耗费资源
import requests
from fake_useragent import UserAgent
import pandas as pd
#import xlrd
import numpy as np
from urllib.parse import quote
import re
from time import sleep
from random import randint
import random
#File-reading / URL-management section: load the name table and pick the batch to crawl.
d = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\geo_name.csv',encoding='GBK')
#place_name = d.iloc[553:800,:]
#place_name = d.iloc[800:1000,:]
#place_name = d.iloc[1000:1100,:]#4
place_name = d.iloc[1100:1500,:]#5 — batch 5 of the manually chunked crawl
hangshu = place_name.shape[0]  # rows in this batch
leishu = place_name.shape[1]  # column count
class Url_Mnger:
    """Builds AMap place-search request URLs from the module-level place-name table."""

    def Url_join(self, hangshu, leishu):
        """Return one search URL per place name, row by row over columns 1 and 2.

        ``leishu`` is accepted for interface compatibility but is not used.
        """
        template = 'https://restapi.amap.com/v3/place/text?s=rsv3&children=&key=8325164e247e15eea68b59e89200988b&page=1&offset=10&city=371300&language=zh_cn&callback=jsonp_175268_&platform=JS&logversion=2.0&sdkversion=1.3&appname=https%3A%2F%2Flbs.amap.com%2Fconsole%2Fshow%2Fpicker&csid=0DD8EBDE-E857-46DE-813A-42CD0AB36E00&keywords={}'
        return [
            template.format(quote(place_name.iloc[row, col]))
            for row in range(hangshu)   # rows (length)
            for col in (1, 2)           # the two place-name columns (width)
        ]
#Request-sending class
class Response_Cast(object):
    def Get_response(self,url):
        """GET ``url`` with a randomized Chrome User-Agent and return the body text.

        Fix: the original call had no ``timeout``, so one stalled connection could
        hang the whole crawl forever; network exceptions now propagate to the caller.
        """
        headers={
            'User-Agent':UserAgent().chrome
        }
        response = requests.get(url=url,headers = headers,timeout=15)
        return response.text
#Data-management class
class Info_Manger:
    """Parses AMap JSONP replies and assembles/saves the paired-coordinate table."""

    def Parse_html(self, info_text):
        """Extract ``(latitude, longitude)`` strings from an AMap place-search reply.

        Raises IndexError when the reply contains no matching "location" field.
        """
        lat_hits = re.findall(r',"location":".+,(.+)","tel"', info_text)
        lng_hits = re.findall('"location":"([0-9]{3}\.[0-9]{6}),.+","tel"', info_text)
        return lat_hits[0], lng_hits[0]

    def Make_dataform(self, lat, longi):
        """Turn the interleaved lat/longi lists into an A/B-pair DataFrame.

        Even indices belong to point A, odd indices to point B.
        """
        a_lat, b_lat = lat[::2], lat[1::2]
        a_longi, b_longi = longi[::2], longi[1::2]
        df = pd.DataFrame({'a_point':place_name.iloc[:,1],'a_lat':a_lat,'a_longi':a_longi,'b_point':place_name.iloc[:,2],'b_lat':b_lat,'b_longi':b_longi})
        return df

    def Savedata(self, df):
        """Persist the assembled table as a GBK-encoded CSV."""
        df.to_csv('geo_data_gaode5.csv',encoding='GBK')
class Run_Scrapy:
    """Drives the AMap crawl: builds URLs, fetches each, parses coords, saves the CSV."""

    def __init__(self):
        url_manger = Url_Mnger()
        url_list = url_manger.Url_join(hangshu,leishu)
        url_list_length = len(url_list)
        response_cast = Response_Cast()
        info_manger = Info_Manger()
        lat = []
        longi = []
        print(url_list)
        # Fix: both branches of the original if/else ran the identical
        # fetch/parse/append/print sequence — only the extra rate-limit pause
        # differed — so the duplicated code is collapsed into a single body.
        for j, url in enumerate(url_list):
            print(j,'/',url_list_length)
            sleep(random.uniform(0.6, 1.2))  # per-request pause
            if j % 130 == 0:
                # Extra long pause every 130 requests (including before the first).
                sleep(random.uniform(3, 10))
            response_info = response_cast.Get_response(url)
            info_latitude, info_longitude = info_manger.Parse_html(response_info)
            lat.append(info_latitude)
            longi.append(info_longitude)
            print(lat,longi)
        make_dataform = info_manger.Make_dataform(lat,longi)
        info_manger.Savedata(make_dataform)
if __name__ == '__main__':
    Run_Scrapy()
#Data de-duplication — avoids re-crawling identical place names.
import pandas as pd
place_name = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\geo_name - all.csv',encoding='GBK')
d = place_name.iloc[:,1]  # the place-name column
print(d)
hangshu = place_name.shape[0]  # row count
name = []
for i in range(hangshu):
    name.append(d.iloc[i])
# NOTE(review): set() makes the output order arbitrary (may differ between runs).
namequchong = list(set(name))
df = pd.DataFrame({'a_point':namequchong})
df.to_csv('geo_name_quchong.csv',encoding='GBK')
#数据爬取
import requests
from fake_useragent import UserAgent
import pandas as pd
#import xlrd
import numpy as np
from urllib.parse import quote
import re
from time import sleep
from random import randint
import random
# Load the de-duplicated name list; this pass crawls every row.
d = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\geo_name_quchong1.csv',encoding='GBK')
place_name = d.iloc[:,:]#5
hangshu = place_name.shape[0]  # row count
leishu = place_name.shape[1]  # column count
class Url_Mnger:
    """Builds AMap place-search URLs, one per de-duplicated place name."""

    def Url_join(self, hangshu):
        """Return a request URL for column 1 of each of the first ``hangshu`` rows."""
        template = 'https://restapi.amap.com/v3/place/text?s=rsv3&children=&key=8325164e247e15eea68b59e89200988b&page=1&offset=10&city=371300&language=zh_cn&callback=jsonp_175268_&platform=JS&logversion=2.0&sdkversion=1.3&appname=https%3A%2F%2Flbs.amap.com%2Fconsole%2Fshow%2Fpicker&csid=0DD8EBDE-E857-46DE-813A-42CD0AB36E00&keywords={}'
        return [
            template.format(quote(place_name.iloc[row, 1]))
            for row in range(hangshu)
        ]
#Request-sending class
class Response_Cast(object):
    def Get_response(self,url):
        """GET ``url`` with a randomized Chrome User-Agent and return the body text.

        Fix: the original call had no ``timeout``, so one stalled connection could
        hang the whole crawl forever; network exceptions now propagate to the caller.
        """
        headers={
            'User-Agent':UserAgent().chrome
        }
        response = requests.get(url=url,headers = headers,timeout=15)
        return response.text
#Data-management class
class Info_Manger:
    """Parses AMap replies for the de-duplicated crawl and assembles/saves the table."""

    def Parse_html(self, info_text):
        """Extract ``(latitude, longitude)`` strings from an AMap place-search reply.

        Raises IndexError when the reply contains no matching "location" field.
        """
        lat_hits = re.findall(r',"location":".+,(.+)","tel"', info_text)
        lng_hits = re.findall('"location":"([0-9]{3}\.[0-9]{6}),.+",', info_text)
        return lat_hits[0], lng_hits[0]

    def Make_dataform(self, lat, longi):
        """One row per de-duplicated name: (name, latitude, longitude)."""
        return pd.DataFrame({'a_point':place_name.iloc[:,1],'a_lat':lat,'a_longi':longi})

    def Savedata(self, df):
        """Persist the assembled table as a GBK-encoded CSV."""
        df.to_csv('geo_data_gaode_quchong.csv',encoding='GBK')
class Run_Scrapy:
    """Drives the de-duplicated AMap crawl: build URLs, fetch, parse, save."""

    def __init__(self):
        url_manger = Url_Mnger()
        url_list = url_manger.Url_join(hangshu)
        url_list_length = len(url_list)
        response_cast = Response_Cast()
        info_manger = Info_Manger()
        lat = []
        longi = []
        print(url_list)
        # Fix: both branches of the original if/else were identical (the extra
        # every-130 pause was already commented out), so the branch and its
        # duplicated body are removed entirely.
        for j, url in enumerate(url_list):
            print(j,'/',url_list_length)
            sleep(random.uniform(1, 1.5))  # per-request pause
            response_info = response_cast.Get_response(url)
            info_latitude, info_longitude = info_manger.Parse_html(response_info)
            lat.append(info_latitude)
            longi.append(info_longitude)
            print(lat,longi)
        make_dataform = info_manger.Make_dataform(lat,longi)
        info_manger.Savedata(make_dataform)
if __name__ == '__main__':
    Run_Scrapy()
#Data integration: join crawled coordinates back onto the original A/B place pairs.
import pandas as pd
place_name = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\geo_name.csv',encoding='GBK')
h = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\geo_data_gaode_quchong.csv',encoding='GBK')
hangshu = place_name.shape[0]
hangshu1 = h.shape[0]
leishu = place_name.shape[1]
# Fix: the original re-scanned all of ``h`` for every name (O(n*m)). Build a
# name -> (lat, lng) lookup once instead; setdefault keeps the FIRST occurrence,
# matching the original scan-with-break semantics.
coord_by_name = {}
for w in range(hangshu1):
    coord_by_name.setdefault(h.iloc[w,1], (h.iloc[w,2], h.iloc[w,3]))
lat = []
longi = []
for i in range(hangshu):
    for j in [1,2]:
        a = place_name.iloc[i,j]
        if a in coord_by_name:
            pair = coord_by_name[a]
            lat.append(pair[0])
            longi.append(pair[1])
        # Names with no crawled coordinates are skipped, exactly as before.
        # NOTE(review): a skip silently misaligns the interleaved lists below.
lat_length = len(lat)
longi_length = len(longi)
# Even indices are point A, odd indices point B.
a_lat = lat[0:lat_length + 1:2]
a_longi = longi[0:longi_length + 1:2]
b_lat = lat[1:lat_length + 1:2]
b_longi = longi[1:longi_length + 1:2]
df = pd.DataFrame(
    {'a_point': place_name.iloc[:, 1], 'a_lat': a_lat, 'a_longi': a_longi, 'b_point': place_name.iloc[:, 2],
     'b_lat': b_lat, 'b_longi': b_longi})
df.to_csv('geo_data_gaode_end.csv',encoding='GBK')
#根据经纬度计算距离
from math import radians, cos, sin, asin,sqrt
import pandas as pd
def geodistance(lng1, lat1, lng2, lat2):
    """Great-circle distance in kilometres between two lng/lat points (haversine).

    Inputs are in decimal degrees; Earth radius is taken as 6371 km.
    """
    rlng1, rlat1, rlng2, rlat2 = (radians(v) for v in (lng1, lat1, lng2, lat2))
    hav = sin((rlat2 - rlat1) / 2) ** 2 + cos(rlat1) * cos(rlat2) * sin((rlng2 - rlng1) / 2) ** 2
    return 2 * 6371 * asin(sqrt(hav))
# Compute the A -> B distance for every row of the merged coordinate table.
place_name = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\geo_data_gaode_end.csv',encoding='GBK')
hangshu = place_name.shape[0]  # row count
distance_list = []
for i in range(hangshu):
    print(i,'/',hangshu)
    # Column layout after the to_csv round-trip: 0=index, 1=a_point, 2=a_lat,
    # 3=a_longi, 4=b_point, 5=b_lat, 6=b_longi — TODO confirm against the file.
    lng1 = place_name.iloc[i,3]
    lat1 = place_name.iloc[i,2]
    lng2 = place_name.iloc[i,6]
    lat2 = place_name.iloc[i,5]
    distance_start_end = geodistance(lng1=lng1,lat1=lat1,lng2=lng2,lat2=lat2)
    distance_list.append(distance_start_end)
df = pd.DataFrame({'start_point':place_name.iloc[:,1],'lat1':place_name.iloc[:,2],'lon1':place_name.iloc[:,3],'end_point':place_name.iloc[:,4],'lat2':place_name.iloc[:,5],'lon2':place_name.iloc[:,6],'distance':distance_list})
df.to_csv('distance_geo_data_gaode.csv',encoding='GBK')

浙公网安备 33010602011771号