百度高德批量爬取经纬度并计算距离

声明:代码仅作学习交流用途,代码分享者与创作者不承担任何由他人恶意运行而导致的责任,勿擅自修改限制频率的参数,勿恶意攻击网页,请学习浏览者遵守社会公德与法律秩序,爬虫导致的网页崩溃等损失由计算机操作者负全部责任,造成严重后果的需要承担刑事责任
爬虫代写:邮箱 leon_leon@yeah.net

#百度爬取经纬度,耗费网络资源
import requests
from fake_useragent import UserAgent
import pandas as pd
from urllib.parse import quote
import re
from time import sleep 
from random import randint
import random
#文件读取类,URL管理类
# Load the place-name table (GBK-encoded CSV) and record its dimensions.
place_name = pd.read_csv(r'C:\name.csv', encoding='GBK')
hangshu, leishu = place_name.shape  # number of rows, number of columns
class Url_Mnger:
    """Builds Baidu geocoder request URLs from the module-level ``place_name`` table."""

    def Url_join(self, hangshu, leishu):
        """Return one geocoder URL per place name.

        For each of the first ``hangshu`` rows of ``place_name``, the values
        in columns 1 and 2 are percent-encoded and inserted into the URL
        template. URLs are ordered row by row, column 1 before column 2.

        ``leishu`` is accepted but not used.
        """
        template = 'http://api.map.baidu.com/geocoder?address={}'
        return [
            template.format(quote(place_name.iloc[row, col]))
            for row in range(hangshu)
            for col in (1, 2)
        ]
#请求发送类
class Response_Cast(object):
    """Sends HTTP GET requests and returns the response body."""

    def Get_response(self, url, timeout=10):
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` waits indefinitely, so one stalled
                connection would hang the whole crawl.

        Returns:
            ``response.text`` of the GET request.

        Raises:
            requests.exceptions.Timeout: if the server does not answer within
                ``timeout`` seconds.
        """
        # A User-Agent value is taken from fake_useragent for every request.
        headers = {'User-Agent': UserAgent().chrome}
        response = requests.get(url=url, headers=headers, timeout=timeout)
        return response.text
#数据管理类
class Info_Manger:
    """Parses geocoder responses and assembles/saves the result table."""

    def Parse_html(self, info_text):
        """Return ``(latitude, longitude)`` as strings.

        The values are the contents of the first ``<lat>...</lat>`` and
        ``<lng>...</lng>`` matches in ``info_text``.

        Raises:
            IndexError: if either tag is not present.
        """
        lat_hits = re.findall(r'<lat>(.+)</lat>', info_text)
        lng_hits = re.findall('<lng>(.+)</lng>', info_text)
        return lat_hits[0], lng_hits[0]

    def Make_dataform(self, lat, longi):
        """Build the result DataFrame from interleaved coordinate lists.

        ``lat`` and ``longi`` alternate between the "a" point (even positions)
        and the "b" point (odd positions) of each row. The point names come
        from columns 1 and 2 of the module-level ``place_name`` table.
        """
        columns = {
            'a_point': place_name.iloc[:, 1],
            'a_lat': lat[0::2],
            'a_longi': longi[0::2],
            'b_point': place_name.iloc[:, 2],
            'b_lat': lat[1::2],
            'b_longi': longi[1::2],
        }
        return pd.DataFrame(columns)

    def Savedata(self, df):
        """Write ``df`` to ``helf2_geo_data.csv`` using GBK encoding."""
        df.to_csv('helf2_geo_data.csv', encoding='GBK')

class Run_Scrapy:
    """Runs the whole crawl on instantiation: build URLs, fetch, parse, save."""

    def __init__(self):
        """Fetch every URL in order, collect the coordinates and save them."""
        urls = Url_Mnger().Url_join(hangshu, leishu)
        total = len(urls)
        fetcher = Response_Cast()
        manager = Info_Manger()
        lat, longi = [], []
        print(urls)
        for index, url in enumerate(urls):
            print(index, '/', total)
            # Pause 3-10 s before every 130th request (index 0 included).
            if index % 130 == 0:
                sleep(random.uniform(3, 10))
            latitude, longitude = manager.Parse_html(fetcher.Get_response(url))
            lat.append(latitude)
            longi.append(longitude)

        manager.Savedata(manager.Make_dataform(lat, longi))

# Script entry point: constructing Run_Scrapy runs the crawl from its __init__.
if __name__ == '__main__':
    Run_Scrapy()
#高德爬取,与百度爬取一样耗费资源
import requests
from fake_useragent import UserAgent
import pandas as pd
#import xlrd
import numpy as np
from urllib.parse import quote
import re
from time import sleep 
from random import randint
import random
#文件读取类,URL管理类
# Load the full place-name table, then work on one slice (batch) of rows.
d = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\geo_name.csv', encoding='GBK')
# Earlier batches used rows 553:800, 800:1000 and 1000:1100 (batch 4);
# the current batch (5) is rows 1100:1500.
place_name = d.iloc[1100:1500, :]
hangshu, leishu = place_name.shape  # number of rows, number of columns
class Url_Mnger:
    """Builds AMap place-search URLs from the module-level ``place_name`` table."""

    def Url_join(self, hangshu, leishu):
        """Return one place-search URL per place name.

        For each of the first ``hangshu`` rows of ``place_name``, the values
        in columns 1 and 2 are percent-encoded and used as the ``keywords``
        query parameter. URLs are ordered row by row, column 1 before
        column 2.

        ``leishu`` is accepted but not used.
        """
        # NOTE(review): the API key and city code are embedded in this template.
        template = 'https://restapi.amap.com/v3/place/text?s=rsv3&children=&key=8325164e247e15eea68b59e89200988b&page=1&offset=10&city=371300&language=zh_cn&callback=jsonp_175268_&platform=JS&logversion=2.0&sdkversion=1.3&appname=https%3A%2F%2Flbs.amap.com%2Fconsole%2Fshow%2Fpicker&csid=0DD8EBDE-E857-46DE-813A-42CD0AB36E00&keywords={}'
        return [
            template.format(quote(place_name.iloc[row, col]))
            for row in range(hangshu)
            for col in (1, 2)
        ]
#请求发送类
class Response_Cast(object):
    """Sends HTTP GET requests and returns the response body."""

    def Get_response(self, url, timeout=10):
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` waits indefinitely, so one stalled
                connection would hang the whole crawl.

        Returns:
            ``response.text`` of the GET request.

        Raises:
            requests.exceptions.Timeout: if the server does not answer within
                ``timeout`` seconds.
        """
        # A User-Agent value is taken from fake_useragent for every request.
        headers = {'User-Agent': UserAgent().chrome}
        response = requests.get(url=url, headers=headers, timeout=timeout)
        return response.text
#数据管理类
class Info_Manger:
    """Extracts coordinates from place-search responses and stores them."""

    def Parse_html(self, info_text):
        """Return ``(latitude, longitude)`` strings of the first location.

        The response text carries entries of the form
        ``"location":"<longitude>,<latitude>"``. Both numbers are taken from
        the same (first) entry. The previous pair of greedy patterns took the
        longitude from the first entry but the latitude from the last one
        whenever the response listed several places, producing a coordinate
        pair that belonged to no single place.

        Raises:
            IndexError: if ``info_text`` contains no location entry (same
                exception type as before, now with a message).
        """
        number = r'(-?[0-9]+(?:\.[0-9]+)?)'
        match = re.search(r'"location":"' + number + ',' + number + '"', info_text)
        if match is None:
            raise IndexError('no "location" entry found in response text')
        longitude, latitude = match.groups()
        return latitude, longitude

    def Make_dataform(self, lat, longi):
        """Build the result DataFrame from interleaved coordinate lists.

        ``lat`` and ``longi`` alternate between the "a" point (even positions)
        and the "b" point (odd positions) of each row. The point names come
        from columns 1 and 2 of the module-level ``place_name`` table.
        """
        return pd.DataFrame({
            'a_point': place_name.iloc[:, 1],
            'a_lat': lat[0::2],
            'a_longi': longi[0::2],
            'b_point': place_name.iloc[:, 2],
            'b_lat': lat[1::2],
            'b_longi': longi[1::2],
        })

    def Savedata(self, df):
        """Write ``df`` to ``geo_data_gaode5.csv`` using GBK encoding."""
        df.to_csv('geo_data_gaode5.csv', encoding='GBK')

class Run_Scrapy:
    """Runs the whole crawl on instantiation: build URLs, fetch, parse, save."""

    def __init__(self):
        """Fetch every URL in order, collect the coordinates and save them."""
        urls = Url_Mnger().Url_join(hangshu, leishu)
        total = len(urls)
        fetcher = Response_Cast()
        manager = Info_Manger()
        lat, longi = [], []
        print(urls)
        for index, url in enumerate(urls):
            print(index, '/', total)
            # Short random pause before every request ...
            sleep(random.uniform(0.6, 1.2))
            # ... plus a longer one before every 130th request (index 0 included).
            if index % 130 == 0:
                sleep(random.uniform(3, 10))
            latitude, longitude = manager.Parse_html(fetcher.Get_response(url))
            lat.append(latitude)
            longi.append(longitude)
            print(lat, longi)
        manager.Savedata(manager.Make_dataform(lat, longi))

# Script entry point: constructing Run_Scrapy runs the crawl from its __init__.
if __name__ == '__main__':
    Run_Scrapy()
#数据去重:先对地名去重再爬取,避免重复请求同一地点,节省网络资源
import pandas as pd
# De-duplicate the names in column 1 of the full place-name table and save
# the unique names, so that each place only has to be looked up once.
place_name = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\geo_name - all.csv', encoding='GBK')
d = place_name.iloc[:, 1]
print(d)
hangshu = place_name.shape[0]
name = d.tolist()
# dict.fromkeys keeps the first-seen order, so the output file is reproducible;
# list(set(...)) returned the names in arbitrary, run-dependent order.
namequchong = list(dict.fromkeys(name))
df = pd.DataFrame({'a_point': namequchong})
df.to_csv('geo_name_quchong.csv', encoding='GBK')





#数据爬取
import requests
from fake_useragent import UserAgent
import pandas as pd
#import xlrd
import numpy as np
from urllib.parse import quote
import re
from time import sleep
from random import randint
import random
# Load the de-duplicated place-name table; every row is processed.
d = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\geo_name_quchong1.csv', encoding='GBK')
place_name = d.iloc[:, :]
hangshu, leishu = place_name.shape  # number of rows, number of columns
class Url_Mnger:
    """Builds AMap place-search URLs from the module-level ``place_name`` table."""

    def Url_join(self, hangshu):
        """Return one place-search URL per row.

        For each of the first ``hangshu`` rows of ``place_name``, the value in
        column 1 is percent-encoded and used as the ``keywords`` query
        parameter. URLs are returned in row order.
        """
        # NOTE(review): the API key and city code are embedded in this template.
        template = 'https://restapi.amap.com/v3/place/text?s=rsv3&children=&key=8325164e247e15eea68b59e89200988b&page=1&offset=10&city=371300&language=zh_cn&callback=jsonp_175268_&platform=JS&logversion=2.0&sdkversion=1.3&appname=https%3A%2F%2Flbs.amap.com%2Fconsole%2Fshow%2Fpicker&csid=0DD8EBDE-E857-46DE-813A-42CD0AB36E00&keywords={}'
        return [
            template.format(quote(place_name.iloc[row, 1]))
            for row in range(hangshu)
        ]
#请求发送类
class Response_Cast(object):
    """Sends HTTP GET requests and returns the response body."""

    def Get_response(self, url, timeout=10):
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` waits indefinitely, so one stalled
                connection would hang the whole crawl.

        Returns:
            ``response.text`` of the GET request.

        Raises:
            requests.exceptions.Timeout: if the server does not answer within
                ``timeout`` seconds.
        """
        # A User-Agent value is taken from fake_useragent for every request.
        headers = {'User-Agent': UserAgent().chrome}
        response = requests.get(url=url, headers=headers, timeout=timeout)
        return response.text
#数据管理类
class Info_Manger:
    """Extracts coordinates from place-search responses and stores them."""

    def Parse_html(self, info_text):
        """Return ``(latitude, longitude)`` strings of the first location.

        The response text carries entries of the form
        ``"location":"<longitude>,<latitude>"``. Both numbers are taken from
        the same (first) entry. The previous pair of greedy patterns took the
        longitude from the first entry but the latitude from the last one
        whenever the response listed several places, producing a coordinate
        pair that belonged to no single place.

        Raises:
            IndexError: if ``info_text`` contains no location entry (same
                exception type as before, now with a message).
        """
        number = r'(-?[0-9]+(?:\.[0-9]+)?)'
        match = re.search(r'"location":"' + number + ',' + number + '"', info_text)
        if match is None:
            raise IndexError('no "location" entry found in response text')
        longitude, latitude = match.groups()
        return latitude, longitude

    def Make_dataform(self, lat, longi):
        """Build the result DataFrame: one row per place name.

        The names come from column 1 of the module-level ``place_name`` table;
        ``lat`` and ``longi`` supply the matching coordinates in row order.
        """
        return pd.DataFrame({
            'a_point': place_name.iloc[:, 1],
            'a_lat': lat,
            'a_longi': longi,
        })

    def Savedata(self, df):
        """Write ``df`` to ``geo_data_gaode_quchong.csv`` using GBK encoding."""
        df.to_csv('geo_data_gaode_quchong.csv', encoding='GBK')

class Run_Scrapy:
    """Runs the whole crawl on instantiation: build URLs, fetch, parse, save."""

    def __init__(self):
        """Fetch every URL in order, collect the coordinates and save them."""
        urls = Url_Mnger().Url_join(hangshu)
        total = len(urls)
        fetcher = Response_Cast()
        manager = Info_Manger()
        lat, longi = [], []
        print(urls)
        for index, url in enumerate(urls):
            print(index, '/', total)
            # Random 1-1.5 s pause before every request.
            sleep(random.uniform(1, 1.5))
            latitude, longitude = manager.Parse_html(fetcher.Get_response(url))
            lat.append(latitude)
            longi.append(longitude)
            print(lat, longi)
        manager.Savedata(manager.Make_dataform(lat, longi))

# Script entry point: constructing Run_Scrapy runs the crawl from its __init__.
if __name__ == '__main__':
    Run_Scrapy()







#数据整合

import pandas as pd

# Attach the crawled coordinates (one row per unique name) to every a/b point
# pair of the original table and save the combined result.
place_name = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\geo_name.csv', encoding='GBK')
h = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\geo_data_gaode_quchong.csv', encoding='GBK')
hangshu = place_name.shape[0]
hangshu1 = h.shape[0]
leishu = place_name.shape[1]

# name -> (lat, longi), built once. setdefault keeps the first row for a name,
# matching the previous "first match wins" scan without rescanning the whole
# table for every name.
coords = {}
for point, point_lat, point_longi in zip(h.iloc[:, 1], h.iloc[:, 2], h.iloc[:, 3]):
    if pd.notna(point):
        coords.setdefault(point, (point_lat, point_longi))

# A name without crawled coordinates gets NaN instead of being skipped.
# Skipping shifted every later value by one position and left the a/b lists
# with lengths that no longer matched the table.
missing = (float('nan'), float('nan'))
a_coords = [coords.get(point, missing) for point in place_name.iloc[:, 1]]
b_coords = [coords.get(point, missing) for point in place_name.iloc[:, 2]]
a_lat = [pair[0] for pair in a_coords]
a_longi = [pair[1] for pair in a_coords]
b_lat = [pair[0] for pair in b_coords]
b_longi = [pair[1] for pair in b_coords]
df = pd.DataFrame(
    {'a_point': place_name.iloc[:, 1], 'a_lat': a_lat, 'a_longi': a_longi, 'b_point': place_name.iloc[:, 2],
     'b_lat': b_lat, 'b_longi': b_longi})
df.to_csv('geo_data_gaode_end.csv', encoding='GBK')


#根据经纬度计算距离
from math import radians, cos, sin, asin,sqrt
import pandas as pd
def geodistance(lng1, lat1, lng2, lat2, radius_km=6371):
    """Return the great-circle distance between two points (haversine formula).

    Args:
        lng1: Longitude of the first point, in degrees.
        lat1: Latitude of the first point, in degrees.
        lng2: Longitude of the second point, in degrees.
        lat2: Latitude of the second point, in degrees.
        radius_km: Sphere radius; the result is in the same unit. Defaults to
            6371, the value previously hard-coded.

    Returns:
        The distance along the sphere surface. NaN inputs yield NaN.
    """
    lng1, lat1, lng2, lat2 = map(radians, (lng1, lat1, lng2, lat2))  # degrees -> radians
    dlon = lng2 - lng1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    root = sqrt(a)
    # Rounding can push the value marginally above 1 for near-antipodal
    # points, which would make asin raise ValueError. The comparison is False
    # for NaN, so NaN still propagates to the result.
    if root > 1.0:
        root = 1.0
    distance = 2 * asin(root) * radius_km
    return distance
# Read the merged coordinate table and compute the a -> b distance per row.
# Columns by position: 1 = start name, 2/3 = start lat/lng,
# 4 = end name, 5/6 = end lat/lng.
place_name = pd.read_csv(r'C:\Users\Admin\PycharmProjects\untitled\geo_data_gaode_end.csv', encoding='GBK')
hangshu = place_name.shape[0]
distance_list = []
for i in range(hangshu):
    print(i, '/', hangshu)
    distance_list.append(
        geodistance(
            lng1=place_name.iloc[i, 3],
            lat1=place_name.iloc[i, 2],
            lng2=place_name.iloc[i, 6],
            lat2=place_name.iloc[i, 5],
        )
    )
df = pd.DataFrame({
    'start_point': place_name.iloc[:, 1],
    'lat1': place_name.iloc[:, 2],
    'lon1': place_name.iloc[:, 3],
    'end_point': place_name.iloc[:, 4],
    'lat2': place_name.iloc[:, 5],
    'lon2': place_name.iloc[:, 6],
    'distance': distance_list,
})
df.to_csv('distance_geo_data_gaode.csv', encoding='GBK')

posted @ 2020-09-02 10:30  kuanleung  阅读(42)  评论(0)    收藏  举报  来源