Day 2

Data Crawling

Crawling nationwide scenic-spot POI data from AMap (高德), down to the county level
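main.py (shown at the end) imports this crawler as poi, so the file below is poi.py. It loops over a list of county-level adcodes read from an Excel sheet, pages through AMap's Place Search API (restapi.amap.com/v3/place/text) for POIs of type 风景名胜, and dumps every record into a JSON file. The API caps a text search at 45 pages per query, which is why the page loop stops there and prints a warning when the cap is hit.

poi.py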

import requests
import json
import time
import sheet

def read_poi():

    # Adcodes of the subordinate administrative divisions. If copying them in
    # one by one is too tedious, they can be read from a file instead, which is
    # what sheet.read_excel() below does.
    # arr = ['120100','120101','120102','120103','120104','120105','120106','120110','120111','120112','120113','120114','120115','120116','120117','120118','120119']
    arr = sheet.read_excel()
    # The API URL, split into pieces so the adcode and page number can be spliced in
    url1 = "https://restapi.amap.com/v3/place/text?key=807c3aaf8b58a288aa83b28d11c817e4&keywords=景区&types=风景名胜&city="
    url2 = "&output=JSON&children=&offset=20&page="
    url3 = "&extensions=all"
    # Collected POI records
    x = []
    # Running record count
    num = 0

    # Loop over the administrative divisions and run a POI search for each
    for i in range(0, len(arr)):
        # Current division
        city = arr[i]
        # The API limits a text search to 45 pages, so 45 pages is all we can fetch
        for page in range(1, 46):
            # If a division still returns data at page 45 it probably hit the
            # limit; warn the user (splitting by POI type is considered later)
            if page == 45:
                print("Warning!! The POI search may have hit the page limit!!")
            # Build the URL
            thisUrl = url1 + city + url2 + str(page) + url3
            # Fetch the POI data
            data = requests.get(thisUrl)
            # Parse the response as JSON
            s = data.json()

            # Extract the POI list (empty if the API returned an error payload)
            aa = s.get("pois", [])

            # An empty page means this division has fewer than 45 pages of data
            # (i.e. the limit was not reached), so move on to the next division
            if len(aa) == 0:
                break
            # Store each POI record
            for k in range(0, len(aa)):
                b = {}
                b["name"] = aa[k]["name"]
                b["type"] = aa[k]["type"]
                b["address"] = aa[k].get("address")
                b["adname"] = aa[k]["adname"]
                b["locationleft"] = str(aa[k]["location"].split(",")[0])
                b["locationright"] = str(aa[k]["location"].split(",")[1])
                x.append(b)
                num += 1
                print("Crawled " + str(num) + " records")
            time.sleep(0.5)

    # Write the collected data to a JSON file in the target directory
    result = json.dumps(x, sort_keys=True, indent=2, ensure_ascii=False)
    with open('./jingqu/datapoi.json', 'w', encoding='utf-8') as file:
        file.write(result)
        print('Data written to the JSON file...')
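The warning printed at page 45 leaves the fix to a later step; the comment in the code suggests splitting the search by POI type. A minimal sketch of that idea (fetch_city, fetch_city_split, and the subtypes argument are my own illustrative names, not part of the original code): re-query a capped adcode once per sub-type so each individual query stays under the 45-page cap.

import requests
import time

KEY = "807c3aaf8b58a288aa83b28d11c817e4"   # same key as in read_poi()
BASE = "https://restapi.amap.com/v3/place/text"

def fetch_city(city, types="风景名胜"):
    """Page through one adcode/types combination, stopping at the 45-page cap."""
    pois = []
    for page in range(1, 46):
        params = {"key": KEY, "keywords": "景区", "types": types,
                  "city": city, "offset": 20, "page": page,
                  "output": "JSON", "extensions": "all"}
        batch = requests.get(BASE, params=params).json().get("pois", [])
        if not batch:
            return pois, False   # ran out of results before the cap
        pois.extend(batch)
        time.sleep(0.5)
    return pois, True            # still returning data at page 45: likely capped

def fetch_city_split(city, subtypes):
    """Retry a capped adcode once per POI sub-type to stay under the cap."""
    pois, capped = fetch_city(city)
    if not capped:
        return pois
    result = []
    # subtypes: AMap POI category names or codes, supplied by the caller
    # (placeholder; check them against the official category table)
    for t in subtypes:
        sub, _ = fetch_city(city, types=t)
        result.extend(sub)
    return result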
sheet.py

import xlrd

def read_excel():
    # Open the workbook that holds the adcodes
    # (note: xlrd dropped .xlsx support in 2.0, so this needs xlrd < 2.0)
    wb = xlrd.open_workbook(r'E:\poi.xlsx')
    print(wb.sheet_names())
    sheet1 = wb.sheet_by_index(0)
    # Column B (index 1) holds the adcode list
    cols2 = sheet1.col_values(1)
    return cols2

# Guarded so that importing sheet from poi.py doesn't re-run the read
if __name__ == '__main__':
    col = read_excel()
    print(col)
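Because xlrd 2.x no longer reads .xlsx files, openpyxl is an alternative. A minimal equivalent sketch (openpyxl is my substitution here, not what the original post uses):

from openpyxl import load_workbook

def read_excel():
    wb = load_workbook(r'E:\poi.xlsx')
    ws = wb.worksheets[0]
    # Column B holds the adcodes, mirroring sheet1.col_values(1) above
    return [cell.value for cell in ws['B'] if cell.value is not None]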

Storing the data in the database

daosqlscene.py

import mysql.connector
import json

# Load the crawled POI records written by poi.read_poi()
with open('./jingqu/datapoi.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

def du_sql():
    mydb = mysql.connector.connect(
        host="localhost",
        user="root",
        password="password",
        database="scence",
        auth_plugin="mysql_native_password"
    )
    dbpath = mydb.cursor()
    savaDataSql(dbpath)
    mydb.commit()

def savaDataSql(dbpath):
    cur = dbpath
    try:
        for each in data:
            name = each['name']
            type = each['type']
            # address may be missing or empty in the crawled data
            if each['address'] is not None and len(each['address']) > 0:
                address = each['address']
            else:
                address = ''
            adname = each['adname']
            locationleft = each['locationleft']
            locationright = each['locationright']
            sql = "INSERT INTO scence1 (name,type,address,adname,locationleft,locationright) values (%s,%s,%s,%s,%s,%s)"
            var = (name, type, address, adname, locationleft, locationright)
            cur.execute(sql, var)
    except Exception as e:
        # Report the record that failed instead of swallowing the error
        print(name, e)
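The INSERT above assumes a scence1 table already exists in the scence database. The post never shows its schema, so the DDL below is only a guess at plausible column types, run once through the same connector:

import mysql.connector

# Assumed schema: every column type here is a guess, since the original
# post does not show the CREATE TABLE statement.
DDL = """
CREATE TABLE IF NOT EXISTS scence1 (
    id INT AUTO_INCREMENT PRIMARY KEY,
    name VARCHAR(255),
    type VARCHAR(255),
    address VARCHAR(255),
    adname VARCHAR(255),
    locationleft VARCHAR(32),
    locationright VARCHAR(32)
)
"""

mydb = mysql.connector.connect(
    host="localhost", user="root", password="password",
    database="scence", auth_plugin="mysql_native_password")
cur = mydb.cursor()
cur.execute(DDL)
mydb.commit()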

main.py

import poi

# Crawl first so that ./jingqu/datapoi.json exists...
poi.read_poi()

# ...because daosqlscene loads that JSON file at import time
import daosqlscene
daosqlscene.du_sql()
