2020 Study Notes 05: Crawler, with some bug fixes
The previous crawler did not classify letters by type, and large swaths of the scraped records were missing the reply time or the reply content.
After optimizing the code,
it now produces data like the following:

Only part of the data is shown here, but you can see the scraped records are now essentially complete.
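The fix for the missing reply times hinges on one detail: on these pages the reply div's class attribute sometimes carries a trailing space, so a single exact-match XPath silently returns nothing. Here is a minimal sketch of the fallback used in the full code below (class names copied from the listing; the sample markup is invented for illustration):

from lxml import etree

# Sample markup standing in for a detail page; note the trailing space in the class.
sample = '<div class="col-xs-12 col-sm-3 col-md-3 my-2 ">2020-02-08</div>'
tree = etree.HTML(sample)

# An exact-match XPath on @class is whitespace-sensitive, so query both spellings.
hit = (tree.xpath('//div[@class="col-xs-12 col-sm-3 col-md-3 my-2 "]//text()')
       or tree.xpath('//div[@class="col-xs-12 col-sm-3 col-md-3 my-2"]//text()'))
restime = hit[0] if hit else ''
print(restime)  # 2020-02-08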
The full code is as follows:
#coding:utf-8
import requests
from lxml import etree
import json
import pymysql
# Connection to the local MySQL database that stores the letters.
conn = pymysql.connect(
    host="localhost",
    user="root",
    port=3306,
    password="123456",
    database="bjxj")
def db(conn, reqcontent, reqname, reqtime, resname, restime, rescontent, reqtype, isreply):
    """Insert one letter record into the database."""
    cursor = conn.cursor()
    # Normalize the reply flag: unanswered letters get 0 and an empty reply time.
    if isreply == False:
        isreply = 0
        restime1 = ''
    else:
        isreply = 1
        restime1 = restime
    cursor.execute(
        "INSERT INTO aaa (reqcontent,reqname,reqtime,resname,rescontent,reqtype,isreply,restime) "
        "VALUES (%s,%s,%s,%s,%s,%s,%s,%s);",
        [reqcontent, reqname, reqtime, resname, rescontent, reqtype, isreply, restime1])
    conn.commit()
    cursor.close()
def shijinOU(json1, url, i):
    """Fetch one page of the letter list and dispatch each letter by its type."""
    print(i)
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Content-Type': 'application/json;charset=UTF-8',
    }
    data_json = json.dumps(json1)
    r = requests.post(url, data=data_json, headers=head)
    html = r.content.decode("utf-8")
    print("Status code:", r.status_code)
    new_data = json.loads(html)
    # Iterate over however many letters the page actually returned
    # (a fixed range(0, 6) breaks on a short final page).
    for s in range(len(new_data['mailList'])):
        print(new_data['mailList'][s])
        reqname = new_data['mailList'][s]['letter_title']
        reqtime = new_data['mailList'][s]['create_date']
        resname = new_data['mailList'][s]['org_id']
        isreply = new_data['mailList'][s]['isReply']
        reqtype = new_data['mailList'][s]['letter_type']
        # Route each letter to the matching detail endpoint by its type:
        # 咨询 (consultation), 建议 (suggestion), 投诉 (complaint).
        if reqtype == '咨询':
            zixunTiqu(new_data['mailList'][s]['original_id'], reqname, reqtime, resname,
                      isreply, reqtype, 'consult', 'consultDetail')
        if reqtype == '建议':
            zixunTiqu(new_data['mailList'][s]['original_id'], reqname, reqtime, resname,
                      isreply, reqtype, 'suggest', 'suggesDetail')
        if reqtype == '投诉':
            zixunTiqu(new_data['mailList'][s]['original_id'], reqname, reqtime, resname,
                      isreply, reqtype, 'complain', 'complainDetail')
def zixunTiqu(AH, reqname, reqtime, resname, isreply, reqtype, lettertype, lettertype1):
    """Fetch one letter's detail page and extract the reply time and reply content."""
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    url2 = 'http://www.beijing.gov.cn/hudong/hdjl/com.web.' + lettertype + '.' + lettertype1 + '.flow?originalId=' + AH
    r = requests.get(url2, headers=head)
    html = r.content.decode("utf-8")
    html1 = etree.HTML(html)
    # The letter body is carried in the page's Description meta tag.
    reqcontent1 = html1.xpath('head/meta[@name="Description"]/@content')
    # The reply-time div's class sometimes has a trailing space, so query both spellings.
    restime1 = html1.xpath('//div[@class="col-xs-12 col-sm-3 col-md-3 my-2 "]//text()')
    restime2 = html1.xpath('//div[@class="col-xs-12 col-sm-3 col-md-3 my-2"]//text()')
    print(restime1)
    if len(restime1) == 0 and len(restime2) == 0:
        print("No reply yet")
        restime = ''
        rescontent = ''
    else:
        restime = restime1[0] if restime1 else restime2[0]
        rescontent = html1.xpath('string(//div[@class="col-xs-12 col-md-12 column p-4 text-muted my-3"])').strip()
    print(rescontent)
    db(conn, reqcontent1[0], reqname, reqtime, resname, restime, rescontent, reqtype, isreply)
if __name__ == '__main__':
    # The list endpoint is paginated: 6 letters per page, offset = 6 * page index.
    for i in range(0, 100):
        print('***************************************************')
        page = 6 * i
        payload = {
            "PageCond/begin": page,
            "PageCond/length": 6,
            "PageCond/isCount": "true",
            "keywords": "", "orgids": "",
            "startDate": "", "endDate": "",
            "letterType": "", "letterStatue": ""
        }
        shijinOU(payload, "http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.mailList.biz.ext", i)
Summary: to extract values from page elements, first convert the raw HTML string into a parse tree with html1 = etree.HTML(html); the resulting tree can then be queried with XPath expressions to locate elements and read their values.
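A minimal, self-contained sketch of that pattern (the markup here is invented for illustration):

from lxml import etree

html = ('<html><head><meta name="Description" content="letter body"/></head>'
        '<body><div class="reply">2020-02-08</div></body></html>')

tree = etree.HTML(html)  # parse raw HTML into an XPath-capable element tree
body = tree.xpath('//meta[@name="Description"]/@content')  # attribute values -> list
when = tree.xpath('//div[@class="reply"]/text()')          # text nodes -> list
print(body[0] if body else '', when[0] if when else '')    # guard against empty matches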
