Winter Break Big Data Study Notes 15

  Finished the data-crawling stage: 33,335 records in total, all saved to the database. The full script is below.

import re
import time

import pymysql
import requests
from fake_useragent import UserAgent
from lxml import etree


def open_file(file):
    """Read one originalId per line from a text file."""
    original_id = []
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank lines instead of crashing on them
                original_id.append(line)
    return original_id


def open_url(url, file, letter_type):
    """Fetch each letter's detail page, parse it, and save it to MySQL."""
    original_id = open_file(file)
    for oid in original_id:
        detail_url = url + oid
        header = {
            "User-Agent": UserAgent().random  # random UA to reduce blocking
        }
        req = requests.get(detail_url, headers=header, timeout=10)
        html = etree.HTML(req.text)
        try:
            question_title = html.xpath(
                '//div[contains(@class,"col-xs-10 col-sm-10")]//strong/text()')[0].strip()
            question_date = html.xpath(
                '//div[contains(@class,"col-xs-5 col-lg-3")]/text()')[0].strip()
            question_content = html.xpath(
                '//div[contains(@class,"col-xs-12 col-md-12 column p-2")]//text()')
            # On complaint (投诉) pages the organ sits inside a <span>:
            # reply_organ = html.xpath(
            #     '//div[contains(@class,"col-xs-9 col-sm-7")]//span/text()')[0].strip()
            reply_organ = html.xpath(
                '//div[contains(@class,"col-xs-9 col-sm-7")]/text()')[1].strip()
            reply_date = html.xpath(
                '//div[contains(@class,"col-xs-12 col-sm-3")]/text()')[0].strip()
            reply_content = html.xpath(
                '//div[contains(@class,"col-xs-12 col-md-12 column p-4")]//text()')
            date_pattern = re.compile(r"(\d{4}-\d\d-\d\d)")
            q_date = date_pattern.findall(question_date)[0]
            q_con = "".join(question_content).strip()
            r_date = date_pattern.findall(reply_date)[0]
            r_con = "".join(reply_content).strip()
            print(question_title, q_date, reply_organ, r_date, letter_type)
            r = add(open_conn("letter"),
                    question_title,
                    q_date,
                    q_con,
                    reply_organ,
                    r_date,
                    r_con,
                    oid,
                    letter_type)
            print(r)
        except IndexError:
            # Some detail pages are missing a field; skip those records.
            print("missing field, skipping originalId:", oid)
        time.sleep(0.5)  # throttle requests a little
        print("=" * 20)


def open_conn(dbname):
    """Open a new MySQL connection (one per insert; simple but not efficient)."""
    return pymysql.connect(
        host="localhost",
        port=3306,
        user="root",
        passwd="123456",
        db=dbname,
        charset="utf8")


def add(
        db,
        question_title,
        question_date,
        question_content,
        reply_organ,
        reply_date,
        reply_content,
        original_id,
        letter_type):
    """Insert one record into detail_letter, then close the connection."""
    cursor = db.cursor()
    sql = ("insert into detail_letter"
           "(question_title,question_date,question_content,reply_organ,"
           "reply_date,reply_content,original_id,type)"
           " values(%s,%s,%s,%s,%s,%s,%s,%s)")
    cursor.execute(
        sql,
        [question_title,
         question_date,
         question_content,
         reply_organ,
         reply_date,
         reply_content,
         original_id,
         letter_type])
    db.commit()
    db.close()
    return "Insert succeeded!"


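# The notes don't show the DDL for detail_letter; this is a minimal sketch of
# the schema the script assumes, with column names taken from the INSERT above
# and types inferred (adjust lengths/types to your data):
#
#   CREATE TABLE detail_letter (
#       question_title   VARCHAR(255),
#       question_date    DATE,
#       question_content TEXT,
#       reply_organ      VARCHAR(255),
#       reply_date       DATE,
#       reply_content    TEXT,
#       original_id      VARCHAR(64),
#       type             VARCHAR(16)
#   ) DEFAULT CHARSET = utf8;
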
if __name__ == '__main__':
    # Three letter types, three detail endpoints; run them one at a time.
    # open_url(
    #     "http://www.beijing.gov.cn/hudong/hdjl/com.web.complain.complainDetail.flow?originalId=",
    #     "tousu.txt",
    #     "投诉")  # complaints
    # open_url(
    #     "http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId=",
    #     "zixun.txt",
    #     "咨询")  # consultations
    open_url(
        "http://www.beijing.gov.cn/hudong/hdjl/com.web.suggest.suggesDetail.flow?originalId=",
        "jianyi.txt",
        "建议")  # suggestions

 

posted @ 2020-02-16 16:46  一夕思醉