'''
Scraping administrative division codes from the website of the Ministry of
Civil Affairs of the PRC (www.mca.gov.cn):
Key techniques:
1> When entering the second-level page (the data page), the URL is redirected
   by JavaScript, so the real URL has to be dug out of the second-level page
   source (a standalone regex demo follows the first script below).
2> Incremental database updates: store the data URL; on the next run, compare
   it with the stored one first. If they match, skip the update; otherwise
   refresh the data (the assumed version table is sketched right below).
'''
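'''
The script below assumes a MySQL database named govermentdb with a one-column
version table that stores the URL of the last dataset fetched. The original
notes do not show the schema; the following one-time setup is a minimal
sketch consistent with the queries used below (names and sizes are guesses):
'''
import pymysql

def init_version_table():
    # Connection parameters mirror those used by GovernmentSpider below
    db = pymysql.connect(host='localhost', user='root', password='123456',
                         database='govermentdb', charset='utf8')
    cursor = db.cursor()
    # One column: the URL of the most recently scraped data page
    cursor.execute('CREATE TABLE IF NOT EXISTS version (link VARCHAR(255))')
    db.commit()
    cursor.close()
    db.close()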
import requests
from lxml import etree
import re
import pymysql
class GovernmentSpider:
def __init__(self):
self.one_url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
}
        self.db = pymysql.connect(host='localhost', user='root', password='123456',
                                  database='govermentdb', charset='utf8')
self.cursor = self.db.cursor()
    # Extract the second-level page link (the fake link); it must be the newest one
def get_false_link(self):
html = requests.get(url=self.one_url, headers=self.headers).content.decode('utf-8', 'ignore')
parse_html = etree.HTML(html)
        a_list = parse_html.xpath('//a[@class="artitlelist"]')
for a in a_list:
title = a.xpath('./@title')[0].strip()
            # The Chinese pattern matches page titles like "县以上行政区划代码"
            # (division codes at or above the county level); keep it as-is,
            # since it must match the site's own wording
            if re.findall(r'.*以上行政区划代码', title, re.S):
two_false_link = 'http://www.mca.gov.cn' + a.get('href')
return two_false_link
    # Extract the real second-level page link (and fetch the data)
def get_true_link(self):
        # Get the response body
false_link = self.get_false_link()
html = requests.get(url=false_link, headers=self.headers).content.decode('utf-8', 'ignore')
        pattern = re.compile(r'window\.location\.href="(.*?)"', re.S)
real_link = pattern.findall(html)[0]
print(real_link)
        # Incremental crawl: look real_link up in the version table.
        # If it is there, the stored data is already current; otherwise fetch the latest data.
        sel = 'select * from version where link=%s'
        self.cursor.execute(sel, [real_link])
        # A non-empty result means the link already exists, so nothing needs to be fetched
if self.cursor.fetchall():
            print('Data is already up to date')
else:
            # Fetch the data first
self.get_data(real_link)
            # Then record real_link in the version table
ins = 'insert into version values(%s)'
self.cursor.execute(ins, [real_link])
self.db.commit()
    # The method that actually extracts the data
def get_data(self, real_link):
html = requests.get(url=real_link, headers=self.headers).text
parse_html = etree.HTML(html)
tr_list = parse_html.xpath('//tr[@height="19"]')
for tr in tr_list:
code = tr.xpath('./td[2]/text()')[0]
name = tr.xpath('./td[3]/text()')[0]
print(name, code)
    # Main entry point
    def main(self):
        self.get_true_link()
        # Close the cursor and the database connection when done
        self.cursor.close()
        self.db.close()

if __name__ == '__main__':
    spider = GovernmentSpider()
    spider.main()
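'''
A standalone illustration of technique point 1 above: the fake second-level
page only contains a JS redirect, and the real URL is recovered with a regex.
The HTML snippet here is made up purely for demonstration.
'''
import re

sample_html = '<script>window.location.href="http://www.mca.gov.cn/article/sj/xzqh/2019/20190416.html";</script>'
redirect_pattern = re.compile(r'window\.location\.href="(.*?)"', re.S)
print(redirect_pattern.findall(sample_html)[0])
# -> http://www.mca.gov.cn/article/sj/xzqh/2019/20190416.html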
'''
Scraping with selenium + Chrome instead: the browser executes the JS redirect
on the second-level page itself, so there is no need to dig the real URL out
of the page source, and the crawl becomes simpler. The assumed table schema
for this version is sketched below.
'''
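'''
A one-time setup sketch for the govdb database used by the selenium version.
The schema is not part of the original notes; tables and columns are inferred
from the INSERT statements in insert_mysql() (all names and sizes are assumptions):
'''
import pymysql

def init_govdb_tables():
    db = pymysql.connect(host='localhost', user='root', password='123456',
                         database='govdb', charset='utf8')
    cursor = db.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS version (link VARCHAR(255))')
    cursor.execute('CREATE TABLE IF NOT EXISTS province (name VARCHAR(64), code VARCHAR(12))')
    cursor.execute('CREATE TABLE IF NOT EXISTS city (name VARCHAR(64), code VARCHAR(12), province_code VARCHAR(12))')
    cursor.execute('CREATE TABLE IF NOT EXISTS county (name VARCHAR(64), code VARCHAR(12), city_code VARCHAR(12))')
    db.commit()
    cursor.close()
    db.close()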
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pymysql
class GovernmentSpider:
def __init__(self):
self.browser = webdriver.Chrome()
self.one_url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
        self.db = pymysql.connect(host='localhost', user='root', password='123456',
                                  database='govdb', charset='utf8')
self.cursor = self.db.cursor()
        # Three row buffers, so everything can be written with executemany()
self.province_list = []
self.city_list = []
self.county_list = []
    # Load the index page and pick the second-level link (the fake link is
    # enough here; the browser will follow the JS redirect itself)
def get_false_url(self):
self.browser.get(self.one_url)
        td_list = self.browser.find_elements(By.XPATH, '//td[@class="arlisttd"]/a[contains(@title,"代码")]')
if td_list:
            # Keep the element object itself, because it will be click()ed
two_url_element = td_list[0]
            # Incremental crawl: compare the link against the version table
two_url = two_url_element.get_attribute('href')
sel = 'select * from version where link=%s'
self.cursor.execute(sel, [two_url])
result = self.cursor.fetchall()
if len(result) != 0:
                print('Data is already up to date; nothing to crawl')
else:
                # Click through to the data page
two_url_element.click()
time.sleep(3)
                # Switch to the newly opened window
                all_handles = self.browser.window_handles
                self.browser.switch_to.window(all_handles[1])
                # Scrape the data
self.get_data()
                # When done, record two_url in the version table
ins = 'insert into version values(%s)'
self.cursor.execute(ins, [two_url])
self.db.commit()
    # Extract the administrative division codes from the second-level page
def get_data(self):
        tr_list = self.browser.find_elements(By.XPATH, '//tr[@height="19"]')
for tr in tr_list:
            code = tr.find_element(By.XPATH, './td[2]').text.strip()
            name = tr.find_element(By.XPATH, './td[3]').text.strip()
print(name, code)
            # Work out the level from the code and buffer the row for the
            # matching table (columns follow the table definitions)
            if code[-4:] == '0000':
                # Province level (code ends in 0000)
                self.province_list.append([name, code])
                # The four municipalities also count as their own city level;
                # the Chinese names must stay as-is to match the page content
                if name in ['北京市', '天津市', '上海市', '重庆市']:
                    city = [name, code, code[:2] + '0000']
                    self.city_list.append(city)
            elif code[-2:] == '00':
                # City level (code ends in 00 but not 0000)
                city = [name, code, code[:2] + '0000']
                self.city_list.append(city)
            else:
                # County level
                county = [name, code, code[:4] + '00']
                self.county_list.append(county)
        # Once every row has been collected, write them all with executemany()
self.insert_mysql()
def insert_mysql(self):
        # Clear the old rows before refreshing the tables
del_province = 'delete from province'
del_city = 'delete from city'
del_county = 'delete from county'
self.cursor.execute(del_province)
self.cursor.execute(del_city)
self.cursor.execute(del_county)
        # Insert the fresh rows
ins_province = 'insert into province values(%s,%s)'
ins_city = 'insert into city values(%s,%s,%s)'
ins_county = 'insert into county values(%s,%s,%s)'
self.cursor.executemany(ins_province, self.province_list)
self.cursor.executemany(ins_city, self.city_list)
self.cursor.executemany(ins_county, self.county_list)
self.db.commit()
        print('Scraping finished; data stored in the database')
def main(self):
self.get_false_url()
        # Close the cursor, the database connection, and the browser
self.cursor.close()
self.db.close()
self.browser.quit()
if __name__ == "__main__":
    spider = GovernmentSpider()
spider.main()
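'''
Optional tweak (not in the original notes): the selenium version can run
without opening a visible browser window by enabling Chrome's headless mode.
A minimal sketch, assuming a local chromedriver compatible with Chrome 59+:
'''
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')      # no visible browser window
options.add_argument('--disable-gpu')   # commonly recommended alongside headless mode
headless_browser = webdriver.Chrome(options=options)
# GovernmentSpider could be adapted to use headless_browser instead of the
# default webdriver.Chrome() created in its __init__.
headless_browser.quit()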