爬取三级联动省市区银行支行信息
Note:省市区竟然隐藏在js代码里。
新建表:
-- Branch table filled by the Python crawler below.
-- NOTE(review): the crawler's INSERT only populates sub_branch_name,
-- province, city and bank_name; the *_id columns stay NULL.
-- NOTE(review): there is no PRIMARY KEY and the secondary index is not
-- UNIQUE, so re-running the crawler will insert duplicate rows — consider
-- adding a surrogate PK and a UNIQUE key if dedup matters.
CREATE TABLE IF NOT EXISTS `bank_branchs_perfect` (
  `sub_branch_name` varchar(255) NOT NULL,
  `sub_branch_id` varchar(50) DEFAULT NULL,
  `province_id` int(11) DEFAULT NULL,
  `province` varchar(255) DEFAULT NULL,
  `city_id` int(11) DEFAULT NULL,
  `city` varchar(255) DEFAULT NULL,
  `bank_name` varchar(255) NOT NULL,
  `bank_id` int(11) DEFAULT NULL,
  KEY `idx_banks_name` (`bank_id`,`province_id`,`city_id`,`sub_branch_id`) USING BTREE
-- utf8mb4 instead of utf8: MySQL's "utf8" is the 3-byte utf8mb3 subset and
-- cannot store all characters (e.g. emoji, rare CJK).
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=COMPACT;
#!/usr/bin/env python
# encoding: utf-8
"""
@author: roc
@contact: roc@csdn.com
@software: PyCharm
@file: bank.py
@create: 2021/9/17 11:37
"""
import time
import requests
from lxml import etree
from selenium import webdriver
import json
import pymysql
import re
# driver = webdriver.Chrome(r'C:\Users\Hurrian\AppData\Local\Google\Chrome\Application\chromedriver.exe')
# #driver.maximize_window() #最大化浏览器
# driver.implicitly_wait(23)
# Province -> city lookup table. Hoisted to module level so it is built once
# at import time; the original rebuilt this large dict on every getCity()
# call, which spider() invokes once per province.
CITYS_DIC = {
    "prov": ["上海", "内蒙古", "广东", "四川", "北京", "浙江", "福建", "重庆", "贵州", "陕西", "江苏", "天津", "湖南", "山东", "江西", "安徽", "甘肃",
             "河北", "河南", "辽宁", "广西", "海南", "黑龙江", "湖北", "吉林", "宁夏", "青海", "山西", "云南", "新疆", "西藏", "北京市"],
    "city": {
        "上海": ["上海"],
        "内蒙古": ["乌兰察布", "乌海", "兴安盟", "包头", "呼伦贝尔", "呼和浩特", "巴彦淖尔", "赤峰", "通辽", "鄂尔多斯", "锡林郭勒盟", "阿拉善盟"],
        "广东": ["东莞", "中山", "云浮", "佛山", "广州", "惠州", "揭阳", "梅州", "汕头", "汕尾", "江门", "河源", "深圳", "清远", "湛江", "潮州", "珠海",
               "肇庆", "茂名", "阳江", "韶关"],
        "四川": ["乐山", "内江", "凉山州", "凉山彝族自治州", "南充", "宜宾", "巴中", "广元", "广安", "德阳", "成都", "攀枝花", "泸州", "甘孜州",
               "甘孜藏族自治州", "眉山", "绵阳", "自贡", "资阳", "达州", "遂宁", "阿坝州", "阿坝藏族羌族自治州", "雅安"],
        "北京": ["北京"],
        "浙江": ["丽水", "台州", "嘉兴", "宁波", "杭州", "温州", "湖州", "绍兴", "舟山", "衢州", "金华"],
        "福建": ["三明", "南平", "厦门", "宁德", "泉州", "漳州", "福州", "莆田", "龙岩"],
        "重庆": ["重庆"],
        "贵州": ["六盘水", "安顺", "毕节地区", "贵阳", "遵义", "铜仁地区", "黔东南苗族侗族自治州", "黔南布依族苗族自治州", "黔西南州", "黔西南布依族苗族自治州"],
        "陕西": ["咸阳", "商洛", "安康", "宝鸡", "延安", "榆林", "汉中", "渭南", "西安", "铜川"],
        "江苏": ["南京", "南通", "宿迁", "常州", "徐州", "扬州", "无锡", "昆山", "泰州", "淮安", "盐城", "苏州", "连云港", "镇江"],
        "天津": ["天津"],
        "湖南": ["娄底", "岳阳", "常德", "张家界", "怀化", "株洲", "永州", "湘潭", "湘西土家族苗族自治州", "益阳", "衡阳", "邵阳", "郴州", "长沙"],
        "山东": ["东营", "临沂", "威海", "德州", "日照", "枣庄", "泰安", "济南", "济宁", "淄博", "滨州", "潍坊", "烟台", "聊城", "莱芜", "菏泽",
               "青岛"],
        "江西": ["上饶", "九江", "南昌", "吉安", "宜春", "抚州", "新余", "景德镇", "萍乡", "赣州", "鹰潭"],
        # NOTE(review): this 安徽 (Anhui) list visibly contains Fujian cities
        # (三明, 南平, 厦门, 宁德, 泉州, 漳州, 福州, 莆田, 龙岩) — probably a
        # copy-paste merge from the 福建 entry. Kept as-is because the site's
        # city indices are positional (city1..cityN); fixing the list without
        # checking the site would shift every index. TODO verify against the
        # site's JS data and correct.
        "安徽": ["三明", "亳州", "六安", "南平", "厦门", "合肥", "宁德", "安庆", "宣城", "宿州", "巢湖", "池州", "泉州", "淮北", "淮南", "滁州", "漳州",
               "福州", "芜湖", "莆田", "蚌埠", "铜陵", "阜阳", "马鞍山", "黄山", "龙岩"],
        "甘肃": ["临夏回族自治州", "临夏州", "兰州", "嘉峪关", "天水", "定西", "平凉", "庆阳", "张掖", "武威", "甘南州", "甘南藏族自治州", "白银", "酒泉",
               "金昌", "陇南"],
        "河北": ["保定", "唐山", "廊坊", "张家口", "承德", "沧州", "石家庄", "秦皇岛", "衡水", "邢台", "邯郸"],
        "河南": ["三门峡", "信阳", "南阳", "周口", "商丘", "安阳", "平顶山", "开封", "新乡", "洛阳", "济源", "漯河", "濮阳", "焦作", "许昌", "郑州",
               "驻马店", "鹤壁"],
        "辽宁": ["丹东", "大连", "抚顺", "朝阳", "本溪", "沈阳", "盘锦", "营口", "葫芦岛", "辽阳", "铁岭", "锦州", "阜新", "鞍山"],
        "广西": ["北海", "南宁", "崇左", "来宾", "柳州", "桂林", "梧州", "河池", "玉林", "百色", "贵港", "贺州", "钦州", "防城港"],
        "海南": ["三亚", "海口", "白沙"],
        "黑龙江": ["七台河", "伊春", "佳木斯", "双鸭山", "哈尔滨", "大兴安岭", "大庆", "牡丹江", "绥化", "鸡西", "鹤岗", "黑河", "齐齐哈尔"],
        "湖北": ["十堰", "咸宁", "孝感", "宜昌", "恩施", "恩施土家族苗族自治州", "武汉", "荆州", "荆门", "襄樊", "鄂州", "随州", "黄冈", "黄石"],
        "吉林": ["吉林", "吉林市", "四平", "延边朝鲜族自治州", "松原", "白城", "白山", "辽源", "通化", "长春"],
        "宁夏": ["中卫", "吴忠", "固原", "石嘴山", "银川"],
        "青海": ["果洛州", "果洛藏族自治州", "海东地区", "海北州", "海北藏族自治州", "海南州", "海南藏族自治州", "海西州", "海西蒙古族藏族自治州", "玉树州", "玉树藏族自治州",
               "西宁", "黄南州", "黄南藏族自治州"],
        "山西": ["临汾", "吕梁", "大同", "太原", "忻州", "晋中", "晋城", "朔州", "运城", "长治", "阳泉"],
        "云南": ["临沧", "丽江", "保山", "大理州", "大理白族自治州", "德宏傣族景颇族自治州", "德宏州", "怒江傈僳族自治州", "怒江州", "思茅", "文山", "文山壮族苗族自治州",
               "昆明", "昭通", "普洱", "曲靖", "楚雄州", "楚雄彝族自治州", "玉溪", "红河哈尼族彝族自治州", "红河州", "西双版纳傣族自治州", "迪庆州", "迪庆藏族自治州"],
        "新疆": ["乌鲁木齐", "伊犁哈萨克自治州", "克孜勒苏柯尔克孜自治州", "克拉玛依", "博尔塔拉蒙古自治州", "吐鲁番地区", "和田地区", "哈密地区", "喀什地区", "塔城地区",
               "巴音郭楞蒙古自治州", "昌吉回族自治州", "昌吉州", "阿克苏地区", "阿勒泰地区", "阿拉尔"],
        "西藏": ["山南地区", "拉萨", "日喀则地区", "昌都地区", "林芝地区", "那曲地区"],
        "北京市": ["北京"]
    }
}


def getCity(province):
    """Return the list of city names for the given province name.

    The index of a city in the returned list (+1) corresponds to the
    site's ``cityN`` URL segment used by spider().

    Raises:
        KeyError: if the province name is not in the lookup table.
    """
    return CITYS_DIC['city'][province]
def get_html_data(url):
    """Fetch `url` and return the response body decoded as text (UTF-8)."""
    # Spoof a desktop Chrome UA so the site serves the regular HTML page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    response = requests.get(url, timeout=30, headers=headers)
    return response.content.decode()
def spider(bank_name):
    """Crawl every branch listing page of one bank and store the rows.

    URL scheme observed from the site:
        .../wangdian/<bank_rank>-prov<i>         province listing
        .../wangdian/<bank_rank>-prov<i>-city<j> city listing
        ...-city<j>-p<k>                         paginated city listing
    Each final listing page is handed to scrapy_detail() for DB insertion.
    """
    bank_name_rank = get_bank_rank(bank_name)
    bank_name_url = 'https://www.rong360.com/credit/wangdian/'+str(bank_name_rank)+'-'
    # The site enumerates provinces as prov1..prov33.
    province_urls = [bank_name_url+"prov{}".format(str(i)) for i in range(1, 34)]
    for province_url in province_urls:
        home_data = get_html_data(province_url)
        result = etree.HTML(home_data)
        # Province name shown in the page's dropdown selector.
        province_name = result.xpath("//span[@id='province-selector']/span[@class='droptitle']//text()")[0]
        print(province_url+'===='+province_name)
        if province_name == '选择省份':
            # Placeholder title means this prov index has no data for this bank.
            continue
        city_list = getCity(province_name)
        print(city_list)
        for i,city in enumerate(city_list):
            city_name = city
            # City URL indices are 1-based; enumerate() is 0-based.
            i = str(int(i) + 1)
            # Final city-level listing URL.
            final_url = province_url+'-city'+i
            city_data = get_html_data(final_url)
            result = etree.HTML(city_data)
            # Second "next-page" anchor links to the LAST page; its href
            # carries the total page count.
            total_pages_url = result.xpath("//div[@id='page_section']/a[@class='next-page'][2]/@href")
            print(total_pages_url)
            if total_pages_url:
                # Multi-page: pull the page number out of the last-page href.
                total_pages_url = total_pages_url[0].split('-')[-1]
                # NOTE(review): '\d+' is an invalid escape sequence in modern
                # Python (DeprecationWarning) — should be a raw string r'\d+'.
                num = re.findall('\d+', total_pages_url)[0]
                total_pages_url = [final_url+"-p{}".format(str(i)) for i in range(1, int(num)+1)]
                print('====多页数据======')
                for x in total_pages_url:
                    scrapy_detail(x,province_name, city_name, bank_name)
            else:
                # Single page of results.
                print('====单页数据======')
                total_pages_url = final_url
                scrapy_detail(total_pages_url, province_name, city_name, bank_name)
        # NOTE(review): this break exits the province loop after the first
        # province that has data — looks like a debug/testing leftover.
        # Confirm intent before removing; without it all 33 provinces are
        # crawled.
        break
def scrapy_detail(detail_url, province_name, city_name, bank_name):
    """Parse one listing page and insert each branch name into MySQL.

    Fixes over the original:
      * one DB connection per page instead of one per branch row
        (the original connected/closed inside the row loop);
      * pymysql.connect() called with keyword arguments — the positional
        form was removed in PyMySQL 1.0;
      * rows are inserted in a single executemany() batch;
      * errors are no longer swallowed by a bare ``except:`` — the
        rollback is kept but the failure is reported.
    """
    branch_bank_data = get_html_data(detail_url)
    result = etree.HTML(branch_bank_data)
    # One <p class="p1"> text node per branch on the listing page.
    branch_list = result.xpath("//div[@class='clearfix list']/div[@class='lists']/p[@class=\"p1\"]//text()")
    if not branch_list:
        # Nothing to store for this page.
        return
    db = pymysql.connect(host='localhost', user='root',
                         password='root', database='python3')
    sql = """INSERT INTO bank_branchs_perfect (sub_branch_name, province, city, bank_name) VALUES (%s , %s, %s, %s) """
    try:
        with db.cursor() as cursor:
            # Batched insert: one round trip for the whole page.
            cursor.executemany(
                sql,
                [(branch_name, province_name, city_name, bank_name)
                 for branch_name in branch_list])
        # 提交到数据库中执行
        db.commit()
    except Exception as err:
        # 如果发生错误则回滚
        db.rollback()
        print("插入失败，已回滚: {}".format(err))
    finally:
        print("程序处理完毕")
        # 关闭数据库
        db.close()
# NOTE(review): the following runs at *import time* and performs a live HTTP
# request — importing this module without network access (or if the site is
# down) will raise. Consider moving this bootstrap into run().
url = 'https://www.rong360.com/credit/wangdian'
home_data = get_html_data(url)
result = etree.HTML(home_data)
# Bank display names and their hrefs, scraped from the homepage filter bar;
# the two lists are parallel (same anchor order).
bank_name_list = result.xpath("//div[@class='vals clearfix']/a//text()")
bank_name_sort = result.xpath("//div[@class='vals clearfix']/a//@href")
bank_ranks = []
for bank_name_url in bank_name_sort:
    # Last path segment of each href is the bank's numeric rank/id used in
    # listing URLs by spider().
    bank_sort = bank_name_url.split('/')[-1]
    bank_ranks.append(bank_sort)
'''根据银行名称获取银行序号'''
def get_bank_rank(bank_name):
    """Look up a bank's rank (the id segment of its listing URL) by name.

    Relies on the parallel module-level lists bank_name_list / bank_ranks
    scraped at import time. Raises KeyError for an unknown bank name.
    """
    name_to_rank = {name: rank for name, rank in zip(bank_name_list, bank_ranks)}
    return name_to_rank[bank_name]
def run():
    """Crawl branch data for every bank discovered on the homepage."""
    for name in bank_name_list:
        spider(name)


if __name__ == '__main__':
    run()
龙卷风之殇

浙公网安备 33010602011771号