Crawler to fetch GB (national-standard administrative division) codes

# -*- coding: utf-8 -*-
# @Time : 2020/11/22 17:45
# @Author : Mr.Hei
import requests
import bs4
from bs4 import BeautifulSoup
import urllib.request

import re
import sys
def get_html(url):
    """Fetch *url* over HTTP and return the raw response body as bytes.

    The response object is closed via the context manager so the
    connection is not leaked (the original left it open).
    """
    with urllib.request.urlopen(url) as page:
        return page.read()

# Crawl the 2017 administrative-division-code pages on stats.gov.cn and
# dump every <a> tag found on the second-level pages into address.txt.
BASE_URL = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
# ICP-filing link that appears in every page footer; it is boilerplate,
# not a data page, and must be skipped.
ICP_FILING_URL = "http://www.miibeian.gov.cn/"


def _hrefs(html):
    """Return the href of every anchor in *html*, skipping anchors
    that have no href attribute (the original raised KeyError on those)."""
    soup = BeautifulSoup(html, 'html.parser')
    return [a.get('href') for a in soup.find_all("a") if a.get('href')]


def crawl(out_path="address.txt"):
    """Walk index page -> province pages -> sub-pages and write every
    anchor tag found on the sub-pages, one tag per line, to *out_path*.

    Network-bound: performs one HTTP request per page visited.
    """
    # Province links from the index page; the last anchor on the index
    # is the ICP filing link, not a province, so drop it.
    index_html = get_html(BASE_URL + 'index.html')
    province_urls = [BASE_URL + href for href in _hrefs(index_html)]
    province_urls.pop()

    # Second level: collect hrefs from each province page, filtering out
    # the footer ICP link. (The original removed items while iterating
    # the same list, which silently skips elements.)
    page_hrefs = []
    for province_url in province_urls:
        for href in _hrefs(get_html(province_url)):
            if href != ICP_FILING_URL:
                page_hrefs.append(href)

    # Third level: fetch each page and persist all of its anchor tags
    # verbatim, one per line.
    # TODO: some pages may be fetched/written more than once — dedupe later.
    with open(out_path, 'w', encoding='utf-8') as f:
        for href in page_hrefs:
            soup = BeautifulSoup(get_html(BASE_URL + href), 'html.parser')
            for tag in soup.find_all("a"):
                f.write(str(tag) + "\n")


if __name__ == "__main__":
    crawl()
After the file has been generated, start the regex matching step
# -*- coding: utf-8 -*-
# @Time : 2020/11/27 11:28
# @Author : Mr.Hei
import re

# Footer line present on every crawled page; boilerplate, not data.
_ICP_LINE = '<a class="STYLE3" href="http://www.miibeian.gov.cn/" target="_blank">京ICP备05034670号</a>'
# Matches one or more consecutive CJK (Chinese) characters; compiled
# once instead of re-parsed on every line.
_CHINESE_RE = re.compile(r"[\u4E00-\u9FA5]+")


def extract_chinese_entries(path="address.txt"):
    """Read *path* and return every odd-indexed line (1, 3, 5, ...) that
    contains at least one Chinese character, excluding the ICP footer line.

    Lines are returned stripped of surrounding whitespace. The file is
    closed deterministically via the context manager (the original
    leaked the handle and shadowed the builtin ``str``).
    """
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    entries = []
    # The crawler writes anchors on alternating lines, so only every
    # second line (starting at index 1) carries a readable name — TODO
    # confirm this alternation against the crawler's actual output.
    for line in lines[1::2]:
        line = line.strip()
        if line != _ICP_LINE and _CHINESE_RE.search(line):
            entries.append(line)
    return entries


if __name__ == "__main__":
    for entry in extract_chinese_entries():
        print(entry)
posted @ 2020-11-27 18:25  小黑仔学It  阅读(173)  评论(0)    收藏  举报