爬取湖北师范大学官网的公告及其具体内容，并保存为 JSON 格式

from urllib import request
import json
from bs4 import BeautifulSoup #一个可以从html或者xml中提取结构化数据的python库
import re
def _fetch_soup(url, headers, timeout=30):
    """Download *url* and return the page parsed with ``html.parser``.

    The body is decoded as UTF-8. The timeout keeps a stalled server from
    hanging the whole crawl.
    """
    page = request.Request(url, headers=headers)
    # Use the response as a context manager so the connection is always closed.
    with request.urlopen(page, timeout=timeout) as response:
        page_info = response.read().decode('utf-8')
    return BeautifulSoup(page_info, 'html.parser')


def _collect_news_links(soup, prefix):
    """Return ordered ``(href, title)`` pairs for tags whose href contains *prefix*.

    Only tags carrying both a ``title`` and an ``href`` attribute are
    considered. Duplicates are dropped by href, keeping the first occurrence,
    so every href stays paired with the title that came from the same tag.
    """
    links = []
    seen = set()
    for tag in soup.find_all(title=True, href=True):
        href = tag.get('href')
        if prefix not in str(href) or href in seen:
            continue
        seen.add(href)
        links.append((href, tag.get('title')))
    return links


def _extract_article(soup):
    """Return ``(time_text, content)`` extracted from one article page.

    ``time_text`` is the text of the first element with class ``arti_update``,
    or an empty string when the page has none. ``content`` is the text of all
    ``<p>`` elements joined by newlines, skipping paragraphs that contain
    '通讯' or '新闻' and dropping repeated paragraphs (first occurrence kept).
    """
    stamp = soup.find(class_="arti_update")
    time_text = stamp.get_text() if stamp is not None else ''

    paragraphs = []
    seen = set()
    for tag in soup.find_all('p'):
        text = tag.get_text()
        if '通讯' in text or '新闻' in text or text in seen:
            continue
        seen.add(text)
        paragraphs.append(text)
    return time_text, '\n'.join(paragraphs)


def hbnu():
    """Scrape news items linked from the HBNU home page and save them as JSON.

    The home page is fetched, every link pointing at the news site is
    followed, and for each article the update time and paragraph text are
    extracted. The result is a dict with the single key '湖北师范大学' mapping
    to a list of records (``content``, ``href``, ``title``, ``time``,
    ``school``). It is written to a fixed path on drive E: and then printed.

    Returns:
        None. The output is the written file and the printed dict.

    Raises:
        urllib.error.URLError: if a page cannot be fetched.
        UnicodeDecodeError: if a page is not valid UTF-8.
        OSError: if a request times out or the output file cannot be opened.
    """
    headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'}
    school = "湖北师范大学"

    home = _fetch_soup('http://www.hbnu.edu.cn/', headers)
    links = _collect_news_links(home, 'http://www.news.hbnu.edu.cn/')

    records = []
    for href, title in links:
        # Each record is built from a single page, so the time and content
        # cannot drift out of step with the link they belong to.
        time_text, content = _extract_article(_fetch_soup(href, headers))
        records.append({'content':content,'href':href,'title':title,'time':time_text,'school':school})

    key = {school:records}
    # Serialize once and reuse the text for both the file and the printout.
    serialized = json.dumps(key,sort_keys=True,indent =4,ensure_ascii=False)

    # A raw string avoids the invalid "\小" / "\学" escapes while naming the
    # same path. Mode "w" replaces the previous output; appending would leave
    # several JSON documents in one file, which no JSON parser can load.
    with open(r"E:\小程序内容\学校\湖北师范大学.json", "w",encoding='utf-8_sig') as f:
        f.write(serialized)

    # Round-trip through JSON so the printed dict shows keys in sorted order.
    print(json.loads(serialized))

posted @ 2019-11-17 19:22 busishum1 阅读(199) 评论(1) 收藏举报

刷新页面返回顶部