# 爬取湖北师范大学官网公告和具体内容并保存成 JSON 格式
from urllib import request
import json
from bs4 import BeautifulSoup  #一个可以从html或者xml中提取结构化数据的python库
import re
def _fetch_soup(url, timeout=30):
    """Download *url* and return it parsed with BeautifulSoup's ``html.parser``.

    The body is decoded as UTF-8. ``timeout`` (seconds) bounds each blocking
    socket operation so a stalled server cannot hang the scraper forever.
    Network and decoding errors propagate to the caller.
    """
    headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'}
    page = request.Request(url, headers=headers)
    # Use the response as a context manager so the connection is always closed.
    with request.urlopen(page, timeout=timeout) as response:
        page_info = response.read().decode('utf-8')
    return BeautifulSoup(page_info, 'html.parser')


def _collect_news_links(soup):
    """Return ordered ``(href, title)`` pairs for news-site links found in *soup*.

    Only tags carrying both ``title`` and ``href`` attributes whose href
    contains the news-site prefix are kept. Links are de-duplicated by href
    (first occurrence wins) so every URL stays paired with its own title.
    """
    links = {}
    for tag in soup.find_all(title=True, href=True):
        href = tag.get('href')
        if 'http://www.news.hbnu.edu.cn/' in str(href):
            # setdefault keeps the first title seen for a URL; dicts preserve
            # insertion order, so the page order of the links is retained.
            links.setdefault(href, tag.get('title'))
    return list(links.items())


def _extract_article(soup):
    """Return ``(time_text, content)`` extracted from one article page.

    ``time_text`` is the text of the first element with class ``arti_update``
    or ``''`` when the page has none. ``content`` joins, with newlines, the
    distinct ``<p>`` texts that contain neither '通讯' nor '新闻'.
    """
    stamp = soup.find(class_="arti_update")
    time_text = stamp.get_text() if stamp is not None else ''

    paragraphs = []
    for tag in soup.find_all('p'):
        text = tag.get_text()
        if '通讯' not in text and '新闻' not in text:
            paragraphs.append(text)
    # dict.fromkeys drops repeated paragraphs while preserving first-seen order.
    content = '\n'.join(dict.fromkeys(paragraphs))
    return time_text, content


def hbnu():
    """Scrape announcements from the Hubei Normal University site and save them as JSON.

    Fetches the university home page, follows every distinct link that points
    to the news site, and collects each article's URL, title, update time and
    paragraph text. The result, ``{'湖北师范大学': [article, ...]}``, is written
    to a fixed JSON file and also printed. Returns ``None``.

    Raises:
        urllib.error.URLError: if any page cannot be downloaded.
        UnicodeDecodeError: if a page is not valid UTF-8.
        OSError: if the output file cannot be written.
    """
    home = _fetch_soup('http://www.hbnu.edu.cn/')

    articles = []
    for href, title in _collect_news_links(home):
        time_text, content = _extract_article(_fetch_soup(href))
        articles.append({'content':content,'href':href,'title':title,'time':time_text,'school':"湖北师范大学"})

    key = {'湖北师范大学':articles}
    # Raw string: same path as before without relying on invalid escape sequences.
    # Mode "w" (not "a+"): appending a second JSON document on a later run would
    # leave the file unparseable, so each run replaces the previous snapshot.
    with open(r"E:\小程序内容\学校\湖北师范大学.json", "w", encoding='utf-8_sig') as f:
        json.dump(key, f, sort_keys=True, indent=4, ensure_ascii=False)
    # Round-trip through JSON so the printed dict has the same key-sorted
    # layout as the data written to the file.
    key_1 = json.dumps(key, sort_keys=True, indent=4, ensure_ascii=False)
    print(json.loads(key_1))
                    
                
                
            
        
# 浙公网安备 33010602011771号