爬取校园新闻首页的新闻

'''
使用开发者工具观察，每则新闻所在的分枝<li>
遍历每个li
排除一些非新闻列表的<li>

获取每条新闻的时间，标题，描述，链接。
通过链接获取新闻页，取得新闻正文，作者，发布时间等数据项
对info字符串进行分解，取得时间，作者，审核等数据项。
'''
import requests
import re
from bs4 import BeautifulSoup

def getClickCount(str,url):
    requ=requests.get(url)
    requ.encoding='utf-8'
    # print("match练习"+re.match(str+'(.*).html',url).group(1).split('/')[-1])
    # print("search练习"+re.search('\_(.*).html',url).group(1).split('/')[-1])
    # print("findall练习"+re.findall('\d(.*).html',url)[0])
    return re.search('\_(.*).html',url).group(1).split('/')[-1]
url='http://news.gzcc.cn/html/2018/xiaoyuanxinwen_0404/9183.html'
str='http://news.gzcc.cn/html/2018/xiaoyuanxinwen_'
req=requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen')
req.encoding='utf-8'
soup=BeautifulSoup(req.text,'html.parser')
for news in soup.select('li'):
    if len(news.select('.news-list-title'))>0:
        time=news.select('.news-list-info')[0].contents[0].text
        title=news.select('.news-list-title')[0].text
        link=news.select('a')[0].attrs['href']
        description=news.select('.news-list-description')[0].text
        newsreq=requests.get(link)
        newsreq.encoding='utf-8'
        soupd=BeautifulSoup(newsreq.text,'html.parser')
        info=soupd.select('.show-info')[0].text
        datetime=info.lstrip('发布时间：')[:19] #具体发布时间
        author=info[info.find('作者：'):].split()[0].lstrip()[3:]#作者
        article=soupd.select('#content')[0].text
        clickcount=getClickCount(str,url)


        print(
              "\n标题："+title,
              "\n点击次数："+clickcount,
              "\n作者：" + author,
              "\n描述："+description,
              "\n链接："+link,
              "\n具体发布时间：" + datetime,
              "\n正文："+article)
        break

posted on 2018-04-03 17:51 242韦兴纳阅读(146) 评论(0) 收藏举报

刷新页面返回顶部

飞鹅

爬取校园新闻首页的新闻

导航

公告