Fetching All Campus News

1. Extract all the news items on a single news list page, wrapped up as a function.

2. Get the total number of news articles and compute the total number of list pages (a worked sketch follows this list).

3. Fetch the full details of every news item on every list page.
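Step 2 amounts to a ceiling division, since each list page holds ten news items. A minimal worked sketch (the 991 figure is a made-up example; the real script reads the count from the page's .a1 element):

newsnum = 991  # hypothetical total article count
totalpage = newsnum // 10 + (0 if newsnum % 10 == 0 else 1)
print(totalpage)  # -> 100: 99 full pages of 10 items plus one partial page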

import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime

newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'


def getNewDetail(pageUrl):
    res = requests.get(pageUrl)  # response object for one list page
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            t = news.select('.news-list-title')[0].text  # title
            a = news.select('a')[0].attrs['href']  # link to the detail page
            description = news.select('.news-list-description')[0].text
            resd = requests.get(a)  # fetch the detail page once
            resd.encoding = 'utf-8'
            soupd = BeautifulSoup(resd.text, 'html.parser')
            content = soupd.select('#content')[0].text
            info = soupd.select('.show-info')[0].text
            d = info.lstrip('发布时间:')[:19]  # drop the label characters; the next 19 chars are the timestamp
            dt = datetime.strptime(d, '%Y-%m-%d %H:%M:%S')
            author = info[info.find('作者:'):].split()[0].lstrip('作者:')
            source = info[info.find('来源:'):].split()[0].lstrip('来源:')
            photo = info[info.find('摄影:'):].split()[0].lstrip('摄影:')
            print("新闻标题:", t)
            print("链接:", a)
            print("发布时间:", dt)
            print("作者:", author)
            print("来源:", source)
            print("摄影:", photo)
            print("描述:", description)
            getClickCount(a)
            print("正文:", content)



def getClickCount(a):
    # The trailing number in the detail-page URL identifies the article.
    newsid = re.search(r'_(.*)\.html', a).group(1)[-4:]
    clickcounturl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsid)
    # The count API returns a JavaScript snippet ending in .html('NUMBER');
    # split off the number and strip the surrounding quote and parenthesis.
    clickcount = int(requests.get(clickcounturl).text.split('.html(')[-1].lstrip("'").rstrip("');"))
    print('点击次数:', clickcount)


def getpagelist(path):
    res = requests.get(path)  # response object for the index page
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsnum = int(soup.select('.a1')[0].text.rstrip('条'))  # total number of news items
    if newsnum % 10 == 0:
        totalpage = newsnum // 10
    else:
        totalpage = newsnum // 10 + 1  # total number of list pages, 10 items per page

    getNewDetail(path)  # the first list page is the index page itself
    for i in range(2, totalpage + 1):  # the remaining pages are 2.html ... totalpage.html
        pageUrl = path + '{}.html'.format(i)
        getNewDetail(pageUrl)


getpagelist(newsurl)
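For reference, this is how getClickCount above turns a detail-page URL into the id the count API expects; the sample URL is hypothetical, following the site's visible pattern:

import re

a = 'http://news.gzcc.cn/html/2018/xiaoyuanxinwen_0404/9183.html'  # hypothetical example URL
newsid = re.search(r'_(.*)\.html', a).group(1)[-4:]  # greedy match gives '0404/9183'; the last 4 chars are '9183'
print('http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsid))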

  

4. Pick a topic you are personally interested in, crawl its data, and run a word-segmentation analysis; the topic must not duplicate any other student's.
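The script below relies on jieba for Chinese word segmentation. A minimal sketch of what jieba.cut returns (the sample sentence is arbitrary):

import jieba

words = list(jieba.cut('环球科技网的新闻信息'))  # cut() yields tokens lazily; list() materializes them
print(words)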

# Crawl news articles from the Huanqiu Tech site (tech.huanqiu.com)

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import jieba

newsurl = 'http://tech.huanqiu.com/internet/'


def sort(text):
    # Strip punctuation, segment with jieba, then count word frequencies.
    punctuation = '''一!“”,。?;’"',.、:\n'''
    for s in punctuation:
        text = text.replace(s, ' ')
    wordlist = list(jieba.cut(text))
    exclude = {'这', '\u3000', '\r', '\xa0', '的', '_', ' ', '将', '在', '是', '了', '一', '还', '也', '《', '》', '(', ')'}
    keywords = set(wordlist) - exclude
    counts = {}
    for key in keywords:
        counts[key] = wordlist.count(key)
    countlist = list(counts.items())
    countlist.sort(key=lambda x: x[1], reverse=True)
    print("top5关键词:")
    for i in range(5):
        print(countlist[i])
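# For reference, the counting loop above could also be written with the
# standard library's collections.Counter (an alternative, not what this
# script uses):
#     from collections import Counter
#     top5 = Counter(w for w in wordlist if w not in exclude).most_common(5)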


def getContent(url):
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup2 = BeautifulSoup(res.text, 'html.parser')
    for news in soup2.select('.l_a'):
        if len(news.select('.author')) > 0:
            author = news.select('.author')[0].text
            print("作者:", author)
    # rstrip() strips a character set, not a suffix; use replace() to drop the ad script.
    content = soup2.select('.la_con')[0].text.replace('AD_SURVEY_Add_AdPos("7000531");', '')
    print("正文:", content)
    sort(content)


def getNewDetails(newsurl):
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('.item'):
        title = news.select('a')[0].attrs['title']
        a = news.select('a')[0].attrs['href']
        brief = news.select('h5')[0].text.replace('[详细]', '')  # drop the "read more" marker
        when = news.select('h6')[0].text
        dt = datetime.strptime(when, '%Y-%m-%d %H:%M')
        print("新闻标题:", title)
        print("链接:", a)
        print("内容简介:", brief)
        print("时间:", dt)
        getContent(a)
        print('\n')


res = requests.get(newsurl)  # the parsed index page is only needed by the commented-out pagination code below
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
getNewDetails(newsurl)
# for total in soup.select('#pages'):
#     all = int(total.select('a')[0].text.rstrip('条'))  # total item count, for computing the page count
#     # print(all)
#     if all % 60 == 0:
#         totalpages = all // 60
#     else:
#         totalpages = all // 60 + 1
#     print(totalpages)
#     for i in range(1, totalpages + 1):  # walk the news on every list page
#         PageUrl = newsurl + '{}.html'.format(i)
#         getNewDetails(PageUrl)
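Both scripts repeat the same requests.get / encoding / BeautifulSoup boilerplate. A small hardening sketch of a shared helper (hypothetical, not part of the scripts above) that also fails fast on HTTP errors and hung connections:

def fetch(url):
    # Hypothetical helper: one place for the fetch-and-parse boilerplate.
    res = requests.get(url, timeout=10)  # do not hang forever on a dead server
    res.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page
    res.encoding = 'utf-8'
    return BeautifulSoup(res.text, 'html.parser')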
