Fetching All Campus News

1. Extract all the news items from one news list page, wrapped up as a function.

2. Get the total number of news articles and compute the total number of list pages (see the worked example after this list).

3. Fetch the full details of every news item on every list page.
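
For step 2, the page count follows directly from the article count, since the site lays out 10 articles per list page. A worked example with an illustrative count (the real number is scraped from the page in the code below):

listcount = 992              # illustrative total; the real value comes from the '.a1' element
n = listcount // 10 + 1      # 992 // 10 + 1 = 100 list pages
# note: when the count is an exact multiple of 10, this formula over-counts by one page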

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re


# Get a news article's click count from the site's counter API
def getClickCount(url):
    # Pull the news ID out of the article URL with a regular expression
    newsId = re.findall(r'_(.*)\.html', url)[0][-4:]
    # Build the Request URL of the click counter
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    clickRes = requests.get(clickUrl)
    # The counter replies with a JavaScript snippet; extract the number from it
    clickCount = int(re.search(r"hits'\).html\('(.*)'\);", clickRes.text).group(1))
    return clickCount


def getNewDetail(newsurl):
    # Fetch and parse a news detail page
    resDescript = requests.get(newsurl)
    resDescript.encoding = 'utf-8'
    soupDescript = BeautifulSoup(resDescript.text, 'html.parser')
    title = soupDescript.select('.show-title')[0].text
    info = soupDescript.select('.show-info')[0].text
    # The info line reads "发布时间:... 作者:... 审核:... 来源:... 摄影:... 点击:";
    # every field except the timestamp is optional, so test before matching
    if '作者' in info:
        author = re.search(r'作者:((.{2,20}\s|.{2,20}、|.{2,20},){1,5})', info).group(1)
    else:
        author = 'none'
    if '审核' in info:
        right = re.search(r'审核:((.{2,20}\s|.{2,20}、|.{2,20},){1,5})', info).group(1)
    else:
        right = 'none'
    if '来源' in info:
        source = re.search(r'来源:((.{2,50}\s|.{2,50}、|.{2,50},){1,5})', info).group(1)
    else:
        source = 'none'
    if '摄影' in info:
        video = re.search(r'摄影:((.{2,50}\s|.{2,50}、|.{2,50},){1,5})', info).group(1)
    else:
        video = 'none'
    # Match the timestamp explicitly; str.lstrip('发布时间:') strips a character
    # set rather than a prefix, so it is fragile
    dt = datetime.strptime(re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', info).group(),
                           '%Y-%m-%d %H:%M:%S')
    content = soupDescript.select('.show-content')[0].text.strip()
    click = getClickCount(newsurl)
    print('Title: {0}\nPublished: {1}\nAuthor: {2}\nReviewed: {3}\nSource: {4}\nPhotography: {5}\nClicks: {6}'.format(
        title, dt, author, right, source, video, click))


# Extract every news item on one list page and fetch its details
def getListPage(listPageUrl):
    res1 = requests.get(listPageUrl)
    res1.encoding = 'utf-8'
    soup = BeautifulSoup(res1.text, 'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            a = news.select('a')[0].attrs['href']
            getNewDetail(a)


resn = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
resn.encoding = 'utf-8'
soupn = BeautifulSoup(resn.text, 'html.parser')
# Total number of articles: the '.a1' element holds the count, so keep its digits
listcount = int(re.search(r'\d+', soupn.select('.a1')[0].text).group())
print(listcount)
# Total number of list pages, at 10 articles per page
n = listcount // 10 + 1

# First page
# getListPage('http://news.gzcc.cn/html/xiaoyuanxinwen/')

# Last page only
for i in range(n, n + 1):
    pageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    getListPage(pageUrl)
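
The commented-out call above covers the first page and the loop only visits the last page n. To complete step 3 and crawl every list page, the two can be combined; a minimal sketch, assuming pages 2 through n all follow the '{}.html' pattern used above:

getListPage('http://news.gzcc.cn/html/xiaoyuanxinwen/')  # page 1 has no numeric index
for i in range(2, n + 1):
    pageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    getListPage(pageUrl)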

 

 

4. Pick a topic of personal interest, crawl its data, and run word-segmentation analysis on it (here: Tencent Sports NBA interview articles).

import requests
from bs4 import BeautifulSoup
import jieba


def getnewsdetail(newsurl):
    # Fetch the article body and run a word-frequency analysis on it
    resDescript = requests.get(newsurl)
    resDescript.encoding = 'utf-8'
    soupDescript = BeautifulSoup(resDescript.text, 'html.parser')
    content = soupDescript.select('.text')[0].text.strip()
    words = jieba.lcut(content)
    # Tally how often each distinct word appears
    wcdict = {}
    for i in set(words):
        wcdict[i] = words.count(i)
    # Common stopwords, whitespace, and punctuation to drop from the tally;
    # defined once, outside the counting loop
    delete = {'已经', '没有', '他们', '什么', '一个', '的', '了', '是', '在', '和',
              '我', '他', '你', '也', '都', '就', ' ', '-', '\n', '.', ',', '。', '、', ':'}
    for i in delete:
        if i in wcdict:
            del wcdict[i]
    # Sort by frequency, descending, and print the top 20
    sort_word = sorted(wcdict.items(), key=lambda d: d[1], reverse=True)
    for item in sort_word[:20]:
        print(item)


def getnewslist(newsurl):
    # The Tencent Sports list pages are GBK-encoded
    res = requests.get(newsurl)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'html.parser')
    for newsList in soup.select('.list01')[0].select('li'):
        title = newsList.select('a')[0].text
        newsurl = newsList.select('a')[0]['href']
        print('\nTitle: {0}\nLink: {1}\n'.format(title, newsurl))
        getnewsdetail(newsurl)


url = "http://sports.qq.com/l/basket/original/qqinterview/list20150821155646.htm"

# Page 1 uses the bare URL; later pages append an "_{index}" suffix
for i in range(1, 30):
    if i == 1:
        getnewslist(url)
    else:
        newsurl = "http://sports.qq.com/l/basket/original/qqinterview/list20150821155646_{}.htm".format(i)
        getnewslist(newsurl)
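
The manual tally in getnewsdetail calls words.count() once per distinct word, rescanning the whole list each time. The standard library's collections.Counter does the same count in a single pass; a minimal sketch of that frequency step, assuming content already holds the article text:

from collections import Counter
import jieba

stopwords = {'已经', '没有', '他们', '什么', '一个', ' ', '-', '\n', '.'}

def top_words(content, k=20):
    # Segment the text, drop stopwords, and tally in one pass
    counts = Counter(w for w in jieba.lcut(content) if w not in stopwords)
    return counts.most_common(k)  # the k most frequent (word, count) pairs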

 
