获取全部学校新闻

1.取出一个新闻列表页的全部新闻 包装成函数。

import requests
from bs4 import BeautifulSoup

newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
res = requests.get(newsurl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')


def getallnews(url):
    """Fetch one news-list page and print the detail link of every item on it.

    Bug fix: the original body parsed the module-level ``soup`` (always the
    front page) and never used ``url``, so calling it with different page
    URLs printed the same links every time. Fetch the requested page instead.
    """
    page = requests.get(url)
    page.encoding = 'utf-8'
    page_soup = BeautifulSoup(page.text, 'html.parser')
    # Each <li> inside the '.news-list' container is one news entry whose
    # first <a> points at the article detail page.
    for item in page_soup.select('.news-list')[0].select('li'):
        print(item.a.attrs['href'])

2.获取总的新闻篇数,算出新闻总页数。

# Read the total item count from the '.a1' element (text like '共NNN条';
# rstrip('条') leaves the digits) and derive the number of list pages.
soup1 = BeautifulSoup(res.text, 'html.parser')
newscount = int(soup1.select('.a1')[0].text.rstrip('条'))
# Bug fix: ceiling division. The original `newscount // 10 + 1` reported one
# page too many whenever the count was an exact multiple of 10 (e.g. 100
# items -> 11 pages instead of 10).
newcount1 = (newscount + 9) // 10

3.获取全部新闻列表页的全部新闻详情。

import requests
from bs4 import BeautifulSoup


def getallnews(url):
    """Fetch one news-list page and print every news item's detail link.

    Bug fix: the original ignored ``url`` and re-parsed the module-level
    soup (always the first page), so every page of the crawl printed
    identical links.
    """
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for item in soup.select('.news-list')[0].select('li'):
        print(item.a.attrs['href'])


newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
res = requests.get(newsurl)
res.encoding = 'utf-8'
soup1 = BeautifulSoup(res.text, 'html.parser')
# '.a1' text looks like '共NNN条'; strip the trailing counter word.
newscount = int(soup1.select('.a1')[0].text.rstrip('条'))
newcount1 = (newscount + 9) // 10  # ceiling division: 10 items per page

# Page 1 is the index page itself; later pages are served as '2.html',
# '3.html', ... (the original requested '1.html' as well —
# NOTE(review): presumably not a valid list page, confirm against the site).
getallnews(newsurl)
for i in range(2, newcount1 + 1):
    getallnews('http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i))

4.找一个自己感兴趣的主题,进行数据爬取,并进行分词分析。不能与其它同学雷同。

import jieba
import requests
from bs4 import BeautifulSoup
from collections import Counter

# Pick one article link off the Tencent news front page and fetch it.
newsurl = 'http://news.qq.com/'
res = requests.get(newsurl)
soup = BeautifulSoup(res.text, 'html.parser')
newlist = soup.select('.Q-tpList')[0].select('a')[1].attrs['href']
print(newlist)
res2 = requests.get(newlist)
soup1 = BeautifulSoup(res2.text, 'html.parser')
content = soup1.select('.content-article')[1].text

# Segment the article text and count word frequencies.
news_word = jieba.lcut(content)
# Punctuation and function words to exclude from the statistics.
missword = {',', '。', '的', '地', '得', '一', '~', ';', ':', ''}
# Bug fixes vs. the original:
#  - it kept two identical dicts (`key` and `dict`, the latter shadowing the
#    builtin) and counted with O(n) list.count per unique word;
#  - stop-word removal iterated dict.items() — (word, count) tuples that can
#    never equal a single-character string — and `del wordkey2` only unbound
#    the loop variable, so nothing was ever filtered out.
wordcount = Counter(w for w in news_word if w not in missword)

for word, count in wordcount.items():
    print(word + ':' + str(count))

# Top 20 most frequent words. Slicing (instead of indexing a fixed
# range(20)) avoids an IndexError on articles with fewer unique words.
sort = sorted(wordcount.items(), key=lambda d: d[1], reverse=True)
for pair in sort[:20]:
    print(pair)

posted @ 2018-04-10 22:38  161蔡瑞奇  阅读(198)  评论(0编辑  收藏  举报