import requests
import pandas
import re
from bs4 import BeautifulSoup
from datetime import datetime
def getNewsDetail(newsUrl):
    """Download one article page and extract its main fields.

    Args:
        newsUrl: URL of the article page.

    Returns:
        A dict with the keys ``title``, ``dt`` (publish time as a
        ``datetime``), ``source`` (``'none'`` when the info line carries no
        source label), ``content``, ``click`` and ``newsUrl``.

    Raises:
        IndexError: if the page has no ``.show-title``, ``.show-info`` or
            ``.show-content`` element.
        ValueError: if the info line contains no
            ``YYYY-MM-DD HH:MM:SS`` timestamp.
    """
    resd = requests.get(newsUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')

    news = {}
    news['title'] = soupd.select('.show-title')[0].text
    info = soupd.select('.show-info')[0].text

    # Locate the timestamp by its shape instead of stripping a label prefix:
    # str.lstrip() takes a *set of characters*, not a prefix, so the result
    # depended on exactly which punctuation followed the label.
    stamp = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', info)
    if stamp is None:
        raise ValueError('no publish timestamp found in {!r}'.format(info))
    news['dt'] = datetime.strptime(stamp.group(0), '%Y-%m-%d %H:%M:%S')

    # Capture the whitespace-delimited token right after the source label.
    # Both the ASCII and the full-width colon are accepted. A capture group
    # (rather than lstrip) keeps source names that themselves begin with one
    # of the label characters intact, and a label at index 0 is still found.
    sourceMatch = re.search(r'来源[::](\S*)', info)
    news['source'] = sourceMatch.group(1) if sourceMatch else 'none'

    news['content'] = soupd.select('.show-content')[0].text.strip()
    news['click'] = getClickCount(newsUrl)
    news['newsUrl'] = newsUrl
    return news
def getClickCount(newsUrl):
    """Return the click counter value for the article at *newsUrl*.

    The article id is taken from the URL: the text between the first ``_``
    and the final ``.html`` is split on ``/`` and its second piece is used.
    That id is sent to the counter endpoint, and the last ``.html('...')``
    argument in the reply is parsed as an integer.

    Args:
        newsUrl: URL of the article page.

    Returns:
        The click count as an ``int``.

    Raises:
        ValueError: if *newsUrl* does not contain ``_ ... .html``, or if the
            counter reply does not end in an integer argument.
        IndexError: if the matched URL part contains no ``/``.
    """
    # Raw string with an escaped dot: the former '\_(.*).html' relied on an
    # invalid string escape and let '.' match any character.
    found = re.search(r'_(.*)\.html', newsUrl)
    if found is None:
        raise ValueError('cannot extract article id from {!r}'.format(newsUrl))
    newId = found.group(1).split('/')[1]

    clickUrl = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(newId)
    reply = requests.get(clickUrl).text
    # The tail of the reply looks like ('<digits>'); -- peel the wrapper
    # characters off both ends, leaving only the digits.
    tail = reply.split('.html')[-1]
    return int(tail.lstrip("('").rstrip("');"))
def getNewsList(pageUrl):
    """Fetch a list page and load the details of its first titled entry.

    Walks the ``<li>`` elements of the page; for the first one that contains
    a ``.news-list-title`` element, the target of its first link is passed to
    ``getNewsDetail`` and the scan stops. The detail result is not returned.

    Args:
        pageUrl: URL of the news list page.
    """
    response = requests.get(pageUrl)
    response.encoding = 'utf-8'
    document = BeautifulSoup(response.text, 'html.parser')
    for item in document.select('li'):
        # Skip list items that are not news entries.
        if not item.select('.news-list-title'):
            continue
        getNewsDetail(item.select('a')[0].attrs['href'])
        break
def getPageN(pageUrl, pageSize=10):
    """Return how many list pages are needed for all news items.

    The total item count is read from the ``#pages .a1`` element of the list
    page (its text with any surrounding ``条`` characters removed).

    Args:
        pageUrl: URL of a news list page that shows the pager.
        pageSize: Number of items per list page. Defaults to 10, the value
            that was previously hard-coded.
            NOTE(review): confirm the site really lists 10 items per page.

    Returns:
        The page count as an ``int``, rounded up so that a final, partially
        filled page is included.

    Raises:
        IndexError: if the page has no ``#pages .a1`` element.
        ValueError: if the element text is not an integer.
    """
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    total = int(soup.select('#pages .a1')[0].text.strip('条'))
    # Integer ceiling division: the previous int(total / 10) rounded down and
    # therefore dropped the last page whenever total was not a multiple of 10.
    return -(-total // pageSize)
def writeNewsDetail(content):
    """Append *content* to the UTF-8 text file ``gjcc`` in the working directory.

    Args:
        content: Text to append; written as-is, no newline is added.
    """
    # The context manager closes the file even if write() raises.
    with open("gjcc", "a", encoding='utf-8') as f:
        f.write(content)
def getListPage(pageUrl):
    """Collect the details of every news entry on one list page.

    Each ``<li>`` element containing a ``.news-list-title`` element is
    treated as a news entry; the target of its first link is passed to
    ``getNewsDetail``.

    Args:
        pageUrl: URL of the news list page.

    Returns:
        A list with one ``getNewsDetail`` result per entry, in page order.
    """
    response = requests.get(pageUrl)
    response.encoding = 'utf-8'
    document = BeautifulSoup(response.text, 'html.parser')
    return [
        getNewsDetail(item.select('a')[0].attrs['href'])
        for item in document.select('li')
        if item.select('.news-list-title')
    ]
pageUrl = "http://news.gzcc.cn/html/xiaoyuanxinwen/"

# Scrape the first list page once and reuse the result for both the printout
# and the collection (it used to be downloaded and parsed twice).
firstPageNews = getListPage(pageUrl)
print(firstPageNews)
newsTotal = []
newsTotal.extend(firstPageNews)

n = getPageN(pageUrl)
# NOTE(review): only page 2 is fetched here even though the page count n is
# available; widen to range(2, n + 1) if a full crawl is intended.
for i in range(2, 3):
    listPageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    newsTotal.extend(getListPage(listPageUrl))

df = pandas.DataFrame(newsTotal)
# The column is named 'source' because that is the key getNewsDetail stores;
# indexing with 'sources' raised KeyError.
print(df[['click', 'title', 'source']].head(6))
print(df[(df['click'] > 3000) & (df['source'] == '学校综合办')])
sou = ['国际学院', '学生工作处']
print(df[df['source'].isin(sou)])
df.to_excel('gjcc.xlsx')

# NOTE(review): the loop variable is unused -- the URL is always built for
# page 1 and the result is discarded. Left unchanged; confirm the intent.
for i in range(n, n + 1):
    listPageUrl = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(1)
    getListPage(listPageUrl)