import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
def get_soup(url):
    """Fetch *url* and return a BeautifulSoup parse tree of the page.

    The response is force-decoded as UTF-8 regardless of the declared
    charset, since the target site serves UTF-8 pages but does not always
    advertise it in the headers.
    """
    # timeout keeps a dead/slow host from hanging the whole crawl.
    resp = requests.get(url, timeout=10)
    resp.encoding = 'utf-8'
    return BeautifulSoup(resp.text, 'html.parser')
def getDownNum(urls):
    """Return the click ("hits") count string for a news article URL.

    The hit count is not embedded in the article HTML; it is served by a
    separate counter API (oa.gzcc.cn) keyed on the article's numeric id,
    which this function extracts from the article URL.
    """
    pagename = urls.split('/')[-2].split('_')[0]
    # The article id is the trailing number in .../2018/<pagename>_<id>.html.
    html_id = (re.search('http://news.gzcc.cn/html/2018/' + pagename + r'_(.*).html',
                         urls).group(1).split('/')[-1])
    down_url = 'http://oa.gzcc.cn/api.php?op=count&id=' + html_id + '&modelid=80'
    reqd = requests.get(down_url, timeout=10)
    # The API answers with a JS snippet like $('#hits').html('123'); — pull
    # out the number.  Raw string fixes the invalid "\(" escapes that the
    # original non-raw pattern relied on (a SyntaxWarning in modern Python).
    down_num = re.search(r"\('#hits'\)\.html\('(.*)'\);", reqd.text).group(1)
    return down_num
def getNewInfo(pageurl):
    """Print title, link, metadata and body text for every article on a list page.

    For each <li> on *pageurl* that contains a ``.news-list-text`` block,
    fetch the linked article and print its title, URL, each non-empty
    ``.show-info`` metadata field (with the real hit count from the counter
    API substituted into the last field) and the article body.
    """
    soup = get_soup(pageurl)
    for item in soup.select('li'):
        # Skip <li> elements that are not news entries.
        if not item.select('.news-list-text'):
            continue
        title = item.select('.news-list-text')[0].select('.news-list-title')[0].text
        link = item.a.attrs['href']
        article = get_soup(link)
        content = article.select('#content')[0].text
        # The metadata line is a run of fields separated by double NBSPs.
        info = article.select('.show-info')[0].text.split("\xa0\xa0")
        down_num = getDownNum(link)
        print('标题:' + title)
        print('链接:' + link)
        for j, field in enumerate(info):
            if len(field) > 0 and field != ' ':
                if j != len(info) - 1:
                    print(field)
                else:
                    # Last field holds a hit-count placeholder; swap in the
                    # real count fetched from the API.
                    print(field.rstrip('次'), down_num, '次')
        print(content)
def getPageNum(url):
    """Return the number of full list pages, derived from the total item count.

    The ``.a1`` element on the list page reads like "123条" (123 items);
    at 10 items per page that yields 12 full pages.
    """
    newsoup = get_soup(url)
    total_items = int(newsoup.select('.a1')[0].text.rstrip('条'))
    # Floor division replaces the original int(int(x) / 10) round-trip.
    return total_items // 10
def main():
    """Crawl every page of the campus-news list and print each article."""
    base = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    n = getPageNum(base)
    # Page 0 is the bare list URL; subsequent pages are numbered .html files.
    # range(0, n + 2) preserves the original off-the-end fudge factor.
    for i in range(0, n + 2):
        if i == 0:
            getNewInfo(base)
        else:
            getNewInfo(base + str(i) + '.html')


if __name__ == '__main__':
    main()