获取全部校园新闻

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
def get_soup(url, timeout=10):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up and raises a timeout error. Without a timeout a single
            unresponsive server would block the whole crawl forever.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.
    """
    response = requests.get(url, timeout=timeout)
    # Force UTF-8 decoding instead of trusting the encoding that requests
    # guesses from the response headers.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')

def getDownNum(urls):
    """Return the hit counter of the article at *urls*, as a string.

    The article id is taken from the file name of the article URL
    (``.../<section>_<date>/<id>.html``) and sent to the counter endpoint
    on ``oa.gzcc.cn``. The counter value is then read from the
    ``('#hits').html('<value>');`` snippet in the endpoint's response.

    Args:
        urls: URL of a single article page (one URL, despite the name).

    Returns:
        The text found between the quotes of the ``#hits`` snippet.

    Raises:
        ValueError: If *urls* has no ``<id>.html`` file name, or the
            counter response does not contain the ``#hits`` snippet.
    """
    # Take the last "<name>.html" path segment as the id. This does not
    # depend on the year or section embedded in the URL, so articles from
    # any year resolve (the previous pattern hard-coded "2018").
    id_match = re.search(r'/([^/]+)\.html', urls)
    if id_match is None:
        raise ValueError('cannot extract article id from URL: ' + urls)
    html_id = id_match.group(1)

    down_url = 'http://oa.gzcc.cn/api.php?op=count&id=' + html_id + '&modelid=80'
    response = requests.get(down_url, timeout=10)

    # Raw string so that "\(" is a regex escape rather than an invalid
    # string escape; [^']* keeps the capture inside the quoted value even
    # if more statements follow the #hits one.
    hits_match = re.search(r"\('#hits'\)\.html\('([^']*)'\);", response.text)
    if hits_match is None:
        raise ValueError('hit counter not found in response from ' + down_url)
    return hits_match.group(1)

def getNewInfo(pageurl):
    """Print the details of every news article listed on *pageurl*.

    Each ``<li>`` of the listing page that contains a ``.news-list-text``
    element is treated as one article. For every such article the function
    downloads the article page and prints, in this order:

    * the title, prefixed with ``标题:``;
    * the link, prefixed with ``链接:``;
    * the non-blank pieces of the ``.show-info`` text, split on two
      non-breaking spaces, with the hit counter from ``getDownNum``
      printed after the last piece;
    * the text of the ``#content`` element.

    Args:
        pageurl: URL of a news listing page.

    Returns:
        None. All output goes to standard output.
    """
    listing = get_soup(pageurl)
    for item in listing.select('li'):
        text_boxes = item.select('.news-list-text')
        if not text_boxes:
            # This <li> is not a news entry (no .news-list-text inside).
            continue

        title = text_boxes[0].select('.news-list-title')[0].text
        link = item.a.attrs['href']

        # Fetch everything before printing so a failed download does not
        # leave a half-printed record.
        article = get_soup(link)
        content = article.select('#content')[0].text
        info_parts = article.select('.show-info')[0].text.split("\xa0\xa0")
        down_num = getDownNum(link)

        print('标题:' + title)
        print('链接:' + link)
        last_index = len(info_parts) - 1
        for index, part in enumerate(info_parts):
            if not part or part == ' ':
                continue  # skip empty separators left over from the split
            if index != last_index:
                print(part)
            else:
                # The hit counter is appended to the final info piece; the
                # trailing '' keeps the original output format (a trailing
                # space after the counter).
                print(part, down_num, '')
        print(content)

def getPageNum(url):
    """Return the article total shown on the listing page at *url*, divided by 10.

    The total is read from the text of the first ``.a1`` element. Only the
    first run of digits is used, so surrounding whitespace or a trailing
    unit label does not break the conversion.

    Args:
        url: URL of a news listing page.

    Returns:
        The total floor-divided by 10, as an int.
        NOTE(review): presumably 10 is the number of articles per page;
        confirm against the site.

    Raises:
        ValueError: If the ``.a1`` text contains no digits.
    """
    counter_text = get_soup(url).select('.a1')[0].text
    digits = re.search(r'\d+', counter_text)
    if digits is None:
        raise ValueError('no article count found in: ' + repr(counter_text))
    # Integer floor division avoids the float round-trip of int(x / 10).
    return int(digits.group()) // 10

n = getPageNum('http://news.gzcc.cn/html/xiaoyuanxinwen/')

# Crawl the listing pages: index 0 is the section's landing page, every
# other index i maps to "<i>.html" in the same directory.
# NOTE(review): the loop runs up to index n + 1 inclusive; confirm this
# matches the site's page numbering.
for i in range(n + 2):
    page_file = str(i) + '.html' if i else ''
    getNewInfo('http://news.gzcc.cn/html/xiaoyuanxinwen/' + page_file)

 

posted on 2018-04-11 14:58  148崔格畅  阅读(173)  评论(0)  编辑  收藏  举报