Crawling All Campus News

1. Get a single news item's title, link, time, source, content, and click count, and wrap the logic in a function (see the sketch below).
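
This step's function only appears inside the full program of step 4 below. As a standalone minimal sketch: the selectors '.show-info' and '.show-content' are taken from that later code, while the exact splitting of title and source out of the info line is left open, since it depends on the page layout.

import requests
from bs4 import BeautifulSoup

def getnewsdetail(url):
    # minimal sketch: fetch one detail page and pull out its parts
    resd = requests.get(url)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    info = soupd.select('.show-info')[0].text        # publish time / source line
    content = soupd.select('.show-content')[0].text  # article body
    return info, content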

2. Get the above details for every news item on one list page, and wrap that in a function.

def printnews(url):
    resp = requests.get(url)
    resp.encoding = 'utf-8'
    soupp = BeautifulSoup(resp.text, 'html.parser')
    for news in soupp.select('li'):
        if len(news.select('.news-list-title')) > 0:
            time = news.select('.news-list-info')[0].span.text  # time
            title = news.select('.news-list-title')[0].contents[0]  # title
            url = news.select('a')[0]['href']  # link
            source = news.select('.news-list-info')[0].select('span')[1].text  # source
            detail = getdetail(url)

            dtime = getdetailtime(url)
            addt = time + '-' + dtime
            dt = datetime.strptime(addt, '%Y-%m-%d-%H:%M:%S')  # convert the time string into a datetime

            curl = getclickurl(url)
            click = int(requests.get(curl).text.split('.')[-1].lstrip("html('").rstrip("');"))  # parse the count out of the JS reply
            print("Time:", time, "\tTitle:", title, "\tLink:", url, "\tClicks:", click)  # , "\nDetail:", detail
            print("---------------------|a cute divider line|----------------------------")


3. Get the URLs of all news list pages and call the function above on each.

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

gzccurl='http://news.gzcc.cn/html/xiaoyuanxinwen/'
res=requests.get(gzccurl)
res.encoding='utf-8'
soup =BeautifulSoup(res.text,'html.parser')

def getpage():
    lists = int(soup.select('.a1')[0].text.rstrip('条'))  # total item count, e.g. '986条' -> 986
    page = lists // 10 + 1  # 10 items per list page
    return page

for i in range(2, getpage() + 1):
    listurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    printnews(listurl)
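
An equivalent way to finish this step is to build the complete URL list up front (page 1 uses the bare directory path, exactly as step 4 does) and loop over it:

listurls = [gzccurl] + ['http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
                        for i in range(2, getpage() + 1)]
for listurl in listurls:
    printnews(listurl)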


4. Complete the crawl of all campus news.

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

gzccurl='http://news.gzcc.cn/html/xiaoyuanxinwen/'
res=requests.get(gzccurl)
res.encoding='utf-8'
soup =BeautifulSoup(res.text,'html.parser')
print("---------------------|--------------------|----------------------------\n---------------------|这是一条可爱的分割线|----------------------------\n---------------------|____________________|----------------------------")
def getdetail(url):  # wrap fetching the full article text in a function
    resn = requests.get(url)
    resn.encoding = 'utf-8'
    soupn = BeautifulSoup(resn.text, 'html.parser')
    detail = soupn.select('.show-content')[0].text
    return detail

def getdetailtime(url):  # wrap fetching the publish time in a function
    rest = requests.get(url)
    rest.encoding = 'utf-8'
    soupt = BeautifulSoup(rest.text, 'html.parser')
    detailtime = soupt.select('.show-info')[0].text[16:24]  # slice the HH:MM:SS part out of the info line
    return detailtime
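
# a worked illustration of the [16:24] slice, assuming (hypothetically) that the
# info line starts with '发布时间:2017-10-12 11:13:05':
# positions 16-23 of that string are the time of day, so text[16:24] -> '11:13:05'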

def getclickurl(url):  # build the click-counter API URL from a news URL
    id = re.match('http://news.gzcc.cn/html/2017/xiaoyuanxinwen_(.*).html', url).groups()
    clickurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(id[0].split('/')[1])
    return clickurl
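
# a quick illustration with a made-up URL (not a real article):
# getclickurl('http://news.gzcc.cn/html/2017/xiaoyuanxinwen_1012/8316.html')
# the regex captures '1012/8316'; split('/')[1] keeps the id '8316', yielding
# 'http://oa.gzcc.cn/api.php?op=count&id=8316&modelid=80'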

def printnews(url):
    resp = requests.get(url)
    resp.encoding = 'utf-8'
    soupp = BeautifulSoup(resp.text, 'html.parser')
    for news in soupp.select('li'):
        if len(news.select('.news-list-title')) > 0:
            time = news.select('.news-list-info')[0].span.text  # time
            title = news.select('.news-list-title')[0].contents[0]  # title
            url = news.select('a')[0]['href']  # link
            source = news.select('.news-list-info')[0].select('span')[1].text  # source
            detail = getdetail(url)

            dtime = getdetailtime(url)
            addt = time + '-' + dtime
            dt = datetime.strptime(addt, '%Y-%m-%d-%H:%M:%S')  # convert the time string into a datetime

            curl = getclickurl(url)
            click = int(requests.get(curl).text.split('.')[-1].lstrip("html('").rstrip("');"))  # parse the count out of the JS reply
            print("Time:", time, "\tTitle:", title, "\tLink:", url, "\tClicks:", click, "\nDetail:", detail)
            print("---------------------|a cute divider line|----------------------------")
            
        
def getpage():
    lists = int(soup.select('.a1')[0].text.rstrip('条'))  # total item count, e.g. '986条' -> 986
    page = lists // 10 + 1  # 10 items per list page
    return page

printnews(gzccurl)
for i in range(2, getpage() + 1):
    listurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    printnews(listurl)
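
Every fetch in this program repeats the same three lines and has no timeout, so one slow or unreachable article can stall the whole crawl. A small helper along these lines (not in the original post; an optional hardening sketch using the same requests/BeautifulSoup stack) would centralize that:

def fetch(url):
    # one place for encoding, timeout and error handling
    try:
        r = requests.get(url, timeout=10)
        r.encoding = 'utf-8'
        return BeautifulSoup(r.text, 'html.parser')
    except requests.RequestException as e:
        print('request failed:', url, e)
        return None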


5. Crawl the corresponding data for a topic of your own choosing.


import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

aopaurl='http://www.aopa.org.cn/news/list.php?catid=8'
res=requests.get(aopaurl)
res.encoding='utf-8'
soup =BeautifulSoup(res.text,'html.parser')

def getdetail(url):  # wrap fetching the full article text in a function
    resn = requests.get(url)
    resn.encoding = 'utf-8'
    soupn = BeautifulSoup(resn.text, 'html.parser')
    detail = soupn.select('.content')[0].text
    return detail

def printnews(url):
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('.catlist'):
        for news2 in news.select('.catlist_li'):
            if len(news2) > 0:
                time = news2.select('span')[0].text  # time
                title = news2.select('a')[0]['title']  # title
                url = news2.select('a')[0]['href']  # link
                dt = datetime.strptime(time, '%Y-%m-%d %H:%M')  # convert the time string into a datetime
                detail = getdetail(url)
                print("Time:", dt, "\tTitle:", title, "\tLink:", url)  # , "Detail:", detail

printnews(aopaurl)

for i in range(2, 44):  # pages 2-43 are hard-coded for this category
    listurl = 'http://www.aopa.org.cn/news/list-8-{}.html'.format(i)
    printnews(listurl)
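
For reference, the '%Y-%m-%d %H:%M' pattern used above parses list timestamps of the form '2017-10-12 11:13' (the sample value here is made up):

from datetime import datetime
print(datetime.strptime('2017-10-12 11:13', '%Y-%m-%d %H:%M'))  # 2017-10-12 11:13:00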

