用requests库和BeautifulSoup4库爬取新闻列表

1、用requests库和BeautifulSoup4库,爬取校园新闻列表的时间、标题、链接、来源、详细内容;将其中的时间str转换成datetime类型;将取得详细内容的代码包装成函数。

import requests
from bs4 import BeautifulSoup
from datetime import datetime

gzccurl='http://news.gzcc.cn/html/xiaoyuanxinwen/'
res=requests.get(gzccurl)
res.encoding='utf-8'
soup =BeautifulSoup(res.text,'html.parser')

def getdetail(url):#将取得详细内容的代码包装成函数
    resn=requests.get(url)
    resn.encoding='utf-8'
    soupn=BeautifulSoup(resn.text,'html.parser')
    detail=soupn.select('.show-content')[0].text
    return (detail)

def getdetailtime(url):#将取得详细时间的代码包装成函数
    rest=requests.get(url)
    rest.encoding='utf-8'
    soupt=BeautifulSoup(rest.text,'html.parser')
    detailtime=soupt.select('.show-info')[0].text[16:24]
    return(detailtime)


for news in soup.select('li'):
    if len((news.select('.news-list-title')))>0:
        time=news.select('.news-list-info')[0].span.text#时间
        
        title=news.select('.news-list-title')[0].contents[0]#标题
        url=news.select('a')[0]['href']#链接
        source=news.select('.news-list-info')[0].select('span')[1].text
        detail=getdetail(url)

        dtime=getdetailtime(url)
        addt=time+'-'+dtime
        dt=datetime.strptime(addt,'%Y-%m-%d-%H:%M:%S')#将其中的时间str转换成datetime类型

 

2、爬取AOPA网站上的内容

 

import requests
from bs4 import BeautifulSoup
from datetime import datetime

aopaurl='http://www.aopa.org.cn/news/list.php?catid=8'
res=requests.get(aopaurl)
res.encoding='utf-8'
soup =BeautifulSoup(res.text,'html.parser')

def getdetail(url):#将取得详细内容的代码包装成函数
    resn=requests.get(url)
    resn.encoding='utf-8'
    soupn=BeautifulSoup(resn.text,'html.parser')
    detail=soupn.select('.content')[0].text
    return (detail)


for news in soup.select('.catlist'):
    #print(news)
    for news2 in news.select('.catlist_li'):
        if len((news2))>0:
            time=news2.select('span')[0].text #时间
            title=news2.select('a')[0]['title']#标题
            url=news2.select('a')[0]['href']#链接
            dt=datetime.strptime(time,'%Y-%m-%d %H:%M')#将其中的时间str转换成datetime类型
            detail=getdetail(url)
            print( "时间:",dt,"\t标题:",title,"\t链接:",url,"详情:",detail)

 

posted @ 2017-09-28 19:33  14郭子维  阅读(232)  评论(0编辑  收藏  举报