网络爬虫基础练习

取出h5标签的文本
取出a标签的链接
取出所有li标签的所有内容
取出一条新闻的标题、链接、发布时间、来源

 

import requests
from bs4 import BeautifulSoup
# Fetch the college home page and parse it with the built-in HTML parser.
url = 'http://www.gzcc.cn/'
# Without a timeout, requests waits indefinitely on an unresponsive server.
res = requests.get(url, timeout=10)
# Stop on 4xx/5xx responses instead of scraping an error page.
res.raise_for_status()
# Decode the body as UTF-8 explicitly so res.text is not left to guesswork.
res.encoding = 'utf-8'
page = BeautifulSoup(res.text, 'html.parser')

# Print the text of the first <h5> tag; page.h5 is None when there is none.
if page.h5 is not None:
    print(page.h5.text)

# Print the address of every image on the page.
urlList = page.select('img')
for item in urlList:
    # Some <img> tags carry no src attribute; .get() avoids a KeyError.
    src = item.get('src')
    if src is not None:
        print(src)
# Fetch a single news article and parse it.
newsurl = 'http://news.gzcc.cn/html/2018/xiaoyuanxinwen_0328/9113.html'
# Without a timeout, requests waits indefinitely on an unresponsive server.
newsRes = requests.get(newsurl, timeout=10)
# Stop on 4xx/5xx responses instead of scraping an error page.
newsRes.raise_for_status()
# Decode the body as UTF-8 explicitly so newsRes.text is not left to guesswork.
newsRes.encoding = 'utf-8'
newsPage = BeautifulSoup(newsRes.text, 'html.parser')

# Print the title (first element with class "show-title"), if present.
titleNodes = newsPage.select('.show-title')
if titleNodes:
    print(titleNodes[0].text)

# Print the info line (first element with class "show-info"), if present.
infoNodes = newsPage.select('.show-info')
if infoNodes:
    print(infoNodes[0].text)

# Print the address of every image inside the first "show-content" element.
contentNodes = newsPage.select('.show-content')
imagePath = contentNodes[0].select('img') if contentNodes else []
for item in imagePath:
    # Some <img> tags carry no src attribute; .get() avoids a KeyError.
    src = item.get('src')
    if src is not None:
        print(src)

 

posted @ 2018-03-29 22:01  234陈壬询  阅读(148)  评论(0)  编辑  收藏  举报