爬虫相关
今日内容
-
-
数据解析的作用:
-
可以帮助我们实现聚焦爬虫
-
-
数据解析的实现方式:
-
正则
-
bs4
-
xpath
-
pyquery
-
-
数据解析的通用原理
-
问题1:聚焦爬虫爬取的数据是存储在哪里的?
-
都被存储在了相关的标签之中以及相关标签的属性中
-
-
1.定位标签
-
2.取文本或者取属性
-
-
import requests

# Send a desktop-browser User-Agent with every request (UA spoofing).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}

# How to download an image: fetch the raw bytes and write them in binary mode.
url = 'https://pic.qiushibaike.com/system/pictures/12223/122231866/medium/IZ3H2HQN8W52V135.jpg'
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of saving an error page as img.jpg.
response.raise_for_status()
img_data = response.content  # bytes
with open('./img.jpg', 'wb') as fp:
    fp.write(img_data)
# Alternative: urllib's urlretrieve saves a URL straight to a local file.
# Drawback: urlretrieve takes no headers argument, so the User-Agent cannot be
# spoofed on this call.
from urllib import request

url = 'https://pic.qiushibaike.com/system/pictures/12223/122231866/medium/IZ3H2HQN8W52V135.jpg'
request.urlretrieve(url, filename='./qiutu.jpg')
# Notebook output of the call above (a (filename, headers) tuple):
# ('./qiutu.jpg', <http.client.HTTPMessage at 0x28e3ec3e0f0>)
import re
import os

# Sample of the markup that wraps one picture on the listing page:
# <div class="thumb">
# <a href="/article/123591644" target="_blank">
# <img src="//pic.qiushibaike.com/system/pictures/12359/123591644/medium/PW7RCD4L98ZYSMQO.jpg" alt="糗事#123591644" class="illustration" width="100%" height="auto">
# </a>
# </div>

# Captures the src attribute of the <img> inside each "thumb" div.
# The literal is single-quoted because the pattern itself contains double
# quotes; use it with re.S so that ".*?" can span the line breaks in the markup.
ex = r'<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
# Download every picture from pages 1-3 of the qiushibaike image ranking.
import requests
from urllib import request
import os
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}

# Directory that receives the downloaded pictures; created if it is missing.
dirName = './imgLibs'
os.makedirs(dirName, exist_ok=True)

# URL template shared by every page (never modified); only the page number varies.
url = 'https://www.qiushibaike.com/imgrank/page/%d/'
# Captures the src of the <img> inside each "thumb" div. It does not depend on
# the page, so it is defined once outside the loop.
ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'

for page in range(1, 4):
    new_url = url % page
    # Step 1 (general crawl): fetch the full source of this page.
    page_text = requests.get(new_url, headers=headers).text
    # Step 2 (focused crawl): pull the picture addresses out of that source.
    # re.S lets ".*?" match across the line breaks inside each div.
    img_src_list = re.findall(ex, page_text, re.S)
    for src in img_src_list:
        # The captured address has no scheme, so one is prepended.
        src = 'https:' + src
        img_name = src.split('/')[-1]
        img_path = dirName + '/' + img_name  # ./imgLibs/xxxx.jpg
        request.urlretrieve(src, filename=img_path)
        print(img_name, '下载成功!!!')
-
bs4解析
-
bs4解析的原理:
-
实例化一个BeautifulSoup的对象,需要将即将被解析的页面源码数据加载到该对象中
-
调用BeautifulSoup对象中的相关方法和属性进行标签定位和数据提取
-
-
环境的安装:
-
pip install bs4
-
pip install lxml
-
-
BeautifulSoup的实例化:
-
BeautifulSoup(fp,'lxml'):将本地存储的一个html文档中的数据加载到实例化好的BeautifulSoup对象中
-
BeautifulSoup(page_text,'lxml'):将从互联网上获取的页面源码数据加载到实例化好的BeautifulSoup对象中
-
-
-
定位标签的操作:
-
soup.tagName:定位到第一个出现的tagName标签
-
属性定位:soup.find('tagName',attrName='value')
-
属性定位:soup.find_all('tagName',attrName='value'),返回值为列表
-
选择器定位:soup.select('选择器')
-
层级选择器:>表示一个层级 空格表示多个层级
-
-
-
取文本
-
.string:获取直系的文本内容
-
.text:获取所有的文本内容
-
-
取属性
-
tagName['attrName']
-
from bs4 import BeautifulSoup

# Load a local HTML document into a BeautifulSoup object. The with-block
# closes the file as soon as the document has been parsed.
with open('./test.html', 'r', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, 'lxml')

# Examples of locating tags (uncomment one at a time to try it):
# soup.div
# soup.find('div',class_='song')
# soup.find('a',id="feng")
# soup.find_all('div',class_="song")
# soup.select('#feng')
# soup.select('.tang > ul > li')
# soup.select('.tang li') #
# a_tag = soup.select('#feng')[0]
# print(a_tag)
# a_tag.text

# Text of the div whose class attribute is "haha left".
soup.find('div', class_='haha left').text

# More examples: taking text and taking attributes.
# soup.select('.left')
# div = soup.div
# div.string
# div = soup.find('div',class_="song")
# div.string
# a_tag = soup.select('#feng')[0]
# a_tag['href']

# Notebook output of the uncommented expression above: 'hahah'
# Crawl the whole novel (chapter title + chapter text) from
# http://www.shicimingju.com/book/sanguoyanyi.html
# NOTE: saving to disk is switched off in this cell. To keep the text, open
# 'sanguo.txt' for writing (utf-8) before the loop, write
# title + ':' + content + '\n' for every chapter, and close the file afterwards.
main_url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
page_text = requests.get(main_url, headers=headers).text

# Parse the chapter titles and the URL of each chapter's detail page.
soup = BeautifulSoup(page_text, 'lxml')
a_list = soup.select('.book-mulu > ul > li > a')  # a list of <a> tags
for a in a_list:
    title = a.string
    # href is joined onto the site root to form the detail-page URL.
    detail_url = 'http://www.shicimingju.com' + a['href']
    detail_page_text = requests.get(detail_url, headers=headers).text
    # Parse the chapter text from the detail page. A separate name is used so
    # the table-of-contents soup above is not overwritten.
    detail_soup = BeautifulSoup(detail_page_text, 'lxml')
    content = detail_soup.find('div', class_='chapter_content').text
    print(title, '下载成功!')
-
xpath解析
-
xpath解析的实现原理
-
1.实例化一个etree的对象,然后将即将被解析的页面源码加载到该对象中
-
2.使用etree对象中的xpath方法结合着不同形式的xpath表达式实现标签定位和数据提取
-
-
环境安装:
-
pip install lxml
-
-
etree对象的实例化:
-
etree.parse('test.html')
-
etree.HTML(page_text)
-
-
-
xpath表达式:xpath方法的返回值一定是一个列表
-
最左侧的/表示:xpath表达式一定要从根标签逐层进行标签查找和定位
-
最左侧的//表示:xpath表达式可以从任意位置定位标签
-
非最左侧的/:表示一个层级
-
非最左侧的//:表示跨多个层级
-
属性定位://tagName[@attrName="value"]
-
索引定位://tagName[index] 索引是从1开始
-
-
取文本:
-
/text():直系文本内容
-
//text():所有的文本内容
-
-
取属性:
-
/@attrName
-
# Demo of xpath expressions against a local document. Each bare tree.xpath(...)
# line is a notebook display expression; its value is not stored.
from lxml import etree
# Load the local file into an etree object.
tree = etree.parse('./test.html')
# A leading "/" starts at the root tag and walks down one level at a time.
tree.xpath('/html/head/title')
# A leading "//" locates the tag from any position in the document.
tree.xpath('//title')
# A "//" that is not leftmost spans multiple levels: every <p> under <body>.
tree.xpath('/html/body//p')
# tree.xpath('//p')
# Attribute locating plus /text(): direct text of the div whose class is "haha left".
tree.xpath('//div[@class="haha left"]/text()')
# Further examples (attribute locating, 1-based index locating, text and
# attribute extraction):
# tree.xpath('//div[@class="song"]')
# tree.xpath('//li[7]')
# tree.xpath('//a[@id="feng"]/text()')[0]
# tree.xpath('//div[@class="song"]//text()')
# tree.xpath('//a[@id="feng"]/@href')
# The next line is the pasted notebook output of the last uncommented
# expression; as a statement it is a list literal that does nothing.
['hahah', 'yaay']
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}

# Crawl the joke text and the author name from the qiushibaike text channel.
url = 'https://www.qiushibaike.com/text/'
page_text = requests.get(url, headers=headers).text

# Parse the content: every direct child div of #content-left is handled as one post.
tree = etree.HTML(page_text)
div_list = tree.xpath('//div[@id="content-left"]/div')
for div in div_list:
    # The leading "./" makes the expression relative to this div (local parsing).
    author_nodes = div.xpath('./div[1]/a[2]/h2/text()')
    # xpath returns an empty list when nothing matches; fall back to an empty
    # author instead of raising IndexError on a post with a different layout.
    author = author_nodes[0] if author_nodes else ''
    # //text() yields a list of text fragments; join them into one string.
    content = div.xpath('./a[1]/div/span//text()')
    content = ''.join(content)
    print(author, content)
# Crawl http://pic.netbian.com/4kmeinv/ and repair the garbled Chinese picture titles.
# Directory that receives the downloaded pictures; created if it is missing.
dirName = './meinvLibs'
os.makedirs(dirName, exist_ok=True)

# URL template for pages 2 and up; page 1 uses the bare listing URL instead.
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
for page in range(1, 11):
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = url % page
    page_text = requests.get(new_url, headers=headers).text
    tree = etree.HTML(page_text)
    a_list = tree.xpath('//div[@class="slist"]/ul/li/a')
    for a in a_list:
        # The src attribute is joined onto the site root to form the picture URL.
        img_src = 'http://pic.netbian.com' + a.xpath('./img/@src')[0]
        img_name = a.xpath('./b/text()')[0]
        # Repair the garbled title: encode back to bytes as ISO-8859-1, then
        # decode those bytes as GBK (presumably the page's real encoding).
        img_name = img_name.encode('iso-8859-1').decode('gbk')
        img_data = requests.get(img_src, headers=headers).content
        imgPath = dirName + '/' + img_name + '.jpg'
        with open(imgPath, 'wb') as fp:
            fp.write(img_data)
        print(img_name, '下载成功!!!')
# Collect every city name listed on https://www.aqistudy.cn/historydata/
history_url = 'https://www.aqistudy.cn/historydata/'
page_text = requests.get(history_url, headers=headers).text
tree = etree.HTML(page_text)
# The two lists could also be queried separately:
#   hot cities: //div[@class="bottom"]/ul/li/a/text()
#   all cities: //div[@class="bottom"]/ul/div[2]/li/a/text()
# Joining both paths with "|" makes one expression cover both layouts, which
# keeps the xpath more general.
city_xpath = ' | '.join([
    '//div[@class="bottom"]/ul/div[2]/li/a/text()',
    '//div[@class="bottom"]/ul/li/a/text()',
])
cities = tree.xpath(city_xpath)
cities

浙公网安备 33010602011771号