Web Scraping (Part 5): Data Cleaning with the XPath and BeautifulSoup Modules
Introduction to XPath and installing lxml
XPath expressions
If your regular expressions are shaky, processing HTML documents with them is exhausting. Is there another way?
There is: XPath. We can first parse the HTML into an XML-style document tree and then use XPath to locate HTML nodes and elements.
We need the lxml module to support XPath operations.
Install the dependency
pip install lxml
Parsing HTML from a string
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">张三</a></li>
<li class="item-1"><a href="link2.html">李四</a></li>
<li class="item-inactive"><a href="link3.html">王五</a></li>
<li class="item-1"><a href="link4.html">赵六</a></li>
<li class="item-0"><a href="link5.html">老七</a></li>
</ul>
</div>
'''
from lxml import etree
# etree.HTML() parses the string into a special HTML object
html = etree.HTML(text)
print(type(html))
# Convert the html object back into a string
result = etree.tostring(html,encoding="utf-8").decode()
print(result)
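
Once parsed, XPath queries can run directly against the object. A minimal sketch using the text snippet above, pulling each link's text and href:

names = html.xpath('//li/a/text()')  # text of every a tag
links = html.xpath('//li/a/@href')   # href of every a tag
for name, link in zip(names, links):
    print(name, link)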

Parsing a local HTML file
How scrapers handle pages
When data fetching and data cleaning happen in one step, use HTML()
When fetching and cleaning are separate steps (the page is already saved to disk), use parse()
from lxml import etree
# Load a local html document
html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html')
result = etree.tostring(html,encoding="utf-8").decode()
print(result)
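
Note that etree.parse() defaults to an XML parser, which rejects HTML that is not well-formed. For messy real-world pages, passing an explicit HTML parser is safer; a hedged variant:

from lxml import etree
# HTMLParser is lenient about unclosed tags and other HTML quirks
parser = etree.HTMLParser()
html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html', parser)
print(etree.tostring(html, encoding="utf-8").decode())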

Selecting all tags of one kind
from lxml import etree
html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html')
result = html.xpath('//a') # all a tags in the document
print(result)
print(result[3].text)

Selecting tags by attribute
(The selectors below match the richer a.html listed under "Getting child tags" further down.)
from lxml import etree
html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html')
result1 = html.xpath('//li[@class="item-1"]/span') # span tags under li elements with class item-1
result2 = html.xpath('//li[@class="item-100"]/a') # the a tag under the li element with class item-100
print(result1[0].text)
print(result2[0].text)

Getting tag attributes
from lxml import etree
html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html')
result1 = html.xpath('//li/@class') # class attribute of every li tag
result2 = html.xpath('//li[@class="item-100"]/a/@href') # href of the a tag under the li with class item-100
print(result1)
print(result2)
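
Attribute queries return plain strings, detached from their elements. When both the text and an attribute of the same element are needed, it is often cleaner to select the elements once and read both through the element API; a small sketch:

# Select the elements once, then read text and attributes from each
for a in html.xpath('//li/a'):
    print(a.text, a.get('href'))  # .get() reads an attribute by name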

Getting child tags
The remaining examples assume a.html contains the following:
<div>
<ul>
<li class="item-0"><a href="link1.html">张三</a></li>
<li class="item-11">
<a href="link2.html">
<span class="nnpp">李四</span>
</a>
<span>好人</span>
</li>
<li class="item-1"><span>小正正</span></li>
<li class="item-inactive">
<a href="link3.html">
<span class="nppp">王五</span>
</a>
</li>
<li class="item-100"><a href="link4.html">赵六</a></li>
<li class="item-0"><a href="link5.html">老七</a></li>
</ul>
</div>
from lxml import etree
html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html')
result1 = html.xpath('//li/a') # a tags that are direct children of li
result2 = html.xpath('//li//span') # span tags anywhere inside li
print(result1)
print(result2[0].text)
# class attributes of a tags (and anything nested inside them) under li
result3 = html.xpath("//li//a//@class")
print(result3)
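
The difference between / and // is doing the work here: / matches only direct children, while // matches descendants at any depth. A quick comparison on the same file:

direct = html.xpath('//li/span')   # span tags that are direct children of li
nested = html.xpath('//li//span')  # span tags at any depth inside li
print(len(direct), len(nested))    # nested matches at least as many as direct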

Getting tag content and tag names
from lxml import etree
html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html')
# Content of the a tag under the second-to-last li element
result1 = html.xpath('//li[last()-1]/a')
print(result1[0].text)
result2 = html.xpath('//li/a')
print(result2[-2].text)
# Tag names of elements whose class is 'bold' (assumes a.html contains at least two such elements)
result3 = html.xpath("//*[@class='bold']")
print(result3[1].tag) # .tag holds the tag name
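
Besides last(), XPath predicates support position() and plain 1-based indexes; a couple of hedged examples against the same file:

first = html.xpath('//li[1]/a/text()')           # XPath indexes start at 1
first_three = html.xpath('//li[position()<=3]')  # the first three li elements
print(first, len(first_three))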

Scraping jokes from Qiushibaike
import requests
from lxml import etree
url = 'https://www.qiushibaike.com/'
header = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58',
'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6'
}
response = requests.get(url,headers=header).text
html = etree.HTML(response)
result1 = html.xpath('//div//a[@class="recmd-content"]/@href') # href of every recommended-post link
for site in result1:
    xurl = "https://www.qiushibaike.com" + site
    response2 = requests.get(xurl,headers=header).text
    html2 = etree.HTML(response2)
    result2 = html2.xpath("//div[@class='content']")
    print(result2[0].text)
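
One caveat: .text on an lxml element returns only the text before its first child tag, so posts containing <br> or nested markup get truncated. itertext() walks every text node instead; a hedged variant for the same 'content' div:

# Join all text nodes inside the element, including those in child tags
content = "".join(result2[0].itertext()).strip()
print(content)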

Scraping images from Tieba
# Image spider
import urllib.parse
import urllib.request
from lxml import etree

class Spider(object):
    def __init__(self):
        self.tiebaName = "车模"
        self.beginPage = 1
        self.endPage = 3
        self.url = "http://tieba.baidu.com/f?"
        self.ua_header = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1 Trident/5.0;"}
        self.fileName = 1

    # Build the search url for each list page
    def tiebaSpider(self):
        for page in range(self.beginPage, self.endPage + 1):
            pn = (page - 1) * 50
            wo = {'pn': pn, 'kw': self.tiebaName}
            word = urllib.parse.urlencode(wo)
            myurl = self.url + word
            self.loadPage(myurl)

    # Fetch a list page and collect the links to individual posts
    def loadPage(self, url):
        req = urllib.request.Request(url, headers=self.ua_header)
        data = urllib.request.urlopen(req).read()
        html = etree.HTML(data)
        links = html.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
        for link in links:
            link = "http://tieba.baidu.com" + link
            self.loadImages(link)

    # Fetch a post page and collect its image links
    def loadImages(self, link):
        req = urllib.request.Request(link, headers=self.ua_header)
        data = urllib.request.urlopen(req).read()
        html = etree.HTML(data)
        links = html.xpath('//img[@class="BDE_Image"]/@src')
        for imageslink in links:
            self.writeImages(imageslink)

    # Download an image by its link and save it locally
    def writeImages(self, imagesLink):
        print("Saving image:", self.fileName, "....")
        image = urllib.request.urlopen(imagesLink).read()
        # Write the image to disk
        file = open("C:\\Users\\Administrator\\Desktop\\贴吧图片\\" + str(self.fileName) + ".jpg", "wb")
        file.write(image)
        file.close()
        self.fileName += 1

if __name__ == '__main__':
    mySpider = Spider()
    mySpider.tiebaSpider()
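
As written, any single failed request (a deleted post, a dead image link) crashes the whole run. A hedged helper that the load methods could call in place of urlopen directly; fetch is a hypothetical name, not part of the original:

import urllib.request

def fetch(url, headers, retries=2):
    # Minimal sketch: retry a request a few times, return None on failure
    for attempt in range(retries + 1):
        try:
            req = urllib.request.Request(url, headers=headers)
            return urllib.request.urlopen(req, timeout=10).read()
        except Exception as exc:
            print("request failed:", exc)
    return None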


Introduction to BeautifulSoup and installation
Installing BeautifulSoup
pip install beautifulsoup4
CSS selectors: beautifulsoup4
- Like lxml, BeautifulSoup is an HTML/XML parser
- Its main job is likewise parsing and extracting data from HTML/XML
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Parse HTML from a string
soup = BeautifulSoup(html,"lxml")
# Parse a local html file
# soup2 = BeautifulSoup(open('index.html'))
# print(soup)
# prettify() pretty-prints (formats and indents) the soup object
print(soup.prettify())
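
The second argument chooses the parser backend. "lxml" needs the lxml package installed; Python's built-in "html.parser" needs nothing extra, at some cost in speed and leniency:

# Same API, different parser backend; no third-party dependency required
soup_builtin = BeautifulSoup(html, "html.parser")
print(soup_builtin.title.string)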

Getting tag information
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Parse HTML from a string
soup = BeautifulSoup(html,"lxml")
# Get a tag by name: soup.<tagname>
print(soup.title)
print(soup.title.string) # the text inside the tag
# Get the tag's name
print(soup.title.name)
# .attrs is a dict of all the tag's attributes
print(soup.p.attrs['name']) # value of the name attribute on the first p tag
# Direct children, as a list
print(soup.head.contents)
# Direct children, as a generator
print(soup.head.children)
# All descendants, as a generator
print(soup.descendants)
for i in soup.p.descendants: # every node inside the first p tag; nested text appears again as its own item
    print(i)
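
Because .children and .descendants are generators, printing them shows only a generator object; wrap them in list() to inspect the items:

# Generators must be consumed before their contents are visible
print(list(soup.head.children))  # direct children of head
print(list(soup.p.descendants))  # every node nested inside the first p tag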

Searching the document tree
The document tree is simply all the tags in the document.
# Searching the document tree with find_all()
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Parse HTML from a string
soup = BeautifulSoup(html,"lxml")
# Get a tag by name: soup.<tagname>
print(soup.title)
print(soup.title.string) # the text inside the tag
# Find all a tags by name; returns a result set of tag objects
data = soup.find_all("a")
print(type(data)) # <class 'bs4.element.ResultSet'>
print(data[0].string)
for i in data:
    print(i.string)
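
When only the first match matters, find() is the shorthand: it returns a single tag (or None) instead of a result set:

first_a = soup.find("a")  # the first a tag, or None if there is none
print(first_a["href"])    # attributes support dict-style access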

# 搜索文档树 find_all()
from bs4 import BeautifulSoup
import re
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Parse HTML from a string
soup = BeautifulSoup(html,"lxml")
# Get a tag by name: soup.<tagname>
# print(soup.title)
# print(soup.title.string) # the text inside the tag
# Method 1
# Find all a tags by name; returns a result set of tag objects
data = soup.find_all("a")
# print(type(data)) # <class 'bs4.element.ResultSet'>
# print(data[0].string)
# for i in data:
#     print(i.string)
# Method 2
# Find tags whose name matches a regular expression
data2 = soup.find_all(re.compile("^b"))
for i in data2:
    print(i.string)
# Method 3
# Find tags by attribute
data3 = soup.find_all(id="link2")
for i in data3:
    print(i)
# Method 4
# Find nodes by their text content
data4 = soup.find_all(text="Lacie")
data5 = soup.find_all(text=['Lacie','Tillie'])
data6 = soup.find_all(text=re.compile("Do")) # text containing "Do"
print(data4)
print(data5)
print(data6)
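
In current BeautifulSoup releases the text= keyword also goes by string=, which is the documented spelling; both behave the same here:

# string= is the newer name for the text= keyword
data7 = soup.find_all(string="Lacie")
print(data7)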

CSS Selectors
CSS selectors: querying with select()
Find tags the way a stylesheet addresses them.
Selector types: tag selectors, class selectors, and id selectors.
# Searching the document tree with CSS selectors via select()
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Parse HTML from a string
soup = BeautifulSoup(html,"lxml")
# Selector types: tag selectors, class selectors, and id selectors
# Select by tag name
data = soup.select('a')
print(type(data)) # <class 'bs4.element.ResultSet'>
print(data)
# Select by class name
data2 = soup.select(".sister")
print(data2)
# Select by id
data3 = soup.select("#link2")
print(data3)
# Combined selectors
data4 = soup.select("p #link1") # the tag with id link1 inside a p tag
print(data4)
# Select by another attribute
data5 = soup.select('a[href="http://example.com/lacie"]')
print(data5)
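
select() always returns a list of matches; select_one() returns just the first one, mirroring the find_all()/find() split:

# select_one returns the first matching tag, or None
first_sister = soup.select_one(".sister")
print(first_sister["id"])  # -> link1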

Hands-on: scraping Tencent job postings
Before scraping, analyze the URL: the careers site loads its listings from a JSON API whose Query endpoint takes timestamp, pageIndex, and pageSize parameters, and whose ByPostId endpoint takes a postId.
import urllib.request
import time
import re
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84."
"0.4147.125 Safari/537.36 Edg/84.0.522.59"}
timestamp = int(time.time())
# print(timestamp)
for x in range(1, 3):
    page = x
    url = "https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=" + str(timestamp) + "&pageIndex=" + str(page) + "&pageSize=10"
    # print(url)
    req = urllib.request.Request(url, headers=headers)
    data = urllib.request.urlopen(req).read().decode()
    # print(data)
    part = r'PostId":"(.*?)",'
    pattern = re.compile(part)
    data1 = pattern.findall(data)
    # print(data1)
    for post_id in data1:
        myurl = "https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=" + str(timestamp) + "&postId=" + str(post_id) + "&language=zh-cn"
        # print(myurl)
        req2 = urllib.request.Request(myurl,headers=headers)
        data3 = urllib.request.urlopen(req2).read().decode()
        # print(data3)
        part2 = r'RecruitPostName":"(.*?)",'
        pattern2 = re.compile(part2)
        name = pattern2.findall(data3)
        part3 = r'Responsibility":"(.*?)",'
        pattern3 = re.compile(part3)
        text = pattern3.findall(data3)
        print(name)
        print(text)
        print("-------------------------------------------------")

