
Web Scraping (Part 5): Data Cleaning with XPath and BeautifulSoup

Introduction to XPath and installing lxml

XPath expressions

 

If you are not fluent with regular expressions, processing HTML documents is exhausting. Is there another way?

Yes: XPath. We can first parse the HTML file into an XML document tree, then use XPath to find HTML nodes and elements.
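
Before diving in, here is a quick reference for the XPath syntax used throughout this post:

  • //tag — selects every matching tag anywhere in the document
  • /tag — selects tag only as a direct child of the current node
  • //tag[@attr="value"] — filters tags by an attribute value
  • //tag/@attr — selects the attribute's value itself, not the element
  • //tag[last()] — selects by position (XPath positions are 1-based)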

 

We need to install the lxml module to support XPath operations.

Install the dependency:

pip install lxml

 

Parsing HTML from a string

text = '''
<div>
    <ul>
        <li class="item-0"><a href="link1.html">张三</a></li>
        <li class="item-1"><a href="link2.html">李四</a></li>
        <li class="item-inactive"><a href="link3.html">王五</a></li>
        <li class="item-1"><a href="link4.html">赵六</a></li>
        <li class="item-0"><a href="link5.html">老七</a></li>
    </ul>
</div>
'''

from lxml import etree

# etree.HTML() parses the string into a special HTML document object
html = etree.HTML(text)

print(type(html))

# serialize the HTML object back into a string
result = etree.tostring(html, encoding="utf-8").decode()

print(result)
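
Note that etree.HTML() repairs the fragment as it parses: in the printed result you can see that lxml has wrapped the snippet in <html> and <body> tags.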

 

Parsing a local HTML file

How pages are handled in a scraper:
  • when data fetching and data cleaning happen in one step, use etree.HTML()
  • when fetching and cleaning are separate steps (the page is saved to disk first), use etree.parse(), as sketched below
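
A minimal sketch contrasting the two modes (example.com and saved_page.html stand in for a real page and file):

import requests
from lxml import etree

# Mode 1: fetch and clean together — parse the response text directly
html = etree.HTML(requests.get("http://example.com").text)

# Mode 2: fetch first, clean later — parse a page saved to disk
# (etree.parse defaults to a strict XML parser, so pass an HTMLParser for real-world HTML)
html = etree.parse("saved_page.html", etree.HTMLParser())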

from lxml import etree

# parse a local HTML document
# (etree.parse defaults to a strict XML parser, so pass an HTMLParser for real-world HTML)
html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html', etree.HTMLParser())

result = etree.tostring(html,encoding="utf-8").decode()

print(result)

 

Selecting all tags of one type

from lxml import etree

html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html', etree.HTMLParser())

result = html.xpath('//a')  # every a tag in the document, returned as a list of Element objects

print(result)

print(result[3].text)  # .text gives an element's direct text content

 

Selecting tags by attribute

from lxml import etree

html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html', etree.HTMLParser())

result1 = html.xpath('//li[@class="item-1"]/span')  # span children of every li whose class is "item-1"
result2 = html.xpath('//li[@class="item-100"]/a')  # a children of the li whose class is "item-100"

print(result1[0].text)

print(result2[0].text)

 

Getting tag attributes

from lxml import etree

html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html', etree.HTMLParser())

result1 = html.xpath('//li/@class')  # the class attribute of every li tag
result2 = html.xpath('//li[@class="item-100"]/a/@href')  # the href attribute of the selected a tag

print(result1)

print(result2)
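
Note that attribute paths (ending in /@...) return plain strings rather than Element objects, which is why the results can be printed directly without .text.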

 

Selecting child tags

Suppose a.html now contains:

<div>
    <ul>
        <li class="item-0"><a href="link1.html">张三</a></li>
        <li class="item-11">
            <a href="link2.html">
                <span class="nnpp">李四</span>
            </a>
            <span>好人</span>
        </li>
        <li class="item-1"><span>小正正</span></li>
        <li class="item-inactive">
            <a href="link3.html">
                <span class="nppp">王五</span>
            </a>
        </li>
        <li class="item-100"><a href="link4.html">赵六</a></li>
        <li class="item-0"><a href="link5.html">老七</a></li>
    </ul>
</div>

from lxml import etree

html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html', etree.HTMLParser())

result1 = html.xpath('//li/a')  # a tags that are direct children of li tags (single /)
result2 = html.xpath('//li//span')  # span tags at any depth below li tags (double //)

print(result1)

print(result2[0].text)

# every class attribute on a tags (or their descendants) inside li tags
result3 = html.xpath("//li//a//@class")

print(result3)

 

Getting tag content and tag names

from lxml import etree

html = etree.parse(r'C:\Users\Administrator\PycharmProjects\Reptiles\a.html', etree.HTMLParser())

# content of the a tag under the second-to-last li element
result1 = html.xpath('//li[last()-1]/a')

print(result1[0].text)

result2 = html.xpath('//li/a')

print(result2[-2].text)  # the same element, reached with Python's negative indexing

# tags whose class value is "bold" (assumes a.html contains at least two such tags;
# the snippet above does not, so adjust the file or the index before running)
result3 = html.xpath("//*[@class='bold']")

print(result3[1].tag)  # .tag holds the tag's name
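
XPath positions are 1-based: //li[last()] is the last li and //li[last()-1] the second-to-last, which is why Python's negative index result2[-2] lands on the same element.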

 

Scraping jokes from the web

import requests
from lxml import etree

url = 'https://www.qiushibaike.com/'

header = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58',
    'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6'
}

response = requests.get(url, headers=header).text

html = etree.HTML(response)

result1 = html.xpath('//div//a[@class="recmd-content"]/@href')  # href of every recommended-post link under a div

for site in result1:
    xurl = "https://www.qiushibaike.com" + site
    response2 = requests.get(xurl, headers=header).text
    html2 = etree.HTML(response2)
    result2 = html2.xpath("//div[@class='content']")
    print(result2[0].text)
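
One caveat: a content div often contains child tags such as <br/>, and Element.text only returns the text before the first child. To collect all of the text inside the div, XPath's string() function can replace the last print above:

    print(result2[0].xpath("string(.)"))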

 

Scraping images from Tieba

# Image scraper
import urllib
import urllib.request
from lxml import etree


class Spider(object):
    def __init__(self):
        self.tiebaName = "车模"  # name of the Tieba forum to scrape
        self.beginPage = 1
        self.endPage = 3
        self.url = "http://tieba.baidu.com/f?"
        self.ua_header = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
        self.fileName = 1  # running number used to name the saved images

    # build the URL of each results page
    def tiebaSpider(self):
        for page in range(self.beginPage, self.endPage + 1):
            pn = (page - 1) * 50  # Tieba paginates in steps of 50 posts
            wo = {'pn': pn, 'kw': self.tiebaName}
            word = urllib.parse.urlencode(wo)
            myurl = self.url + word
            self.loadPage(myurl)

    # fetch a results page and follow each thread link
    def loadPage(self, url):
        req = urllib.request.Request(url, headers=self.ua_header)
        data = urllib.request.urlopen(req).read()

        html = etree.HTML(data)
        links = html.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')

        for link in links:
            link = "http://tieba.baidu.com" + link
            self.loadImages(link)

    # fetch a thread's detail page and collect the image links
    def loadImages(self, link):
        req = urllib.request.Request(link, headers=self.ua_header)
        data = urllib.request.urlopen(req).read()
        html = etree.HTML(data)
        links = html.xpath('//img[@class="BDE_Image"]/@src')
        for imageslink in links:
            self.writeImages(imageslink)

    # download each image from its link and save it locally:
    def writeImages(self, imagesLink):
        print("Saving image:", self.fileName, "....")

        image = urllib.request.urlopen(imagesLink).read()

        # save the image to disk (the target folder must already exist)
        with open("C:\\Users\\Administrator\\Desktop\\贴吧图片\\" + str(self.fileName) + ".jpg", "wb") as file:
            file.write(image)

        self.fileName += 1


if __name__ == '__main__':
    mySpider = Spider()

    mySpider.tiebaSpider()
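
Note that the class names in the XPath expressions (threadlist_lz clearfix, BDE_Image) are tied to Tieba's markup at the time of writing and will need updating if the page structure changes; the target folder 贴吧图片 must also exist before the script runs.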

 

 

BeautifulSoup: introduction and installation

Installing BeautifulSoup

pip install beautifulsoup4
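
BeautifulSoup itself is only the interface: the actual parsing is delegated to a backend parser. The examples below pass "lxml" (installed earlier); if it is unavailable, Python's built-in "html.parser" also works.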

 

CSS selectors: beautifulsoup4

  • Like lxml, BeautifulSoup is an HTML/XML parser
  • Its main job is likewise to parse HTML/XML and extract data from it

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# parse HTML from a string
soup = BeautifulSoup(html, "lxml")

# parse a local html file
# soup2 = BeautifulSoup(open('index.html'), "lxml")

# print(soup)

# prettify() formats the soup object with indentation for readability
print(soup.prettify())

 

Getting tag information

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# parse HTML from a string
soup = BeautifulSoup(html, "lxml")

# access a tag by name: soup.<tag name>
print(soup.title)
print(soup.title.string)  # the text inside the tag

# get the tag's name
print(soup.title.name)

# .attrs holds a dict of all the tag's attributes
print(soup.p.attrs['name'])  # value of the first p tag's name attribute

# direct children, as a list
print(soup.head.contents)

# direct children, as an iterator (iterate over it to see the tags)
print(soup.head.children)

# all descendants, as a generator
print(soup.descendants)

for i in soup.p.descendants:  # every descendant of the first p tag; nested content is yielded as separate items
    print(i)
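
One gotcha: .string only returns text when the tag has exactly one child. For a tag with several children (like the second p in the sample) it returns None; use .get_text() to collect all nested text instead.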

 

 

Searching the document tree

 The document tree means all of the tags in the page.

# searching the document tree with find_all()

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# parse HTML from a string
soup = BeautifulSoup(html, "lxml")

# access a tag by name: soup.<tag name>
print(soup.title)
print(soup.title.string)  # the text inside the tag

# find all a tags by name; returns a ResultSet of Tag objects
data = soup.find_all("a")
print(type(data))  # <class 'bs4.element.ResultSet'>
print(data[0].string)
for i in data:
    print(i.string)

 

# searching the document tree with find_all()

from bs4 import BeautifulSoup
import re

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# parse HTML from a string
soup = BeautifulSoup(html, "lxml")

# access a tag by name: soup.<tag name>
# print(soup.title)
# print(soup.title.string)  # the text inside the tag

# Method 1
# find all a tags by name; returns a ResultSet of Tag objects
data = soup.find_all("a")
# print(type(data))  # <class 'bs4.element.ResultSet'>
# print(data[0].string)
# for i in data:
#     print(i.string)

# Method 2
# find tags whose name matches a regular expression (here: names starting with "b")
data2 = soup.find_all(re.compile("^b"))
for i in data2:
    print(i.string)

# Method 3
# find tags by attribute value
data3 = soup.find_all(id="link2")
for i in data3:
    print(i)

# Method 4
# find text nodes by their content
data4 = soup.find_all(text="Lacie")
data5 = soup.find_all(text=['Lacie','Tillie'])
data6 = soup.find_all(text=re.compile("Do"))  # text nodes containing "Do"
print(data4)
print(data5)
print(data6)
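
The text= searches match the text nodes themselves, so the results are plain strings rather than tags. In beautifulsoup4 4.4 and later the same argument is also spelled string=.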

 

CSS selectors

CSS selectors are searched with select().
They locate tags the same way a stylesheet would target them.
Selector types: tag selectors, class selectors, and id selectors.

# searching the document tree with CSS selectors: select()

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# parse HTML from a string
soup = BeautifulSoup(html, "lxml")

# CSS selector types: tag selectors, class selectors, id selectors

# find tags by tag name
data = soup.select('a')
print(type(data))  # <class 'bs4.element.ResultSet'>
print(data)

# find by class name
data2 = soup.select(".sister")
print(data2)

# find by id
data3 = soup.select("#link2")
print(data3)

# combined selectors
data4 = soup.select("p #link1")  # the tag with id "link1" inside a p tag
print(data4)

# find by any other attribute
data5 = soup.select('a[href="http://example.com/lacie"]')
print(data5)
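
select() always returns a list-like result; select_one() runs the same query but returns only the first match (or None). Attributes of a matched tag are read with dictionary-style indexing:

print(data5[0]['href'], data5[0].get_text())  # http://example.com/lacie Lacie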

 

Hands-on: scraping Tencent job postings

 Before scraping the postings, analyze the URLs that Tencent's careers site requests.

import urllib.request
import time
import re

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84."
                         "0.4147.125 Safari/537.36 Edg/84.0.522.59"}

timestamp = int(time.time())
# print(timestamp)

for page in range(1, 3):
    url = "https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=" + str(timestamp) + "&pageIndex=" + str(page) + "&pageSize=10"
    # print(url)

    req = urllib.request.Request(url, headers=headers)
    data = urllib.request.urlopen(req).read().decode()
    # print(data)

    # pull every PostId out of the JSON listing with a regex
    part = r'PostId":"(.*?)",'
    pattern = re.compile(part)

    data1 = pattern.findall(data)

    # print(data1)

    for post_id in data1:
        myurl = "https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=" + str(timestamp) + "&postId=" + str(post_id) + "&language=zh-cn"
        # print(myurl)
        req2 = urllib.request.Request(myurl, headers=headers)
        data3 = urllib.request.urlopen(req2).read().decode()

        # print(data3)

        part2 = r'RecruitPostName":"(.*?)",'  # job title
        pattern2 = re.compile(part2)
        name = pattern2.findall(data3)

        part3 = r'Responsibility":"(.*?)",'  # job responsibilities
        pattern3 = re.compile(part3)
        text = pattern3.findall(data3)
        
        print(name)
        print(text)


    print("-------------------------------------------------")

 
