Using Beautiful Soup

1. Basic Usage

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())
print(soup.title.string)

----------------------
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title" name="dromouse">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    <!-- Elsie -->
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>
The Dormouse's story
  • In the html string above, the <body> and <html> tags are not closed. As the output shows, Beautiful Soup automatically repairs the malformed HTML.
  • soup.prettify() outputs the parsed document as a neatly indented string.
  • soup.title.string outputs the text inside the title node of the parsed document.
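
If lxml is not installed, the built-in Python parser can be used instead; the API stays the same and only the parser name passed to BeautifulSoup changes. A minimal sketch (the short HTML string here is just a stand-in for illustration):

from bs4 import BeautifulSoup

# 'html.parser' ships with the standard library, so no extra dependency is required;
# lxml is usually faster and more tolerant of broken markup, but both are used the same way
soup = BeautifulSoup("<p class='title'><b>Hello</b></p>", 'html.parser')
print(soup.p.b.string)  # Hello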

2. Node Selectors

from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html, 'lxml')
print(soup.title)  # print the title node of the parsed document
print(soup.title.string)  # print the text inside the title node
print(type(soup.title))  # the type of the title node is bs4.element.Tag
print(type(soup.title.string))
print(soup.head)  # print the head node of the parsed document
print(soup.p)  # there are several p nodes; only the first match is returned
print(soup.a)  # there are several a nodes; only the first match is returned
----------------
<title>The Dormouse's story</title>
The Dormouse's story
<class 'bs4.element.Tag'>
<class 'bs4.element.NavigableString'>
<head><title>The Dormouse's story</title></head>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>

3. Relational Selection

3.1 Children and Descendants

from bs4 import BeautifulSoup

html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
for i in soup.p.contents:   # contents returns all direct children of the p node as a list; the nested span node is not listed separately
    print(i)
for i, child in enumerate(soup.p.children):  # the children attribute yields the same nodes, but as a generator
    print(i, child)
print(soup.p.descendants)
for i, child in enumerate(soup.p.descendants):  # descendants yields all descendant nodes, again as a generator; note that the span node is now selected on its own
    print(i, child)
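
Note that contents, children and descendants also yield the plain-text (NavigableString) pieces between tags, not only tag nodes. A minimal sketch, assuming the same soup as above, that keeps only the tag children:

from bs4.element import Tag

# filter out the NavigableString children (the text fragments between the a nodes)
tag_children = [child for child in soup.p.children if isinstance(child, Tag)]
print(tag_children)  # only the a nodes remain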

3.2 Parents and Ancestors

from bs4 import BeautifulSoup

html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.a.parent)  # the parent attribute returns a node's parent; note that only the direct parent is returned
print(soup.a.parents)
print(list(enumerate(soup.a.parents)))  # the parents attribute yields all ancestors of a node

3.3 Siblings

from bs4 import BeautifulSoup
html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            aaa
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.a.next_sibling)  # next_sibling returns the next sibling of a node, here relative to the first a node
print(soup.a.previous_sibling)  # previous_sibling returns the previous sibling of a node
print(list(enumerate(soup.a.next_siblings)))  # next_siblings yields all following siblings of a node
print(list(enumerate(soup.a.previous_siblings)))  # previous_siblings yields all preceding siblings of a node

3.4 Extracting Information

from bs4 import BeautifulSoup

html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            aaa
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.a.next_sibling.string)  # for a single node, the string attribute returns its text directly
print(soup.a.previous_sibling.string)
print(list(soup.a.parents)[0])  # when the result is a generator of several nodes, first convert it to a list, pick an element, then use string, attrs, etc. to get its text and attributes
print(list(soup.a.parents)[0].attrs['class'])

4. Method Selectors

  • find_all

Queries all elements that match the given conditions; pass attributes or text to find_all to filter for matching elements.

  • name

Query elements by the name parameter.

from bs4 import BeautifulSoup

html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(name="ul"))  # find_all returns a list of all nodes whose name is ul
print(type(soup.find_all(name="ul")[0]))  # each element is of type bs4.element.Tag, so the query can be nested further
for ul in soup.find_all(name="ul"):
    print(ul.find_all(name="li"))
    for li in ul.find_all(name="li"):
        print(li.string)
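
find_all also accepts a list of names and matches any of them, which saves a separate query per tag. A minimal sketch with the same soup:

print(soup.find_all(name=["ul", "li"]))  # all ul and li nodes in a single call
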
  • attrs

from bs4 import BeautifulSoup

html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1" name="elements">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(attrs={'id': 'list-1'}))  # attrs takes a dictionary of attributes; the result is a list
print(soup.find(id="list-1"))  # common attributes such as id can also be passed directly as keyword arguments
print(soup.find_all(attrs={'class': 'element'}))
print(soup.find_all(class_="element"))  # because class is a Python keyword, the keyword argument must be written as class_ with a trailing underscore
  • text

The text parameter matches the text of nodes; it accepts either a string or a compiled regular expression object.

import re

from bs4 import BeautifulSoup
html = '''
<div class="panel">
    <div class="panel-body">
        <a>Hello, this is a link</a>
        <a>Hello, this is a link, too</a>
        <a>a</a>
    </div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(text=re.compile('link')))  # re.compile turns the regex string into a pattern object; a plain string, by contrast, must match the node text exactly
print(soup.find_all(text=re.compile(r'\w{2}')))  # produces the same output as the previous line
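
In newer Beautiful Soup versions the same filter is also exposed as the string parameter (text still works as an alias but may emit a deprecation warning). A minimal sketch with the same soup:

print(soup.find_all(string=re.compile('link')))  # same result as find_all(text=...)
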
  • find

find works much like find_all and also queries elements matching the given conditions, but find returns only the first match, while find_all returns a list of all matches.

from bs4 import BeautifulSoup

html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.find(class_="element"))  # returns the first node whose class attribute is element
print(type(soup.find(class_="element")))  # the output shows the result is of type bs4.element.Tag
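
Unlike find_all, which returns an empty list when nothing matches, find returns None, so the result is worth checking before use. A minimal sketch with the same soup (the class name no-such-class is made up for illustration):

result = soup.find(class_="no-such-class")
if result is None:
    print("no matching node")                 # find returns None when there is no match
print(soup.find_all(class_="no-such-class"))  # find_all returns [] instead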

5. CSS Selectors

When writing CSS selectors, tag names are used as-is, ids are prefixed with # and class names are prefixed with a dot; the select method accepts selectors written in the same way and returns a list of matching elements.

from bs4 import BeautifulSoup
html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.select('div'))  # select by tag name
print(soup.select('.element'))  # select by class name
print(soup.select('#list-2'))  # select by id
print(soup.select('ul li'))  # all li nodes inside ul nodes
print(soup.select('div #list-1'))  # combined query: tag name, class and id separated by spaces
#  selects the element with id list-1 inside a div
print(soup.select('ul')[1])  # the second ul node and everything inside it
print(type(soup.select('ul')[1]))  # the elements of the returned list are of type bs4.element.Tag
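
When only the first match is needed, newer Beautiful Soup versions also provide select_one, which returns a single node instead of a list; child combinators such as > are supported too. A minimal sketch with the same soup:

print(soup.select_one('.element'))  # the first node with class element
print(soup.select('ul > li'))       # li nodes that are direct children of a ul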

5.1 Nested Selection

The select method supports nested selection.

from bs4 import BeautifulSoup


html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul.select('li'))   # select all li nodes inside each ul node

5.2 Getting Attributes

from bs4 import BeautifulSoup

html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul['id'])  # print the id of every ul node
    print(ul.attrs['id'])  # equivalent to the line above
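
If an attribute might be missing, the dictionary-style lookup raises KeyError; the get method returns None (or a supplied default) instead. A minimal sketch with the same soup (the name attribute is absent here, so the default is returned):

for ul in soup.select('ul'):
    print(ul.get('id'))           # same as ul['id'] when the attribute exists
    print(ul.get('name', 'n/a'))  # returns the default instead of raising KeyError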

5.3 Getting Text

from bs4 import BeautifulSoup

html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
for li in soup.select('li'):
    print(li.get_text())  # get the text of every li node; get_text() is an alternative to the string attribute
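
For comparison, the string attribute would also work here because every li node contains exactly one text child; with mixed or nested content, string returns None while get_text() concatenates all descendant text. A minimal sketch with the same soup:

for li in soup.select('li'):
    print(li.string)        # works here because each li has a single text child
print(soup.div.string)      # None: the outer div contains nested nodes
print(soup.div.get_text())  # concatenates the text of all descendants instead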