Beautiful Soup的使用
Beautiful Soup的使用
1. 基本使用
from bs4 import BeautifulSoup

# Sample markup; note that <body> and <html> are never closed here.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
# prettify() re-serialises the parsed document in a formatted layout.
formatted = soup.prettify()
print(formatted)
# .string on the <title> node yields the text it contains.
title_text = soup.title.string
print(title_text)
----------------------
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="title" name="dromouse">
<b>
The Dormouse's story
</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<!-- Elsie -->
</a>
,
<a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">
Tillie
</a>
;
and they lived at the bottom of a well.
</p>
<p class="story">
...
</p>
</body>
</html>
The Dormouse's story
- 变量html中的HTML文本里,<body>和<html>节点没有闭合;从输出结果来看,Beautiful Soup自动更正了错误的HTML格式字符串。soup.prettify()方法将解析后的文档按标准缩进格式输出;soup.title.string则输出title节点中的文本。
2. 节点选择器
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
title = soup.title
print(title)  # the <title> node itself
print(title.string)  # the text inside the <title> node
print(type(title))  # a node is a bs4.element.Tag
print(type(title.string))  # its text is a bs4.element.NavigableString
# Attribute-style access (soup.head, soup.p, soup.a) returns only the first
# matching node, even when the document contains several <p> or <a> nodes.
for first_match in (soup.head, soup.p, soup.a):
    print(first_match)
----------------
<title>The Dormouse's story</title>
The Dormouse's story
<class 'bs4.element.Tag'>
<class 'bs4.element.NavigableString'>
<head><title>The Dormouse's story</title></head>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>
3.关联选择
3.1 子节点和子孙节点
from bs4 import BeautifulSoup

html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')

# .contents is a list of the direct children of the first <p>; the <span>
# is not listed on its own because it lives inside the first <a>.
for child in soup.p.contents:
    print(child)

# .children walks the same direct children, but lazily instead of as a list.
for i, child in enumerate(soup.p.children):
    print(i, child)

# .descendants is also lazy, so printing it shows the iterator object itself.
print(soup.p.descendants)
# Iterating .descendants visits every nested node, so the <span> now shows
# up as its own entry.
for i, child in enumerate(soup.p.descendants):
    print(i, child)
3.2 父节点和祖先节点
from bs4 import BeautifulSoup

html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
first_link = soup.a
# .parent is only the direct parent of the node (with everything it contains).
print(first_link.parent)
# .parents is lazy; printing it shows the iterator object, not the nodes.
print(first_link.parents)
# Materialising .parents lists every ancestor of the node.
ancestors = list(enumerate(first_link.parents))
print(ancestors)
3.3 兄弟节点(同级节点)
from bs4 import BeautifulSoup

html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
aaa
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
# Everything below is relative to the first <a> node.
anchor = soup.a
print(anchor.next_sibling)  # the node immediately after it at the same level
print(anchor.previous_sibling)  # the node immediately before it
following = list(enumerate(anchor.next_siblings))  # all later siblings
print(following)
preceding = list(enumerate(anchor.previous_siblings))  # all earlier siblings
print(preceding)
3.4 提取信息
from bs4 import BeautifulSoup

html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
aaa
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
link = soup.a
# A single node exposes its text directly through .string.
print(link.next_sibling.string)
print(link.previous_sibling.string)
# When the result is a lazy sequence of nodes, turn it into a list first,
# pick an element, then read .string / .attrs from that element.
nearest_ancestor = list(link.parents)[0]
print(nearest_ancestor)
print(nearest_ancestor.attrs['class'])
4.方法选择器
- find_all
查询所有符合条件的元素,给find_all传入一些属性和文本来得到符合条件的元素
- name
根据name参数来查询元素
from bs4 import BeautifulSoup

html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# find_all(name=...) returns a list of every node with that tag name.
print(soup.find_all(name="ul"))
# Each element of the list is a bs4.element.Tag, so queries can be nested.
print(type(soup.find_all(name="ul")[0]))
for ul in soup.find_all(name="ul"):
    # Search again inside each <ul> for its <li> nodes ...
    print(ul.find_all(name="li"))
    # ... and print the text of each <li> individually.
    for li in ul.find_all(name="li"):
        print(li.string)
- attrs
from bs4 import BeautifulSoup

html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# The attrs argument takes a dict of attribute name -> value; find_all
# returns the matches as a list.
matches_by_id = soup.find_all(attrs={'id': 'list-1'})
print(matches_by_id)
# Common attributes such as id can be given as keyword arguments instead of
# going through the attrs dict.
print(soup.find(id="list-1"))
matches_by_class = soup.find_all(attrs={'class': 'element'})
print(matches_by_class)
# "class" is a reserved word in Python, so the keyword-argument form is
# spelled class_ (with a trailing underscore).
print(soup.find_all(class_="element"))
- text
text参数(自Beautiful Soup 4.4.0起推荐使用同义的string参数)可以匹配节点中的文本,传入形式可以是字符串,也可以是正则表达式对象
import re

from bs4 import BeautifulSoup

html = '''
<div class="panel">
<div class="panel-body">
<a>Hello, this is a link</a>
<a>Hello, this is a link, too</a>
<a>a</a>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# `string=` is the current name of this filter; `text=` is the older,
# deprecated alias (Beautiful Soup 4.4.0 and later accept `string`).
# A compiled pattern matches any text node the regex can be found in; a
# plain str would instead have to equal the node's text exactly.
print(soup.find_all(string=re.compile('link')))
# Two consecutive word characters: for this document the result is the same
# as the line above (the lone "a" and the whitespace-only nodes do not match).
print(soup.find_all(string=re.compile(r'\w{2}')))
- find
find与find_all的功能差不多,都可以查询符合条件的元素,但是find返回的只是第一个匹配的元素,而find_all返回的是所有匹配的元素的列表
from bs4 import BeautifulSoup

html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# find() returns just the first node whose class attribute is "element" ...
first_element = soup.find(class_="element")
print(first_element)
# ... and that single result is a bs4.element.Tag, not a list.
print(type(first_element))
5.CSS选择器
在写CSS的时候,标签名不做修饰,id前加#,class名前加.;使用CSS选择器时调用select方法,用同样的写法筛选元素,返回结果的类型是列表
from bs4 import BeautifulSoup

html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# Each selector is run through select() and the resulting list is printed:
#   'div'          - by tag name
#   '.element'     - by class name
#   '#list-2'      - by id
#   'ul li'        - every <li> below a <ul>
#   'div #list-1'  - combined: the node with id list-1 below a <div>
#                    (the parts of a combined selector are space-separated)
for selector in ('div', '.element', '#list-2', 'ul li', 'div #list-1'):
    print(soup.select(selector))
# Index into the returned list to get the second <ul> and all it contains.
second_ul = soup.select('ul')[1]
print(second_ul)
# Elements of the list are bs4.element.Tag objects.
print(type(second_ul))
5.1 嵌套查询
select方法支持嵌套选择
from bs4 import BeautifulSoup

html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# select() can be called again on each result to search inside that node.
for ul in soup.select('ul'):
    print(ul.select('li'))  # every <li> below this <ul>
5.2 获取属性
from bs4 import BeautifulSoup

html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# Print the id of every <ul>, read in two equivalent ways.
for ul in soup.select('ul'):
    print(ul['id'])  # subscript access on the node
    print(ul.attrs['id'])  # lookup through the attrs mapping
5.3 获取文本
from bs4 import BeautifulSoup

html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# Print the text of every <li>; get_text() is a second way to read a node's
# text, alongside the .string attribute.
for li in soup.select('li'):
    print(li.get_text())

浙公网安备 33010602011771号