Python爬虫 #006 BeautifulSoup
点击查看代码
# Sample markup shared by every example below: a <bookstore> holding four
# <book> entries, each with <title>, <author>, <year> and <price> children.
# The closing triple quote is required; without it the string literal never
# terminates and the rest of the script cannot be parsed.
data = '''
<?xml version="1.0" encoding="utf-8"?>
<bookstore>
<book class="a" id="one">
<title id="one" lang ="an" href="www.one">boookone</title>
<author>Giada De Laurentiis</author>
<year>2005</year>
<price>30.00</price>
</book>
<book class="b" id="two">
<title id="two" lang ="bn" href="www.two">booktwo</title>
<author>J K. Rowling</author>
<year>2005</year>
<price>29.99</price>
</book>
<book class="c" id="three">
<title id="three" lang ="cn" href="www.three">bookthree</title>
<author>James</author>
<year>2001</year>
<price>40</price>
</book>
<book class="d" id="four">
<title id="four" lang ="dn" href="www.four">bookfour</title>
<author>Feng Feng</author>
<year>1998</year>
<price>99.99</price>
</book>
</bookstore>
'''
# Import the required library
from bs4 import BeautifulSoup
# Parse the sample markup with the 'lxml' parser; the examples query this soup object
soup = BeautifulSoup(data,'lxml')
6.1-find方法
# find() returns only the first <title> tag it encounters
title = soup.find(name = "title")
print(title)
>>>: <title href="www.one" id="one" lang="an">boookone</title>
# Narrow the search with an attribute filter: the <title> whose lang is "bn"
title = soup.find(name="title",attrs={'lang':'bn'})
print(title)
>>>: <title href="www.two" id="two" lang="bn">booktwo</title>
6.2-find_all方法
# find_all() returns every matching tag; titles is a list of tags
titles = soup.find_all(name="title")
print(titles)
[<title href="www.one" id="one" lang="an">boookone</title>, <title href="www.two" id="two" lang="bn">booktwo</title>, <title href="www.three" id="three" lang="cn">bookthree</title>, <title href="www.four" id="four" lang="dn">bookfour</title>]
6.3-select方法
不根据条件查找
# select() returns its matches wrapped in a list
# Unlike find(), select() takes a CSS selector string as its argument;
# writing select(name="title") raises an error
titles = soup.select("title")
print(titles)
[<title href="www.one" id="one" lang="an">boookone</title>, <title href="www.two" id="two" lang="bn">booktwo</title>, <title href="www.three" id="three" lang="cn">bookthree</title>, <title href="www.four" id="four" lang="dn">bookfour</title>]
根据条件查找
# Note: '#' matches on the id attribute only; '.' matches on the class attribute only
# 1. Equivalent to finding <title id="two">
titles1 = soup.select('title#two')
print(titles1)
# 2. Equivalent to finding <title> inside <book class="a">
# Approach 1: class shorthand, then a space (descendant) before title
title2 = soup.select('book.a title')
print(title2)
# Approach 2: attribute selector plus '>' (direct child)
# More conditions can be chained, e.g. [class="a"][id="one"] > title
title3 = soup.select('book[class="a"] > title')
print(title3)
# 3. Equivalent to finding <title lang="bn">
title4 = soup.select('title[lang="bn"]')
print(title4)
[<title href="www.two" id="two" lang="bn">booktwo</title>]
[<title href="www.one" id="one" lang="an">boookone</title>]
[<title href="www.one" id="one" lang="an">boookone</title>]
[<title href="www.two" id="two" lang="bn">booktwo</title>]
6.4-获取标签内的数据
# soup.find_all(name = 'title') returns the matching tags wrapped in a list,
# so index [0] to work with a single tag
tag = soup.find_all(name = 'title')[0]
print(tag)
# tag.attrs exposes the attributes of a single tag as a dict
# (use it on one tag, not on the list returned by find_all)
print(tag.attrs)
print(tag.attrs['href'])
# Get the text content inside the tag
print(tag.string)
print(tag.get_text())
<title href="www.one" id="one" lang="an">boookone</title>
{'id': 'one', 'lang': 'an', 'href': 'www.one'}
www.one
boookone
boookone
保存网址小案例
# Collect the href attribute of every <title> tag.
tags = soup.find_all(name = 'title')
# Named `urls` rather than `list` so the builtin list type is not shadowed.
urls = []
for tag in tags:
    # Print each link as it is found, then keep it for later use.
    print(tag.attrs['href'])
    urls.append(tag.attrs['href'])
# Printed once, after the loop, showing all collected links.
print(urls)
www.one
www.two
www.three
www.four
['www.one', 'www.two', 'www.three', 'www.four']
6.5-易错点
# No space: 'book#one' matches the <book> tag itself whose id is "one"
title1= soup.select('book#one')
print(title1)
print('\n')
# With a space: 'book #one' matches any tag with id "one" nested inside a <book>.
# The space is the CSS descendant combinator, so it matches at any depth,
# not only direct children (use '>' for direct children only).
title2= soup.select('book #one')
print(title2)
[<book class="a" id="one">
<title href="www.one" id="one" lang="an">boookone</title>
<author>Giada De Laurentiis</author>
<year>2005</year>
<price>30.00</price>
</book>]
[<title href="www.one" id="one" lang="an">boookone</title>]
本文来自博客园,作者:{枫_Null},转载请注明原文链接:https://www.cnblogs.com/fengNull/articles/15488799.html

浙公网安备 33010602011771号