Python Web Scraping #006 BeautifulSoup

# assign the sample XML data to the variable data
data = '''
<?xml version="1.0" encoding="utf-8"?>
<bookstore>
    <book class="a" id="one">
        <title id="one" lang ="an" href="www.one">boookone</title>
        <author>Giada De Laurentiis</author>
        <year>2005</year>
        <price>30.00</price>
    </book>
    <book class="b" id="two">
        <title id="two" lang ="bn" href="www.two">booktwo</title>
        <author>J K. Rowling</author>
        <year>2005</year>
        <price>29.99</price>
    </book>
    <book class="c" id="three">
        <title id="three" lang ="cn" href="www.three">bookthree</title>
        <author>James</author>
        <year>2001</year>
        <price>40</price>
    </book>
    <book class="d" id="four">
        <title id="four" lang ="dn" href="www.four">bookfour</title>
        <author>Feng Feng</author>
        <year>1998</year>
        <price>99.99</price>
    </book>
</bookstore>
'''

# import the required library
from bs4 import BeautifulSoup
soup = BeautifulSoup(data, 'lxml')
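A side note of mine (not in the original post): the 'lxml' parser requires the third-party lxml package. If it is not installed, the line above fails, and Python's built-in 'html.parser' can be used instead. A minimal fallback sketch:

# fallback sketch: use the built-in parser when lxml is not installed
try:
    soup = BeautifulSoup(data, 'lxml')
except Exception:   # bs4 raises FeatureNotFound when the requested parser is missing
    soup = BeautifulSoup(data, 'html.parser')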

6.1 - The find method

# find only the first <title> tag that appears in the document
title = soup.find(name="title")
print(title)
>>>: <title href="www.one" id="one" lang="an">boookone</title>
# add attribute conditions to the search
title = soup.find(name="title", attrs={'lang': 'bn'})
print(title)
>>>: <title href="www.two" id="two" lang="bn">booktwo</title>

6.2 - The find_all method

# find_all returns a list of matching tags
titles = soup.find_all(name="title")
print(titles)
[<title href="www.one" id="one" lang="an">boookone</title>, <title href="www.two" id="two" lang="bn">booktwo</title>, <title href="www.three" id="three" lang="cn">bookthree</title>, <title href="www.four" id="four" lang="dn">bookfour</title>]

6.3 - The select method

Searching without conditions

# select() also wraps its results in a list
## unlike find(), you cannot pass name="title" here; doing so raises an error
titles = soup.select("title")
print(titles)
[<title href="www.one" id="one" lang="an">boookone</title>, <title href="www.two" id="two" lang="bn">booktwo</title>, <title href="www.three" id="three" lang="cn">bookthree</title>, <title href="www.four" id="four" lang="dn">bookfour</title>]

Searching with conditions

# note: '#' only matches the id attribute, '.' only matches the class attribute
# 1. equivalent to finding the tag <title id="two">
titles1 = soup.select('title#two')
print(titles1)

# 2.相当<book class="a">里面找<title>
# 方法一
title2 = soup.select('book.a title')
print(title2)
# option 2
## for even stricter conditions: book[class="a"][id="one"] > title
title3 = soup.select('book[class="a"] > title')
print(title3)

# 3.相当于找标签为 <title lang="bn"> 
title4 = soup.select('title[lang="bn"]')
print(title4)
[<title href="www.two" id="two" lang="bn">booktwo</title>]
[<title href="www.one" id="one" lang="an">boookone</title>]
[<title href="www.one" id="one" lang="an">boookone</title>]
[<title href="www.two" id="two" lang="bn">booktwo</title>]

6.4 - Getting data from a tag

# soup.find_all(name='title') returns tag objects wrapped in a list; take the first one
tag = soup.find_all(name='title')[0]
print(tag)

# tag.attrs turns a single tag's attributes into a dict
print(tag.attrs)
print(tag.attrs['href'])

# get the text content inside the tag
print(tag.string)
print(tag.get_text())
<title href="www.one" id="one" lang="an">boookone</title>
{'id': 'one', 'lang': 'an', 'href': 'www.one'}
www.one
boookone
boookone
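Two extra points, added by me for completeness: tag.get('href') reads an attribute like tag.attrs['href'] but returns None instead of raising a KeyError when the attribute is missing, and .string behaves differently from .get_text() once a tag contains child tags (the variable name book below is my own example).

# .get() is a safer way to read an attribute
print(tag.get('href'))     # www.one
print(tag.get('nothere'))  # None (tag.attrs['nothere'] would raise KeyError)

# on a tag with several child tags, .string returns None while .get_text() joins all the text
book = soup.find(name='book')
print(book.string)         # None
print(book.get_text())     # the concatenated text of title/author/year/price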

Small example: collecting URLs

# collect every title tag's href
tags = soup.find_all(name='title')

urls = []
for tag in tags:
    print(tag.attrs['href'])
    urls.append(tag.attrs['href'])
print(urls)
www.one
www.two
www.three
www.four
['www.one', 'www.two', 'www.three', 'www.four']
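The same loop can be written as a single list comprehension; this one-liner is my suggestion, not part of the original post:

# one-line equivalent of the loop above
urls = [t.get('href') for t in soup.find_all(name='title')]
print(urls)                # ['www.one', 'www.two', 'www.three', 'www.four']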

6.5 - Common pitfalls

# 相当于找标签为 <book id="one"> 
title1= soup.select('book#one')
print(title1)
print('\n')

# with a space, the selector looks for a descendant of <book> whose id is "one"
title2 = soup.select('book #one')
print(title2)

[<book class="a" id="one">
<title href="www.one" id="one" lang="an">boookone</title>
<author>Giada De Laurentiis</author>
<year>2005</year>
<price>30.00</price>
</book>]

[<title href="www.one" id="one" lang="an">boookone</title>]
