#再端一碗BeautifulSoup
#获取《战争与和平》中的人物名字
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch the "War and Peace" sample page. The context manager closes the HTTP
# response as soon as BeautifulSoup has finished reading it.
with urlopen("http://www.pythonscraping.com/pages/warandpeace.html") as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# Alternative example: print the text of every <span class="green"> element.
# namelist = bsObj.find_all("span", {"class": "green"})
# for name in namelist:
#     print(name.get_text())

# Count the text nodes whose content is exactly 'the prince'.
# `string=` is the current name of the keyword formerly called `text=`.
name_number = bsObj.find_all(string='the prince')
print(len(name_number))

# find_all(id="text") is equivalent to find_all(attrs={"id": "text"}).
allText = bsObj.find_all(id="text")
print(allText[0].get_text())
#get_text()会把正在处理的HTML文档中所有的标签都清除,然后返回一个只包含文字的字符串。
#通常在准备打印、存储和操作数据时,应该最后才使用get_text()
#BeautifulSoup的find()和findAll()
#其定义如下
#findAll(tag,attributes,recursive,text,limit,keywords)
#find(tag,attributes,recursive,text,keywords)
#find等价于findAll的limit等于1时的情形
#如果只对网页中获取的前X项结果感兴趣,就可以设置它
#但是得注意这个参数设置之后,获得的前几项结果是按照网页上的顺序排序的
#未必是想要的那几项
#其他BeautifulSoup对象
#1.NavigableString对象:用来表示标签里的文字
#2.Comment对象:用来查找HTML文档的注释标签,<!-- 文字 -->
#子标签和后代标签
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch the sample shop page; the context manager closes the HTTP response
# once the document has been parsed.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# .children yields only the direct children of the giftList table
# (not deeper descendants).
for child in bsObj.find("table", {"id": "giftList"}).children:
    print(child)
#处理兄弟标签
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch the sample shop page; the context manager closes the HTTP response
# once the document has been parsed.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# .tr is the first <tr> inside the giftList table; next_siblings yields every
# node that follows it at the same level, so that first row itself is skipped.
for sibling in bsObj.find('table', {'id': 'giftList'}).tr.next_siblings:
    print(sibling)
#父标签处理
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch the sample shop page; the context manager closes the HTTP response
# once the document has been parsed.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# Navigation path: the <img> with this src -> its parent tag -> the node just
# before that parent at the same level; print that node's text.
print(bsObj.find("img", {"src": "../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())
#正则表达式
#通过商品图片的文件路径查找
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
# Fetch the sample shop page; the context manager closes the HTTP response
# once the document has been parsed.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# Select product images by matching their src path against a regex.
# The pattern is a raw string: the regex text is unchanged, but Python no
# longer sees invalid escape sequences such as "\." and "\/" in the literal.
images = bsObj.find_all("img", {"src": re.compile(r"\.\.\/img\/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])