复杂HTML解析

#再端一碗BeautifulSoup
#获取《战争与和平》中的人物名字


from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch the "War and Peace" sample page. The `with` block closes the HTTP
# response once BeautifulSoup has read and parsed it.
with urlopen("http://www.pythonscraping.com/pages/warandpeace.html") as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# Alternative query, kept for reference: print the text of every
# <span class="green"> element.
#namelist = bsObj.find_all("span", {"class": "green"})
#for name in namelist:
#    print(name.get_text())

# Count the text nodes whose content is exactly 'the prince'.
# `string=` is the current name of the older, deprecated `text=` argument.
name_number = bsObj.find_all(string='the prince')
print(len(name_number))

# find_all(id="text") is equivalent to find_all("", {"id": "text"}).
allText = bsObj.find_all(id="text")
# get_text() returns the text of the first match with the markup removed.
print(allText[0].get_text())


#get_text()会把正在处理的HTML文档中所有的标签都清除，然后返回一个只包含文字的字符串。
#通常在准备打印、存储和操作数据时，应该最后才使用get_text()


#BeautifulSoup的find()和findAll()
#其定义如下
#findAll(tag,attributes,recursive,text,limit,keywords)
#find(tag,attributes,recursive,text,keywords)

#find等价于findAll的limit等于1时的情形
#如果只对网页中获取的前X项结果感兴趣，就可以设置它
#但是得注意这个参数设置之后，获得的前几项结果是按照网页上的顺序排序的
#未必是想要的那几项

#其他BeautifulSoup对象
#1.NavigableString对象：用来表示标签里的文字
#2.Comment对象：用来查找HTML文档的注释标签，<!-- 文字 -->



#子标签和后代标签
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch the sample product page; the `with` block closes the HTTP response
# once the document has been parsed.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# .children iterates over the direct children of the giftList table only
# (as opposed to .descendants, which walks every nested level).
for child in bsObj.find("table", {"id": "giftList"}).children:
    print(child)




#处理兄弟标签
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch the sample product page; the `with` block closes the HTTP response
# once the document has been parsed.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# .next_siblings yields the nodes that follow the table's first <tr> at the
# same level; that first row itself is not part of the iteration.
for sibling in bsObj.find('table', {'id': 'giftList'}).tr.next_siblings:
    print(sibling)
    



#父标签处理
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch the sample product page; the `with` block closes the HTTP response
# once the document has been parsed.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# Find the image by its src, step up to its parent tag, move to the sibling
# immediately before that parent, and print that sibling's text.
print(bsObj.find("img", {"src": "../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())



#正则表达式
#通过商品图片的文件路径查找
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Fetch the sample product page; the `with` block closes the HTTP response
# once the document has been parsed.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# Select <img> tags whose src matches the product-image path pattern.
# The pattern is a raw string so the backslashes reach the regex engine
# unchanged; in a normal string "\." and "\/" are invalid escape sequences
# and trigger a warning on current Python versions.
images = bsObj.find_all("img", {"src": re.compile(r"\.\.\/img\/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])
posted @ 2019-06-25 10:53 红桃6 阅读(288) 评论(0) 收藏举报
刷新页面返回顶部
红桃6

努力学习是为了自由。

复杂HTML解析

公告