记性不好,收集,方便查询
lxml_xpath
xpath('//button/span[contains(text(), "指定文本内容")]')
#使用 xpath 定位包含指定文本内容的标签,例如:a = tree.xpath('//a[contains(text(), "下一页")]/@href')[0]
文本完全匹配时改用 text()="…",例如:a = tree.xpath('//a[text()="下一页"]/@href')[0]
#from lxml import etree
# Example: scrape author / content / vote count / comment count from each post
# block with lxml XPath.
# Requires ``from lxml import etree``; ``html`` is expected to hold the page
# markup and is defined outside this snippet.
tree = etree.HTML(html)

# Match every <div> whose id attribute contains 'qiushi_tag', e.g.
# <div class="article block untagged mb15 typs_long" id="qiushi_tag_120321510">
node_list = tree.xpath('//div[contains(@id, "qiushi_tag")]')

# Collect one dict per matched node; previously each dict was overwritten on
# the next iteration and never kept.
results = []
for node in node_list:
    # Paths starting with './' or './/' are evaluated relative to ``node``.
    author = node.xpath('./div/a[2]/h2')[0].text.strip()
    content = node.xpath('.//div[@class="content"]/span')[0].text.strip()
    zan = node.xpath('.//div[@class="stats"]/span[1]/i')[0].text
    # Alternative that selects the text node through XPath itself:
    # zan = node.xpath('.//div[@class="stats"]/span[1]/i/text()')[0]
    comments = node.xpath('.//a[@class="qiushi_comments"]/i')[0].text
    items = {
        "author": author,
        "content": content,
        "zan": zan,
        "comments": comments,
    }
    results.append(items)
当标签的属性名称杂乱时,可以借助 '//'、'/'、'[ ]'、'|' 等符号,让节点的选取更方便
‘/’ 是用来获取子元素的,因此前后要父子关系
‘[2]’ 表示第二个元素:XPath 的下标从 1 开始,不同于 Python list 从 0 开始的索引
BeautifulSoup_CSS 详细常用:http://www.cnblogs.com/math98/p/8776898.html
# Example: CSS selectors with BeautifulSoup's ``select``.
from bs4 import BeautifulSoup

# Sample markup; adjacent string literals are concatenated, so the text is the
# same single-line document as before.
doc = (
    ' <div> <ul> '
    '<li class="item-0"><a href="link1.html">first item</a></li> '
    '<li class="item-1"><a href="link2.html">second item</a></li> '
    '<li class="item-inactive"><a href="link3.html"><span class="bold">third item</span></a></li> '
    '<li class="item-1"><a href="link4.html">fourth item</a></li> '
    '<li class="item-0"><a href="link5.html">fifth item</a></li> '
    '</ul> </div>'
)

soup = BeautifulSoup(doc, 'lxml')

# The sample document has no <p> elements, so the former 'body p' selector
# returned an empty list and indexing [1] raised IndexError. Select the <li>
# elements instead: [1] is the second <li>, whose link text is "second item".
a = soup.select('body li')[1].select('a')[0].text

# 'ul span' is a descendant selector: the two tags do not have to be direct
# parent and child.
b = soup.select('ul span')[0].text

# '.item-0 a' matches <a> tags inside elements with class "item-0"; take the
# first match and read its href attribute.
c = soup.select('.item-0 a')[0]
print(c['href'])
浙公网安备 33010602011771号