from scrapy import Selector

>>> doc = """

... <div>

...     <ul>

...         <li class="item-0"><a href="link1.html">first item</a></li>

...         <li class="item-1"><a href="link2.html">second item</a></li>

...         <li class="item-inactive"><a href="link3.html">third item</a></li>

...         <li class="item-1"><a href="link4.html">fourth item</a></li>

...         <li class="item-0"><a href="link5.html">fifth item</a></li>

...     </ul>

... </div>

... """

>>> sel = Selector(text=doc, type="html")

>>> sel.xpath('//li//@href').extract()

[u'link1.html', u'link2.html', u'link3.html', u'link4.html', u'link5.html']

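在 XPath 中使用正则表达式：Scrapy 的选择器支持 EXSLT 的 re:test() 函数。下面的例子只匹配 class 为 "item-" 后跟一位数字的 <li>，因此 class 为 "item-inactive" 的 link3.html 不在结果中。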

>>> sel.xpath('//li[re:test(@class, "item-\d$")]//@href').extract()

[u'link1.html', u'link2.html', u'link4.html', u'link5.html']

在 XPath 中使用变量：变量在表达式中用 $ 标识，其值通过关键字参数传入。下面第一个表达式提取恰好包含 5 个 <a> 子标签的 <div> 标签的 id 属性值；第二个表达式提取 id 为 "images" 的 <div> 下 <a> 标签的文本。

response.xpath('//div[count(a)=$cnt]/@id',cnt=5).extract_first()

response.xpath('//div[@id=$val]/a/text()', val='images').extract_first()

u'Name: My image 1 '

 

response.xpath('//base/@href').extract()

[u'http://example.com/']

response.css('base::attr(href)').extract()

[u'http://example.com/']

response.xpath('//a[contains(@href,"img")]/@href').extract()

response.css('a[href*=img]::attr(href)').extract()

 posted on 2018-08-27 17:19  庭明  阅读(93)  评论(0)  编辑  收藏  举报