scrapy的选择器
选择器
Request是一个封装用户请求的类,在回调函数中yield该对象表示继续访问
HtmlXPathSelector用于结构化HTML代码并提供选择器功能(该类已被弃用,新版Scrapy中请改用Selector)
"""Demonstrate Scrapy's XPath selectors against a small in-memory HTML page.

Each query below is run on the same document and its result is printed, so
the output can be compared with the XPath expression that produced it.
"""
from scrapy.http import HtmlResponse
from scrapy.selector import Selector

try:
    from scrapy.selector import HtmlXPathSelector
except ImportError:
    # HtmlXPathSelector is a long-deprecated class that recent Scrapy
    # releases no longer provide. Selector takes the response as its first
    # positional argument too, so it is a drop-in stand-in for this demo.
    HtmlXPathSelector = Selector

# The document is assembled from adjacent literals so that its whitespace
# stays exactly as in the original single-line definition.
html = (
    '<!DOCTYPE html> <html> <head lang="en"> <meta charset="UTF-8"> '
    '<title></title> </head> <body> <ul> '
    """<li class="item-"><a id='i1' href="link.html">first item</a></li> """
    """<li class="item-0"><a id='i2' href="llink.html">first item</a></li> """
    '<li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li> '
    '</ul> <div><a href="llink2.html">second item</a></div> </body> </html> '
)

response = HtmlResponse(url='http://example.com', body=html, encoding='utf-8')

# Legacy entry point: wrap the response directly.
hxs = HtmlXPathSelector(response)
print(hxs)

# One selector is enough; every query below only reads from it.
selector = Selector(response=response)

# Every <a> element anywhere in the document.
hxs = selector.xpath('//a')
print(hxs)

# <a> elements that are the second <a> child of their parent. Each parent in
# this document holds a single <a>, so the result is empty.
hxs = selector.xpath('//a[2]')
print(hxs)

# <a> elements that carry an id attribute.
hxs = selector.xpath('//a[@id]')
print(hxs)

# The <a> element whose id is exactly "i1".
hxs = selector.xpath('//a[@id="i1"]')
print(hxs)

# Chained predicates are AND-ed: href and id must both match.
hxs = selector.xpath('//a[@href="link.html"][@id="i1"]')
print(hxs)

# href contains the substring "link".
hxs = selector.xpath('//a[contains(@href, "link")]')
print(hxs)

# href begins with "link" (so "llink.html" does not qualify).
hxs = selector.xpath('//a[starts-with(@href, "link")]')
print(hxs)

# Regular-expression match on the id attribute via the re:test() extension.
# Raw strings keep the backslash in \d from being read as a Python escape.
hxs = selector.xpath(r'//a[re:test(@id, "i\d+")]')
print(hxs)

# Same match, extracting the text nodes as a list of strings.
hxs = selector.xpath(r'//a[re:test(@id, "i\d+")]/text()').extract()
print(hxs)

# Same match, extracting the href attribute values.
hxs = selector.xpath(r'//a[re:test(@id, "i\d+")]/@href').extract()
print(hxs)

# Absolute path from the document root.
hxs = selector.xpath('/html/body/ul/li/a/@href').extract()
print(hxs)

# extract_first() yields only the first match instead of a list.
hxs = selector.xpath('//body/ul/li/a/@href').extract_first()
print(hxs)

# Selectors nest: each <li> result can be queried relative to itself.
ul_list = selector.xpath('//body/ul/li')
for item in ul_list:
    # './a/span' and 'a/span' are equivalent relative paths from the <li>.
    # Note that '*/a/span' is NOT equivalent: it requires one extra element
    # level between the <li> and the <a>, which this document does not have.
    v = item.xpath('./a/span')
    print(v)