scrapy的选择器

选择器
Request是一个封装用户请求的类，在回调函数中yield该对象表示继续访问
HtmlXPathSelector用于结构化HTML代码并提供选择器功能（该类已被弃用，推荐直接使用Selector）
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy.http import HtmlResponse

html = """<!DOCTYPE html>
<html>
    <head lang="en">
        <meta charset="UTF-8">
        <title></title>
    </head>
    <body>
        <ul>
            <li class="item-"><a id='i1' href="link.html">first item</a></li>
            <li class="item-0"><a id='i2' href="llink.html">first item</a></li>
            <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
        </ul>
        <div><a href="llink2.html">second item</a></div>
    </body>
</html>
"""
# Wrap the sample markup in a response object so selectors can be built from
# it directly, without issuing a real HTTP request.
response = HtmlResponse(url='http://example.com', body=html, encoding='utf-8')

# NOTE(review): HtmlXPathSelector is a legacy class; newer Scrapy releases may
# no longer provide it -- confirm against the installed version.
hxs = HtmlXPathSelector(response)
print(hxs)

# Build the selector once and reuse it for every query below instead of
# constructing a new Selector for each expression.
selector = Selector(response=response)

# Every <a> element anywhere in the document.
hxs = selector.xpath('//a')
print(hxs)

# <a> elements that are the second <a> child of their own parent (not the
# second match overall); no parent in the sample markup has two <a> children.
hxs = selector.xpath('//a[2]')
print(hxs)

# <a> elements that carry an id attribute, whatever its value.
hxs = selector.xpath('//a[@id]')
print(hxs)

# <a> elements whose id attribute equals "i1".
hxs = selector.xpath('//a[@id="i1"]')
print(hxs)

# Chained predicates are combined with AND: href and id must both match.
hxs = selector.xpath('//a[@href="link.html"][@id="i1"]')
print(hxs)

# <a> elements whose href contains the substring "link".
hxs = selector.xpath('//a[contains(@href, "link")]')
print(hxs)

# <a> elements whose href starts with "link" ("llink.html" does not qualify).
hxs = selector.xpath('//a[starts-with(@href, "link")]')
print(hxs)

# Regular-expression test on the id attribute. The expressions are raw
# strings so that "\d" reaches the XPath engine unchanged and Python does not
# warn about an invalid escape sequence.
hxs = selector.xpath(r'//a[re:test(@id, "i\d+")]')
print(hxs)

# Same regex filter, extracting the text nodes as a list of strings.
hxs = selector.xpath(r'//a[re:test(@id, "i\d+")]/text()').extract()
print(hxs)

# Same regex filter, extracting the href attribute values as a list of strings.
hxs = selector.xpath(r'//a[re:test(@id, "i\d+")]/@href').extract()
print(hxs)

# Absolute path from the document root down to each link's href.
hxs = selector.xpath('/html/body/ul/li/a/@href').extract()
print(hxs)

# extract_first() returns only the first matched value rather than a list.
hxs = selector.xpath('//body/ul/li/a/@href').extract_first()
print(hxs)

# Collect every <li> under the <ul>, then query each one relative to itself.
li_root = Selector(response=response)
ul_list = li_root.xpath('//body/ul/li')
for item in ul_list:
    # A leading "./" anchors the expression at the current <li>; the bare
    # relative form 'a/span' selects the same nodes.
    # Note that '*/a/span' is NOT equivalent: it looks for an <a> nested one
    # level deeper (inside any child element of the <li>), which this markup
    # does not contain.
    v = item.xpath('./a/span')
    print(v)
posted @ 2018-07-02 11:38 liang哥哥阅读(93) 评论(0) 收藏举报
刷新页面返回顶部
liang哥哥

scrapy的选择器

公告