1 from scrapy.selector import Selector, HtmlXPathSelector
2 from scrapy.http import HtmlResponse
# Sample HTML document used as the fixture for the XPath examples in this file.
# It contains three <li><a> links (two of them with an id) and one <a> inside
# a <div>.
html = """<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title></title>
</head>
<body>
<ul>
<li class="item-"><a id='i1' href="link.html">first item</a></li>
<li class="item-0"><a id='i2' href="llink.html">first item</a></li>
<li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
</ul>
<div><a href="llink2.html">second item</a></div>
</body>
</html>
"""

# Wrap the fixture in an HtmlResponse so it can be handed to Selector.
# The URL needs the "//" after the scheme: "http:example.com" parses with an
# empty host, so relative links would not resolve against example.com.
response = HtmlResponse(url='http://example.com', body=html, encoding='utf-8')
20 # hxs = HtmlXPathSelector(response=response)
21 # print(hxs)
22
23 # hxs = Selector(response=response).xpath('//a')
24 # print(hxs)
25
26 # hxs = Selector(response).xpath('//a[1]')
27 # print(hxs)
28
29 # hxs = Selector(response).xpath('//a[@id]')
30 # print(hxs)
31
32 # hxs = Selector(response).xpath('//a[@id="i1"]')
33 # print(hxs)
34
35 # hxs = Selector(response).xpath('//a[@id="i1"]')
36 # print(hxs)
37
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')  # match on two attributes
39 # print(hxs)
40
# hxs = Selector(response=response).xpath('//a[contains(@href,"link")]')  # href contains "link"
42 # print(hxs)
43
# hxs = Selector(response=response).xpath('//a[starts-with(@href,"link")]')  # href starts with "link"
45 # print(hxs)
46
# hxs = Selector(response=response).xpath(r'//a[re:test(@href,"llink\d+")]')  # raw string keeps \d intact for the regex
48 # print(hxs)
49
# hxs = Selector(response=response).xpath(r'//a[re:test(@href,"llink\d+")]/text()').extract()  # text content
51 # print(hxs)
52
# hxs = Selector(response=response).xpath(r'//a[re:test(@href,"llink\d+")]/@href').extract()  # href attribute values
54 # print(hxs)
55
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()  # descend one level at a time from the root
57 # print(hxs)
58
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()  # take only the first match
60 # print(hxs)
61
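# ul_list = Selector(response=response).xpath('//body/ul/li')
# for item in ul_list:
#     v = item.xpath('./a/span')
#     # or, equivalently:
#     # v = item.xpath('a/span')
#     # or (NOTE: not equivalent -- `*` consumes one element level, so this
#     # only matches when another element sits between the <li> and the <a>):
#     # v = item.xpath('*/a/span')
#     print(v)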