1 import scrapy
2 import sys
3 # import io
4 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18303')
5 from scrapy.selector import Selector, HtmlXPathSelector
class ChoutiSpider(scrapy.Spider):
    """Spider that prints link text from the Chouti content list.

    Starts from ``start_urls`` and, for every ``div.item`` directly under
    ``div#content-list``, prints the text of the first descendant
    ``<a class="show-content color-chag">`` element.
    """

    name = 'chouti'
    # allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        """Print the stripped link text of each item on the page.

        Args:
            response: The response Scrapy downloaded for a start URL.
        """
        # Each element is a selector for one "item" div of the content list.
        items = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for item in items:
            text = item.xpath(
                './/a[@class="show-content color-chag"]/text()'
            ).extract_first()
            # extract_first() yields None when the item has no matching link
            # text; skip it rather than crash on None.strip().
            if text is None:
                continue
            print(text.strip())
25
26 '''
27 // 表示子孙中
28 .// 当前对象的子孙中
29 / 儿子
30 /div 儿子中的div标签
31 /div[@id="i1"] 儿子中的div标签且id=i1
32 obj.extract() 列表中的每一个对象转换成字符串 =>[]
33 obj.extract_first() 列表中的每一个对象转换成字符串=>列表第一个元素
34 //div/text() 获取某个标签的文本