import csv

import scrapy
class BooksSpider(scrapy.Spider):
    """Crawl the quotes listed on http://lab.scrapyd.cn/ and save them to CSV.

    Every ``div.quote`` element on a page contributes one row
    (quote text, author, comma-joined tags) that is appended to
    ``mingyan.csv`` in the current working directory. The spider then
    follows the ``li.next`` pagination link until no such link is found.

    Despite the class name, this spider scrapes quotes, not books.
    """

    # Spider identifier, used as ``scrapy crawl books``.
    name = 'books'

    # Starting point(s) of the crawl.
    start_urls = [
        'http://lab.scrapyd.cn/',
    ]

    # Disabled variant: crawl only the quotes carrying a tag given on the
    # command line, e.g. ``scrapy crawl books -a tag=<tag>``.
    #
    #     def start_requests(self):
    #         url = 'http://lab.scrapyd.cn/'
    #         tag = getattr(self, 'tag', None)
    #         if tag:
    #             url = url + 'tag/' + tag
    #         yield scrapy.Request(url, self.parse)

    def parse(self, response):
        """Store the quotes found on ``response`` and follow the next page.

        Args:
            response: The downloaded page handed over by Scrapy.

        Yields:
            scrapy.Request: A request for the next listing page, if the
            current page links to one. No items are yielded; the quotes
            are written straight to ``mingyan.csv``.
        """
        rows = []
        for quote in response.css('div.quote'):
            # ``default=''`` keeps a quote with a missing text or author
            # from turning into ``None`` in the output row.
            text = quote.css('.text::text').extract_first(default='')
            author = quote.css('.author::text').extract_first(default='')
            tags = quote.css('.tags .tag ::text').extract()
            rows.append([text, author, ','.join(tags)])

        if rows:
            # Open the file once per page rather than once per quote.
            # ``newline=''`` lets the csv module control line endings, and
            # the writer quotes fields that contain commas or newlines.
            with open('mingyan.csv', 'a', encoding='utf-8', newline='') as f:
                csv.writer(f).writerows(rows)

        # Pagination: the href may be relative, so resolve it against the
        # URL of the current page before requesting it.
        next_url = response.css('li.next a::attr(href)').extract_first()
        if next_url:
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)