CrawlSpider
Purpose: full-site data crawling. CrawlSpider extends scrapy.Spider with link-extraction rules, so it can discover and follow links (e.g. pagination) across a whole site automatically.
First, create the project:
- scrapy startproject choutiPro
- cd choutiPro
- scrapy genspider -t crawl chouti www.xxx.com
The -t crawl flag tells genspider to generate a CrawlSpider template like the one shown below.
【Requirement】: crawl all of the pagination URLs in the jokes (段子) category on chouti.com.
Code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ChoutiSpider(CrawlSpider):
    name = 'chouti'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://dig.chouti.com/r/scoff/hot/1']

    # Link extractor: allow is the regex that decides which links are extracted
    link = LinkExtractor(allow=r'/r/scoff/hot/\d+')

    rules = (
        # Rule: parses the page behind each extracted link in the specified way.
        # follow=True keeps applying the link extractor to the pages behind the
        # links it extracts, so every pagination page is eventually reached;
        # with follow=False only the pagination links visible on the start page
        # (about ten of them) would be collected.
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        # item['name'] = response.xpath('//div[@id="name"]').get()
        # item['description'] = response.xpath('//div[@id="description"]').get()
        print(response)
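To run the spider, use Scrapy's standard crawl command from inside the project directory (the spider name is the name attribute defined above):
- scrapy crawl chouti
Every pagination page matched by the rule is passed to parse_item, so the console prints one response object per page.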
【Requirement】: crawl all of the pagination URLs in the picture (糗图) category on qiushibaike.com.
Code:
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ChoutiSpider(CrawlSpider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/pic/']

    # Link extractors:
    # allow is the extraction rule (a regex); pagination URLs look like
    # /pic/page/3?s=5172496
    link = LinkExtractor(allow=r'/pic/page/\d+\?s=\d+')
    # The first page's URL ends in /pic/ and does not match the regex above,
    # so a second extractor is needed to pick it up as well
    link1 = LinkExtractor(allow=r'/pic/$')
    # Note: an empty allow (allow=r'') would extract every link on the page

    rules = (
        # Rule: parses the page behind each extracted link in the specified way;
        # follow=True keeps applying the extractors to the pages they extract
        Rule(link, callback='parse_item', follow=True),
        Rule(link1, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response)
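A quick way to check what a given allow regex actually matches is to call the link extractor's extract_links() method directly from an ordinary spider. A minimal debugging sketch (DebugLinksSpider and its name are illustrative, not part of the tutorial):

import scrapy
from scrapy.linkextractors import LinkExtractor


class DebugLinksSpider(scrapy.Spider):
    name = 'debug_links'
    start_urls = ['https://www.qiushibaike.com/pic/']

    def parse(self, response):
        # extract_links() returns a Link object for every anchor on the page
        # whose URL matches the allow regex
        extractor = LinkExtractor(allow=r'/pic/page/\d+\?s=\d+')
        for link in extractor.extract_links(response):
            print(link.url)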
