- scrapy genspider -t crawl xxx xxx.com
- 链接提取器 LinkExtractor(allow='正则')
- 规则解析器 Rule(link,callback,follow=True)
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# class CrawldemoSpider(CrawlSpider):
# name = 'crawlDemo'
# # allowed_domains = ['www.xxx.com']
# start_urls = ['https://dig.chouti.com/all/hot/recent/1']
# # 链接提取器:前提(follow=False),作用就是用来提取起始url对应页面中符合要求的链接
# link = LinkExtractor(allow=r'/all/hot/recent/\d+')
# rules = (
# # 规则解析器对象:将链接提取器提取到的链接对应的页面源码数据根据指定要求进行解析
# #follow=True:让链接提取器继续作用在链接提取器提取出来的链接所对应的页面源码中
# Rule(link, callback='parse_item', follow=True), # 并且去重了。如果follow=False 只会作用于起始URL对应的页面
# )
#
# def parse_item(self, response): # 调用次数由链接提取器 提取到的个数决定
# print(response)
LinkExtractor 的参数(follow=False 时只会在起始url对应的页面中起作用):
allow 满足正则的链接会被提取;为空则全部提取
deny 满足正则的链接不会被提取
restrict_xpaths xpath表达式,只在表达式选中的区域内提取链接
restrict_css css选择器,只在选择器选中的区域内提取链接
deny_domains 属于这些域名的链接不会被提取