The dupefilter deduplicates visited URLs so the same page is not requested twice.
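The dedup check is based on a request fingerprint rather than the raw URL string. Below is a minimal sketch, assuming a Scrapy version where scrapy.utils.request.request_fingerprint is still available (it is the same helper the custom filter in Step 2 imports); the example URLs are made up:

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

# Two URLs that differ only in query-parameter order canonicalize to the
# same fingerprint, so the second request counts as a duplicate.
fp1 = request_fingerprint(Request('https://dig.chouti.com/all/hot/recent/1?a=1&b=2'))
fp2 = request_fingerprint(Request('https://dig.chouti.com/all/hot/recent/1?b=2&a=1'))
print(fp1 == fp2)  # True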
第一步:
在爬虫文件中chouti.py中
import scrapy
from scrapy.http import Request
from scrapy.dupefilters import RFPDupeFilter  # Scrapy's default filter, imported here only for reference
from xdb.items import XdbItem


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']
    # start_urls = ['http://127.0.0.1:80/app01/login/']

    def parse(self, response):
        # print(response, type(response))
        # print(response.text)
        content_list = response.xpath('//div[@class="link-con"]//div[@class="link-detail"]')
        for item in content_list:
            text = item.xpath('./a/text()').extract_first()
            href = item.xpath('./a/@href').extract_first()
            yield XdbItem(text=text, href=href)
            # print(href)

        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = "https://dig.chouti.com" + page
            # Internally the scheduler calls XdbDupeFilter.request_seen() before this request is enqueued
            yield Request(url=page, callback=self.parse)
            # yield Request(url=page, callback=self.parse, dont_filter=True)  # dont_filter=True bypasses the dedup check
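For context on where request_seen() gets called: the scheduler consults the dupefilter before putting each request on its queue. The class below is a simplified, self-contained sketch of that logic (not Scrapy's actual source; queue handling is reduced to a plain list):

class SchedulerSketch:
    def __init__(self, dupefilter, spider):
        self.df = dupefilter
        self.spider = spider
        self.queue = []

    def enqueue_request(self, request):
        # dont_filter=True skips the dedup check entirely
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)  # duplicate: dropped, never downloaded
            return False
        self.queue.append(request)             # new request: queued for download
        return True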
Step 2:
Create a custom file dupefilters.py and write:
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


class XdbDupeFilter(BaseDupeFilter):

    def __init__(self):
        self.visited_fd = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        # Hash the request (URL, method, body) into a fixed-length fingerprint string
        fd = request_fingerprint(request)
        if fd in self.visited_fd:
            return True  # True means this request was seen before and will not be crawled again
        self.visited_fd.add(fd)
        return False

    def open(self):  # can return a deferred
        # called when the spider starts
        pass

    def close(self, reason):  # can return a deferred
        # called when the spider closes
        pass

    def log(self, request, spider):  # log that a request has been filtered
        pass
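The filter can be exercised on its own outside a crawl, purely as a sanity check (the URL here is made up):

from scrapy.http import Request
from xdb.dupefilters import XdbDupeFilter

df = XdbDupeFilter()
r = Request(url='https://dig.chouti.com/all/hot/recent/2')
print(df.request_seen(r))  # False: first time this fingerprint is seen
print(df.request_seen(r))  # True: duplicate, would be filtered during a crawl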
Step 3:
Configure settings.py:
# Override the default dedup filter
# DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'  # Scrapy's default
DUPEFILTER_CLASS = 'xdb.dupefilters.XdbDupeFilter'
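With the setting in place, start the crawl as usual; pagination links that resolve to the same fingerprint are requested only once:

scrapy crawl chouti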