crawl — Scrapy (configuration file, persistence, passing parameters between requests, improving crawl efficiency, scrapy middleware, selenium integration, dedup rules)

I. Spider configuration file

settings.py

#1  Whether to obey the robots.txt protocol
ROBOTSTXT_OBEY = False

#2  Browser type / User-Agent (the default one identifies itself as scrapy)
# USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
#3 Log level (by default INFO messages are printed while the spider runs)
# Set it to ERROR to print only errors (speeds up the crawl)
LOG_LEVEL='ERROR'

#4 Use your own dedup rule
DUPEFILTER_CLASS = 'filter.MyDupeFilter'

#5 Maximum number of concurrent requests, 16 by default; you can raise it yourself
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

II. Scrapy persistence

# Save to a JSON file
1 scrapy crawl cnblogs -o cnblogs.json  (no need to memorize this one)
    -1 In items.py write a class with the fields
      class CnblogsSpiderItem(scrapy.Item):
          title = scrapy.Field()
          desc=scrapy.Field()
          url=scrapy.Field()
          author=scrapy.Field()
          # key field (the article body, filled in when the detail page is parsed)
          content=scrapy.Field()
    -2 In the spider put the fields to save into the item object
      article_item['url']=url
      article_item['title']=title
      article_item['desc']=desc
      article_item['author']=author
      yield article_item

    -3 Run in the console: scrapy crawl cnblogs -o cnblogs.json
       -o means output; cnblogs.json is the output filename (after the run a cnblogs.json file appears in the project root)
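The -o flag picks the feed format from the file extension; besides .json, Scrapy's built-in feed exports also understand, for example:

scrapy crawl cnblogs -o cnblogs.csv   # comma-separated values
scrapy crawl cnblogs -o cnblogs.xml   # XML feed
scrapy crawl cnblogs -o cnblogs.jl    # JSON lines, one item per line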


2 The common way — remember only this one
    -1 In items.py write a class with the fields
      class CnblogsSpiderItem(scrapy.Item):
          title = scrapy.Field()
          desc=scrapy.Field()
          url=scrapy.Field()
          author=scrapy.Field()
          # key field (the article body, filled in when the detail page is parsed)
          content=scrapy.Field()
   -2 In the spider put the fields to save into the item object
      article_item['url']=url
      article_item['title']=title
      article_item['desc']=desc
      article_item['author']=author
      yield article_item
   -3 Configure in settings.py
      ITEM_PIPELINES = {
       'cnblogs_spider.pipelines.CnblogsSpiderFilePipeline': 300,  # the number is the priority: the smaller the number, the higher the priority
       'cnblogs_spider.pipelines.CnblogsSpiderMysqlPipeline': 400,  # the number is the priority: the smaller the number, the higher the priority
        }
   -4 In pipelines.py write
    class CnblogsSpiderFilePipeline:
      # executed when the spider starts
      def open_spider(self,spider):
          # spider is the spider object
          print(spider.name)
          print('spider started')
          self.f=open('cnblogs.txt','w',encoding='utf-8')
      def close_spider(self,spider):
          # executed when the spider stops
          print('spider stopped')
          self.f.close()
      def process_item(self, item, spider):
          self.f.write(item['title']+item['desc']+item['author']+item['url'])
          self.f.write('\n')
          return item


  import pymysql
  class CnblogsSpiderMysqlPipeline:

      def open_spider(self,spider):
          self.conn=pymysql.connect( host='127.0.0.1', user='root', password="123",database='cnblogs', port=3306)
          self.cursor=self.conn.cursor()
      def close_spider(self,spider):
          self.conn.commit()
          self.cursor.close()
          self.conn.close()
      def process_item(self, item, spider):
          sql='insert into aritcle (title,`desc`,url,author) values (%s,%s,%s,%s )'
          self.cursor.execute(sql,args=[item['title'],item['desc'],item['url'],item['author']])
          return item
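For the MySQL pipeline above to work, the target table has to exist. A minimal one-off creation script with pymysql (the table name aritcle and the column sizes are assumptions read off the INSERT statement; adjust them to your real schema):

import pymysql

# one-off helper: create the table the MySQL pipeline writes to
conn = pymysql.connect(host='127.0.0.1', user='root', password="123", database='cnblogs', port=3306)
cursor = conn.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS aritcle (
        id INT PRIMARY KEY AUTO_INCREMENT,
        title VARCHAR(255),
        `desc` VARCHAR(1024),
        url VARCHAR(512),
        author VARCHAR(128),
        content LONGTEXT     -- only needed for the full version shown later, which also stores the article body
    )
''')
conn.commit()
cursor.close()
conn.close()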

III. Passing parameters between requests

1 To pass a parameter to another request and pick it up in its response, use meta
# Request(url, meta={'key': value})
yield Request(url=url,callback=self.parser_detail,meta={'item':article_item})

2 In the parse callback, get it back from the response object
# response.meta.get('key')
item=response.meta.get('item')
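Putting the two halves together, a minimal self-contained sketch (the selectors and the parser_detail name are taken from the cnblogs spider shown later; a plain dict stands in for the Item to keep it short). Newer Scrapy versions (1.7+) can also use Request(..., cb_kwargs={'item': item}), which delivers the value straight into the callback's signature.

import scrapy
from scrapy import Request


class DemoSpider(scrapy.Spider):
    """Sketch: carry a partially filled item from the list page to the detail page via meta."""
    name = 'demo'
    start_urls = ['https://www.cnblogs.com/']

    def parse(self, response):
        for article in response.xpath('//article[@class="post-item"]'):
            item = {
                'title': article.xpath('.//a[@class="post-item-title"]/text()').extract_first(),
                'url': article.css('a.post-item-title::attr(href)').extract_first(),
            }
            # meta travels with the request and comes back attached to the response
            yield Request(url=item['url'], callback=self.parser_detail, meta={'item': item})

    def parser_detail(self, response):
        item = response.meta.get('item')     # the same dict that was attached in parse
        item['content'] = response.css('#cnblogs_post_body').extract_first()
        yield item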

IV. Improving crawl efficiency

Improving Scrapy's crawl efficiency (it is an async framework built on Twisted, so performance is already high, but there are still things to tune):

- All of it is done in the configuration file (there is also a built-in set of default settings, much like Django's):

#1 Increase concurrency (number of concurrent requests):
By default Scrapy allows 16 concurrent requests; this can be raised. Setting CONCURRENT_REQUESTS = 100 in settings allows 100 concurrent requests.
CONCURRENT_REQUESTS = 100

#2 Lower the log level:
Running Scrapy produces a lot of log output. To cut CPU usage, set the log level to INFO or ERROR. In the config file: LOG_LEVEL = 'INFO'
#3 Disable cookies (cnblogs does not need them):
If cookies are not actually needed, disabling them reduces CPU usage and speeds up the crawl. In the config file: COOKIES_ENABLED = False
#4 Disable retries (do not retry failed pages):
Re-requesting failed HTTP requests (retrying) slows the crawl, so retries can be disabled. In the config file:
RETRY_ENABLED = False
#5 Reduce the download timeout (set a short timeout):
When crawling a very slow link, a short download timeout lets stuck links be abandoned quickly, which improves efficiency. In the config file:
DOWNLOAD_TIMEOUT = 10  # timeout of 10s

# Remembering the first 5 is enough (they are collected in the sketch below)

 #6 Add a cookie pool
 #7 Add a proxy pool
 #8 Randomize the request headers (keep a list of browser User-Agents; the fake_useragent module does this)
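For reference, the first five tweaks collected in one place in settings.py (the values are illustrative, not tuned recommendations):

# settings.py -- efficiency-related settings (illustrative values)
CONCURRENT_REQUESTS = 100    # more parallel requests (Scrapy's default is 16)
LOG_LEVEL = 'ERROR'          # less logging work
COOKIES_ENABLED = False      # skip cookie handling when the site does not need it
RETRY_ENABLED = False        # do not re-request failed pages
DOWNLOAD_TIMEOUT = 10        # give up on slow responses after 10 seconds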

V. Scrapy middleware (downloader middleware, spider middleware)

 1. Spider middleware is rarely used (skipped)

 2. Downloader middleware

CnblogsSpiderDownloaderMiddleware

# Downloader middlewares; there can be several, and the smaller the number, the higher the priority
DOWNLOADER_MIDDLEWARES = {
  'cnblogs_spider.middlewares.CnblogsSpiderDownloaderMiddleware': 543,
}

process_request

1 Called when a request goes out
  # Must either:
  # - return None: continue processing this request     return None: move on to the next downloader middleware's process_request
  # - or return a Response object                       return a Response object: it goes to the engine, which hands it to the spider for parsing
  # - or return a Request object                        return a Request object: it goes to the engine, which hands it to the scheduler to be queued
  # - or raise IgnoreRequest: process_exception() methods of installed middleware will be called   raising the exception triggers process_exception

  # Summary:
  Return None: keep crawling
  Return a Response object: it goes to the engine, then to the spider for parsing
  Return a Request object: it goes to the engine, then to the scheduler, waiting to be scheduled again

  # When is it used:
  Adding a proxy, adding cookies, changing the browser type (a sketch combining these three follows)
    Integrating selenium

# Why integrate selenium?
# Scrapy is built on Twisted and uses coroutines internally. Selenium is blocking, which hurts efficiency, but it is so useful that it is still widely used in crawlers.

  # Change the cookies
  request.cookies={'name':'lqz'}
  # Use a proxy
  proxy='http://154.16.63.16:8080'  # taken from a proxy pool
  request.meta["proxy"] =proxy
  # Change the request headers
  request.headers.setlist(b'User-Agent','asdfadsfadsf')
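A minimal downloader-middleware sketch combining the three tweaks above (fake_useragent is the package mentioned in the efficiency section; the proxy address and cookie are placeholders for a real proxy/cookie pool). Remember to register it in DOWNLOADER_MIDDLEWARES.

from fake_useragent import UserAgent


class RotateRequestMiddleware:
    """Sketch: rotate User-Agent, cookies and proxy in process_request."""

    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        request.headers['User-Agent'] = self.ua.random           # random real-browser header
        request.cookies = {'name': 'lqz'}                        # would come from a cookie pool
        request.meta['proxy'] = 'http://154.16.63.16:8080'       # would come from a proxy pool
        return None                                              # continue down the middleware chain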

process_response

# Must either;
# - return a Response object        normal path: goes to the engine, which hands it to the spider for parsing
# - return a Request object         return a Request object: goes to the engine, then to the scheduler, waiting to be scheduled again
# - or raise IgnoreRequest          raise IgnoreRequest: the response is discarded and is not passed to the engine or on to the spider

# Summary
Return a Response: the normal flow continues
Return a Request: it goes back to the scheduler


# When is it used
Raising an exception so a response is not parsed (not used much either); a sketch follows
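As an illustration, a minimal sketch of a process_response that discards clearly useless responses (the status codes checked here are arbitrary examples; whether to drop, retry, or pass through depends on the target site):

from scrapy.exceptions import IgnoreRequest


class DropBadResponseMiddleware:
    """Sketch: throw away responses that are not worth parsing instead of handing them to the spider."""

    def process_response(self, request, response, spider):
        if response.status in (403, 503):
            spider.logger.warning('dropping %s (status %s)', request.url, response.status)
            raise IgnoreRequest('bad status %s' % response.status)
        return response  # normal path: the response continues on to the engine and then the spider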

3. Modifying requests in the downloader middleware

First write a simple server of your own to test against; Flask is used here

from flask import Flask, request

app=Flask(__name__)
app.secret_key='qazwsxedc'
@app.route('/test')
def test():
    print(request.cookies)
    return 'xxxxxxxxx'
if __name__ == '__main__':
    app.run()

On the client side, write this in the downloader middleware

from scrapy import signals
from scrapy import Request
from scrapy.http.response.html import HtmlResponse
# Downloader middleware (sits between the downloader and the engine)
class CnblogsSpiderDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s
    # Called when a request goes out
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # We have the request object (URL of the current request) and the spider object (the current spider)
        # print('URL of the current request, printed in the middleware:', request.url)
        # Add cookies
        print(request.cookies)
        # With a cookie pool in place, take one from the pool and assign it here to rotate cookies
        # request.cookies={'name':'lqz'}

        # Add a proxy (use a proxy pool)
        print(request.meta)
        # proxy = "http://" + get_proxy()
        # proxy='http://154.16.63.16:8080'  # taken from a proxy pool
        # request.meta["proxy"] = proxy

        # Modify request headers (tokens live in the headers; token pool)
        # request.headers['xx']='sssssssss'
        # Change the browser type (User-Agent)
        # request.headers overrides the __setitem__ family of methods
        # print(type(request.headers))
        # print(request.headers)
        from scrapy.http.headers import Headers
        # print(request.headers.get(b'User-Agent'))
        request.headers.setlist(b'User-Agent','asdfadsfadsf')

VI. Integrating selenium

# Using selenium is less efficient than Scrapy's native downloader

1 In the spider write
  class ChoutiSpider(scrapy.Spider):
      name = 'chouti'
      allowed_domains = ['www.bilibili.com']
      start_urls = ['https://www.bilibili.com/v/dance/otaku/#/all/default/0/1/']
      bro=webdriver.Chrome(executable_path='./chromedriver')
      bro.implicitly_wait(10)
      @staticmethod
      def close(spider, reason):
          spider.bro.close()
          
          
2 Use it directly in the middleware
  class CnblogsSpiderDownloaderMiddleware:
      def process_request(self, request, spider):
          spider.bro.get(request.url)
          response=HtmlResponse(url=request.url,body=bytes(spider.bro.page_source,encoding='utf-8'))
          return response
        
        
3 How do you hide the browser window?
    - Use a headless browser (a sketch follows)
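A minimal sketch of starting Chrome headless for the spider above (the flags are the common ones; the ./chromedriver path and the selenium 3 style executable_path argument mirror the original code):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')       # run without a visible browser window
options.add_argument('--disable-gpu')    # commonly added together with headless mode
bro = webdriver.Chrome(executable_path='./chromedriver', options=options)
bro.implicitly_wait(10)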

VII. Dedup rules

# Ways a crawler can dedup:

- Option 1: Scrapy's default dedup rule: a set
- Option 2: a Redis set (a custom dedup rule)
- Option 3: a Bloom filter


1 Dedup is on by default, via from scrapy.dupefilters import RFPDupeFilter
2 It is configured in the built-in default settings
3 Under the hood it dedups with a set
4 The more advanced part:

  - It fingerprints the URL (to handle two URLs that are the same address but differ only in parameter order)
   127.0.0.1/?name=lili&age=19&sex=male

# Are these the same address?
127.0.0.1/?name=lili&age=19
127.0.0.1/?age=19&name=lili
# Under the hood, the query string after ? is split apart and sorted
fp = self.request_fingerprint(request) # produces a fingerprint; the two URLs above give the same result


## Custom dedup rules
How do you use your own dedup class?
Write a class that inherits BaseDupeFilter and override def request_seen(self, request):
Return True to signal the URL has already been crawled
Return False to signal it has not


# Trying a more powerful dedup scheme
  - A set works, but there is a problem: with a huge number of URLs (hundreds of millions) the set becomes enormous and eats memory
  - Dedup with tiny memory: a Bloom filter (a sketch follows)
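A hedged sketch of such a Bloom-filter dedup class, assuming the third-party pybloom_live package (any Bloom filter implementation would do); fingerprinting the request first also handles the parameter-order issue described above. It is enabled the same way as the set-based filter below, via DUPEFILTER_CLASS in settings.py.

from pybloom_live import ScalableBloomFilter           # assumption: pip install pybloom_live
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


class BloomDupeFilter(BaseDupeFilter):
    """Sketch: dedup with a Bloom filter instead of an in-memory set."""

    def __init__(self):
        # grows automatically; the small false-positive rate means a few URLs may be wrongly skipped
        self.bloom = ScalableBloomFilter(initial_capacity=100000, error_rate=0.001)

    def request_seen(self, request):
        fp = request_fingerprint(request)   # same fingerprint the default RFPDupeFilter uses
        if fp in self.bloom:
            return True                     # (probably) seen before -> skip the request
        self.bloom.add(fp)
        return False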

Custom dedup rule

# Use your own dedup rule: configure in settings.py
DUPEFILTER_CLASS = 'filter.MyDupeFilter'

# filter.py

from scrapy.dupefilters import BaseDupeFilter
class MyDupeFilter(BaseDupeFilter):
    pool=set()
    def request_seen(self, request):
        print('custom dedup filter was called')
        if request.url in self.pool:
            return True
        else:
            self.pool.add(request.url)
            return False

VIII. Full project code

run.py  # right-click to run it; optional

from scrapy.cmdline import execute

# execute(['scrapy', 'crawl', 'cnblogs','--nolog'])
# execute(['scrapy', 'crawl', 'cnblogs'])
execute(['scrapy', 'crawl', 'chouti'])

filter.py  # the custom dedup rule; optional

from scrapy.dupefilters import BaseDupeFilter
class MyDupeFilter(BaseDupeFilter):
    pool=set()
    def request_seen(self, request):
        print('custom dedup filter was called')
        if request.url in self.pool:
            return True
        else:
            self.pool.add(request.url)
            return False

s1.py  # checks whether duplicate URLs would be crawled; optional

from scrapy.utils.request import request_fingerprint

from scrapy import Request
# Check whether a duplicate would be crawled; these two requests point at the same address. Conclusion: the fingerprints match, so it will not be crawled again
url1=Request(url='http://127.0.0.1/?name=lqz&age=19')
url2=Request(url='http://127.0.0.1/?age=19&name=lqz')
fp1=request_fingerprint(url1)
fp2=request_fingerprint(url2)
print(fp1)#afbaa0881bb50eb208caba3c4ac4aa8ffdbb7ba4
print(fp2)#afbaa0881bb50eb208caba3c4ac4aa8ffdbb7ba4

cnblogs_spider\settings.py

BOT_NAME = 'cnblogs_spider'

SPIDER_MODULES = ['cnblogs_spider.spiders']
NEWSPIDER_MODULE = 'cnblogs_spider.spiders'

# Obey robots.txt rules
#1  Whether to obey the robots.txt protocol
ROBOTSTXT_OBEY = False

#2  Browser type / User-Agent (the default one identifies itself as scrapy)
# USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
#3 Log level (by default INFO messages are printed while the spider runs)
# Set it to ERROR to print only errors (speeds up the crawl)
LOG_LEVEL='ERROR'

# Use your own dedup rule
DUPEFILTER_CLASS = 'filter.MyDupeFilter'

# How many requests can be sent concurrently; the default is 16
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32


# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# Downloader middlewares; there can be several, and the smaller the number, the higher the priority
DOWNLOADER_MIDDLEWARES = {
   'cnblogs_spider.middlewares.CnblogsSpiderDownloaderMiddleware': 543,
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Persistence-related configuration
ITEM_PIPELINES = {
   # 'cnblogs_spider.pipelines.CnblogsSpiderFilePipeline': 300,  # the number is the priority: the smaller the number, the higher the priority
   'cnblogs_spider.pipelines.CnblogsSpiderMysqlPipeline': 400,  # the number is the priority: the smaller the number, the higher the priority
}

cnblogs_spider\items.py

import scrapy

# Analogous to models.py: write one class per item, and it must inherit scrapy.Item
# How do you check whether an object is an Item instance?
# What is the difference between isinstance and type?

class CnblogsSpiderItem(scrapy.Item):
    title = scrapy.Field()
    desc=scrapy.Field()
    url=scrapy.Field()
    author=scrapy.Field()
    # key field (the article body, filled in when the detail page is parsed)
    content=scrapy.Field()

cnblogs_spider\pipelines.py

from itemadapter import ItemAdapter

# These classes handle storage: to a file, to a database


# Save the crawled data to a file
class CnblogsSpiderFilePipeline:
    # executed when the spider starts
    def open_spider(self,spider):
        # spider is the spider object
        print(spider.name)
        print('spider started')
        self.f=open('cnblogs.txt','w',encoding='utf-8')
    def close_spider(self,spider):
        # executed when the spider stops
        print('spider stopped')
        self.f.close()
    def process_item(self, item, spider):
        # executed for every item that gets yielded
        # item is the item yielded in the spider
        # with open('cnblogs.txt','w',encoding='utf-8') as f:
        #     f.write('Title: '+item['title'])
        #     f.write('Summary: '+item['desc'])
        #     f.write('Author: '+item['author'])
        #     f.write('Link: '+item['url'])
        # print('an item arrived')
        self.f.write(item['title']+item['desc']+item['author']+item['url'])
        self.f.write('\n')
        return item


import pymysql
class CnblogsSpiderMysqlPipeline:

    def open_spider(self,spider):
        self.conn=pymysql.connect( host='127.0.0.1', user='root', password="123",database='cnblogs', port=3306)
        self.cursor=self.conn.cursor()
    def close_spider(self,spider):
        self.conn.commit()
        self.cursor.close()
        self.conn.close()
    def process_item(self, item, spider):
        sql='insert into aritcle (title,`desc`,url,author,content) values (%s,%s,%s,%s,%s )'
        self.cursor.execute(sql,args=[item['title'],item['desc'],item['url'],item['author'],item['content']])
        return item

cnblogs_spider\middlewares.py  # note: this one drives a browser, so a chromedriver matching your local Chrome must be placed in the project

from scrapy import signals
from itemadapter import is_item, ItemAdapter

# Spider middleware (sits between the spider and the engine)
class CnblogsSpiderSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i


    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)



from scrapy import Request
from scrapy.http.response.html import HtmlResponse
# Downloader middleware (sits between the downloader and the engine)
class CnblogsSpiderDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s
    # Called when a request goes out
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # We have the request object (URL of the current request) and the spider object (the current spider)
        # print('URL of the current request, printed in the middleware:', request.url)
        # Add cookies
        print(request.cookies)
        # With a cookie pool in place, take one from the pool and assign it here to rotate cookies
        # request.cookies={'name':'lqz'}

        # Add a proxy (use a proxy pool)
        print(request.meta)
        # proxy = "http://" + get_proxy()
        # proxy='http://154.16.63.16:8080'  # taken from a proxy pool
        # request.meta["proxy"] = proxy

        # Modify request headers (tokens live in the headers; token pool)
        # request.headers['xx']='sssssssss'

        # Change the browser type (User-Agent)
        # request.headers overrides the __setitem__ family of methods
        # print(type(request.headers))
        # print(request.headers)
        from scrapy.http.headers import Headers
        # print(request.headers.get(b'User-Agent'))
        # request.headers.setlist(b'User-Agent','asdfadsfadsf')

        # Integrate selenium
        # from selenium import webdriver
        # bro=webdriver.Chrome(executable_path='./chromedriver')
        spider.bro.get(request.url)
        # bro.page_source
        # Just wrap the page source in a Response object
        # scrapy.http.response.html.HtmlResponse
        response=HtmlResponse(url=request.url,body=bytes(spider.bro.page_source,encoding='utf-8'))
        return response

    # Called when a response comes back
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    # Called when an exception occurs
    # Here you could collect every failed URL via request.url and store it to re-crawl later
    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    # Called when the spider opens
    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

cnblogs_spider\spiders\cnblogs.py  # uses the default dedup rule

import scrapy
from .. import items  # relative imports only work inside a package; if this file is run as a script, an absolute import is required
# from cnblogs_spider import items
from scrapy import Request
class CnblogsSpider(scrapy.Spider):
    name = 'cnblogs'   # spider name (must be unique)
    # allowed_domains = ['www.cnblogs.com'] # allowed domains (only URLs under these domains are crawled)
    allowed_domains = ['127.0.0.1:5000'] # allowed domains (only URLs under these domains are crawled)
    start_urls = ['http://127.0.0.1:5000/test'] # start URLs for the crawl

    # def parse(self, response): # parse method
    #     # response: the response object (similar to a requests-module response)
    #     # print(response.text)
    #     # response.xpath()
    #     # response.css()
    #     ll = [] # related to persistence
    #     article_list=response.xpath('//article[@class="post-item"]')
    #     for article in article_list:
    #         article_item=items.CnblogsSpiderItem()
    #
    #         title=article.xpath('.//a[@class="post-item-title"]/text()').extract_first()
    #         desc=article.css('p.post-item-summary::text').extract()[-1]
    #         # author=article.css('a.post-item-author>span::text').extract_first()
    #         # author=article.css('footer.post-item-foot span::text').extract_first()
    #         # > css
    #         # // xpath
    #         author=article.xpath('.//a[@class="post-item-author"]/span/text()').extract_first()
    #         # url=article.xpath('.//a[@class="post-item-title"]/@href').extract_first()
    #         url=article.css('a.post-item-title::attr(href)').extract_first()
    #         # persistence option 1; forget it once learned
    #         ll.append({'title':title,'desc':desc,'url':url})
    #         # print(url)
    #         # callback is the parse method for the new request
    #         yield Request(url,callback=self.parser_detail)
    #
    #
    #     # parse the next page
    #     # next=response.xpath('//div[@class="pager"]/a[last()]/@href').extract_first()
    #     next=response.css('div.pager a:last-child::attr(href)').extract_first()
    #     next='https://www.cnblogs.com'+next
    #     print(next)
    #     # yield Request(next)
    #     return ll

    def parse(self, response):  # parse method
        print(type(response))
        article_list = response.xpath('//article[@class="post-item"]')
        for article in article_list:
            article_item = items.CnblogsSpiderItem()
            # earlier selector attempts, overwritten below:
            # author=article.css('a.post-item-author>span::text').extract_first()
            # author=article.css('footer.post-item-foot span::text').extract_first()
            title = article.xpath('.//a[@class="post-item-title"]/text()').extract_first()
            desc = article.css('p.post-item-summary::text').extract()[-1]
            author = article.xpath('.//a[@class="post-item-author"]/span/text()').extract_first()
            url = article.css('a.post-item-title::attr(href)').extract_first()
            # set the item's fields (attribute access does not work; use [] only)
            # article_item.url=url
            # article_item.title=title
            # article_item.desc=desc
            # article_item.author=author
            article_item['url']=url
            article_item['title']=title
            article_item['desc']=desc
            article_item['author']=author
            # print(title)
            # yield article_item
            yield Request(url=url,callback=self.parser_detail,meta={'item':article_item})

            # yield Request(url, callback=self.parser_detail)
        next = response.css('div.pager a:last-child::attr(href)').extract_first()
        next = 'https://www.cnblogs.com' + next
        print(next)
        yield Request(next)

    def parser_detail(self,response):
        item=response.meta.get('item')
        # Grab the tag and store it directly as an HTML string (the tag keeps its styling; storing plain text would lose it)
        content=response.css('#cnblogs_post_body').extract_first()
        # put the article body into the item
        item['content']=str(content)
        yield item

cnblogs_spider\spiders\chouti.py  # verifies the custom dedup rule

import scrapy


from scrapy.dupefilters import RFPDupeFilter  # the dedup class used by default
# DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'  is hard-coded in the built-in settings; to swap it out, write your own class

from scrapy import Request
from selenium import webdriver
class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['www.bilibili.com']
    # start URL
    start_urls = ['https://www.bilibili.com/v/dance/otaku/#/all/default/0/1/']
    bro=webdriver.Chrome(executable_path='./chromedriver')
    bro.implicitly_wait(10)
    @staticmethod
    def close(spider, reason):
        spider.bro.close()


    # The real starting requests (see the source code); you can skip start_urls and override start_requests directly
    # def start_requests(self):
    #     yield Request('http://www.baidu.com')

    def parse(self, response):
        # print(response.text)
        li_list=response.css('ul.vd-list li')
        print(len(li_list))
        for li in li_list:
            url=li.css('div.r>a::attr(href)').extract_first()
            print(url)
            # yield Request(url='https:'+url,callback=self.parser_detail)
            yield Request(url='https://www.bilibili.com/v/dance/otaku/#/all/default/0/1/')


    def parser_detail(self,response):
        print(len(response.text))

 
