Common Scrapy spider settings

1. settings.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
@author: yugengde
@contact: yugengde@163.com
@file : settings.py
@time: 2017/11/22 15:41
"""

BOT_NAME = 'pro'

SPIDER_MODULES = ['pro.spiders']
NEWSPIDER_MODULE = 'pro.spiders'

# Ignore robots.txt rules
ROBOTSTXT_OBEY = False

# Throttle requests and disable cookies to reduce the chance of being banned
DOWNLOAD_DELAY = 3
COOKIES_ENABLED = False

# Lower numbers run first for process_request: UserAgentMiddleware (300)
# sets the User-Agent before PhantomJSMiddleware (301) fetches the page
DOWNLOADER_MIDDLEWARES = {
    'pro.middlewares.PhantomJSMiddleware': 301,
    'pro.middlewares.UserAgentMiddleware': 300,
}

# DuplicatesPipeline (300) drops duplicates before RedisPipeline (301)
# serializes the remaining items into Redis
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 301,
    'pro.pipelines.DuplicatesPipeline': 300,
}

# Logging
LOG_ENABLED = True
LOG_ENCODING = 'utf-8'
LOG_FILE = 'pro.log'
LOG_LEVEL = 'DEBUG'
# LOG_STDOUT = True  # uncomment to redirect stdout into the log file

# scrapy_redis: shared scheduler and duplicate filter for distributed crawling
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
REDIS_URL = 'redis://root:password@localhost:6379'
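
With the scrapy_redis scheduler enabled, a spider typically reads its start URLs from a Redis list instead of a hardcoded start_urls. A minimal sketch of such a spider (the spider name, redis_key, and parse logic are illustrative assumptions, not part of the original project):

# pro/spiders/example.py -- hypothetical spider for the 'pro' project
from scrapy_redis.spiders import RedisSpider


class ExampleSpider(RedisSpider):
    name = 'example'
    # The spider blocks until a URL is pushed onto this Redis list
    redis_key = 'example:start_urls'

    def parse(self, response):
        # Placeholder fields; DuplicatesPipeline below expects
        # 'title' and 'item_id'
        yield {
            'title': response.css('title::text').extract_first(),
            'item_id': response.url,
        }

The crawl is then kicked off by seeding the queue, e.g. redis-cli lpush example:start_urls http://example.com — every worker pointed at the same REDIS_URL shares this queue.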

 

2. middlewares.py

import random

from scrapy.http import HtmlResponse
from selenium import webdriver


class PhantomJSMiddleware(object):
    @classmethod
    def process_request(cls, request, spider):
        # Render the page with PhantomJS so JavaScript-generated content
        # is present in the HTML handed back to the spider
        driver = webdriver.PhantomJS(r'C:\InstallFile\Phantomjs\bin\phantomjs.exe')
        driver.get(request.url)
        content = driver.page_source.encode('utf-8')
        driver.quit()

        # Returning a Response from process_request short-circuits the
        # downloader: Scrapy passes this response straight to the spider
        return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)
 12 
 13 
class UserAgentMiddleware(object):
    @classmethod
    def process_request(cls, request, spider):
        # Rotate through a pool of User-Agent strings; a larger pool
        # could also be generated with fake_useragent's UserAgent()
        user_agents = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0",
            "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
        ]
        # Note the header name is 'User-Agent'; the original 'UserAgent'
        # would set a header that servers ignore
        request.headers.setdefault('User-Agent', random.choice(user_agents))

3. pipelines.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

from scrapy.exceptions import DropItem


# Drop items that lack a title and items whose id has already been seen
class DuplicatesPipeline(object):
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if not item['title']:
            raise DropItem("Missing title in %s" % item)

        if item['item_id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['item_id'])
            # process_item must return the item; the original `yield item`
            # would turn this method into a generator and break the pipeline
            return item
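
One caveat: since the settings above distribute the crawl with scrapy_redis, the in-memory ids_seen set is per-worker, so two workers can still emit the same item. A sketch of a Redis-backed variant (the key name 'pro:items_seen' is made up for illustration):

import redis
from scrapy.exceptions import DropItem


class RedisDuplicatesPipeline(object):
    """Sketch: shares the seen-ids set across workers via Redis."""

    def __init__(self):
        self.client = redis.StrictRedis.from_url('redis://root:password@localhost:6379')

    def process_item(self, item, spider):
        if not item['title']:
            raise DropItem("Missing title in %s" % item)
        # SADD returns 1 only if the id was not already in the set
        if self.client.sadd('pro:items_seen', item['item_id']) == 0:
            raise DropItem("Duplicate item found: %s" % item)
        return item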

 
