Scrapy example: crawl Douban movies, store the data in MongoDB, with custom middleware

1. Create a new project (scrapy startproject)

scrapy startproject doubanSpider
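The command generates Scrapy's standard project skeleton; the files edited in the following steps all live inside it:

doubanSpider/
    scrapy.cfg            # deploy configuration
    doubanSpider/
        __init__.py
        items.py          # step 2
        middlewares.py    # downloader middlewares
        pipelines.py      # step 4
        settings.py       # project settings
        spiders/          # step 3
            __init__.py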

 

2. Define the target fields (doubanSpider/items.py)

import scrapy

class DoubanspiderItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()     # movie title
    content = scrapy.Field()  # director/cast line and year/country/genre line
    num = scrapy.Field()      # rating score
    quote = scrapy.Field()    # one-line tagline
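scrapy.Item instances behave like dicts, which is what the pipeline later relies on when it calls dict(item); a quick illustration with hypothetical sample values:

from doubanSpider.items import DoubanspiderItem

item = DoubanspiderItem()
item['name'] = '肖申克的救赎'   # hypothetical sample value
item['num'] = '9.7'
print(dict(item))  # {'name': '肖申克的救赎', 'num': '9.7'}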

 

3. Write the spider (spiders/douban.py)

1. Generate the spider: scrapy genspider douban "movie.douban.com"

2. Open douban.py in the doubanSpider/spiders directory and edit it as follows:

# -*- coding: utf-8 -*-
import scrapy
from ..items import DoubanspiderItem

class DoubanSpider(scrapy.Spider):
    name = 'douban'
    allowed_domains = ['movie.douban.com']

    url = "https://movie.douban.com/top250?start="
    offset = 0
    start_urls = [url + str(offset)]

    def parse(self, response):
        node_list = response.xpath("//div[@class='info']")
        for node in node_list:
            item = DoubanspiderItem()
            item['name'] = node.xpath('./div[1]/a/span[1]/text()').extract()[0]
            content = []
            content.append(node.xpath('./div[2]/p[1]/text()').extract()[0].strip())
            # text()[last()] picks the text node after the <br>
            content.append(node.xpath('./div[2]/p[1]/text()[last()]').extract()[0].strip())
            item['content'] = content
            item['num'] = node.xpath('./div[2]/div/span[2]/text()').extract()[0]
            item['quote'] = node.xpath('./div[2]/p[2]/span/text()').extract()[0]

            yield item

        # keep paging until start=100; yielding inside the if stops the
        # crawl cleanly instead of re-requesting the last page forever
        if self.offset < 100:
            self.offset += 25
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
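Before running the full crawl, the XPath expressions can be sanity-checked in Scrapy's interactive shell (an optional step; the -s flag sets a one-off User-Agent, since Douban tends to reject Scrapy's default one):

scrapy shell -s USER_AGENT="Mozilla/5.0" "https://movie.douban.com/top250"
>>> len(response.xpath("//div[@class='info']"))   # expect 25 nodes per page
>>> response.xpath("//div[@class='info']/div[1]/a/span[1]/text()").extract_first()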

 

4. Store the data (pipelines.py)

Modify the following parts of settings.py (the two LOG_* lines redirect the log output into a file):

LOG_FILE = "douban.log"
LOG_LEVEL = "DEBUG"

ROBOTSTXT_OBEY = True

# Disable cookies unless you specifically need them; some sites
# use cookies to detect and block crawlers.
COOKIES_ENABLED = False

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# Register the custom downloader middleware classes written below
DOWNLOADER_MIDDLEWARES = {
    'doubanSpider.middlewares.RandomUserAgent': 100,
    'doubanSpider.middlewares.RandomProxy': 200,
}

USER_AGENTS = [
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13'
]

PROXIES = [
    {"ip_port": "124.237.83.14:53281", "user_passwd": ""},
    # {"ip_port": "121.42.140.113:16816", "user_passwd": "888888"},
    # {"ip_port": "121.42.140.113:16816", "user_passwd": ""},
]

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'doubanSpider.pipelines.DoubanspiderPipeline': 300,
}

# MongoDB host
MONGO_HOST = "127.0.0.1"
# MongoDB port
MONGO_PORT = 27017
# database name
MONGO_DBNAME = "Douban"
# collection that stores the scraped data
MONGO_SHEETNAME = "doubanmovies"

Write pipelines.py (store the data in MongoDB):

import pymongo
from scrapy.utils.project import get_project_settings  # reads settings.py

class DoubanspiderPipeline(object):
    def __init__(self):
        settings = get_project_settings()
        host = settings.get('MONGO_HOST')
        port = settings.get('MONGO_PORT')
        dbname = settings.get('MONGO_DBNAME')
        sheetname = settings.get('MONGO_SHEETNAME')

        # connect to MongoDB
        client = pymongo.MongoClient(host=host, port=port)
        # select the database
        db = client[dbname]
        # collection that stores the scraped items
        self.sheet = db[sheetname]

    def process_item(self, item, spider):
        # insert() is deprecated in pymongo 3.x; use insert_one()
        self.sheet.insert_one(dict(item))
        return item
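As a side note, the same pipeline can be written with Scrapy's from_crawler hook instead of get_project_settings(), which also makes it easy to close the connection when the spider finishes. A minimal sketch (MongoPipeline is a hypothetical name; the MONGO_* settings are the ones defined above):

import pymongo

class MongoPipeline(object):
    def __init__(self, host, port, dbname, sheetname):
        self.client = pymongo.MongoClient(host=host, port=port)
        self.sheet = self.client[dbname][sheetname]

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this classmethod and hands in the running crawler
        s = crawler.settings
        return cls(s.get('MONGO_HOST'), s.getint('MONGO_PORT'),
                   s.get('MONGO_DBNAME'), s.get('MONGO_SHEETNAME'))

    def process_item(self, item, spider):
        self.sheet.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # release the MongoDB connection when the crawl ends
        self.client.close()

To use this variant, point ITEM_PIPELINES at doubanSpider.pipelines.MongoPipeline instead.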

The MongoDB shell commands to verify the stored data are as follows (a minimal check, using the database and collection names from settings.py):
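mongo
> use Douban
> show collections
> db.doubanmovies.count()
> db.doubanmovies.find().pretty()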

Write middlewares.py:

import random
import base64
from scrapy.utils.project import get_project_settings  # reads settings.py

# set a random User-Agent on every request
class RandomUserAgent(object):
    def process_request(self, request, spider):
        useragent = random.choice(get_project_settings().get('USER_AGENTS'))
        # print(useragent)
        request.headers.setdefault("User-Agent", useragent)

# route each request through a random proxy from PROXIES
class RandomProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(get_project_settings().get('PROXIES'))

        if not proxy['user_passwd']:
            # proxy without authentication
            request.meta['proxy'] = 'http://' + proxy['ip_port']
        else:
            # proxy with HTTP Basic authentication: base64-encode the
            # credentials (expected in 'user:password' form; bytes in
            # Python 3, hence the encode/decode round trip)
            base64_userpasswd = base64.b64encode(proxy['user_passwd'].encode()).decode()
            request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
            request.meta['proxy'] = 'http://' + proxy['ip_port']
 

Run the crawl:
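From the project root (with the LOG_FILE setting above, the output goes to douban.log instead of the console):

scrapy crawl douban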
