Scrapy example: crawl Douban movies, store the data in MongoDB, and use custom middlewares
1. Create a new project (scrapy startproject)
scrapy startproject doubanSpider
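This creates the standard Scrapy project skeleton; all the files edited below live inside it:

doubanSpider/
    scrapy.cfg              # deploy configuration
    doubanSpider/
        __init__.py
        items.py            # item definitions
        middlewares.py      # downloader middlewares
        pipelines.py        # item pipelines
        settings.py         # project settings
        spiders/
            __init__.py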
2. Define the items (doubanSpider/items.py)
import scrapy

class DoubanspiderItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    content = scrapy.Field()
    num = scrapy.Field()
    quote = scrapy.Field()
3. Write the spider (spiders/douban.py)
1. Generate the spider: scrapy genspider douban "movie.douban.com"
2. Open douban.py in the doubanSpider/spiders directory and edit it as follows:
# -*- coding: utf-8 -*-
import scrapy
from ..items import DoubanspiderItem

class DoubanSpider(scrapy.Spider):
    name = 'douban'
    allowed_domains = ['movie.douban.com']

    url = "https://movie.douban.com/top250?start="
    offset = 0
    start_urls = [url + str(offset)]

    def parse(self, response):
        node_list = response.xpath("//div[@class='info']")
        for node in node_list:
            item = DoubanspiderItem()
            item['name'] = node.xpath('./div[1]/a/span[1]/text()').extract()[0]
            content = []
            content.append(node.xpath('./div[2]/p[1]/text()').extract()[0].strip())
            # text()[last()] takes the text node that follows the <br>
            content.append(node.xpath('./div[2]/p[1]/text()[last()]').extract()[0].strip())
            item['content'] = content
            item['num'] = node.xpath('./div[2]/div/span[2]/text()').extract()[0]
            # extract_first('') avoids an IndexError on the few entries without a quote
            item['quote'] = node.xpath('./div[2]/p[2]/span/text()').extract_first('')

            yield item

        # each page holds 25 movies; keep requesting until start=100 (the first five pages)
        if self.offset < 100:
            self.offset += 25
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
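On recent Scrapy versions, extract() and extract()[0] also have the shorter equivalents getall() and get(); for example, the name field above could equally be written as:

item['name'] = node.xpath('./div[1]/a/span[1]/text()').get()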
4. Store the data (pipelines.py)
First modify settings.py in the following places (the log can also be written to a file):
LOG_FILE = "douban.log"
LOG_LEVEL = "DEBUG"

ROBOTSTXT_OBEY = True

# Unless there is a specific need, disable cookies so sites cannot block the crawler by tracking them.
COOKIES_ENABLED = False

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# register the custom downloader middleware classes written below
DOWNLOADER_MIDDLEWARES = {
    'doubanSpider.middlewares.RandomUserAgent': 100,
    'doubanSpider.middlewares.RandomProxy': 200,
}

USER_AGENTS = [
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13'
]

PROXIES = [
    {"ip_port": "124.237.83.14:53281", "user_passwd": ""},
    # {"ip_port": "121.42.140.113:16816", "user_passwd": "888888"},
    # {"ip_port": "121.42.140.113:16816", "user_passwd": ""},
]

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'doubanSpider.pipelines.DoubanspiderPipeline': 300,
}

# MongoDB host
MONGO_HOST = "127.0.0.1"
# MongoDB port
MONGO_PORT = 27017
# database name
MONGO_DBNAME = "Douban"
# collection that stores the data
MONGO_SHEETNAME = "doubanmovies"
Then write pipelines.py (store the data in MongoDB):
from scrapy.utils.project import get_project_settings  # gives access to settings.py
import pymongo

class DoubanspiderPipeline(object):
    def __init__(self):
        settings = get_project_settings()
        host = settings.get('MONGO_HOST')
        port = settings.get('MONGO_PORT')
        dbname = settings.get('MONGO_DBNAME')
        sheetname = settings.get('MONGO_SHEETNAME')

        # create the MongoDB connection
        client = pymongo.MongoClient(host=host, port=port)
        # select the database
        db = client[dbname]
        # collection that the scraped data goes into
        self.sheet = db[sheetname]

    def process_item(self, item, spider):
        # insert_one() replaces insert(), which was removed in pymongo 4
        self.sheet.insert_one(dict(item))
        return item
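A common variant pulls the settings in through from_crawler and closes the connection when the spider finishes; a minimal sketch (the class name MongoPipeline is illustrative, not from the original project):

import pymongo

class MongoPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        # crawler.settings exposes settings.py without a global lookup
        return cls(
            host=crawler.settings.get('MONGO_HOST'),
            port=crawler.settings.get('MONGO_PORT'),
            dbname=crawler.settings.get('MONGO_DBNAME'),
            sheetname=crawler.settings.get('MONGO_SHEETNAME'),
        )

    def __init__(self, host, port, dbname, sheetname):
        self.client = pymongo.MongoClient(host=host, port=port)
        self.sheet = self.client[dbname][sheetname]

    def process_item(self, item, spider):
        self.sheet.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # release the MongoDB connection when the crawl ends
        self.client.close()

If used in place of the class above, register it in ITEM_PIPELINES under its own path.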
The stored data can then be checked from the MongoDB shell:
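A minimal session, assuming the database and collection names configured in settings.py above:

mongo
> show dbs
> use Douban
> db.doubanmovies.find().pretty()
> db.doubanmovies.count()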
Next, write middlewares.py:
import random
import base64
from scrapy.utils.project import get_project_settings  # gives access to settings.py

# attach a random User-Agent to every outgoing request
class RandomUserAgent():
    def process_request(self, request, spider):
        useragent = random.choice(get_project_settings().get('USER_AGENTS'))
        # print(useragent)
        request.headers.setdefault("User-Agent", useragent)

# route every request through a randomly chosen proxy
class RandomProxy():
    def process_request(self, request, spider):
        proxy = random.choice(get_project_settings().get('PROXIES'))

        # an empty user_passwd marks a proxy that needs no authentication
        if not proxy['user_passwd']:
            request.meta['proxy'] = 'http://' + proxy['ip_port']
        # else:
        #     # base64-encode the credentials (bytes in, str out on Python 3)
        #     base64_userpasswd = base64.b64encode(proxy['user_passwd'].encode()).decode()
        #     # pass them along in the form the proxy server expects
        #     request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
        #     request.meta['proxy'] = "http://" + proxy['ip_port']
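Current Scrapy versions can also handle authenticated proxies without building the Proxy-Authorization header by hand: the built-in HttpProxyMiddleware reads credentials embedded in the proxy URL. A one-line sketch for the commented-out branch, assuming user_passwd holds text in the form user:password:

        # inside RandomProxy.process_request, for a proxy that needs authentication
        request.meta['proxy'] = 'http://' + proxy['user_passwd'] + '@' + proxy['ip_port']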
Finally, run the crawl:
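From the project root, using the spider name registered by genspider:

scrapy crawl douban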
posted on 2020-03-17 22:54 by cherry_ning