scrapy-redis Spider Example: a Distributed Crawler for Youyuan (有缘网)
For setting up the distributed Redis environment, see: CentOS Redis installation and distributed connections.
This is essentially the scrapy-redis version of a CrawlSpider: write a working CrawlSpider first, then modify it on that basis.
1. scrapy startproject youyuanSpider
items.py
```python
import scrapy

class YouyuanspiderItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()        # username
    age = scrapy.Field()         # age
    hobby = scrapy.Field()       # hobbies
    header_url = scrapy.Field()  # avatar image URL
    image_url = scrapy.Field()   # album image URLs
    content = scrapy.Field()     # inner monologue
    place = scrapy.Field()       # hometown
    education = scrapy.Field()   # education
    source_url = scrapy.Field()  # personal profile page URL
    source = scrapy.Field()      # source site
```
2. scrapy genspider -t crawl youyuan "www.youyuan.com"
spiders/youyuan.py (when running distributed with scrapy-redis, points 1-5 marked below are what needs changing; if you only use Redis for storage without distributing, or don't use Redis at all, these points can stay as they were)
```python
# -*- coding: utf-8 -*-
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
# from ..items import YouyuanspiderItem
from items import YouyuanspiderItem

import scrapy
from scrapy.linkextractors import LinkExtractor
# from scrapy.spiders import CrawlSpider, Rule
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider  # 1. import RedisCrawlSpider instead of CrawlSpider
import re

# class YouyuanSpider(CrawlSpider):
class YouyuanSpider(RedisCrawlSpider):  # 2. change the parent class to RedisCrawlSpider
    name = 'youyuan'

    # 3. drop allowed_domains and start_urls
    # allowed_domains = ['youyuan.com']
    # start_urls = ['http://www.youyuan.com/find/shanghai/mm18-30/advance-0-0-0-0-0-0-0/p1/']

    # 4. add the redis key the spider reads its start URLs from
    redis_key = "youyuanSpider:start_urls"

    # 5. add __init__() to build allowed_domains dynamically
    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list.
        domain = kwargs.pop('domain', '')
        # wrap in list(): in Python 3, filter() returns an iterator
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(YouyuanSpider, self).__init__(*args, **kwargs)

    page_links = LinkExtractor(allow=r"/find/shanghai/mm18-30/advance-0-0-0-0-0-0-0/p\d+/")
    profile_links = LinkExtractor(allow=r"/\d+-profile/")

    rules = (
        Rule(page_links),
        Rule(profile_links, callback='parse_item'),
    )

    def parse_item(self, response):
        item = YouyuanspiderItem()

        item['name'] = self.get_name(response)
        item['age'] = self.get_age(response)
        item['hobby'] = self.get_hobby(response)
        item['header_url'] = self.get_header_url(response)
        item['image_url'] = self.get_image_url(response)
        item['content'] = self.get_content(response)
        item['place'] = self.get_place(response)
        item['education'] = self.get_education(response)
        item['source_url'] = response.url
        item['source'] = 'youyuan'
        yield item

    def get_name(self, response):
        name = response.xpath("//dd/div/strong/text()").extract()
        if len(name):
            name = name[0].strip()
        else:
            name = 'NULL'
        return name

    def get_age(self, response):
        ages = response.xpath("//dd/p/text()").extract()
        if len(ages):
            # age = re.match(r'\w+\s\s(\d+岁)', ages[0]).group(1)  # extraction method 1
            # age = re.findall(r'\d+岁', ages[0])[0]               # extraction method 2
            age = ages[0].split(' ')[2]                            # extraction method 3
        else:
            age = 'NULL'
        return age.strip()

    def get_hobby(self, response):
        hobbys = response.xpath("//dd/ol/li/text()").extract()
        if len(hobbys):
            hobby = ','.join(hobbys).replace(' ', '').replace('\xa0', '')
        else:
            hobby = 'NULL'
        return hobby

    def get_header_url(self, response):
        header_url = response.xpath("//dt/img/@src").extract()
        if len(header_url):
            header_url = header_url[0]
        else:
            header_url = 'NULL'
        return header_url

    def get_image_url(self, response):
        image_url = response.xpath("//ul[@class='block_photo']/li/a/img/@src").extract()
        if len(image_url):
            image_url = image_url  # keep the whole list of image URLs
        else:
            image_url = 'NULL'
        return image_url

    def get_content(self, response):
        content = response.xpath("//ul[@class='requre']/li[1]/p/text()").extract()
        if len(content):
            content = content[0].strip()
        else:
            content = 'NULL'
        return content

    def get_place(self, response):
        place = response.xpath("//div[@class='pre_data']/ul/li[2]//ol[1]/li[1]/span/text()").extract()
        if len(place):
            place = place[0]
        else:
            place = 'NULL'
        return place

    def get_education(self, response):
        education = response.xpath("//div[@class='pre_data']/ul/li[2]//ol[1]/li[3]/span/text()").extract()
        if len(education):
            education = education[0]
        else:
            education = 'NULL'
        return education
```
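A note on point 5, with one detail added here: `filter()` returns an iterator in Python 3, hence the `list()` wrapper in `__init__` above; the domain list itself can then be supplied at launch through Scrapy's standard `-a` spider-argument mechanism, e.g.:

```
scrapy runspider youyuan.py -a domain=youyuan.com
```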
3. pipelines.py (this class is used when not using Redis)
```python
import json

class YouyuanspiderPipeline(object):
    def process_item(self, item, spider):
        with open('youyuan.json', 'a') as f:
            f.write(json.dumps(dict(item), ensure_ascii=False) + ',\n')
        return item
```
4. settings.py
```python
# The next four settings enable local Redis or scrapy-redis distributed mode.
# Use the scrapy-redis dupefilter instead of Scrapy's default dedup
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scrapy-redis scheduler instead of Scrapy's default scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Allow pause/resume: the request records in Redis are not lost
SCHEDULER_PERSIST = True
# Queue-style (FIFO) scheduling; newer scrapy-redis releases name this class FifoQueue
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"

DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

SPIDER_MIDDLEWARES = {
    # 'youyuanSpider.middlewares.YouyuanspiderSpiderMiddleware': 543,
    # Required in scrapy-redis distributed mode; otherwise requests are dropped with:
    # [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.youyuan.com'
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None,
}

ITEM_PIPELINES = {
    'youyuanSpider.pipelines.YouyuanspiderPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,  # for local Redis or scrapy-redis distributed mode
}
```
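One thing the settings above leave implicit: scrapy-redis connects to Redis on localhost by default, so Slave nodes must be pointed at the Master's Redis. A minimal sketch using scrapy-redis' standard connection settings (the IP below is a placeholder for the Master's address):

```python
# settings.py on Slave nodes: where scrapy-redis finds Redis (default is localhost:6379)
REDIS_HOST = '192.168.1.100'  # placeholder: the Master's IP
REDIS_PORT = 6379
# or equivalently as a single URL:
# REDIS_URL = 'redis://192.168.1.100:6379'
```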
Non-distributed run:
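With a local Redis running on the default port, a single-machine run of the RedisCrawlSpider version uses the same mechanics as the distributed one (a sketch; run from the spiders directory because of the sys.path import above):

```
cd youyuanSpider/youyuanSpider/spiders
scrapy runspider youyuan.py
# in another terminal, seed the start URL:
redis-cli lpush youyuanSpider:start_urls http://www.youyuan.com/find/shanghai/mm18-30/advance-0-0-0-0-0-0-0/p1/
```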

Distributed run:
1. On the Master, start the redis-server service and a redis-cli session; on the Slave, start a redis-cli session.
Master:
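A sketch of the usual commands (the redis.conf path is install-specific, and its bind setting must allow connections from the Slaves):

```
redis-server /etc/redis/redis.conf   # start the Redis server
redis-cli                            # open a local client session
```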


Slave:
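On the Slave, connect to the Master's Redis rather than a local one (a sketch; the IP is a placeholder for the Master's address):

```
redis-cli -h 192.168.1.100 -p 6379
```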

2. Start the spider on the Master and/or the Slaves, in any order. (Only the Master is shown as the example here. The Slave is a CentOS Linux environment with Python 3.7.3 installed, where pip install scrapy failed with an SSL error that left Python unusable for this; I tried many fixes found online without success, so I've shelved it for now and will revisit it when I have time.)

(PS: remember to cd into the E:\python_practice_ku\pachong\youyuanSpider\youyuanSpider\spiders> directory first)
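The launch on each node is just the following (a sketch; optionally add -a domain=youyuan.com as noted earlier). The spider then sits idle, polling its redis_key, until a start URL is pushed in step 3:

```
scrapy runspider youyuan.py
```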
The pip install scrapy error in the virtual machine is shown below, still unresolved (that SSL error usually means Python 3.7 was compiled without OpenSSL support; a commonly suggested fix is to install openssl-devel and rebuild Python, though that hasn't been verified here):

3. In redis-cli on the Master, push a start_urls entry:
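The key is the redis_key defined in the spider, and the URL is the one from the commented-out start_urls:

```
lpush youyuanSpider:start_urls http://www.youyuan.com/find/shanghai/mm18-30/advance-0-0-0-0-0-0-0/p1/
```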

4. The spiders start crawling; check the data in the Redis database.
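A few redis-cli commands for checking (a sketch; the key names follow scrapy-redis conventions, and youyuan:items is the list that scrapy_redis.pipelines.RedisPipeline writes and the scripts below consume):

```
keys *                     # expect youyuan:items, youyuan:dupefilter, youyuan:requests
llen youyuan:items         # how many items have been scraped so far
lrange youyuan:items 0 0   # peek at the first stored item
```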


Reference: distributed example - official documentation
Storing the Redis data in MongoDB:
process_youyuan_profile.py
```python
import json
import redis
import pymongo

def item_mongo():
    # Redis connection info
    redisCli = redis.StrictRedis(host='127.0.0.1', port=6379)
    # MongoDB connection info
    mongoCli = pymongo.MongoClient(host='127.0.0.1', port=27017)

    # database name
    db = mongoCli['youyuan']
    # collection name
    sheet = db['shanghai_18_30']

    while True:
        # blpop pops in FIFO order (brpop would be LIFO); returns (key, value)
        source, data = redisCli.blpop(['youyuan:items'])
        print(source)  # prints the key name, e.g. b'youyuan:items' in Python 3

        item = json.loads(data)
        sheet.insert_one(item)

        try:
            # the item has no 'link' field, so use source_url here
            print("Processing: %(name)s <%(source_url)s>" % item)
        except KeyError:
            print("Error processing: %r" % item)

if __name__ == '__main__':
    item_mongo()
```

Storing the Redis data in MySQL:
process_item_for_sql.py
```python
import redis
import pymysql
import json

def item_mysql():
    # Redis connection info
    rediscli = redis.StrictRedis(host='127.0.0.1', port=6379)
    # MySQL connection info
    mysqlcli = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                               password='mysql123', db='youyuan')

    offset = 0
    while True:
        # blpop pops in FIFO order (brpop would be LIFO); returns (key, value)
        source, data = rediscli.blpop(["youyuan:items"])
        item = json.loads(data)

        try:
            # get an operation cursor with cursor()
            cur = mysqlcli.cursor()
            sql = "insert into youyuan(name,age,hobby,header_url,image_url,content,place,education,source_url,source) " \
                  "values('{}','{}','{}','{}','{}','{}','{}','{}','{}','{}')".format(
                      item['name'], item['age'], item['hobby'], item['header_url'],
                      item['image_url'], item['content'], item['place'],
                      item['education'], item['source_url'], item['source'])
            # execute the SQL INSERT statement
            cur.execute(sql)
            # commit the transaction
            mysqlcli.commit()
            # close the cursor for this round
            cur.close()

            offset += 1
            print(offset)
        except pymysql.Error as e:
            print("MySQL Error: %s" % e)

if __name__ == '__main__':
    item_mysql()
```
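One caveat about the script above: building the INSERT by string formatting breaks as soon as a field contains a quote, and item['image_url'] can be a Python list (see get_image_url). A safer sketch for the middle of the try block, using pymysql's parameterized execute:

```python
keys = ['name', 'age', 'hobby', 'header_url', 'image_url', 'content',
        'place', 'education', 'source_url', 'source']
sql = "insert into youyuan({}) values({})".format(
    ','.join(keys), ','.join(['%s'] * len(keys)))
# Serialize list-valued fields (e.g. image_url) so they fit a text column.
values = [json.dumps(item[k], ensure_ascii=False) if isinstance(item[k], list)
          else item[k] for k in keys]
cur.execute(sql, values)  # pymysql escapes each value safely
```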

posted on 2020-03-23 20:13 by cherry_ning