What is 增量式爬虫?
用来 监测 网站数据更新的情况,只会爬取网站中更新出来的新数据
增量式爬虫的核心
去重,因为你爬取到的数据是不可以出现重复的
怎么进行增量式爬取呢?
- 在发送请求之前判断这个URL是不是之前爬取过
- 在解析内容后判断这部分内容是不是之前爬取过
- 写入存储到 redis 时判断内容是不是已经在介质中存在
#总结分析
对比三种方式增量爬取的核心是去重, 至于去重的操作在哪个步骤起作用,只能说各有利弊。在我看来,前两种思路需要根据实际情况取一个(也可能都用)。
#第一种思路适合不断有新页面出现的网站,比如说小说的新章节,每天的最新新闻等等;
#第二种思路则适合页面内容会更新的网站。
#第三个思路是相当于是最后的一道防线。这样做可以最大程度上达到去重的目的。
去重方法
1. 将爬取过程中产生的url进行存储,存储在redis的set中。当下次进行数据爬取时,首先对即将要发起的请求对应的url在存储的url的set中做判断,如果存在则不进行请求,否则才进行请求。
2. 对爬取到的网页内容进行唯一标识的制定,然后将该唯一表示存储至redis的set中。当下次爬取到网页数据的时候,在进行持久化存储之前,首先可以先判断该数据的唯一标识在redis的set中是否存在,在决定是否进行持久化存储。
项目实例
demo1 爬取4567tv网站中所有的电影详情数据
spider.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from incrementPro.items import IncrementproItem
class MovieSpider(CrawlSpider):
name = 'movie'
# allowed_domains = ['www.xxx.com']
start_urls = ['http://www.4567tv.tv/frim/index7-11.html']
rules = (
Rule(LinkExtractor(allow=r'/frim/index7-\d+\.html'), callback='parse_item', follow=True),
)
#创建redis链接对象
conn = Redis(host='127.0.0.1',port=6379)
def parse_item(self, response):
li_list = response.xpath('//li[@class="p1 m1"]')
for li in li_list:
#获取详情页的url
detail_url = 'http://www.4567tv.tv'+li.xpath('./a/@href').extract_first()
#将详情页的url存入redis的set中
ex = self.conn.sadd('urls',detail_url)
if ex == 1:
print('该url没有被爬取过,可以进行数据的爬取')
yield scrapy.Request(url=detail_url,callback=self.parst_detail)
else:
print('数据还没有更新,暂无新数据可爬取!')
#解析详情页中的电影名称和类型,进行持久化存储
def parst_detail(self,response):
item = IncrementproItem()
item['name'] = response.xpath('//dt[@class="name"]/text()').extract_first()
item['kind'] = response.xpath('//div[@class="ct-c"]/dl/dt[4]//text()').extract()
item['kind'] = ''.join(item['kind'])
yield item
pipeline.py
from redis import Redis
class IncrementproPipeline(object):
conn = None
def open_spider(self,spider):
self.conn = Redis(host='127.0.0.1',port=6379)
def process_item(self, item, spider):
dic = {
'name':item['name'],
'kind':item['kind']
}
print(dic)
self.conn.lpush('movieData',dic)
return item
demo2 爬取糗事百科中的段子和作者数据。
spider.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from incrementByDataPro.items import IncrementbydataproItem
from redis import Redis
import hashlib
class QiubaiSpider(CrawlSpider):
name = 'qiubai'
# allowed_domains = ['www.xxx.com']
start_urls = ['https://www.qiushibaike.com/text/']
rules = (
Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
Rule(LinkExtractor(allow=r'/text/$'), callback='parse_item', follow=True),
)
#创建redis链接对象
conn = Redis(host='127.0.0.1',port=6379)
def parse_item(self, response):
div_list = response.xpath('//div[@id="content-left"]/div')
for div in div_list:
item = IncrementbydataproItem()
item['author'] = div.xpath('./div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()').extract_first()
item['content'] = div.xpath('.//div[@class="content"]/span/text()').extract_first()
#将解析到的数据值生成一个唯一的标识进行redis存储
source = item['author']+item['content']
source_id = hashlib.sha256(source.encode()).hexdigest()
#将解析内容的唯一表示存储到redis的data_id中
ex = self.conn.sadd('data_id',source_id)
if ex == 1:
print('该条数据没有爬取过,可以爬取......')
yield item
else:
print('该条数据已经爬取过了,不需要再次爬取了!!!')
pipeline.py
from redis import Redis
class IncrementbydataproPipeline(object):
conn = None
def open_spider(self, spider):
self.conn = Redis(host='127.0.0.1', port=6379)
def process_item(self, item, spider):
dic = {
'author': item['author'],
'content': item['content']
}
# print(dic)
self.conn.lpush('qiubaiData', dic)
return item
CrazyShenldon
posted @
2019-05-09 23:04
FindSoul
阅读(
533)
评论()
收藏
举报
var RENDERER = {
POINT_INTERVAL : 5,
FISH_COUNT : 3,
MAX_INTERVAL_COUNT : 50,
INIT_HEIGHT_RATE : 0.5,
THRESHOLD : 50,
init : function(){
this.setParameters();
this.reconstructMethods();
this.setup();
this.bindEvent();
this.render();
},
setParameters : function(){
this.$window = $(window);
this.$container = $('#jsi-flying-fish-container');
this.$canvas = $('');
this.context = this.$canvas.appendTo(this.$container).get(0).getContext('2d');
this.points = [];
this.fishes = [];
this.watchIds = [];
},
createSurfacePoints : function(){
var count = Math.round(this.width / this.POINT_INTERVAL);
this.pointInterval = this.width / (count - 1);
this.points.push(new SURFACE_POINT(this, 0));
for(var i = 1; i < count; i++){
var point = new SURFACE_POINT(this, i * this.pointInterval),
previous = this.points[i - 1];
point.setPreviousPoint(previous);
previous.setNextPoint(point);
this.points.push(point);
}
},
reconstructMethods : function(){
this.watchWindowSize = this.watchWindowSize.bind(this);
this.jdugeToStopResize = this.jdugeToStopResize.bind(this);
this.startEpicenter = this.startEpicenter.bind(this);
this.moveEpicenter = this.moveEpicenter.bind(this);
this.reverseVertical = this.reverseVertical.bind(this);
this.render = this.render.bind(this);
},
setup : function(){
this.points.length = 0;
this.fishes.length = 0;
this.watchIds.length = 0;
this.intervalCount = this.MAX_INTERVAL_COUNT;
this.width = this.$container.width();
this.height = this.$container.height();
this.fishCount = this.FISH_COUNT * this.width / 500 * this.height / 500;
this.$canvas.attr({width : this.width, height : this.height});
this.reverse = false;
this.fishes.push(new FISH(this));
this.createSurfacePoints();
},
watchWindowSize : function(){
this.clearTimer();
this.tmpWidth = this.$window.width();
this.tmpHeight = this.$window.height();
this.watchIds.push(setTimeout(this.jdugeToStopResize, this.WATCH_INTERVAL));
},
clearTimer : function(){
while(this.watchIds.length > 0){
clearTimeout(this.watchIds.pop());
}
},
jdugeToStopResize : function(){
var width = this.$window.width(),
height = this.$window.height(),
stopped = (width == this.tmpWidth && height == this.tmpHeight);
this.tmpWidth = width;
this.tmpHeight = height;
if(stopped){
this.setup();
}
},
bindEvent : function(){
this.$window.on('resize', this.watchWindowSize);
this.$container.on('mouseenter', this.startEpicenter);
this.$container.on('mousemove', this.moveEpicenter);
this.$container.on('click', this.reverseVertical);
},
getAxis : function(event){
var offset = this.$container.offset();
return {
x : event.clientX - offset.left + this.$window.scrollLeft(),
y : event.clientY - offset.top + this.$window.scrollTop()
};
},
startEpicenter : function(event){
this.axis = this.getAxis(event);
},
moveEpicenter : function(event){
var axis = this.getAxis(event);
if(!this.axis){
this.axis = axis;
}
this.generateEpicenter(axis.x, axis.y, axis.y - this.axis.y);
this.axis = axis;
},
generateEpicenter : function(x, y, velocity){
if(y < this.height / 2 - this.THRESHOLD || y > this.height / 2 + this.THRESHOLD){
return;
}
var index = Math.round(x / this.pointInterval);
if(index < 0 || index >= this.points.length){
return;
}
this.points[index].interfere(y, velocity);
},
reverseVertical : function(){
this.reverse = !this.reverse;
for(var i = 0, count = this.fishes.length; i < count; i++){
this.fishes[i].reverseVertical();
}
},
controlStatus : function(){
for(var i = 0, count = this.points.length; i < count; i++){
this.points[i].updateSelf();
}
for(var i = 0, count = this.points.length; i < count; i++){
this.points[i].updateNeighbors();
}
if(this.fishes.length < this.fishCount){
if(--this.intervalCount == 0){
this.intervalCount = this.MAX_INTERVAL_COUNT;
this.fishes.push(new FISH(this));
}
}
},
render : function(){
requestAnimationFrame(this.render);
this.controlStatus();
this.context.clearRect(0, 0, this.width, this.height);
this.context.fillStyle = 'hsl(0, 0%, 95%)';
for(var i = 0, count = this.fishes.length; i < count; i++){
this.fishes[i].render(this.context);
}
this.context.save();
this.context.globalCompositeOperation = 'xor';
this.context.beginPath();
this.context.moveTo(0, this.reverse ? 0 : this.height);
for(var i = 0, count = this.points.length; i < count; i++){
this.points[i].render(this.context);
}
this.context.lineTo(this.width, this.reverse ? 0 : this.height);
this.context.closePath();
this.context.fill();
this.context.restore();
}
};
var SURFACE_POINT = function(renderer, x){
this.renderer = renderer;
this.x = x;
this.init();
};
SURFACE_POINT.prototype = {
SPRING_CONSTANT : 0.03,
SPRING_FRICTION : 0.9,
WAVE_SPREAD : 0.3,
ACCELARATION_RATE : 0.01,
init : function(){
this.initHeight = this.renderer.height * this.renderer.INIT_HEIGHT_RATE;
this.height = this.initHeight;
this.fy = 0;
this.force = {previous : 0, next : 0};
},
setPreviousPoint : function(previous){
this.previous = previous;
},
setNextPoint : function(next){
this.next = next;
},
interfere : function(y, velocity){
this.fy = this.renderer.height * this.ACCELARATION_RATE * ((this.renderer.height - this.height - y) >= 0 ? -1 : 1) * Math.abs(velocity);
},
updateSelf : function(){
this.fy += this.SPRING_CONSTANT * (this.initHeight - this.height);
this.fy *= this.SPRING_FRICTION;
this.height += this.fy;
},
updateNeighbors : function(){
if(this.previous){
this.force.previous = this.WAVE_SPREAD * (this.height - this.previous.height);
}
if(this.next){
this.force.next = this.WAVE_SPREAD * (this.height - this.next.height);
}
},
render : function(context){
if(this.previous){
this.previous.height += this.force.previous;
this.previous.fy += this.force.previous;
}
if(this.next){
this.next.height += this.force.next;
this.next.fy += this.force.next;
}
context.lineTo(this.x, this.renderer.height - this.height);
}
};
var FISH = function(renderer){
this.renderer = renderer;
this.init();
};
FISH.prototype = {
GRAVITY : 0.4,
init : function(){
this.direction = Math.random() < 0.5;
this.x = this.direction ? (this.renderer.width + this.renderer.THRESHOLD) : -this.renderer.THRESHOLD;
this.previousY = this.y;
this.vx = this.getRandomValue(4, 10) * (this.direction ? -1 : 1);
if(this.renderer.reverse){
this.y = this.getRandomValue(this.renderer.height * 1 / 10, this.renderer.height * 4 / 10);
this.vy = this.getRandomValue(2, 5);
this.ay = this.getRandomValue(0.05, 0.2);
}else{
this.y = this.getRandomValue(this.renderer.height * 6 / 10, this.renderer.height * 9 / 10);
this.vy = this.getRandomValue(-5, -2);
this.ay = this.getRandomValue(-0.2, -0.05);
}
this.isOut = false;
this.theta = 0;
this.phi = 0;
},
getRandomValue : function(min, max){
return min + (max - min) * Math.random();
},
reverseVertical : function(){
this.isOut = !this.isOut;
this.ay *= -1;
},
controlStatus : function(context){
this.previousY = this.y;
this.x += this.vx;
this.y += this.vy;
this.vy += this.ay;
if(this.renderer.reverse){
if(this.y > this.renderer.height * this.renderer.INIT_HEIGHT_RATE){
this.vy -= this.GRAVITY;
this.isOut = true;
}else{
if(this.isOut){
this.ay = this.getRandomValue(0.05, 0.2);
}
this.isOut = false;
}
}else{
if(this.y < this.renderer.height * this.renderer.INIT_HEIGHT_RATE){
this.vy += this.GRAVITY;
this.isOut = true;
}else{
if(this.isOut){
this.ay = this.getRandomValue(-0.2, -0.05);
}
this.isOut = false;
}
}
if(!this.isOut){
this.theta += Math.PI / 20;
this.theta %= Math.PI * 2;
this.phi += Math.PI / 30;
this.phi %= Math.PI * 2;
}
this.renderer.generateEpicenter(this.x + (this.direction ? -1 : 1) * this.renderer.THRESHOLD, this.y, this.y - this.previousY);
if(this.vx > 0 && this.x > this.renderer.width + this.renderer.THRESHOLD || this.vx < 0 && this.x < -this.renderer.THRESHOLD){
this.init();
}
},
render : function(context){
context.save();
context.translate(this.x, this.y);
context.rotate(Math.PI + Math.atan2(this.vy, this.vx));
context.scale(1, this.direction ? 1 : -1);
context.beginPath();
context.moveTo(-30, 0);
context.bezierCurveTo(-20, 15, 15, 10, 40, 0);
context.bezierCurveTo(15, -10, -20, -15, -30, 0);
context.fill();
context.save();
context.translate(40, 0);
context.scale(0.9 + 0.2 * Math.sin(this.theta), 1);
context.beginPath();
context.moveTo(0, 0);
context.quadraticCurveTo(5, 10, 20, 8);
context.quadraticCurveTo(12, 5, 10, 0);
context.quadraticCurveTo(12, -5, 20, -8);
context.quadraticCurveTo(5, -10, 0, 0);
context.fill();
context.restore();
context.save();
context.translate(-3, 0);
context.rotate((Math.PI / 3 + Math.PI / 10 * Math.sin(this.phi)) * (this.renderer.reverse ? -1 : 1));
context.beginPath();
if(this.renderer.reverse){
context.moveTo(5, 0);
context.bezierCurveTo(10, 10, 10, 30, 0, 40);
context.bezierCurveTo(-12, 25, -8, 10, 0, 0);
}else{
context.moveTo(-5, 0);
context.bezierCurveTo(-10, -10, -10, -30, 0, -40);
context.bezierCurveTo(12, -25, 8, -10, 0, 0);
}
context.closePath();
context.fill();
context.restore();
context.restore();
this.controlStatus(context);
}
};
$(function(){
RENDERER.init();
});