Scrapy crawl of a certain site, with a UA pool and IP pool added: no rate limits, no bans, and 100 threads that crashed the site

Scrapy version: scraping mzitu

No bans, no rate limits, unlimited crawling

The key part: downloading the images

from scrapy.pipelines.images import ImagesPipeline

Change the pipeline class to inherit from ImagesPipeline instead of object. The following method names are fixed by the base class:

def get_media_requests  # issues the download requests for the images
def item_completed      # reports whether each download succeeded
def file_path           # decides where (and under what name) an image is stored


Preparation

Proxy IP pool

import pymysql
import random

def get_ip():
    # Connect to the local MySQL database that holds the harvested proxies
    conn = pymysql.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        password='123',
        database='pachong',
        charset='utf8',
        autocommit=True
    )
    cursor = conn.cursor(pymysql.cursors.DictCursor)

    # Pull every stored proxy and pick one at random
    sql = 'select ip from ip_list'
    cursor.execute(sql)
    ip_list = cursor.fetchall()
    ip = random.choice(ip_list)['ip']
    conn.close()
    return ip
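
The pachong database and its ip_list table are assumed to exist already. A minimal sketch of creating and seeding the table (the schema and the placeholder proxies are assumptions, not part of the original):

import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='123', database='pachong',
                       charset='utf8', autocommit=True)
cursor = conn.cursor()
# Hypothetical schema: one column holding 'host:port' strings
cursor.execute('create table if not exists ip_list (ip varchar(64))')
# Placeholder proxies; replace with live ones from your harvester
cursor.executemany('insert into ip_list (ip) values (%s)',
                   [('1.2.3.4:8080',), ('5.6.7.8:3128',)])
conn.close()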

User-Agent pool

def get_UserAgent():
    from fake_useragent import UserAgent
    ua = UserAgent(verify_ssl=False)
    return ua.random  # pick a random User-Agent string
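
fake_useragent fetches its browser data over the network on first use, so it can fail at startup. A fallback sketch (the FALLBACK_UAS list is a hypothetical addition, not part of the original):

import random

# Hypothetical static fallbacks for when fake_useragent cannot fetch its data
FALLBACK_UAS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15',
]

def get_UserAgent():
    try:
        from fake_useragent import UserAgent
        return UserAgent(verify_ssl=False).random
    except Exception:
        return random.choice(FALLBACK_UAS)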

middlewares (beating the anti-scraping measures)

from mzitu.pool.ip import get_ip
from mzitu.pool.useragent import get_UserAgent

# This goes inside the MzituDownloaderMiddleware class
def process_request(self, request, spider):
    # Scrapy's HttpProxyMiddleware reads request.meta['proxy'] (not 'proxies'),
    # and the value needs a scheme, e.g. 'http://host:port'
    request.meta['proxy'] = 'http://%s' % get_ip()
    request.headers['User-Agent'] = get_UserAgent()
    request.headers['Referer'] = 'https://www.mzitu.com/'  # mzitu rejects image requests without a Referer
    return None
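
With retries disabled in the settings below, a request that hits a dead proxy is simply dropped. If you would rather swap in a fresh proxy and try once more, a sketch that can sit in the same middleware class (the proxy_retried meta key is a hypothetical addition):

def process_exception(self, request, exception, spider):
    # On a download error (often a dead proxy), retry once with a new proxy
    if request.meta.get('proxy_retried'):
        return None  # already retried once; let Scrapy drop it
    retry_req = request.replace(dont_filter=True)  # bypass the dupe filter
    retry_req.meta['proxy'] = 'http://%s' % get_ip()
    retry_req.meta['proxy_retried'] = True
    return retry_req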

settings configuration

LOG_LEVEL = 'ERROR'
IMAGES_STORE = r'E:\python13\pachong\images'  # download directory; created automatically if missing
RETRY_ENABLED = False  # disable retries
DOWNLOAD_TIMEOUT = 10  # give up on a request after 10 seconds

# enable the image pipeline
ITEM_PIPELINES = {
   'mzitu.pipelines.MzituPipeline': 300,
}

# enable the downloader middleware
DOWNLOADER_MIDDLEWARES = {
   'mzitu.middlewares.MzituDownloaderMiddleware': 543,
}
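
The "100 threads" in the title presumably maps to Scrapy's concurrency settings rather than literal threads. A sketch of the relevant knobs (these values are assumptions, not shown in the original settings):

CONCURRENT_REQUESTS = 100             # crawler-wide parallel requests
CONCURRENT_REQUESTS_PER_DOMAIN = 100  # don't throttle per domain either
DOWNLOAD_DELAY = 0                    # no politeness delay between requests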

The main part

The spider

# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request
from mzitu.items import MzituItem


class AmzituSpider(scrapy.Spider):
    name = 'Amzitu'
    start_urls = ['https://www.mzitu.com/197251']

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        # The current image on the page, and the "next page" link
        img_url = soup.select('.main-image img')[0].attrs['src']
        next_url = soup.select('.pagenavi a:nth-last-child(1)')[0].attrs['href']
        img_name = img_url.rsplit('/', 1)[-1]  # last URL segment as the filename
        item = MzituItem()
        item['img_url'] = img_url
        item['img_name'] = img_name
        yield item
        yield Request(next_url)  # follow to the next page
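
The spider imports MzituItem from mzitu.items. For completeness, a minimal items.py matching the two fields used above:

import scrapy

class MzituItem(scrapy.Item):
    img_url = scrapy.Field()   # direct URL of the image
    img_name = scrapy.Field()  # filename to save it under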

Saving the downloaded images

from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request


class MzituPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        print(item['img_url'])
        # Download the image; if the item carried a list of URLs, you would
        # loop here and yield one Request per URL. The name comes from the
        # spider and travels through meta to file_path below.
        yield Request(url=item['img_url'], meta={'name': item['img_name']})

    def item_completed(self, results, item, info):
        # results is a list of (success, info) tuples, one per request;
        # the boolean says whether that download succeeded
        if not results[0][0]:
            raise DropItem('image download failed')
        return item

    # Rename the image. Without overriding this method the filename is
    # a hash of the URL, i.e. an unreadable string.
    def file_path(self, request, response=None, info=None):
        img_name = request.meta['name']
        return img_name
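
With everything wired up, the spider runs from the project root with the usual Scrapy command:

scrapy crawl Amzitu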
