V. Scrapy Framework (8): Scrapy Spiders in Practice

WeChat Mini Program community spider

Saves the data to a JSON file and uses CrawlSpider; a minimal JSON-export sketch follows.
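
That project isn't reproduced here, but the JSON export can be handled by a small item pipeline built on Scrapy's JsonLinesItemExporter. A minimal sketch, assuming an output file named items.json (the pipeline would still need to be registered in ITEM_PIPELINES):

    from scrapy.exporters import JsonLinesItemExporter

    class JsonWriterPipeline(object):
        def open_spider(self, spider):
            # the exporter expects a binary-mode file handle
            self.fp = open('items.json', 'wb')
            self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False)

        def process_item(self, item, spider):
            self.exporter.export_item(item)
            return item

        def close_spider(self, spider):
            self.fp.close()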

Douban simulated-login spider

Logs in by sending a POST request; a FormRequest sketch follows.
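
The POST is typically sent with scrapy.FormRequest. A minimal sketch; the login URL, form field names, and credentials below are placeholders, not Douban's real ones:

    import scrapy

    class LoginSpider(scrapy.Spider):
        name = 'login_demo'
        start_urls = ['https://example.com/login']   # placeholder login page

        def parse(self, response):
            # submit the login form found on the page; field names are hypothetical
            yield scrapy.FormRequest.from_response(
                response,
                formdata={'username': 'your_name', 'password': 'your_password'},
                callback=self.after_login
            )

        def after_login(self, response):
            # requests issued from here reuse the session cookies Scrapy stored
            self.logger.info('logged in, landed on %s', response.url)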

Image-download spider

Autohome BMW 5 Series spider
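
For the image-download work in the projects above, Scrapy's built-in ImagesPipeline can replace hand-rolled file handling. A minimal sketch of the settings and item involved; image_urls and images are the pipeline's default field names, IMAGES_STORE can be any writable directory, and Pillow must be installed:

    # settings.py
    ITEM_PIPELINES = {
        'scrapy.pipelines.images.ImagesPipeline': 1,
    }
    IMAGES_STORE = './images'

    # items.py
    import scrapy

    class ImageItem(scrapy.Item):
        image_urls = scrapy.Field()  # list of image URLs to download
        images = scrapy.Field()      # filled in by the pipeline after download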

BOSS Zhipin spider

BOSS Zhipin has strong anti-scraping measures: request several job-listing pages from the same IP and that IP gets banned. Rotating proxy IPs works around this; a proxy-middleware sketch follows.
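
Per-request proxies are set in a downloader middleware through request.meta['proxy']. A minimal sketch; the addresses are placeholders, and in practice they would come from a proxy pool or a paid provider (the class also has to be registered in DOWNLOADER_MIDDLEWARES):

    import random

    class RandomProxyMiddleware(object):
        # placeholder addresses; replace with proxies from your own pool or provider
        PROXIES = ['http://123.45.67.89:8888', 'http://98.76.54.32:8000']

        def process_request(self, request, spider):
            request.meta['proxy'] = random.choice(self.PROXIES)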

Jianshu whole-site spider (the worked example below)

Saves the data to a MySQL database.

Integrates Selenium + ChromeDriver into Scrapy.

I. Page Parsing

  1. Create the project in a terminal
scrapy startproject jianshu_spider
cd jianshu_spider
scrapy genspider -t crawl js "jianshu.com"
  2. Open the project in PyCharm and edit settings.py: turn off robots.txt compliance and enable the default request headers
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
  3. Create a start.py file for launching the project

    from scrapy import cmdline
    cmdline.execute("scrapy crawl js".split())
    
  4. Analyze the page, decide which fields to scrape, and create an ArticleItem class in items.py

    import scrapy
    
    class ArticleItem(scrapy.Item):
        title = scrapy.Field()  # title
        content = scrapy.Field()  # body content
        article_id = scrapy.Field()  # article id
        origin_url = scrapy.Field()  # original URL
        author = scrapy.Field()  # author
        avatar = scrapy.Field()  # author avatar
        pub_time = scrapy.Field()  # publish time
    
  5. Write the spider, js.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from jianshu_spider.items import ArticleItem
    
    
    class JsSpider(CrawlSpider):
        name = 'js'
        allowed_domains = ['jianshu.com']
        start_urls = ['https://www.jianshu.com/']
    
        rules = (
            Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}'), callback='parse_detail', follow=True),
        )
    
        def parse_detail(self, response):
            title = response.xpath("//h1[@class='title']/text()").get()
            avatar = response.xpath("//a[@class='avatar']/img/@src").get()
            author = response.xpath("//span[@class='name']/a//text()").get()
            pub_time = response.xpath("//span[@class='publish-time']//text()").get()
    
            # https://www.jianshu.com/p/66aeb8473df1?/u/052e3bd4d2bc?utm_campaign=maleskine&utm_content=user&utm_medium=seo_notes&utm_source=recommendation
            url = response.url
            url1 = url.split("?")[0]
            article_id = url1.split('/')[-1]
            content = response.xpath("//div[@class='show-content']").get()
    
            item = ArticleItem(
                title=title,
                avatar=avatar,
                author=author,
                pub_time=pub_time,
                article_id=article_id,
                origin_url=response.url,
                content=content
            )
            yield item
    

II. Saving the Data to a MySQL Database

  1. Enable the ITEM_PIPELINES setting in settings.py

    ITEM_PIPELINES = {
       'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,
    }
    
    # 3-second download delay between requests
    DOWNLOAD_DELAY = 3
    
  2. Create a jianshu database in MySQL and an article table inside it (one possible schema is sketched below)
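
The original doesn't show the table definition. Below is one possible schema, created through pymysql so everything stays in Python (running the same SQL in a MySQL client works just as well); the column types and lengths are assumptions, adjust them as needed.

    import pymysql

    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                           password='root', database='jianshu', charset='utf8')
    cursor = conn.cursor()
    cursor.execute("""
        create table if not exists article (
            id int primary key auto_increment,
            title varchar(255),
            content longtext,
            author varchar(255),
            avatar varchar(255),
            pub_time varchar(50),
            origin_url varchar(255),
            article_id varchar(50)
        ) default charset=utf8
    """)
    conn.commit()
    conn.close()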

  3. Write pipelines.py to connect to the database

import pymysql
from pymysql import cursors


class JianshuSpiderPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'root',
            'database': 'jianshu',
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()
        self._sql = None

    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (
            item['title'], item['content'], item['author'], item['avatar'], item['pub_time'], item['origin_url'],
            item['article_id']))
        self.conn.commit()
        return item

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
            insert into article(title,content,author,avatar,pub_time,origin_url,article_id) values(%s,%s,%s,%s,%s,%s,%s)
            """
        return self._sql
  4. Rewrite pipelines.py as an asynchronous pipeline with error handling (twisted adbapi)
import pymysql
from twisted.enterprise import adbapi
from pymysql import cursors

class JianshuTwistedPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'root',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        self._sql = None

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
                insert into article(title,content,author,avatar,pub_time,origin_url,article_id) values(%s,%s,%s,%s,%s,%s,%s)
                """
        return self._sql

    def process_item(self, item, spider):
        # run the insert in a pool thread so it doesn't block the twisted reactor
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        cursor.execute(self.sql,(
            item['title'], item['content'], item['author'], item['avatar'], item['pub_time'], item['origin_url'],
            item['article_id']))

    def handle_error(self, error, item, spider):
        print('='*10+"error"+'='*10)
        print(error)
        print('='*10+"error"+'='*10)

Then update ITEM_PIPELINES in settings.py:

ITEM_PIPELINES = {
   # 'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,
   'jianshu_spider.pipelines.JianshuTwistedPipeline': 300,
}

III. Crawling Ajax Data

Integrate Selenium + ChromeDriver into Scrapy.

  1. Add read count, like count, word count, subjects, and comment count columns to the database table (a possible ALTER TABLE sketch follows)
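
One way to add them, again through pymysql. The column names mirror the item fields added below; they are stored as strings here because the spider yields the raw span text rather than parsed numbers:

    import pymysql

    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                           password='root', database='jianshu', charset='utf8')
    cursor = conn.cursor()
    cursor.execute("""
        alter table article
            add column read_count varchar(50),
            add column like_count varchar(50),
            add column word_count varchar(50),
            add column comment_count varchar(50),
            add column subjects text
    """)
    conn.commit()
    conn.close()

Note that the insert statement in JianshuTwistedPipeline would also need these columns added before the new values actually get persisted.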

  2. Add the new fields in items.py

    import scrapy
    
    
    class ArticleItem(scrapy.Item):
        title = scrapy.Field()  # title
        content = scrapy.Field()  # body content
        article_id = scrapy.Field()  # article id
        origin_url = scrapy.Field()  # original URL
        author = scrapy.Field()  # author
        avatar = scrapy.Field()  # author avatar
        pub_time = scrapy.Field()  # publish time
        read_count = scrapy.Field()  # read count
        like_count = scrapy.Field()  # like count
        word_count = scrapy.Field()  # word count
        comment_count = scrapy.Field()  # comment count
        subjects = scrapy.Field()  # collections the article belongs to
    
  3. Write middlewares.py

from scrapy.http import HtmlResponse
from selenium import webdriver
import time


class SeleniumDownloadMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Chrome(executable_path="/Users/ren/Applications/chromedriver")

    def process_request(self, request, spider):
        self.driver.get(request.url)
        time.sleep(1)
        try:
            # keep clicking the "show more" button until it disappears so the
            # ajax-loaded metadata ends up in the rendered page source
            while True:
                show_more = self.driver.find_elements_by_class_name('show-more')
                if not show_more:
                    break
                show_more[0].click()
                time.sleep(0.3)
        except:
            pass
        source = self.driver.page_source
        # hand the rendered page back to Scrapy instead of downloading it again
        response = HtmlResponse(url=self.driver.current_url, body=source,
                                request=request, encoding='utf-8')
        return response
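
One thing the middleware above never does is quit Chrome, so a browser instance is left running after the crawl finishes. A possible cleanup, sketched here by hooking Scrapy's spider_closed signal (the two extra methods would be merged into the class above; process_request stays as written):

from scrapy import signals
from selenium import webdriver


class SeleniumDownloadMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Chrome(executable_path="/Users/ren/Applications/chromedriver")

    @classmethod
    def from_crawler(cls, crawler):
        # let Scrapy build the middleware, then subscribe to spider_closed
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        # quit the shared Chrome instance once the crawl is over
        self.driver.quit()

    # process_request(...) is unchanged from the version above
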
  4. Extend js.py to extract the new fields
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu_spider.items import ArticleItem


class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']

    rules = (
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        title = response.xpath("//h1[@class='title']/text()").get()
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//span[@class='name']/a//text()").get()
        pub_time = response.xpath("//span[@class='publish-time']//text()").get()

        # https://www.jianshu.com/p/66aeb8473df1?/u/052e3bd4d2bc?utm_campaign=maleskine&utm_content=user&utm_medium=seo_notes&utm_source=recommendation
        url = response.url
        url1 = url.split("?")[0]
        article_id = url1.split('/')[-1]
        content = response.xpath("//div[@class='show-content']").get()

        word_count = response.xpath("//span[@class='wordage']/text()").get()
        comment_count = response.xpath("//span[@class='comments-count']/text()").get()
        read_count = response.xpath("//span[@class='views-count']/text()").get()
        like_count = response.xpath("//span[@class='likes-count']/text()").get()

        subjects = ",".join(response.xpath("//div[@class='include-collection']/a/div/text()").getall())

        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            pub_time=pub_time,
            article_id=article_id,
            origin_url=response.url,
            content=content,
            word_count=word_count,
            read_count=read_count,
            like_count=like_count,
            comment_count=comment_count,
            subjects=subjects
        )
        yield item

  5. Register the downloader middleware in settings.py

    DOWNLOADER_MIDDLEWARES = {
       'jianshu_spider.middlewares.SeleniumDownloadMiddleware': 543,
    }
    