Crawler Part 6: Site-wide crawl of Jianshu with the Scrapy framework

Create the project

# create a new scrapy project
$ scrapy startproject jianshu
# enter the project directory
$ cd jianshu
# generate a CrawlSpider-based spider
$ scrapy genspider -t crawl jianshu_spider jianshu.com
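
Before running anything, a couple of settings usually need to change from the defaults that startproject generates. A minimal settings.py sketch (the delay value and headers are assumptions, adjust as needed):

# settings.py -- only the lines that differ from the generated defaults
ROBOTSTXT_OBEY = False   # often required if the site's robots.txt disallows crawling
DOWNLOAD_DELAY = 1       # assumed polite delay between requests
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',  # any common desktop UA
}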

 

The items.py file

import scrapy


class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    author = scrapy.Field()
    avatar = scrapy.Field()
    pub_time = scrapy.Field()
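
The pipelines below insert these fields into a MySQL table named article. The post does not show the table definition, so here is an assumed schema, created through pymysql with the same connection parameters used later (column names follow the item fields; the types are guesses, adjust to taste):

import pymysql

# one-off helper: create the assumed article table
conn = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                       database='jianshu', port=3306, charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        create table if not exists article (
            id int primary key auto_increment,
            title varchar(200),
            content longtext,
            author varchar(100),
            avatar varchar(500),
            pub_time varchar(50),
            origin_url varchar(500),
            article_id varchar(50)
        ) default charset=utf8
    """)
conn.commit()
conn.close()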

 

The jianshu_spider.py file

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu.items import ArticleItem


class JianshuSpiderSpider(CrawlSpider):
    name = 'jianshu_spider'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']

    # follow every link on the site, and hand pages whose URL looks like an
    # article detail page (/p/ followed by a 12-character id) to parse_detail
    rules = (
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        title = response.xpath("//h1[@class='title']/text()").get()
        content = response.xpath("//div[@class='show-content-free']").get()
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//div[@class='info']/span/a/text()").get()
        pub_time = response.xpath("//span[@class='publish-time']/text()").get()
        # the article id is the last path segment of the URL, query string stripped
        article_id = response.url.split("?")[0].split("/")[-1]
        origin_url = response.url
        item = ArticleItem(
            title=title,
            content=content,
            avatar=avatar,
            pub_time=pub_time,
            article_id=article_id,
            origin_url=origin_url,
            author=author
        )
        yield item
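
The spider is then started from the project directory in the usual way:

$ scrapy crawl jianshu_spider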

 

Synchronous MySQL insert

import pymysql


class JianshuPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'user': 'root',
            'password': '123456',
            'database': 'jianshu',
            'port': 3306,
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()
        self._sql = None

    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'],
                                       item['pub_time'], item['origin_url'], item['article_id']))
        self.conn.commit()
        return item

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
            insert into article(title, content, author, avatar, pub_time, origin_url, article_id)
            values (%s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql
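
One thing the synchronous pipeline leaves out is releasing the connection when the crawl ends. A small addition, assuming it is placed inside JianshuPipeline:

    def close_spider(self, spider):
        # called once when the spider closes; free the MySQL resources
        self.cursor.close()
        self.conn.close()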

 

Asynchronous MySQL insert

from twisted.enterprise import adbapi
from pymysql import cursors


class JianshuTwistedPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'user': 'root',
            'password': '123456',
            'database': 'jianshu',
            'port': 3306,
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        self._sql = None

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
                insert into article(title, content, author, avatar, pub_time, origin_url, article_id)
                values (%s, %s, %s, %s, %s, %s, %s)
                """
        return self._sql

    def process_item(self, item, spider):
        # runInteraction runs insert_item on a thread from the connection pool,
        # so the crawl does not block while MySQL commits
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'],
                                  item['pub_time'], item['origin_url'], item['article_id']))

    def handle_error(self, error, item, spider):
        print('=' * 10 + 'error' + '=' * 10)
        print(error)
        print('=' * 10 + 'error' + '=' * 10)
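
Neither pipeline runs until it is registered. In settings.py, enable whichever one you want, assuming both classes live in the project's pipelines.py (the number only controls ordering; keep just one of the two entries):

# settings.py
ITEM_PIPELINES = {
    # 'jianshu.pipelines.JianshuPipeline': 300,        # synchronous version
    'jianshu.pipelines.JianshuTwistedPipeline': 300,   # asynchronous version
}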

 

 

 
