Crawling the whole Jianshu site with Scrapy

Here we use the CrawlSpider template: its Rule/LinkExtractor filters decide which links to follow, and the scraped results are stored in MySQL. The code follows:

jianshu_spider.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu.items import JianshuItem
import html  # used by the optional html.escape() call below


class JianshuSpiderSpider(CrawlSpider):
    name = 'jianshu_spider'
    allowed_domains = ['jianshu.com']
    start_urls = ['http://jianshu.com/']

    # Follow every link whose URL contains /p/ plus a 12-character
    # article code, and hand those pages to parse_article.
    rules = (
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_article', follow=True),
    )

    def parse_article(self, response):
        # The article code is the last path segment, minus any query string.
        article_code = response.url.split("?")[0].split("/")[-1]
        title = response.xpath('//h1[@class="title"]/text()').get().strip()
        author = response.xpath('//div[contains(@class, "author")]/div[@class="info"]//span[@class="name"]/a/text()').get().strip()
        head_img = response.xpath('//div[contains(@class, "author")]/a[@class="avatar"]/img/@src').get()
        pub_time = response.xpath('//span[@class="publish-time"]/text()').get().strip().replace('*', '')
        # The avatar src is protocol-relative, so prepend a scheme.
        head_img_url = "http:{}".format(head_img)
        # If the HTML needs escaping before going into the database,
        # switch to the html.escape() variant below.
        # content = html.escape(response.xpath('//div[@class="show-content"]').get())
        content = response.xpath('//div[@class="show-content"]').get()

        yield JianshuItem(
            article_code=article_code,
            title=title,
            author=author,
            head_img_url=head_img_url,
            content=content,
            pub_time=pub_time,
        )
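
The single Rule does all the crawl filtering: any URL whose path contains /p/ followed by a 12-character code is treated as an article page and also mined for further article links (follow=True). A quick standalone sanity check of that pattern (the slugs below are made up for illustration):

import re

# Same pattern as the LinkExtractor above.
pattern = re.compile(r'.*/p/[0-9a-z]{12}.*')

# Article URLs match, with or without a trailing query string.
assert pattern.match('https://www.jianshu.com/p/0123456789ab')
assert pattern.match('https://www.jianshu.com/p/0123456789ab?utm_source=desktop')

# User profile URLs have no /p/ segment, so the rule skips them entirely.
assert not pattern.match('https://www.jianshu.com/u/0123456789ab')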

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class JianshuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    article_code = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    pub_time = scrapy.Field()
    head_img_url = scrapy.Field()
    content = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from jianshu import model

class JianshuPipeline(object):

    def __init__(self):
        self.session = model.DBSession()

    def process_item(self, item, spider):
        # The item supports the dict interface, so ** unpacks it into the model.
        article = model.Article(**item)
        try:
            self.session.add(article)
            self.session.commit()
        except Exception as e:
            print("=" * 100)
            print("INSERT ERROR:", e)
            self.session.rollback()
        return item

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        self.session.close()
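
The pipeline only runs if it is registered in settings.py, which the original post does not show. A minimal sketch; the priority 300 and the one-second delay are illustrative choices, not taken from the project:

# settings.py (excerpt) -- illustrative values
ITEM_PIPELINES = {
    'jianshu.pipelines.JianshuPipeline': 300,
}

# Throttle requests so the crawl is polite to the site.
DOWNLOAD_DELAY = 1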

model.py

from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Text
from sqlalchemy.orm import sessionmaker

# Create the database engine (the connection entry point).
engine = create_engine("mysql+pymysql://jianshu:jianshu@localhost:3306/jianshu?charset=utf8mb4", echo=False)

# Declarative base: classes derived from it are mapped to database
# tables, and it keeps the catalog of class-to-table relationships.
Base = declarative_base()

class Article(Base):

    __tablename__ = "jianshu_article"

    id = Column(Integer, autoincrement=True, primary_key=True)
    article_code = Column(String(16), nullable=False)
    title = Column(Text)
    author = Column(String(16))
    # The spider yields pub_time as the raw text scraped from the page,
    # so a string column is safer than the original TIME column.
    pub_time = Column(String(32))
    head_img_url = Column(String(256))
    content = Column(Text)

DBSession = sessionmaker(bind=engine)

if __name__ == '__main__':
    # Run this module once (python model.py) to create the table.
    Base.metadata.create_all(engine)
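
After creating the table (run python model.py once) and starting the crawl with scrapy crawl jianshu_spider, the same session factory can be used to spot-check what was stored. A minimal sketch using only what model.py already defines:

from jianshu import model

session = model.DBSession()
# Count the stored articles and show the five most recently inserted.
print(session.query(model.Article).count())
for article in session.query(model.Article).order_by(model.Article.id.desc()).limit(5):
    print(article.article_code, article.title)
session.close()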

posted @ 2019-05-22 15:43  风来与你安