cnblogs_spider.py

Plain Scrapy

# -*- coding: utf-8 -*-
import scrapy

from ..items import TttItem

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'  # spider name
    start_urls = ['https://www.cnblogs.com']

    def parse(self, response):
        div_list = response.xpath('//div[@class="post_item_body"]')
        for div in div_list:
            title = div.xpath('./h3/a/text()').extract_first()
            url = div.xpath('./h3/a/@href').extract_first()
            outline = div.css('.post_item_summary::text').extract()[-1]
            author = div.xpath('./div[@class="post_item_foot"]/a/text()').extract_first()


            item = TttItem()
            item['title'] = title
            item['outline'] = outline
            item['author'] = author
            item['url'] = url
            yield scrapy.Request(url, callback=self.get_detail, meta={'item': item})

        beforeurl = response.url
        print(beforeurl)

        # grab the href of the last <a> in the pager (the "Next" link)
        next_url = response.xpath('//div[@class="pager"]/a[last()]/@href').extract_first()
        print('next_url', next_url)

        # the pager href is relative; urljoin builds the absolute URL.
        # Guard against None on the last page.
        if next_url:
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)

    # fetch the full article body from the detail page
    def get_detail(self, response):
        content = response.xpath('//div[@id="cnblogs_post_body"]').extract_first()
        if not content:
            # fallback selector for posts rendered with a different template
            content = response.css('content').extract_first()

        item = response.meta.get('item')
        item['content'] = content
        yield item

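The spider imports TttItem from items.py, which the post doesn't show. A minimal sketch of what it would need to declare, based on the fields assigned in parse and get_detail (the class name comes from the import above):

# items.py (sketch)
import scrapy

class TttItem(scrapy.Item):
    title = scrapy.Field()
    outline = scrapy.Field()
    author = scrapy.Field()
    url = scrapy.Field()
    content = scrapy.Field()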

pipelines.py

import pymysql

class CnblogsSaveMysqlPipeline(object):
    def open_spider(self, spider):
        # pymysql connects to localhost by default
        self.conn = pymysql.connect(user='root', password='123123', db='cnblogs')

    def close_spider(self, spider):
        self.conn.close()

    def process_item(self, item, spider):
        cursor = self.conn.cursor()
        sql = '''insert into cnb (title, outline, author, url, content) values (%s, %s, %s, %s, %s)'''
        cursor.execute(sql, args=(item['title'], item['outline'], item['author'], item['url'], item['content']))
        self.conn.commit()
        cursor.close()
        return item  # return the item so any later pipelines can process it too
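For Scrapy to run this pipeline, it has to be registered in settings.py. A sketch of the registration; the package name ttt and the priority 300 are assumptions, so adjust the dotted path to your project's package:

# settings.py (sketch)
ITEM_PIPELINES = {
    'ttt.pipelines.CnblogsSaveMysqlPipeline': 300,
}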

Distributed crawling

cnblogs_spider.py

# -*- coding: utf-8 -*-
import scrapy

from ..items import TttItem
from scrapy.http import Request
from scrapy_redis.spiders import RedisSpider

class ChoutiSpider(RedisSpider):
    name = 'chouti'  # spider name
    allowed_domains = ['www.cnblogs.com']
    redis_key = 'myspider:start_urls'


    def parse(self, response):
        div_list = response.xpath('//div[@class="post_item_body"]')
        for div in div_list:
            title = div.xpath('./h3/a/text()').extract_first()
            url = div.xpath('./h3/a/@href').extract_first()
            outline = div.css('.post_item_summary::text').extract()[-1]
            author = div.xpath('./div[@class="post_item_foot"]/a/text()').extract_first()

            item = TttItem()
            item['title'] = title
            item['outline'] = outline
            item['author'] = author
            item['url'] = url
            yield Request(url, callback=self.get_detail, meta={'item': item})

        beforeurl = response.url
        print(beforeurl)

        # grab the href of the last <a> in the pager (the "Next" link)
        next_url = response.css('div.pager a:last-child::attr(href)').extract_first()
        print('next page url:', next_url)
        if next_url:
            yield Request(response.urljoin(next_url), callback=self.parse)

    # fetch the full article body from the detail page
    def get_detail(self, response):
        content = response.xpath('//div[@id="cnblogs_post_body"]').extract_first()
        if not content:
            # fallback selector for posts rendered with a different template
            content = response.css('content').extract_first()

        item = response.meta.get('item')
        item['content'] = content
        yield item

settings.py

# Enables scheduling storing requests queue in redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Ensure all spiders share same duplicates filter through redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"


REDIS_PARAMS = {'password':'redis123'}
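A RedisSpider defines no start_urls; it blocks until a URL is pushed to its redis_key. scrapy-redis connects to localhost:6379 by default (set REDIS_HOST / REDIS_PORT to override), so with the password configured above, seeding the queue looks roughly like this (a sketch assuming the redis-py package and a local Redis server):

# seed the start URL for all waiting spider instances
import redis

r = redis.Redis(host='localhost', port=6379, password='redis123')
r.lpush('myspider:start_urls', 'https://www.cnblogs.com')

Every machine running this spider shares the same Redis queue and dupefilter, so pushing a single start URL fans the crawl out across all of them.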