(30) Crawlers: Using CrawlSpider to Automatically Extract Links to Crawl

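Generate a spider skeleton from the crawl template (spider name zwr, target domain zwdu.com):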
scrapy genspider -t crawl zwr zwdu.com

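The generated zwr.py defines two Rules: the first follows every chapter link in the table of contents, the second follows the next-chapter link at the bottom of each chapter page, and both hand the downloaded page to parse_item: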
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ZwrSpider(CrawlSpider):
    name = 'zwr'
    allowed_domains = ['zwdu.com']
    start_urls = ['https://www.zwdu.com/book/10304/']

    rules = (
        # Follow every chapter link in the book's table of contents.
        Rule(LinkExtractor(restrict_xpaths='//dd/a'), callback='parse_item', follow=True),
        # Follow the "next chapter" link at the bottom of each chapter page.
        Rule(LinkExtractor(restrict_xpaths="//div[@class='bottem1']/a[3]"), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # The chapter title is the page's only <h1>.
        title = response.xpath('//h1/text()').extract_first()
        # Join the text nodes of the content div and turn the
        # four-space indentation runs into paragraph breaks.
        content = ''.join(response.xpath("//div[@id='content']/text()").extract()).replace('    ', '\n    ')
        yield {'title': title,
               'content': content}
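To run the spider and collect the yielded items, Scrapy's built-in feed export can write them straight to a JSON-lines file (the output filename here is arbitrary):

scrapy crawl zwr -o chapters.jl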

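If the chapters should instead be assembled into a single plain-text file, an item pipeline is the usual place to do it. The sketch below is illustrative only: the class name, output filename, and project package name are assumptions, not part of the original post.

# pipelines.py -- minimal sketch of a write-to-file pipeline
class NovelWritePipeline:
    def open_spider(self, spider):
        # Open once per crawl; append mode keeps earlier chapters.
        self.file = open('novel.txt', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        # Write the chapter title, then its body, then a blank line.
        self.file.write(item['title'] + '\n')
        self.file.write(item['content'] + '\n\n')
        return item

    def close_spider(self, spider):
        self.file.close()

Enable it in settings.py (the package name 'myproject' is assumed):

ITEM_PIPELINES = {'myproject.pipelines.NovelWritePipeline': 300}

Note that Scrapy downloads pages concurrently, so chapters arrive out of order; writing them in reading order would require storing the chapter URL or index on each item and sorting afterwards.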
posted @ 2020-08-11 11:22  kuanleung