scrapy Request方法

# -*- coding: utf-8 -*-
import scrapy


class TestSpider(scrapy.Spider):
    """Crawl the yeves.cn home page and follow every article link found there.

    ``parse`` reads the title and link of each article listed on the start
    page and schedules a request for the linked page; ``parse_detail`` then
    extracts the creation time and category from that page and yields them
    together with the title as a plain ``dict`` item.
    """

    name = 'test'
    allowed_domains = ['yeves.cn']
    start_urls = ['https://yeves.cn/']
    # Site-root URL template. No longer used by ``parse`` (links are resolved
    # against the response URL instead) but kept as a public attribute.
    base_domain = 'https://yeves.cn{}'

    def parse(self, response):
        """Yield one detail-page request per article found on the home page.

        The article title is forwarded to ``parse_detail`` through
        ``Request.meta`` under the ``"title"`` key.
        """
        # Candidate containers holding an article title and link.
        articles = response.xpath('//*[@id="article"]//div')

        for article in articles:
            title = article.xpath('./div/article/div/header/h2/a/text()').extract_first()
            href = article.xpath('./div/article/div/header/h2/a/@href').extract_first()
            # Most matched <div>s are layout wrappers without a title link.
            if title is None or href is None:
                continue

            # urljoin handles relative and absolute hrefs alike, whereas
            # string-formatting the href onto the domain would corrupt an
            # already-absolute URL.
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_detail,
                meta={"title": title},
            )

    def parse_detail(self, respone):
        """Extract the article metadata from a detail page and yield it.

        Yields a dict with the keys ``title`` (taken from the request meta),
        ``created_at`` and ``category``. Each value is ``None`` when the
        corresponding node is missing from the page.
        """
        title = respone.meta.get('title')
        self.logger.debug('Parsing detail page %s (title=%r)', respone.url, title)

        # Detail fields are located by absolute paths into the page header.
        created_at = respone.xpath(
            '/html/body/section/div/div/header/div/span[1]/time/text()'
        ).extract_first()
        category = respone.xpath(
            '/html/body/section/div/div/header/div/span[2]/a/text()'
        ).extract_first()

        detail = {
            'title': title,
            'created_at': created_at,
            'category': category,
        }
        self.logger.debug('Scraped item: %r', detail)
        yield detail

posted @ 2020-03-19 10:15 brady-wang 阅读(719) 评论(0) 收藏举报

刷新页面返回顶部

风行天下

天地不仁以万物为刍狗

scrapy Request方法

公告

风行天下

天地不仁 以万物为刍狗

scrapy Request方法

公告

天地不仁以万物为刍狗