Crawling Zhihu user information with Scrapy
The code is as follows:
# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy import Request


class ZhihuuserSpider(scrapy.Spider):
    name = 'z'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/']

    # url_token of the seed user the crawl starts from
    start_url = 'excited-vczh'
    # followers-list API, paginated via offset/limit
    allow_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={off}&limit={lim}'
    # "include" fields requested for each entry in a followers list
    exit_url = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
    # "include" fields requested for a single user profile
    qwe_url = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
    # single-user profile API
    one_url = 'https://www.zhihu.com/api/v4/members/{user}?include={two_url}'

    def start_requests(self):
        yield Request(self.one_url.format(user=self.start_url, two_url=self.qwe_url), self.parse)

    def parse(self, response):
        # profile response for one user; follow up with that user's followers list,
        # starting from the first page (offset 0, not 20, so page one is not skipped)
        result = json.loads(response.text)
        yield Request(self.allow_url.format(user=result.get('url_token'), include=self.exit_url, off=0, lim=20), self.parse_xx)

    def parse_xx(self, response):
        # one page of a followers list: request each follower's profile,
        # then follow the "next" link until the API reports the last page
        result = json.loads(response.text)
        for cc in result.get('data', []):
            yield Request(self.one_url.format(user=cc.get('url_token'), two_url=self.qwe_url), self.parse)
        if 'paging' in result and not result['paging'].get('is_end'):
            yield Request(result['paging']['next'], self.parse_xx)

# You could also add a function to crawl the followees list; it is basically
# a copy of parse_xx (see the sketch below).
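To make that last comment concrete, here is a minimal sketch of such a followees crawler. The follow_url endpoint is an assumption modeled on allow_url (Zhihu's v4 API exposes a followees list alongside followers, but the exact URL and fields should be verified against the live API), and follow_url and parse_yy are names made up for illustration:

    # assumed endpoint: same pagination and include parameters as allow_url
    follow_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={off}&limit={lim}'

    def parse_yy(self, response):
        # one page of the followees list; mirrors parse_xx
        result = json.loads(response.text)
        for cc in result.get('data', []):
            yield Request(self.one_url.format(user=cc.get('url_token'), two_url=self.qwe_url), self.parse)
        if 'paging' in result and not result['paging'].get('is_end'):
            yield Request(result['paging']['next'], self.parse_yy)

parse would then yield one more Request, built from follow_url exactly as it already does for allow_url, with parse_yy as the callback.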
Personally I think crawling Zhihu is fairly simple, but there are some parts I don't fully understand. This spider follows Cui Qingcai's tutorial, and it feels like it repeats many requests and wastes a lot of time; I hope someone more experienced can show me how to improve it. I still only half understand pipelines, mainly because I don't understand decorators, so my next goal is to get pipelines figured out.
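On the repeated-requests worry: Scrapy's default duplicate filter drops any request whose fingerprint (essentially method plus URL) has already been seen in the same crawl, so reaching the same user from many different follower lists does not actually refetch their profile.

As for pipelines, the basic case needs no decorators at all: a pipeline is just a class whose process_item method Scrapy calls for every item the spider yields (the only decorator you are likely to meet there is @classmethod on from_crawler, which is used to read settings). Below is a minimal sketch that writes each user to a JSON-lines file. It assumes parse is extended with a "yield result" after json.loads so the spider actually produces items; JsonLinesPipeline and the users.jl filename are illustrative names:

    import json

    class JsonLinesPipeline(object):
        def open_spider(self, spider):
            # called once when the spider starts
            self.file = open('users.jl', 'w', encoding='utf-8')

        def process_item(self, item, spider):
            # called for every yielded item; return it so later pipelines see it too
            self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
            return item

        def close_spider(self, spider):
            # called once when the spider finishes
            self.file.close()

Enable it in settings.py with ITEM_PIPELINES = {'zhihuuser.pipelines.JsonLinesPipeline': 300}, adjusting the module path to your project layout; lower numbers run earlier.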