![]()
![]()
![]()
zhihuspider.py
# -*- coding: utf-8 -*-
import json
import scrapy
from scrapy import Request
from zhihuuser.items import ZhihuuserItem
class ZhihuspiderSpider(scrapy.Spider):
name = 'zh'
# url='https://www.zhihu.com/api/v4/members/letitiachow?include=allow_message%2Cis_followed%2Cis_following%2Cis_org%2Cis_blocking%2Cemployments%2Canswer_count%2Cfollower_count%2Carticles_count%2Cgender%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics'
allowed_domains = ["www.zhihu.com"]
start_user = 'excited-vczh'
user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
user_query='allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
# follows_url='https://www.zhihu.com/api/v4/members/excited-vczh/followees?include='
follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
#粉丝列表
followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
def start_requests(self):
yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user)
#自己本身信息
yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, limit=20, offset=0),
self.parse_follows)
#粉丝列表
yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, limit=20, offset=0),
self.parse_followers)
def parse_user(self, response):
'''
解析
:param response:
:return:
'''
# print(response.text)
result = json.loads(response.text)
item = ZhihuuserItem()
for field in item.fields:
if field in result.keys():#键名其中之一
item[field] = result.get(field)#就返回item
yield item
yield Request(
self.follows_url.format(user=result.get('url_token'), include=self.follows_query, limit=20, offset=0),
self.parse_follows)
yield Request(
self.followers_url.format(user=result.get('url_token'), include=self.followers_query, limit=20, offset=0),
self.parse_followers)
def parse_follows(self, response):
# print(response.text)
results = json.loads(response.text)
#判断分页是否结束
if 'data' in results.keys():
for result in results.get('data'):
yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
self.parse_user)
if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
next_page = results.get('paging').get('next')
yield Request(next_page,
self.parse_follows)
def parse_followers(self, response):
results = json.loads(response.text)
if 'data' in results.keys():
for result in results.get('data'):
yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
self.parse_user)
if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
next_page = results.get('paging').get('next')
yield Request(next_page,
self.parse_followers)
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy import Field
class ZhihuuserItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
id = Field()
name = Field()
avatar_url = Field()
headline = Field()
description = Field()
url = Field()
url_token = Field()
gender = Field()
cover_url = Field()
type = Field()
badge = Field()
answer_count = Field()
articles_count = Field()
commercial_question_count = Field()
favorite_count = Field()
favorited_count = Field()
follower_count = Field()
following_columns_count = Field()
following_count = Field()
pins_count = Field()
question_count = Field()
thank_from_count = Field()
thank_to_count = Field()
thanked_count = Field()
vote_from_count = Field()
vote_to_count = Field()
voteup_count = Field()
following_favlists_count = Field()
following_question_count = Field()
following_topic_count = Field()
marked_answers_count = Field()
mutual_followees_count = Field()
hosted_live_count = Field()
participated_live_count = Field()
locations = Field()
educations = Field()
employments = Field()
pipeline.py
class MongoPipeline(object):
def __init__(self, mongo_uri, mongo_db):
self.mongo_uri = mongo_uri
self.mongo_db = mongo_db
@classmethod
def from_crawler(cls, crawler):
return cls(
mongo_uri=crawler.settings.get('MONGO_URI'),
mongo_db=crawler.settings.get('MONGO_DATABASE')
)
def open_spider(self, spider):
self.client = pymongo.MongoClient(self.mongo_uri)
self.db = self.client[self.mongo_db]
def close_spider(self, spider):
self.client.close()
def process_item(self, item, spider):
self.db['user'].update({'url_token':item['url_token']},{'$set':item},True)
return item
# -*- coding: utf-8 -*-
# Scrapy settings for zhihuuser project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'zhihuuser'
SPIDER_MODULES = ['zhihuuser.spiders']
NEWSPIDER_MODULE = 'zhihuuser.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'zhihuuser (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'zhihuuser.middlewares.ZhihuuserSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'zhihuuser.middlewares.ZhihuuserDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'zhihuuser.pipelines.MongoPipeline': 300,
'scrapy_redis.pipelines.RedisPipeline': 400
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
MONGO_URI='localhost'
MONGO_DATABASE='Zhihu'
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
REDIS_URL = 'redis//root:password@ip:port
# 指定redis数据库的连接参数
# REDIS_PASS是我自己加上的redis连接密码
# REDIS_HOST = '127.0.0.1'
# REDIS_PORT = 6379
#REDIS_PASS = 'redisP@ssw0rd'
# LOG等级
LOG_LEVEL = 'DEBUG'
#默认情况下,RFPDupeFilter只记录第一个重复请求。将DUPEFILTER_DEBUG设置为True会记录所有重复的请求。
DUPEFILTER_DEBUG =True