# -*- coding: utf-8 -*-
import json
import re

import scrapy

from ..items import QutoutiaoItem
from ..settings import CATEGORY_INFO, LIST_LIMIT

# List API endpoint; the category-specific query string is appended below.
# (This must be defined before the class body that references it.)
BASE_API = 'http://api.1sapp.com/content/outList?'


class QutoutiaoSpider(scrapy.Spider):
    name = 'qutoutiao'
    # allowed_domains = ['qutoutiao.net']
    start_urls = []
    # Category metadata for the sub-feeds to crawl
    categoryInfo = CATEGORY_INFO
    limit = LIST_LIMIT
    # Build one list-API start URL per category
    for value in categoryInfo:
        url = BASE_API + "cid=%s&tn=1&page=1&limit=%s" % (
            str(value['cid']), str(limit))
        start_urls.append(url)

    def parse(self, response):
        response_url = response.url
        # Extract the category id once from the request URL
        searchObj = re.search(r'(.*)cid=(\d+)', response_url)
        cid = searchObj.group(2) if searchObj else 0
        data = json.loads(response.text)['data']['data']
        for value in data:
            # Initialize the item model
            item = QutoutiaoItem()
            # Source name
            item['source_name'] = value['source_name']
            # Title
            item['title'] = value['title']
            # Detail-page URL
            url = item['url'] = value['url']
            # url = url[0:url.find('?')]
            # Summary
            item['introduction'] = value['introduction']
            # Cover images
            item['cover'] = value['cover']
            # Publish time
            item['publish_time'] = value['publish_time']
            # Category id
            item['cid'] = cid
            # Crawl the detail page
            yield scrapy.Request(url=item['url'], meta={'meta_item': item},
                                 callback=self.detail_parse)

    # Detail page
    def detail_parse(self, response):
        # Pull the item carried over in the request meta
        meta_item = response.meta['meta_item']
        # Grab the article body; the image XPath is relative ('.//') so it
        # only matches images inside the content div, not the whole page
        content_selector = response.xpath('//div[@class="content"]')
        meta_item['content_images'] = content_selector.xpath(
            './/img/@src | .//img/@data-src').extract()
        meta_item['content'] = content_selector.extract_first()
        yield meta_item
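
# A minimal sketch of the list-API response shape this spider assumes,
# inferred from the parsing code above rather than from any official docs:
#
#   {
#       "data": {
#           "data": [
#               {
#                   "source_name": "...",
#                   "title": "...",
#                   "url": "...",
#                   "introduction": "...",
#                   "cover": ["..."],
#                   "publish_time": "..."
#               }
#           ]
#       }
#   }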
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class QutoutiaoItem(scrapy.Item):
    # define the fields for your item here like:
    # Article id
    aid = scrapy.Field()
    # Source name
    source_name = scrapy.Field()
    # Title
    title = scrapy.Field()
    # Detail-page URL
    url = scrapy.Field()
    # Summary
    introduction = scrapy.Field()
    # Cover images
    cover = scrapy.Field()
    # Publish time
    publish_time = scrapy.Field()
    # Category id
    cid = scrapy.Field()
    # Article body (HTML)
    content = scrapy.Field()
    # Images found inside the article body
    content_images = scrapy.Field()
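
# A quick usage sketch (hypothetical values, shown only to illustrate the
# dict-like Field access; the spider fills these from the list API):
#
#   item = QutoutiaoItem()
#   item['title'] = 'Example title'
#   item['cid'] = 255
#   dict(item)  # -> {'title': 'Example title', 'cid': 255}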
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
import logging

from fake_useragent import UserAgent
from scrapy import signals


class QutoutiaoSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class UserAgent_CookiesMiddleware(object):
    # Rotate the User-Agent header randomly per request

    def __init__(self, crawler):
        super(UserAgent_CookiesMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")
        self.logger = logging.getLogger(__name__)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)

        random_agent = get_ua()
        if random_agent:
            # Set the rotated UA plus the headers the list API expects
            request.headers['User-Agent'] = random_agent
            request.headers['Accept'] = 'application/json, text/javascript, */*; q=0.01'
            request.headers['Origin'] = 'http://home.qutoutiao.net'
            request.headers['Referer'] = 'http://home.qutoutiao.net/pages/home.html'
            self.logger.debug('Current UserAgent: ' + random_agent)
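
# RANDOM_UA_TYPE is looked up as an attribute of fake_useragent.UserAgent,
# so plausible values include 'random', 'chrome', 'firefox', 'safari', etc.
# A minimal settings sketch (assuming you want to pin one browser family):
#
#   RANDOM_UA_TYPE = 'chrome'   # in settings.py; this project defaults to 'random'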
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import os

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings

from .qttutils import QttUtils


# Cover image download
class CoverImagePipeline(ImagesPipeline):
    # Read the image store location from the project settings
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    # Schedule one download request per cover image
    def get_media_requests(self, item, info):
        cover_images = item['cover']
        if cover_images:
            for image_url in cover_images:
                yield scrapy.Request(url=image_url)

    # Called once all cover downloads for the item have finished
    def item_completed(self, results, item, info):
        image_path = [x['path'] for ok, x in results if ok]
        # Build the custom, date-based storage path
        store_path = QttUtils.getStorePath()
        coverImages = []
        # Move each downloaded file to the new path; note that os.rename
        # requires source and target to be on the same filesystem
        if image_path:
            for image_url in image_path:
                file_name = os.path.split(str(image_url))
                new_image = store_path + '/' + file_name[1]
                coverImages.append(new_image)
                os.rename(self.IMAGES_STORE + '/' + image_url, new_image)
        item['cover'] = coverImages
        return item
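
# For reference, the `results` argument to item_completed is a list of
# (success, info) tuples; on success, info is a dict with 'url', 'path'
# (relative to IMAGES_STORE) and 'checksum' keys, roughly:
#
#   [(True, {'url': 'http://...', 'path': 'full/0a1b2c.jpg', 'checksum': '...'})]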


# Content image download
class ContentImagePipeline(ImagesPipeline):
    # Read the image store location from the project settings
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    # Schedule one download request per in-article image
    def get_media_requests(self, item, info):
        content_images = item['content_images']
        if content_images:
            for image_url in content_images:
                yield scrapy.Request(image_url)

    # Called once all content-image downloads for the item have finished
    def item_completed(self, results, item, info):
        image_path = [x['path'] for ok, x in results if ok]
        # Build the custom, date-based storage path
        store_path = QttUtils.getStorePath()
        contentImages = []
        # Move each downloaded file to the new path
        if image_path:
            for base_path in image_path:
                file_name = os.path.split(str(base_path))
                new_image = store_path + "/" + file_name[1]
                contentImages.append(new_image)
                os.rename(self.IMAGES_STORE + "/" + base_path, new_image)
        item['content_images'] = contentImages
        return item

    # # Completed hook, variant 1: also keep the original source URL
    # def item_completed(self, results, item, info):
    #     image_info = [(x['path'], x['url']) for ok, x in results if ok]
    #     # Build the custom, date-based storage path
    #     store_path = QttUtils.getStorePath()
    #     contentImages = []
    #     # Move each downloaded file to the new path
    #     if image_info:
    #         for value in image_info:
    #             image_url = value[0]
    #             image_source = value[1]
    #
    #             file_name = os.path.split(str(image_url))
    #             new_image = store_path + "/" + file_name[1]
    #             contentImages.append((new_image, image_source))
    #             os.rename(self.IMAGES_STORE + "/" + image_url, new_image)
    #     item['content_images'] = contentImages
    #     return item


# Write items to a JSON-lines file under the date-based storage path
class QutoutiaoPipeline(object):

    def __init__(self):
        # Build the custom, date-based storage path
        store_path = QttUtils.getStorePath()
        json_path = store_path + '/' + 'qutoutiao.json'
        self.filename = open(json_path, 'wb')

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.filename.write(text.encode('utf-8'))
        return item

    def close_spider(self, spider):
        self.filename.close()
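
# Each processed item becomes one line of qutoutiao.json, e.g. (shape only,
# actual values depend on the API response):
#
#   {"source_name": "...", "title": "...", "url": "...", "cid": "255", ...}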
# -*- coding: utf-8 -*-
# @Time    : 2018-6-1 11:01
# @Author  : Amir
# @Site    :
# @File    : qttutils.py
# @Software: PyCharm
'''
Qutoutiao utility helpers
'''
import os
import shutil
import time

from .settings import DATA_PATH


class QttUtils:

    # Build (or remove) the date-based storage directory
    #
    # @param [string] action ['remove' deletes the directory; default 'create']
    # @return [string] path/year/month/day/*
    @staticmethod
    def getStorePath(action='create'):
        localtimes = time.localtime()
        year = time.strftime("%Y", localtimes)
        month = time.strftime('%m', localtimes)
        day = time.strftime('%d', localtimes)
        store_path = DATA_PATH + "/%s/%s/%s" % (year, month, day)
        # Remove the directory tree
        if os.path.exists(store_path) and action == 'remove':
            shutil.rmtree(store_path)
        # Create the nested directories
        if not os.path.exists(store_path) and action == 'create':
            os.makedirs(store_path)
        return store_path
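
# Usage sketch: with DATA_PATH = './data', a call on 2018-06-01 creates and
# returns './data/2018/06/01'; action='remove' deletes that day's directory:
#
#   store_path = QttUtils.getStorePath()      # -> './data/2018/06/01'
#   QttUtils.getStorePath(action='remove')    # wipes the directory instead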
# -*- coding: utf-8 -*-
# Scrapy settings for QuTouTiao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'QuTouTiao'
SPIDER_MODULES = ['QuTouTiao.spiders']
NEWSPIDER_MODULE = 'QuTouTiao.spiders'
# Number of records to request per list page
LIST_LIMIT = 10

# Storage paths
DATA_PATH = r'./data'
IMAGES_STORE = r'./image'

# Categories (sub-feeds) to crawl
CATEGORY_INFO = [
    {"cid": 255, "name": "推荐"},   # Recommended
    {"cid": 1, "name": "热点"},     # Trending
    {"cid": 6, "name": "娱乐"},     # Entertainment
    {"cid": 5, "name": "养生"},     # Health
    {"cid": 2, "name": "搞笑"},     # Funny
    {"cid": 7, "name": "科技"},     # Tech
    {"cid": 8, "name": "生活"},     # Lifestyle
    {"cid": 10, "name": "财经"},    # Finance
    {"cid": 9, "name": "汽车"},     # Autos
]
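
# With LIST_LIMIT = 10, the spider builds one start URL per category, e.g.:
#
#   http://api.1sapp.com/content/outList?cid=255&tn=1&page=1&limit=10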
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'QuTouTiao (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'QuTouTiao.middlewares.QutoutiaoSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'QuTouTiao.middlewares.UserAgent_CookiesMiddleware': 299,
}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'QuTouTiao.pipelines.QutoutiaoPipeline': 300,
    'QuTouTiao.pipelines.ContentImagePipeline': 301,
    'QuTouTiao.pipelines.CoverImagePipeline': 302,
}
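# Pipelines run in ascending order of their value, so items pass through
# QutoutiaoPipeline (300) before the two image pipelines (301, 302); as a
# consequence, the JSON file records the original image URLs rather than
# the local paths the image pipelines substitute afterwards.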
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'