douban.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from book.items import DoubanItem
from scrapy.utils.project import get_project_settings


class DoubanSpider(scrapy.Spider):
    name = 'douban'
    allowed_domains = ['douban.com']
    start_urls = ['https://book.douban.com/tag/?view=type&icn=index-sorttags-all']
    def parse(self, response):
        '''
        :param response: response object for the tag index page
        :return: yields requests for the category (tag) links
        '''
        # With no arguments, LinkExtractor extracts every link on the page.
        # deny takes a regex (or a list of regexes); it is the opposite of allow
        # and drops links whose absolute URL matches the pattern.
        # Here: extract the tag links inside div.article and drop the bare
        # book.douban.com homepage link.
        le = LinkExtractor(restrict_css='div.article',
                           deny='//book.douban.com/$')
        # print(le.extract_links(response))
        for link in le.extract_links(response):
            yield scrapy.Request(
                link.url,
                callback=self.next_url_parse,
                # link.text is the tag name; pass it along so items can be categorised
                meta={"item": link.text,
                      "url": link.url}
            )
    def next_url_parse(self, response):
        '''
        :param response: response object for a tag's first listing page
        :return: yields requests for each pagination page of the tag
        '''
        url = response.meta["url"]
        next = '?start=%d&type=T'
        # read the custom value defined in settings
        setting = get_project_settings()
        for page in range(setting.get('MAX_PAGE')):
            i = page * 20
            next_url = url + next % i
            yield scrapy.Request(
                next_url,
                callback=self.url_parse,
                meta={"item": response.meta["item"]}
            )
    # allow is the opposite of deny: only links whose URL matches the regex are extracted
    def url_parse(self, response):
        '''
        :param response: response object for a listing page
        :return: yields requests for the book detail pages
        '''
        tag = response.meta["item"]
        le = LinkExtractor(restrict_css='ul.subject-list',
                           allow=r'/subject/\d+/$')
        for link in le.extract_links(response):
            yield scrapy.Request(
                link.url,
                callback=self.detal_parse,
                meta={"item": tag}
            )
    def detal_parse(self, response):
        '''
        :param response: response object for a book detail page
        :return: parses the detail page and yields the item
        '''
        item = DoubanItem()
        # string() flattens all text under div#info; fields are separated by full-width colons
        infos = response.xpath('string(//div[@id="info"])').extract_first().replace('\n', '').split('：')
        time = None
        for info in infos:
            if '-' in info:
                # the segment containing '-' is the publication date, e.g. 2012-8-1
                time = info[0:8]
        title = response.css('div#wrapper h1 span::text').extract_first()
        author = response.css('div#info a::text').extract_first().replace('\n', '').replace(' ', '')
        score = response.css('div.rating_self.clearfix strong::text').extract_first()
        # xpath string() extracts the full text under the node
        content = response.css('div.indent div div.intro').xpath('string(.)')
        if content:
            content = content.extract_first().replace('\n', '').replace(' ', '')
        else:
            content = None
        item['tag'] = response.meta["item"]
        item['book_name'] = title
        item['author'] = author
        item['time'] = time
        item['score'] = score
        item['content'] = content
        yield item
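
A quick way to sanity-check the two LinkExtractor rules used above (restrict_css + deny on the tag page, restrict_css + allow on the listing pages) is to run them against a hand-built HtmlResponse; a minimal sketch, with a hypothetical HTML fragment standing in for the real tag page:

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

# Hypothetical fragment of the tag index page: one tag link plus the
# homepage link that the deny pattern is meant to drop
html = '''
<div class="article">
  <a href="https://book.douban.com/tag/xiaoshuo">fiction</a>
  <a href="https://book.douban.com/">home</a>
</div>
'''
response = HtmlResponse(url='https://book.douban.com/tag/',
                        body=html, encoding='utf-8')

le = LinkExtractor(restrict_css='div.article', deny='//book.douban.com/$')
for link in le.extract_links(response):
    print(link.url, link.text)   # only the /tag/xiaoshuo link should survive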
items.py
import scrapy


class DoubanItem(scrapy.Item):
    # define the fields for your item here
    tag = scrapy.Field()
    book_name = scrapy.Field()
    author = scrapy.Field()
    time = scrapy.Field()
    score = scrapy.Field()
    content = scrapy.Field()
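
A DoubanItem behaves like a dict: the pipeline below converts it with dict() and stores it in a collection named after its tag. A minimal sketch, with hypothetical sample values:

from book.items import DoubanItem

item = DoubanItem()
item['tag'] = '小说'              # hypothetical sample values
item['book_name'] = '活着'
item['author'] = '余华'
item['time'] = '2012-8-1'
item['score'] = '9.4'
item['content'] = '...'
print(dict(item))                 # this dict is what DoubanPipeline inserts into MongoDB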
middlewares.py
from scrapy import signals
import logging
import requests


class ProxyMiddleware(object):
    def __init__(self, proxy_url):
        self.logger = logging.getLogger(__name__)
        self.proxy_url = proxy_url

    def get_random_proxy(self):
        '''
        Fetch a random proxy from the proxy pool.
        :return: a random proxy (host:port), or False on connection error
        '''
        try:
            response = requests.get(self.proxy_url)
            if response.status_code == 200:
                proxy = response.text
                return proxy
        except requests.ConnectionError:
            return False

    def process_request(self, request, spider):
        # Only switch to a proxy after the first request has failed (been redirected),
        # because crawling through the proxy pool is much slower
        if request.meta.get("redirect_times") == 1:
            proxy = self.get_random_proxy()
            if proxy:
                uri = 'https://{proxy}'.format(proxy=proxy)
                # log the proxy being used
                self.logger.debug('Using proxy ' + proxy)
                # set the random proxy on the request
                request.meta['proxy'] = uri

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            proxy_url=crawler.settings.get('PROXY_URL')
        )
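
get_random_proxy assumes the PROXY_URL endpoint returns a bare host:port string in the response body; a minimal sketch for checking that assumption, with a hypothetical local proxy-pool address:

import requests

PROXY_URL = 'http://127.0.0.1:5555/random'   # hypothetical proxy-pool endpoint

resp = requests.get(PROXY_URL, timeout=5)
print(resp.status_code)    # ProxyMiddleware only uses the proxy when this is 200
print(repr(resp.text))     # expected form: '1.2.3.4:8080'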
pipelines.py
import pymongo


class DoubanPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    # receive the settings values via from_crawler
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    # connect to MongoDB when the spider opens
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    # insert the item into a collection named after its tag
    def process_item(self, item, spider):
        self.db[item['tag']].insert_one(dict(item))
        return item

    # close the connection when the spider closes
    def close_spider(self, spider):
        self.client.close()
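
None of the files above show settings.py, but the spider, middleware and pipeline all read custom keys from it (MAX_PAGE, PROXY_URL, MONGO_URI, MONGO_DB), and the middleware and pipeline must be registered there. A minimal sketch, assuming the project package is book (as in from book.items import DoubanItem); the concrete values are placeholders:

settings.py
BOT_NAME = 'book'

# custom keys read by the code above (placeholder values)
MAX_PAGE = 5                                   # pagination pages fetched per tag
PROXY_URL = 'http://127.0.0.1:5555/random'     # hypothetical proxy-pool endpoint
MONGO_URI = 'localhost'
MONGO_DB = 'douban'

# enable the proxy middleware and the MongoDB pipeline
DOWNLOADER_MIDDLEWARES = {
    'book.middlewares.ProxyMiddleware': 543,
}
ITEM_PIPELINES = {
    'book.pipelines.DoubanPipeline': 300,
}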