# -*- coding: utf-8 -*-
import scrapy
from daomu.items import DaomuItem
class DaomuspiderSpider(scrapy.Spider):
    """Crawl daomubiji.com: index page -> book pages -> chapter pages.

    One ``DaomuItem`` is emitted per chapter. It carries the book title,
    book description and book URL, the chapter title and URL, and the list
    of paragraph texts found on the chapter page.
    """

    name = "daomuspider"
    # allowed_domains = ["www.daomubiji.com"]
    start_urls = ['http://www.daomubiji.com/']
    index_url = 'http://www.daomubiji.com/'

    def start_requests(self):
        """Start the crawl with a single request for the index page."""
        yield scrapy.Request(url=self.index_url, callback=self.parse_book)

    def parse_book(self, response):
        """Follow every link inside ``.article-content`` to a book page."""
        for link in response.css('.article-content a'):
            book_url = link.css('a::attr(href)').extract_first()
            if not book_url:
                # Anchor without an href: nothing to follow.
                continue
            yield scrapy.Request(
                url=response.urljoin(book_url),
                callback=self.parse_chapter,
            )

    def parse_chapter(self, response):
        """Schedule one detail request per chapter listed on a book page.

        The chapter's item travels to ``parse_detail`` through the request
        ``meta`` and is only emitted there, once its content is filled in.
        """
        book_title = response.css(
            '.focusbox .container h1::text').extract_first()
        book_info = response.css(
            '.focusbox .container .focusbox-text::text').extract_first()
        book_url = response.url

        for chapter in response.css('.excerpts-wrapper .excerpts .excerpt'):
            chapter_url = chapter.css('a::attr(href)').extract_first()
            if not chapter_url:
                # No link means no chapter page to fetch.
                continue
            chapter_url = response.urljoin(chapter_url)

            # A fresh item per chapter: requests are handled asynchronously,
            # so sharing one mutable item would let later iterations
            # overwrite the fields of chapters still in flight.
            item = DaomuItem()
            item['book_title'] = book_title
            item['book_info'] = book_info
            item['book_url'] = book_url
            item['chapter_title'] = self._format_chapter_title(
                chapter.css('a::text').extract_first())
            item['chapter_url'] = chapter_url

            # Key point: hand the item over to the next callback via meta.
            yield scrapy.Request(
                url=chapter_url,
                callback=self.parse_detail,
                meta={'item': item},
            )

    def parse_detail(self, response):
        """Attach the chapter's paragraph texts to its item and emit it."""
        item = response.meta['item']
        item['content'] = response.css('.article-content p::text').extract()
        yield item

    @staticmethod
    def _format_chapter_title(link_text):
        """Return ``"<second word>:<last word>"`` of a chapter link text.

        The text is split on single spaces. When it is missing or contains
        no space, it is returned unchanged instead of raising.
        """
        if not link_text:
            return link_text
        words = link_text.split(' ')
        if len(words) < 2:
            return link_text
        return words[1] + ':' + words[-1]
import pymongo
class DaomuPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB.

    Each item is inserted into a collection named after the item's class.
    """

    def __init__(self, mongo_uri='localhost', mongo_db='daomu'):
        """Remember the connection parameters; no connection is opened yet.

        Args:
            mongo_uri: Host or URI handed to ``pymongo.MongoClient``.
            mongo_db: Name of the database to write into.
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler settings.

        Reads ``MONGO_URI`` and ``MONGO_DB``; when a setting is absent the
        same defaults as ``__init__`` are used.
        """
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'localhost'),
            mongo_db=crawler.settings.get('MONGO_DB', 'daomu'),
        )

    def open_spider(self, spider):
        """Open the MongoDB client and select the target database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert the item into its collection and pass it on unchanged."""
        collection_name = type(item).__name__
        # Note: the item must be converted to a plain dict before inserting.
        # insert_one replaces Collection.insert, which PyMongo 4 removed.
        self.db[collection_name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client opened in ``open_spider``."""
        self.client.close()