Scrapy - 6 实战东莞民生网站 2

dongdong.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from newdongguan.items import NewdongguanItem


class DongdongSpider(CrawlSpider):
    """Crawl complaint posts from the Dongguan Sunshine Hotline site.

    Follows the paginated listing, repairs server-mangled pagination URLs,
    and parses each post page into a NewdongguanItem.
    """
    name = 'dongdong'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # Matches pagination links (pass a real tuple, not a parenthesized string).
    pagelink = LinkExtractor(allow=(r"type=4",))
    # Matches individual post links on each listing page. The dot before
    # 'shtml' is escaped so it matches a literal '.' rather than any char.
    contentlink = LinkExtractor(allow=(r"/html/question/\d+/\d+\.shtml",))

    rules = (
        # The web server rewrites the extracted pagination URLs, so
        # process_links restores them before they are scheduled.
        Rule(pagelink, process_links="deal_links"),
        Rule(contentlink, callback="parse_item"),
    )

    def deal_links(self, links):
        """Undo the server-side URL mangling on every extracted link.

        links: list of Link objects extracted from the current response.
        Returns the same list with each url repaired in place.
        """
        for each in links:
            each.url = each.url.replace("?", "&").replace("Type&", "Type?")
        return links

    def parse_item(self, response):
        """Parse one post page into a NewdongguanItem and yield it."""
        item = NewdongguanItem()
        # Title of the post.
        item['title'] = response.xpath(
            '//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
        # Post number: last colon-separated field of the title's last word.
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]
        # Body text: try the with-picture layout first; if that yields
        # nothing, fall back to the plain-text layout. (The original had
        # identical assignments in both if/else branches — collapsed here.)
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        if not content:
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
        item['content'] = "".join(content).strip()
        # URL of the post page itself.
        item['url'] = response.url

        yield item

  items.py

# -*- coding: utf-8 -*-

import scrapy

class NewdongguanItem(scrapy.Item):
    """Container for one scraped complaint post."""
    # define the fields for your item here like:
    # Post title.
    title = scrapy.Field()
    # Post number (ID parsed out of the title by the spider).
    number = scrapy.Field()
    # Post body text.
    content = scrapy.Field()
    # URL of the post page.
    url = scrapy.Field()

  pipelines.py

# -*- coding: utf-8 -*-

import codecs
import json

class NewdongguanPipeline(object):
    """Append every scraped item to a JSON-lines file, one object per line."""

    def __init__(self):
        # Open the output file once, up front; UTF-8 so Chinese text is
        # stored as readable characters.
        self.filename = codecs.open("donggguan.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # ensure_ascii=False keeps non-ASCII characters (e.g. Chinese)
        # as-is instead of escaping them to \uXXXX sequences.
        line = json.dumps(dict(item), ensure_ascii=False)
        self.filename.write(line + "\n")
        return item

    def close_spider(self, spider):
        # Release the file handle when the spider finishes.
        self.filename.close()

  settings.py

# Route every scraped item through the JSON-export pipeline.
# The number is the pipeline's order (lower runs earlier, range 0-1000).
ITEM_PIPELINES = {
'newdongguan.pipelines.NewdongguanPipeline': 1,
}

BOT_NAME = 'newdongguan'

SPIDER_MODULES = ['newdongguan.spiders']
NEWSPIDER_MODULE = 'newdongguan.spiders'

# Present the crawler as a desktop IE9 browser.
# NOTE(review): this string looks truncated — a closing ')' after
# 'Trident/5.0;' appears to be missing. Confirm against the intended UA.
USER_AGENT = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"

# Obey robots.txt rules
# NOTE(review): if the target site's robots.txt disallows these paths,
# the spider will fetch nothing — verify, or set this to False for the demo.
ROBOTSTXT_OBEY = True

  

posted @ 2017-09-03 16:24  AlamZ  阅读(155)  评论(0)    收藏  举报