pyspider示例代码六:传递参数

传递参数

示例一

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Created on 2014-10-25 14:31:24

import re
import json
from libs.pprint import pprint
from libs.base_handler import *

class Handler(BaseHandler):
    '''
    Sample handler: crawl the Douban "haixiuzu" group discussion list, extract
    the details of every topic, and forward topics that contain images to the
    Duoshuo import API.
    '''
    crawl_config = {
    }
    # String prepended to every Douban URL before it is crawled; the empty
    # string leaves the URLs unchanged.
    proxy = ""

    @every(0, 30)
    def on_start(self):
        '''Seed the crawl with the group's discussion list page.'''
        self.crawl(self.proxy+'http://www.douban.com/group/haixiuzu/discussion',
                   force_update=True, callback=self.index_page)

    @config(age=10)
    def index_page(self, response):
        '''Schedule a detail-page crawl for every topic link in the list.'''
        for each in response.doc('tr > .title > a').items():
            self.crawl(self.proxy+each.attr.href, callback=self.detail_page)

    @config(age=30*24*60*60)
    def detail_page(self, response):
        '''
        Extract the url, title, author, author url and image urls of a topic.

        The assertion fails the task when the response URL is the Douban
        home page instead of a topic page.
        '''
        assert response.url != "https://www.douban.com/"
        return {
            "url": response.url,
            "title": response.doc("#content h1").text(),
            "author": response.doc(".topic-content .from a").text(),
            "author_url": response.doc("DIV.topic-doc>H3>SPAN.from>A").attr.href,
            "imgs": [x.attr.src for x in response.doc('.topic-doc img').items()]
        }

    def on_result(self, result):
        '''
        Post a topic that contains images to the Duoshuo import API.

        Nothing is posted when the result is empty, has no images, or when
        the current response URL carries no ``topic/<id>`` segment.
        '''
        # NOTE(review): this override never calls the base-class on_result;
        # confirm that skipping the default result handling is intended.
        import base64

        if not result or not result.get('imgs'):
            return
        # Raw string so that \d is a regex class, not a string escape.
        match = re.search(r"topic/(\d+)", self.response.url)
        if match is None:
            # No topic id in the URL: there is no key to post under.
            return
        post_id = match.group(1)
        # base64.b64encode emits no line breaks and works on Python 2 and 3,
        # unlike the Python-2-only str.encode("base64") codec.
        message = base64.b64encode(json.dumps(result).encode("utf-8")).decode("ascii")
        # Presumably the "#<post_id>" fragment keeps each task URL distinct;
        # verify against the scheduler's de-duplication rules.
        # NOTE(review): the API secret is hard-coded below; move it to
        # configuration before sharing or deploying this script.
        self.crawl("https://api.duoshuo.com/posts/import.json#"+post_id, method="POST",
            data={
            "short_name": "database",
            "secret": "8e5a5be8873ad7e9a59147c3cfd10e73",
            "posts[0][post_key]": post_id,
            "posts[0][thread_key]": "haixiuzu",
            "posts[0][message]": message
        }, callback=self.post_to_duoshuo)

    def post_to_duoshuo(self):
        '''Callback of the Duoshuo import request; intentionally does nothing.'''
        pass

示例二

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-08-30 19:11:28
# Project: prieto

import re
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    '''
    Crawl the printable view of forum threads on bbs.fobshanghai.com.

    Shows how a value is handed from one callback to the next: on_start
    attaches a step index to each task through ``save`` and gen_url reads
    it back from ``respond.save``.
    '''

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        '''Schedule 10000 placeholder tasks, each carrying its step index.'''
        for i in range(10000):
            self.crawl('data:,step%d' % i, callback=self.gen_url, save=i)

    @config(priority=0)
    def gen_url(self, respond):
        '''Crawl the 700 consecutive thread ids that belong to this step.'''
        first_tid = respond.save * 700
        for i in range(first_tid, first_tid + 700):
            self.crawl("http://bbs.fobshanghai.com/viewthread.php?action=printable&tid=%d" % i, callback=self.index_page)

    @config(priority=1)
    def index_page(self, respond):
        '''
        Return the thread id, url and html of a printable thread page.

        When the markup of <head> starts with a <meta> tag the page is
        reported as a missing thread and a fixed message replaces the html.
        '''
        # TODO: split the body into individual posts on the <hr> separators
        # and extract the thread author (a regular expression would be the
        # better tool for that).
        tid = respond.url.split('=')[-1]

        # .html() yields None for a missing or empty <head>; fall back to an
        # empty string so the check below cannot raise AttributeError.
        head_html = respond.doc('head').html() or ''
        if head_html.startswith('<meta'):
            return {
               "tid": tid,
               "url": respond.url,
               "html": 'The specified thread does not exist.',
            }

        return {
            "tid": tid,
            "url": respond.url,
            "html": respond.doc.html(),
        }

 

posted @ 2016-12-07 13:18  microman  阅读(2596)  评论(0)  编辑  收藏  举报