How twisted executes, Request encapsulation, unique URL identification, DupeFilter, middleware, and exception usage

# twisted: an event-loop based, asynchronous, non-blocking framework
# (put simply: a single thread issues HTTP requests to multiple targets)
# requests.get()
# response.text
# sk = socket.socket()
# sk.setblocking(False)  # sockets are blocking by default; this switches the socket to non-blocking mode
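Before the twisted example, here is a minimal sketch of the non-blocking socket idea itself: a single thread starts connections to several hosts without waiting for any of them (error handling and the actual send/receive loop are omitted):

import socket

hosts = ['www.bing.com', 'www.baidu.com']
socks = []
for host in hosts:
    sk = socket.socket()
    sk.setblocking(False)      # connect() now returns immediately instead of waiting
    try:
        sk.connect((host, 80))
    except BlockingIOError:    # expected: the TCP handshake is still in progress
        pass
    socks.append(sk)
# An event loop (select/epoll, or twisted's reactor) would now wait until each
# socket becomes writable before sending its HTTP request.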
# How twisted executes
from twisted.web.client import getPage, defer
from twisted.internet import reactor

# 1. The "agent" (the reactor) starts accepting tasks as Deferreds
def callback(contents):
    print(contents)

deferred_list = []
url_list = ['http://www.bing.com', 'https://www.baidu.com']
for url in url_list:
    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(callback)
    deferred_list.append(deferred)
    print(111111111111111, deferred)

# 2. Once the agent has finished every task, stop
dlist = defer.DeferredList(deferred_list)
print(2222222222222, dlist)

def all_done(arg):
    reactor.stop()

dlist.addBoth(all_done)

# 3. The agent starts processing
reactor.run()
Request encapsulation
class Request(object):
    def __init__(self, url, callback):
        self.url = url
        self.callback = callback
Request(url='', callback=lambda x: x)
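A toy illustration (not scrapy's actual engine) of how objects like this get consumed: a queue of Request objects is drained, each URL is fetched, and the response is handed to the callback stored on the Request. The `download` helper below is a stand-in fetcher, not a real library call:

from collections import deque

def download(url):
    # stand-in fetcher; a real engine would do this asynchronously
    return '<html>response from %s</html>' % url

def parse(response):
    print('parsed:', response)

queue = deque([
    Request(url='http://www.bing.com', callback=parse),
    Request(url='https://www.baidu.com', callback=parse),
])
while queue:
    request = queue.popleft()
    request.callback(download(request.url))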

Unique URL identification (request fingerprints)
from scrapy.utils.request import request_fingerprint
from scrapy.http import Request

url1 = 'http://www.luffycity.com?k1=123&k2=456'
req1 = Request(url=url1)
url2 = 'http://www.luffycity.com?k2=456&k1=123'
req2 = Request(url=url2)

fd1 = request_fingerprint(request=req1)
fd2 = request_fingerprint(request=req2)
print(fd1)  # 500b31c7eed3dee70088a2dcf6c0906713e9d428
print(fd2)  # 500b31c7eed3dee70088a2dcf6c0906713e9d428  -- same fingerprint: query-string order does not matter
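The fingerprint ignores headers by default, so requests that differ only in, say, their Cookie header collapse to the same fingerprint. If that matters, request_fingerprint accepts an include_headers argument (a small sketch, assuming the same scrapy versions this post is based on):

from scrapy.utils.request import request_fingerprint
from scrapy.http import Request

req_a = Request(url='http://www.luffycity.com', headers={'Cookie': 'session=1'})
req_b = Request(url='http://www.luffycity.com', headers={'Cookie': 'session=2'})

print(request_fingerprint(req_a) == request_fingerprint(req_b))   # True  - headers ignored by default
print(request_fingerprint(req_a, include_headers=['Cookie']) ==
      request_fingerprint(req_b, include_headers=['Cookie']))     # False - Cookie header now part of the hash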
CookieJar
from scrapy.http.cookies import CookieJar
from scrapy.http.response.html import HtmlResponse
from scrapy.http import Request
import scrapy

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    cookie_dict = {}

    def parse(self, response):
        # Instantiate a CookieJar object
        cookie_jar = CookieJar()
        # Extract the cookies from the response headers; they are stored on the cookie_jar object
        cookie_jar.extract_cookies(response, response.request)
        # Unpack the jar's nested structure into a plain dict
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value
        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            body="phone=8613121758648&password=woshiniba&oneMonth=1",
            # body=urlencode({...})  # e.g. "phone=8615131255555&password=12sdf32sdf&oneMonth=1"
            cookies=self.cookie_dict,
            headers={
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
            },
            callback=self.check_login
        )

# Control the output encoding globally:
# import sys, os, io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
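The login request above names `self.check_login` as its callback, but that method is not shown in the original snippet. A hypothetical sketch of what it could look like, to be added inside the ChoutiSpider class:

    def check_login(self, response):
        # Hypothetical callback: print the login response body; a real spider would
        # inspect it to decide whether the login succeeded before continuing.
        print(response.text)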
DupeFilter
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint

class XdbDupeFilter(BaseDupeFilter):
    def __init__(self):
        self.visited_fd = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        # Return True when the request's fingerprint has been seen before,
        # which tells the scheduler to discard it
        fd = request_fingerprint(request=request)
        if fd in self.visited_fd:
            return True
        self.visited_fd.add(fd)

    def open(self):  # can return a deferred
        print('start')

    def close(self, reason):  # can return a deferred
        print('end')

    def log(self, request, spider):  # log that a request has been filtered
        print('log')
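To make scrapy use this filter instead of the default RFPDupeFilter, register it in settings.py. The module path below is an assumption about where the class lives; adjust it to the actual file:

# settings.py ('xdb.dupefilters' is an assumed module path)
DUPEFILTER_CLASS = 'xdb.dupefilters.XdbDupeFilter'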
Middleware methods
from scrapy import signals

class XdbSpiderMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
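Like the dupe filter, a spider middleware only takes effect once it is registered in settings.py. A sketch, assuming the class sits in xdb/middlewares.py; the number is its position in the chain (lower values run closer to the engine):

# settings.py ('xdb.middlewares' is an assumed module path)
SPIDER_MIDDLEWARES = {
    'xdb.middlewares.XdbSpiderMiddleware': 543,
}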
Using exceptions (DropItem)
from scrapy.exceptions import DropItem

class FilePipeline(object):
    def open_spider(self, spider):
        # open the file once when the spider starts (provides the self.f used below)
        self.f = open('xx.log', 'a+')

    def process_item(self, item, spider):
        # f = open('xx.log', 'a+')        # opening the file here per item also works,
        # f.write(item['href'] + '\n')    # but open_spider/close_spider avoids reopening it
        # f.close()
        print('File', item['href'])
        self.f.write(item['href'] + '\n')
        # return item       # returning the item would pass it on to the next pipeline
        raise DropItem()    # raising DropItem stops later pipelines from seeing this item

    def close_spider(self, spider):
        self.f.close()
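Raising DropItem matters most when several pipelines are registered: they run in priority order, and a dropped item never reaches the later ones. A sketch of the settings, where the module path and the second pipeline (DbPipeline) are assumptions for illustration:

# settings.py ('xdb.pipelines' and DbPipeline are assumed for illustration)
ITEM_PIPELINES = {
    'xdb.pipelines.FilePipeline': 300,  # lower number runs first
    'xdb.pipelines.DbPipeline': 301,    # skipped for items dropped by FilePipeline
}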
Limiting crawl depth
import scrapy
from scrapy.http import Request

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']

    def parse(self, response):
        # response.meta['depth'] is filled in by scrapy's DepthMiddleware
        print(response.request.url, response.meta.get('depth', 0))
        # item_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        # for item in item_list:
        #     text = item.xpath('.//a/text()').extract_first()
        #     href = item.xpath('.//a/@href').extract_first()
        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = "https://dig.chouti.com" + page   # e.g. https://dig.chouti.com/all/hot/recent/2
            yield Request(url=page, callback=self.parse, dont_filter=False)
            # yield Request(url=page, callback=self.parse, dont_filter=True)

# settings.py:
# DEPTH_LIMIT = 3   # limit the crawl depth
# dont_filter=False means the request still goes through the dupe filter
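Depth counting is handled by scrapy's built-in DepthMiddleware, which reads these settings; a sketch with example values:

# settings.py
DEPTH_LIMIT = 3             # discard requests more than 3 levels below the start URLs
DEPTH_PRIORITY = 0          # positive values favour breadth-first, negative favour depth-first
DEPTH_STATS_VERBOSE = True  # collect per-depth request counts in the crawl stats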

 

 