# twisted: an asynchronous, non-blocking framework built on an event loop
# (put simply: one thread issues HTTP requests to many targets)
# requests.get()
# response.text
# sk = socket.socket()
# sk.setblocking(False)  - sockets block by default; this call switches the socket to non-blocking
#
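A minimal sketch (not part of the original notes) of what a non-blocking socket buys you: one thread can start several connections without waiting for any of them to finish.
import socket

targets = [('www.bing.com', 80), ('www.baidu.com', 80)]
socks = []
for host, port in targets:
    sk = socket.socket()
    sk.setblocking(False)            # connect()/recv() now return immediately
    try:
        sk.connect((host, port))     # raises BlockingIOError instead of blocking
    except BlockingIOError:
        pass
    socks.append(sk)
# An event loop (select/epoll, or twisted's reactor) would then watch these
# sockets and send/recv on each one as it becomes ready.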
# How twisted executes
from twisted.internet import defer, reactor
from twisted.web.client import getPage   # note: getPage is deprecated in newer Twisted; the pattern is what matters here

# 1. The agent (reactor) accepts the tasks
def callback(contents):
    print(contents)

deferred_list = []
url_list = ['http://www.bing.com', 'https://www.baidu.com']
for url in url_list:
    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(callback)
    deferred_list.append(deferred)
    print(deferred)   # each call returns an unfired Deferred immediately

# 2. Once the agent has finished every task, stop
dlist = defer.DeferredList(deferred_list)
print(dlist)          # DeferredList wraps all of the per-request Deferreds

def all_done(arg):
    reactor.stop()

dlist.addBoth(all_done)

# 3. The agent starts processing
reactor.run()
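The same flow is often written with defer.inlineCallbacks so the waiting reads like sequential code; a minimal sketch reusing the imports and url_list above:
@defer.inlineCallbacks
def task(url):
    contents = yield getPage(bytes(url, encoding='utf8'))  # resumes once the page has arrived
    print(contents)

dlist = defer.DeferredList([task(url) for url in url_list])
dlist.addBoth(lambda _: reactor.stop())
reactor.run()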
Encapsulating a Request
class Request(object):
    def __init__(self, url, callback):
        self.url = url
        self.callback = callback

Request(url='', callback=lambda x: x)
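A minimal sketch (the queue and downloader here are illustrative, not Scrapy's internals) of how an engine could consume such Request objects: take one off a queue, download its url, and hand the body to its callback.
from collections import deque
import requests   # stands in for the downloader in this sketch

queue = deque([Request(url='http://www.bing.com', callback=lambda body: print(len(body)))])

while queue:
    req = queue.popleft()
    body = requests.get(req.url).text   # "download" the page
    req.callback(body)                  # hand the result to the request's callback
    # a real engine would also push any new Requests yielded by the callback back onto the queue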
The unique identifier of a URL
from scrapy.utils.request import request_fingerprint
from scrapy.http import Request

url1 = 'http://www.luffycity.com?k1=123&k2=456'
req1 = Request(url=url1)
url2 = 'http://www.luffycity.com?k2=456&k1=123'
req2 = Request(url=url2)
fd1 = request_fingerprint(request=req1)
fd2 = request_fingerprint(request=req2)
print(fd1)   # 500b31c7eed3dee70088a2dcf6c0906713e9d428
print(fd2)   # 500b31c7eed3dee70088a2dcf6c0906713e9d428
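Both fingerprints above are identical because only the query-string order differs; a request with genuinely different parameters gets a different fingerprint. A small check under the same imports (url3 is made up for illustration):
url3 = 'http://www.luffycity.com?k1=123&k2=999'
fd3 = request_fingerprint(request=Request(url=url3))

assert fd1 == fd2   # parameter order does not change the fingerprint
assert fd1 != fd3   # different parameter values do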
CookieJar
from scrapy.http.cookies import CookieJar
from scrapy.http.response.html import HtmlResponse
from scrapy.http import Request
import scrapy

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    cookie_dict = {}

    def parse(self, response):
        # Instantiate a CookieJar object
        cookie_jar = CookieJar()
        # Pull the cookies out of the response headers; they are stored on the cookie_jar object
        cookie_jar.extract_cookies(response, response.request)
        # Unpack the cookies from the jar into a plain dict
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value
        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            body="phone=8613121758648&password=woshiniba&oneMonth=1",
            # the body could also be built with urlencode({...})
            cookies=self.cookie_dict,
            headers={
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
            },
            callback=self.check_login
        )

# Global control of the output encoding
# import sys, os, io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
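The Request above names self.check_login as its callback, but the notes don't show it; a minimal, assumed sketch (it belongs inside ChoutiSpider, and the follow-up request is illustrative):
    def check_login(self, response):
        # Inspect the login result, then revisit the front page carrying the session cookies
        print(response.text)
        yield Request(
            url='https://dig.chouti.com/',
            cookies=self.cookie_dict,
            callback=self.parse,
            dont_filter=True,   # the start URL was already crawled, so skip the dupefilter
        )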
DupeFilter
from scrapy.dupefilter import BaseDupeFilter
from scrapy.utils.request import request_fingerprint

class XdbDuperFilter(BaseDupeFilter):
    def __init__(self):
        self.visited_fd = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        fd = request_fingerprint(request=request)
        if fd in self.visited_fd:
            return True          # already seen: tell the scheduler to drop the request
        self.visited_fd.add(fd)

    def open(self):              # can return a deferred
        print('opened')

    def close(self, reason):     # can return a deferred
        print('closed')

    def log(self, request, spider):   # log that a request has been filtered
        print('log')
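To have Scrapy use this filter instead of the default RFPDupeFilter, point DUPEFILTER_CLASS at it in settings.py (the module path xdb.dupefilters is an assumption about the project layout):
# settings.py
DUPEFILTER_CLASS = 'xdb.dupefilters.XdbDuperFilter'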
Spider middleware methods
from scrapy import signals

class XdbSpiderMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
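The middleware only runs once it is registered; in settings.py that is done with SPIDER_MIDDLEWARES (the module path and the priority 543 are assumptions):
# settings.py
SPIDER_MIDDLEWARES = {
    'xdb.middlewares.XdbSpiderMiddleware': 543,
}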
Using exceptions
from scrapy.exceptions import DropItem

class FilePipeline(object):
    def open_spider(self, spider):
        # self.f is opened here so process_item can write to it
        self.f = open('xx.log', 'a+')

    def close_spider(self, spider):
        self.f.close()

    def process_item(self, item, spider):
        print('File', item['href'])
        self.f.write(item['href'] + '\n')
        # return item        # would pass the item on to the next pipeline
        raise DropItem()     # stop later pipelines from processing this item
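For the pipeline to be called at all it has to be enabled in settings.py; an assumed registration (module path and priority are illustrative):
# settings.py
ITEM_PIPELINES = {
    'xdb.pipelines.FilePipeline': 300,
}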
Limiting crawl depth
import scrapy
from scrapy.http import Request

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']

    def parse(self, response):
        print(response.request.url, response.meta.get('depth', 0))
        # item_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        # for item in item_list:
        #     text = item.xpath('.//a/text()').extract_first()
        #     href = item.xpath('.//a/@href').extract_first()
        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = "https://dig.chouti.com" + page
            yield Request(url=page, callback=self.parse, dont_filter=False)  # e.g. https://dig.chouti.com/all/hot/recent/2
            # yield Request(url=page, callback=self.parse, dont_filter=True)
Configuration file (settings.py):
# Limit the crawl depth
DEPTH_LIMIT = 3

Note on dont_filter: dont_filter=False (the default) means the request IS passed through the dupefilter, so duplicates are dropped; dont_filter=True bypasses the filter and the request is always scheduled.
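The depth read from response.meta['depth'] is maintained by Scrapy's built-in DepthMiddleware; besides DEPTH_LIMIT, related settings.py knobs include (values here are illustrative):
# settings.py
DEPTH_PRIORITY = 1           # > 0 lowers priority as depth grows (shallow pages first); < 0 does the opposite
DEPTH_STATS_VERBOSE = True   # record the number of requests seen at each depth in the crawl stats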