高性能-爬虫原理
socket 原理:爬虫本质上就是一个 socket 客户端去连接一个 socket 服务端。客户端连接上服务端之后,只是按照 HTTP 协议封装好请求头和要访问的地址,再发送给服务端。
阻塞:
import socket


def exchange(sock, request, bufsize=8096):
    """Send ``request`` on a connected blocking socket and return the whole reply.

    A single ``recv`` call returns at most ``bufsize`` bytes, so the reply is
    read in a loop until the peer closes its sending side (``recv`` returns
    ``b""``).  With an HTTP/1.0 request the server closes the connection after
    the response, which is what terminates the loop.

    :param sock: a connected socket in blocking mode
    :param request: the raw request bytes to send
    :param bufsize: maximum number of bytes to read per ``recv`` call
    :return: every byte received before the peer closed, as one ``bytes``
    """
    # sendall keeps writing until the whole request has been handed to the
    # kernel; plain send may write only part of it.
    sock.sendall(request)
    chunks = []
    while True:
        chunk = sock.recv(bufsize)
        if not chunk:  # empty read: the peer has finished sending
            break
        chunks.append(chunk)
    return b"".join(chunks)


def fetch(address, request):
    """Open a blocking TCP connection to ``address`` and return the raw reply.

    ``connect`` blocks until the connection is established (or fails), and
    only then is the request sent.

    :param address: ``(host, port)`` tuple of the server
    :param request: the raw HTTP request bytes
    :return: the raw response bytes
    :raises OSError: if connecting, sending or receiving fails
    """
    # The context manager closes the socket even when an exception is raised.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.connect(address)
        return exchange(sock, request)


if __name__ == "__main__":
    ip_port = ('43.226.160.17', 80)
    pro = b"GET / HTTP/1.0\r\nHost: dig.chouti.com\r\n\r\n"
    print(pro)
    feedback = fetch(ip_port, pro)
    # errors='replace' keeps the demo from crashing on a body that is not
    # valid UTF-8.
    print(feedback.decode('utf-8', errors='replace'))
非阻塞:高性能的 HTTP 客户端用的就是这种请求方式。connect 只管把连接请求发出去,然后立即返回,不等待连接结果;而阻塞的方式要一直等到连接成功之后,才能继续往下发送消息。
import socket
import time


def send_and_receive(sock, request, response_wait=3, bufsize=8096):
    """Send ``request`` on a non-blocking socket, wait, then read once.

    :param sock: a connected socket in non-blocking mode
    :param request: the raw request bytes to send
    :param response_wait: seconds to sleep before trying to read the reply
    :param bufsize: maximum number of bytes to read
    :return: the bytes read (``b""`` if the peer closed the connection), or
        ``None`` when no data had arrived yet
    """
    sock.send(request)
    time.sleep(response_wait)
    try:
        return sock.recv(bufsize)
    except BlockingIOError:
        # A non-blocking recv raises instead of waiting when nothing has
        # arrived yet; report that as "no data" rather than crashing.
        return None


def fetch_nonblocking(address, request, connect_wait=5, response_wait=3):
    """Issue one request over a non-blocking socket using fixed sleeps.

    The sleeps stand in for "do other work while the network is busy"; a real
    program would ask ``select`` when the socket is ready instead of guessing
    a delay.

    :param address: ``(host, port)`` tuple of the server
    :param request: the raw HTTP request bytes
    :param connect_wait: seconds to give the connection attempt
    :param response_wait: seconds to give the server to answer
    :return: the first chunk of the response, or ``None`` if nothing arrived
    :raises OSError: if the connection has not been established by the time
        the request is sent
    """
    # The context manager closes the socket even when an exception is raised.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        print('这是非阻塞')
        sock.setblocking(False)
        try:
            # In non-blocking mode connect returns at once; the handshake is
            # still in progress, which Python reports as BlockingIOError.
            sock.connect(address)
        except BlockingIOError as e:
            print(e)
        time.sleep(connect_wait)
        return send_and_receive(sock, request, response_wait)


if __name__ == "__main__":
    ip_port = ('43.226.160.17', 80)
    pro = b"GET / HTTP/1.0\r\nHost: dig.chouti.com\r\n\r\n"
    print(pro)
    feedback = fetch_nonblocking(ip_port, pro)
    print(feedback)
IO多路复用:用来同时检测【多个】socket 对象是否有变化(是否已经可写、是否有数据可读)。
import select
import socket

# Sample hosts to crawl; replace with the real list of hosts.
HOSTS = ['www.baidu.com', 'www.bing.com', 'www.cnblogs.com']


def build_request(host, path='/'):
    """Return the raw HTTP/1.0 GET request for ``path`` on ``host`` as bytes.

    HTTP/1.0 is used so that the server closes the connection after the
    response, which marks the end of the data.
    """
    return ('GET %s HTTP/1.0\r\nHost: %s\r\n\r\n' % (path, host)).encode('utf-8')


def fetch_all(hosts, port=80, timeout=0.05):
    """Fetch ``/`` from every host concurrently with one ``select`` loop.

    All connections are started first without waiting for any of them.  The
    event loop then sends each request exactly once, as soon as its socket
    becomes writable, and collects data from readable sockets until the
    server closes the connection.

    Note that ``connect`` still resolves host names synchronously; only the
    TCP handshake and the data transfer are non-blocking.

    :param hosts: iterable of host names
    :param port: TCP port to connect to on every host
    :param timeout: seconds each ``select`` call may wait
    :return: list of ``(host, response_bytes)`` tuples in completion order;
        a connection that failed contributes whatever was received
        (possibly ``b''``)
    """
    hosts_by_sock = {}
    for host in hosts:
        client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        client.setblocking(False)
        try:
            # Returns immediately; the handshake continues in the background
            # and Python reports that as BlockingIOError.
            client.connect((host, port))
        except BlockingIOError as e:
            print(e)
        hosts_by_sock[client] = host

    unsent = list(hosts_by_sock)  # sockets whose request is not sent yet
    chunks = {sock: [] for sock in hosts_by_sock}
    results = []

    def finish(sock):
        """Close ``sock`` and record everything received from it."""
        if sock in unsent:
            unsent.remove(sock)
        sock.close()
        results.append((hosts_by_sock.pop(sock), b''.join(chunks.pop(sock))))

    # Event loop: runs until every connection has been completed.
    while hosts_by_sock:
        readable, writable, _ = select.select(
            list(hosts_by_sock), unsent, [], timeout)

        # Writable means the connection attempt has finished.
        for sock in writable:
            try:
                sock.send(build_request(hosts_by_sock[sock]))
            except OSError:
                finish(sock)  # the connection attempt failed
            else:
                # Stop watching for writability so the request is not
                # sent again on the next pass.
                unsent.remove(sock)

        # Readable means response data (or end-of-stream) is available.
        for sock in readable:
            if sock not in hosts_by_sock:
                continue  # already finished in the writable pass above
            try:
                data = sock.recv(1024)
            except BlockingIOError:
                continue
            except OSError:
                data = b''
            if data:
                chunks[sock].append(data)
            else:
                finish(sock)  # empty read: the server closed the connection
    return results


if __name__ == "__main__":
    for host, response in fetch_all(HOSTS):
        print(host)
        print(response)
异步非阻塞模块的原理(socket 客户端):Twisted、Tornado 内部的实现原理就是这样。
import select
import socket


class Request(object):
    """Pair a socket with the request description it belongs to.

    ``select`` accepts any object that has a ``fileno()`` method, so wrapping
    the socket lets the event loop recover ``info`` (host, path, callback)
    for whichever connection became ready.
    """

    def __init__(self, sock, info):
        self.sock = sock
        self.info = info
        # Response chunks received so far, joined when the request finishes.
        self.chunks = []

    def fileno(self):
        """Return the wrapped socket's file descriptor (used by ``select``)."""
        return self.sock.fileno()


class Custom(object):
    """A minimal asynchronous HTTP client driven by a ``select`` event loop."""

    def __init__(self):
        self.sock_list = []  # requests still waiting for their response
        self.conns = []      # requests whose HTTP request is not sent yet

    def add_reqeust(self, req_info):
        """Start a non-blocking connection for ``req_info`` and register it.

        :param req_info: dict such as
            ``{'host': 'www.baidu.com', 'port': 80, 'path': '/',
            'callback': func}``; ``callback`` is called with the response
            bytes once the request has finished
        """
        sock = socket.socket()
        sock.setblocking(False)
        try:
            sock.connect((req_info['host'], req_info['port']))
        except BlockingIOError:
            # Expected: the handshake is still in progress.
            pass
        obj = Request(sock, req_info)
        self.sock_list.append(obj)
        self.conns.append(obj)

    # Correctly spelled alias; the original name is kept for existing callers.
    add_request = add_reqeust

    def _finish(self, obj):
        """Unregister ``obj``, close its socket and hand over the response."""
        if obj in self.conns:
            self.conns.remove(obj)
        self.sock_list.remove(obj)
        obj.sock.close()
        print(obj.info['host'])
        obj.info['callback'](b''.join(obj.chunks))

    def run(self):
        """Run the event loop until every registered request has finished.

        A request finishes when the server closes the connection or when the
        connection fails; in both cases its callback receives the bytes
        collected so far (possibly ``b''``).
        """
        while self.sock_list:
            readable, writable, _ = select.select(
                self.sock_list, self.conns, [], 0.05)

            # Writable means the connection attempt has finished.
            for obj in writable:
                # "Connection: close" makes the server close the socket after
                # the response, which is how the end of the response is seen.
                data = "GET %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n" % (
                    obj.info['path'], obj.info['host'])
                try:
                    obj.sock.send(data.encode('utf-8'))
                except OSError:
                    self._finish(obj)  # the connection attempt failed
                else:
                    # Sent once; stop watching this socket for writability.
                    self.conns.remove(obj)

            # Readable means response data (or end-of-stream) is available.
            for obj in readable:
                if obj not in self.sock_list:
                    continue  # already finished in the writable pass above
                try:
                    chunk = obj.sock.recv(8192)
                except BlockingIOError:
                    continue
                except OSError:
                    chunk = b''
                if chunk:
                    obj.chunks.append(chunk)
                else:
                    self._finish(obj)  # empty read: the server closed


def done(response):
    """Default callback: print the raw response bytes."""
    print(response)


url_list = [
    {'host': 'www.baidu.com', 'port': 80, 'path': '/', 'callback': done},
    {'host': 'www.bing.com', 'port': 80, 'path': '/', 'callback': done},
] + [
    {'host': 'www.cnblogs.com', 'port': 80, 'path': '/', 'callback': done}
    for _ in range(10)
]


if __name__ == "__main__":
    custom = Custom()
    for item in url_list:
        custom.add_reqeust(item)
    custom.run()

浙公网安备 33010602011771号