高性能-爬虫原理

 socket 原理:爬虫本质上就是一个 socket 客户端去连接一个 socket 服务端;客户端连接上服务端之后,只是按 HTTP 协议封装了请求头以及要访问的地址,再发送出去。

阻塞:

import socket

##################### Blocking request
# (host, port) of the server the request is sent to.
ip_post = ('43.226.160.17', 80)

# The context manager closes the socket even when connect/send/recv raises.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    # Blocking mode: connect() only returns once the connection is established.
    s.connect(ip_post)

    # Hand-written HTTP/1.0 request; with HTTP/1.0 the server closes the
    # connection after the response, which is what ends the read loop below.
    pro = b"GET / HTTP/1.0\r\nHost: dig.chouti.com\r\n\r\n"
    print(pro)
    # sendall() keeps writing until the whole request is out; send() may
    # transmit only part of the buffer.
    s.sendall(pro)

    # A single recv() returns at most one chunk, so keep reading until the
    # peer signals end-of-stream with an empty bytes object.
    chunks = []
    while True:
        chunk = s.recv(8096)
        if not chunk:
            break
        chunks.append(chunk)

feedback = b"".join(chunks)
# The body is not guaranteed to be valid UTF-8; replace undecodable bytes
# instead of crashing with UnicodeDecodeError.
print(feedback.decode('utf-8', errors='replace'))

 

非阻塞:高性能的 HTTP 客户端用的就是这种请求方式——connect 调用立即返回,请求只管发出去,不等待结果;而阻塞方式要等连接建立成功之后才能继续发送消息。

import socket
import time

# (host, port) of the server the request is sent to.
ip_post = ('43.226.160.17', 80)
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
print('这是非阻塞')
# Non-blocking mode: connect() returns immediately and reports the still
# pending connection by raising BlockingIOError, hence the try/except below.
s.setblocking(False)

try:
    try:
        # The connection attempt has been started by the time this raises.
        s.connect(ip_post)
    except BlockingIOError as e:
        print(e)

    # Crude stand-in for an event loop: give the connection time to complete
    # before writing to the socket.
    time.sleep(5)
    pro = b"GET / HTTP/1.0\r\nHost: dig.chouti.com\r\n\r\n"
    print(pro)
    s.send(pro)

    # Likewise, give the server time to answer before the non-blocking recv().
    time.sleep(3)
    feedback = s.recv(8096)
    print(feedback)
finally:
    # Release the socket even if send()/recv() raised.
    s.close()

 

IO多路复用:用来同时检测【多个】socket 对象是否有变化(连接是否成功、是否有数据可读)。

import socket
import select 



# Hosts to fetch. The original snippet had a pseudo-code placeholder meaning
# "100 URLs" here; substitute the real list of host names.
host_list = ['www.baidu.com', 'www.bing.com', 'www.cnblogs.com']

socket_list = []  # sockets that are still waiting for (more of) a response
unsent = {}       # socket -> host, for sockets whose request is not sent yet
for i in host_list:
    # Create the socket object.
    client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # Non-blocking mode makes connect() raise BlockingIOError while the
    # connection is still in progress, so it has to be wrapped in try.
    client.setblocking(False)
    try:
        client.connect((i, 80))  # the connection attempt is now under way
    except BlockingIOError as e:
        print(e)
    socket_list.append(client)
    unsent[client] = i

# Event loop (event driven): runs until every socket has been closed.
while socket_list:
    # Only sockets with an unsent request are watched for writability;
    # otherwise every connected socket would be reported on every pass and
    # the request would be sent over and over again.
    r, w, e = select.select(socket_list, list(unsent), [], 0.05)

    # w: the connection attempt finished, so the request can be written.
    for obj in w:
        host = unsent.pop(obj)
        request = 'GET / HTTP/1.0\r\nHost: %s\r\n\r\n' % host
        try:
            # Sockets carry bytes, not str.
            obj.send(request.encode('utf-8'))
        except OSError as exc:
            # A failed connection is also reported as writable; drop it.
            print(host, exc)
            socket_list.remove(obj)
            obj.close()

    # r: data is available to recv().
    for obj in r:
        if obj not in socket_list:
            continue  # already dropped in the write phase above
        try:
            response = obj.recv(1024)
        except BlockingIOError:
            continue  # nothing to read after all; retry on the next pass
        except OSError as exc:
            print(exc)
            response = b''
        if response:
            print(response)
        else:
            # Empty read means the server closed the connection: finished.
            unsent.pop(obj, None)
            socket_list.remove(obj)
            obj.close()

     

 异步非阻塞模块原理(socket 客户端):Twisted、Tornado 的内部实现就是这样。

import select

import socket

class Request(object):
    """Pair a socket with the request description that belongs to it.

    The wrapper exposes ``fileno()`` so an instance can be handed to
    ``select.select`` in place of the raw socket, while still carrying the
    request info along.
    """

    def __init__(self, sock, info):
        self.sock, self.info = sock, info

    def fileno(self):
        """Return the file descriptor of the wrapped socket."""
        descriptor = self.sock.fileno()
        return descriptor


class Custom(object):
    """Minimal select()-based HTTP client that drives many requests at once.

    Requests are registered with ``add_request`` and processed by ``run``,
    which sends each request once its connection is ready, collects the
    response until the server closes the connection, and then passes the
    response bytes to the request's ``callback``.
    """

    def __init__(self):
        # Requests still waiting for their response (watched for reads).
        self.sock_list = []
        # Requests whose HTTP request has not been sent yet (watched for writes).
        self.conns = []

    def add_request(self, req_info):
        """Start a non-blocking connection for one request and register it.

        :param req_info: dict describing the request, e.g.
            ``{'host': 'www.baidu.com', 'port': 80, 'path': '/',
            'callback': func}``; ``callback`` is called by ``run`` with the
            response bytes.
        :return: None
        :raises OSError: if the connection attempt fails immediately for a
            reason other than "still in progress".
        """
        sock = socket.socket()
        sock.setblocking(False)

        try:
            sock.connect((req_info['host'], req_info['port']))
        except BlockingIOError:
            # Expected for a non-blocking socket: the connection attempt has
            # been started and completes in the background.
            pass
        except OSError:
            # Genuine failure: do not leak the descriptor.
            sock.close()
            raise
        obj = Request(sock, req_info)
        print(obj.sock.fileno(), '------')
        self.sock_list.append(obj)
        self.conns.append(obj)

    # Original (misspelled) method name, kept so existing callers still work.
    add_reqeust = add_request

    def _discard(self, obj):
        """Stop tracking *obj* and close its socket."""
        if obj in self.conns:
            self.conns.remove(obj)
        if obj in self.sock_list:
            self.sock_list.remove(obj)
        obj.sock.close()

    def run(self):
        """Run the event loop until every registered request has finished.

        A request finishes when the server closes the connection (its
        callback then receives the whole response) or when its socket
        reports an error (the error is printed and no callback is made).

        :return: None
        """
        buffers = {}  # request object -> list of response chunks received so far

        # Loop on the condition rather than breaking at the end, so calling
        # run() with nothing registered never calls select() on empty lists.
        while self.sock_list:
            # select() needs objects with a fileno() method; the request
            # wrappers provide it, so the loop gets back the object that also
            # carries the request info.
            r, w, e = select.select(self.sock_list, self.conns, [], 0.05)

            # w: connection attempt finished -> send the request exactly once.
            for obj in w:
                # "HTTP" must be upper case in the request line.
                # "Connection: close" makes the server close the connection
                # after the response, which marks the end of the data.
                data = "GET %s HTTP/1.1\r\nhost:%s\r\nConnection: close\r\n\r\n" % (
                    obj.info['path'], obj.info['host'])
                try:
                    obj.sock.send(data.encode('utf-8'))
                except BlockingIOError:
                    continue  # not writable after all; retry on the next pass
                except OSError as exc:
                    # A failed connection is reported as writable too.
                    print(obj.info['host'], exc)
                    self._discard(obj)
                    continue
                # Sent: stop watching for writability so it is not sent twice.
                self.conns.remove(obj)

            # r: response data (or end-of-stream) is available.
            for obj in r:
                if obj not in self.sock_list:
                    continue  # already discarded in the write phase above
                try:
                    chunk = obj.sock.recv(8192)
                except BlockingIOError:
                    continue
                except OSError as exc:
                    print(obj.info['host'], exc)
                    buffers.pop(obj, None)
                    self._discard(obj)
                    continue
                if chunk:
                    # More data may follow; keep the request registered.
                    buffers.setdefault(obj, []).append(chunk)
                    continue

                # Empty read: the server closed the connection, so the
                # response is complete. One connection serves one request.
                print(obj.info['host'])
                self._discard(obj)
                obj.info['callback'](b''.join(buffers.pop(obj, [])))


def done(response):
    """Completion callback: print the response that was received."""
    print(response)


# Request table: one entry each for baidu and bing, then ten for cnblogs.
# Every entry is its own dict with the same port, path and callback.
url_list = [
    {'host': host, 'port': 80, 'path': '/', 'callback': done}
    for host in ['www.baidu.com', 'www.bing.com'] + ['www.cnblogs.com'] * 10
]


# Register every request, then run the event loop until all have completed.
custom = Custom()
for item in url_list:
    custom.add_reqeust(item)
custom.run()

 

posted @ 2017-08-31 09:57  tonycloud  阅读(282)  评论(0)    收藏  举报