import socket
from urllib.parse import urlparse
from selectors import DefaultSelector, EVENT_READ, EVENT_WRITE
# Flag read by the event loop on every iteration; set to True to make it exit.
stop = False

# URLs whose fetch has not completed yet; a fetcher removes its URL when done.
urls = [
    "http://app2.jg.eastmoney.com/m/ScienceBoardTopic/index.html#/topic",
]

# Selector shared by every fetcher and by the event loop.
selecter = DefaultSelector()
# Request HTML over a raw socket.
# Perform the HTTP request with select (via the selectors module).
# select + callbacks + event loop
class Fetcher:
    """Fetch one URL over plain HTTP using a non-blocking socket and callbacks.

    The work is split into three steps driven by the module-level selector
    ``selecter``:

    1. ``get_url`` starts a non-blocking connect and registers ``connected``
       for write-readiness.
    2. ``connected`` sends the GET request and registers ``readable`` for
       read-readiness.
    3. ``readable`` accumulates the response until the peer closes the
       connection, prints the body, and updates the module-level ``urls``
       list and ``stop`` flag.

    Only plain HTTP is spoken; no TLS handshake is performed.
    """

    @staticmethod
    def _split_url(url):
        """Return ``(host_header, connect_host, port, request_target)`` for *url*.

        ``host_header`` is the raw network location (it may contain a port),
        ``connect_host`` is the bare host name, ``port`` defaults to 80 and
        ``request_target`` is the path plus the query string, if any.  The
        fragment is never sent to the server.
        """
        parts = urlparse(url)
        target = parts.path or "/"
        if parts.query:
            # The query string is part of the resource identity; dropping it
            # would request a different resource.
            target = "{}?{}".format(target, parts.query)
        return parts.netloc, parts.hostname, parts.port or 80, target

    def connected(self, key):
        """Selector callback: the socket became writable, so send the request.

        Args:
            key: the ``SelectorKey`` of the socket that became writable.
        """
        # Stop watching for write-readiness; from now on only reads matter.
        selecter.unregister(key.fd)
        request = "GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(self.path, self.host)
        self.client.send(request.encode("utf8"))
        selecter.register(self.client.fileno(), EVENT_READ, self.readable)

    def readable(self, key):
        """Selector callback: read one chunk; on end-of-stream finish the fetch.

        While data keeps arriving it is appended to ``self.data``.  An empty
        read means the server closed the connection (the request asks for
        ``Connection:close``), at which point the body is printed, the socket
        is closed and the global bookkeeping is updated.

        Args:
            key: the ``SelectorKey`` of the socket that became readable.
        """
        global stop

        chunk = self.client.recv(1024)
        if chunk:
            self.data += chunk
            return

        selecter.unregister(key.fd)
        self.client.close()

        # Undecodable bytes are replaced instead of aborting the event loop
        # with UnicodeDecodeError on a non-UTF-8 page.
        text = self.data.decode("utf8", errors="replace")
        # Everything after the first blank line is the response body.
        print(text.partition("\r\n\r\n")[2])

        # The URL may legitimately be absent from ``urls`` (the caller fetched
        # something that was never listed); a bare remove() would then raise
        # ValueError inside the event loop.
        if self.spider_url in urls:
            urls.remove(self.spider_url)

        # Stop when the pending list is drained, or when no socket is left
        # registered: in that case no further event can ever arrive.
        if not urls or not selecter.get_map():
            stop = True

    def get_url(self, url):
        """Start fetching *url*; the body is printed once fully received.

        Args:
            url: absolute ``http://`` URL.  An explicit port is honoured,
                otherwise port 80 is used.

        Raises:
            ValueError: if *url* contains no host name.
        """
        host_header, connect_host, port, target = self._split_url(url)
        if not connect_host:
            raise ValueError("URL has no host: {!r}".format(url))

        self.spider_url = url
        self.data = b""
        self.host = host_header
        self.port = port
        self.path = target

        # Create the socket and start the connection.
        self.client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.client.setblocking(False)
        try:
            self.client.connect((connect_host, port))
        except BlockingIOError:
            # Expected: a non-blocking connect returns before the handshake
            # completes; write-readiness signals completion later.
            pass

        # Register for write-readiness to learn when the connect has finished.
        selecter.register(self.client.fileno(), EVENT_WRITE, self.connected)
# Event loop
def loop():
    """Dispatch selector events to their callbacks until there is nothing left to do.

    Each registered socket carries its callback as the selector key's
    ``data``; the callback is invoked with the key when the socket is ready.

    The loop ends when the module-level ``stop`` flag becomes true, or when
    no socket is registered any more.  Without the second condition a
    ``select()`` call on an empty selector would wait for an event that can
    never arrive.
    """
    while not stop and selecter.get_map():
        for key, _mask in selecter.select():
            callback = key.data
            callback(key)
if __name__ == "__main__":
    # Start one fetch per configured URL.  Fetcher.readable removes its own
    # URL from `urls` and only raises the stop flag once that list is empty,
    # so fetching a URL that is not listed there makes the removal fail and
    # the loop never stop.
    pending = list(urls)
    for pending_url in pending:
        Fetcher().get_url(pending_url)
    # With nothing started there is no event to wait for.
    if pending:
        loop()