aiohttp 异步爬虫实战
想检测一下内网开放8080端口提供http服务的计算机,写了以下代码:
# 导入 requests 包
import requests,pathlib
# 发送请求
def req(ip):
    """Probe *ip* with an HTTP GET; on HTTP 200, append the URL to 1.txt.

    ip: full URL string, e.g. "http://192.168.1.1:8080".

    Connection errors and timeouts are the normal outcome for closed
    ports during a sweep, so they are deliberately ignored — but the
    except clause is narrowed to requests' own errors so that real
    bugs (e.g. a NameError) are no longer silently swallowed.
    """
    print(ip)
    try:
        # Short timeout: most of the scanned hosts will not answer.
        r = requests.get(ip, timeout=0.5)
    except requests.RequestException:
        # Host down / port closed / timed out — expected during a scan.
        return
    if r.status_code == 200:
        # Append mode so hits accumulate across calls; the context
        # manager guarantees the handle is flushed and closed.
        with pathlib.Path("1.txt").open("a") as f:
            f.write(ip + "\n")
# Sweep every host 192.168.x.y (x, y in 1..254) on port 8080.
for third_octet in range(1, 255):
    for fourth_octet in range(1, 255):
        req(f"http://192.168.{third_octet}.{fourth_octet}:8080")
print("完成")
然而上述代码是同步执行(写文件的用法也不对),太慢了。
于是改成用 aiohttp 异步方式:
import asyncio
import aiohttp,pathlib,time
# Cap on simultaneous requests, kept below the per-process open-file
# limit (Linux default is 1024; Windows select() defaults to 509).
CONCURRENCY = 500  # Linux max open files is 1024; Windows defaults to 509
semaphore = asyncio.Semaphore(CONCURRENCY)
# aiohttp.ClientSession shared by all probes; assigned in main() before
# any scrape_api() task runs.
session = None
async def scrape_api(URL):
    """Probe one URL; return the ClientResponse on HTTP 200, else None.

    The module-level semaphore caps how many probes run at once so we
    stay under the OS open-file limit.  Connection failures and
    timeouts are the normal result for closed ports and yield None.

    Fixes vs. the original: the bare ``except: pass`` is narrowed so
    programming errors surface, and the stray ``await asyncio.sleep(1)``
    is removed — it added a pointless one-second delay to every
    successful connection while the semaphore already throttles.
    """
    async with semaphore:
        print(URL)
        try:
            async with session.get(URL) as response:
                # NOTE: the connection is released when this ``async
                # with`` exits; callers should only rely on metadata
                # such as ``response.status``, not the body.
                if response.status == 200:
                    return response
                return None
        except (aiohttp.ClientError, asyncio.TimeoutError, OSError):
            # Closed port / unreachable host / timeout — expected.
            return None
async def main():
    """Fan out one probe per 192.168.x.y:8080 and record hits in 1.txt.

    Fixes vs. the original: the ClientSession is now closed via
    ``async with`` (it was leaked), the output file handle is closed
    via ``with`` (``print(..., file=open(...))`` leaked it), and the
    None filter uses ``is not None``.
    """
    global session
    timeout = aiohttp.ClientTimeout(total=10)
    # ``async with`` guarantees the session and its connector are
    # closed even if gather() raises.
    async with aiohttp.ClientSession(timeout=timeout) as session:
        urls = [
            f"http://192.168.{i}.{j}:8080"
            for i in range(1, 255)
            for j in range(1, 255)
        ]
        scrape_index_tasks = [asyncio.ensure_future(scrape_api(u)) for u in urls]
        results = await asyncio.gather(*scrape_index_tasks)
    hits = [r for r in results if r is not None]
    with open(r'./1.txt', 'w') as f:
        print(*hits, file=f)
if __name__ == '__main__':
    time_start = time.time()
    # asyncio.run() creates, runs and closes the event loop for us;
    # get_event_loop().run_until_complete() is deprecated when called
    # outside a running loop (since Python 3.10).
    asyncio.run(main())
    time_end = time.time()
    print('耗时:', time_end - time_start)
运行一遍大约半小时
还可以考虑改用 socket 直接探测端口(参考 https://fiime.cn/blog/218138),如:
import socket

target_host = "192.168.1.5"
target_port = 445

# Create a TCP socket; the ``with`` block guarantees it is closed even
# if an error occurs (the original never closed it on failure paths).
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:
    # connect_ex() returns 0 on success and an errno on failure instead
    # of raising, which makes it well suited to port scanning.
    res = client.connect_ex((target_host, target_port))
    print(res)
参考:https://blog.csdn.net/rhx_qiuzhi/article/details/124332114
https://blog.csdn.net/weixin_38819889/article/details/108632640
浙公网安备 33010602011771号