Improving the Website Resource Discovery Tool (Adding Proxy Support)

This post adjusts the website resource discovery tool and adds proxy support, so that repeated requests do not get our IP banned by the target site.
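For background, requests sends a request through a proxy when it is given a proxies mapping keyed by URL scheme; a minimal sketch (the proxy address is a made-up example):

import requests

# One scheme -> proxy-URL entry; the address is hypothetical.
proxy = {'http': 'http://1.2.3.4:8080'}

# requests routes the request through the proxy whose key matches
# the URL scheme of the request ('http' here).
r = requests.get('http://example.com/', proxies=proxy, timeout=5)
print(r.status_code)

The rotation idea below is simply to scrape a list of such dicts and pick a different one for each request. First, the proxy-harvesting function: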

# Fetch proxies from the listing page and append them to agent_lists
def agent_list(url):
    global agent_lists
    agent_lists = []
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'}
    r = requests.get(url, headers=header)
    # Skip the header row of the ip_list table; each remaining <tr> is one proxy.
    agent_info = BeautifulSoup(r.content, 'html.parser').find(id="ip_list").find_all('tr')[1:]
    for i in range(len(agent_info)):
        info = agent_info[i].find_all('td')
        # Column 1 holds the IP, column 2 the port, column 5 the scheme (HTTP/HTTPS);
        # requests matches proxies keys case-sensitively, so lower-case the scheme.
        agents = {info[5].string.lower(): 'http://' + info[1].string + ':' + info[2].string}
        agent_lists.append(agents)
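Once agent_list has run, every entry of agent_lists is a single-entry dict already in the shape requests expects, so each request can grab one at random (the addresses below are invented for illustration):

# agent_lists might now look like:
#   [{'http': 'http://1.2.3.4:8080'}, {'http': 'http://5.6.7.8:3128'}, ...]
proxy = random.choice(agent_lists)   # one random proxy per request
r = requests.get('http://example.com/', proxies=proxy, timeout=5)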

The modified website resource scanner:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# by 默不知然 2018-03-15

import threading
from bs4 import BeautifulSoup
import time
import sys
import requests
import getopt
import random

# Worker thread: requests one candidate URL through a random proxy
class scan_thread(threading.Thread):
    global real_url_list
    real_url_list = []          # URLs that answered 200, shared by all threads
    def __init__(self, url):
        threading.Thread.__init__(self)
        self.url = url

    def run(self):
        try:
            proxy = random.choice(agent_lists)
            # Time out so a dead proxy cannot hang the thread forever.
            r = requests.get(self.url, proxies=proxy, timeout=10)
            print(self.url, '------->>', r.status_code)
            if r.status_code == 200:
                real_url_list.append(self.url)
        except Exception as e:
            print(e)
        finally:
            # Always free our slot in the thread counter, even after an error;
            # otherwise a failed request would leak a slot and stall the scan.
            l[0] = l[0] - 1

# Read the wordlist, build candidate URLs, and launch scan threads
def url_makeup(dicts, url, threshold):
    global url_list
    global l
    url_list = []
    l = []          # single-element list used as a mutable in-flight thread counter
    l.append(0)
    dic = str(dicts)
    with open(dic, 'r') as f:
        code_list = f.readlines()
        for i in code_list:
            url_list.append(url + i.replace('\r', '').replace('\n', ''))
    while len(url_list):
        try:
            # Throttle: only spawn a new thread while fewer than `threshold`
            # requests are in flight.
            if l[0] < threshold:
                n = url_list.pop(0)
                l[0] = l[0] + 1
                thread = scan_thread(n)
                thread.start()
        except KeyboardInterrupt:
            print('Interrupted by user; stopping the directory scan.')
            sys.exit()

# Parse command-line arguments
def get_args():
    global get_url
    global get_dicts
    global get_threshold
    try:
        options, args = getopt.getopt(sys.argv[1:], "u:f:n:")
    except getopt.GetoptError:
        print('Invalid arguments. Usage: -u <url> -f <wordlist> -n <threads>')
        sys.exit()
    for option, arg in options:
        if option == '-u':
            get_url = arg
        if option == '-f':
            get_dicts = arg
        if option == '-n':
            get_threshold = int(arg)

# Fetch proxies from the listing page and append them to agent_lists
def agent_list(url):
    global agent_lists
    agent_lists = []
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'}
    r = requests.get(url, headers=header)
    # Skip the header row of the ip_list table; each remaining <tr> is one proxy.
    agent_info = BeautifulSoup(r.content, 'html.parser').find(id="ip_list").find_all('tr')[1:]
    for i in range(len(agent_info)):
        info = agent_info[i].find_all('td')
        # Column 1 holds the IP, column 2 the port, column 5 the scheme (HTTP/HTTPS);
        # requests matches proxies keys case-sensitively, so lower-case the scheme.
        agents = {info[5].string.lower(): 'http://' + info[1].string + ':' + info[2].string}
        agent_lists.append(agents)

# Main entry point: fetch proxies, parse arguments, run the scan
def main():
    agent_url = 'http://www.xicidaili.com/nn/1'
    agent_list(agent_url)
    get_args()
    url = get_url
    dicts = get_dicts
    threshold = get_threshold
    url_makeup(dicts, url, threshold)
    time.sleep(0.5)     # crude wait for the last in-flight requests to finish
    print('Directories found on the target site:', '\n', real_url_list)
    with open('scan_results.txt', 'w') as f:
        for i in real_url_list:
            f.write(i)
            f.write('\n')

if __name__ == '__main__':
    main()
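A typical invocation, assuming the script is saved as scan.py and dict.txt is a one-path-per-line wordlist (both file names are illustrative):

python3 scan.py -u http://www.example.com/ -f dict.txt -n 20

Here -u is the base URL each wordlist entry is appended to, -f is the wordlist file, and -n is the maximum number of concurrent scan threads.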


Scan results against a sample site:

[scan output screenshot]
