Sharing a Handy JS Sensitive-Information Collection Script

0x00 Purpose:

After you enter a target URL, the script fetches that page and every JavaScript file it references, then matches the corresponding values against a user-defined list of keywords, making JS information gathering both convenient and flexible.
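For example, if the keyword list contains apikey, the kind of assignment the crawler is meant to pick up looks like the line in this minimal sketch (the sample JS string and value are made up for illustration; the regex is the same kind of key/value pattern safe_extract() builds below, hard-coded here for a single keyword):

import re

# A made-up JS snippet of the kind the crawler downloads
sample_js = 'var config = { apiKey: "AKIAEXAMPLE123", debug: false };'

# Key, separator, optionally quoted value, followed by a terminator
pattern = r'(["\']?)\b(apikey)\b(["\']?)\s*[:=]\s*(["\']?)(.*?)(["\']?)(?=\s*[,;}\n])'
match = re.search(pattern, sample_js, re.IGNORECASE)
print(match.group(2), '=', match.group(5))  # prints: apiKey = AKIAEXAMPLE123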

 

0x01 Environment:

Python 3
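The script also depends on three third-party packages (requests, beautifulsoup4, openpyxl), as listed in its imports. If they are not already installed, something like the following should set them up (the exact command depends on your environment):

pip install requests beautifulsoup4 openpyxl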

 

0x02 Script:

import requests
import re
import time
import random
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill


class JSAnalyzerPro:
    def __init__(self):
        self.session = requests.Session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/120.0.0.0 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate'
        }
        # Apply the custom headers to the session so every request actually uses them
        self.session.headers.update(self.headers)
        self.visited = set()
        self.domain = ""
        self.target_url = ""
        self.keywords = []
        self.max_depth = 2
        self.results = []

    def configure(self, target_url, keywords):
        """Validate and store the configuration parameters"""
        try:
            # Auto-complete the URL scheme if it is missing
            if not target_url.startswith(('http://', 'https://')):
                target_url = f'https://{target_url}'

            parsed = urlparse(target_url)
            self.domain = parsed.netloc
            self.target_url = target_url  # keep the normalized URL for crawling

            # Keyword handling (typo correction + deduplication)
            self.keywords = list({k.strip().lower().replace('secert', 'secret')
                                  for k in keywords.split(',')})

            # Verify that the target is reachable
            test_res = self.session.head(target_url, timeout=10)
            test_res.raise_for_status()

            print(f"[Check] Successfully connected to target: {target_url}")

        except Exception as e:
            raise ConnectionError(f"Configuration failed: {str(e)}")

    def safe_extract(self, content):
        """Enhanced key/value extraction (with debug output)"""
        try:
            pattern = r'''
            (["']?)\b({})\b(["']?)   # key
            \s*[:=]\s*               # separator
            (["']?)(.*?)(["']?)      # value
            (?=\s*[,;}}\n])          # terminator
            '''.format('|'.join(map(re.escape, self.keywords)))

            matches = re.findall(
                pattern,
                content,
                flags=re.IGNORECASE | re.VERBOSE | re.DOTALL
            )

            processed = []
            for q1, key, q2, q3, value, q4 in matches:
                clean_key = key.strip('\'"')
                clean_value = value.strip('\'"')
                if clean_key.lower() in self.keywords:
                    print(f"[Extract] Match found: {clean_key} = {clean_value[:30]}...")  # debug output
                    processed.append((clean_key, clean_value))
            return processed

        except Exception as e:
            print(f"[Warning] Regex parsing error: {str(e)}")
            return []

    def crawl_js(self, url, depth=0):
        """Recursive crawling with debug logging"""
        if depth > self.max_depth or url in self.visited:
            return

        print(f"\n[Depth {depth}] Analyzing: {url}")
        self.visited.add(url)

        try:
            # Request tuning
            time.sleep(random.uniform(1.5, 3.0))  # longer randomized delay
            verify_ssl = False  # temporarily disable SSL verification

            if url.startswith('http'):
                response = self.session.get(
                    url,
                    timeout=20,
                    verify=verify_ssl,
                    allow_redirects=True
                )
                response.encoding = 'utf-8'  # force UTF-8 decoding
                content = response.text
                print(f"[Request] Content fetched, length: {len(content)} characters")
            else:
                content = url

            # Run the extraction
            matches = self.safe_extract(content)
            for key, value in matches:
                self.results.append({
                    'source': url,
                    'keyword': key,
                    'value': value,
                    'depth': depth
                })

            # Depth control
            if depth < self.max_depth:
                print(f"[Depth] Descending to level {depth + 1}...")
                soup = BeautifulSoup(content, 'html.parser') if depth == 0 else None
                for js_url in self.find_js_links(content, soup, url):
                    self.crawl_js(js_url, depth + 1)

        except Exception as e:
            print(f"[Error] Failed to process {url}: {str(e)}")

    def find_js_links(self, content, soup, base_url):
        """Improved link discovery"""
        links = set()

        # Parse HTML tags
        if soup:
            for tag in soup.find_all(['script', 'link']):
                src = None
                if tag.name == 'script' and tag.has_attr('src'):
                    src = tag['src']
                elif tag.name == 'link' and 'script' in tag.get('as', ''):
                    src = tag.get('href', '')

                if src:
                    absolute_url = urljoin(base_url, src.split('?')[0])
                    if self.is_valid_url(absolute_url):
                        links.add(absolute_url)

        # Parse JS code
        patterns = [
            r'\.src\s*=\s*["\'](.*?)["\']',
            r'load\(["\'](.*?)["\']\)',
            r'fetch\(["\'](.*?)["\']\)'
        ]
        for pattern in patterns:
            for match in re.finditer(pattern, content, re.IGNORECASE):
                absolute_url = urljoin(base_url, match.group(1))
                if self.is_valid_url(absolute_url):
                    links.add(absolute_url)

        print(f"[Links] Found {len(links)} new links")
        return [link for link in links if link not in self.visited]

    def is_valid_url(self, url):
        """Stricter URL validation"""
        parsed = urlparse(url)
        # Allow relative paths and same-domain resources
        return not parsed.netloc or parsed.netloc == self.domain

    def generate_report(self):
        """Report generation with validation"""
        if not self.results:
            print("[Warning] No matches found; skipping empty report")
            return

        try:
            wb = Workbook()
            ws = wb.active
            ws.title = "Security Scan Report"

            # Header row
            headers = ["Source URL", "Keyword", "Extracted Value", "Depth", "Risk Level"]
            ws.append(headers)

            # Header styling
            header_fill = PatternFill(start_color="2F5496", fill_type="solid")
            for col in ws.iter_cols(min_row=1, max_row=1):
                for cell in col:
                    cell.font = Font(bold=True, color="FFFFFF")
                    cell.fill = header_fill

            # Data rows
            risk_keywords = {'password', 'secret', 'key', 'token'}
            for item in self.results:
                risk = "High" if item['keyword'].lower() in risk_keywords else "Medium"
                ws.append([
                    item['source'],
                    item['keyword'],
                    item['value'],
                    item['depth'],
                    risk
                ])

            # Auto-fit column widths
            for col in ws.columns:
                max_len = max(len(str(cell.value)) for cell in col)
                adjusted_width = (max_len + 2) * 1.2
                ws.column_dimensions[col[0].column_letter].width = adjusted_width

            # Build the file name
            domain_clean = self.domain.replace('.', '_').replace(':', '')
            filename = f"security_report_{domain_clean}_{int(time.time())}.xlsx"

            # Final check before saving
            if len(ws['A']) > 1:  # make sure there is at least one data row
                wb.save(filename)
                print(f"[Report] Saved to: {filename}")
            else:
                print("[Warning] Empty sheet detected; save cancelled")

        except Exception as e:
            print(f"[Error] Report generation failed: {str(e)}")
            raise


if __name__ == "__main__":
    analyzer = JSAnalyzerPro()

    try:
        target = input("Enter the target URL: ").strip()
        keys = input("Enter the keywords to extract (comma-separated): ").strip()

        analyzer.configure(target, keys)
        # Crawl the normalized URL stored by configure(), so targets entered
        # without a scheme (e.g. example.com) are still treated as URLs
        analyzer.crawl_js(analyzer.target_url)
        analyzer.generate_report()

        print(f"[Done] Scan finished; {len(analyzer.results)} sensitive entries found")

    except Exception as e:
        print(f"[Fatal] Program failed: {str(e)}")
        print("Troubleshooting suggestions:")
        print("1. Check network connectivity and proxy settings")
        print("2. Try lowering the maximum crawl depth (max_depth)")
        print("3. Install the latest dependencies: pip install --upgrade requests beautifulsoup4 openpyxl")

 

0x03 Results:

 

 

 
