【工具集】数据提取工具
数据提取工具
用 Python 写了一个数据提取工具,用于从大批量的杂乱数据中提取 ips、urls、domains。
代码
import re
import tkinter as tk
from tkinter import filedialog, messagebox, scrolledtext
import threading
# Regular expressions used to pull indicators out of free-form text.

# Domain name: one or more dot-terminated labels (1-63 chars of letters,
# digits or hyphens, not starting or ending with a hyphen) followed by an
# alphabetic top-level label of 2-6 letters.
domain_pattern = re.compile(r'\b((?!-)[A-Za-z0-9-]{1,63}(?<!-)\.)+[A-Za-z]{2,6}\b')

# http/https URL. NOTE: inside the character class, `$-_` is a *range*
# (ASCII 0x24-0x5F), not three literals. That range is what admits '/', ':',
# '?' and '=' in paths and query strings, so it must not be "fixed" by
# escaping the hyphen.
url_pattern = re.compile(r'https?://(?:[a-zA-Z0-9$-_@.&+!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

# Dotted-quad IPv4 address with every octet in 0-255. The digit lookarounds
# at both ends stop the pattern from matching a fragment of a longer digit
# run: without them "256.1.1.1" yielded "56.1.1.1" and "1.1.1.300" yielded
# "1.1.1.30". The capturing groups are kept as they were, so use
# `match.group(0)` (not `findall`) to get the full address.
ip_pattern = re.compile(
    r'(?<!\d)((2[0-4]\d|25[0-5]|[01]?\d\d?)\.){3}(2[0-4]\d|25[0-5]|[01]?\d\d?)(?!\d)'
)
class DataExtractorApp:
    """Tkinter window that extracts IP addresses, domains and URLs from a file.

    The user picks a text file; a background thread scans it with the
    module-level ``ip_pattern``, ``domain_pattern`` and ``url_pattern`` and the
    de-duplicated, sorted matches are shown in three side-by-side text boxes.

    The background thread never touches Tk. It only stores its outcome on the
    instance, and the main thread polls for it with ``root.after`` and then
    updates the widgets and shows the message box.
    """

    # Delay, in milliseconds, between two checks on the background worker.
    _POLL_INTERVAL_MS = 100

    def __init__(self, root):
        """Build the widgets inside *root* (a ``tk.Tk`` or ``tk.Toplevel``)."""
        self.root = root
        self.root.title("数据提取工具")
        self.root.geometry("900x600")
        self.root.minsize(600, 400)  # smallest size the window may shrink to

        # Worker bookkeeping: the running thread (if any) and the result or
        # exception it produced, waiting to be shown by the main thread.
        self._worker = None
        self._outcome = None

        # Let the output row and all three columns grow with the window.
        self.root.grid_rowconfigure(2, weight=1)
        self.root.grid_columnconfigure((0, 1, 2), weight=1)

        # File path entry plus the "choose file" button.
        self.file_path_entry = tk.Entry(root, width=50)
        self.file_path_entry.grid(row=0, column=0, columnspan=2, padx=10, pady=5, sticky="ew")
        self.file_button = tk.Button(root, text="选择文件", command=self.choose_file)
        self.file_button.grid(row=0, column=2, padx=10, pady=5, sticky="ew")

        # Start button spanning the full width.
        self.start_button = tk.Button(root, text="开始清洗", command=self.start_extraction)
        self.start_button.grid(row=1, column=0, columnspan=3, padx=10, pady=5, sticky="ew")

        # One output box per column.
        self.ip_output = self.create_output_box("提取出的 IP 地址", 0)
        self.domain_output = self.create_output_box("提取出的 域名", 1)
        self.url_output = self.create_output_box("提取出的 URL", 2)

    def create_output_box(self, label_text, column):
        """Create a labelled, scrollable text box in grid row 2 of *column*.

        Returns the ``ScrolledText`` widget so the caller can fill it later.
        """
        frame = tk.Frame(self.root)
        frame.grid(row=2, column=column, padx=10, pady=5, sticky="nsew")
        label = tk.Label(frame, text=label_text)
        label.pack()
        text_box = scrolledtext.ScrolledText(frame, wrap=tk.WORD)
        text_box.pack(fill=tk.BOTH, expand=True)
        # Make sure the column stretches even if it was not configured before.
        self.root.grid_columnconfigure(column, weight=1)
        return text_box

    def choose_file(self):
        """Open a file dialog and copy the chosen path into the entry box."""
        file_path = filedialog.askopenfilename()
        if file_path:
            self.file_path_entry.delete(0, tk.END)
            self.file_path_entry.insert(0, file_path)

    def start_extraction(self):
        """Validate the path and start the extraction in a background thread."""
        file_path = self.file_path_entry.get().strip()
        if not file_path:
            messagebox.showerror("错误", "请输入有效的文件路径")
            return
        # Ignore the request while a previous run is still in progress.
        if self._worker is not None and self._worker.is_alive():
            return
        self._outcome = None
        self.start_button.config(state=tk.DISABLED)
        self._worker = threading.Thread(
            target=self.extract_data, args=(file_path,), daemon=True
        )
        self._worker.start()
        self.root.after(self._POLL_INTERVAL_MS, self._poll_worker)

    def extract_data(self, file_path):
        """Scan *file_path* and record the extracted IPs, domains and URLs.

        Any exception is captured and reported to the user instead of being
        raised. When called from the main thread the outcome is displayed
        immediately; when called from a worker thread it is only stored, and
        the poller started by ``start_extraction`` displays it.
        """
        try:
            # errors="replace": messy input often contains bytes that are not
            # valid UTF-8; one bad byte should not abort the whole run.
            with open(file_path, "r", encoding="utf-8", errors="replace") as file:
                self._outcome = ("ok", self._extract_from_lines(file))
        except Exception as e:
            self._outcome = ("error", e)
        if threading.current_thread() is threading.main_thread():
            self._present_outcome()

    @staticmethod
    def _extract_from_lines(lines):
        """Return ``(ips, domains, urls)`` sets found in an iterable of lines.

        Each line is split on commas first and every field is scanned
        separately.
        """
        ips, domains, urls = set(), set(), set()
        for line in lines:
            for part in line.strip().split(","):
                domains.update(match.group(0) for match in domain_pattern.finditer(part))
                urls.update(url_pattern.findall(part))
                ips.update(match.group(0) for match in ip_pattern.finditer(part))
        return ips, domains, urls

    @staticmethod
    def _ip_sort_key(ip):
        """Sort key ordering dotted-quad strings numerically, octet by octet."""
        # The raw string breaks ties (e.g. "01.1.1.1" vs "1.1.1.1") so the
        # order is deterministic.
        return tuple(int(octet) for octet in ip.split(".")), ip

    def _poll_worker(self):
        """Main-thread poller: wait for the worker, then show its outcome."""
        if self._worker is not None and self._worker.is_alive():
            self.root.after(self._POLL_INTERVAL_MS, self._poll_worker)
            return
        self._present_outcome()

    def _present_outcome(self):
        """Show the stored result or error; must run on the main thread."""
        self.start_button.config(state=tk.NORMAL)
        outcome, self._outcome = self._outcome, None
        if outcome is None:
            return
        status, payload = outcome
        if status == "error":
            messagebox.showerror("错误", f"处理文件时出错: {payload}")
            return
        ips, domains, urls = payload
        try:
            self.update_output(self.ip_output, "\n".join(sorted(ips, key=self._ip_sort_key)))
            self.update_output(self.domain_output, "\n".join(sorted(domains)))
            self.update_output(self.url_output, "\n".join(sorted(urls)))
        except Exception as e:
            messagebox.showerror("错误", f"处理文件时出错: {e}")
            return
        messagebox.showinfo("完成", "数据提取完成!")

    def update_output(self, text_widget, content):
        """Replace the whole content of *text_widget* with *content*.

        The widget is enabled for the update and left disabled afterwards.
        """
        text_widget.config(state=tk.NORMAL)
        text_widget.delete("1.0", tk.END)
        text_widget.insert(tk.END, content)
        text_widget.config(state=tk.DISABLED)
if __name__ == "__main__":
    # Script entry point: create the top-level window, attach the extractor
    # UI to it, and hand control to the Tk event loop until the window closes.
    main_window = tk.Tk()
    application = DataExtractorApp(main_window)
    main_window.mainloop()
UI:


浙公网安备 33010602011771号