import re
def extract_emails(text):
    """Return all e-mail addresses found in ``text``.

    Args:
        text: The string to scan.

    Returns:
        A list of the matched address substrings, in order of appearance
        (empty when nothing matches).
    """
    email_regex = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    # The pattern has no capture groups, so each match object's full text is
    # exactly what ``re.findall`` would have produced.
    return [hit.group(0) for hit in re.finditer(email_regex, text)]
def extract_urls(text):
    """Return all http/https URLs found in ``text``.

    Matching is ASCII-only so that a URL immediately followed by non-ASCII
    text (for example CJK characters with no separating space) does not
    swallow that text. An optional ``:port`` and a ``#fragment`` are kept,
    and trailing sentence punctuation (``.``, ``:``, ``?``) is stripped.

    Args:
        text: The string to scan.

    Returns:
        A list of URL strings in order of appearance (empty when none found).
    """
    pattern = re.compile(
        r"https?://(?:[-\w.]|%[\da-fA-F]{2})+"  # host: word chars, '-', '.', %XX
        r"(?::\d+)?"  # optional port
        r"[/\w.\-?%&=#~:+@]*",  # path, query string and fragment
        re.ASCII,  # keep \w and \d from matching non-ASCII letters/digits
    )
    # ',' and ';' are intentionally not URL characters here so that
    # delimiter-separated URL lists still split into separate matches.
    return [m.group(0).rstrip(".:?") for m in pattern.finditer(text)]
def validate_phone(phone):
    """Check whether ``phone`` is an 11-digit mobile number of the form 1[3-9]XXXXXXXXX.

    The whole string must match: ``re.fullmatch`` is used because ``$`` with
    ``re.match`` also accepts a trailing newline. Digits are restricted to
    ASCII ``0-9``; ``\\d`` would also accept other Unicode decimal digits
    (for example full-width digits).

    Args:
        phone: The string to validate.

    Returns:
        True if the entire string is a valid number, otherwise False.
    """
    return re.fullmatch(r"1[3-9][0-9]{9}", phone) is not None
def validate_email(email):
    """Check whether ``email`` looks like ``local@domain.tld``.

    The whole string must match: ``re.fullmatch`` is used because ``$`` with
    ``re.match`` also accepts a trailing newline.

    Args:
        email: The string to validate.

    Returns:
        True if the entire string matches the address pattern, otherwise False.
    """
    pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    return re.fullmatch(pattern, email) is not None
def validate_id_card(id_card):
    """Check the format of an 18-character ID card number.

    Layout checked: 6-digit area code not starting with 0, a 4-digit year
    beginning with 18/19/20, month 01-12, day 01-31, a 3-digit sequence
    number, and a final digit or ``X``/``x``.

    The whole string must match (``re.fullmatch``), so a trailing newline is
    rejected, and only ASCII digits are accepted. This is a format check
    only: the check character and calendar validity (e.g. February 31) are
    not verified.

    Args:
        id_card: The string to validate.

    Returns:
        True if the entire string matches the layout, otherwise False.
    """
    pattern = (
        r"[1-9][0-9]{5}"  # area code
        r"(18|19|20)[0-9]{2}"  # birth year
        r"(0[1-9]|1[0-2])"  # birth month
        r"(0[1-9]|[12][0-9]|3[01])"  # birth day
        r"[0-9]{3}[0-9Xx]"  # sequence number + check character
    )
    return re.fullmatch(pattern, id_card) is not None
class TextProcessor:
    """Extract, anonymize and normalize contact details found in free text."""

    def __init__(self):
        # Maps a category name to its compiled pattern; the keys are the
        # names reported by extract_all().
        self.patterns = {
            'email': re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"),
            # The digit look-arounds stop an 11-digit slice of a longer number
            # (e.g. an 18-digit ID card number) from being taken for a phone.
            'phone': re.compile(r"(?<!\d)1[3-9]\d{9}(?!\d)"),
            'url': re.compile(r"https?://[^\s]+"),
            'date': re.compile(r"\d{4}[-/]\d{1,2}[-/]\d{1,2}"),
        }

    def extract_all(self, text):
        """Run every pattern in ``self.patterns`` over ``text``.

        Args:
            text: The string to scan.

        Returns:
            A dict mapping each pattern name to the list of matched
            substrings (an empty list when that pattern found nothing).
        """
        return {
            name: pattern.findall(text)
            for name, pattern in self.patterns.items()
        }

    def anonymize(self, text):
        """Mask e-mail addresses and phone numbers in ``text``.

        Args:
            text: The string to anonymize.

        Returns:
            A copy of ``text`` in which every e-mail address is replaced by
            ``***@***.***`` and every phone number by seven asterisks
            followed by its last four digits.
        """
        # Mask e-mail addresses entirely.
        text = self.patterns['email'].sub("***@***.***", text)
        # Mask phone numbers, keeping only the last four digits.
        text = self.patterns['phone'].sub(
            lambda m: "*******" + m.group()[-4:], text
        )
        return text

    def format_text(self, text):
        """Normalize dates, phone numbers and whitespace in ``text``.

        Args:
            text: The string to format.

        Returns:
            A copy of ``text`` with dates rewritten as zero-padded
            ``YYYY-MM-DD``, 11-digit phone numbers shown as
            ``138****8000`` (middle four digits masked), and every run of
            whitespace (including newlines) collapsed to a single space with
            leading/trailing whitespace removed.
        """
        # Normalize dates to YYYY-MM-DD, zero-padding month and day.
        text = re.sub(
            r"(\d{4})[/-](\d{1,2})[/-](\d{1,2})",
            lambda m: f"{m.group(1)}-{m.group(2):0>2}-{m.group(3):0>2}",
            text,
        )
        # Normalize phone numbers. The pattern must consume all 11 digits;
        # matching only 10 would leave a stray digit behind and produce a
        # 14-character result.
        text = re.sub(
            r"(?<!\d)(1[3-9]\d)\d{4}(\d{4})(?!\d)", r"\1****\2", text
        )
        # Collapse all whitespace runs to single spaces.
        text = re.sub(r"\s+", " ", text).strip()
        return text
if __name__ == "__main__":
    # Usage example: run each TextProcessor operation on a small sample
    # record and print the results.
    tp = TextProcessor()
    demo = """
用户信息:
姓名:张三
邮箱:zhangsan@example.com
电话:13800138000
备用电话:13912345678
注册日期:2023/1/5
上次登录:2023-12-25
个人网站:https://zhangsan.com
"""

    # Show the input as-is.
    print("原始文本:", demo, sep="\n")

    # Show only the categories that produced at least one match.
    print("\n提取的信息:")
    report = tp.extract_all(demo)
    for label, found in report.items():
        if not found:
            continue
        print(f"{label}: {found}")

    # Show the anonymized version.
    print("\n匿名化文本:")
    masked = tp.anonymize(demo)
    print(masked)

    # Show the normalized version.
    print("\n格式化文本:")
    tidy = tp.format_text(demo)
    print(tidy)