Python 常用正则表达式

import re


def extract_emails(text):
    """Return every e-mail address found in ``text``.

    Args:
        text: The string to search.

    Returns:
        A list of the matched addresses in order of appearance; empty when
        nothing matches.
    """
    email_re = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
    return [hit.group(0) for hit in email_re.finditer(text)]


def extract_urls(text):
    """Return every http(s) URL found in ``text``.

    A URL is matched as the scheme, a host made of word characters, dots,
    hyphens or percent-escapes, an optional ``:port``, and then any run of
    path, query-string and fragment characters.

    Args:
        text: The string to search.

    Returns:
        A list of the matched URL strings in order of appearance; empty when
        nothing matches.
    """
    pattern = (
        r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+"  # scheme and host
        r"(?::\d+)?"  # optional port, so host:8080 is not cut off at the colon
        r"[/\w.\-?%&=#~+]*"  # path, query string and fragment
    )
    return re.findall(pattern, text)


def validate_phone(phone):
    """Check whether ``phone`` is an 11-digit mainland-China mobile number.

    The number must start with ``1``, have ``3``-``9`` as its second digit,
    and be followed by nine more ASCII digits, with nothing before or after.

    Args:
        phone: The candidate string.

    Returns:
        True when the whole string matches, False otherwise.
    """
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "13800138000\n" through.
    # re.ASCII keeps \d to 0-9 so full-width digits are rejected.
    return re.fullmatch(r"1[3-9]\d{9}", phone, flags=re.ASCII) is not None


def validate_email(email):
    """Check whether ``email`` is, in its entirety, a plausible e-mail address.

    The check is purely syntactic: a local part, ``@``, a domain, and a
    final dot-separated label of at least two letters.

    Args:
        email: The candidate string.

    Returns:
        True when the whole string matches, False otherwise.
    """
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "user@example.com\n" through.
    pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    return re.fullmatch(pattern, email) is not None


def validate_id_card(id_card):
    """Check whether ``id_card`` has the shape of an 18-character ID number.

    The layout enforced is: a 6-digit prefix not starting with 0, a birth
    year beginning with 18/19/20, a month 01-12, a day 01-31, a 3-digit
    sequence, and a final digit or ``X``/``x``. Only the format is checked;
    the calendar validity of the date and the final check character are not
    verified.

    Args:
        id_card: The candidate string.

    Returns:
        True when the whole string matches the layout, False otherwise.
    """
    pattern = (
        r"[1-9]\d{5}"  # 6-digit prefix
        r"(18|19|20)\d{2}"  # birth year
        r"(0[1-9]|1[0-2])"  # birth month
        r"(0[1-9]|[12]\d|3[01])"  # birth day
        r"\d{3}[\dXx]"  # sequence number + final character
    )
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline. re.ASCII keeps \d to 0-9 so full-width digits fail.
    return re.fullmatch(pattern, id_card, flags=re.ASCII) is not None


class TextProcessor:
    """Extract, anonymize and normalize contact details found in free text."""

    def __init__(self):
        # Compiled search patterns, keyed by the kind of information they find.
        # The phone pattern is fenced with digit look-arounds so an 11-digit
        # slice of a longer number (e.g. an 18-digit ID card number) is not
        # mistaken for a mobile number.
        self.patterns = {
            'email': re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"),
            'phone': re.compile(r"(?<!\d)1[3-9]\d{9}(?!\d)"),
            'url': re.compile(r"https?://[^\s]+"),
            'date': re.compile(r"\d{4}[-/]\d{1,2}[-/]\d{1,2}"),
        }

    def extract_all(self, text):
        """Run every pattern in ``self.patterns`` over ``text``.

        Args:
            text: The string to search.

        Returns:
            A dict mapping each pattern name to the list of its matches
            (an empty list when that pattern finds nothing).
        """
        return {name: pattern.findall(text) for name, pattern in self.patterns.items()}

    def anonymize(self, text):
        """Mask e-mail addresses and mobile numbers in ``text``.

        E-mail addresses become ``***@***.***``; mobile numbers keep only
        their last four digits, preceded by seven asterisks.

        Args:
            text: The string to anonymize.

        Returns:
            A new string with the sensitive spans replaced.
        """
        # E-mails go first so a numeric local part (e.g. 138...@qq.com) is
        # masked as an address rather than partially as a phone number.
        text = self.patterns['email'].sub("***@***.***", text)
        text = self.patterns['phone'].sub(lambda m: "*******" + m.group()[-4:], text)
        return text

    def format_text(self, text):
        """Normalize dates, mobile numbers and whitespace in ``text``.

        * Dates written as ``YYYY/M/D`` or ``YYYY-M-D`` become ``YYYY-MM-DD``.
        * Mobile numbers are shown as the first three digits, ``****``, and
          the last four digits.
        * Every run of whitespace collapses to one space and the result is
          stripped.

        Args:
            text: The string to format.

        Returns:
            The formatted string.
        """
        # Zero-pad month and day so every date has the same width.
        text = re.sub(
            r"(\d{4})[/-](\d{1,2})[/-](\d{1,2})",
            lambda m: f"{m.group(1)}-{m.group(2).zfill(2)}-{m.group(3).zfill(2)}",
            text,
        )

        # Match the whole 11-digit number: consuming only part of it leaves
        # stray digits behind the mask and yields a string longer than the
        # original number.
        text = re.sub(r"(?<!\d)(1[3-9]\d)\d{4}(\d{4})(?!\d)", r"\1****\2", text)

        # Collapse all whitespace runs (including newlines) to single spaces.
        return re.sub(r"\s+", " ", text).strip()


if __name__ == "__main__":
    # Demo: run each TextProcessor operation over one sample record and
    # print the results section by section.
    demo_text = """
        用户信息:
        姓名:张三
        邮箱:zhangsan@example.com
        电话:13800138000
        备用电话:13912345678
        注册日期:2023/1/5
        上次登录:2023-12-25
        个人网站:https://zhangsan.com
        """
    text_processor = TextProcessor()

    print("原始文本:")
    print(demo_text)

    print("\n提取的信息:")
    for label, matches in text_processor.extract_all(demo_text).items():
        # Skip categories for which nothing was found.
        if not matches:
            continue
        print(f"{label}: {matches}")

    print("\n匿名化文本:")
    anonymized = text_processor.anonymize(demo_text)
    print(anonymized)

    print("\n格式化文本:")
    formatted = text_processor.format_text(demo_text)
    print(formatted)

 

posted on 2025-12-15 17:35  fengZQ  阅读(5)  评论(0)    收藏  举报

导航