Fork me on GitHub

【自然语言处理】对句子进行预处理

主要是去除掉换行符、空格、制表符以及无效的字符:

import collections
import re
import unicodedata
import six

def clean_br(text):
    """Remove HTML line-break tags from ``text``.

    Matches ``<br>``, ``<br/>`` and ``<br />`` (optional whitespace followed
    by an optional slash before the closing ``>``) in any letter case, and
    deletes each occurrence without inserting a replacement.

    Args:
        text: The string to strip ``<br>`` tags from.

    Returns:
        ``text`` with every matched tag removed.
    """
    # Raw string: "\s" inside a plain literal is an invalid escape sequence
    # and triggers a warning on modern Python versions.
    br_pattern = r'<br\s*/?>'
    # HTML tag names are case-insensitive, so "<BR>" must be removed as well.
    return re.sub(br_pattern, '', text, flags=re.IGNORECASE)

def is_control(char):
    """Return True when `char` is a control character.

    Tab, line feed and carriage return are reported as non-control here
    because the rest of this file counts them as whitespace. Any other
    character is a control character when its Unicode general category
    is "Cc" or "Cf".
    """
    # Whitespace-like control characters are deliberately excluded.
    if char in ("\t", "\n", "\r"):
        return False
    return unicodedata.category(char) in ("Cc", "Cf")

def is_whitespace(char):
    """Return True when `char` is a whitespace character.

    Space, tab, line feed and carriage return always count as whitespace
    (the latter three are technically control characters but are generally
    treated as whitespace). Any other character counts when its Unicode
    general category is "Zs".
    """
    if char in (" ", "\t", "\n", "\r"):
        return True
    return unicodedata.category(char) == "Zs"

def clean_text(text):
    """Strip invalid characters, whitespace and ``<br>`` tags from `text`.

    Characters are dropped when they are NUL (code point 0), the Unicode
    replacement character (0xFFFD), a control character according to
    `is_control`, or whitespace according to `is_whitespace`. The remaining
    characters are joined and passed through `clean_br`.
    """
    def _keep(ch):
        # Invalid code points and control characters are discarded first.
        if ord(ch) in (0, 0xfffd) or is_control(ch):
            return False
        # Whitespace is removed outright rather than collapsed to a space.
        return not is_whitespace(ch)

    kept = "".join(ch for ch in text if _keep(ch))
    return clean_br(kept)


if __name__ == '__main__':
    # Demo input mixing <br> tags, spaces, a literal tab and a newline.
    sample = '我爱<br> × 北京<br/>	天安门\n。哈哈'
    cleaned = clean_text(sample)
    print(cleaned)
posted @ 2021-04-21 17:01  西西嘛呦  阅读(260)  评论(0)    收藏  举报