# 处理html中所有text的空格字符

# -*- coding: utf-8 -*-

import re


def clear_str(string):
    """
    Clean up a string.

    The input is first passed through ``strQ2B`` (full-width to half-width
    conversion) and the result then has its leading and trailing whitespace
    stripped.
    """
    half_width = strQ2B(string)
    return half_width.strip()


def handle_space(string, repl=None):
    """
    Collapse runs of blank-like tokens in ``string``.

    A run is one or more consecutive occurrences of any of: a whitespace
    character (``\\s``), the entity ``&nbsp;``, a question mark, an
    underscore, the tag ``<br>``, or one of the literal space characters
    listed at the end of the pattern. Each whole run is replaced by ``repl``;
    a falsy ``repl`` (``None`` or ``''``) removes the run entirely.

    Note that ``repl`` is handed to ``re.sub`` as a replacement template, so
    backslash escapes inside it are interpreted by ``re``.
    """
    replacement = repl or ''
    return re.sub(r'((\s)|(&nbsp;)|(\?)|(_)|(<br>)|( )|( )|( ))+', replacement, string)


def handle_re_match(matched):
    """
    Replacement callback for ``re.sub``.

    Takes the text captured in the match's named group ``value``, cleans it
    with ``handle_space`` (default replacement, i.e. blank runs are removed)
    and puts it back between the ``>`` and ``<`` delimiters.
    """
    cleaned = handle_space(matched.group('value'))
    return '>{}<'.format(cleaned)


def handle_html_space(html):
    """
    Clean blank characters from every text segment of an HTML string.

    Steps:
      1. Remove all ``\\n`` characters.
      2. For each shortest span found between a ``>`` and the following
         ``<``, rewrite the span through ``handle_re_match``.
      3. Pass the result through ``clear_str`` and return it.
    """
    # Dropping newlines first lets ``.`` in the pattern span what used to be
    # multiple lines.
    single_line = ''.join(html.split('\n'))
    between_tags = re.compile('>(?P<value>.*?)<')
    cleaned = between_tags.sub(handle_re_match, single_line)
    return clear_str(cleaned)


def strQ2B(ustring):
    """Convert full-width characters in ``ustring`` to half-width.

    The ideographic space (U+3000) becomes an ASCII space, and every code
    point in the full-width range U+FF01..U+FF5E is shifted down by 0xFEE0
    to its ASCII counterpart (U+0021..U+007E). All other characters are
    returned unchanged.

    Args:
        ustring: The text to convert.

    Returns:
        A new string with the conversions applied.
    """
    fullwidth_space = 0x3000   # 12288
    fullwidth_first = 0xFF01   # 65281
    fullwidth_last = 0xFF5E    # 65374
    ascii_offset = 0xFEE0      # 65248

    # Collect pieces and join once instead of repeated ``+=`` on a string,
    # which can degrade to quadratic time on long inputs.
    pieces = []
    for uchar in ustring:
        code = ord(uchar)
        if code == fullwidth_space:
            code = 0x20
        elif fullwidth_first <= code <= fullwidth_last:
            code -= ascii_offset
        pieces.append(chr(code))
    return "".join(pieces)

# posted @ 2022-04-28 11:04  二二二狗子  阅读(173)  评论(0)    收藏  举报