# 检测字符语言种类

# 通过 Unicode 范围判断字符所属的语言。需要注意:繁体中文和简体中文共用大量码位,仅凭范围无法区分,
# 所以先按范围筛出汉字,再借助 hanzidentifier 库判断它是否为简体字。
# 不要用 hanzidentifier.is_traditional() 来判断繁体:很多简繁通用的字(例如'繁'、'世'、'界'、'本')
# 用简体、繁体两种判断都会返回 True。也就是说,简体字有可能被 is_traditional() 判为 True,
# 而纯繁体字用 is_simplified() 判断一定是 False。因此逻辑是:凡是 is_simplified() 返回 False 的汉字,就认定为繁体字。
import hanzidentifier

def is_simplified(char):
    """Report whether ``char`` is classified as Simplified Chinese.

    This is a thin wrapper that forwards ``char`` unchanged to
    ``hanzidentifier.is_simplified`` and returns its verdict as-is.

    Args:
        char: The text to classify; intended to be a single character.

    Returns:
        Whatever ``hanzidentifier.is_simplified`` returns for ``char``.
    """
    verdict = hanzidentifier.is_simplified(char)
    return verdict
def _in_any_range(code_point, ranges):
    """Return True if ``code_point`` falls inside any inclusive ``(start, end)`` pair."""
    return any(start <= code_point <= end for start, end in ranges)


def detect_language_characters(text):
    """Detect Traditional Chinese, Japanese and Korean characters in ``text``.

    Each character is classified by its Unicode code point:

    * A character in one of the Han ideograph blocks is reported as
      Traditional Chinese when ``is_simplified(char)`` is ``False``.
    * A character in the kana blocks is reported as Japanese.
    * A character in the Hangul blocks is reported as Korean.

    CJK punctuation (U+3000-U+303F) is deliberately NOT treated as Han text:
    it is not a Chinese character, so the simplified check would reject it
    and every ideographic space or full stop would be misreported as
    Traditional Chinese.

    NOTE(review): any Han-block character for which ``is_simplified`` returns
    ``False`` is reported as Traditional Chinese. That presumably includes
    ideographs used only in Japanese or absent from the library's tables --
    confirm whether that is acceptable for callers.

    Args:
        text: The string to inspect. Empty or non-``str`` values are accepted
            and yield the "nothing detected" result.

    Returns:
        A dict with the boolean flags ``has_traditional_chinese``,
        ``has_japanese`` and ``has_korean``, plus a ``details`` dict whose
        ``traditional_chinese_chars``, ``japanese_chars`` and ``korean_chars``
        lists hold the distinct matching characters in order of first
        appearance.
    """
    results = {
        'has_traditional_chinese': False,
        'has_japanese': False,
        'has_korean': False,
        'details': {
            'traditional_chinese_chars': [],
            'japanese_chars': [],
            'korean_chars': [],
        },
    }

    if not text or not isinstance(text, str):
        return results

    # Han ideograph blocks (shared by Simplified and Traditional Chinese).
    han_ranges = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # Extension A
        (0x20000, 0x2A6DF),  # Extension B
        (0x2A700, 0x2B73F),  # Extension C
        (0x2B740, 0x2B81F),  # Extension D
        (0x2B820, 0x2CEAF),  # Extension E
        (0x2CEB0, 0x2EBEF),  # Extension F
    )

    # Japanese kana blocks.
    japanese_ranges = (
        (0x3040, 0x309F),    # Hiragana
        (0x30A0, 0x30FF),    # Katakana
        (0x31F0, 0x31FF),    # Katakana Phonetic Extensions
        (0xFF65, 0xFF9F),    # Halfwidth Katakana
    )

    # Korean Hangul blocks.
    korean_ranges = (
        (0xAC00, 0xD7AF),    # Hangul Syllables
        (0x1100, 0x11FF),    # Hangul Jamo
        (0x3130, 0x318F),    # Hangul Compatibility Jamo
    )

    details = results['details']

    for char in text:
        code_point = ord(char)

        # The three range groups are disjoint, so at most one branch applies.
        if _in_any_range(code_point, han_ranges):
            # Only Han ideographs are passed to the simplified check; anything
            # it does not accept as Simplified is treated as Traditional.
            if is_simplified(char) is not False:
                continue
            flag_key, detail_key = 'has_traditional_chinese', 'traditional_chinese_chars'
        elif _in_any_range(code_point, japanese_ranges):
            flag_key, detail_key = 'has_japanese', 'japanese_chars'
        elif _in_any_range(code_point, korean_ranges):
            flag_key, detail_key = 'has_korean', 'korean_chars'
        else:
            continue

        results[flag_key] = True
        found = details[detail_key]
        if char not in found:  # keep each character once, in first-seen order
            found.append(char)

    return results
# posted @ 2025-12-29 20:25  树下黑猫