# Detect character language by Unicode range. Note that Simplified and Traditional
# Chinese overlap heavily in Unicode, so a range check alone cannot separate them. The
# approach here: once a character falls in a CJK range, use the hanzidentifier library
# to check whether it is Simplified. Do not rely on hanzidentifier.is_traditional():
# many characters used in Simplified text (e.g. '繁', '世', '界', '本') are identical in
# both scripts and return True for both checks. In other words, a Simplified character
# may still test True as Traditional, but a Traditional-only character will never test
# True as Simplified. So the rule is: a CJK character is treated as Traditional exactly
# when is_simplified() returns False. See the usage sketch at the end of the file.
import hanzidentifier
def is_simplified(char):
"""判断单个字符是否为简体字"""
return hanzidentifier.is_simplified(char)
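# Behaviour relied on above (hanzidentifier classifies characters shared by both scripts
# as "BOTH", so both is_simplified() and is_traditional() return True for them):
#   hanzidentifier.is_simplified('世')   -> True   # shared character
#   hanzidentifier.is_traditional('世')  -> True   # also True, so not a reliable test
#   hanzidentifier.is_simplified('體')   -> False  # Traditional-only form of '体'
# Hence "not is_simplified(char)" is used as the Traditional test below.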
def detect_language_characters(text):
"""检测字符串中的语言字符"""
results = {
'has_traditional_chinese': False,
'has_japanese': False,
'has_korean': False,
'details': {
'traditional_chinese_chars': [],
'japanese_chars': [],
'korean_chars': []
}
}
if not text or not isinstance(text, str):
return results
    # CJK ideograph ranges (shared by Simplified and Traditional Chinese; hanzidentifier
    # decides the script). CJK punctuation (U+3000-U+303F) is deliberately excluded:
    # hanzidentifier does not recognise it as Simplified, so it would be misreported as
    # Traditional.
    traditional_chinese_ranges = [
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs (both Simplified and Traditional)
        (0x3400, 0x4DBF),    # Extension A
        (0x20000, 0x2A6DF),  # Extension B
        (0x2A700, 0x2B73F),  # Extension C
        (0x2B740, 0x2B81F),  # Extension D
        (0x2B820, 0x2CEAF),  # Extension E
        (0x2CEB0, 0x2EBEF),  # Extension F
    ]
    # Japanese character ranges (kana only; kanji falls in the CJK ranges above)
    japanese_ranges = [
        (0x3040, 0x309F),  # Hiragana
        (0x30A0, 0x30FF),  # Katakana
        (0x31F0, 0x31FF),  # Katakana phonetic extensions
        (0xFF65, 0xFF9F),  # Half-width katakana
    ]
    # Korean character ranges
    korean_ranges = [
        (0xAC00, 0xD7AF),  # Hangul syllables
        (0x1100, 0x11FF),  # Hangul Jamo
        (0x3130, 0x318F),  # Hangul compatibility Jamo
    ]
for char in text:
code_point = ord(char)
        # Traditional Chinese: a CJK character that hanzidentifier does not accept as Simplified
for start, end in traditional_chinese_ranges:
            if start <= code_point <= end and not is_simplified(char):
if char not in results['details']['traditional_chinese_chars']:
results['details']['traditional_chinese_chars'].append(char)
results['has_traditional_chinese'] = True
break
        # Japanese characters
for start, end in japanese_ranges:
if start <= code_point <= end:
if char not in results['details']['japanese_chars']:
results['details']['japanese_chars'].append(char)
results['has_japanese'] = True
break
        # Korean characters
for start, end in korean_ranges:
if start <= code_point <= end:
if char not in results['details']['korean_chars']:
results['details']['korean_chars'].append(char)
results['has_korean'] = True
break
return results
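
# Minimal usage sketch (the sample string and the characters noted in the comments are
# illustrative only).
if __name__ == "__main__":
    sample = "简体字 體驗 ひらがな カタカナ 한국어"
    report = detect_language_characters(sample)
    print("traditional:", report['has_traditional_chinese'],
          report['details']['traditional_chinese_chars'])  # e.g. ['體', '驗']
    print("japanese:", report['has_japanese'],
          report['details']['japanese_chars'])
    print("korean:", report['has_korean'],
          report['details']['korean_chars'])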