Java 提取中英文混合文本中的英文
需求背景
一些英文经过人工翻译和中文拼接到一起,后面走大模型翻译,这些历史数据也需要重新翻译,所以需要提取出英文。
如果只是单纯地把中文去掉会比较简单,用正则就可以完成,如下:
原文本:所有轴承松动。ALL BEARING LOOSE. (10 OFF)
提取后文本:ALL BEARING LOOSE. (10 OFF)
现实需求要复杂一些:中文里也可能包含数字、特殊符号或者括号,括号中可能会包含中英文和数字等,这种括号中的文字是中文的一部分,不需要提取。
下面是几个例子
1.原文本: 1点钟位置隔热层上有3处破洞 HOLE DAMAGE(3 OFF,DIA:0.45''X0.3'',1.85''X0.3'',0.2''X0.2'') ON INSULATION BLANKET
提取后文本:HOLE DAMAGE(3 OFF,DIA:0.45''X0.3'',1.85''X0.3'',0.2''X0.2'') ON INSULATION BLANKET
说明:这段文本中文包含了数字,1点钟、3处破洞,1和3不能被当做英文处理
2.原文本: 件号P/N:725Z3572-107 序号S/N:16011385
提取后文本:P/N:725Z3572-107 S/N:16011385
说明:这段中英混合有两段
3.原文本: MISSING T/S P/N & S/N (平移罩零件号,序列号缺失)
提取后文本:MISSING T/S P/N & S/N
说明:英文后的括号是中文翻译,这种写法不标准,需要将括号和中文都去掉
4.原文本: 连接的紧固件磨损 ATTACHING FASTENERS WORN(数量 QTY 15 EA)
提取后文本:ATTACHING FASTENERS WORN(QTY 15 EA)
说明:将括号内的数量中文去掉,英文保留。
以上列举了几个有代表性的中英混合文本。整体看下来,判断数字、特殊符号、括号属于中文还是英文,需要根据前面的字符推断。
因此设计的时候将文本分为中文模式和英文模式两种,并在两种模式之间切换。只做了常见的处理,没针对极端个例做特殊处理。
代码实现
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.StrUtil;
import com.aicloud.translate.model.EnTextDTO;
import com.alibaba.fastjson2.JSON;
import lombok.extern.slf4j.Slf4j;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.UUID;
import java.util.stream.Collectors;
/**
 * Extracts the English parts of a text in which Chinese and English are mixed.
 *
 * <p>The text is scanned character by character while switching between a
 * "Chinese mode" (characters are dropped) and an "English mode" (characters are
 * collected into the current segment). Digits, punctuation and brackets are
 * ambiguous on their own, so they inherit the mode established by the preceding
 * characters. Only common layouts are handled; unusual inputs get no special
 * treatment.
 */
@Slf4j
public class EnTextUtil {

    /** ASCII opening bracket. */
    private static final char EN_LEFT_INCLUDE = '(';
    /** ASCII closing bracket. */
    private static final char EN_RIGHT_INCLUDE = ')';
    /** Full-width opening bracket (U+FF08); must differ from the ASCII one to be of any use. */
    private static final char CN_LEFT_INCLUDE = '\uFF08';
    /** Full-width closing bracket (U+FF09). */
    private static final char CN_RIGHT_INCLUDE = '\uFF09';

    /**
     * Punctuation (ASCII and Chinese) plus the space character. The escapes are, in
     * order: yen sign, ellipsis, full-width comma, ideographic full stop, ideographic
     * comma, right double quotation mark and full-width colon.
     */
    private static final String PUNCTUATION =
            "_@#$%^&*-=[]|,./\u00A5\u2026\uFF0C\u3002\u3001\u201D:\uFF1A ";

    /** Characters that take over the current mode: punctuation, space and digits. */
    private static final String SYMBOLS = PUNCTUATION + "0123456789";

    /**
     * Characters that do not make a bracket pair worth keeping: punctuation, space and
     * brackets themselves. Digits are deliberately absent, so "(3)" is kept.
     */
    private static final String BRACKET_FILLER = PUNCTUATION
            + EN_LEFT_INCLUDE + EN_RIGHT_INCLUDE + CN_LEFT_INCLUDE + CN_RIGHT_INCLUDE;

    static {
        log.debug("symbolList:{}", JSON.toJSONString(SYMBOLS.toCharArray()));
    }

    public static void main(String[] args) {
        System.out.println(enToSegments("凿痕GOUGES(a)(长, 宽 ,深 ):__________"));
        System.out.println(removeInclude("a((1))", 0));
    }

    /**
     * Extracts the English segments of {@code text} and joins them with a single space.
     *
     * <p>Rules applied while scanning the trimmed text:
     * <ul>
     *   <li>The scan starts in Chinese mode when the first character is Chinese, a
     *       symbol or a bracket; otherwise it starts in English mode.</li>
     *   <li>In Chinese mode everything is dropped until a non-symbol, non-bracket,
     *       non-Chinese character appears outside a bracket; that character starts an
     *       English segment.</li>
     *   <li>In English mode everything is collected. A Chinese character outside a
     *       bracket ends the segment; a Chinese character inside a bracket is collected
     *       and stripped when the segment is closed, so that the English part of a
     *       bracket such as "(数量 QTY 15 EA)" survives.</li>
     * </ul>
     * The joined segments are finally passed through {@link #removeInclude(String, Integer)}
     * to drop brackets that are left with punctuation only. The result is not trimmed.
     *
     * @param text mixed Chinese/English text, may be {@code null} or blank
     * @return the extracted English text, or an empty string for blank input
     */
    public static String enToSegments(String text) {
        // The id only correlates the debug lines of one call.
        String uuid = UUID.randomUUID().toString();
        log.debug("uuid:{} enToSegments request text:{}", uuid, text);
        if (StrUtil.isBlank(text)) {
            return StrUtil.EMPTY;
        }
        text = text.trim();

        List<String> segments = new ArrayList<>();
        StringBuilder segment = new StringBuilder();
        // true while no bracket is open
        boolean includeEnd = true;
        boolean cnMode = false;

        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            boolean chinese = isChinese(c);
            boolean symbol = isSymbol(c);
            boolean leftInclude = isLeftInclude(c);
            boolean rightInclude = isRightInclude(c);

            if (i == 0 && (chinese || symbol || leftInclude || rightInclude)) {
                cnMode = true;
            }

            if (chinese) {
                if (cnMode) {
                    continue;
                }
                if (!includeEnd) {
                    // Inside an English bracket: keep for now, removeChinese strips it on flush.
                    segment.append(c);
                } else {
                    flushSegment(segment, segments);
                    cnMode = true;
                }
            } else if (leftInclude || rightInclude) {
                if (!cnMode) {
                    segment.append(c);
                }
                // Exactly one of the two flags is set here: an opening bracket leaves the
                // bracket unfinished, a closing bracket finishes it.
                includeEnd = rightInclude;
            } else if (symbol) {
                if (!cnMode) {
                    segment.append(c);
                }
            } else {
                if (cnMode && !includeEnd) {
                    // English inside a bracket that belongs to the Chinese text.
                    continue;
                }
                segment.append(c);
                cnMode = false;
            }
        }
        // One flush after the loop covers every way the text can end, including a
        // trailing Chinese character inside an unclosed bracket, and never emits an
        // empty segment.
        flushSegment(segment, segments);

        String content = String.join(" ", segments);
        log.debug("uuid:{} enToSegments return text:{}", uuid, content);
        content = removeInclude(content, 0);
        log.debug("uuid:{} removeInclude return text:{}", uuid, content);
        return content;
    }

    /** Moves the pending segment, with Chinese characters stripped, into {@code segments}. */
    private static void flushSegment(StringBuilder segment, List<String> segments) {
        if (segment.length() > 0) {
            segments.add(removeChinese(segment.toString()));
            segment.setLength(0);
        }
    }

    /**
     * Tells whether {@code c} lies in the common CJK ideograph range U+4E00..U+9FA5.
     * Extension blocks are not covered.
     */
    public static boolean isChinese(char c) {
        return c >= '\u4e00' && c <= '\u9fa5';
    }

    /** Tells whether {@code c} is a digit, a space or one of the known punctuation marks. */
    public static boolean isSymbol(char c) {
        return SYMBOLS.indexOf(c) >= 0;
    }

    /** Tells whether {@code c} is an opening bracket, ASCII or full-width. */
    public static boolean isLeftInclude(char c) {
        return c == EN_LEFT_INCLUDE || c == CN_LEFT_INCLUDE;
    }

    /** Tells whether {@code c} is a closing bracket, ASCII or full-width. */
    public static boolean isRightInclude(char c) {
        return c == EN_RIGHT_INCLUDE || c == CN_RIGHT_INCLUDE;
    }

    /**
     * Removes every character for which {@link #isChinese(char)} is true.
     *
     * @param text input text, may be {@code null} or blank
     * @return the text without Chinese characters; blank input is returned unchanged
     */
    public static String removeChinese(String text) {
        if (StrUtil.isBlank(text)) {
            return text;
        }
        StringBuilder kept = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (!isChinese(c)) {
                kept.append(c);
            }
        }
        return kept.toString();
    }

    /**
     * Replaces bracket pairs that contain nothing but punctuation, spaces or further
     * brackets with a single space, e.g. "GOUGES(a)(, , ):" becomes "GOUGES(a) :".
     *
     * <p>A pair is an opening bracket and the first closing bracket after it; nesting is
     * not balanced. Pairs that contain a letter or a digit are left alone. A closing
     * bracket that precedes the next opening bracket is ignored instead of ending the
     * scan.
     *
     * @param text      text to clean, may be {@code null} or blank
     * @param fromIndex index at which the search for brackets starts; {@code null} is
     *                  treated as 0
     * @return the cleaned text; blank input is returned unchanged
     */
    public static String removeInclude(String text, Integer fromIndex) {
        if (StrUtil.isBlank(text)) {
            return text;
        }
        String result = text;
        int from = fromIndex == null ? 0 : fromIndex;
        while (true) {
            int start = indexOfEither(result, EN_LEFT_INCLUDE, CN_LEFT_INCLUDE, from);
            if (start < 0) {
                break;
            }
            int end = indexOfEither(result, EN_RIGHT_INCLUDE, CN_RIGHT_INCLUDE, start + 1);
            if (end < 0) {
                break;
            }
            if (isFillerOnly(result, start + 1, end)) {
                result = result.substring(0, start) + " " + result.substring(end + 1);
                // The pair collapsed into one space at 'start'; resume from there.
                from = start;
            } else {
                from = end + 1;
            }
        }
        return result;
    }

    /** Returns the first index at or after {@code from} of either character, or -1 if neither occurs. */
    private static int indexOfEither(String text, char first, char second, int from) {
        int firstIndex = text.indexOf(first, from);
        int secondIndex = text.indexOf(second, from);
        if (firstIndex < 0) {
            return secondIndex;
        }
        if (secondIndex < 0) {
            return firstIndex;
        }
        return Math.min(firstIndex, secondIndex);
    }

    /** Tells whether {@code text[begin, end)} consists solely of filler characters (true when empty). */
    private static boolean isFillerOnly(String text, int begin, int end) {
        for (int i = begin; i < end; i++) {
            if (BRACKET_FILLER.indexOf(text.charAt(i)) < 0) {
                return false;
            }
        }
        return true;
    }
}

浙公网安备 33010602011771号