Java 提取中英文混合文本中的英文

需求背景
历史数据中,英文原文和人工翻译的中文被拼接在同一段文本里。后续改用大模型翻译,这些历史数据也需要重新翻译,所以需要先把其中的英文提取出来。

如果是单纯的把中文去掉比较简单,用正则就可以完成,如下:
原文本:所有轴承松动。ALL BEARING LOOSE. (10 OFF)
提取后文本:ALL BEARING LOOSE. (10 OFF)

现实需求还是比较复杂一些,可能中文里也包含数字、特殊符号或者括号,括号中可能会包含中英和数字等,括号中的文字是中文的一部分不需要提取。
下面是几个例子

1.原文本: 1点钟位置隔热层上有3处破洞 HOLE DAMAGE(3 OFF,DIA:0.45''X0.3'',1.85''X0.3'',0.2''X0.2'') ON INSULATION BLANKET
提取后文本:HOLE DAMAGE(3 OFF,DIA:0.45''X0.3'',1.85''X0.3'',0.2''X0.2'') ON INSULATION BLANKET
说明:这段文本中文包含了数字,1点钟、3处破洞,1和3不能被当做英文处理

2.原文本: 件号P/N:725Z3572-107 序号S/N:16011385
提取后文本:P/N:725Z3572-107 S/N:16011385
说明:这段中英混合有两段

3.原文本: MISSING T/S P/N & S/N (平移罩零件号,序列号缺失)
提取后文本:MISSING T/S P/N & S/N
说明:英文后的括号里是中文翻译,这种写法不标准,需要将括号和其中的中文一起去掉

4.原文本: 连接的紧固件磨损 ATTACHING FASTENERS WORN(数量 QTY 15 EA)
提取后文本:ATTACHING FASTENERS WORN(QTY 15 EA)
说明:将括号内的数量中文去掉,英文保留。

以上列举了几个有代表性的中英混合文本。整体来看,数字、特殊符号、括号本身无法确定属于中文还是英文,需要根据它前面的字符来推断。
因此设计时将文本处理分为中文模式和英文模式两种状态,逐字符扫描并在两种模式之间切换。这里只处理了常见情况,没有针对极端个例做特殊处理。

代码实现


import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.StrUtil;
import com.aicloud.translate.model.EnTextDTO;
import com.alibaba.fastjson2.JSON;
import lombok.extern.slf4j.Slf4j;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.UUID;
import java.util.stream.Collectors;

/**
 * Extracts the English parts of texts in which Chinese and English are mixed.
 *
 * <p>The text is scanned character by character with a two-state machine (Chinese mode and
 * English mode). Digits, punctuation and brackets are ambiguous on their own, so they inherit
 * the mode of the characters that precede them: in Chinese mode they are dropped, in English
 * mode they are kept.
 */
@Slf4j
public class EnTextUtil {

    /** Character class matching the common CJK ideograph range U+4E00..U+9FA5. */
    private static final String REGEX = "[\u4E00-\u9FA5]";

    /** {@link #REGEX} compiled once instead of on every {@link #removeChinese(String)} call. */
    private static final java.util.regex.Pattern CHINESE_PATTERN = java.util.regex.Pattern.compile(REGEX);

    /** ASCII brackets. */
    private static final char enLeftInclude = '(';
    private static final char enRightInclude = ')';

    /**
     * Fullwidth brackets (U+FF08 / U+FF09) as typed with a Chinese input method. They must differ
     * from the ASCII brackets, otherwise fullwidth brackets are treated as English letters.
     */
    private static final char cnLeftInclude = '\uFF08';
    private static final char cnRightInclude = '\uFF09';

    /**
     * Characters whose language is decided by the current mode: punctuation, digits and the space.
     * Brackets are deliberately not part of this table; they are handled separately.
     */
    private static final char[] symbolList = {
        '_', '@', '#', '$', '%', '^', '&', '*', '-', '=', '[', ']', '|', ',', '.', '/',
        '¥', '…', '\uFF0C', '。', '、', '”', ':',
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
        ' '
    };

    /**
     * Characters that carry no content inside a pair of brackets: the punctuation of
     * {@link #symbolList} without the digits, plus the brackets themselves.
     */
    private static final char[] nonNumericSymbolList = {
        '_', '@', '#', '$', '%', '^', '&', '*', '-', '=', '[', ']', '|', ',', '.', '/',
        '¥', '…', '\uFF0C', '。', '、', '”', ':',
        ' ', '(', ')', '\uFF08', '\uFF09'
    };

    static {
        log.debug("symbolList:{}", JSON.toJSONString(symbolList));
    }

    public static void main(String[] args) {
        System.out.println(enToSegments("凿痕GOUGES(a)(长, 宽 ,深 ):__________"));
        System.out.println(removeInclude("a((1))", 0));
    }

    /**
     * Extracts the English segments of {@code text} and joins them with a single space.
     *
     * <p>Rules applied while scanning:
     * <ul>
     *   <li>The scan starts in Chinese mode when the first character is Chinese, a symbol/digit or
     *       a bracket; otherwise it starts in English mode.</li>
     *   <li>Chinese mode drops everything until a letter outside of brackets switches to English
     *       mode.</li>
     *   <li>English mode keeps everything. Chinese inside an open bracket is kept temporarily and
     *       stripped when the segment is stored; Chinese outside of brackets ends the segment and
     *       switches back to Chinese mode.</li>
     * </ul>
     * Bracket pairs that are left without content are removed afterwards by
     * {@link #removeInclude(String, Integer)}, and the result is trimmed.
     *
     * @param text the mixed text, may be {@code null} or blank
     * @return the extracted English text, or an empty string when {@code text} is blank
     */
    public static String enToSegments(String text) {
        String uuid = UUID.randomUUID().toString();
        log.debug("uuid:{} enToSegments request text:{}", uuid, text);
        if (StrUtil.isBlank(text)) {
            return StrUtil.EMPTY;
        }

        String source = text.trim();
        List<String> segments = new ArrayList<>();
        // Characters of the English segment currently being collected.
        StringBuilder segment = new StringBuilder();
        // false while a left bracket has been seen without its right bracket.
        boolean bracketClosed = true;
        boolean cnMode = false;

        for (int i = 0; i < source.length(); i++) {
            char c = source.charAt(i);
            boolean chinese = isChinese(c);
            boolean symbol = isSymbol(c);
            boolean left = isLeftInclude(c);
            boolean right = isRightInclude(c);

            // Only a leading letter makes the text start in English mode.
            if (i == 0 && (chinese || symbol || left || right)) {
                cnMode = true;
            }

            if (chinese) {
                if (cnMode) {
                    continue;
                }
                if (!bracketClosed) {
                    // Chinese inside an English bracket: keep it for now so the bracket stays in
                    // one segment; flushSegment strips it.
                    segment.append(c);
                    continue;
                }
                // Chinese outside of brackets terminates the English segment.
                flushSegment(segment, segments);
                cnMode = true;
            } else if (left || right) {
                if (!cnMode) {
                    segment.append(c);
                }
                // A left bracket opens, a right bracket closes, in both modes.
                bracketClosed = right;
            } else if (symbol) {
                if (!cnMode) {
                    segment.append(c);
                }
            } else {
                if (cnMode) {
                    if (!bracketClosed) {
                        // Letters inside a bracket that belongs to Chinese text are dropped.
                        continue;
                    }
                    cnMode = false;
                }
                segment.append(c);
            }
        }
        // Store whatever is still buffered, exactly once. Flushing here instead of in every branch
        // avoids the empty extra segment (trailing space) for pure English input and no longer
        // loses a segment when the text ends inside an unclosed bracket.
        flushSegment(segment, segments);

        String content = String.join(" ", segments);
        log.debug("uuid:{} enToSegments return text:{}", uuid, content);
        content = removeInclude(content, 0).trim();
        log.debug("uuid:{} removeInclude return text:{}", uuid, content);
        return content;
    }

    /** Stores the buffered segment, with its Chinese characters removed, and clears the buffer. */
    private static void flushSegment(StringBuilder segment, List<String> segments) {
        if (segment.length() > 0) {
            segments.add(removeChinese(segment.toString()));
            segment.setLength(0);
        }
    }

    /**
     * Tells whether {@code c} is a Chinese character. Only the common ideograph range
     * U+4E00..U+9FA5 is checked; extension blocks are not covered.
     */
    public static boolean isChinese(char c) {
        return c >= '\u4e00' && c <= '\u9fa5';
    }

    /** Tells whether {@code c} is one of the mode-dependent symbols (punctuation, digit, space). */
    public static boolean isSymbol(char c) {
        return contains(symbolList, c);
    }

    /** Tells whether {@code c} is an ASCII or fullwidth left bracket. */
    public static boolean isLeftInclude(char c) {
        return enLeftInclude == c || cnLeftInclude == c;
    }

    /** Tells whether {@code c} is an ASCII or fullwidth right bracket. */
    public static boolean isRightInclude(char c) {
        return enRightInclude == c || cnRightInclude == c;
    }

    /** Linear membership test; the tables are tiny, so no set is needed. */
    private static boolean contains(char[] chars, char c) {
        for (char candidate : chars) {
            if (candidate == c) {
                return true;
            }
        }
        return false;
    }

    /**
     * Removes every Chinese character (U+4E00..U+9FA5) from {@code text}.
     *
     * @param text the text to clean, may be {@code null}
     * @return the text without Chinese characters; blank or {@code null} input is returned as is
     */
    public static String removeChinese(String text) {
        if (StrUtil.isBlank(text)) {
            return text;
        }

        return CHINESE_PATTERN.matcher(text).replaceAll("");
    }

    /**
     * Removes bracket pairs that contain nothing but punctuation or whitespace, for example the
     * {@code "(, )"} that remains after a Chinese translation was stripped out of a bracket. Each
     * removed pair is replaced by a single space; brackets with real content are left untouched.
     *
     * <p>A left bracket is paired with the first right bracket that follows it, so nesting is not
     * resolved: {@code "a((1))"} is returned unchanged.
     *
     * @param text      the text to clean, may be {@code null}
     * @param fromIndex index at which the search for brackets starts; {@code null} or a negative
     *                  value means the beginning of the text
     * @return the cleaned text; blank or {@code null} input is returned as is
     */
    public static String removeInclude(String text, Integer fromIndex) {
        if (StrUtil.isBlank(text)) {
            return text;
        }

        String result = text;
        int from = fromIndex == null ? 0 : Math.max(fromIndex, 0);
        while (true) {
            int start = indexOfEither(result, from, enLeftInclude, cnLeftInclude);
            if (start < 0) {
                break;
            }
            // Search behind the left bracket so that a stray right bracket in front of it cannot
            // end the scan early.
            int end = indexOfEither(result, start + 1, enRightInclude, cnRightInclude);
            if (end < 0) {
                break;
            }

            if (hasNoContent(result, start + 1, end)) {
                result = result.substring(0, start) + " " + result.substring(end + 1);
                // Continue at the inserted space.
                from = start;
            } else {
                // Meaningful content: keep the pair and look for the next one.
                from = end + 1;
            }
        }

        return result;
    }

    /** Returns the first index at or after {@code from} holding either character, or -1. */
    private static int indexOfEither(String text, int from, char first, char second) {
        for (int i = Math.max(from, 0); i < text.length(); i++) {
            char c = text.charAt(i);
            if (c == first || c == second) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Tells whether {@code text[begin, end)} consists only of characters listed in
     * {@link #nonNumericSymbolList}. An empty range counts as having no content.
     */
    private static boolean hasNoContent(String text, int begin, int end) {
        for (int i = begin; i < end; i++) {
            if (!contains(nonNumericSymbolList, text.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}

posted @ 2024-08-07 18:20  meow_world  阅读(101)  评论(0)    收藏  举报