java中文分句工具类

需求背景

使用 TTS（文字转语音服务）能力将大模型的文本输出转为语音播报。大模型的输出为流式输出，每次输出的不一定是一个完整的句子，需要等待并拼接为完整的句子后再进行转写。分句规则（标点符号）需要可配置；对于没有标点的文本，达到指定长度时需要强制分句，强制分句长度可配置。
java开源库没找到合适的，有合适的可以评论分享。

"result":{"response":" ","tokenCount":1}}

"result":{"response":"我是一个","tokenCount":1}}

"result":{"response":"名为 Chat","tokenCount":2}}

"result":{"response":"GLM-6B 的","tokenCount":7}}

"result":{"response":"人工智能","tokenCount":1}}

"result":{"response":"助手,是基于","tokenCount":3}}

"result":{"response":"清华大学","tokenCount":1}}

代码实现

import cn.hutool.core.util.StrUtil;
import com.alibaba.fastjson2.JSON;
import lombok.extern.slf4j.Slf4j;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;


/**
 * Splits streamed Chinese text into clauses (for example before handing them to a TTS service).
 *
 * <p>Split symbols are read from the two public static lists below, which must be populated
 * by the application before {@link #optimize} is called. The lists are plain {@code ArrayList}s;
 * NOTE(review): they are not synchronized, so populate them before concurrent use.
 */
@Slf4j
public class ClauseUtils {

    // Split symbols have two priorities: sentence terminators such as the full stop are
    // first priority, weaker separators such as the comma are second priority.
    public static final List<String> SYMBOL_FIRST_LIST = new ArrayList<>();
    public static final List<String> SYMBOL_SECOND_LIST = new ArrayList<>();


    /**
     * Splits {@code waitStr + text} into clauses and appends them to {@code clauseList}.
     *
     * <p>Within the active length limit, the earliest first-priority symbol is preferred as the
     * cut position. If none is usable, the last second-priority symbol inside the limit is used.
     * If no symbol is found at all, the text is cut at the length limit. {@code firstCharLength}
     * is the limit until the first symbol-based or length-based cut has been detected;
     * afterwards {@code maxCharLength} applies.
     *
     * <p>If the final piece contains no split symbol and is shorter than the active limit, it is
     * considered incomplete: it is not added to {@code clauseList} but returned, so the caller
     * can pass it back as {@code waitStr} together with the next chunk.
     *
     * <p>This method does not throw. Any exception is logged and an empty string is returned;
     * in that case nothing is added to {@code clauseList}.
     *
     * @param waitStr         text left unprocessed by the previous call (initially empty; {@code null} is treated as empty)
     * @param text            text of the current chunk ({@code null} is treated as empty)
     * @param clauseList      queue that receives the finished clauses
     * @param firstCharLength length limit of the first clause
     * @param maxCharLength   length limit of the following clauses
     * @param uuid            random string identifying the connection, used only for logging
     * @return the trailing incomplete text to carry over to the next call, or an empty string
     */
    public static String optimize(String waitStr, String text, LinkedList<String> clauseList, Integer firstCharLength, Integer maxCharLength, String uuid) {
        log.debug("uuid:{} 待处理文本 {} 待分句文本 {}", uuid, waitStr, text);
        String remainStr = "";
        try {
            int tmpMaxCharLength = maxCharLength;
            LinkedList<String> linkedList = new LinkedList<>();
            // Split symbol lists
            List<String> symbolFirstList = SYMBOL_FIRST_LIST;
            List<String> symbolSecondList = SYMBOL_SECOND_LIST;
            // Treat null as empty so the literal "null" never ends up in the spoken text.
            text = (waitStr == null ? "" : waitStr) + (text == null ? "" : text);
            // Numeric comparison: `==` on two Integer objects would compare references.
            boolean firstTextCompleted = firstCharLength != null && firstCharLength == tmpMaxCharLength;

            // Length limit that applies to the clause currently being cut.
            int limit = tmpMaxCharLength;
            while (StrUtil.isNotEmpty(text)) {
                if (!firstTextCompleted) {
                    limit = firstCharLength;
                } else {
                    limit = tmpMaxCharLength;
                }

                // Index of the earliest first-priority symbol, capped at the last allowed position
                int firstIndex = limit - 1;
                // Number of first-priority symbols that do not occur in the text
                int firstMisCount = 0;
                int symbolLen = 1;
                boolean firstFind = false;
                // Find the smallest index among all first-priority symbols
                for (String symbol : symbolFirstList) {
                    int index = text.indexOf(symbol);
                    if (index == -1) {
                        firstMisCount++;
                        continue;
                    }

                    // Remember the length of the symbol that currently wins
                    if (index <= firstIndex) {
                        symbolLen = symbol.length();
                    }
                    firstIndex = Math.min(firstIndex, index);
                }

                if (!firstTextCompleted && (firstIndex < limit - 1
                        || (firstIndex == limit - 1 && text.length() >= limit && symbolFirstList.contains(text.substring(firstIndex, firstIndex + symbolLen)))
                        || text.length() >= limit)) {
                    firstFind = true;
                }

                // No usable first-priority symbol: fall back to second-priority symbols,
                // searching backwards from the length limit for the nearest one.
                int secondMisCount = 0;
                boolean secondFind = false;
                if (firstMisCount == symbolFirstList.size() || (firstIndex == limit - 1
                        && firstMisCount < symbolFirstList.size()
                        && !symbolFirstList.contains(text.substring(firstIndex, firstIndex + symbolLen)))) {
                    // Window of text in which a second-priority symbol is searched
                    String maxCharText;
                    if (text.length() > limit) {
                        maxCharText = text.substring(0, limit);
                    } else if (text.length() == limit) {
                        maxCharText = text.substring(0, limit - 1);
                    } else {
                        maxCharText = text;
                    }

                    int secondIndex = 0;
                    for (String symbol : symbolSecondList) {
                        int index = maxCharText.lastIndexOf(symbol);
                        if (index == -1) {
                            secondMisCount++;
                            continue;
                        }

                        // Remember the length of the symbol that currently wins
                        if (index >= secondIndex) {
                            symbolLen = symbol.length();
                        }
                        secondIndex = Math.max(secondIndex, index);
                    }

                    if (secondIndex > 0 || (secondIndex == 0 && symbolSecondList.contains(maxCharText.substring(0, symbolLen)))) {
                        firstIndex = Math.min(secondIndex, firstIndex);
                        secondFind = true;
                    }
                }

                if (!firstTextCompleted && (firstFind || secondFind)) {
                    firstTextCompleted = true;
                }

                // Cut the next clause off the front of the text
                String substring;
                if (firstMisCount == symbolFirstList.size() && secondMisCount == symbolSecondList.size()) {
                    // No symbol matched at all
                    if (text.length() > limit) {
                        // Longer than the limit: forced cut at the limit
                        substring = text.substring(0, limit);
                        text = text.substring(limit);
                    } else {
                        // Not longer than the limit: take everything
                        substring = text;
                        text = StrUtil.EMPTY;
                    }
                } else {
                    if (firstIndex == 0) {
                        // The text starts with a symbol
                        substring = text.substring(0, symbolLen);
                        text = text.substring(symbolLen);
                    } else if (firstIndex + 1 == text.length()) {
                        // The cut position is the last character: take everything
                        substring = text;
                        text = StrUtil.EMPTY;
                    } else {
                        // Cut right after the symbol (or at the forced position)
                        substring = text.substring(0, firstIndex + symbolLen);
                        text = text.substring(firstIndex + symbolLen);
                    }
                }

                if (StrUtil.isNotEmpty(substring)) {
                    linkedList.offer(substring);
                }
            }

            // Handle the last piece: peekLast() returns null for an empty list, so empty
            // input no longer ends up in the catch block as a NoSuchElementException.
            String last = linkedList.peekLast();
            if (StrUtil.isNotEmpty(last)) {
                boolean fmatch = symbolFirstList.stream().anyMatch(last::contains);
                boolean smatch = symbolSecondList.stream().anyMatch(last::contains);
                // No symbol and shorter than the limit: incomplete, carry it over to the next call
                if (!fmatch && !smatch && last.length() < limit) {
                    linkedList.removeLast();
                    remainStr = last;
                }
            }

            if (!linkedList.isEmpty()) {
                clauseList.addAll(linkedList);
            }

            log.debug("uuid:{} 分句后待处理文本 {} 分句集合 {}", uuid, remainStr, JSON.toJSONString(linkedList));
        } catch (Exception e) {
            log.error("uuid:{} ClauseUtils", uuid, e);
        }

        return remainStr;
    }

    /**
     * Small manual demo: registers the Chinese split symbols and splits one sample text.
     */
    public static void main(String[] args) {
        List<String> symbolList = SYMBOL_FIRST_LIST;
        symbolList.add("。");
        symbolList.add("？");
        symbolList.add("！");
        symbolList.add("；");

        List<String> symbolSecondList = SYMBOL_SECOND_LIST;
        symbolSecondList.add("，");
        symbolSecondList.add("、");
        symbolSecondList.add("：");
        symbolSecondList.add("——");
        symbolSecondList.add("……");
        symbolSecondList.add("《");
        symbolSecondList.add("》");
        symbolSecondList.add("“");
        symbolSecondList.add("”");
        symbolSecondList.add("‘");
        symbolSecondList.add("’");
        symbolSecondList.add("（");
        symbolSecondList.add("）");
        symbolSecondList.add("·");

        String waitStr = "";
        String text = "福田区，招聘职位包括";
        LinkedList<String> clauseList = new LinkedList<>();
        optimize(waitStr, text, clauseList, 20, 60, "_");
    }
}

posted @ 2024-08-22 14:08 meow_world 阅读(90) 评论(0) 收藏举报

刷新页面返回顶部

meow_world

java中文分句工具类

目录：

需求背景

代码实现

公告