java中文分句工具类
目录:
1.需求背景
2.代码实现
需求背景
使用tts(文字转语音服务)能力将大模型的文本输出转为语音播报。大模型的输出为流式输出,每次输出的不一定是一个完整的句子,需要等待并拼接为完整的句子后再进行语音合成。分句的规则(标点符号)需要可配置;对于没有标点的文本,超过一定长度时需要强制分句,强制分句的长度也需要可设置。
java开源库没找到合适的,有合适的可以评论分享。
"result":{"response":" ","tokenCount":1}}
"result":{"response":"我是一个","tokenCount":1}}
"result":{"response":"名为 Chat","tokenCount":2}}
"result":{"response":"GLM-6B 的","tokenCount":7}}
"result":{"response":"人工智能","tokenCount":1}}
"result":{"response":"助手,是基于","tokenCount":3}}
"result":{"response":"清华大学","tokenCount":1}}
代码实现
import cn.hutool.core.util.StrUtil;
import com.alibaba.fastjson2.JSON;
import lombok.extern.slf4j.Slf4j;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
@Slf4j
public class ClauseUtils {
//分句符号有两个优先级,像句号这种为第一优先级分句符号,逗号这种为第二优先级分句符号
public static final List<String> SYMBOL_FIRST_LIST = new ArrayList<>();
public static final List<String> SYMBOL_SECOND_LIST = new ArrayList<>();
/**
* 如果在强制分句长度内没有分句符号,则强制分句。
* 如果在强制分句长度内有分句符号,优先按照第一优先级分句符号,如果没有则按照第二优先级分句符号
*
* @param waitStr 上一次未处理的文本(初始为空)
* @param text 本次输入文本
* @param clauseList 分句后存储的队列
* @param firstCharLength 首句长度限制
* @param maxCharLength 后续分句长度限制
* @param uuid 连接内随机字符串
*/
public static String optimize(String waitStr, String text, LinkedList<String> clauseList, Integer firstCharLength, Integer maxCharLength, String uuid) {
log.debug("uuid:{} 待处理文本 {} 待分句文本 {}", uuid, waitStr, text);
String remainStr = "";
try {
int tmpMaxCharLength = maxCharLength;
LinkedList<String> linkedList = new LinkedList<>();
//分句符号数组
List<String> symbolFirstList = SYMBOL_FIRST_LIST;
List<String> symbolSecondList = SYMBOL_SECOND_LIST;
text = waitStr + text;
boolean firstTextCompleted = false;
if (firstCharLength == maxCharLength) {
firstTextCompleted = true;
}
while (StrUtil.isNotEmpty(text)) {
if (!firstTextCompleted) {
maxCharLength = firstCharLength;
} else {
maxCharLength = tmpMaxCharLength;
}
//文本中位置最前的符号索引
int firstIndex = maxCharLength - 1;
//文本中没找到符号的次数
int firstMisCount = 0;
int symbolLen = 1;
boolean firstFind = false;
//寻找符号匹配最小索引
for (String symbol : symbolFirstList) {
int index = text.indexOf(symbol);
if (index == -1) {
firstMisCount++;
continue;
}
//匹配字符长度
if (index <= firstIndex) {
symbolLen = symbol.length();
}
firstIndex = Math.min(firstIndex, index);
}
if (!firstTextCompleted && (firstIndex < maxCharLength - 1
|| (firstIndex == maxCharLength - 1 && text.length() >= maxCharLength && symbolFirstList.contains(text.substring(firstIndex, firstIndex + symbolLen)))
|| text.length() >= maxCharLength)) {
firstFind = true;
}
//完整分句符号没有匹配到按照次级分句符号匹配 从最大限制位置倒着找最近的分句符
int secondMisCount = 0;
boolean secondFind = false;
if (firstMisCount == symbolFirstList.size() || (firstIndex == maxCharLength - 1
&& firstMisCount < symbolFirstList.size()
&& !symbolFirstList.contains(text.substring(firstIndex, firstIndex + symbolLen)))) {
//寻找符号匹配最小索引
String maxCharText;
if (text.length() > maxCharLength) {
maxCharText = text.substring(0, maxCharLength);
} else if (text.length() == maxCharLength) {
maxCharText = text.substring(0, maxCharLength - 1);
} else {
maxCharText = text;
}
int secondIndex = 0;
for (String symbol : symbolSecondList) {
int index = maxCharText.lastIndexOf(symbol);
if (index == -1) {
secondMisCount++;
continue;
}
//匹配字符长度
if (index >= secondIndex) {
symbolLen = symbol.length();
}
secondIndex = Math.max(secondIndex, index);
}
if (secondIndex > 0 || (secondIndex == 0 && symbolSecondList.contains(maxCharText.substring(0, symbolLen)))) {
firstIndex = Math.min(secondIndex, firstIndex);
secondFind = true;
}
}
if (!firstTextCompleted && (firstFind || secondFind)) {
firstTextCompleted = true;
}
//开始截取文本
String substring;
//一次都没有匹配到
if (firstMisCount == symbolFirstList.size() && secondMisCount == symbolSecondList.size()) {
//文本长度大于最大限制
if (text.length() > maxCharLength) {
substring = text.substring(0, maxCharLength);
text = text.substring(maxCharLength);
} else {
//文本长度等于小于最大限制
substring = text;
text = StrUtil.EMPTY;
}
} else {
//文本第一个字符就是符号
if (firstIndex == 0) {
substring = text.substring(0, symbolLen);
text = text.substring(symbolLen);
} else {
//大于
if (firstIndex + 1 > text.length()) {
substring = text.substring(0, firstIndex + symbolLen);
text = text.substring(firstIndex + symbolLen);
} else if (firstIndex + 1 == text.length()) {
substring = text;
text = StrUtil.EMPTY;
} else {
//小于
substring = text.substring(0, firstIndex + symbolLen);
text = text.substring(firstIndex + symbolLen);
}
}
}
if (StrUtil.isNotEmpty(substring)) {
linkedList.offer(substring);
}
}
//处理队列最后串字符
String last = linkedList.getLast();
if (StrUtil.isNotEmpty(last)) {
boolean fmatch = symbolFirstList.stream().anyMatch(s -> last.contains(s));
boolean smatch = symbolSecondList.stream().anyMatch(s -> last.contains(s));
if (!fmatch && !smatch && last.length() < maxCharLength) {
linkedList.removeLast();
remainStr = last;
}
}
if (!linkedList.isEmpty()) {
clauseList.addAll(linkedList);
}
log.debug("uuid:{} 分句后待处理文本 {} 分句集合 {}", uuid, remainStr, JSON.toJSONString(linkedList));
} catch (Exception e) {
log.error("uuid:{} ClauseUtils", uuid, e);
}
return remainStr;
}
public static void main(String[] args) {
List<String> symbolList = SYMBOL_FIRST_LIST;
symbolList.add("。");
symbolList.add("?");
symbolList.add("!");
symbolList.add(";");
List<String> symbolSecondList = SYMBOL_SECOND_LIST;
symbolSecondList.add(",");
symbolSecondList.add("、");
symbolSecondList.add(":");
symbolSecondList.add("——");
symbolSecondList.add("……");
symbolSecondList.add("《");
symbolSecondList.add("》");
symbolSecondList.add("“");
symbolSecondList.add("”");
symbolSecondList.add("‘");
symbolSecondList.add("’");
symbolSecondList.add("(");
symbolSecondList.add(")");
symbolSecondList.add("·");
String waitStr = "";
String text = "福田区,招聘职位包括";
LinkedList<String> clauseList = new LinkedList<>();
optimize(waitStr, text, clauseList, 20, 60, "_");
}
}
touch fish

浙公网安备 33010602011771号