TTS 合成语音

1. Python 脚本:

 

import sys
import os
import logging
import torch
import re
from TTS.utils.radam import RAdam
from collections import defaultdict
from TTS.api import TTS
from pydub import AudioSegment
from pydub.effects import normalize
import io

# Force UTF-8 on the standard streams so Chinese log/console output does not
# raise UnicodeEncodeError on consoles with a legacy code page.
# reconfigure() switches the encoding in place: unlike wrapping
# sys.stdout.buffer in a brand-new TextIOWrapper it keeps the stream's
# line-buffering setting and does not fail when a stream has no `.buffer`
# (e.g. a replaced or redirected stream object).
for _stream in (sys.stdout, sys.stderr):
    if hasattr(_stream, "reconfigure"):
        _stream.reconfigure(encoding="utf-8")
del _stream

# Allow-list the classes that torch's safe checkpoint loading needs to
# unpickle for this model.
torch.serialization.add_safe_globals([RAdam, defaultdict, dict])


class HybridTTS:
    """Chinese text-to-speech driver with a direct mode and a chunked mode.

    Short texts are synthesized in one call; longer texts are split into
    chunks, synthesized one by one and concatenated with a crossfade.
    """

    # Full-width punctuation mapped onto the ASCII forms that the cleaning
    # whitelist in clean_text() keeps. Without this step, ordinary Chinese
    # punctuation would be deleted and sentence/clause boundaries lost.
    _FULLWIDTH_TO_ASCII = str.maketrans({
        ',': ',', '!': '!', '?': '?', ';': ';', ':': ':',
        '(': '(', ')': ')',
        '“': '"', '”': '"', '‘': "'", '’': "'",
    })

    # Characters that end a sentence (used for the chunking decision).
    _SENTENCE_ENDINGS = '。!?'

    # Characters at which an over-long sentence may be cut.
    _CLAUSE_MARKS = ',,、;;::'

    def __init__(self):
        # Lazily created model handle, see process_segment().
        self._tts = None
        self.setup_logging()
        self.setup_config()

    def setup_logging(self):
        """Configure root logging to a UTF-8 log file and to the console."""

        class UnicodeStreamHandler(logging.StreamHandler):
            """Stream handler that degrades gracefully on encoding errors."""

            def emit(self, record):
                try:
                    msg = self.format(record) + self.terminator
                    stream = self.stream
                    try:
                        stream.write(msg)
                    except UnicodeEncodeError:
                        # Re-encode with the stream's own encoding and
                        # substitute anything it cannot represent, so the
                        # retry cannot fail for the same reason.
                        encoding = getattr(stream, 'encoding', None) or 'utf-8'
                        stream.write(
                            msg.encode(encoding, 'replace').decode(encoding)
                        )
                    self.flush()
                except Exception:
                    self.handleError(record)

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler("tts_hybrid.log", encoding='utf-8'),
                UnicodeStreamHandler()
            ]
        )

    def setup_config(self):
        """Populate ``self.config`` with the tunable parameters."""
        self.config = {
            # Thresholds for switching between direct and chunked mode
            "max_direct_length": 120,
            "min_chunk_length": 20,

            # Chunking parameters
            "chunk_size": 50,
            "crossfade_ms": 50,

            # Model parameters
            "max_decoder_steps": 5000,
            "gate_threshold": 0.6,

            # Audio parameters
            "sample_rate": 22050,
            "bitrate": "192k",

            # Train-number prefix replacements.
            # NOTE(review): every value is an empty string, so the prefix
            # letter is simply dropped - confirm this is intended.
            "train_number_map": {
                'G': '', 'D': '', 'C': '',
                'K': '', 'T': '', 'Z': ''
            }
        }

    def convert_train_numbers(self, text: str) -> str:
        """Rewrite train numbers such as ``G123次`` using the prefix map.

        Matches an uppercase letter followed by digits and an optional
        ``次`` suffix. The letter is replaced by its entry in
        ``config["train_number_map"]``; unknown letters are kept as they are.
        """
        prefix_map = self.config['train_number_map']

        def replace_match(match):
            letter = match.group(1)
            number = match.group(2)
            suffix = match.group(3) or ''
            return f"{prefix_map.get(letter, letter)}{number}{suffix}"

        return re.sub(r'([A-Z])(\d+)(次)?', replace_match, text)

    def clean_text(self, text: str) -> str:
        """Normalize *text* into the character set fed to the model.

        Steps: rewrite train numbers, map full-width punctuation to ASCII,
        drop every character outside the whitelist, apply the pronunciation
        substitutions, and make sure the result ends with punctuation.
        """
        # Train numbers first, while their letter prefix is still present.
        text = self.convert_train_numbers(text)

        # Keep punctuation that would otherwise be stripped by the whitelist.
        text = text.translate(self._FULLWIDTH_TO_ASCII)

        # General cleaning: remove everything not on the whitelist.
        pattern = re.compile(r'[^\u4e00-\u9fa5,。!?、;:"\'()《》【】\s0-9年月日点分秒]')
        cleaned = pattern.sub('', text)

        # Pronunciation substitutions for polyphonic words.
        # NOTE(review): the first replacement inserts Latin letters after
        # the whitelist step - verify the model handles this as intended.
        cleaned = cleaned.replace("同行", "xing")
        cleaned = cleaned.replace("长时间", "常时间")

        # Surrounding whitespace would hide the real last character from
        # the punctuation check below.
        cleaned = cleaned.strip()

        # Guarantee a closing punctuation mark.
        if not cleaned or cleaned[-1] not in ',。!?':
            cleaned += '。'

        return cleaned

    def should_chunk(self, text: str) -> bool:
        """Return True when *text* should be synthesized in several chunks.

        Chunking is chosen when the text is longer than
        ``max_direct_length``, contains a blank line, or has more than
        three sentence terminators.
        """
        sentence_count = sum(text.count(mark) for mark in self._SENTENCE_ENDINGS)
        return (len(text) > self.config["max_direct_length"] or
                '\n\n' in text or
                sentence_count > 3)

    def split_text(self, text: str) -> list:
        """Split *text* into chunks of at most ``chunk_size`` characters.

        Sentences (with their terminator attached) are packed greedily into
        chunks. A sentence longer than ``chunk_size`` is cut at the last
        clause mark inside the window, or hard-cut at ``chunk_size`` when no
        mark lies at or beyond ``min_chunk_length``. Text that does not need
        chunking is returned as a single-element list.
        """
        if not self.should_chunk(text):
            return [text]

        limit = max(1, self.config["chunk_size"])
        # At least 1 so that a cut always consumes input.
        floor = max(1, self.config["min_chunk_length"])

        # re.split with a capture group yields body, terminator, body, ...
        # Re-attach each terminator to the body in front of it so a chunk
        # never starts with a stray punctuation mark.
        pieces = re.split(r'([。!?\n])', text)
        bodies = pieces[0::2]
        endings = pieces[1::2] + ['']
        sentences = [
            sentence
            for sentence in ((body + end).strip()
                             for body, end in zip(bodies, endings))
            if sentence
        ]

        chunks = []
        current = ""
        for sentence in sentences:
            if len(current) + len(sentence) <= limit:
                current += sentence
                continue

            if current:
                chunks.append(current)
                current = ""

            while len(sentence) > limit:
                # Position just after the last clause mark in the window.
                cut = max(
                    sentence.rfind(mark, 0, limit)
                    for mark in self._CLAUSE_MARKS
                ) + 1
                if cut < floor:
                    cut = limit
                chunks.append(sentence[:cut])
                sentence = sentence[cut:]

            current = sentence

        if current:
            chunks.append(current)

        return chunks

    def initialize_tts(self):
        """Load the TTS model, apply decoder settings and warm it up.

        Returns:
            The initialized ``TTS`` object.

        Raises:
            Exception: whatever model loading or the warm-up raises; the
            error is logged and re-raised.
        """
        try:
            with torch.serialization.safe_globals([RAdam, defaultdict, dict]):
                tts = TTS(
                    model_name="tts_models/zh-CN/baker/tacotron2-DDC-GST",
                    progress_bar=False
                )

            if hasattr(tts.synthesizer, 'tts_config'):
                config = tts.synthesizer.tts_config
                config.max_decoder_steps = self.config["max_decoder_steps"]
                config.gate_threshold = self.config["gate_threshold"]

            # Warm-up synthesis; the audio is discarded via the null device.
            tts.tts_to_file(text="预热", file_path=os.devnull)

            return tts
        except Exception as e:
            logging.error(f"模型初始化失败: {str(e)}")
            raise

    def process_segment(self, text: str, output_path: str) -> bool:
        """Synthesize one piece of text into a normalized WAV file.

        Args:
            text: Text to synthesize.
            output_path: Destination WAV path.

        Returns:
            True when audio was written to *output_path*, False when
            synthesis failed or produced a file of 1024 bytes or less.
        """
        temp_path = f"{output_path}_temp_{os.getpid()}.wav"
        try:
            # Load the model once and reuse it; reloading it for every
            # chunk multiplies the start-up cost by the number of chunks.
            tts = getattr(self, '_tts', None)
            if tts is None:
                tts = self._tts = self.initialize_tts()

            # NOTE(review): length_scale/speed are forwarded as given by the
            # original author - confirm this model honours both.
            tts.tts_to_file(
                text=text,
                file_path=temp_path,
                length_scale=0.9,
                speed=1.1
            )

            # A near-empty file means nothing useful was synthesized.
            if os.path.getsize(temp_path) > 1024:
                audio = normalize(AudioSegment.from_file(temp_path))
                audio.export(output_path, format="wav")
                return True
            return False
        except Exception as e:
            logging.warning(f"分段处理失败: {str(e)}")
            return False
        finally:
            if os.path.exists(temp_path):
                os.remove(temp_path)

    def text_to_speech(self, text: str, output_path: str) -> None:
        """Clean *text*, synthesize it and write a WAV file to *output_path*.

        Tries a single-pass synthesis when the text is short enough and
        falls back to chunked synthesis otherwise (or when the single pass
        fails). On any error the failure is logged and the process exits
        with status 1.
        """
        try:
            cleaned_text = self.clean_text(text)
            abs_path = os.path.abspath(output_path.replace('\\', '/'))
            os.makedirs(os.path.dirname(abs_path), exist_ok=True)

            # Direct mode for short input.
            if not self.should_chunk(cleaned_text):
                if self.process_segment(cleaned_text, abs_path):
                    logging.info(f"✅ 单次模式成功: {abs_path}")
                    return

            # Chunked mode.
            logging.info("启用分块模式...")
            chunks = self.split_text(cleaned_text)
            combined = AudioSegment.silent(duration=100)

            for i, chunk in enumerate(chunks):
                chunk_path = f"{abs_path}_part_{i}.wav"
                try:
                    if self.process_segment(chunk, chunk_path):
                        seg = AudioSegment.from_file(chunk_path)
                        # pydub rejects a crossfade longer than either
                        # side, so clamp it for very short segments.
                        fade = min(
                            self.config["crossfade_ms"],
                            len(combined),
                            len(seg),
                        )
                        combined = combined.append(seg, crossfade=fade)
                        logging.info(f"✓ 分块 {i + 1}/{len(chunks)} 成功")
                finally:
                    if os.path.exists(chunk_path):
                        os.remove(chunk_path)

            # More than the initial 100 ms of silence means real audio.
            if len(combined) > 100:
                combined.export(abs_path, format="wav")
                logging.info(f"✅ 合成完成: {abs_path}")
            else:
                raise RuntimeError("无有效音频生成")

        except Exception as e:
            logging.error(f"❌ 合成失败: {type(e).__name__} - {str(e)}", exc_info=True)
            sys.exit(1)
        finally:
            torch.cuda.empty_cache()


# Command-line entry point: tts_hybrid.py <text> <output path>
if __name__ == "__main__":
    # Exactly two arguments are required: the text and the output path.
    if len(sys.argv) != 3:
        logging.error("用法: python tts_hybrid.py <文本> <输出路径>")
        sys.exit(1)

    tts = HybridTTS()
    tts.text_to_speech(sys.argv[1], sys.argv[2])
posted @ 2025-03-28 17:01  sensen~||^_^|||&  阅读(52)  评论(0)    收藏  举报