TTS 合成语音

1. Python 脚本：
import sys
import os
import logging
import torch
import re
from TTS.utils.radam import RAdam
from collections import defaultdict
from TTS.api import TTS
from pydub import AudioSegment
from pydub.effects import normalize
import io

# Force UTF-8 on the standard streams so Chinese text can be written even when
# the console's default encoding is something else. reconfigure() switches the
# encoding in place instead of wrapping the raw buffer in a second
# TextIOWrapper, and the hasattr guard skips streams that cannot be
# reconfigured (e.g. None, or a replacement object without that method).
for _std_stream in (sys.stdout, sys.stderr):
    if hasattr(_std_stream, "reconfigure"):
        _std_stream.reconfigure(encoding='utf-8')

# Register the classes that torch's restricted (safe) unpickling is allowed to
# load; presumably these appear in the TTS model checkpoint — confirm against
# the checkpoint if this list needs to change.
torch.serialization.add_safe_globals([RAdam, defaultdict, dict])


class HybridTTS:
    """Mandarin text-to-speech helper built on the ``TTS`` package.

    Short inputs are synthesized in a single pass. Longer inputs (see
    ``should_chunk``) are split into chunks, synthesized one by one and joined
    with a short crossfade. The model is loaded lazily on first use and then
    reused for every following segment.
    """

    # A capital Latin letter followed by digits and an optional "次".
    _TRAIN_NUMBER_RE = re.compile(r'([A-Z])(\d+)(次)?')
    # Anything NOT matched by this whitelist is stripped by clean_text().
    _DISALLOWED_RE = re.compile(r'[^\u4e00-\u9fa5，。！？、；："\'（）《》【】\s0-9年月日点分秒]')
    # Sentence terminators; the capture group keeps them in the split result.
    _SENTENCE_SPLIT_RE = re.compile(r'([。！？\n])')

    def __init__(self):
        # Cached model instance; populated by the first initialize_tts() call.
        self._tts = None
        self.setup_logging()
        self.setup_config()

    def setup_logging(self):
        """Configure root logging to ``tts_hybrid.log`` and to the console."""

        class UnicodeStreamHandler(logging.StreamHandler):
            """Stream handler that degrades gracefully on unencodable text."""

            def emit(self, record):
                try:
                    msg = self.format(record)
                    stream = self.stream
                    try:
                        stream.write(msg + self.terminator)
                    except UnicodeEncodeError:
                        # The stream cannot represent some characters:
                        # round-trip through its own encoding with
                        # errors="replace" so the retry cannot fail the
                        # same way again.
                        encoding = getattr(stream, "encoding", None) or "utf-8"
                        safe_msg = msg.encode(encoding, errors="replace").decode(encoding)
                        stream.write(safe_msg + self.terminator)
                    self.flush()
                except Exception:
                    self.handleError(record)

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler("tts_hybrid.log", encoding='utf-8'),
                UnicodeStreamHandler()
            ]
        )

    def setup_config(self):
        """Populate ``self.config`` with the tunable parameters."""
        self.config = {
            # Thresholds for switching between single-pass and chunked mode
            "max_direct_length": 120,
            "min_chunk_length": 20,

            # Chunking parameters
            "chunk_size": 50,
            "crossfade_ms": 50,

            # Model parameters (copied onto the model config when present)
            "max_decoder_steps": 5000,
            "gate_threshold": 0.6,

            # Audio parameters
            "sample_rate": 22050,
            "bitrate": "192k",

            # Train-number prefix letter -> Chinese character replacement
            "train_number_map": {
                'G': '高', 'D': '动', 'C': '城',
                'K': '快', 'T': '特', 'Z': '直'
            }
        }

    def convert_train_numbers(self, text: str) -> str:
        """Replace the prefix letter of train numbers with a Chinese character.

        Matches a capital letter followed by digits and an optional "次".
        Letters missing from ``train_number_map`` are left unchanged.

        Args:
            text: Raw input text.

        Returns:
            The text with mapped prefix letters substituted.
        """
        prefix_map = self.config['train_number_map']

        def replace_match(match):
            letter, number = match.group(1), match.group(2)
            suffix = match.group(3) or ''
            return f"{prefix_map.get(letter, letter)}{number}{suffix}"

        return self._TRAIN_NUMBER_RE.sub(replace_match, text)

    def clean_text(self, text: str) -> str:
        """Normalize text before synthesis.

        Converts train numbers, strips every character outside the whitelist
        (CJK ideographs, common Chinese punctuation, whitespace, digits),
        applies the fixed pronunciation substitutions, and guarantees that the
        result ends with one of "，。！？".

        Args:
            text: Raw input text.

        Returns:
            The cleaned text; never empty (at minimum "。").
        """
        # Train numbers first: the whitelist below would drop their letters.
        text = self.convert_train_numbers(text)

        cleaned = self._DISALLOWED_RE.sub('', text)

        # Pronunciation workarounds for polyphonic characters.
        cleaned = cleaned.replace("同行", "同xing")
        cleaned = cleaned.replace("长时间", "常时间")

        # Make sure the text ends with a punctuation mark.
        if not cleaned or cleaned[-1] not in '，。！？':
            cleaned += '。'

        return cleaned

    def should_chunk(self, text: str) -> bool:
        """Return True when the text should be synthesized in chunks.

        That is the case when it is longer than ``max_direct_length``,
        contains a blank line, or has more than three "。".
        """
        return (len(text) > self.config["max_direct_length"] or
                '\n\n' in text or
                text.count('。') > 3)

    def split_text(self, text: str) -> list:
        """Split text into chunks of at most ``chunk_size`` characters.

        Text that ``should_chunk`` rejects is returned as a single-item list.
        Otherwise the text is split on sentence terminators and the pieces are
        packed greedily. A piece longer than ``chunk_size`` is cut after the
        last "，" inside the window, or exactly at ``chunk_size`` when no comma
        lies at or beyond ``min_chunk_length``.

        Args:
            text: Cleaned text.

        Returns:
            List of chunk strings whose concatenation equals ``text``.
        """
        if not self.should_chunk(text):
            return [text]

        limit = self.config["chunk_size"]
        min_len = self.config["min_chunk_length"]
        chunks = []
        current = ""

        for segment in self._SENTENCE_SPLIT_RE.split(text):
            if not segment:
                continue

            if len(current) + len(segment) <= limit:
                current += segment
                continue

            if current:
                chunks.append(current)

            while len(segment) > limit:
                comma_at = segment.rfind("，", 0, limit)
                if comma_at < min_len:
                    # No usable comma: hard cut at exactly `limit` characters
                    # so the chunk never exceeds the configured size.
                    cut = limit
                else:
                    # Keep the comma with the text that precedes it.
                    cut = comma_at + 1
                chunks.append(segment[:cut])
                segment = segment[cut:]

            current = segment

        if current:
            chunks.append(current)

        return chunks

    def initialize_tts(self):
        """Return the TTS model, loading and warming it up on first use.

        The loaded model is cached on the instance so that later calls (one
        per synthesized segment) do not reload it.

        Returns:
            The ``TTS`` instance.

        Raises:
            Exception: Whatever loading or warm-up raised, after logging it.
        """
        cached = getattr(self, "_tts", None)
        if cached is not None:
            return cached

        try:
            with torch.serialization.safe_globals([RAdam, defaultdict, dict]):
                tts = TTS(
                    model_name="tts_models/zh-CN/baker/tacotron2-DDC-GST",
                    progress_bar=False
                )

                if hasattr(tts.synthesizer, 'tts_config'):
                    config = tts.synthesizer.tts_config
                    config.max_decoder_steps = self.config["max_decoder_steps"]
                    config.gate_threshold = self.config["gate_threshold"]

                # Forced warm-up: synthesize a short text into the null device.
                with open(os.devnull, 'w', encoding='utf-8') as f:
                    tts.tts_to_file(text="预热", file_path=f.name)
        except Exception as e:
            logging.error(f"模型初始化失败: {str(e)}")
            raise

        # Cache only after a fully successful load + warm-up.
        self._tts = tts
        return tts

    def process_segment(self, text: str, output_path: str) -> bool:
        """Synthesize one piece of text into a normalized WAV file.

        Audio is first written to a temporary file next to ``output_path``;
        if that file is larger than 1024 bytes it is normalized and exported
        to ``output_path``. The temporary file is always removed.

        Args:
            text: Text to synthesize.
            output_path: Destination WAV path.

        Returns:
            True on success; False when synthesis raised (a warning is
            logged) or the produced file was too small.
        """
        temp_path = f"{output_path}_temp_{os.getpid()}.wav"
        try:
            tts = self.initialize_tts()
            tts.tts_to_file(
                text=text,
                file_path=temp_path,
                length_scale=0.9,  # original author's note: slower, more stable (not verified here)
                speed=1.1  # original author's note: compensates the speed
            )

            if os.path.getsize(temp_path) > 1024:
                audio = normalize(AudioSegment.from_file(temp_path))
                audio.export(output_path, format="wav")
                return True
            return False
        except Exception as e:
            logging.warning(f"分段处理失败: {str(e)}")
            return False
        finally:
            if os.path.exists(temp_path):
                os.remove(temp_path)

    def text_to_speech(self, text: str, output_path: str) -> None:
        """Clean the text and synthesize it to ``output_path``.

        Text that does not need chunking is tried in a single pass first. If
        that fails, or the text needs chunking, each chunk is synthesized to a
        temporary part file and appended to the result with a crossfade.
        Chunks that fail are skipped with a warning.

        Any exception (including no audio being produced at all) is logged
        and terminates the process with exit status 1.

        Args:
            text: Raw input text.
            output_path: Destination WAV path; parent directories are created.
        """
        try:
            cleaned_text = self.clean_text(text)
            abs_path = os.path.abspath(output_path.replace('\\', '/'))
            os.makedirs(os.path.dirname(abs_path), exist_ok=True)

            # Routing: try the single-pass path when chunking is not needed.
            if not self.should_chunk(cleaned_text):
                if self.process_segment(cleaned_text, abs_path):
                    logging.info(f"✅ 单次模式成功: {abs_path}")
                    return

            # Chunked path.
            logging.info("启用分块模式...")
            chunks = self.split_text(cleaned_text)
            combined = AudioSegment.silent(duration=100)

            for i, chunk in enumerate(chunks):
                chunk_path = f"{abs_path}_part_{i}.wav"
                try:
                    if not self.process_segment(chunk, chunk_path):
                        logging.warning(f"✗ 分块 {i + 1}/{len(chunks)} 失败，已跳过")
                        continue

                    seg = AudioSegment.from_file(chunk_path)
                    # append() rejects a crossfade longer than either side,
                    # so clamp it for very short segments.
                    crossfade = min(self.config["crossfade_ms"], len(seg), len(combined))
                    combined = combined.append(seg, crossfade=crossfade)
                    logging.info(f"✓ 分块 {i + 1}/{len(chunks)} 成功")
                finally:
                    if os.path.exists(chunk_path):
                        os.remove(chunk_path)

            # Anything beyond the 100 ms leading silence means audio was added.
            if len(combined) > 100:
                combined.export(abs_path, format="wav")
                logging.info(f"✅ 合成完成: {abs_path}")
            else:
                raise RuntimeError("无有效音频生成")

        except Exception as e:
            logging.error(f"❌ 合成失败: {type(e).__name__} - {str(e)}", exc_info=True)
            sys.exit(1)
        finally:
            torch.cuda.empty_cache()


if __name__ == "__main__":
    # CLI entry point: exactly two positional arguments are expected,
    # the text to synthesize and the output file path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        logging.error("用法: python tts_hybrid.py <文本> <输出路径>")
        sys.exit(1)

    input_text, output_file = cli_args
    HybridTTS().text_to_speech(input_text, output_file)
posted @ 2025-03-28 17:01 sensen~||^_^|||& 阅读(52) 评论(0) 收藏举报
刷新页面返回顶部
sensen~||^_^|||&

h

TTS 合成语音

公告