# pp_speech_demo - po3a

import paddle
import yaml
import soundfile as sf
import threading
import pygame
import time
import os
from yacs.config import CfgNode
from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
from paddlespeech.t2s.exps.syn_utils import get_am_inference
from paddlespeech.t2s.exps.syn_utils import get_voc_inference
from paddlespeech.t2s.exps.syn_utils import run_frontend
""""
PaddleSpeech (Baidu's speech toolkit), used here to run mixed Chinese/English TTS (text-to-speech) synthesis on the OCR results and object-detection results found in a scene.
"""

class MyPP_Speech():
    """Mixed Chinese/English text-to-speech helper built on PaddleSpeech.

    Construction loads a FastSpeech2 "mix" acoustic model and a HiFiGAN
    vocoder from checkpoint directories located relative to the current
    working directory.  ``make_tts`` synthesises a sentence into
    ``./out.wav`` and plays it with pygame on a background thread;
    ``off_tts`` interrupts a playback that is still running.
    """

    def __init__(self):
        """Resolve checkpoint paths and load frontend, acoustic model and vocoder."""
        # Build paths with os.path.join so the separators are right on every
        # platform (the files themselves are unchanged).
        am_dir = os.path.join(
            "baidu_pp_speech", "download", "fastspeech2_mix_ckpt_1.2.0")
        voc_dir = os.path.join(
            "baidu_pp_speech", "download", "hifigan_csmsc_ckpt_0.1.1")

        self.phones_dict = os.path.join(am_dir, "phone_id_map.txt")
        self.am_config_file = os.path.join(am_dir, "default.yaml")
        self.am_ckpt = os.path.join(am_dir, "snapshot_iter_99200.pdz")
        self.am_stat = os.path.join(am_dir, "speech_stats.npy")
        self.speaker_dict = os.path.join(am_dir, "speaker_id_map.txt")

        self.voc_config_file = os.path.join(voc_dir, "default.yaml")
        self.voc_ckpt = os.path.join(voc_dir, "snapshot_iter_2500000.pdz")
        self.voc_stat = os.path.join(voc_dir, "feats_stats.npy")

        # Playback state: the worker thread (if any) and the flag used to ask
        # it to stop.  Only the worker thread ever talks to pygame.
        self.music_thread = None
        self._stop_event = threading.Event()

        self.frontend = MixFrontend(phone_vocab_path=self.phones_dict)
        print("frontend done!")

        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(self.am_config_file, encoding="utf-8") as f:
            self.am_config = CfgNode(yaml.safe_load(f))
        self.am_inference = get_am_inference(
            am="fastspeech2_mix",
            am_config=self.am_config,
            am_ckpt=self.am_ckpt,
            am_stat=self.am_stat,
            phones_dict=self.phones_dict,
            tones_dict=None,
            speaker_dict=self.speaker_dict)
        print("acoustic model done!")

        with open(self.voc_config_file, encoding="utf-8") as f:
            self.voc_config = CfgNode(yaml.safe_load(f))
        # NOTE(review): the tag says "aishell3" but the checkpoint directory is
        # hifigan_csmsc -- presumably only the "hifigan" prefix selects the
        # model class; confirm against get_voc_inference before renaming.
        self.voc_inference = get_voc_inference(
            voc="hifigan_aishell3",
            voc_config=self.voc_config,
            voc_ckpt=self.voc_ckpt,
            voc_stat=self.voc_stat)
        print("voc done!")

    def make_tts(self, sentence, spk_id=174):
        """Synthesise ``sentence`` to ``./out.wav`` and start playing it.

        Args:
            sentence: Text to speak (passed to the frontend with lang="mix").
            spk_id: Speaker index for the acoustic model.  Per the original
                author's note: baker 174, ljspeech 175, aishell3 0-173,
                vctk 176-282.
        """
        # Stop any running playback first so the output file can be rewritten.
        self.off_tts()
        frontend_dict = run_frontend(
            frontend=self.frontend,
            text=sentence,
            merge_sentences=False,
            get_tone_ids=False,
            lang="mix")
        phone_ids = frontend_dict['phone_ids']

        # Nothing came out of the frontend: there is no audio to write or play.
        if len(phone_ids) == 0:
            print("nothing to synthesize.")
            return

        # The speaker tensor is loop-invariant, so build it once.
        speaker = paddle.to_tensor(spk_id)
        # Inference only: skip autograd bookkeeping.
        with paddle.no_grad():
            segments = [
                self.voc_inference(self.am_inference(part_phone_ids, speaker))
                for part_phone_ids in phone_ids
            ]
        wav_all = segments[0] if len(segments) == 1 else paddle.concat(segments)
        print("infer successfully.")

        sf.write("./out.wav", wav_all.numpy(), self.am_config.fs)
        self.take_tts()

    def play_music(self, file_path="./out.wav"):
        """Play ``file_path`` with pygame, blocking until it ends or is stopped.

        Runs on the playback thread when started through ``take_tts``.  The
        mixer is always shut down on exit, even if loading or playing fails.
        """
        pygame.mixer.init()
        try:
            pygame.mixer.music.load(file_path)
            pygame.mixer.music.play()
            while pygame.mixer.music.get_busy():
                # Wait instead of spinning; wakes immediately on a stop request.
                if self._stop_event.wait(0.05):
                    break
            pygame.mixer.music.stop()
        finally:
            pygame.mixer.quit()

    def take_tts(self):
        """Start playing ``./out.wav`` on a fresh background thread."""
        self.off_tts()
        self.music_thread = threading.Thread(target=self.play_music)
        self.music_thread.start()

    def off_tts(self):
        """Stop the current playback, if any, and wait for its thread to exit.

        The worker is asked to stop through an event rather than by calling
        pygame from this thread, so the request is safe whether the worker has
        not yet initialised the mixer or has already shut it down.
        """
        worker = self.music_thread
        if worker is None:
            return
        self._stop_event.set()
        worker.join()
        # Re-arm for the next playback.
        self._stop_event.clear()
        self.music_thread = None
# posted on 2025-04-29 09:25 po3a 阅读(13) 评论(0) 收藏举报
# 刷新页面返回顶部

# 博客园 © 2004-2025 浙公网安备 33010602011771号浙ICP备2021040463号-3
# 导航