# po3a
import paddle
import yaml
import soundfile as sf
import threading
import pygame
import time
import os
from yacs.config import CfgNode
from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
from paddlespeech.t2s.exps.syn_utils import get_am_inference
from paddlespeech.t2s.exps.syn_utils import get_voc_inference
from paddlespeech.t2s.exps.syn_utils import run_frontend
"""
PaddleSpeech, Baidu's speech framework, used here for Chinese/English TTS
(text-to-speech) synthesis of the OCR results and detection results in a scene.
"""

class MyPP_Speech():
    """Mixed Chinese/English text-to-speech built on PaddleSpeech.

    The constructor loads the text frontend, the acoustic model and the
    vocoder once.  ``make_tts`` then synthesizes a sentence into
    ``./out.wav`` and plays it on a background thread; a new request (or
    ``off_tts``) interrupts whatever is currently playing.
    """

    # Seconds the player thread waits between playback-state polls.
    _POLL_INTERVAL = 0.05

    def __init__(self):
        """Load the frontend, acoustic model and vocoder from local checkpoints."""
        # NOTE: the checkpoint paths are relative to the working directory and
        # use Windows-style separators, exactly as in the original deployment.
        self.phones_dict = r"baidu_pp_speech\download\fastspeech2_mix_ckpt_1.2.0/phone_id_map.txt"
        self.am_config_file = r"baidu_pp_speech\download\fastspeech2_mix_ckpt_1.2.0\default.yaml"
        self.am_ckpt = r"baidu_pp_speech\download\fastspeech2_mix_ckpt_1.2.0\snapshot_iter_99200.pdz"
        self.am_stat = r"baidu_pp_speech\download\fastspeech2_mix_ckpt_1.2.0\speech_stats.npy"
        self.speaker_dict = r"baidu_pp_speech\download\fastspeech2_mix_ckpt_1.2.0\speaker_id_map.txt"

        self.voc_config_file = r"baidu_pp_speech\download\hifigan_csmsc_ckpt_0.1.1\default.yaml"
        self.voc_ckpt = r"baidu_pp_speech\download\hifigan_csmsc_ckpt_0.1.1\snapshot_iter_2500000.pdz"
        self.voc_stat = r"baidu_pp_speech\download\hifigan_csmsc_ckpt_0.1.1\feats_stats.npy"

        self.frontend = MixFrontend(phone_vocab_path=self.phones_dict)
        # Handle of the current playback thread (None while nothing was started).
        self.music_thread = None
        # Set by off_tts() to ask the playback thread to stop early, so that
        # every pygame call stays inside the playback thread.
        self._stop_event = threading.Event()
        print("frontend done!")

        with open(self.am_config_file, encoding="utf-8") as f:
            self.am_config = CfgNode(yaml.safe_load(f))
        self.am_inference = get_am_inference(
            am="fastspeech2_mix",
            am_config=self.am_config,
            am_ckpt=self.am_ckpt,
            am_stat=self.am_stat,
            phones_dict=self.phones_dict,
            tones_dict=None,
            speaker_dict=self.speaker_dict)
        print("acoustic model done!")

        with open(self.voc_config_file, encoding="utf-8") as f:
            self.voc_config = CfgNode(yaml.safe_load(f))
        self.voc_inference = get_voc_inference(
            voc="hifigan_aishell3",
            voc_config=self.voc_config,
            voc_ckpt=self.voc_ckpt,
            voc_stat=self.voc_stat)
        print("voc done!")

    def make_tts(self, sentence, spk_id=174):
        """Synthesize ``sentence`` to ``./out.wav`` and start playing it.

        Args:
            sentence: Text to speak (handled by the "mix" language frontend).
            spk_id: Speaker index passed to the acoustic model.  Per the
                original note: baker:174, ljspeech:175, aishell3:0~173,
                vctk:176~282.
        """
        # Stop any running playback before out.wav is overwritten below.
        self.off_tts()
        frontend_dict = run_frontend(
            frontend=self.frontend,
            text=sentence,
            merge_sentences=False,
            get_tone_ids=False,
            lang="mix")
        phone_ids = frontend_dict['phone_ids']
        if len(phone_ids) == 0:
            # Nothing to synthesize; previously this path crashed with an
            # UnboundLocalError on the never-assigned waveform accumulator.
            print("nothing to synthesize.")
            return

        # inference
        speaker = paddle.to_tensor(spk_id)  # loop-invariant, build it once
        wav_segments = []
        with paddle.no_grad():  # pure inference, no autograd bookkeeping needed
            for part_phone_ids in phone_ids:
                mel = self.am_inference(part_phone_ids, speaker)
                wav_segments.append(self.voc_inference(mel))
        wav_all = paddle.concat(wav_segments)
        print("infer successfully.")

        # The sample rate comes from the acoustic model configuration.
        sf.write("./out.wav", wav_all.numpy(), self.am_config.fs)
        self.take_tts()

    def play_music(self, file_path="./out.wav"):
        """Play ``file_path`` and block until it ends or a stop is requested.

        Intended to run on the background thread started by ``take_tts``.
        """
        pygame.mixer.init()
        try:
            pygame.mixer.music.load(file_path)
            pygame.mixer.music.play()
            while pygame.mixer.music.get_busy():
                # Event.wait doubles as the poll delay (no CPU-burning spin)
                # and returns True as soon as off_tts() requests a stop.
                if self._stop_event.wait(self._POLL_INTERVAL):
                    break
        finally:
            # Always release the audio device, even if loading/playing failed.
            pygame.mixer.music.stop()
            pygame.mixer.quit()

    def take_tts(self):
        """Start playing ``./out.wav`` on a new background thread."""
        self.off_tts()
        self.music_thread = threading.Thread(target=self.play_music)
        self.music_thread.start()

    def off_tts(self):
        """Stop the current playback, if any, and wait for its thread to exit."""
        thread = self.music_thread
        if thread is None:
            return
        if thread.is_alive():
            # Signal instead of calling pygame from this thread: the player
            # may already have shut the mixer down, which made the direct
            # pygame.mixer.music.stop() call race and raise.
            self._stop_event.set()
            thread.join()
            self._stop_event.clear()
        self.music_thread = None
# posted on 2025-04-29 09:25 by po3a | views (9) | comments (0) | bookmark | report