快乐的果栋

2009-08-19 11:35

tts即“从文本到语音”。 TTS技术对文本文件进行实时转换，转换时间之短可以秒计算。在其特有智能语音控制器作用下，文本输出的语音音律流畅，使得听者在听取信息时感觉自然，毫无机器语音输出的冷漠与生涩感。TTS语音合成技术即将覆盖国标一、二级汉字，具有英文接口，自动识别中、英文，支持中英文混读。所有声音采用真人普通话为标准发音，实现了120-150个汉字/秒的快速语音合成，朗读速度达3-4个汉字/秒，使用户可以听到清晰悦耳的音质和连贯流畅的语调
上面的算是tts的基本概念，baidu搜到的- -！其实如果说到tts的code，很多东西网上都是有现成的，楼主可以多加利用，我以前有搜集一些，希望可以帮助到楼主
CText2Speech类的设计，其定义文件可以类似于下面
///////////////////////////////////////////////////////////////
// active speech engine
//
#include <atlbase.h>
extern CComModule _Module;
#include <atlcom.h>
#include "sapi.h"
#include <sphelper.h>

///////////////////////////////////////////////////////////////
// speech message
//
// Window message delivered to the notification window for speech-engine events.
// The expansion is parenthesized so the macro behaves as a single value when it
// appears inside a larger expression (e.g. next to * or a cast).
#define WM_TTSEVENT (WM_USER+101)

///////////////////////////////////////////////////////////////
// text-to-speech class
//
// Thin wrapper around the SAPI ISpVoice interface.
//
// Typical use: construct, call Initialize(), then use the speak / rate /
// volume / voice functions. Functions returning HRESULT forward the result of
// the underlying SAPI call; the voice functions and Initialize() additionally
// store a human-readable message that GetErrorString() returns.
class CText2Speech
{
public:
    CText2Speech();
    virtual ~CText2Speech();

    // initialize
    // hWnd: optional window that receives WM_TTSEVENT notifications.
    BOOL Initialize(HWND hWnd = NULL);
    void Destroy();

    // speak
    HRESULT Speak(const WCHAR *pwcs, DWORD dwFlags = SPF_DEFAULT);
    HRESULT Pause();
    HRESULT Resume();

    // rate
    HRESULT SetRate(long lRateAdjust);
    HRESULT GetRate(long* plRateAdjust);

    // volume
    HRESULT SetVolume(USHORT usVolume);
    HRESULT GetVolume(USHORT* pusVolume);

    // voice
    ULONG GetVoiceCount();
    // lIndex == (ULONG)-1 (the default) selects the currently set voice.
    HRESULT GetVoice(WCHAR **ppszDescription, ULONG lIndex = static_cast<ULONG>(-1));
    HRESULT SetVoice(WCHAR **ppszDescription);

    // error string
    // const so the last error can also be read through a const object/reference.
    CString GetErrorString() const
    {
        return m_sError;
    }

    // interface
    // Public because event consumers read engine events directly from it.
    CComPtr<ISpVoice> m_IpVoice;

private:
    // Message describing the most recent failure recorded by this object.
    CString m_sError;
};
文件的开始几行语句：
#include <atlbase.h>
extern CComModule _Module;
#include <atlcom.h>
#include "sapi.h"
#include <sphelper.h>
而以类库（.dll）形式封装的 C# 版本可以采用以下格式：
using System;
using System.Collections.Generic;
using System.Text;
using SpeechLib;
using System.Collections;
namespace SpeechReco
{
// Handler signature used by the SpeechVoice events (mySpeech, imgHandler).
// SpeechEvent is not defined in this listing; the code below assigns its
// CharacterPosition and Length members before raising mySpeech.
public delegate void SpeechHandler(object sender, SpeechEvent e);

public class SpeechVoice
{
      // SAPI voice object that performs the actual speaking.
      private SpVoiceClass spVoice;
      // Raised from spVoice_Word, once per word being read.
      public event SpeechHandler mySpeech;
      // Raised from spVoice_EndStream, when the text has been read to the end.
      public event SpeechHandler imgHandler;
      // Single event-args instance reused for every notification.
      private SpeechEvent evt = new SpeechEvent();
      // Playback state; the methods of this class set it to "default", "play" or "pause".
      private string status="default";
      private int roleNum = 0;// index of the voice ("role") selected for reading
      /// <summary>
      /// Creates the SAPI voice object and subscribes to its Word and EndStream events.
      /// </summary>
      public SpeechVoice()
      {
         spVoice = new SpVoiceClass();

         // Word fires for each word as it is read.
         spVoice.Word += spVoice_Word;

         // EndStream fires once the text has been read to the end.
         spVoice.EndStream += spVoice_EndStream;
      }
      //读完调用此事件
下面是tts应用到的几个函数，很有用
// 初始化和释放函数
BOOL Initialize(HWND hWnd = NULL);
void Destroy();
// 语音操作函数
HRESULT Speak(const WCHAR *pwcs, DWORD dwFlags = SPF_DEFAULT);
HRESULT Pause();
HRESULT Resume();
// 语速函数
HRESULT SetRate(long lRateAdjust);
HRESULT GetRate(long* plRateAdjust);
// 音量函数
HRESULT SetVolume(USHORT usVolume);
HRESULT GetVolume(USHORT* pusVolume);
// 语言函数
ULONG GetVoiceCount();
HRESULT GetVoice(WCHAR **ppszDescription, ULONG lIndex = -1);
HRESULT SetVoice(WCHAR **ppszDesc)；
// 获取错误信息函数
CString GetErrorString()
CText2Speech类的构造函数用于初始化Text-To-Speech引擎接口指针m_IpVoice和错误字符串；析构函数则调用释放引擎的Destroy()函数释放语音引擎，其代码如下：
// Starts with no voice interface and an empty error string; the engine itself
// is created later by Initialize().
CText2Speech::CText2Speech()
    : m_IpVoice()
    , m_sError(_T(""))
{
}
// Releases the speech engine by delegating to Destroy().
CText2Speech::~CText2Speech()
{
    this->Destroy();
}
如果运用到单词的朗读，尝试下下面的code或许有用
// EndStream handler: resets the playback state and notifies imgHandler subscribers, if any.
void spVoice_EndStream(int StreamNumber, object StreamPosition)
      {
         status = "default";

         // No subscribers: nothing to notify.
         if (imgHandler == null)
         {
            return;
         }

         imgHandler(this, evt);
      }
      //每读一个单词触发该事件， CharacterPosition表示单词在整个句子中的位置 length表示单词的长度
初始化函数Initialize首先初始化COM库，并调用CoCreateInstance方法初始化语音引擎。然后设置必须响应的引擎事件，并指定响应事件消息的窗口句柄。该窗口句柄是作为函数的参数传入的。Initialize函数的代码如下：
// Initializes COM, creates the SAPI voice, registers interest in viseme events
// and, when hWnd is a valid window, routes engine notifications to it as
// WM_TTSEVENT messages.
//
// Returns TRUE on success. On failure returns FALSE and stores a message that
// GetErrorString() reports.
BOOL CText2Speech::Initialize(HWND hWnd)
{
    HRESULT hrStep = CoInitialize(NULL);
    if (FAILED(hrStep))
    {
        m_sError = _T("Error intialization COM");
        return FALSE;
    }

    hrStep = m_IpVoice.CoCreateInstance(CLSID_SpVoice);
    if (FAILED(hrStep))
    {
        m_sError = _T("Error creating voice");
        return FALSE;
    }

    // The same viseme mask is passed for both SetInterest arguments.
    const ULONGLONG ullVisemeMask = SPFEI(SPEI_VISEME);
    hrStep = m_IpVoice->SetInterest(ullVisemeMask, ullVisemeMask);
    if (FAILED(hrStep))
    {
        m_sError = _T("Error creating interest...seriously");
        return FALSE;
    }

    // Without a valid window there is nothing to notify; initialization is done.
    if (!::IsWindow(hWnd))
    {
        return TRUE;
    }

    hrStep = m_IpVoice->SetNotifyWindowMessage(hWnd, WM_TTSEVENT, 0, 0);
    if (FAILED(hrStep))
    {
        m_sError = _T("Error setting notification window");
        return FALSE;
    }

    return TRUE;
}
语速音量之类的下面的code可以参考下
// Word handler: records the position and length of the word being read in the
// shared event object and forwards it to mySpeech subscribers.
void spVoice_Word(int StreamNumber, object StreamPosition, int CharacterPosition, int Length)
      {
         evt.CharacterPosition = CharacterPosition;
         evt.Length = Length;

         // An event with no subscribers is null; invoking it would throw
         // NullReferenceException in the middle of speech playback.
         if (mySpeech != null)
         {
            mySpeech(this, evt);
         }
      }
      // Selects the speaking voice (e.g. male or female) by its index in the
      // list of installed voices, and reports the index selected last.
      public int role
      {
         get
         {
            return roleNum;
         }
         set
         {
            ISpeechObjectTokens installedVoices = spVoice.GetVoices("", "");
            spVoice.Voice = installedVoices.Item(value);
            // NOTE(review): the description is fetched but its result is discarded;
            // looks removable - confirm it has no needed side effect.
            spVoice.Voice.GetDescription(0);
            roleNum = value;
         }
      }
      // Looks up the first installed voice.
      // NOTE(review): the token is fetched and then discarded, so the current
      // voice is not changed here - confirm what this method was meant to do.
      public void speakChinese()
      {
         ISpeechObjectTokens installedVoices = spVoice.GetVoices(string.Empty, string.Empty);
         installedVoices.Item(0);
      }
      /// <summary>
      /// Gets the current speaking rate.
      /// </summary>
      /// <returns>The value of the voice object's Rate property.</returns>
      public int getRate()
      {
         return spVoice.Rate;
      }
      /// <summary>
      /// Sets the speaking rate.
      /// </summary>
      /// <param name="rate">Value assigned to the voice object's Rate property.</param>
      public void setRate(int rate)
      {
         // Passed through unchanged; no range check is done here.
         spVoice.Rate = rate;
      }
      /// <summary>
      /// Sets the playback volume.
      /// </summary>
      /// <param name="volume">Value assigned to the voice object's Volume property.</param>
      public void setVolume(int volume)
      {
         // Passed through unchanged; no range check is done here.
         spVoice.Volume = volume;
      }
释放函数则释放语音引擎接口和COM库，其代码如下：
// Releases the voice interface and the COM initialization taken by Initialize().
//
// CoUninitialize is called only while a voice interface is held, so calling
// Destroy() on an object that was never initialized, or calling it a second
// time (explicitly and then again from the destructor), no longer issues an
// unbalanced CoUninitialize for the thread.
void CText2Speech::Destroy()
{
    if (m_IpVoice)
    {
        // Release the interface before COM is torn down.
        m_IpVoice.Release();
        CoUninitialize();
    }
}
语音、语速、音量函数都是通过m_IpVoice成员直接调用ISpVoice接口的相关方法来实现的：
// Speaks pwcs with the given flags by forwarding to ISpVoice::Speak (NULL is
// passed as the third argument) and returns that call's HRESULT.
// Dereferences m_IpVoice, which Initialize() creates.
HRESULT CText2Speech::Speak(const WCHAR *pwcs, DWORD dwFlags)
{
    const HRESULT hrSpeak = m_IpVoice->Speak(pwcs, dwFlags, NULL);
    return hrSpeak;
}

// Pauses speech by forwarding to ISpVoice::Pause and returns that call's HRESULT.
// Dereferences m_IpVoice, which Initialize() creates.
HRESULT CText2Speech::Pause()
{
    return m_IpVoice->Pause();
}

// Resumes speech by forwarding to ISpVoice::Resume and returns that call's HRESULT.
// Dereferences m_IpVoice, which Initialize() creates.
HRESULT CText2Speech::Resume()
{
    const HRESULT hrResume = m_IpVoice->Resume();
    return hrResume;
}

// rate
// Forwards lRateAdjust to ISpVoice::SetRate and returns that call's HRESULT.
// Dereferences m_IpVoice, which Initialize() creates.
HRESULT CText2Speech::SetRate(long lRateAdjust)
{
    const HRESULT hrSet = m_IpVoice->SetRate(lRateAdjust);
    return hrSet;
}

// Reads the current rate adjustment into *plRateAdjust via ISpVoice::GetRate
// and returns that call's HRESULT.
// Dereferences m_IpVoice, which Initialize() creates.
HRESULT CText2Speech::GetRate(long* plRateAdjust)
{
    const HRESULT hrGet = m_IpVoice->GetRate(plRateAdjust);
    return hrGet;
}

// volume
// Forwards usVolume to ISpVoice::SetVolume and returns that call's HRESULT.
// Dereferences m_IpVoice, which Initialize() creates.
HRESULT CText2Speech::SetVolume(USHORT usVolume)
{
    const HRESULT hrSet = m_IpVoice->SetVolume(usVolume);
    return hrSet;
}

// Reads the current volume into *pusVolume via ISpVoice::GetVolume and returns
// that call's HRESULT.
// Dereferences m_IpVoice, which Initialize() creates.
HRESULT CText2Speech::GetVolume(USHORT* pusVolume)
{
    const HRESULT hrGet = m_IpVoice->GetVolume(pusVolume);
    return hrGet;
}
暂停开始播放，可以类似于下面的code
/// <summary>
/// Pauses playback and records the "pause" state.
/// </summary>
public void pause()
      {
         spVoice.Pause();

         // State is updated only after the voice call has returned.
         status = "pause";
      }
      /// <summary>
      /// Resumes playback and records the "play" state.
      /// </summary>
      public void resume()
      {
         spVoice.Resume();

         // State is updated only after the voice call has returned.
         status = "play";
      }
语言函数的实现比较复杂。由于ISpVoice接口提供的语言函数，都只与抽象的语音语言接口ISpObjectToken相关，而我们能看到的却是语音语言的描述，比如，通过控制面板的语音程序所能见到的就是语音语言的描述。因此，笔者设计了直接对语音语言进行操作的语言函数，包括获取系统中已安装的语音语言数目，设置指定的语音语言，获取指定的语音语言描述（包括当前设定的语音语言）。它们的代码如下：
// Returns the number of voice tokens enumerated under SPCAT_VOICES.
//
// On failure stores a message for GetErrorString() and returns (ULONG)-1,
// i.e. the largest ULONG value, as the error sentinel.
ULONG CText2Speech::GetVoiceCount()
{
    // Explicit form of the "-1" the function has always returned on error.
    const ULONG ulFailed = static_cast<ULONG>(-1);

    // Enumerate the available voices
    CComPtr<IEnumSpObjectTokens> cpEnum;
    HRESULT hr = SpEnumTokens(SPCAT_VOICES, NULL, NULL, &cpEnum);
    if (FAILED(hr))
    {
        m_sError = _T("Error to enumerate voices");
        return ulFailed;
    }

    // Get the number of voices
    ULONG ulCount = ulFailed;
    hr = cpEnum->GetCount(&ulCount);
    if (FAILED(hr))
    {
        m_sError = _T("Error to get voice count");
        return ulFailed;
    }

    return ulCount;
}

// Retrieves the description string of a voice.
//
// ppszDescription: receives the description. The string is allocated by
//                  SpGetDescription; per the SAPI helper documentation the
//                  caller releases it with CoTaskMemFree.
// lIndex:          zero-based index into the voices enumerated under
//                  SPCAT_VOICES, or (ULONG)-1 for the voice currently set on
//                  m_IpVoice (which Initialize() creates).
//
// Returns the HRESULT of the failing step (with a message stored for
// GetErrorString()), E_POINTER for a null out-pointer, E_INVALIDARG for an
// index past the last voice, or the result of SpGetDescription on success.
HRESULT CText2Speech::GetVoice(WCHAR **ppszDescription, ULONG lIndex)
{
    if (ppszDescription == NULL)
    {
        m_sError = _T("Error to get voice description");
        return E_POINTER;
    }

    HRESULT hr = S_OK;
    CComPtr<ISpObjectToken> cpVoiceToken;

    if (lIndex == static_cast<ULONG>(-1))
    {
        // current voice
        hr = m_IpVoice->GetVoice(&cpVoiceToken);
        if (FAILED(hr))
        {
            m_sError = _T("Error to get current voice");
            return hr;
        }

        // Capture the result: without the assignment the check below could
        // never see a failure of SpGetDescription.
        hr = SpGetDescription(cpVoiceToken, ppszDescription);
        if (FAILED(hr))
        {
            m_sError = _T("Error to get current voice description");
            return hr;
        }
        return hr;
    }

    // Any other index: enumerate the voice list first.
    CComPtr<IEnumSpObjectTokens> cpEnum;
    hr = SpEnumTokens(SPCAT_VOICES, NULL, NULL, &cpEnum);
    if (FAILED(hr))
    {
        m_sError = _T("Error to enumerate voices");
        return hr;
    }

    // Get the number of voices
    ULONG ulCount = 0;
    hr = cpEnum->GetCount(&ulCount);
    if (FAILED(hr))
    {
        m_sError = _T("Error to voice count");
        return hr;
    }

    // range control: still asserts in debug builds, and is now also rejected
    // at run time so release builds do not walk past the end of the enumerator
    // and ask for the description of a null token.
    ASSERT(lIndex < ulCount);
    if (lIndex >= ulCount)
    {
        m_sError = _T("Error to get voice token");
        return E_INVALIDARG;
    }

    // Obtain the specified voice token directly by index.
    hr = cpEnum->Item(lIndex, &cpVoiceToken);
    if (FAILED(hr))
    {
        m_sError = _T("Error to get voice token");
        return hr;
    }

    hr = SpGetDescription(cpVoiceToken, ppszDescription);
    if (FAILED(hr))
    {
        m_sError = _T("Error to get voice description");
        return hr;
    }

    return hr;
}
// Makes the voice whose description equals *ppszDescription (compared
// case-insensitively) the current voice of m_IpVoice, which Initialize()
// creates.
//
// Returns the HRESULT of ISpVoice::SetVoice when a match is found, the HRESULT
// of the failing step otherwise, E_POINTER for a null argument, and
// SPERR_NOT_FOUND when no enumerated voice has that description. Every failure
// stores a message for GetErrorString().
HRESULT CText2Speech::SetVoice(WCHAR **ppszDescription)
{
    if (ppszDescription == NULL || *ppszDescription == NULL)
    {
        m_sError = _T("Error to set voice");
        return E_POINTER;
    }

    // Enumerate the available voices
    CComPtr<IEnumSpObjectTokens> cpEnum;
    HRESULT hr = SpEnumTokens(SPCAT_VOICES, NULL, NULL, &cpEnum);
    if (FAILED(hr))
    {
        m_sError = _T("Error to enumerate voices");
        return hr;
    }

    // Get the number of voices
    ULONG ulCount = 0;
    hr = cpEnum->GetCount(&ulCount);
    if (FAILED(hr))
    {
        m_sError = _T("Error to voice count");
        return hr;
    }

    // Walk the voices looking for the requested description.
    for (ULONG ulVisited = 0; ulVisited < ulCount; ++ulVisited)
    {
        CComPtr<ISpObjectToken> cpVoiceToken;
        hr = cpEnum->Next(1, &cpVoiceToken, NULL);
        if (FAILED(hr))
        {
            m_sError = _T("Error to voice token");
            return hr;
        }
        if (hr != S_OK)
        {
            // A success code other than S_OK means no token was delivered
            // (the enumerator ran out early); stop instead of using a null token.
            break;
        }

        WCHAR *pszCandidate = NULL;
        hr = SpGetDescription(cpVoiceToken, &pszCandidate);
        if (FAILED(hr))
        {
            m_sError = _T("Error to get voice description");
            return hr;
        }

        const bool bMatches = (_wcsicmp(pszCandidate, *ppszDescription) == 0);
        // The description was allocated for us; free it on every iteration so
        // the search does not leak one string per installed voice.
        ::CoTaskMemFree(pszCandidate);

        if (!bMatches)
        {
            continue;
        }

        hr = m_IpVoice->SetVoice(cpVoiceToken);
        if (FAILED(hr))
        {
            m_sError = _T("Error to set voice");
        }
        return hr;
    }

    // No voice carried the requested description: report it rather than
    // returning a success code with the voice left unchanged.
    m_sError = _T("Error to find matching voice");
    return SPERR_NOT_FOUND;
}

选择列表中的语音语言并设为当前的语音设置是通过响应该列表的LBN_SELCHANGE消息来实现的。其消息响应函数为：

void CReciterDlg::OnSelchangeList1()

{

CString sVoice;

int nIndex = m_ListVoices.GetCurSel();

m_ListVoices.GetText(nIndex, sVoice);

BSTR bstr = sVoice.AllocSysString();

if (FAILED(m_Text2Speech.SetVoice(&bstr)))

   AfxMessageBox(m_Text2Speech.GetErrorString());

}
如果想朗读文本，类似函数如下：
/// <summary>
/// Reads the text aloud asynchronously and sets the state to "play".
/// Null, empty or whitespace-only text is ignored.
/// </summary>
/// <param name="text">Text to read.</param>
public void speak(String text)
      {
         // Null is treated like empty text instead of throwing NullReferenceException.
         if (text == null || text.Trim().Length == 0)
         {
            return;
         }

         status = "play";
         spVoice.Speak(text, SpeechVoiceSpeakFlags.SVSFlagsAsync);
      }
      /// <summary>
      /// Reads the text aloud asynchronously without touching the playback state.
      /// Null, empty or whitespace-only text is ignored.
      /// </summary>
      /// <param name="text">Text to read.</param>
      /// <param name="status">Not used by the body; it only hides the status field inside this method.</param>
      public void speak(String text,int status)
      {
         // Null is treated like empty text instead of throwing NullReferenceException.
         if (text == null || text.Trim().Length == 0)
         {
            return;
         }

         spVoice.Speak(text, SpeechVoiceSpeakFlags.SVSFlagsAsync);
      }
      /// <summary>
      /// Stops playback: resets the state to "default" and speaks an empty
      /// string with the purge-before-speak flag.
      /// </summary>
      public void stop()
      {
         status = "default";

         SpeechVoiceSpeakFlags purgeFlag = SpeechVoiceSpeakFlags.SVSFPurgeBeforeSpeak;
         spVoice.Speak("", purgeFlag);
      }
      /// <summary>
      /// Returns the descriptions of all voices ("roles") currently reported by the voice object.
      /// </summary>
      /// <returns>An ArrayList with one description string per voice, in enumeration order.</returns>
      public ArrayList getAllRole()
      {
         // Query the voice list once; it was previously re-queried twice per
         // loop iteration (once for Count and once for Item).
         ISpeechObjectTokens voices = spVoice.GetVoices("", "");
         int voiceCount = voices.Count;

         ArrayList roleArr = new ArrayList(voiceCount);
         for (int i = 0; i < voiceCount; i++)
         {
            roleArr.Add(voices.Item(i).GetDescription(0));
         }
         return roleArr;
      }
      /// <summary>
      /// Gets the current volume.
      /// </summary>
      /// <returns>The value of the voice object's Volume property.</returns>
      public int getVolume()
      {
         int currentVolume = spVoice.Volume;
         return currentVolume;
      }
      /// <summary>
      /// Current playback state as tracked by this class ("default", "play" or "pause").
      /// </summary>
      public string Status
      {
         get
         {
            return status;
         }
      }
      /// <summary>
      /// Converts text to speech and writes it to a sound file.
      /// </summary>
      /// <param name="filename">Name of the file to save to.</param>
      /// <param name="speakstr">Text to convert.</param>
      public void saveSound(string filename, string speakstr)
      {
         SpFileStream spFileStream = new SpFileStream();
         spFileStream.Open(filename, SpeechStreamFileMode.SSFMCreateForWrite, true);
         try
         {
            // Redirect the voice output into the file and wait until all of it is rendered.
            spVoice.AudioOutputStream = spFileStream;
            spVoice.Speak(speakstr, SpeechVoiceSpeakFlags.SVSFlagsAsync);
            spVoice.WaitUntilDone(System.Threading.Timeout.Infinite);
         }
         finally
         {
            // Runs even when speaking throws, so the file is always closed and
            // the voice is never left writing into a dead stream.
            try
            {
               spFileStream.Close();
            }
            finally
            {
               spVoice.AudioOutputStream = null;
            }
         }
      }
}
}

通过CSpEvent的GetFrom函数可以获得当前的事件信息，其eEventId成员记录了事件的类型；对于SPEI_VISEME事件，可以通过Viseme()函数取得当前朗读音节（口型）的代号。数组g_iMapVisemeToImage定义了音节代号和对应嘴形位图序列号的对应关系：

// Lookup table indexed by viseme code (SP_VISEME_0 .. SP_VISEME_21).
// Each entry is the sequence number of the mouth-shape bitmap shown for that
// viseme; the trailing comments list the sounds grouped under each code.
const int g_iMapVisemeToImage[22] =
{
     0,  // SP_VISEME_0  = 0  : silence
    11,  // SP_VISEME_1       : AE, AX, AH
    11,  // SP_VISEME_2       : AA
    11,  // SP_VISEME_3       : AO
    10,  // SP_VISEME_4       : EY, EH, UH
    11,  // SP_VISEME_5       : ER
     9,  // SP_VISEME_6       : y, IY, IH, IX
     2,  // SP_VISEME_7       : w, UW
    13,  // SP_VISEME_8       : OW
     9,  // SP_VISEME_9       : AW
    12,  // SP_VISEME_10      : OY
    11,  // SP_VISEME_11      : AY
     9,  // SP_VISEME_12      : h
     3,  // SP_VISEME_13      : r
     6,  // SP_VISEME_14      : l
     7,  // SP_VISEME_15      : s, z
     8,  // SP_VISEME_16      : SH, CH, JH, ZH
     5,  // SP_VISEME_17      : TH, DH
     4,  // SP_VISEME_18      : f, v
     7,  // SP_VISEME_19      : d, t, n
     9,  // SP_VISEME_20      : k, g, NG
     1   // SP_VISEME_21      : p, b, m
};

为了响应消息WM_TTSEVENT，需要添加相应的消息响应函数：

// Message map of CReciterDlg (base class CDialog): routes the system/paint
// messages, the speak/stop/resume button clicks, the voice-list selection
// change and the custom WM_TTSEVENT message to their handlers.
BEGIN_MESSAGE_MAP(CReciterDlg, CDialog)

//{{AFX_MSG_MAP(CReciterDlg)

ON_WM_SYSCOMMAND()

ON_WM_PAINT()

ON_WM_QUERYDRAGICON()

ON_BN_CLICKED(IDC_BUTTON_SPEAK, OnButtonSpeak)

ON_LBN_SELCHANGE(IDC_LIST1, OnSelchangeList1)

ON_BN_CLICKED(IDC_BUTTON_STOP, OnButtonStop)

ON_BN_CLICKED(IDC_BUTTON_RESUME, OnButtonResume)
   //}}AFX_MSG_MAP
   // Entry placed after the AFX_MSG_MAP marker block: speech-engine
   // notifications (WM_TTSEVENT) are handled by OnMouthEvent.
   ON_MESSAGE(WM_TTSEVENT, OnMouthEvent)
END_MESSAGE_MAP()
// WM_TTSEVENT handler: drains the pending events of the voice and, for each
// viseme event, selects the matching mouth bitmap and invalidates the mouth
// rectangle so it is repainted. Always returns 0.
LRESULT CReciterDlg::OnMouthEvent(WPARAM wParam, LPARAM lParam)
{
    const int nImageCount =
        static_cast<int>(sizeof(g_iMapVisemeToImage) / sizeof(g_iMapVisemeToImage[0]));

    CSpEvent event;
    while (event.GetFrom(m_Text2Speech.m_IpVoice) == S_OK)
    {
        switch (event.eEventId)
        {
        case SPEI_VISEME:
            {
                // The viseme value comes from the speech engine; ignore
                // anything outside the table instead of reading past it.
                const int nViseme = static_cast<int>(event.Viseme());
                if (nViseme >= 0 && nViseme < nImageCount)
                {
                    m_iMouthBmp = g_iMapVisemeToImage[nViseme];
                    InvalidateRect(m_cMouthRect, false);
                }
            }
            break;

        default:
            // Other event types are not used by this dialog.
            break;
        }
    }
    return 0;
}

快乐的果栋

公告

TTS技术【转载+整理】