[转]PocketSphinx语音识别系统的编程

PocketSphinx语音识别系统的编程

zouxy09@qq.com

http://blog.csdn.net/zouxy09

关于语音识别的基础知识和sphinx的知识，具体可以参考我的另外的博文：

语音识别的基础知识与CMUsphinx介绍：

http://blog.csdn.net/zouxy09/article/details/7941585

PocketSphinx语音识别系统的编译、安装和使用：

http://blog.csdn.net/zouxy09/article/details/7942784

PocketSphinx语音识别系统语言模型的训练和声学模型的改进:

http://blog.csdn.net/zouxy09/article/details/7949126

PocketSphinx语音识别系统声学模型的训练与使用

http://blog.csdn.net/zouxy09/article/details/7962382

本文主要实现PocketSphinx语音识别系统的编程使用，主要分两个方面，一个是编程解码语音文件（主要参考CMU sphinx的wiki：http://cmusphinx.sourceforge.net/wiki/），二是编程识别麦克风的语音（主要参考PocketSphinx源码包里的pocketsphinx.c文件）。对于后面加入我的人机交互系统的话，采用的是识别麦克风的语音的编程，具体使用时还需要对其进行精简。

#include <stdio.h>

#include <pocketsphinx.h>

/*
 * Decode the raw audio file goforward.raw twice with PocketSphinx and print
 * the best hypothesis after each pass:
 *   1. in one call, letting ps_decode_raw() read the whole stream;
 *   2. block by block from memory, using ps_start_utt(), ps_process_raw()
 *      and ps_end_utt().
 *
 * MODELDIR must be defined on the compiler command line (see the build
 * instructions). It names the directory that holds the acoustic model, the
 * language model and the dictionary.
 *
 * Returns 0 on success and 1 on any failure.
 */
int main(int argc, char *argv[])
{
    ps_decoder_t *ps;
    cmd_ln_t *config;
    FILE *fh;
    char const *hyp, *uttid;
    int16 buf[512];
    size_t nsamp;
    int rv;
    int32 score;
    int status = 1;

    (void)argc;
    (void)argv;

    /*
     * 1. Create the configuration object.
     * The first argument of cmd_ln_init() is a previous configuration to
     * update; NULL because this is the initial creation. The second is the
     * array of argument definitions; ps_args() returns the standard set.
     * The third is the strictness flag: with TRUE, duplicate or unknown
     * arguments make parsing fail.
     */
    config = cmd_ln_init(NULL, ps_args(), TRUE,
                 "-hmm", MODELDIR "/hmm/en_US/hub4wsj_sc_8k",
                 "-lm", MODELDIR "/lm/en/turtle.DMP",
                 "-dict", MODELDIR "/lm/en/turtle.dic",
                 NULL);
    if (config == NULL)
        return 1;

    /* 2. Initialise the decoder. */
    ps = ps_init(config);
    if (ps == NULL)
        return 1;

    /*
     * 3. Open the audio file. goforward.raw ships with PocketSphinx under
     * test/data/ and must be copied into the current directory. It contains
     * robot-control phrases such as "go forward ten meters".
     */
    fh = fopen("goforward.raw", "rb");
    if (fh == NULL) {
        perror("Failed to open goforward.raw");
        goto out_free_ps;
    }

    /* 4. Decode the whole stream in a single call. */
    rv = ps_decode_raw(ps, fh, NULL, -1);
    if (rv < 0)
        goto out_close;

    /* 5. Fetch the most probable word string (the hypothesis). */
    hyp = ps_get_hyp(ps, &score, &uttid);
    if (hyp == NULL)
        goto out_close;
    printf("Recognized: %s\n", hyp);

    /*
     * Decode the same file again, this time feeding the samples from a
     * memory buffer. The utterance is started exactly once, before the
     * read loop.
     */
    if (fseek(fh, 0, SEEK_SET) != 0)
        goto out_close;

    rv = ps_start_utt(ps, NULL);
    if (rv < 0)
        goto out_close;

    /*
     * Read up to 512 samples at a time and hand them to the decoder.
     * Looping on the fread() count (rather than on feof()) also terminates
     * when a read error occurs.
     */
    while ((nsamp = fread(buf, sizeof buf[0],
                          sizeof buf / sizeof buf[0], fh)) > 0) {
        rv = ps_process_raw(ps, buf, nsamp, FALSE, FALSE);
        if (rv < 0)
            goto out_close;
    }

    /* Mark the end of the utterance. */
    rv = ps_end_utt(ps);
    if (rv < 0)
        goto out_close;

    /* Retrieve the hypothesis exactly as before. */
    hyp = ps_get_hyp(ps, &score, &uttid);
    if (hyp == NULL)
        goto out_close;
    printf("Recognized: %s\n", hyp);

    status = 0;

    /*
     * 6. Clean up: release the decoder returned by ps_init(). Per the
     * original author's note, the configuration object is not freed
     * separately.
     */
out_close:
    fclose(fh);
out_free_ps:
    ps_free(ps);
    return status;
}

2、编译：

编译方法：

gcc -o test_ps test_ps.c \
    -DMODELDIR=\"`pkg-config --variable=modeldir pocketsphinx`\" \
    `pkg-config --cflags --libs pocketsphinx sphinxbase`

//gcc的-D选项用于指定宏定义，如-Dmacro=defn相当于C语言中的#define macro defn。那么上面就表示在test_ps.c文件中，新加入一个宏定义：

#define MODELDIR "/usr/local/share/pocketsphinx/model"（引号中的路径即`pkg-config --variable=modeldir pocketsphinx`命令的输出）

\表示转义符，把"号转义。

这么做是为什么呢？因为程序中需要指定MODELDIR这个变量，但是因为不同的使用者，这个变量不一样，没办法指定死一个路径，所以只能放在编译时，让用户去根据自己的情况来指定。

pkg-config工具可以获得一个库的编译和连接等信息；

#pkg-config --cflags --libs pocketsphinx sphinxbase

显示：

-I/usr/local/include/sphinxbase -I/usr/local/include/pocketsphinx

-L/usr/local/lib -lpocketsphinx -lsphinxbase -lsphinxad

#pkg-config --variable=modeldir pocketsphinx

显示结果输出：/usr/local/share/pocketsphinx/model

二、编程解码麦克风的录音

1、编程

麦克风录音数据的获得主要是用sphinxbase封装了alsa的接口来实现。

  1 #include <stdio.h>
  2 #include <string.h>
  3 #include <sys/types.h>
  4 #include <sys/time.h>
  5 #include <signal.h>
  6 #include <setjmp.h>
  7 
  8 #include <sphinxbase/err.h>
  9 //generic live audio interface for recording and playback
 10 #include <sphinxbase/ad.h>
 11 #include <sphinxbase/cont_ad.h>
 12 
 13 #include "pocketsphinx.h"
 14 
 15 static ps_decoder_t *ps;
 16 static cmd_ln_t *config;
 17 
 18 static void print_word_times(int32 start)
 19 {
 20     ps_seg_t *iter = ps_seg_iter(ps, NULL);
 21     while (iter != NULL) 
 22     {
 23         int32 sf, ef, pprob;
 24         float conf;
 25         
 26         ps_seg_frames (iter, &sf, &ef);
 27         pprob = ps_seg_prob (iter, NULL, NULL, NULL);
 28         conf = logmath_exp(ps_get_logmath(ps), pprob);
 29         printf ("%s %f %f %f\n", ps_seg_word (iter), (sf + start) / 100.0, (ef + start) / 100.0, conf);
 30         iter = ps_seg_next (iter);
 31     }
 32 }
 33 
 34 /* Sleep for specified msec */
 35 static void sleep_msec(int32 ms)
 36 {
 37     struct timeval tmo;
 38 
 39     tmo.tv_sec = 0;
 40     tmo.tv_usec = ms * 1000;
 41 
 42     select(0, NULL, NULL, NULL, &tmo);
 43 }
 44 
 45 /*
 46  * Main utterance processing loop:
 47  *     for (;;) {
 48  *        wait for start of next utterance;
 49  *        decode utterance until silence of at least 1 sec observed;
 50  *        print utterance result;
 51  *     }
 52  */
 53 static void recognize_from_microphone()
 54 {
 55     ad_rec_t *ad;
 56     int16 adbuf[4096];
 57     int32 k, ts, rem;
 58     char const *hyp;
 59     char const *uttid;
 60     cont_ad_t *cont;
 61     char word[256];
 62 
 63     if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"),
 64                           (int)cmd_ln_float32_r(config, "-samprate"))) == NULL)
 65         E_FATAL("Failed top open audio device\n");
 66 
 67     /* Initialize continuous listening module */
 68     if ((cont = cont_ad_init(ad, ad_read)) == NULL)
 69         E_FATAL("Failed to initialize voice activity detection\n");
 70     if (ad_start_rec(ad) < 0)
 71         E_FATAL("Failed to start recording\n");
 72     if (cont_ad_calib(cont) < 0)
 73         E_FATAL("Failed to calibrate voice activity detection\n");
 74 
 75     for (;;) {
 76         /* Indicate listening for next utterance */
 77         printf("READY....\n");
 78         fflush(stdout);
 79         fflush(stderr);
 80 
 81         /* Wait data for next utterance */
 82         while ((k = cont_ad_read(cont, adbuf, 4096)) == 0)
 83             sleep_msec(100);
 84 
 85         if (k < 0)
 86             E_FATAL("Failed to read audio\n");
 87 
 88         /*
 89          * Non-zero amount of data received; start recognition of new utterance.
 90          * NULL argument to uttproc_begin_utt => automatic generation of utterance-id.
 91          */
 92         if (ps_start_utt(ps, NULL) < 0)
 93             E_FATAL("Failed to start utterance\n");
 94         ps_process_raw(ps, adbuf, k, FALSE, FALSE);
 95         printf("Listening...\n");
 96         fflush(stdout);
 97 
 98         /* Note timestamp for this first block of data */
 99         ts = cont->read_ts;
100 
101         /* Decode utterance until end (marked by a "long" silence, >1sec) */
102         for (;;) {
103             /* Read non-silence audio data, if any, from continuous listening module */
104             if ((k = cont_ad_read(cont, adbuf, 4096)) < 0)
105                 E_FATAL("Failed to read audio\n");
106             if (k == 0) {
107                 /*
108                  * No speech data available; check current timestamp with most recent
109                  * speech to see if more than 1 sec elapsed.  If so, end of utterance.
110                  */
111                 if ((cont->read_ts - ts) > DEFAULT_SAMPLES_PER_SEC)
112                     break;
113             }
114             else {
115                 /* New speech data received; note current timestamp */
116                 ts = cont->read_ts;
117             }
118 
119             /*
120              * Decode whatever data was read above.
121              */
122             rem = ps_process_raw(ps, adbuf, k, FALSE, FALSE);
123 
124             /* If no work to be done, sleep a bit */
125             if ((rem == 0) && (k == 0))
126                 sleep_msec(20);
127         }
128 
129         /*
130          * Utterance ended; flush any accumulated, unprocessed A/D data and stop
131          * listening until current utterance completely decoded
132          */
133         ad_stop_rec(ad);
134         while (ad_read(ad, adbuf, 4096) >= 0);
135         cont_ad_reset(cont);
136 
137         printf("Stopped listening, please wait...\n");
138         fflush(stdout);
139         /* Finish decoding, obtain and print result */
140         ps_end_utt(ps);
141         hyp = ps_get_hyp(ps, NULL, &uttid);
142         printf("%s: %s\n", uttid, hyp);
143         fflush(stdout);
144 
145         /* Exit if the first word spoken was GOODBYE */
146         if (hyp) {
147             sscanf(hyp, "%s", word);
148             if (strcmp(word, "goodbye") == 0)
149                 break;
150         }
151 
152         /* Resume A/D recording for next utterance */
153         if (ad_start_rec(ad) < 0)
154             E_FATAL("Failed to start recording\n");
155     }
156 
157     cont_ad_close(cont);
158     ad_close(ad);
159 }
160 
161 static jmp_buf jbuf;
162 static void sighandler(int signo)
163 {
164     longjmp(jbuf, 1);
165 }
166 
167 int main(int argc, char *argv[])
168 {
169     
170     config = cmd_ln_init(NULL, ps_args(), TRUE,
171                  "-hmm", MODELDIR "/hmm/en_US/hub4wsj_sc_8k",
172                  "-lm", MODELDIR "/lm/en/turtle.DMP",
173                  "-dict", MODELDIR "/lm/en/turtle.dic",
174                  NULL);
175     if (config == NULL)
176         return 1;
177     
178     ps = ps_init(config);
179     if (ps == NULL)
180         return 1;
181 
182     signal(SIGINT, &sighandler);
183      if (setjmp(jbuf) == 0) 
184         recognize_from_microphone();
185     
186         ps_free(ps);
187     return 0;
188 }

2、编译

和1.2一样。

至于说后面把PocketSphinx语音识别系统加入我的人机交互系统这个阶段，因为感觉这个系统本身的识别率不是很高，自己做了适应和重新训练声学和语言模型后，提升还是有限，暂时实用性还不是很强，所以暂时搁置下，看能不能通过其他方法去改进目前的状态。希望有牛人指导下。另外，由于开学了，需要上课，所以后续的进程可能会稍微减慢，不过依然期待各位多多交流！呵呵

posted @ 2014-03-29 23:50 woshijpfgg 阅读(1289) 评论(0) 收藏举报

刷新页面返回顶部

woshijpfgg

[转]PocketSphinx语音识别系统的编程

公告