UniMrcp中asr超时逻辑

环境:FreeSwitch-1.10.7  unimrcp-1.6.0  webrtc-vad

默认情况下,仅有 SPEECH_COMPLETE,才会返回识别结果,修改为 【非 no_input_timeout 状态,都会返回识别结果】

1.默认超时逻辑

  1.1 首先区分三个关键主体:

  1.  FreeSwitch:识别发起方,提供语音流和识别超时参数
  2.  recog_engine: 识别接收方,接收音频流并推送至ASR,处理超时参数
  3.     mpf: 语音活动监测,使用超时参数控制检测流程

  1.2 超时参数配置

  识别开始时,在函数demo_recog_channel_recognize中识别FS发送的超时参数

/**
 * Process RECOGNIZE request.
 *
 * The visible part of this excerpt reads the recognizer header of the request
 * and copies the timeout-related properties into the channel's activity
 * detector. Each timeout falls back to a hard-coded default when the request
 * does not carry the corresponding header property.
 * (Values are presumably milliseconds -- TODO confirm against the detector.)
 *
 * Parts of the function body are elided in this excerpt, so the handling of
 * `response` and the return value are not shown here.
 */
static apt_bool_t demo_recog_channel_recognize(mrcp_engine_channel_t *channel, mrcp_message_t *request, mrcp_message_t *response)
{
    ******************
    忽略部分无关代码
    ******************

    /* get recognizer header */
    recog_header = mrcp_resource_header_get(request);
    if(recog_header) {
        /* Start-Input-Timers: remember whether the timers are started right away */
        if(mrcp_resource_header_property_check(request,RECOGNIZER_HEADER_START_INPUT_TIMERS) == TRUE) {
            recog_channel->timers_started = recog_header->start_input_timers;
        }
        /* No-Input-Timeout: use the requested value, otherwise default to 5000 */
        if(mrcp_resource_header_property_check(request,RECOGNIZER_HEADER_NO_INPUT_TIMEOUT) == TRUE) {
            apt_log(RECOG_LOG_MARK,APT_PRIO_INFO,"recog_header->no_input_timeout [%lu]", (unsigned long)recog_header->no_input_timeout);
            mpf_activity_detector_noinput_timeout_set(recog_channel->detector,recog_header->no_input_timeout);
        }else {
            mpf_activity_detector_noinput_timeout_set(recog_channel->detector,5000);
        }

        if(mrcp_resource_header_property_check(request,RECOGNIZER_HEADER_SPEECH_COMPLETE_TIMEOUT) == TRUE) {
        /**  Time needed to go from non-silence to silence: the amount of silence that must elapse
             after the speaker stops talking before the recognizer returns its result.
             Applied to the detector as its silence timeout.  **/
            apt_log(RECOG_LOG_MARK,APT_PRIO_INFO,"recog_header->speech_complete_timeout [%lu]", (unsigned long)recog_header->speech_complete_timeout);
            mpf_activity_detector_silence_timeout_set(recog_channel->detector,recog_header->speech_complete_timeout);
        }else {
            /* NOTE(review): this fallback sets the detector's *speech* timeout (1000), whereas the
               branch above sets the *silence* timeout. Confirm whether
               mpf_activity_detector_silence_timeout_set was intended here. */
            mpf_activity_detector_speech_timeout_set(recog_channel->detector,1000);
        }

        if(mrcp_resource_header_property_check(request,RECOGNIZER_HEADER_RECOGNITION_TIMEOUT) == TRUE) {
        /**  Upper bound on the duration of the whole recognition  **/
            apt_log(RECOG_LOG_MARK,APT_PRIO_INFO,"recog_header->recognition_timeout [%lu]", (unsigned long)recog_header->recognition_timeout);
            mpf_activity_detector_recognition_timeout_set(recog_channel->detector,recog_header->recognition_timeout);
        }else {
            /* No Recognition-Timeout in the request: default to 30000 */
            mpf_activity_detector_recognition_timeout_set(recog_channel->detector,30000);
        }
    }

    ******************
    忽略部分无关代码
    ******************
}

 

  FS发送的超时参数 speech_incomplete_timeout ,被 recog_engine 识别为 speech_complete_timeout,并在 mpf 中设置为 silence_timeout ,表示从 activity 转换为 inactivity 所需时间;

  no_input_timeout ,在 recog_engine 和 mpf 中同名,表示从识别开始最大的无声音时间 ;

  recognition_timeout ,默认未使用该参数,表示整个识别流程的最大时间。

  1.3 mpf中超时参数使用

  主要有两个函数:

mpf_activity_detector_level_calculate计算音频活动状态,有活动返回值>=1,无活动返回值<1
下面是采用webrtc-vad的检测代码
static apr_size_t mpf_activity_detector_level_calculate(const mpf_frame_t *frame)
{
    apr_size_t samplesCount = frame->codec_frame.size/2;
    int per_ms_frames = 20;
    apr_size_t sampleRate = 8000;
    size_t samples = sampleRate * per_ms_frames / 1000;
    if (samples == 0) return -1;
    size_t nTotal = (samplesCount / samples);
    int16_t *input = frame->codec_frame.buffer;
    VadInst *vadInst;
    if (WebRtcVad_Create(&vadInst)) {
        return -1;
    }
    int status = WebRtcVad_Init(vadInst);
    if (status != 0) {
        WebRtcVad_Free(vadInst);
        return -1;
    }
    int16_t vad_mode = 1;
    status = WebRtcVad_set_mode(vadInst, vad_mode);
    if (status != 0) {
        WebRtcVad_Free(vadInst);
        return -1;
    }
    int cnt = 0;
    int i  = 0;
    if(nTotal > 0) {
        for (i = 0; i < nTotal; i++) {
            //int keep_weight = 0;
            int nVadRet = WebRtcVad_Process(vadInst, sampleRate, input, samples);
            //printf("==========%d=============== \n", nVadRet);
            if (nVadRet == -1) {
                WebRtcVad_Free(vadInst);
                return -1;
            } else {
                if (nVadRet >= 1) {
                    cnt++;
                }
            }
            input += samples;
        }
        WebRtcVad_Free(vadInst);
        if (cnt < nTotal/10) {
            return 0;
        } else {
            return 1;
        }
    }
 
    if(nTotal == 0) {
    int nVadRet = WebRtcVad_Process(vadInst, sampleRate, input, samplesCount);
    //printf("==========%d=============== \n", nVadRet);
    WebRtcVad_Free(vadInst);
    return nVadRet;
    }

    return 0;
}

  

mpf_activity_detector_process,音频状态转换函数
/**
 * Feed one frame to the activity detector and advance its state machine.
 *
 * For audio frames the level comes from mpf_activity_detector_level_calculate();
 * a level >= 1 is treated as "voiced". Every call adds CODEC_FRAME_TIME_BASE to
 * the total duration, and once that reaches recognition_timeout the function
 * returns MPF_DETECTOR_EVENT_RECOGNITION_TIMEOUT without touching the state.
 *
 * State handling (duration is accumulated per state; state changes go through
 * mpf_activity_detector_state_change(), which presumably resets the per-state
 * duration -- confirm against its definition):
 *  - INACTIVITY:            voiced -> ACTIVITY_TRANSITION; otherwise NOINPUT
 *                           once duration >= noinput_timeout
 *  - ACTIVITY_TRANSITION:   silent -> INACTIVITY; otherwise ACTIVITY (event)
 *                           once duration >= speech_timeout and start_of_input
 *  - ACTIVITY:              silent -> INACTIVITY_TRANSITION
 *  - INACTIVITY_TRANSITION: voiced -> ACTIVITY; otherwise INACTIVITY (event)
 *                           once duration >= silence_timeout
 *
 * @return the detector event raised by this frame, or MPF_DETECTOR_EVENT_NONE
 */
MPF_DECLARE(mpf_detector_event_e) mpf_activity_detector_process(mpf_activity_detector_t *detector, const mpf_frame_t *frame)
{
    mpf_detector_event_e outcome = MPF_DETECTOR_EVENT_NONE;
    apr_size_t vad_level = 0;
    int voiced;

    apt_log(MPF_LOG_MARK,APT_PRIO_DEBUG,"[mpf webrtc]Activity Detector Process");

    /* Only audio frames are analysed; anything else keeps level 0 (silence). */
    if((frame->type & MEDIA_FRAME_TYPE_AUDIO) == MEDIA_FRAME_TYPE_AUDIO) {
        vad_level = mpf_activity_detector_level_calculate(frame);
#if 0
        apt_log(MPF_LOG_MARK,APT_PRIO_DEBUG,"Activity Detector --------------------- [%"APR_SIZE_T_FMT"]",vad_level);
#endif
    }
    voiced = (vad_level >= 1);

    /* Overall recognition timer, independent of the current state. */
    detector->total_duration += CODEC_FRAME_TIME_BASE;
    apt_log(MPF_LOG_MARK,APT_PRIO_INFO,"Activity Detector ----channel detect speech total duration ---- [%d][%d]", detector->total_duration,detector->recognition_timeout);

    if(detector->total_duration >= detector->recognition_timeout) {
        apt_log(MPF_LOG_MARK,APT_PRIO_INFO,"Activity Detector ----channel detect speech total duration ---------------- [%d]", detector->total_duration);
        return MPF_DETECTOR_EVENT_RECOGNITION_TIMEOUT;
    }

    switch(detector->state) {
        case DETECTOR_STATE_INACTIVITY:
            if(voiced) {
                mpf_activity_detector_state_change(detector, DETECTOR_STATE_ACTIVITY_TRANSITION);
                break;
            }
            /* still silent: count towards the no-input timeout */
            detector->duration += CODEC_FRAME_TIME_BASE;
            if(detector->duration >= detector->noinput_timeout) {
                outcome = MPF_DETECTOR_EVENT_NOINPUT;
            }
            break;

        case DETECTOR_STATE_ACTIVITY_TRANSITION:
            if(!voiced) {
                mpf_activity_detector_state_change(detector,DETECTOR_STATE_INACTIVITY);
                break;
            }
            /* voice continues: confirm activity once it lasted long enough */
            detector->duration += CODEC_FRAME_TIME_BASE;
            if(detector->duration >= detector->speech_timeout && detector->start_of_input == TRUE) {
                outcome = MPF_DETECTOR_EVENT_ACTIVITY;
                mpf_activity_detector_state_change(detector, DETECTOR_STATE_ACTIVITY);
            }
            break;

        case DETECTOR_STATE_ACTIVITY:
            if(voiced) {
                detector->duration += CODEC_FRAME_TIME_BASE;
            }
            else {
                mpf_activity_detector_state_change(detector,DETECTOR_STATE_INACTIVITY_TRANSITION);
            }
            break;

        case DETECTOR_STATE_INACTIVITY_TRANSITION:
            if(voiced) {
                mpf_activity_detector_state_change(detector,DETECTOR_STATE_ACTIVITY);
                break;
            }
            /* silence continues: end of speech once it lasted long enough */
            detector->duration += CODEC_FRAME_TIME_BASE;
            if(detector->duration >= detector->silence_timeout) {
                outcome = MPF_DETECTOR_EVENT_INACTIVITY;
                mpf_activity_detector_state_change(detector,DETECTOR_STATE_INACTIVITY);
            }
            break;

        default:
            break;
    }

    return outcome;
}

  此函数会返回处理后的当前音频状态,recog_engine 接收处理结果后,进一步控制识别流程

  

  下图为识别流程状态机:

 

 

备注:

1.音频帧长度为10ms,8000采样率为160字节

2.音频帧level算法采用webrtc方案,level<1为静音

3.每次状态变化都会重新计算状态持续时长

4.首次进入活动状态,会触发start_of_input,返回给FS,FS收到后会停止放音(play_and_detect_speech)优化点

4.speech_complete_timeout:从非静音转为静音所需的时间,即说话人停止说话后、语音识别资源返回结果之前必须经过的静音时长,对应 mpf 中的 silence_timeout

5.必须由静音过渡态(INACTIVITY_TRANSITION)持续静音达到 silence_timeout 后进入静音态(INACTIVITY),才会触发speech_complete,此时为正常mrcp流程的识别结束

6.必须一直处于静音状态,才会触发no_input

7.总识别时长(无论什么状态)>recognition_timeout,会触发识别超时RECOGNITION-COMPLETE,

(原生mrcp协议header,默认未使用,个人改造开启)

8.除no_input_timeout外,其余均会正常返回识别结果

 

2.优化后超时逻辑

  2.1优化原因:环境噪音也会触发start_of_input,打断FS放音,使用效果很差

  2.2 优化逻辑:修改状态转换逻辑,只有收到ASR识别结果才会认为开始输入

  2.3 优化后的识别流程状态机

 

 

posted @ 2024-06-14 11:19  caicaibucai!  阅读(360)  评论(0)    收藏  举报