pocketsphinx hotword检测不起作用

时间:2017-09-18 07:38:28

标签: cmusphinx pocketsphinx

我正在尝试构建一个小程序,使用 CMU Sphinx 语音识别工具包(pocketsphinx)来检测热词(hotword)。

  1. 我创建了一个包含2个单词的语料库文件,其中一个是HELP
  2. 使用在线 lmtool 工具生成语言模型(http://www.speech.cs.cmu.edu/tools/lmtool-new.html)
  3. 即使没有人说出热词,我也会得到大量的热词检测结果(误报)。
  4. 我遗漏了什么?

    以下是代码:

    #include "stdafx.h"
    #include <stdio.h>
    #include <string.h>
    #include <windows.h>
    #include <sphinxbase/err.h>
    #include <sphinxbase/ad.h>
    #include <pocketsphinx.h>
    
    using namespace System;
    
    /* Names under which the keyword-spotting and language-model searches
     * are registered with the decoder. */
    #define HOTWORD_KEY "hotwordsearch"
    #define LM_KEY "lmsearch"

    /*
     * Options accepted when parsing pocketsphinx.conf: the standard
     * PocketSphinx options plus "-adcdev", which open_recording_device()
     * queries. Without this entry the lookup of "-adcdev" is reported as
     * an unknown argument at run time.
     */
    static const arg_t args_def[] = {
        POCKETSPHINX_OPTIONS,
        {"-adcdev", ARG_STRING, NULL, "Name of audio device to use for input."},
        CMDLN_EMPTY_OPTION
    };

    /* Keyphrase of the hotword search; filled in on first use by
     * wait_for_hotword(). */
    const char *keyphrase = NULL;
    
    /*
     * Open the audio input device and start recording from it.
     *
     * The sample rate is taken from the decoder's "-samprate" option and the
     * device name from the "-adcdev" option of `config`.
     *
     * Returns the recording handle, or NULL if the device could not be opened
     * or recording could not be started. On success the caller owns the handle.
     */
    ad_rec_t* open_recording_device(ps_decoder_t *ps, cmd_ln_t *config)
    {
        ad_rec_t *ad;
        int samprate = (int)cmd_ln_float32_r(ps_get_config(ps), "-samprate");
        if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"), samprate)) == NULL) {
            E_ERROR("Failed to open audio device\n");
            return NULL;
        }
        if (ad_start_rec(ad) < 0) {
            E_ERROR("Failed to start recording\n");
            /* Release the device; otherwise the handle leaks because the
             * caller only ever sees NULL. */
            ad_close(ad);
            return NULL;
        }
        return ad;
    }
    
    /*
     * Capture audio from `ad` and feed it to the decoder until an utterance
     * (speech followed by silence) yields a hypothesis.
     *
     * ps         - decoder; its currently active search is used.
     * ad         - recording device that has already been started.
     * need_final - currently unused; kept for interface compatibility.
     *
     * Returns the hypothesis string obtained from ps_get_hyp() for the first
     * utterance that produces one, or NULL if the decoder reports an error.
     * Utterances without a hypothesis are discarded and listening continues.
     * A failed audio read is fatal (E_FATAL).
     */
    char const *acquire_from_mic(ps_decoder_t *ps, ad_rec_t *ad, int need_final)
    {
        int16 adbuf[4096];
        const int32 adbuf_len = (int32)(sizeof(adbuf) / sizeof(adbuf[0]));
        uint8 utt_started, in_speech;
        int32 k, score = 0;
        char const *hyp;

        (void)need_final;

        if (ps_start_utt(ps) < 0) {
            E_ERROR("Failed to start utterance\n");
            return NULL;
        }
        utt_started = FALSE;
        E_INFO("Ready....\n");

        for (;;) {
            if ((k = ad_read(ad, adbuf, adbuf_len)) < 0)
                E_FATAL("Failed to read audio\n");
            if (ps_process_raw(ps, adbuf, k, FALSE, FALSE) < 0) {
                E_ERROR("Failed to process audio\n");
                /* Close the utterance so that a later ps_start_utt() is
                 * not attempted while one is still open. */
                ps_end_utt(ps);
                return NULL;
            }
            in_speech = ps_get_in_speech(ps);
            if (in_speech && !utt_started) {
                utt_started = TRUE;
                E_INFO("Listening...\n");
            }
            if (!in_speech && utt_started) {
                /* speech -> silence transition: close the utterance and
                 * look at what was recognized */
                if (ps_end_utt(ps) < 0) {
                    E_ERROR("Failed to end utterance\n");
                    return NULL;
                }

                hyp = ps_get_hyp(ps, &score);
                if (hyp != NULL) {
                    E_INFO("---> score = %d\n", score);
                    E_INFO("---> hyp   = %s \n", hyp);
                    return hyp;
                }

                /* Nothing recognized: start listening for the next utterance. */
                if (ps_start_utt(ps) < 0) {
                    E_ERROR("Failed to start utterance\n");
                    return NULL;
                }
                utt_started = FALSE;
                E_INFO("Ready again....\n");
            }
            Sleep(10);
        }
    }
    
    /*
     * Block until the hotword is heard on the microphone.
     *
     * Activates the HOTWORD_KEY search, caches its keyphrase in the global
     * `keyphrase` on first use, and then keeps acquiring utterances until a
     * hypothesis contains the keyphrase.
     *
     * Returns 1 when the hotword was detected, 0 on failure (the search could
     * not be selected, no keyphrase is available, or audio acquisition
     * failed).
     */
    int wait_for_hotword(ps_decoder_t *ps, ad_rec_t *ad)
    {
        if (ps_set_search(ps, HOTWORD_KEY) < 0) {
            E_ERROR("Couldn't set hotwordsearch\n");
            return 0;
        }

        if (keyphrase == NULL) {
            keyphrase = ps_get_kws(ps, HOTWORD_KEY);
            if (keyphrase == NULL) {
                /* Must not reach the "%s" log or the comparison below with NULL. */
                E_ERROR("No keyphrase available for hotwordsearch\n");
                return 0;
            }
            E_INFO("keyphrase is:  %s \n", keyphrase);
        }

        for (;;) {
            const char *hyp = acquire_from_mic(ps, ad, FALSE);
            if (hyp == NULL) {
                /* acquire_from_mic() only returns NULL on error; retrying
                 * here would spin forever on a broken decoder. */
                E_ERROR("Failed to acquire audio for hotword detection\n");
                return 0;
            }
            /* One utterance may contain several detections, in which case the
             * hypothesis repeats the keyphrase (e.g. "HELP HELP HELP"), so
             * test for containment rather than equality. */
            if (strstr(hyp, keyphrase) != NULL) {
                return 1;
            }
        }
    }
    
    
    int main(int argc, char *argv[])
    {
        ps_decoder_t *ps;
        cmd_ln_t *config;
    
        config = cmd_ln_parse_file_r(NULL, args_def, "pocketsphinx.conf", 1);
        if (config == NULL) {
            fprintf(stderr, "Failed to create config object, see log for details\n");
            return -1;
        }
    
        ps = ps_init(config);
        if (ps == NULL) {
            fprintf(stderr, "Failed to create recognizer, see log for details\n");
            return -1;
        }
    
        ps_set_lm_file(ps, LM_KEY, "0806.lm");
        ps_set_keyphrase(ps, HOTWORD_KEY,  "HELP");
    
        ad_rec_t* ad = open_recording_device(ps, config);
        if (ad == NULL) {
            fprintf(stderr, "Failed to open_recording_device\n");
            return -1;
        }
    
        while (true) {
            if (wait_for_hotword(ps, ad)==1)
            {
                fprintf(stderr, "\n\n****************\nGot hotword\n");
            }
        }
    
        ps_free(ps);
        cmd_ln_free_r(config);
    
        Console::WriteLine(L"Hello World");
        return 0;
    }
    

    配置文件:

    -dict 0806.dic
    -kws_threshold 1e-40
    -samprate 16000
    -lm 0806.lm
    -hmm model/en-us/en-us/
    

    下面是输出:

    • 误检:一直检测到热词,但实际上没有人说过 "HELP" 这个词:
    • 不明白为什么我会得到 ---> hyp = HELP HELP HELP HELP HELP

    输出:

    INFO: pocketsphinx.c(152): Parsed model-specific feature parameters from model/en-us/en-us//feat.params
    Current configuration:
    [NAME]                  [DEFLT]         [VALUE]
    -agc                    none            none
    -agcthresh              2.0             2.000000e+000
    -allphone
    -allphone_ci            no              no
    -alpha                  0.97            9.700000e-001
    -ascale                 20.0            2.000000e+001
    -aw                     1               1
    -backtrace              no              no
    -beam                   1e-48           1.000000e-048
    -bestpath               yes             yes
    -bestpathlw             9.5             9.500000e+000
    -ceplen                 13              13
    -cmn                    current         current
    -cmninit                8.0             40,3,-1
    -compallsen             no              no
    -debug                                  0
    -dict                                   0806.dic
    -dictcase               no              no
    -dither                 no              no
    -doublebw               no              no
    -ds                     1               1
    -fdict
    -feat                   1s_c_d_dd       1s_c_d_dd
    -featparams
    -fillprob               1e-8            1.000000e-008
    -frate                  100             100
    -fsg
    -fsgusealtpron          yes             yes
    -fsgusefiller           yes             yes
    -fwdflat                yes             yes
    -fwdflatbeam            1e-64           1.000000e-064
    -fwdflatefwid           4               4
    -fwdflatlw              8.5             8.500000e+000
    -fwdflatsfwin           25              25
    -fwdflatwbeam           7e-29           7.000000e-029
    -fwdtree                yes             yes
    -hmm                                    model/en-us/en-us/
    -input_endian           little          little
    -jsgf
    -keyphrase
    -kws
    -kws_delay              10              10
    -kws_plp                1e-1            1.000000e-001
    -kws_threshold          1               1.000000e-040
    -latsize                5000            5000
    -lda
    -ldadim                 0               0
    -lifter                 0               22
    -lm                                     0806.lm
    -lmctl
    -lmname
    -logbase                1.0001          1.000100e+000
    -logfn
    -logspec                no              no
    -lowerf                 133.33334       1.300000e+002
    -lpbeam                 1e-40           1.000000e-040
    -lponlybeam             7e-29           7.000000e-029
    -lw                     6.5             6.500000e+000
    -maxhmmpf               30000           30000
    -maxwpf                 -1              -1
    -mdef
    -mean
    -mfclogdir
    -min_endfr              0               0
    -mixw
    -mixwfloor              0.0000001       1.000000e-007
    -mllr
    -mmap                   yes             yes
    -ncep                   13              13
    -nfft                   512             512
    -nfilt                  40              25
    -nwpen                  1.0             1.000000e+000
    -pbeam                  1e-48           1.000000e-048
    -pip                    1.0             1.000000e+000
    -pl_beam                1e-10           1.000000e-010
    -pl_pbeam               1e-10           1.000000e-010
    -pl_pip                 1.0             1.000000e+000
    -pl_weight              3.0             3.000000e+000
    -pl_window              5               5
    -rawlogdir
    -remove_dc              no              no
    -remove_noise           yes             yes
    -remove_silence         yes             yes
    -round_filters          yes             yes
    -samprate               16000           1.600000e+004
    -seed                   -1              -1
    -sendump
    -senlogdir
    -senmgau
    -silprob                0.005           5.000000e-003
    -smoothspec             no              no
    -svspec                                 0-12/13-25/26-38
    -tmat
    -tmatfloor              0.0001          1.000000e-004
    -topn                   4               4
    -topn_beam              0               0
    -toprule
    -transform              legacy          dct
    -unit_area              yes             yes
    -upperf                 6855.4976       6.800000e+003
    -uw                     1.0             1.000000e+000
    -vad_postspeech         50              50
    -vad_prespeech          20              20
    -vad_startspeech        10              10
    -vad_threshold          2.0             3.000000e+000
    -var
    -varfloor               0.0001          1.000000e-004
    -varnorm                no              no
    -verbose                no              no
    -warp_params
    -warp_type              inverse_linear  inverse_linear
    -wbeam                  7e-29           7.000000e-029
    -wip                    0.65            6.500000e-001
    -wlen                   0.025625        2.562500e-002
    
    INFO: feat.c(715): Initializing feature stream to type: '1s_c_d_dd', ceplen=13, CMN='current', VARNORM='no', AGC='none'
    INFO: cmn.c(143): mean[0]= 12.00, mean[1..12]= 0.0
    INFO: acmod.c(164): Using subvector specification 0-12/13-25/26-38
    INFO: mdef.c(518): Reading model definition: model/en-us/en-us//mdef
    INFO: mdef.c(531): Found byte-order mark BMDF, assuming this is a binary mdef file
    INFO: bin_mdef.c(336): Reading binary model definition: model/en-us/en-us//mdef
    INFO: bin_mdef.c(516): 42 CI-phone, 137053 CD-phone, 3 emitstate/phone, 126 CI-sen, 5126 Sen, 29324 Sen-Seq
    INFO: tmat.c(206): Reading HMM transition probability matrices: model/en-us/en-us//transition_matrices
    INFO: acmod.c(117): Attempting to use PTM computation module
    INFO: ms_gauden.c(198): Reading mixture gaussian parameter: model/en-us/en-us//means
    INFO: ms_gauden.c(292): 42 codebook, 3 feature, size:
    INFO: ms_gauden.c(294):  128x13
    INFO: ms_gauden.c(294):  128x13
    INFO: ms_gauden.c(294):  128x13
    INFO: ms_gauden.c(198): Reading mixture gaussian parameter: model/en-us/en-us//variances
    INFO: ms_gauden.c(292): 42 codebook, 3 feature, size:
    INFO: ms_gauden.c(294):  128x13
    INFO: ms_gauden.c(294):  128x13
    INFO: ms_gauden.c(294):  128x13
    INFO: ms_gauden.c(354): 222 variance values floored
    INFO: ptm_mgau.c(476): Loading senones from dump file model/en-us/en-us//sendump
    INFO: ptm_mgau.c(500): BEGIN FILE FORMAT DESCRIPTION
    INFO: ptm_mgau.c(563): Rows: 128, Columns: 5126
    INFO: ptm_mgau.c(595): Using memory-mapped I/O for senones
    INFO: ptm_mgau.c(835): Maximum top-N: 4
    INFO: phone_loop_search.c(114): State beam -225 Phone exit beam -225 Insertion penalty 0
    INFO: dict.c(320): Allocating 4104 * 20 bytes (80 KiB) for word entries
    INFO: dict.c(333): Reading main dictionary: 0806.dic
    INFO: dict.c(213): Allocated 0 KiB for strings, 0 KiB for phones
    INFO: dict.c(336): 3 words read
    INFO: dict.c(358): Reading filler dictionary: model/en-us/en-us//noisedict
    INFO: dict.c(213): Allocated 0 KiB for strings, 0 KiB for phones
    INFO: dict.c(361): 5 words read
    INFO: dict2pid.c(396): Building PID tables for dictionary
    INFO: dict2pid.c(406): Allocating 42^3 * 2 bytes (144 KiB) for word-initial triphones
    INFO: dict2pid.c(132): Allocated 21336 bytes (20 KiB) for word-final triphones
    INFO: dict2pid.c(196): Allocated 21336 bytes (20 KiB) for single-phone word triphones
    INFO: ngram_model_trie.c(347): Trying to read LM in trie binary format
    INFO: ngram_model_trie.c(358): Header doesn't match
    INFO: ngram_model_trie.c(176): Trying to read LM in arpa format
    INFO: ngram_model_trie.c(192): LM of order 3
    INFO: ngram_model_trie.c(194): #1-grams: 5
    INFO: ngram_model_trie.c(194): #2-grams: 6
    INFO: ngram_model_trie.c(194): #3-grams: 3
    INFO: lm_trie.c(473): Training quantizer
    INFO: lm_trie.c(481): Building LM trie
    INFO: ngram_search_fwdtree.c(99): 3 unique initial diphones
    INFO: ngram_search_fwdtree.c(148): 0 root, 0 non-root channels, 6 single-phone words
    INFO: ngram_search_fwdtree.c(186): Creating search tree
    INFO: ngram_search_fwdtree.c(192): before: 0 root, 0 non-root channels, 6 single-phone words
    INFO: ngram_search_fwdtree.c(326): after: max nonroot chan increased to 139
    INFO: ngram_search_fwdtree.c(339): after: 3 root, 11 non-root channels, 5 single-phone words
    INFO: ngram_search_fwdflat.c(157): fwdflat: min_ef_width = 4, max_sf_win = 25
    INFO: ngram_model_trie.c(347): Trying to read LM in trie binary format
    INFO: ngram_model_trie.c(358): Header doesn't match
    INFO: ngram_model_trie.c(176): Trying to read LM in arpa format
    INFO: ngram_model_trie.c(192): LM of order 3
    INFO: ngram_model_trie.c(194): #1-grams: 5
    INFO: ngram_model_trie.c(194): #2-grams: 6
    INFO: ngram_model_trie.c(194): #3-grams: 3
    INFO: lm_trie.c(473): Training quantizer
    INFO: lm_trie.c(481): Building LM trie
    INFO: ngram_search_fwdtree.c(99): 3 unique initial diphones
    INFO: ngram_search_fwdtree.c(148): 0 root, 0 non-root channels, 6 single-phone words
    INFO: ngram_search_fwdtree.c(186): Creating search tree
    INFO: ngram_search_fwdtree.c(192): before: 0 root, 0 non-root channels, 6 single-phone words
    INFO: ngram_search_fwdtree.c(326): after: max nonroot chan increased to 139
    INFO: ngram_search_fwdtree.c(339): after: 3 root, 11 non-root channels, 5 single-phone words
    INFO: ngram_search_fwdflat.c(157): fwdflat: min_ef_width = 4, max_sf_win = 25
    INFO: kws_search.c(420): KWS(beam: -1080, plp: -23, default threshold -900, delay 10)
    ERROR: "cmd_ln.c", line 938: Unknown argument: -adcdev
    Allocating 32 buffers of 2500 samples each
    INFO: cppTest.cpp(103): keyphrase is:  HELP
    INFO: cppTest.cpp(53): Ready....
    INFO: cppTest.cpp(63): Listening...
    INFO: cmn_prior.c(131): cmn_prior_update: from < 40.00  3.00 -1.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00 >
    INFO: cmn_prior.c(149): cmn_prior_update: to   < 38.01 18.02 -0.60 -3.94  3.44  2.67 -2.47 -0.60  2.43 -5.32 -2.22 -7.04  2.46 >
    INFO: cppTest.cpp(76): ---> score = 0
    INFO: cppTest.cpp(77): ---> hyp   = HELP HELP HELP
    INFO: cppTest.cpp(53): Ready....
    INFO: cppTest.cpp(63): Listening...
    INFO: cmn_prior.c(131): cmn_prior_update: from < 38.01 18.02 -0.60 -3.94  3.44  2.67 -2.47 -0.60  2.43 -5.32 -2.22 -7.04  2.46 >
    INFO: cmn_prior.c(149): cmn_prior_update: to   < 39.34 14.99 -7.16 -2.53 -4.98 -8.98 -3.15  1.60  4.95 -9.76  2.29 -5.59  4.39 >
    INFO: cppTest.cpp(76): ---> score = 0
    INFO: cppTest.cpp(77): ---> hyp   = HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP
    INFO: cppTest.cpp(53): Ready....
    INFO: cppTest.cpp(63): Listening...
    INFO: cmn_prior.c(131): cmn_prior_update: from < 39.34 14.99 -7.16 -2.53 -4.98 -8.98 -3.15  1.60  4.95 -9.76  2.29 -5.59  4.39 >
    INFO: cmn_prior.c(149): cmn_prior_update: to   < 39.29 13.66 -5.37 -1.40 -4.89 -9.12 -5.23  0.36  4.09 -10.30  3.82 -4.18  4.00 >
    INFO: cppTest.cpp(76): ---> score = 0
    INFO: cppTest.cpp(77): ---> hyp   = HELP HELP HELP HELP HELP
    INFO: cppTest.cpp(53): Ready....
    INFO: cppTest.cpp(63): Listening...
    INFO: cmn_prior.c(99): cmn_prior_update: from < 39.29 13.66 -5.37 -1.40 -4.89 -9.12 -5.23  0.36  4.09 -10.30  3.82 -4.18  4.00 >
    INFO: cmn_prior.c(116): cmn_prior_update: to   < 39.24 13.36 -4.74 -0.99 -5.23 -9.40 -5.08 -0.40  3.95 -10.70  3.53 -4.19  3.83 >
    INFO: cmn_prior.c(99): cmn_prior_update: from < 39.24 13.36 -4.74 -0.99 -5.23 -9.40 -5.08 -0.40  3.95 -10.70  3.53 -4.19  3.83 >
    INFO: cmn_prior.c(116): cmn_prior_update: to   < 39.12 11.92 -5.07 -1.94 -6.63 -7.68 -3.63 -2.29  0.49 -11.79  3.25 -3.04  2.44 >
    INFO: cmn_prior.c(131): cmn_prior_update: from < 39.12 11.92 -5.07 -1.94 -6.63 -7.68 -3.63 -2.29  0.49 -11.79  3.25 -3.04  2.44 >
    INFO: cmn_prior.c(149): cmn_prior_update: to   < 38.03 11.47 -5.09 -2.07 -6.22 -6.86 -3.23 -2.30  0.27 -11.47  2.86 -3.17  2.37 >
    INFO: cppTest.cpp(76): ---> score = 0
    INFO: cppTest.cpp(77): ---> hyp   = HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP
    INFO: cppTest.cpp(53): Ready....
    INFO: cppTest.cpp(63): Listening...
    INFO: cmn_prior.c(99): cmn_prior_update: from < 38.03 11.47 -5.09 -2.07 -6.22 -6.86 -3.23 -2.30  0.27 -11.47  2.86 -3.17  2.37 >
    INFO: cmn_prior.c(116): cmn_prior_update: to   < 37.17 10.53 -3.43  1.18 -7.71 -8.15 -2.56 -5.45 -0.81 -12.44  0.63 -2.53  2.86 >
    INFO: cmn_prior.c(131): cmn_prior_update: from < 37.17 10.53 -3.43  1.18 -7.71 -8.15 -2.56 -5.45 -0.81 -12.44  0.63 -2.53  2.86 >
    INFO: cmn_prior.c(149): cmn_prior_update: to   < 36.30 10.14 -2.66  2.23 -7.40 -7.74 -2.55 -5.77 -0.57 -11.16  0.96 -2.47  2.95 >
    INFO: cppTest.cpp(76): ---> score = 0
    INFO: cppTest.cpp(77): ---> hyp   = HELP HELP HELP HELP HELP HELP HELP HELP HELP
    INFO: cppTest.cpp(53): Ready....
    INFO: cppTest.cpp(63): Listening...
    INFO: cmn_prior.c(131): cmn_prior_update: from < 36.30 10.14 -2.66  2.23 -7.40 -7.74 -2.55 -5.77 -0.57 -11.16  0.96 -2.47  2.95 >
    INFO: cmn_prior.c(149): cmn_prior_update: to   < 36.39 10.85 -2.80  2.29 -5.72 -6.97 -2.79 -3.21 -0.31 -10.98  0.31 -3.93  2.68 >
    INFO: cppTest.cpp(76): ---> score = 0
    INFO: cppTest.cpp(77): ---> hyp   = HELP
    
    
    ****************
    Got hotword
    
    INFO: cppTest.cpp(53): Ready....
    INFO: cppTest.cpp(63): Listening...
    

0 个答案:

没有答案