我正在尝试编写一个小程序,使用 CMU Sphinx 语音识别工具包(pocketsphinx)来检测热词(hotword,即唤醒词)。
我遗漏了什么?
以下是代码:
#include "stdafx.h"
#include <windows.h>
#include <sphinxbase/err.h>
#include <sphinxbase/ad.h>
#include <pocketsphinx.h>
using namespace System;
#define HOTWORD_KEY "hotwordsearch"
#define LM_KEY "lmsearch"
/*
 * Option table handed to cmd_ln_parse_file_r(): the program-specific
 * "-adcdev" option followed by the standard PocketSphinx options.
 */
static const arg_t args_def[] = {
    /* open_recording_device() looks up "-adcdev"; without a declaration
     * here that lookup is reported as "Unknown argument: -adcdev".
     * The default is NULL (no device name given). */
    {"-adcdev",
     ARG_STRING,
     NULL,
     "Name of audio device to use for input."},
    POCKETSPHINX_OPTIONS,
    CMDLN_EMPTY_OPTION
};
/* Keyphrase text of the hotword search; starts out NULL and is filled in by
 * wait_for_hotword() the first time it runs. */
const char *keyphrase = NULL;
/**
 * Open the audio input device and start recording from it.
 *
 * @param ps     Decoder whose configuration supplies the "-samprate" value.
 * @param config Configuration from which the "-adcdev" device name is read.
 * @return The recording handle on success, or NULL if the device could not be
 *         opened or recording could not be started.
 */
ad_rec_t* open_recording_device(ps_decoder_t *ps, cmd_ln_t *config)
{
    ad_rec_t *ad;
    int samprate = (int)cmd_ln_float32_r(ps_get_config(ps), "-samprate");

    /* NOTE(review): if "-adcdev" is not declared in the option table used to
     * build config, this lookup logs "Unknown argument: -adcdev". */
    if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"), samprate)) == NULL) {
        E_ERROR("Failed to open audio device\n");
        return NULL;
    }
    if (ad_start_rec(ad) < 0) {
        E_ERROR("Failed to start recording\n");
        /* Close the device opened above so the handle does not leak. */
        ad_close(ad);
        return NULL;
    }
    return ad;
}
/**
 * Feed microphone audio to the decoder until an utterance yields a hypothesis.
 *
 * Starts an utterance, then repeatedly reads audio and passes it to the
 * decoder. Each time the decoder goes from "in speech" back to "not in
 * speech", the utterance is ended and its hypothesis is fetched: a non-NULL
 * hypothesis is returned, otherwise a new utterance is started and listening
 * continues.
 *
 * @param ps         Decoder to drive.
 * @param ad         Recording device to read samples from.
 * @param need_final Not used by this function.
 * @return The hypothesis string obtained from ps_get_hyp(), or NULL if an
 *         utterance could not be started. A failed audio read is reported
 *         through E_FATAL.
 */
char const *acquire_from_mic(ps_decoder_t *ps, ad_rec_t *ad, int need_final)
{
    int16 samples[4096];
    int32 best_score = 0;
    uint8 heard_speech = FALSE;

    if (ps_start_utt(ps) < 0) {
        E_ERROR("Failed to start utterance\n");
        return NULL;
    }
    E_INFO("Ready....\n");

    while (true) {
        int32 n_read = ad_read(ad, samples, 4096);
        if (n_read < 0)
            E_FATAL("Failed to read audio\n");

        ps_process_raw(ps, samples, n_read, FALSE, FALSE);
        uint8 speaking = ps_get_in_speech(ps);

        if (speaking) {
            if (!heard_speech) {
                /* First speech seen in this utterance. */
                heard_speech = TRUE;
                E_INFO("Listening...\n");
            }
        } else if (heard_speech) {
            /* Speech has given way to silence: close the utterance and
             * see whether the decoder produced anything. */
            ps_end_utt(ps);
            char const *result = ps_get_hyp(ps, &best_score);
            if (result != NULL) {
                E_INFO("---> score = %d\n", best_score);
                E_INFO("---> hyp = %s \n", result);
                return result;
            }

            /* Nothing recognized; begin a fresh utterance and keep going. */
            if (ps_start_utt(ps) < 0) {
                E_ERROR("Failed to start utterance\n");
                return NULL;
            }
            heard_speech = FALSE;
            E_INFO("Ready again....\n");
        }

        Sleep(10);
    }
    return NULL;
}
/**
 * Block until the hotword keyphrase is recognized.
 *
 * Activates the HOTWORD_KEY search, caches the keyphrase text in the global
 * `keyphrase` on first use, and then keeps acquiring hypotheses from the
 * microphone until one is exactly equal to the keyphrase.
 *
 * @param ps Decoder on which the hotword search has been registered.
 * @param ad Recording device to listen on.
 * @return 1 when a hypothesis equal to the keyphrase was obtained; 0 if the
 *         search could not be activated, no keyphrase text is available, or
 *         audio acquisition failed.
 */
int wait_for_hotword(ps_decoder_t *ps, ad_rec_t *ad)
{
    if (ps_set_search(ps, HOTWORD_KEY) < 0) {
        E_ERROR("Couldn't set hotwordsearch\n");
        return 0;
    }
    if (keyphrase == NULL) {
        keyphrase = ps_get_kws(ps, HOTWORD_KEY);
        if (keyphrase == NULL) {
            /* Without the keyphrase text there is nothing to compare
             * against, and passing NULL to strcmp() would be undefined. */
            E_ERROR("No keyphrase available for hotwordsearch\n");
            return 0;
        }
        E_INFO("keyphrase is: %s \n", keyphrase);
    }

    for (;;) {
        const char *hyp = acquire_from_mic(ps, ad, FALSE);
        if (hyp == NULL) {
            /* acquire_from_mic() returns NULL only when it failed to start
             * an utterance; report failure instead of retrying forever. */
            return 0;
        }
        /* Exact comparison: a hypothesis that repeats the keyphrase
         * (e.g. "HELP HELP HELP") is not treated as a match. */
        if (strcmp(keyphrase, hyp) == 0) {
            return 1;
        }
    }
}
int main(int argc, char *argv[])
{
ps_decoder_t *ps;
cmd_ln_t *config;
config = cmd_ln_parse_file_r(NULL, args_def, "pocketsphinx.conf", 1);
if (config == NULL) {
fprintf(stderr, "Failed to create config object, see log for details\n");
return -1;
}
ps = ps_init(config);
if (ps == NULL) {
fprintf(stderr, "Failed to create recognizer, see log for details\n");
return -1;
}
ps_set_lm_file(ps, LM_KEY, "0806.lm");
ps_set_keyphrase(ps, HOTWORD_KEY, "HELP");
ad_rec_t* ad = open_recording_device(ps, config);
if (ad == NULL) {
fprintf(stderr, "Failed to open_recording_device\n");
return -1;
}
while (true) {
if (wait_for_hotword(ps, ad)==1)
{
fprintf(stderr, "\n\n****************\nGot hotword\n");
}
}
ps_free(ps);
cmd_ln_free_r(config);
Console::WriteLine(L"Hello World");
return 0;
}
配置文件:
-dict 0806.dic
-kws_threshold 1e-40
-samprate 16000
-lm 0806.lm
-hmm model/en-us/en-us/
下面是程序的输出:
INFO: pocketsphinx.c(152): Parsed model-specific feature parameters from model/en-us/en-us//feat.params
Current configuration:
[NAME] [DEFLT] [VALUE]
-agc none none
-agcthresh 2.0 2.000000e+000
-allphone
-allphone_ci no no
-alpha 0.97 9.700000e-001
-ascale 20.0 2.000000e+001
-aw 1 1
-backtrace no no
-beam 1e-48 1.000000e-048
-bestpath yes yes
-bestpathlw 9.5 9.500000e+000
-ceplen 13 13
-cmn current current
-cmninit 8.0 40,3,-1
-compallsen no no
-debug 0
-dict 0806.dic
-dictcase no no
-dither no no
-doublebw no no
-ds 1 1
-fdict
-feat 1s_c_d_dd 1s_c_d_dd
-featparams
-fillprob 1e-8 1.000000e-008
-frate 100 100
-fsg
-fsgusealtpron yes yes
-fsgusefiller yes yes
-fwdflat yes yes
-fwdflatbeam 1e-64 1.000000e-064
-fwdflatefwid 4 4
-fwdflatlw 8.5 8.500000e+000
-fwdflatsfwin 25 25
-fwdflatwbeam 7e-29 7.000000e-029
-fwdtree yes yes
-hmm model/en-us/en-us/
-input_endian little little
-jsgf
-keyphrase
-kws
-kws_delay 10 10
-kws_plp 1e-1 1.000000e-001
-kws_threshold 1 1.000000e-040
-latsize 5000 5000
-lda
-ldadim 0 0
-lifter 0 22
-lm 0806.lm
-lmctl
-lmname
-logbase 1.0001 1.000100e+000
-logfn
-logspec no no
-lowerf 133.33334 1.300000e+002
-lpbeam 1e-40 1.000000e-040
-lponlybeam 7e-29 7.000000e-029
-lw 6.5 6.500000e+000
-maxhmmpf 30000 30000
-maxwpf -1 -1
-mdef
-mean
-mfclogdir
-min_endfr 0 0
-mixw
-mixwfloor 0.0000001 1.000000e-007
-mllr
-mmap yes yes
-ncep 13 13
-nfft 512 512
-nfilt 40 25
-nwpen 1.0 1.000000e+000
-pbeam 1e-48 1.000000e-048
-pip 1.0 1.000000e+000
-pl_beam 1e-10 1.000000e-010
-pl_pbeam 1e-10 1.000000e-010
-pl_pip 1.0 1.000000e+000
-pl_weight 3.0 3.000000e+000
-pl_window 5 5
-rawlogdir
-remove_dc no no
-remove_noise yes yes
-remove_silence yes yes
-round_filters yes yes
-samprate 16000 1.600000e+004
-seed -1 -1
-sendump
-senlogdir
-senmgau
-silprob 0.005 5.000000e-003
-smoothspec no no
-svspec 0-12/13-25/26-38
-tmat
-tmatfloor 0.0001 1.000000e-004
-topn 4 4
-topn_beam 0 0
-toprule
-transform legacy dct
-unit_area yes yes
-upperf 6855.4976 6.800000e+003
-uw 1.0 1.000000e+000
-vad_postspeech 50 50
-vad_prespeech 20 20
-vad_startspeech 10 10
-vad_threshold 2.0 3.000000e+000
-var
-varfloor 0.0001 1.000000e-004
-varnorm no no
-verbose no no
-warp_params
-warp_type inverse_linear inverse_linear
-wbeam 7e-29 7.000000e-029
-wip 0.65 6.500000e-001
-wlen 0.025625 2.562500e-002
INFO: feat.c(715): Initializing feature stream to type: '1s_c_d_dd', ceplen=13, CMN='current', VARNORM='no', AGC='none'
INFO: cmn.c(143): mean[0]= 12.00, mean[1..12]= 0.0
INFO: acmod.c(164): Using subvector specification 0-12/13-25/26-38
INFO: mdef.c(518): Reading model definition: model/en-us/en-us//mdef
INFO: mdef.c(531): Found byte-order mark BMDF, assuming this is a binary mdef file
INFO: bin_mdef.c(336): Reading binary model definition: model/en-us/en-us//mdef
INFO: bin_mdef.c(516): 42 CI-phone, 137053 CD-phone, 3 emitstate/phone, 126 CI-sen, 5126 Sen, 29324 Sen-Seq
INFO: tmat.c(206): Reading HMM transition probability matrices: model/en-us/en-us//transition_matrices
INFO: acmod.c(117): Attempting to use PTM computation module
INFO: ms_gauden.c(198): Reading mixture gaussian parameter: model/en-us/en-us//means
INFO: ms_gauden.c(292): 42 codebook, 3 feature, size:
INFO: ms_gauden.c(294): 128x13
INFO: ms_gauden.c(294): 128x13
INFO: ms_gauden.c(294): 128x13
INFO: ms_gauden.c(198): Reading mixture gaussian parameter: model/en-us/en-us//variances
INFO: ms_gauden.c(292): 42 codebook, 3 feature, size:
INFO: ms_gauden.c(294): 128x13
INFO: ms_gauden.c(294): 128x13
INFO: ms_gauden.c(294): 128x13
INFO: ms_gauden.c(354): 222 variance values floored
INFO: ptm_mgau.c(476): Loading senones from dump file model/en-us/en-us//sendump
INFO: ptm_mgau.c(500): BEGIN FILE FORMAT DESCRIPTION
INFO: ptm_mgau.c(563): Rows: 128, Columns: 5126
INFO: ptm_mgau.c(595): Using memory-mapped I/O for senones
INFO: ptm_mgau.c(835): Maximum top-N: 4
INFO: phone_loop_search.c(114): State beam -225 Phone exit beam -225 Insertion penalty 0
INFO: dict.c(320): Allocating 4104 * 20 bytes (80 KiB) for word entries
INFO: dict.c(333): Reading main dictionary: 0806.dic
INFO: dict.c(213): Allocated 0 KiB for strings, 0 KiB for phones
INFO: dict.c(336): 3 words read
INFO: dict.c(358): Reading filler dictionary: model/en-us/en-us//noisedict
INFO: dict.c(213): Allocated 0 KiB for strings, 0 KiB for phones
INFO: dict.c(361): 5 words read
INFO: dict2pid.c(396): Building PID tables for dictionary
INFO: dict2pid.c(406): Allocating 42^3 * 2 bytes (144 KiB) for word-initial triphones
INFO: dict2pid.c(132): Allocated 21336 bytes (20 KiB) for word-final triphones
INFO: dict2pid.c(196): Allocated 21336 bytes (20 KiB) for single-phone word triphones
INFO: ngram_model_trie.c(347): Trying to read LM in trie binary format
INFO: ngram_model_trie.c(358): Header doesn't match
INFO: ngram_model_trie.c(176): Trying to read LM in arpa format
INFO: ngram_model_trie.c(192): LM of order 3
INFO: ngram_model_trie.c(194): #1-grams: 5
INFO: ngram_model_trie.c(194): #2-grams: 6
INFO: ngram_model_trie.c(194): #3-grams: 3
INFO: lm_trie.c(473): Training quantizer
INFO: lm_trie.c(481): Building LM trie
INFO: ngram_search_fwdtree.c(99): 3 unique initial diphones
INFO: ngram_search_fwdtree.c(148): 0 root, 0 non-root channels, 6 single-phone words
INFO: ngram_search_fwdtree.c(186): Creating search tree
INFO: ngram_search_fwdtree.c(192): before: 0 root, 0 non-root channels, 6 single-phone words
INFO: ngram_search_fwdtree.c(326): after: max nonroot chan increased to 139
INFO: ngram_search_fwdtree.c(339): after: 3 root, 11 non-root channels, 5 single-phone words
INFO: ngram_search_fwdflat.c(157): fwdflat: min_ef_width = 4, max_sf_win = 25
INFO: ngram_model_trie.c(347): Trying to read LM in trie binary format
INFO: ngram_model_trie.c(358): Header doesn't match
INFO: ngram_model_trie.c(176): Trying to read LM in arpa format
INFO: ngram_model_trie.c(192): LM of order 3
INFO: ngram_model_trie.c(194): #1-grams: 5
INFO: ngram_model_trie.c(194): #2-grams: 6
INFO: ngram_model_trie.c(194): #3-grams: 3
INFO: lm_trie.c(473): Training quantizer
INFO: lm_trie.c(481): Building LM trie
INFO: ngram_search_fwdtree.c(99): 3 unique initial diphones
INFO: ngram_search_fwdtree.c(148): 0 root, 0 non-root channels, 6 single-phone words
INFO: ngram_search_fwdtree.c(186): Creating search tree
INFO: ngram_search_fwdtree.c(192): before: 0 root, 0 non-root channels, 6 single-phone words
INFO: ngram_search_fwdtree.c(326): after: max nonroot chan increased to 139
INFO: ngram_search_fwdtree.c(339): after: 3 root, 11 non-root channels, 5 single-phone words
INFO: ngram_search_fwdflat.c(157): fwdflat: min_ef_width = 4, max_sf_win = 25
INFO: kws_search.c(420): KWS(beam: -1080, plp: -23, default threshold -900, delay 10)
ERROR: "cmd_ln.c", line 938: Unknown argument: -adcdev
Allocating 32 buffers of 2500 samples each
INFO: cppTest.cpp(103): keyphrase is: HELP
INFO: cppTest.cpp(53): Ready....
INFO: cppTest.cpp(63): Listening...
INFO: cmn_prior.c(131): cmn_prior_update: from < 40.00 3.00 -1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 >
INFO: cmn_prior.c(149): cmn_prior_update: to < 38.01 18.02 -0.60 -3.94 3.44 2.67 -2.47 -0.60 2.43 -5.32 -2.22 -7.04 2.46 >
INFO: cppTest.cpp(76): ---> score = 0
INFO: cppTest.cpp(77): ---> hyp = HELP HELP HELP
INFO: cppTest.cpp(53): Ready....
INFO: cppTest.cpp(63): Listening...
INFO: cmn_prior.c(131): cmn_prior_update: from < 38.01 18.02 -0.60 -3.94 3.44 2.67 -2.47 -0.60 2.43 -5.32 -2.22 -7.04 2.46 >
INFO: cmn_prior.c(149): cmn_prior_update: to < 39.34 14.99 -7.16 -2.53 -4.98 -8.98 -3.15 1.60 4.95 -9.76 2.29 -5.59 4.39 >
INFO: cppTest.cpp(76): ---> score = 0
INFO: cppTest.cpp(77): ---> hyp = HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP
INFO: cppTest.cpp(53): Ready....
INFO: cppTest.cpp(63): Listening...
INFO: cmn_prior.c(131): cmn_prior_update: from < 39.34 14.99 -7.16 -2.53 -4.98 -8.98 -3.15 1.60 4.95 -9.76 2.29 -5.59 4.39 >
INFO: cmn_prior.c(149): cmn_prior_update: to < 39.29 13.66 -5.37 -1.40 -4.89 -9.12 -5.23 0.36 4.09 -10.30 3.82 -4.18 4.00 >
INFO: cppTest.cpp(76): ---> score = 0
INFO: cppTest.cpp(77): ---> hyp = HELP HELP HELP HELP HELP
INFO: cppTest.cpp(53): Ready....
INFO: cppTest.cpp(63): Listening...
INFO: cmn_prior.c(99): cmn_prior_update: from < 39.29 13.66 -5.37 -1.40 -4.89 -9.12 -5.23 0.36 4.09 -10.30 3.82 -4.18 4.00 >
INFO: cmn_prior.c(116): cmn_prior_update: to < 39.24 13.36 -4.74 -0.99 -5.23 -9.40 -5.08 -0.40 3.95 -10.70 3.53 -4.19 3.83 >
INFO: cmn_prior.c(99): cmn_prior_update: from < 39.24 13.36 -4.74 -0.99 -5.23 -9.40 -5.08 -0.40 3.95 -10.70 3.53 -4.19 3.83 >
INFO: cmn_prior.c(116): cmn_prior_update: to < 39.12 11.92 -5.07 -1.94 -6.63 -7.68 -3.63 -2.29 0.49 -11.79 3.25 -3.04 2.44 >
INFO: cmn_prior.c(131): cmn_prior_update: from < 39.12 11.92 -5.07 -1.94 -6.63 -7.68 -3.63 -2.29 0.49 -11.79 3.25 -3.04 2.44 >
INFO: cmn_prior.c(149): cmn_prior_update: to < 38.03 11.47 -5.09 -2.07 -6.22 -6.86 -3.23 -2.30 0.27 -11.47 2.86 -3.17 2.37 >
INFO: cppTest.cpp(76): ---> score = 0
INFO: cppTest.cpp(77): ---> hyp = HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP HELP
INFO: cppTest.cpp(53): Ready....
INFO: cppTest.cpp(63): Listening...
INFO: cmn_prior.c(99): cmn_prior_update: from < 38.03 11.47 -5.09 -2.07 -6.22 -6.86 -3.23 -2.30 0.27 -11.47 2.86 -3.17 2.37 >
INFO: cmn_prior.c(116): cmn_prior_update: to < 37.17 10.53 -3.43 1.18 -7.71 -8.15 -2.56 -5.45 -0.81 -12.44 0.63 -2.53 2.86 >
INFO: cmn_prior.c(131): cmn_prior_update: from < 37.17 10.53 -3.43 1.18 -7.71 -8.15 -2.56 -5.45 -0.81 -12.44 0.63 -2.53 2.86 >
INFO: cmn_prior.c(149): cmn_prior_update: to < 36.30 10.14 -2.66 2.23 -7.40 -7.74 -2.55 -5.77 -0.57 -11.16 0.96 -2.47 2.95 >
INFO: cppTest.cpp(76): ---> score = 0
INFO: cppTest.cpp(77): ---> hyp = HELP HELP HELP HELP HELP HELP HELP HELP HELP
INFO: cppTest.cpp(53): Ready....
INFO: cppTest.cpp(63): Listening...
INFO: cmn_prior.c(131): cmn_prior_update: from < 36.30 10.14 -2.66 2.23 -7.40 -7.74 -2.55 -5.77 -0.57 -11.16 0.96 -2.47 2.95 >
INFO: cmn_prior.c(149): cmn_prior_update: to < 36.39 10.85 -2.80 2.29 -5.72 -6.97 -2.79 -3.21 -0.31 -10.98 0.31 -3.93 2.68 >
INFO: cppTest.cpp(76): ---> score = 0
INFO: cppTest.cpp(77): ---> hyp = HELP
****************
Got hotword
INFO: cppTest.cpp(53): Ready....
INFO: cppTest.cpp(63): Listening...