I compiled this speech-recognition C code against the pocketsphinx library on my Debian 9 system.
I recorded a sample audio file named goforward.raw containing the command "go forward".
The pocketsphinx_continuous program does not recognize words recorded through a headset (captured with the Linux arecord tool) effectively, and neither does the sample code below. Recognition is only partial: the "go forward" command is picked up without trouble, but other commands are very hard to recognize. Say "hello", for instance, and it comes back as "who are you".
Interestingly, audio files created with the text-to-speech tool pico2wave are recognized far more reliably: words extracted from the wav files pico2wave produces come through with about 80% accuracy.
Here is the example pocketsphinx code:
#include <pocketsphinx.h>

/* Build, e.g. (per the CMU tutorial; MODELDIR must be defined at compile time):
 *   gcc -o goforward goforward.c \
 *       -DMODELDIR=\"`pkg-config --variable=modeldir pocketsphinx`\" \
 *       `pkg-config --cflags --libs pocketsphinx sphinxbase`
 */
int
main(int argc, char *argv[])
{
    ps_decoder_t *ps;
    cmd_ln_t *config;
    FILE *fh;
    char const *hyp;
    int16 buf[512];
    int rv;
    int32 score;

    /* Default US English acoustic model, language model, and dictionary. */
    config = cmd_ln_init(NULL, ps_args(), TRUE,
                         "-hmm", MODELDIR "/en-us/en-us",
                         "-lm", MODELDIR "/en-us/en-us.lm.bin",
                         "-dict", MODELDIR "/en-us/cmudict-en-us.dict",
                         NULL);
    if (config == NULL) {
        fprintf(stderr, "Failed to create config object, see log for details\n");
        return -1;
    }
    ps = ps_init(config);
    if (ps == NULL) {
        fprintf(stderr, "Failed to create recognizer, see log for details\n");
        return -1;
    }
    /* The default model expects goforward.raw to be raw (headerless)
       16 kHz, 16-bit, mono, little-endian PCM. */
    fh = fopen("goforward.raw", "rb");
    if (fh == NULL) {
        fprintf(stderr, "Unable to open input file goforward.raw\n");
        return -1;
    }
    rv = ps_start_utt(ps);
    /* Feed the whole file to the decoder in 512-sample chunks. */
    while (!feof(fh)) {
        size_t nsamp;
        nsamp = fread(buf, 2, 512, fh);
        rv = ps_process_raw(ps, buf, nsamp, FALSE, FALSE);
    }
    rv = ps_end_utt(ps);
    hyp = ps_get_hyp(ps, &score);
    printf("Recognized: %s\n", hyp);

    fclose(fh);
    ps_free(ps);
    cmd_ln_free_r(config);
    return 0;
}
Below is the code of the pocketsphinx_continuous tool that ships with the official pocketsphinx package:
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
* Copyright (c) 1999-2010 Carnegie Mellon University. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* This work was supported in part by funding from the Defense Advanced
* Research Projects Agency and the National Science Foundation of the
* United States of America, and the CMU Sphinx Speech Consortium.
*
* THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* ====================================================================
*
*/
/*
* continuous.c - Simple pocketsphinx command-line application to test
* both continuous listening/silence filtering from microphone
* and continuous file transcription.
*/
/*
* This is a simple example of pocketsphinx application that uses continuous listening
* with silence filtering to automatically segment a continuous stream of audio input
* into utterances that are then decoded.
*
* Remarks:
* - Each utterance is ended when a silence segment of at least 1 sec is recognized.
* - Single-threaded implementation for portability.
* - Uses audio library; can be replaced with an equivalent custom library.
*/
#include <stdio.h>
#include <string.h>
#include <assert.h>
#if !defined(_WIN32_WCE)
#include <signal.h>
#include <setjmp.h>
#endif
#if defined(WIN32) && !defined(GNUWINCE)
#include <time.h>
#else
#include <sys/types.h>
#include <sys/time.h>
#endif
#include <sphinxbase/err.h>
#include <sphinxbase/ad.h>
#include "pocketsphinx.h"
static const arg_t cont_args_def[] = {
POCKETSPHINX_OPTIONS,
/* Argument file. */
{"-argfile",
ARG_STRING,
NULL,
"Argument file giving extra arguments."},
{"-adcdev",
ARG_STRING,
NULL,
"Name of audio device to use for input."},
{"-infile",
ARG_STRING,
NULL,
"Audio file to transcribe."},
{"-time",
ARG_BOOLEAN,
"no",
"Print word times in file transcription."},
CMDLN_EMPTY_OPTION
};
static ps_decoder_t *ps;
/* The hard-coded configuration is built in main(): cmd_ln_init() is a
   function call and cannot be used in a file-scope initializer in C. */
static cmd_ln_t *config;
static FILE *rawfd;
static void
print_word_times(int32 start)
{
ps_seg_t *iter = ps_seg_iter(ps, NULL);
while (iter != NULL) {
int32 sf, ef, pprob;
float conf;
ps_seg_frames(iter, &sf, &ef);
pprob = ps_seg_prob(iter, NULL, NULL, NULL);
conf = logmath_exp(ps_get_logmath(ps), pprob);
printf("%s %f %f %f\n", ps_seg_word(iter), (sf + start) / 100.0,
(ef + start) / 100.0, conf);
iter = ps_seg_next(iter);
}
}
/*
* Continuous recognition from a file
*/
static void
recognize_from_file()
{
int16 adbuf[4096];
const char *hyp;
const char *uttid;
int32 k;
uint8 cur_vad_state, vad_state;
char waveheader[44];
if ((rawfd = fopen(cmd_ln_str_r(config, "-infile"), "rb")) == NULL) {
E_FATAL_SYSTEM("Failed to open file '%s' for reading",
cmd_ln_str_r(config, "-infile"));
}
    /* Skip the 44-byte RIFF/wav header; the payload is then raw PCM. */
    if (fread(waveheader, 1, 44, rawfd) != 44)
        E_FATAL("Failed to read wav header\n");
cur_vad_state = 0;
ps_start_utt(ps, NULL);
while ((k = fread(adbuf, sizeof(int16), 4096, rawfd)) > 0) {
ps_process_raw(ps, adbuf, k, FALSE, FALSE);
vad_state = ps_get_vad_state(ps);
if (cur_vad_state && !vad_state) {
//speech->silence transition,
//time to end utterance and start new one
ps_end_utt(ps);
hyp = ps_get_hyp(ps, NULL, &uttid);
printf("%s: %s\n", uttid, hyp);
fflush(stdout);
ps_start_utt(ps, NULL);
}
cur_vad_state = vad_state;
}
ps_end_utt(ps);
hyp = ps_get_hyp(ps, NULL, &uttid);
printf("%s: %s\n", uttid, hyp);
fflush(stdout);
fclose(rawfd);
}
/* Sleep for specified msec */
static void
sleep_msec(int32 ms)
{
#if (defined(WIN32) && !defined(GNUWINCE)) || defined(_WIN32_WCE)
Sleep(ms);
#else
/* ------------------- Unix ------------------ */
struct timeval tmo;
tmo.tv_sec = 0;
tmo.tv_usec = ms * 1000;
select(0, NULL, NULL, NULL, &tmo);
#endif
}
/*
* Main utterance processing loop:
* for (;;) {
* start utterance and wait for speech to process
* decoding till end-of-utterance silence will be detected
* print utterance result;
* }
*/
static void
recognize_from_microphone()
{
ad_rec_t *ad;
int16 adbuf[4096];
uint8 cur_vad_state, vad_state;
int32 k;
char const *hyp;
char const *uttid;
if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"),
(int) cmd_ln_float32_r(config,
"-samprate"))) == NULL)
E_FATAL("Failed to open audio device\n");
if (ad_start_rec(ad) < 0)
E_FATAL("Failed to start recording\n");
if (ps_start_utt(ps, NULL) < 0)
E_FATAL("Failed to start utterance\n");
cur_vad_state = 0;
/* Indicate listening for next utterance */
printf("READY....\n");
fflush(stdout);
fflush(stderr);
for (;;) {
if ((k = ad_read(ad, adbuf, 4096)) < 0)
E_FATAL("Failed to read audio\n");
sleep_msec(100);
ps_process_raw(ps, adbuf, k, FALSE, FALSE);
vad_state = ps_get_vad_state(ps);
if (vad_state && !cur_vad_state) {
//silence -> speech transition,
// let user know that he is heard
printf("Listening...\n");
fflush(stdout);
}
if (!vad_state && cur_vad_state) {
//speech -> silence transition,
//time to start new utterance
ps_end_utt(ps);
hyp = ps_get_hyp(ps, NULL, &uttid);
printf("%s: %s\n", uttid, hyp);
fflush(stdout);
//Exit if the first word spoken was GOODBYE
if (hyp && (strcmp(hyp, "good bye") == 0))
break;
if (ps_start_utt(ps, NULL) < 0)
E_FATAL("Failed to start utterance\n");
/* Indicate listening for next utterance */
printf("READY....\n");
fflush(stdout);
fflush(stderr);
}
cur_vad_state = vad_state;
}
ad_close(ad);
}
static jmp_buf jbuf;
static void
sighandler(int signo)
{
longjmp(jbuf, 1);
}
int
main(int argc, char *argv[])
{
    /* Command-line parsing is disabled in this build; the hard-coded
       configuration below is used instead.

    char const *cfg;
    config = cmd_ln_parse_r(NULL, cont_args_def, argc, argv, TRUE);

    // Handle argument file as -argfile.
    if (config && (cfg = cmd_ln_str_r(config, "-argfile")) != NULL) {
        config = cmd_ln_parse_file_r(config, cont_args_def, cfg, FALSE);
    }
    if (config == NULL)
        return 1;
    ps_default_search_args(config);
    ps = ps_init(config);
    if (ps == NULL)
        return 1;
    */

    /* Moved here from file scope: cmd_ln_init() is a function call, so it
       cannot appear in a static initializer in C. */
    config = cmd_ln_init(NULL, ps_args(), TRUE,
                         "-hmm", "/home/bsnayak/Trainguard_MT2/pocketsphinx/model9/hmm/trainguard/",
                         "-jsgf", "/home/bsnayak/Trainguard_MT2/pocketsphinx/model9/lm2/trainguardmt_adv_2.jsgf",
                         "-dict", "/home/bsnayak/Trainguard_MT2/pocketsphinx/model9/dict/trainguard.dic",
                         NULL);
    if (config == NULL)
        return 1;
    ps = ps_init(config);
    if (ps == NULL)
        return 1;
E_INFO("%s COMPILED ON: %s, AT: %s\n\n", argv[0], __DATE__, __TIME__);
if (cmd_ln_str_r(config, "-infile") != NULL) {
recognize_from_file();
}
else {
/* Make sure we exit cleanly (needed for profiling among other things) */
/* Signals seem to be broken in arm-wince-pe. */
#if !defined(GNUWINCE) && !defined(_WIN32_WCE) && !defined(__SYMBIAN32__)
signal(SIGINT, &sighandler);
#endif
if (setjmp(jbuf) == 0) {
recognize_from_microphone();
}
}
ps_free(ps);
return 0;
}
/** Silvio Moioli: Windows CE/Mobile entry point added. */
#if defined(_WIN32_WCE)
#pragma comment(linker,"/entry:mainWCRTStartup")
#include <windows.h>
//Windows Mobile has the Unicode main only
int
wmain(int32 argc, wchar_t * wargv[])
{
char **argv;
size_t wlen;
size_t len;
int i;
argv = malloc(argc * sizeof(char *));
for (i = 0; i < argc; i++) {
wlen = lstrlenW(wargv[i]);
len = wcstombs(NULL, wargv[i], wlen);
argv[i] = malloc(len + 1);
wcstombs(argv[i], wargv[i], wlen);
}
//assuming ASCII parameters
return main(argc, argv);
}
#endif
What do I need to do to make the commands work, i.e. to have them recognized more reliably even with some mispronunciation or differences in accent?
Answer 0 (score: 1)
This is for anyone else who may run into the same problem. I am answering my own question because the pocketsphinx speech recognition library gets talked about very little, which makes it hard to learn or work with, since the community is barely active. The official site offers no easy-to-follow guide; I found the official documentation aimed more at researchers than at a developer who just needs to build an application against the pocketsphinx library.
So: if you have reached the point where speech is recognized successfully with the default language model and dictionary, but you want efficiency and accuracy, then you have to create your own language model and dictionary, or perhaps adapt the default language model to a new accent.
All you have to do is create a sample language corpus: a plain text file containing the words or sentences you want recognized. Then use the Sphinx lmtool to create a language model (.lm file) and dictionary (.dic file) from it.
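For example, a corpus.txt for a small command set might look like this (hypothetical contents, one word or phrase per line):

go forward
go back
turn left
turn right
hello
good bye

Uploading that file to the lmtool web page returns, among other files, the matching .lm and .dic pair.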
The next step: instead of supplying the default language model and dictionary when the decoder is built, pass the new .lm and .dic files as arguments.
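As a minimal sketch against the first listing above (the paths are placeholders for wherever the lmtool output was saved):

/* Hypothetical paths: point -lm and -dict at the lmtool output. */
config = cmd_ln_init(NULL, ps_args(), TRUE,
                     "-hmm", MODELDIR "/en-us/en-us",       /* keep the default acoustic model */
                     "-lm", "/home/user/model/0892.lm",     /* custom language model */
                     "-dict", "/home/user/model/0892.dic",  /* custom dictionary */
                     NULL);

The same -lm and -dict flags can also be passed to pocketsphinx_continuous on the command line, so nothing needs recompiling to test a new model.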
That's it; it will then recognize words very quickly and with 100% accuracy. Here is a walkthrough of the whole process: http://ghatage.com/tech/2012/12/13/Make-Pocketsphinx-recognize-new-words/
Answer 1 (score: 0)
At the risk of cross-posting (or in hopes of avoiding it), https://raspberrypi.stackexchange.com/questions/10384/speech-processing-on-the-raspberry-pi/18222#18222 covers some of the same points.
Here it is for posterity.
I went with pocketsphinx_continuous and a $4 sound card.
To deal with the fact that it has to stop listening while the speech synthesizer is talking, I used amixer to control the mic input volume (CMU recommends this as best practice, since stop-starting the engine results in poorer recognition):
echo "SETTING MIC IN TO 15 (94%)" >> ./audio.log
amixer -c 1 set Mic 15 unmute 2>&1 >/dev/null
with a matching command to mute listening while the synthesizer output is playing:
FILE: mute.sh
#!/bin/sh
sleep $1;
amixer -c 1 set Mic 0 unmute >/dev/null 2>&1 ;
echo "** MIC OFF **" >> /home/pi/PIXIE/audio.log
To calculate the right length of time to stay muted, I simply run soxi via Lua and then set unmute.sh (the opposite of mute.sh) to run "x" seconds from startup. There are no doubt many ways to handle this; I'm happy with the results of this method.
Lua snippet:
-- Begin parallel timing
-- MUTE UNTIL THE SOUNDCARD FREES UP
-- "filename" is a fully qualified path to a wav file
-- outputted by voice synth in previous operation
-- GET THE LENGTH
local sample_length = io.popen('soxi -D '..filename);
local total_length = sample_length:read("*a");
clean_length = string.gsub(total_length, "\n", "") +1;
sample_length:close();
-- EXAMPLE LOGGING OUTPUT...
--os.execute( 'echo LENGTH WAS "'.. clean_length .. '" Seconds >> ./audio.log');
-- we are about to play something...
-- MUTE, then schedule UNMUTE.sh in x seconds, then play synth output
-- (have unrolled mute.sh here for clarity)
os.execute( 'amixer -c 1 set Mic '..mic_level..' unmute 2>&1 >/dev/null ');
os.execute( 'echo "** MIC OFF **" >> ./audio.log ');
-- EXAMPLE LOGGING OUTPUT...
-- os.execute( 'echo PLAYING: "'.. filename..'" circa ' .. clean_length .. ' Seconds >> ./audio.log ');
os.execute( './unmute.sh "'.. clean_length ..'" &');
-- THEN PLAY THE THING WHILE THE OTHER PROCESS IS SLEEPING
os.execute( './sounds-uncached.sh '..filename..' 21000')
To actually grab the audio on the Pi, I use:
pocketsphinx_continuous -bestpath 0 -adcdev plughw:1 -samprate 20000 \
-nfft 512 -ds 2 -topn 2 -maxwpf 5 -kdtreefn 3000 -kdmaxdepth 7 -kdmaxbbi 15 \
-pl_window 10 -lm ./LANGUAGE/0892-min.lm -dict ./LANGUAGE/0892-min.dic 2>&1 \
| tee -i 2>/dev/null >( sed -u -n -e 's/^.\{9\}: //p' ) \
>( sed -u -n -e 's/^READY//p' \
-e 's/^Listening//p' -e 's/^FATAL_ERROR: \"continuous\.c\"\, //p') \
> /dev/null
Again, there are other ways, but I like my output this way.
For the synthesizer I used Cepstral's fledgling Pi solution, but it is not available online; you have to contact them directly to arrange a purchase, and it costs around $30. The results are acceptable, but the speech does produce some nasty clicks and pops. The company replied that they no longer have a RaspPi and are unwilling to improve the product. YMMV.
When "idle", speech recognition sits at around 12% CPU, spiking briefly when a chunk of recognition is being done.
Voice rendering spikes at about 50-80% CPU.
Playback/sox weighs in pretty heavy, but I do apply real-time effects to the rendered voices as I play them ;)
The Pi is stripped down fairly aggressively, using every guide I could find, to stop unneeded services, and it runs in full CLI mode. Overclocked to 800 MHz (minimal).
scaling_governor set to: performance
When fully running: about 50ºC in direct sunlight, about 38ºC in the shade. I have heat sinks fitted.
Last point: I actually run all of this kit out to an "internet-driven" AI as a nice extra.
The Pi handles all of this seamlessly, playing any networked audio in real time and piping fully looped audio to any other Unix box, etc.
To handle the heavy CPU overhead of speech rendering, I implemented an md5sum-based caching system so the same utterance is never rendered twice. (About 1000 files at 220 MB total covers 70% of the utterances I generally get back from the AI.) This really helps keep the overall CPU load down.
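For what it's worth, the cache amounts to "hash the utterance text, use the digest as the filename". A minimal C sketch of the idea (my illustration rather than the code I actually run; the voice_cache directory and the synth_to_wav command are hypothetical stand-ins):

/* Sketch of an md5sum-keyed cache for synthesized utterances. */
#include <stdio.h>
#include <stdlib.h>

/* Hash the utterance text by shelling out to md5sum (no crypto library
 * needed; embedded single quotes are not handled in this sketch). */
static int md5_of_text(const char *text, char out[33])
{
    char cmd[1024];
    FILE *p;
    int ok;
    snprintf(cmd, sizeof(cmd), "printf '%%s' '%s' | md5sum", text);
    if ((p = popen(cmd, "r")) == NULL)
        return -1;
    ok = (fscanf(p, "%32s", out) == 1) ? 0 : -1;
    pclose(p);
    return ok;
}

/* Return the path of a cached wav for this text, rendering it on a miss. */
static const char *cached_wav(const char *text)
{
    static char path[300];
    char digest[33];
    FILE *f;
    if (md5_of_text(text, digest) != 0)
        return NULL;
    snprintf(path, sizeof(path), "./voice_cache/%s.wav", digest);
    if ((f = fopen(path, "rb")) != NULL) {
        fclose(f);                      /* cache hit: reuse the rendered file */
    } else {
        char render[1400];
        snprintf(render, sizeof(render), "synth_to_wav '%s' '%s'", text, path);
        system(render);                 /* cache miss: render exactly once */
    }
    return path;
}

int main(void)
{
    /* Identical text maps to an identical digest, so the second call
       reuses the cached file instead of re-rendering. */
    printf("%s\n", cached_wav("hello there"));
    printf("%s\n", cached_wav("hello there"));
    return 0;
}

Because identical text always yields an identical digest, the cache keeps working across reboots with no index file needed.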
In all, the whole thing is entirely doable. But the quality of the recognition will only be as good as your microphone, your language model, how closely your speakers' voices match the audience the model was trained for (I use an en_US model with en_UK children, which is not perfect), and other minutiae of detail that, with effort, you can whittle down to a decent result.
And for the record, I already did all of this once before on a Kindle (which also worked with CMU Sphinx and flite). Hope this helps.