I'm using Google's Speech-to-Text API in a Qt C++ application. Google's C++ documentation is helpful, but only up to a point.
In the code below, if I uncomment
std::this_thread::sleep_for(std::chrono::seconds(1));
the speech recognition works, but not properly: it skips some words. Without that line, it doesn't work at all. I believe this is because the while loop in MicrophoneThreadMain() conflicts with the while loop in start_speech_to_text(), but I'm not sure.
I want these two functions to run side by side, without interruptions and without delays. I tried QThreads and signals and slots, but couldn't get it to work.
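Roughly, this is the kind of QThread wiring I attempted (a sketch with placeholder names, not my actual code; 'm_stt' is a hypothetical member pointing at a SpeechToText instance created without a parent):
// Sketch of the worker-object pattern: move the recognizer to its own
// thread and kick off its slot when the thread starts.
QThread *sttThread = new QThread(this);
m_stt->moveToThread(sttThread);
connect(sttThread, &QThread::started, m_stt, &SpeechToText::start_speech_to_text);
sttThread->start();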
speechtotext.cpp
#include "speechtotext.h"
using google::cloud::speech::v1::StreamingRecognitionConfig;
using google::cloud::speech::v1::RecognitionConfig;
using google::cloud::speech::v1::Speech;
using google::cloud::speech::v1::StreamingRecognizeRequest;
using google::cloud::speech::v1::StreamingRecognizeResponse;
SpeechToText::SpeechToText(QObject *parent) : QObject(parent)
{
}
void SpeechToText::initialize()
{
QAudioFormat qtFormat;
// Get default audio input device
QAudioDeviceInfo qtInfo = QAudioDeviceInfo::defaultInputDevice();
// Set the audio format settings
qtFormat.setCodec("audio/pcm");
qtFormat.setByteOrder(QAudioFormat::Endian::LittleEndian);
qtFormat.setChannelCount(1);
qtFormat.setSampleRate(16000);
qtFormat.setSampleSize(16);
qtFormat.setSampleType(QAudioFormat::SignedInt);
// Check whether the format is supported
if (!qtInfo.isFormatSupported(qtFormat)) {
qWarning() << "Default format is not supported";
exit(3);
}
// Instantiate QAudioInput with the settings
audioInput = new QAudioInput(qtFormat);
// Start receiving data from audio input
ioDevice = audioInput->start();
emit finished_initializing();
}
void SpeechToText::MicrophoneThreadMain(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
StreamingRecognizeResponse> *streamer)
{
StreamingRecognizeRequest request;
std::size_t size_read;
while(true)
{
audioDataBuffer.append(ioDevice->readAll());
size_read = audioDataBuffer.size();
// And write the chunk to the stream.
request.set_audio_content(&audioDataBuffer.data()[0], size_read);
std::cout << "Sending " << size_read / 1024 << "k bytes." << std::endl;
streamer->Write(request);
//std::this_thread::sleep_for(std::chrono::seconds(1));
}
}
void SpeechToText::start_speech_to_text()
{
StreamingRecognizeRequest request;
auto *streaming_config = request.mutable_streaming_config();
RecognitionConfig *recognition_config = new RecognitionConfig();
recognition_config->set_language_code("en-US");
recognition_config->set_sample_rate_hertz(16000);
recognition_config->set_encoding(RecognitionConfig::LINEAR16);
streaming_config->set_allocated_config(recognition_config);
// Create a Speech Stub connected to the speech service.
auto creds = grpc::GoogleDefaultCredentials();
auto channel = grpc::CreateChannel("speech.googleapis.com", creds);
std::unique_ptr<Speech::Stub> speech(Speech::NewStub(channel));
// Begin a stream.
grpc::ClientContext context;
auto streamer = speech->StreamingRecognize(&context);
// Write the first request, containing the config only.
streaming_config->set_interim_results(true);
streamer->Write(request);
// The microphone thread writes the audio content.
std::thread microphone_thread(&SpeechToText::MicrophoneThreadMain, this, streamer.get());
// Read responses.
StreamingRecognizeResponse response;
while (streamer->Read(&response)) // Returns false when no more to read.
{
// Dump the transcript of all the results.
for (int r = 0; r < response.results_size(); ++r)
{
auto result = response.results(r);
std::cout << "Result stability: " << result.stability() << std::endl;
for (int a = 0; a < result.alternatives_size(); ++a)
{
auto alternative = result.alternatives(a);
std::cout << alternative.confidence() << "\t"
<< alternative.transcript() << std::endl;
}
}
}
grpc::Status status = streamer->Finish();
microphone_thread.join();
if (!status.ok()) {
// Report the RPC failure.
qDebug() << "error RPC";
std::cerr << status.error_message() << std::endl;
}
}
speechtotext.h
#ifndef SPEECHTOTEXT_H
#define SPEECHTOTEXT_H
#include <QObject>
#include <QDebug>
#include <QThread>
#include <thread>
#include <chrono>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <functional>
#include <QtMultimedia>
#include <QtMultimedia/QAudioInput>
#include <QAudioDeviceInfo>
#include <QAudioFormat>
#include <QIODevice>
#include <QtConcurrent>
#include <QMutex>
#include <grpc++/grpc++.h>
#include "google/cloud/speech/v1/cloud_speech.grpc.pb.h"
using google::cloud::speech::v1::StreamingRecognitionConfig;
using google::cloud::speech::v1::RecognitionConfig;
using google::cloud::speech::v1::Speech;
using google::cloud::speech::v1::StreamingRecognizeRequest;
using google::cloud::speech::v1::StreamingRecognizeResponse;
class SpeechToText : public QObject
{
Q_OBJECT
public:
explicit SpeechToText(QObject *parent = nullptr);
signals:
void finished_initializing();
void finished_speech_to_text(QString);
public slots:
void initialize();
void start_speech_to_text();
private:
void MicrophoneThreadMain(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
StreamingRecognizeResponse> *);
QAudioInput *audioInput;
QIODevice *ioDevice;
QByteArray audioDataBuffer;
};
#endif // SPEECHTOTEXT_H
Any ideas on how to fix this?
Answer 0 (score: 0)
A few things stand out here:
- You never call WritesDone() on the streamer, so the service is never told that the audio stream has ended.
- Your audioDataBuffer never gets cleared, so every append call piles more data onto it. And because you pass a pointer to the first element of the underlying array, each pass through the loop sends all of the audio captured up to that point to the streamer, not just the new part.
- I'd suggest a nested loop that repeatedly calls QIODevice::read(char *data, qint64 maxSize) until your QByteArray holds exactly 64KB (there's a sketch of such a loop after this list). You have to handle a return value of -1, which signals end of stream, and shrink maxSize according to how much data is still needed to top the array up to at most 64k.
- Requests to the Google API that carry too little data (at first glance, your current loop can send just a few bytes at a time) may get you rate-limited, or cause upstream congestion on your Internet connection because of the high ratio of protocol overhead to payload.
- It may also be easier to handle this with a plain fixed-size (64k) C-style array rather than a QByteArray, since you never need to resize it, and AFAIK QByteArray::clear() can allocate memory (bad for performance). To avoid re-sending old data after a short write (e.g. when the microphone stream closes before the 64k buffer is full), you should also memset(array, 0, sizeof array); after each Write() on the ClientReaderWriterInterface.
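Here's a rough sketch of that read loop, reusing ioDevice, request, and streamer from your code. The buffer size, error handling, and the msleep() backoff are illustrative, not tested:
// Sketch: top up a fixed 64k buffer from the QIODevice before each Write().
char buf[64 * 1024];
bool end_of_stream = false;
while (!end_of_stream)
{
memset(buf, 0, sizeof buf); // don't re-send stale bytes on a short write
qint64 filled = 0;
while (filled < qint64(sizeof buf))
{
qint64 n = ioDevice->read(buf + filled, qint64(sizeof buf) - filled);
if (n < 0) { end_of_stream = true; break; } // -1 signals end of stream
if (n == 0) { QThread::msleep(10); continue; } // no data yet; back off briefly
filled += n;
}
if (filled > 0)
{
request.set_audio_content(buf, std::size_t(filled));
streamer->Write(request);
}
}
streamer->WritesDone(); // tell the service no more audio is coming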
Separately, you can get an overrun on the QAudioInput side if its local buffers aren't large enough to hold the audio while you're busy sending it. More buffering makes that less likely, but it also reduces responsiveness. You may simply want to buffer all the data QAudioInput produces into an unbounded QByteArray and read it out 64k at a time; you can do that by wrapping the array in a QBuffer, and all of the code in MicrophoneThreadMain() that talks to a QIODevice stays compatible, as sketched below.
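A rough sketch of that wrapping (illustrative only; no locking between producer and consumer is shown, which real code would need):
// Sketch: accumulate all captured audio in a QByteArray and read it back
// through a QBuffer, so QIODevice-based code keeps working unchanged.
QByteArray backing; // grows without bound while audio is captured
QBuffer wrapped(&backing);
wrapped.open(QIODevice::ReadOnly);
// Producer side, e.g. on the audio device's readyRead signal:
backing.append(ioDevice->readAll());
// Consumer side: read 64k at a time, exactly as from any other QIODevice.
char chunk[64 * 1024];
qint64 n = wrapped.read(chunk, sizeof chunk);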
I think that, in general, for a project like yours, users would rather have lower responsiveness than have to repeat themselves after a network-related overrun. But there is probably a threshold (maybe around 5 seconds) past which the buffered data goes "stale", because the user may have started speaking into the microphone again, and several back-to-back STT events firing once the upstream bottleneck clears would produce weird effects.
Answer 1 (score: 0)
I'm posting the solution to my problem here. Thanks to @allquixotic for all the useful information.
In mainwindow.cpp:
void MainWindow::setUpMicrophoneRecorder()
{
microphone_thread = new QThread(this);
microphone_recorder_engine.moveToThread(microphone_thread);
connect(microphone_thread, SIGNAL(started()), &microphone_recorder_engine, SLOT(start_listen()));
connect(&microphone_recorder_engine, &MicrophoneRecorder::microphone_data_raw,
this, [this] (const QByteArray &data) {
this->speech_to_text_engine.listen(data);
});
microphone_thread->start();
}
void MainWindow::setUpSpeechToTextEngine()
{
speech_to_text_thread = new QThread(this);
speech_to_text_engine.moveToThread(speech_to_text_thread);
connect(speech_to_text_thread, SIGNAL(started()), &speech_to_text_engine, SLOT(initialize()));
connect(&speech_to_text_engine, SIGNAL(finished_speech_to_text(QString)), this, SLOT(process_user_input(QString)));
speech_to_text_thread->start();
}
microphonerecorder.h
#ifndef MICROPHONERECORDER_H
#define MICROPHONERECORDER_H
#include <QObject>
#include <QByteArray>
#include <QDebug>
#include <QtMultimedia>
#include <QtMultimedia/QAudioInput>
#include <QAudioDeviceInfo>
#include <QAudioFormat>
#include <QIODevice>
class MicrophoneRecorder : public QObject
{
Q_OBJECT
public:
explicit MicrophoneRecorder(QObject *parent = nullptr);
signals:
void microphone_data_raw(const QByteArray &);
public slots:
void start_listen();
private slots:
void listen(const QByteArray &);
private:
QAudioInput *audioInput;
QIODevice *ioDevice;
QByteArray audioDataBuffer;
};
#endif // MICROPHONERECORDER_H
microphonerecorder.cpp
#include "microphonerecorder.h"
MicrophoneRecorder::MicrophoneRecorder(QObject *parent) : QObject(parent)
{
}
void MicrophoneRecorder::listen(const QByteArray &audioData)
{
emit microphone_data_raw(audioData);
}
void MicrophoneRecorder::start_listen()
{
QAudioFormat qtFormat;
// Get default audio input device
QAudioDeviceInfo qtInfo = QAudioDeviceInfo::defaultInputDevice();
// Set the audio format settings
qtFormat.setCodec("audio/pcm");
qtFormat.setByteOrder(QAudioFormat::Endian::LittleEndian);
qtFormat.setChannelCount(1);
qtFormat.setSampleRate(16000);
qtFormat.setSampleSize(16);
qtFormat.setSampleType(QAudioFormat::SignedInt);
// Check whether the format is supported
if (!qtInfo.isFormatSupported(qtFormat)) {
qWarning() << "Default format is not supported";
exit(3);
}
// Instantiate QAudioInput with the settings
audioInput = new QAudioInput(qtFormat);
// Start receiving data from audio input
ioDevice = audioInput->start();
// Listen to the received data for wake words
QObject::connect(ioDevice, &QIODevice::readyRead, [=] {
listen(ioDevice->readAll());
});
}
speechtotext.h
#ifndef SPEECHTOTEXT_H
#define SPEECHTOTEXT_H
#include <QObject>
#include <QDebug>
#include <QThread>
#include <QDateTime>
#include <thread>
#include <chrono>
#include <string>
#include <ctime> // for time(0) in the recognition loops
#include <QtMultimedia>
#include <QtMultimedia/QAudioInput>
#include <QAudioDeviceInfo>
#include <QAudioFormat>
#include <QIODevice>
#include <QtConcurrent>
#include <QMutex>
#include <grpc++/grpc++.h>
#include "google/cloud/speech/v1/cloud_speech.grpc.pb.h"
using google::cloud::speech::v1::StreamingRecognitionConfig;
using google::cloud::speech::v1::RecognitionConfig;
using google::cloud::speech::v1::Speech;
using google::cloud::speech::v1::StreamingRecognizeRequest;
using google::cloud::speech::v1::StreamingRecognizeResponse;
// Length of one recognition window, in seconds. This constant is used below
// but was not defined in the original post; the value here is an assumption.
#define TIME_RECOGNITION 10
class SpeechToText : public QObject
{
Q_OBJECT
public:
explicit SpeechToText(QObject *parent = nullptr);
signals:
void finished_initializing();
void in_speech_to_text();
void out_of_speech_to_text();
void finished_speech_to_text(QString);
public slots:
void initialize();
void listen(const QByteArray &);
void start_speech_to_text();
private:
void MicrophoneThreadMain(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
StreamingRecognizeResponse> *);
void StreamerThread(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
StreamingRecognizeResponse> *);
QByteArray audioDataBuffer;
QMutex m_mutex; // guards audioDataBuffer, which is shared between threads
time_t m_start_time; // epoch seconds when the recognition window started
};
#endif // SPEECHTOTEXT_H
speechtotext.cpp
#include "speechtotext.h"
using google::cloud::speech::v1::StreamingRecognitionConfig;
using google::cloud::speech::v1::RecognitionConfig;
using google::cloud::speech::v1::Speech;
using google::cloud::speech::v1::StreamingRecognizeRequest;
using google::cloud::speech::v1::StreamingRecognizeResponse;
SpeechToText::SpeechToText(QObject *parent) : QObject(parent)
{
}
void SpeechToText::initialize()
{
emit finished_initializing();
}
void SpeechToText::MicrophoneThreadMain(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
StreamingRecognizeResponse> *streamer)
{
StreamingRecognizeRequest request;
std::size_t size_read;
const int chunk_size = 64 * 1024;
while (time(0) - m_start_time <= TIME_RECOGNITION)
{
QByteArray bytes_read;
{
// listen() appends to audioDataBuffer from another thread, so take the
// mutex before touching the shared buffer.
QMutexLocker locker(&m_mutex);
if (audioDataBuffer.size() >= chunk_size)
{
bytes_read = audioDataBuffer;
audioDataBuffer.clear();
}
}
if (!bytes_read.isEmpty())
{
size_read = std::size_t(bytes_read.size());
// Write the chunk to the stream.
request.set_audio_content(bytes_read.constData(), size_read);
if (!streamer->Write(request))
break; // the stream was closed on the other side
}
std::this_thread::sleep_for(std::chrono::milliseconds(50));
}
qDebug() << "Out of speech recognition at" << QDateTime::currentDateTime().toString();
emit out_of_speech_to_text();
streamer->WritesDone();
}
void SpeechToText::StreamerThread(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
StreamingRecognizeResponse> *streamer)
{
// Read responses.
StreamingRecognizeResponse response;
while (time(0) - m_start_time <= TIME_RECOGNITION)
{
if(streamer->Read(&response)) // Returns false when no more to read.
{
// Dump the transcript of all the results.
if (response.results_size() > 0)
{
auto result = response.results(0);
if (result.alternatives_size() > 0)
{
auto alternative = result.alternatives(0);
auto transcript = QString::fromStdString(alternative.transcript());
if (result.is_final())
{
qDebug() << "Speech recognition: " << transcript;
emit finished_speech_to_text(transcript);
}
}
}
}
}
}
void SpeechToText::listen(const QByteArray &audioData)
{
// MicrophoneThreadMain() drains this buffer from its own thread.
QMutexLocker locker(&m_mutex);
audioDataBuffer.append(audioData);
}
void SpeechToText::start_speech_to_text()
{
qDebug() << "in start_speech_to_text: " << start_date;
emit in_speech_to_text();
m_start_time = time(0);
{
// Drop any audio captured before this recognition window began.
QMutexLocker locker(&m_mutex);
audioDataBuffer.clear(); // clear() already discards the data; resize(0) was redundant
}
StreamingRecognizeRequest request;
auto *streaming_config = request.mutable_streaming_config();
RecognitionConfig *recognition_config = new RecognitionConfig();
recognition_config->set_language_code("en-US");
recognition_config->set_sample_rate_hertz(16000);
recognition_config->set_encoding(RecognitionConfig::LINEAR16);
streaming_config->set_allocated_config(recognition_config);
// Create a Speech Stub connected to the speech service.
auto creds = grpc::GoogleDefaultCredentials();
auto channel = grpc::CreateChannel("speech.googleapis.com", creds);
std::unique_ptr<Speech::Stub> speech(Speech::NewStub(channel));
// Begin a stream.
grpc::ClientContext context;
auto streamer = speech->StreamingRecognize(&context);
// Write the first request, containing the config only.
streaming_config->set_interim_results(true);
streamer->Write(request);
// The microphone thread writes the audio content.
std::thread microphone_thread(&SpeechToText::MicrophoneThreadMain, this, streamer.get());
std::thread streamer_thread(&SpeechToText::StreamerThread, this, streamer.get());
microphone_thread.join();
streamer_thread.join();
}