Google speech recognition not working correctly because of the speech thread, Qt C++

Posted: 2018-07-02 15:19:22

Tags: c++ multithreading qt speech-recognition google-speech-api

I'm using Google's Speech-to-Text API in a Qt C++ application.

Google's C++ documentation is helpful, but only up to a point.

In the code below, if I uncomment

std::this_thread::sleep_for(std::chrono::seconds(1));

speech recognition works, but not correctly: some words get skipped. Without this line it doesn't work at all. I think this is because the while loop in MicrophoneThreadMain() conflicts with the while loop in start_speech_to_text(), but I'm not sure.

I want these two functions to run side by side, without interruptions and without delays. I tried using QThreads and signals and slots, but I couldn't get it to work.

speechtotext.cpp

#include "speechtotext.h"

using google::cloud::speech::v1::StreamingRecognitionConfig;
using google::cloud::speech::v1::RecognitionConfig;
using google::cloud::speech::v1::Speech;
using google::cloud::speech::v1::StreamingRecognizeRequest;
using google::cloud::speech::v1::StreamingRecognizeResponse;

SpeechToText::SpeechToText(QObject *parent) : QObject(parent)
{

}

void SpeechToText::initialize()
{
    QAudioFormat qtFormat;

    // Get default audio input device
    QAudioDeviceInfo qtInfo = QAudioDeviceInfo::defaultInputDevice();

    // Set the audio format settings
    qtFormat.setCodec("audio/pcm");
    qtFormat.setByteOrder(QAudioFormat::Endian::LittleEndian);
    qtFormat.setChannelCount(1);
    qtFormat.setSampleRate(16000);
    qtFormat.setSampleSize(16);
    qtFormat.setSampleType(QAudioFormat::SignedInt);

    // Check whether the format is supported
    if (!qtInfo.isFormatSupported(qtFormat)) {
        qWarning() << "Default format is not supported";
        exit(3);
    }

    // Instantiate QAudioInput with the settings
    audioInput = new QAudioInput(qtFormat);

    // Start receiving data from audio input
    ioDevice = audioInput->start();

    emit finished_initializing();
}

void SpeechToText::MicrophoneThreadMain(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
                                      StreamingRecognizeResponse> *streamer)
{
    StreamingRecognizeRequest request;
    std::size_t size_read;

    while(true)
    {
        audioDataBuffer.append(ioDevice->readAll());
        size_read = audioDataBuffer.size();
        // And write the chunk to the stream.
        request.set_audio_content(&audioDataBuffer.data()[0], size_read);
        std::cout << "Sending " << size_read / 1024 << "k bytes." << std::endl;
        streamer->Write(request);
        //std::this_thread::sleep_for(std::chrono::seconds(1));
    }
}

void SpeechToText::start_speech_to_text()
{
    StreamingRecognizeRequest request;

    auto *streaming_config   = request.mutable_streaming_config();
    RecognitionConfig *recognition_config = new RecognitionConfig();

    recognition_config->set_language_code("en-US");
    recognition_config->set_sample_rate_hertz(16000);
    recognition_config->set_encoding(RecognitionConfig::LINEAR16);
    streaming_config->set_allocated_config(recognition_config);

    // Create a Speech Stub connected to the speech service.
    auto creds = grpc::GoogleDefaultCredentials();
    auto channel = grpc::CreateChannel("speech.googleapis.com", creds);
    std::unique_ptr<Speech::Stub> speech(Speech::NewStub(channel));

    // Begin a stream.
    grpc::ClientContext context;
    auto streamer = speech->StreamingRecognize(&context);

    // Write the first request, containing the config only.
    streaming_config->set_interim_results(true);
    streamer->Write(request);

    // The microphone thread writes the audio content.
    std::thread microphone_thread(&SpeechToText::MicrophoneThreadMain, this, streamer.get());

    // Read responses.
    StreamingRecognizeResponse response;
    while (streamer->Read(&response)) // Returns false when no more to read.
    {
        // Dump the transcript of all the results.
        for (int r = 0; r < response.results_size(); ++r)
        {
            auto result = response.results(r);
            std::cout << "Result stability: " << result.stability() << std::endl;
            for (int a = 0; a < result.alternatives_size(); ++a)
            {
                auto alternative = result.alternatives(a);
                std::cout << alternative.confidence() << "\t"
                        << alternative.transcript() << std::endl;
            }
        }
    }

    grpc::Status status = streamer->Finish();
    microphone_thread.join();
    if (!status.ok()) {
      // Report the RPC failure.
      qDebug() << "error RPC";
      std::cerr << status.error_message() << std::endl;
    }
}

speechtotext.h

#ifndef SPEECHTOTEXT_H
#define SPEECHTOTEXT_H

#include <QObject>
#include <QDebug>
#include <QThread>

#include <thread>
#include <chrono>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <functional>

#include <QtMultimedia>
#include <QtMultimedia/QAudioInput>
#include <QAudioDeviceInfo>
#include <QAudioFormat>
#include <QIODevice>
#include <QtConcurrent>
#include <QMutex>

#include <grpc++/grpc++.h>
#include "google/cloud/speech/v1/cloud_speech.grpc.pb.h"

using google::cloud::speech::v1::StreamingRecognitionConfig;
using google::cloud::speech::v1::RecognitionConfig;
using google::cloud::speech::v1::Speech;
using google::cloud::speech::v1::StreamingRecognizeRequest;
using google::cloud::speech::v1::StreamingRecognizeResponse;

class SpeechToText : public QObject
{
    Q_OBJECT
public:
    explicit SpeechToText(QObject *parent = nullptr);

signals:
    void finished_initializing();
    void finished_speech_to_text(QString);

public slots:
    void initialize();
    void start_speech_to_text();

private:
    void MicrophoneThreadMain(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
                                          StreamingRecognizeResponse> *);

    QAudioInput *audioInput;
    QIODevice *ioDevice;
    QByteArray audioDataBuffer;
};

#endif // SPEECHTOTEXT_H

Any ideas on how to solve this?

2 Answers:

Answer 0 (score: 0)

  • You should really follow Google's examples and only Write() 64k at a time.
  • You should call WritesDone() on the streamer when you intend the requests to be sent off to Google's servers.
  • It looks like you never clear the data out of your QByteArray, so every successive append call piles more data onto it over time. Since you pass a pointer to the first element of the underlying array, every iteration of your loop re-sends all of the audio captured up to that point to the streamer. I'd suggest a nested loop that repeatedly calls QIODevice::read(char *data, qint64 maxSize) until your QByteArray holds exactly 64 KB. You need to handle a return value of -1, which indicates end of stream, and scale maxSize down depending on how much data is still needed to fill the array up to 64k. Requests to the Google API with too little data (your current loop looks like it would send only a few bytes at first) may get you rate-limited, or may cause upstream congestion on your Internet connection because of the high ratio of protocol overhead to data. Also, a plain C-style array of fixed size (64k) may be easier to work with than a QByteArray, since you don't need resizing, and AFAIK QByteArray::clear() can cause a memory allocation (bad for performance). To avoid re-sending old data on short writes (for example, when the microphone stream is closed before the 64k buffer is full), you should also memset(array, 0, sizeof array); after each call to ClientReaderWriterInterface::WritesDone(). See the first sketch after this list.
  • If the network can't keep up with the incoming microphone data, you may get overruns on the QAudioInput, whose local buffer is not big enough to store the audio. More buffering makes this less likely, but also reduces responsiveness. You may simply want to buffer all the data produced by QAudioInput into an unbounded QByteArray and read it out 64k at a time (you can do this by wrapping it in a QBuffer, and all of the code in MicrophoneThreadMain() that deals with a QIODevice would remain compatible; see the second sketch below). I think that, in general, for a project like yours, users would rather have it be less responsive than have to repeat themselves during a network-related overrun. But there is probably a threshold (maybe around 5 seconds) after which the buffered data goes "stale", because the user may try speaking into the microphone again, and once the upstream bottleneck clears, the backlog would produce the weird effect of several back-to-back STT results.
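A minimal sketch of the chunked-write loop described above, assuming the ioDevice and streamer objects from the question. The 64 KiB constant, the msleep() back-off, and the standalone function signature are illustrative choices, not requirements of the gRPC or Qt APIs:

#include <cstring>   // std::memset
#include <QIODevice>
#include <QThread>

void WriteAudioInChunks(QIODevice *ioDevice,
                        grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
                                                          StreamingRecognizeResponse> *streamer)
{
    constexpr qint64 kChunkSize = 64 * 1024;
    char chunk[kChunkSize];
    StreamingRecognizeRequest request;
    bool endOfStream = false;

    while (!endOfStream)
    {
        qint64 filled = 0;
        // Nested loop: keep reading until the chunk is full or the stream ends.
        while (filled < kChunkSize)
        {
            const qint64 n = ioDevice->read(chunk + filled, kChunkSize - filled);
            if (n < 0) { endOfStream = true; break; }       // -1 means end of stream
            if (n == 0) { QThread::msleep(10); continue; }  // no data yet; don't spin
            filled += n;
        }
        if (filled == 0)
            break;
        request.set_audio_content(chunk, std::size_t(filled));
        if (!streamer->Write(request))
            break;                                          // server closed the stream
    }
    streamer->WritesDone();
    // If the chunk buffer outlives the stream (e.g., as a class member), clearing
    // it here keeps a short final write from leaking stale bytes into a new session:
    std::memset(chunk, 0, sizeof chunk);
}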

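And a sketch of the decoupled buffer from the last point. QBuffer exposes the growing QByteArray through the QIODevice interface, so a reader like the one above keeps working unchanged; the mutex is my own addition, since the audio callback and the microphone thread would otherwise touch the array concurrently:

#include <QBuffer>
#include <QByteArray>
#include <QMutex>
#include <QMutexLocker>

class AudioFifo
{
public:
    AudioFifo()
    {
        m_io.setBuffer(&m_bytes);
        m_io.open(QIODevice::ReadOnly);
    }

    // Called from the QAudioInput readyRead handler.
    void append(const QByteArray &data)
    {
        QMutexLocker lock(&m_mutex);
        m_bytes.append(data);   // unbounded: consumed bytes are never reclaimed
    }

    // Called from the microphone thread; same contract as QIODevice::read().
    qint64 read(char *dst, qint64 maxSize)
    {
        QMutexLocker lock(&m_mutex);
        return m_io.read(dst, maxSize);
    }

private:
    QMutex m_mutex;
    QByteArray m_bytes;
    QBuffer m_io;
};

As written, the array grows without bound, which is the trade-off the answer describes; periodically compacting the consumed prefix is left out of the sketch.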
Answer 1 (score: 0)

I'm posting the solution to my problem here. Thanks to @allquixotic for all the useful information.

In mainwindow.cpp:

void MainWindow::setUpMicrophoneRecorder()
{
    microphone_thread = new QThread(this);
    microphone_recorder_engine.moveToThread(microphone_thread);

    connect(microphone_thread, SIGNAL(started()), &microphone_recorder_engine, SLOT(start_listen()));
    connect(&microphone_recorder_engine, &MicrophoneRecorder::microphone_data_raw,
            this, [this] (const QByteArray &data) {
        this->speech_to_text_engine.listen(data);
    });

    microphone_thread->start();
}

void MainWindow::setUpSpeechToTextEngine()
{
    speech_to_text_thread = new QThread(this);
    speech_to_text_engine.moveToThread(speech_to_text_thread);

    connect(speech_to_text_thread, SIGNAL(started()), &speech_to_text_engine, SLOT(initialize()));
    connect(&speech_to_text_engine, SIGNAL(finished_speech_to_text(QString)), this, SLOT(process_user_input(QString)));

    speech_to_text_thread->start();
}

microphonerecorder.h

#ifndef MICROPHONERECORDER_H
#define MICROPHONERECORDER_H

#include <QObject>
#include <QByteArray>
#include <QDebug>
#include <QtMultimedia>
#include <QtMultimedia/QAudioInput>
#include <QAudioDeviceInfo>
#include <QAudioFormat>
#include <QIODevice>

class MicrophoneRecorder : public QObject
{
    Q_OBJECT
public:
    explicit MicrophoneRecorder(QObject *parent = nullptr);

signals:
    void microphone_data_raw(const QByteArray &);

public slots:
    void start_listen();

private slots:
    void listen(const QByteArray &);

private:
    QAudioInput *audioInput;
    QIODevice *ioDevice;
    QByteArray audioDataBuffer;
};

#endif // MICROPHONERECORDER_H

microphonerecorder.cpp

#include "microphonerecorder.h"

MicrophoneRecorder::MicrophoneRecorder(QObject *parent) : QObject(parent)
{

}

void MicrophoneRecorder::listen(const QByteArray &audioData)
{
    emit microphone_data_raw(audioData);
}

void MicrophoneRecorder::start_listen()
{
    QAudioFormat qtFormat;

    // Get default audio input device
    QAudioDeviceInfo qtInfo = QAudioDeviceInfo::defaultInputDevice();

    // Set the audio format settings
    qtFormat.setCodec("audio/pcm");
    qtFormat.setByteOrder(QAudioFormat::Endian::LittleEndian);
    qtFormat.setChannelCount(1);
    qtFormat.setSampleRate(16000);
    qtFormat.setSampleSize(16);
    qtFormat.setSampleType(QAudioFormat::SignedInt);

    // Check whether the format is supported
    if (!qtInfo.isFormatSupported(qtFormat)) {
        qWarning() << "Default format is not supported";
        exit(3);
    }

    // Instantiate QAudioInput with the settings
    audioInput = new QAudioInput(qtFormat);

    // Start receiving data from audio input
    ioDevice = audioInput->start();

    // Listen to the received data for wake words
    QObject::connect(ioDevice, &QIODevice::readyRead, [=] {
        listen(ioDevice->readAll());
    });
}

speechtotext.h

#ifndef SPEECHTOTEXT_H
#define SPEECHTOTEXT_H

#include <QObject>
#include <QDebug>
#include <QThread>
#include <QDateTime>

#include <thread>
#include <chrono>
#include <ctime>    // time()
#include <string>

#include <QtMultimedia>
#include <QtMultimedia/QAudioInput>
#include <QAudioDeviceInfo>
#include <QAudioFormat>
#include <QIODevice>
#include <QtConcurrent>
#include <QMutex>

#include <grpc++/grpc++.h>
#include "google/cloud/speech/v1/cloud_speech.grpc.pb.h"

using google::cloud::speech::v1::StreamingRecognitionConfig;
using google::cloud::speech::v1::RecognitionConfig;
using google::cloud::speech::v1::Speech;
using google::cloud::speech::v1::StreamingRecognizeRequest;
using google::cloud::speech::v1::StreamingRecognizeResponse;

class SpeechToText : public QObject
{
    Q_OBJECT
public:
    explicit SpeechToText(QObject *parent = nullptr);

signals:
    void finished_initializing();
    void in_speech_to_text();
    void out_of_speech_to_text();
    void finished_speech_to_text(QString);

public slots:
    void initialize();
    void listen(const QByteArray &);
    void start_speech_to_text();

private:
    void MicrophoneThreadMain(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
                                          StreamingRecognizeResponse> *);
    void StreamerThread(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
                                          StreamingRecognizeResponse> *);

    // Length of one recognition window, in seconds. The original post never
    // shows this constant's definition, so 30 is an assumed placeholder.
    static const int TIME_RECOGNITION = 30;

    QByteArray audioDataBuffer;
    time_t m_start_time;
};

#endif // SPEECHTOTEXT_H

speechtotext.cpp

#include "speechtotext.h"

using google::cloud::speech::v1::StreamingRecognitionConfig;
using google::cloud::speech::v1::RecognitionConfig;
using google::cloud::speech::v1::Speech;
using google::cloud::speech::v1::StreamingRecognizeRequest;
using google::cloud::speech::v1::StreamingRecognizeResponse;

SpeechToText::SpeechToText(QObject *parent) : QObject(parent)
{

}

void SpeechToText::initialize()
{
    emit finished_initializing();
}

void SpeechToText::MicrophoneThreadMain(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
                                        StreamingRecognizeResponse> *streamer)
{
    StreamingRecognizeRequest request;
    std::size_t size_read;
    while (time(0) - m_start_time <= TIME_RECOGNITION)
    {
        int chunk_size = 64 * 1024;
        if (audioDataBuffer.size() >= chunk_size)
        {
            QByteArray bytes_read = QByteArray(audioDataBuffer);
            size_read = std::size_t(bytes_read.size());

            // And write the chunk to the stream.
            request.set_audio_content(&bytes_read.data()[0], size_read);

            bool ok = streamer->Write(request);
            /*if (ok)
            {
                std::cout << "Sending " << size_read / 1024 << "k bytes." << std::endl;
            }*/

            audioDataBuffer.clear();
            audioDataBuffer.resize(0);
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(50));
    }

    qDebug() << "Out of speech recognition: " << end_date;

    emit out_of_speech_to_text();

    streamer->WritesDone();
}

void SpeechToText::StreamerThread(grpc::ClientReaderWriterInterface<StreamingRecognizeRequest,
                                      StreamingRecognizeResponse> *streamer)
{
    // Read responses.
    StreamingRecognizeResponse response;

    while (time(0) - m_start_time <= TIME_RECOGNITION)
    {
        if(streamer->Read(&response)) // Returns false when no more to read.
        {
            // Dump the transcript of all the results.
            if (response.results_size() > 0)
            {
                auto result = response.results(0);
                if (result.alternatives_size() > 0)
                {
                    auto alternative = result.alternatives(0);
                    auto transcript = QString::fromStdString(alternative.transcript());
                    if (result.is_final())
                    {
                        qDebug() << "Speech recognition: " << transcript;

                        emit finished_speech_to_text(transcript);
                    }
                }
            }
        }
    }
}

void SpeechToText::listen(const QByteArray &audioData)
{
    // Note: this runs on the thread that executes the MainWindow lambda, while
    // MicrophoneThreadMain() reads and clears the same buffer; guarding
    // audioDataBuffer (e.g., with the included QMutex) would be safer.
    audioDataBuffer.append(audioData);
}

void SpeechToText::start_speech_to_text()
{
    qDebug() << "in start_speech_to_text: " << start_date;

    emit in_speech_to_text();

    m_start_time = time(0);
    audioDataBuffer.clear();
    audioDataBuffer.resize(0);

    StreamingRecognizeRequest request;

    auto *streaming_config   = request.mutable_streaming_config();
    RecognitionConfig *recognition_config = new RecognitionConfig();

    recognition_config->set_language_code("en-US");
    recognition_config->set_sample_rate_hertz(16000);
    recognition_config->set_encoding(RecognitionConfig::LINEAR16);
    streaming_config->set_allocated_config(recognition_config);

    // Create a Speech Stub connected to the speech service.
    auto creds = grpc::GoogleDefaultCredentials();
    auto channel = grpc::CreateChannel("speech.googleapis.com", creds);
    std::unique_ptr<Speech::Stub> speech(Speech::NewStub(channel));

    // Begin a stream.
    grpc::ClientContext context;
    auto streamer = speech->StreamingRecognize(&context);

    // Write the first request, containing the config only.
    streaming_config->set_interim_results(true);
    streamer->Write(request);

    // The microphone thread writes the audio content.
    std::thread microphone_thread(&SpeechToText::MicrophoneThreadMain, this, streamer.get());
    std::thread streamer_thread(&SpeechToText::StreamerThread, this, streamer.get());

    microphone_thread.join();
    streamer_thread.join();
}
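One caveat about this design: start_speech_to_text() blocks its QThread until both std::threads join, which is why it runs on speech_to_text_thread rather than on the GUI thread. Audio still arrives while it is blocked because the MainWindow lambda invokes listen() directly, which also means audioDataBuffer is accessed from two threads at once (see the comment in listen() above).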