使用 python 和谷歌云语音实时将 websocket 流中的音频转录为文本

时间:2021-04-19 09:00:16

标签: python flask google-speech-to-text-api

我正在尝试使用 vonage API 和谷歌语音将电话实时转录为文本,但是当我尝试使用谷歌 STT 时,我一直收到错误消息。

这是我的代码(原帖中还贴出了错误信息,但在本页中似乎已丢失,只剩下占位符 `<rootDir>`):

from flask import Flask, request, jsonify
from flask_sockets import Sockets
from google.cloud import speech # speech to text service
import sys
import os

# Audio recording parameters
# RATE is the sample rate handed to the recognizer; it matches the
# "rate=16000" advertised in the websocket content-type of the NCCO below.
RATE = 16000
# Number of samples in 100 ms of audio. Not referenced anywhere else in this
# script.
CHUNK = int(RATE / 10)  # 100ms

# NOTE(review): PATH is not defined in this listing; presumably it is the
# filesystem path of the Google service-account JSON key -- confirm.
# This must run before SpeechClient() is constructed below.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = PATH


# Flask application plus the flask_sockets wrapper that provides the
# @sockets.route decorator used for the websocket endpoint.
app = Flask(__name__)
sockets = Sockets(app)

# presumably Hebrew (Israel) -- confirm against the supported-languages list
language_code = "iw-IL"  # a BCP-47 language tag

# Module-level client and recognition settings shared by every connection.
client = speech.SpeechClient()
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=RATE,
    language_code=language_code,
)

# interim_results=True requests non-final hypotheses as well;
# listen_print_loop() handles both the non-final and the final case.
streaming_config = speech.StreamingRecognitionConfig(
    config=config, interim_results=True
)


def listen_print_loop(responses):
    """Iterate through server responses and print the transcriptions.

    ``responses`` is an iterable of streaming-recognition responses; when it
    is the generator returned by the speech client it blocks until the server
    provides the next response.

    Each response may contain multiple results, and each result may contain
    multiple alternatives. Only the transcript of the top alternative of the
    first result is printed.

    Interim (non-final) results are written followed by a carriage return so
    that the next result overwrites them on the same console line. A final
    result is printed with the ``==>`` prefix and a newline so that the
    finalized transcription is preserved.

    Args:
        responses: Iterable of objects exposing ``results``; each result
            exposes ``alternatives`` (with ``transcript``) and ``is_final``.
    """
    # Length of the interim line currently shown on the console.
    num_chars_printed = 0
    for response in responses:
        if not response.results:
            continue

        # The `results` list is consecutive. For streaming, we only care about
        # the first result being considered, since once it's `is_final`, it
        # moves on to considering the next utterance.
        result = response.results[0]
        if not result.alternatives:
            continue

        # Display the transcription of the top alternative.
        transcript = result.alternatives[0].transcript

        # If the previous interim line was longer than this one, pad with
        # spaces so the leftover characters are erased. A negative count
        # yields an empty string.
        overwrite_chars = " " * (num_chars_printed - len(transcript))

        if not result.is_final:
            # Carriage return: the next write starts at column 0 again.
            sys.stdout.write(transcript + overwrite_chars + "\r")
            sys.stdout.flush()
            num_chars_printed = len(transcript)
        else:
            print('==>'+transcript + overwrite_chars)
            num_chars_printed = 0


@app.route("/ncco")
def answer_call():
    """Return the NCCO (as JSON) for an incoming call.

    The NCCO has two actions: a spoken greeting, followed by a ``connect``
    action that points the call at this server's ``/socket`` websocket. The
    websocket host is taken from the incoming request.
    """
    websocket_endpoint = {
        "type": "websocket",
        "uri": "wss://{0}/socket".format(request.host),
        "content-type": "audio/l16;rate=16000",
    }
    greeting_action = {
        "action": "talk",
        "text": "Please wait while we connect you to the echo server",
    }
    connect_action = {
        "action": "connect",
        "from": "NUMBER",
        "endpoint": [websocket_endpoint],
    }
    return jsonify([greeting_action, connect_action])


@app.route("/webhooks/event", methods=["POST"])
def events():
    """Acknowledge a POST to the event webhook.

    The request payload is not read; the literal text "200" is returned as
    the response body.
    """
    return "200"


@sockets.route("/socket", methods=["GET"])
def echo_socket(ws):
    """Stream the audio received on the websocket to the recognizer.

    Text frames are printed as-is. Binary frames are queued and fed, in
    order, into a single ``streaming_recognize`` call that runs on a worker
    thread; the transcriptions are printed by ``listen_print_loop``.

    One recognition stream is used for the whole connection. Iterating over a
    bytes frame (as an earlier version did) yields integers rather than audio
    chunks, and opening a new stream per frame discards the context the
    recognizer needs.

    Args:
        ws: The websocket object supplied by flask_sockets.
    """
    # Local imports: only this handler needs them.
    import queue
    import threading

    audio_queue = queue.Queue()

    def request_stream():
        # Block until the next frame arrives; a None sentinel ends the
        # stream, which lets the recognizer finish and the worker exit.
        while True:
            chunk = audio_queue.get()
            if chunk is None:
                return
            yield speech.StreamingRecognizeRequest(audio_content=chunk)

    def transcribe():
        responses = client.streaming_recognize(streaming_config, request_stream())
        listen_print_loop(responses)

    worker = threading.Thread(target=transcribe, daemon=True)
    worker.start()

    try:
        while not ws.closed:
            message = ws.receive()
            if message is None:
                # receive() yields None once the peer has gone away.
                break
            if isinstance(message, str):
                print(message)
            elif message:
                # Copy into immutable bytes before handing to another thread.
                audio_queue.put(bytes(message))
    finally:
        # Always release the worker, even if receive() raised.
        audio_queue.put(None)


if __name__ == "__main__":
    # Script entry point: serve the Flask app through gevent's WSGI server
    # with the websocket-aware handler, listening on port 3000 on all
    # interfaces.
    from gevent import pywsgi
    from geventwebsocket.handler import WebSocketHandler

    listen_address = ("", 3000)
    wsgi_server = pywsgi.WSGIServer(
        listen_address,
        app,
        handler_class=WebSocketHandler,
    )
    print("server is Up")
    wsgi_server.serve_forever()


基本上,我想把谷歌官方的麦克风流式识别示例改成使用来自 websocket 的音频流(而不是麦克风),但我的实现存在问题。

1 个答案:

答案 0 :(得分:0)

我找到了一个解决方案,代码如下:

#!/usr/bin/env python3
import base64
import json
import threading
import os, sys

from flask import Flask, request, jsonify
from flask_sockets import Sockets
from google.cloud.speech import RecognitionConfig, StreamingRecognitionConfig
from SpeechClientBridge import SpeechClientBridge

# NOTE(review): PATH is not defined in this listing; presumably it is the
# filesystem path of the Google service-account JSON key -- confirm.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = PATH


# Recognition settings: 16-bit linear PCM at 16 kHz, matching the
# "audio/l16;rate=16000" content-type requested in the NCCO below.
config = RecognitionConfig(
    encoding=RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="iw-IL",
)
# interim_results=True requests non-final hypotheses as well;
# on_transcription_response() handles both the non-final and the final case.
streaming_config = StreamingRecognitionConfig(config=config, interim_results=True)


# Flask application plus the flask_sockets wrapper that provides the
# @sockets.route decorator used for the websocket endpoint.
app = Flask(__name__)
sockets = Sockets(app)


@app.route("/ncco")
def answer_call():
    """Return the NCCO (as JSON) that connects an incoming call to the websocket.

    The single ``connect`` action points the call at this server's
    ``/socket`` endpoint; the host is taken from the incoming request.
    """
    websocket_endpoint = {
        "type": "websocket",
        "uri": "wss://{0}/socket".format(request.host),
        "content-type": "audio/l16;rate=16000",
    }
    connect_action = {
        "action": "connect",
        "from": "+97223764024",
        "endpoint": [websocket_endpoint],
    }
    return jsonify([connect_action])


def on_transcription_response(response):
    """Print the top transcription of one streaming-recognition response.

    Interim (non-final) results are written followed by a carriage return so
    the next result overwrites them on the same console line; a final result
    is printed with the ``==>`` prefix and a newline.

    The length of the interim line currently on screen is remembered between
    calls (as an attribute on this function) so that a shorter follow-up
    result can be padded with spaces to erase the leftover characters. That
    state is shared by every caller of this function, so output from
    several simultaneous connections will interleave.

    Args:
        response: Object exposing ``results``; each result exposes
            ``alternatives`` (with ``transcript``) and ``is_final``.
    """
    if not response.results:
        return

    result = response.results[0]
    if not result.alternatives:
        return

    transcript = result.alternatives[0].transcript

    # The callback is invoked once per response, so a local counter would be
    # reset on every call; keep it on the function object instead.
    previous_len = getattr(on_transcription_response, "_num_chars_printed", 0)
    # A negative count yields an empty string.
    overwrite_chars = " " * (previous_len - len(transcript))

    if not result.is_final:
        # Carriage return: the next write starts at column 0 again.
        sys.stdout.write(transcript + overwrite_chars + "\r")
        sys.stdout.flush()
        on_transcription_response._num_chars_printed = len(transcript)
    else:
        print('==>'+transcript + overwrite_chars)
        on_transcription_response._num_chars_printed = 0


@app.route("/webhooks/event", methods=["POST"])
def events():
    """Acknowledge a POST to the event webhook.

    The request payload is not read; the literal text "200" is returned as
    the response body.
    """
    return "200"


@sockets.route("/socket", methods=["GET"])
def transcript(ws):
    """Forward the audio received on the websocket to a SpeechClientBridge.

    A bridge is created per connection and started on its own thread. Text
    frames are printed; every other non-None frame is handed to the bridge as
    audio. When the socket closes (``receive()`` returns None) or the loop
    fails, the bridge is always shut down so its thread can exit.

    Args:
        ws: The websocket object supplied by flask_sockets.
    """
    print("WS connection opened")
    bridge = SpeechClientBridge(streaming_config, on_transcription_response)
    t = threading.Thread(target=bridge.start)
    t.start()

    try:
        while not ws.closed:
            message = ws.receive()
            if message is None:
                break

            if isinstance(message, str):
                print(message)
            else:
                bridge.add_request(message)
    finally:
        # Set the ended flag first, then put the None sentinel straight on
        # the bridge's queue: its generator may be blocked in get() and only
        # a queued item wakes it. add_request() is meant for audio buffers,
        # so the sentinel is not sent through it.
        bridge.terminate()
        bridge._queue.put(None)

    print("WS connection closed")


# def echo_socket(ws):
#     print("WS connection opened")
#     while not ws.closed:
#         message = ws.receive()
#         ws.send(message)


if __name__ == "__main__":
    # Script entry point: serve the Flask app through gevent's WSGI server
    # with the websocket-aware handler, listening on port 3000 on all
    # interfaces.
    from gevent import pywsgi
    from geventwebsocket.handler import WebSocketHandler

    listen_address = ("", 3000)
    wsgi_server = pywsgi.WSGIServer(
        listen_address,
        app,
        handler_class=WebSocketHandler,
    )
    print("server is Up")
    wsgi_server.serve_forever()

另外还需要下面这个关键的类(上面的脚本通过 `from SpeechClientBridge import SpeechClientBridge` 导入它,因此应保存为 SpeechClientBridge.py):

import queue

from google.cloud import speech


class SpeechClientBridge:
    """Bridge between a producer of audio buffers and a streaming recognizer.

    Audio is pushed in with :meth:`add_request` (typically from the thread
    that reads the websocket) and pulled out by :meth:`generator`, which
    :meth:`start` feeds into ``streaming_recognize``. Every response is passed
    to the ``on_response`` callback. A ``None`` item on the queue is the
    end-of-stream sentinel.
    """

    def __init__(self, streaming_config, on_response):
        """Store the configuration and callback; no network activity here.

        Args:
            streaming_config: Passed unchanged to ``streaming_recognize``.
            on_response: Callable invoked with each response received.
        """
        self._on_response = on_response
        self._queue = queue.Queue()
        self._ended = False
        self.streaming_config = streaming_config

    def start(self):
        """Open the recognition stream and process responses until it ends.

        Blocks for the lifetime of the stream, so it is meant to be run on
        its own thread.
        """
        client = speech.SpeechClient()
        stream = self.generator()
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in stream
        )
        responses = client.streaming_recognize(self.streaming_config, requests)
        self.process_responses_loop(responses)

    def terminate(self):
        """Stop the bridge.

        Sets the ended flag and also enqueues the ``None`` sentinel: the
        generator may be blocked in ``Queue.get()``, where it would never see
        the flag on its own.
        """
        self._ended = True
        self._queue.put(None)

    def add_request(self, buffer):
        """Queue one audio buffer, or ``None`` to signal end of stream.

        Args:
            buffer: A bytes-like object holding audio, or ``None``. ``None``
                is enqueued unchanged (``bytes(None)`` would raise
                ``TypeError``); anything else is copied into ``bytes``.
        """
        item = None if buffer is None else bytes(buffer)
        self._queue.put(item, block=False)

    def process_responses_loop(self, responses):
        """Pass each response to the callback until the bridge has ended.

        The ended flag is checked after the callback, so the response that
        is in flight when :meth:`terminate` is called is still delivered.
        """
        for response in responses:
            self._on_response(response)

            if self._ended:
                break

    def generator(self):
        """Yield queued audio, coalescing whatever is buffered into one chunk.

        Yields:
            bytes: The concatenation of all buffers available at that moment.

        Stops when the bridge has ended or when the ``None`` sentinel is
        dequeued. Audio collected before the sentinel is still yielded.
        """
        while not self._ended:
            # Use a blocking get() to ensure there's at least one chunk of
            # data, and stop iteration if the chunk is None, indicating the
            # end of the audio stream.
            chunk = self._queue.get()
            if chunk is None:
                return
            data = [chunk]

            # Now consume whatever other data's still buffered.
            while True:
                try:
                    chunk = self._queue.get(block=False)
                except queue.Empty:
                    break
                if chunk is None:
                    # Flush the audio gathered so far instead of dropping it.
                    yield b"".join(data)
                    return
                data.append(chunk)

            yield b"".join(data)