我正在尝试使用 vonage API 和谷歌语音将电话实时转录为文本,但是当我尝试使用谷歌 STT 时,我一直收到错误消息。
这是我得到的错误:
<rootDir>
这是我的代码:
from flask import Flask, request, jsonify
from flask_sockets import Sockets
from google.cloud import speech # speech to text service
import sys
import os
# Audio recording parameters: 16 kHz mono, consumed in 100 ms chunks
# (matches the "audio/l16;rate=16000" content-type sent by Vonage below).
RATE = 16000
CHUNK = int(RATE / 10)  # 100ms
# NOTE(review): PATH is not defined anywhere in this file — it is a
# placeholder for the Google service-account JSON key path; define it
# (or export GOOGLE_APPLICATION_CREDENTIALS externally) before running.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = PATH
app = Flask(__name__)
sockets = Sockets(app)
language_code = "iw-IL"  # a BCP-47 language tag
client = speech.SpeechClient()
# Recognition settings for the raw call audio: 16-bit linear PCM at RATE Hz.
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=RATE,
    language_code=language_code,
)
# interim_results=True delivers partial hypotheses before the final one.
streaming_config = speech.StreamingRecognitionConfig(
    config=config, interim_results=True
)
def listen_print_loop(responses):
    """Iterate over streaming-recognition responses and print transcripts.

    ``responses`` is a generator that blocks until the server provides a
    response.  Only the transcription of the top alternative of the first
    result is printed.  Interim results end with a carriage return so the
    next line overwrites them in place; a final result is printed with an
    '==>' prefix and a trailing newline to preserve it.

    Fix vs. original: removed the stray ``global phaseNum`` declaration —
    that name is never defined, assigned, or read anywhere in this file.
    """
    num_chars_printed = 0
    for response in responses:
        if not response.results:
            continue
        # For streaming, only the first result matters: once it is
        # `is_final`, the API moves on to the next utterance.
        result = response.results[0]
        if not result.alternatives:
            continue
        transcript = result.alternatives[0].transcript
        # If the previous interim line was longer than this one, pad with
        # spaces so the carriage-return overwrite fully erases it.
        overwrite_chars = " " * (num_chars_printed - len(transcript))
        if not result.is_final:
            sys.stdout.write(transcript + overwrite_chars + "\r")
            sys.stdout.flush()
            num_chars_printed = len(transcript)
        else:
            print('==>'+transcript + overwrite_chars)
            num_chars_printed = 0
@app.route("/ncco")
def answer_call():
    """Vonage answer webhook.

    Returns an NCCO that speaks a greeting to the caller, then connects
    the call audio to this host's /socket websocket as 16 kHz linear PCM.
    """
    talk_action = {
        "action": "talk",
        "text": "Please wait while we connect you to the echo server",
    }
    websocket_endpoint = {
        "type": "websocket",
        "uri": "wss://{0}/socket".format(request.host),
        "content-type": "audio/l16;rate=16000",
    }
    connect_action = {
        "action": "connect",
        "from": "NUMBER",
        "endpoint": [websocket_endpoint],
    }
    return jsonify([talk_action, connect_action])
@app.route("/webhooks/event", methods=["POST"])
def events():
    """Vonage event webhook: acknowledge every event with the body "200"."""
    return "200"
@sockets.route("/socket", methods=["GET"])
def echo_socket(ws):
    # Websocket handler: pump incoming frames from the Vonage call into
    # Google streaming speech recognition.
    while not ws.closed:
        message = ws.receive()
        if type(message) == str:
            # Text frames carry Vonage control JSON; just log them.
            print(message)
        elif message:
            # NOTE(review): `message` is a binary frame here, so iterating it
            # yields *ints*, not byte chunks — each StreamingRecognizeRequest
            # receives a single integer as audio_content.  This is almost
            # certainly the source of the reported error.  Also, a brand-new
            # streaming_recognize session is opened per websocket frame
            # instead of one long-lived stream fed by all frames (the
            # SpeechClientBridge approach in the accepted answer fixes both).
            requests = (speech.StreamingRecognizeRequest(audio_content=chunk) for chunk in message)
            responses = client.streaming_recognize(streaming_config, requests)
            # listen_print_loop(responses)
        # ws.send(message)
if __name__ == "__main__":
    # gevent's WSGI server with the websocket handler is required so that
    # flask_sockets routes are served alongside the plain HTTP routes
    # on the same port (3000).
    from gevent import pywsgi
    from geventwebsocket.handler import WebSocketHandler
    server = pywsgi.WSGIServer(("", 3000), app, handler_class=WebSocketHandler)
    print('server is Up')
    server.serve_forever()
基本上我尝试使用来自麦克风的网络套接字的流来实现谷歌麦克风流示例,但该实现存在问题
答案 0(得分:0):
我找到了一个解决方案,在这里:
#!/usr/bin/env python3
import base64
import json
import threading
import os, sys
from flask import Flask, request, jsonify
from flask_sockets import Sockets
from google.cloud.speech import RecognitionConfig, StreamingRecognitionConfig
from SpeechClientBridge import SpeechClientBridge
# NOTE(review): PATH is undefined in this file — it is a placeholder for the
# Google service-account JSON key path; set it before running.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = PATH
# Recognition settings matching the websocket audio format declared in the
# NCCO below: 16-bit linear PCM at 16 kHz, language tag "iw-IL" (BCP-47).
config = RecognitionConfig(
    encoding=RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="iw-IL",
)
# interim_results=True delivers partial hypotheses before the final result.
streaming_config = StreamingRecognitionConfig(config=config, interim_results=True)
app = Flask(__name__)
sockets = Sockets(app)
@app.route("/ncco")
def answer_call():
    """Vonage answer webhook: NCCO connecting the inbound call's audio to
    this host's /socket websocket endpoint as 16 kHz linear PCM."""
    socket_endpoint = {
        "type": "websocket",
        "uri": "wss://{0}/socket".format(request.host),
        "content-type": "audio/l16;rate=16000",
    }
    connect_action = {
        "action": "connect",
        "from": "+97223764024",
        "endpoint": [socket_endpoint],
    }
    return jsonify([connect_action])
def on_transcription_response(response):
    """Print the top transcript of a streaming-recognition response.

    Interim results are written with a trailing carriage return so the next
    line overwrites them; a final result is printed with an '==>' prefix.

    Fix vs. original: ``num_chars_printed`` was a local variable reset to 0
    on every call, so ``overwrite_chars`` was always empty and a shorter
    transcript could never erase a longer interim line.  The count is now
    persisted across calls as an attribute of the function itself, keeping
    the callback's signature unchanged.
    """
    if not response.results:
        return
    # Only the first result matters for streaming recognition.
    result = response.results[0]
    if not result.alternatives:
        return
    transcript = result.alternatives[0].transcript
    # Length of the previously printed interim line (0 on first call).
    prev_len = getattr(on_transcription_response, "_num_chars_printed", 0)
    # Pad with spaces so the carriage-return overwrite erases leftovers.
    overwrite_chars = " " * (prev_len - len(transcript))
    print(result.is_final)  # debug output kept from the original
    if not result.is_final:
        sys.stdout.write(transcript + overwrite_chars + "\r")
        sys.stdout.flush()
        on_transcription_response._num_chars_printed = len(transcript)
    else:
        print('==>'+transcript + overwrite_chars)
        on_transcription_response._num_chars_printed = 0
@app.route("/webhooks/event", methods=["POST"])
def events():
    """Vonage event webhook: acknowledge every event with the body "200"."""
    return "200"
@sockets.route("/socket", methods=["GET"])
def transcript(ws):
    """Websocket handler: feed Vonage call audio into a SpeechClientBridge.

    A worker thread runs the blocking Google streaming-recognition session;
    this handler only shovels received frames into the bridge's queue.

    Fixes vs. original: removed the unreachable ``elif type(message) == None``
    branch (``type()`` never returns None, and a None message is already
    handled above) and the commented-out echo_socket dead code.
    """
    print("WS connection opened")
    bridge = SpeechClientBridge(streaming_config, on_transcription_response)
    worker = threading.Thread(target=bridge.start)
    worker.start()
    while not ws.closed:
        message = ws.receive()
        if message is None:
            # Peer closed: forward the end-of-stream sentinel and stop.
            # NOTE(review): the bridge must accept None in add_request —
            # confirm it does not call bytes() on the sentinel.
            bridge.add_request(None)
            bridge.terminate()
            break
        if isinstance(message, str):
            # Text frames carry Vonage control JSON; just log them.
            print(message)
        else:
            # Binary frame: raw 16 kHz linear-PCM audio for the recognizer.
            bridge.add_request(message)
    bridge.terminate()
    print("WS connection closed")
if __name__ == "__main__":
    # gevent's WSGI server with the websocket handler is required so that
    # flask_sockets routes are served alongside the plain HTTP routes
    # on the same port (3000).
    from gevent import pywsgi
    from geventwebsocket.handler import WebSocketHandler
    server = pywsgi.WSGIServer(("", 3000), app, handler_class=WebSocketHandler)
    print('server is Up')
    server.serve_forever()
还有一个重要的类:
import queue
from google.cloud import speech
class SpeechClientBridge:
    """Bridge between a websocket feeding audio buffers and Google streaming
    speech recognition.

    A producer thread pushes audio with :meth:`add_request` (None signals end
    of stream); :meth:`start` runs the blocking recognition session and calls
    ``on_response`` for every response.

    Fixes vs. original:
    * ``add_request(None)`` used to raise ``TypeError`` because it called
      ``bytes(None)``, even though the generator explicitly treats None as
      the end-of-stream sentinel — the sentinel is now passed through.
    * ``terminate()`` now also enqueues the sentinel so a generator blocked
      in ``Queue.get()`` wakes up instead of hanging forever.
    * a sentinel found while draining the queue no longer silently discards
      audio that was already buffered — it is yielded before stopping.
    """

    def __init__(self, streaming_config, on_response):
        # on_response: callback invoked once per streaming response.
        self._on_response = on_response
        # Thread-safe hand-off of audio chunks from websocket to recognizer.
        self._queue = queue.Queue()
        self._ended = False
        self.streaming_config = streaming_config

    def start(self):
        """Run the streaming-recognition session (blocking; run in a thread)."""
        client = speech.SpeechClient()
        stream = self.generator()
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in stream
        )
        responses = client.streaming_recognize(self.streaming_config, requests)
        self.process_responses_loop(responses)

    def terminate(self):
        """Stop the bridge and unblock a generator waiting on the queue."""
        self._ended = True
        self._queue.put(None)

    def add_request(self, buffer):
        """Enqueue an audio buffer; None is the end-of-stream sentinel."""
        self._queue.put(None if buffer is None else bytes(buffer), block=False)

    def process_responses_loop(self, responses):
        # Dispatch responses until the stream ends or terminate() is called.
        for response in responses:
            self._on_response(response)
            if self._ended:
                break

    def generator(self):
        """Yield queued audio, coalescing everything currently buffered."""
        while not self._ended:
            # Blocking get(): guarantees at least one chunk per yield; a
            # None chunk means the audio stream has ended.
            chunk = self._queue.get()
            if chunk is None:
                return
            data = [chunk]
            # Drain whatever else is already buffered without blocking.
            while True:
                try:
                    chunk = self._queue.get(block=False)
                except queue.Empty:
                    break
                if chunk is None:
                    # End-of-stream mid-drain: yield what we have, then stop.
                    self._ended = True
                    break
                data.append(chunk)
            yield b"".join(data)