我目前正在研究Softbanks的机器人Pepper,我尝试通过Websocket协议在Watson的音频缓冲区远程流上使用Watson语音转文本解决方案。
我用前一个问题NAO robot remote audio problems的答案来找到一种方式来访问远程胡椒的音频缓冲区,并使用项目https://github.com/ibm-dev/watson-streaming-stt来学习如何使用websocket协议来使用watson流stt。
但是,打开Websocket应用程序后,我开始向watson发送缓冲区,并且在发送几次之后,收到错误消息:“无法从audio / l16; rate = 48000; channel = 1转换为以下其中之一:audio / x-float-array;速率= 16000;频道= 1'
每次尝试将Pepper的音频缓冲区发送给Watson时,它都无法理解。
我比较了我发送的数据和在watson stream stt示例中发送的数据(使用麦克风的pyaudio流而不是Pepper的缓冲区流),但没有发现任何区别。两次我都很确定我要发送一个包含原始字节块的字符串。这是Watson在文档中要求的。
我尝试以48kHz的采样率发送8192字节的块,并且我可以轻松地将Pepper的音频缓冲区转换为六进制,所以我不明白为什么沃森无法对其进行转码。
这是我的代码:
# -*- coding: utf-8 -*-
#!/usr/bin/env python
import argparse
import base64
import configparser
import json
import threading
import time
from optparse import OptionParser
import naoqi
import numpy as np
import sys
from threading import Thread
import ssl
import websocket
from websocket._abnf import ABNF
CHANNELS = 1
NAO_IP = "172.20.10.12"
class SoundReceiverModule(naoqi.ALModule):
"""
Use this object to get call back from the ALMemory of the naoqi world.
Your callback needs to be a method with two parameter (variable name, value).
"""
def __init__( self, strModuleName, strNaoIp):
try:
naoqi.ALModule.__init__(self, strModuleName );
self.BIND_PYTHON( self.getName(),"callback" );
self.strNaoIp = strNaoIp;
self.outfile = None;
self.aOutfile = [None]*(4-1); # ASSUME max nbr channels = 4
self.FINALS = []
self.RECORD_SECONDS = 20
self.ws_open = False
self.ws_listening = ""
# init data for websocket interfaces
self.headers = {}
self.userpass = "" #userpass and password
self.headers["Authorization"] = "Basic " + base64.b64encode(
self.userpass.encode()).decode()
self.url = ("wss://stream.watsonplatform.net//speech-to-text/api/v1/recognize"
"?model=fr-FR_BroadbandModel")
except BaseException, err:
print( "ERR: abcdk.naoqitools.SoundReceiverModule: loading error: %s" % str(err) );
# __init__ - end
def __del__( self ):
print( "INF: abcdk.SoundReceiverModule.__del__: cleaning everything" );
self.stop();
def start( self ):
audio = naoqi.ALProxy( "ALAudioDevice", self.strNaoIp, 9559 );
self.nNbrChannelFlag = 3; # ALL_Channels: 0, AL::LEFTCHANNEL: 1, AL::RIGHTCHANNEL: 2; AL::FRONTCHANNEL: 3 or AL::REARCHANNEL: 4.
self.nDeinterleave = 0;
self.nSampleRate = 48000;
audio.setClientPreferences( self.getName(), self.nSampleRate, self.nNbrChannelFlag, self.nDeinterleave ); # setting same as default generate a bug !?!
audio.subscribe( self.getName() );
#openning websocket app
self._ws = websocket.WebSocketApp(self.url,
header=self.headers,
on_open = self.on_open,
on_message=self.on_message,
on_error=self.on_error,
on_close=self.on_close)
sslopt={"cert_reqs": ssl.CERT_NONE}
threading.Thread(target=self._ws.run_forever, kwargs = {'sslopt':sslopt}).start()
print( "INF: SoundReceiver: started!" );
def stop( self ):
print( "INF: SoundReceiver: stopping..." );
audio = naoqi.ALProxy( "ALAudioDevice", self.strNaoIp, 9559 );
audio.unsubscribe( self.getName() );
print( "INF: SoundReceiver: stopped!" );
print "INF: WebSocket: closing..."
data = {"action": "stop"}
self._ws.send(json.dumps(data).encode('utf8'))
# ... which we need to wait for before we shutdown the websocket
time.sleep(1)
self._ws.close()
print "INF: WebSocket: closed"
if( self.outfile != None ):
self.outfile.close();
def processRemote( self, nbOfChannels, nbrOfSamplesByChannel, aTimeStamp, buffer ):
"""
This is THE method that receives all the sound buffers from the "ALAudioDevice" module"""
print "receiving buffer"
# self.data_to_send = self.data_to_send + buffer
# print len(self.data_to_send)
#self.data_to_send = ''.join( [ "%02X " % ord( x ) for x in buffer ] ).strip()
self.data_to_send = buffer
#print("buffer type :", type(data))
#print("buffer :", buffer)
#~ print( "process!" );
print( "processRemote: %s, %s, %s, lendata: %s, data0: %s (0x%x), data1: %s (0x%x)" % (nbOfChannels, nbrOfSamplesByChannel, aTimeStamp, len(buffer), buffer[0],ord(buffer[0]),buffer[1],ord(buffer[1])) );
if self.ws_open == True and self.ws_listening == True:
print "sending data"
self._ws.send(self.data_to_send, ABNF.OPCODE_BINARY)
print "data sent"
#print self.data_to_send
aSoundDataInterlaced = np.fromstring( str(buffer), dtype=np.int16 );
#
aSoundData = np.reshape( aSoundDataInterlaced, (nbOfChannels, nbrOfSamplesByChannel), 'F' );
# print "processRemote over"
# processRemote - end
def on_message(self, ws, msg):
print("message")
data = json.loads(msg)
print data
if "state" in data:
if data["state"] == "listening":
self.ws_listening = True
if "results" in data:
if data["results"][0]["final"]:
self.FINALS.append(data)
# This prints out the current fragment that we are working on
print(data['results'][0]['alternatives'][0]['transcript'])
def on_error(self, ws, error):
"""Print any errors."""
print(error)
def on_close(self, ws):
"""Upon close, print the complete and final transcript."""
transcript = "".join([x['results'][0]['alternatives'][0]['transcript']
for x in self.FINALS])
print("transcript :", transcript)
self.ws_open = False
def on_open(self, ws):
"""Triggered as soon a we have an active connection."""
# args = self._ws.args
print "INF: WebSocket: opening"
data = {
"action": "start",
# this means we get to send it straight raw sampling
"content-type": "audio/l16;rate=%d;channel=1" % self.nSampleRate,
"continuous": True,
"interim_results": True,
# "inactivity_timeout": 5, # in order to use this effectively
# you need other tests to handle what happens if the socket is
# closed by the server.
"word_confidence": True,
"timestamps": True,
"max_alternatives": 3
}
# Send the initial control message which sets expectations for the
# binary stream that follows:
self._ws.send(json.dumps(data).encode('utf8'))
# Spin off a dedicated thread where we are going to read and
# stream out audio.
print "INF: WebSocket: opened"
self.ws_open = True
def version( self ):
return "0.6";
def main():
"""initialisation
"""
parser = OptionParser()
parser.add_option("--pip",
help="Parent broker port. The IP address or your robot",
dest="pip")
parser.add_option("--pport",
help="Parent broker port. The port NAOqi is listening to",
dest="pport",
type="int")
parser.set_defaults(
pip=NAO_IP,
pport=9559)
(opts, args_) = parser.parse_args()
pip = opts.pip
pport = opts.pport
# We need this broker to be able to construct
# NAOqi modules and subscribe to other modules
# The broker must stay alive until the program exists
myBroker = naoqi.ALBroker("myBroker",
"0.0.0.0", # listen to anyone
0, # find a free port and use it
pip, # parent broker IP
pport) # parent broker port
"""fin initialisation
"""
global SoundReceiver
SoundReceiver = SoundReceiverModule("SoundReceiver", pip) #thread1
SoundReceiver.start()
try:
while True:
time.sleep(1)
print "hello"
except KeyboardInterrupt:
print "Interrupted by user, shutting down"
myBroker.shutdown()
SoundReceiver.stop()
sys.exit(0)
if __name__ == "__main__":
main()
如果有人对如何绕过该错误或如何尝试获取有用的信息有任何想法,我将非常感激。我最初认为我向沃森发送了“错误的”数据,但是经过多次尝试,我对如何解决该问题一无所知。
非常感谢
亚历克斯