I am trying to do some real-time speech-to-text transcription using IBM Watson's Speech to Text (STT) service with the NAO and Pepper robots.
I tried to follow the example that IBM's GitHub provides for the Python SDK (found here: https://github.com/watson-developer-cloud/python-sdk/blob/master/examples/microphone-speech-to-text.py), but I ran into some problems: the service does not correctly accept the data buffer I send to the websocket.
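For reference, the IBM example streams microphone audio through a Queue-backed AudioSource roughly like this (a simplified sketch of the example as I understand it, not my robot code; the pyaudio parameters are the ones the example uses):

from Queue import Queue
from threading import Thread
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource
import pyaudio

speech_to_text = SpeechToTextV1(
    iam_apikey='xyz',
    url='https://stream.watsonplatform.net/speech-to-text/api')

# queue of raw audio chunks; AudioSource(input, is_recording, is_buffer) streams them
q = Queue(maxsize=100)
audio_source = AudioSource(q, True, True)

def pyaudio_callback(in_data, frame_count, time_info, status):
    q.put(in_data)  # in_data is already a raw byte string
    return (None, pyaudio.paContinue)

pa = pyaudio.PyAudio()
stream = pa.open(format=pyaudio.paInt16, channels=1, rate=44100,
                 input=True, frames_per_buffer=1024,
                 stream_callback=pyaudio_callback)
stream.start_stream()

# recognize_using_websocket blocks, so the example runs it on a thread
recognize_thread = Thread(target=speech_to_text.recognize_using_websocket,
                          kwargs={'audio': audio_source,
                                  'content_type': 'audio/l16; rate=44100',
                                  'recognize_callback': MyRecognizeCallback()})  # defined in my code below
recognize_thread.start()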
The IBM websocket documentation (available at https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-audio-formats#audio-formats) states that PCM data must be in 16-bit format.
When I examined the code used to retrieve the robot's audio buffer (taken from the Stack Overflow question "NAO robot remote audio problems", linked in the code below), I found that the buffer returns its data as a string representation of byte data. This data is then converted into 16-bit integer data using numpy. However, the IBM websocket does not handle this data correctly.
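Since the docs ask for 16-bit PCM, my expectation is that the websocket ultimately needs raw 16-bit bytes rather than a numpy array. A conversion along these lines is what I have been experimenting with (a sketch only; the variable names mirror the processRemote() signature below, and the little-endian assumption is mine):

import numpy as np

# 'buffer' arrives from ALAudioDevice as a Python 2 str of raw interleaved samples
samples = np.fromstring(buffer, dtype=np.int16)
channels = np.reshape(samples, (nbOfChannels, nbrOfSamplesByChannel), 'F')
front = channels[0]                      # front-microphone channel only
chunk = front.astype('<i2').tostring()   # back to little-endian 16-bit bytes
queue.put(chunk)                         # hand raw bytes to the AudioSource queue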
The code I am using to retrieve the buffer and send it to the websocket is as follows:
# -*- coding: utf-8 -*-
####################################################################################
# Retrieve robot audio buffer from NAO/Pepper
#
# Audio data from buffer is then processed and converted
# into a .wav file
#
# Wav file is then read by IBM Watson's speech-to-text
# (STT) service
#
# resulting transcription of .wav file is then saved
#
# SoundReceiverModule inspired by the work of Alexandre Mazel
# from https://stackoverflow.com/questions/24243757/nao-robot-remote-audio-problems
####################################################################################
from __future__ import print_function
from naoqi import ALModule, ALBroker, ALProxy
import numpy as np
import time
import sys
import os
import wave
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource
import json
from threading import Thread
from Queue import Queue
#--------------------------------------------------------------------------------------------
# Module for Watson Speech To Text Real Time Streaming
#--------------------------------------------------------------------------------------------
class MyRecognizeCallback(RecognizeCallback):
    def __init__(self):
        RecognizeCallback.__init__(self)
        self.transcript = ''

    def on_connected(self):
        print('Connected to Watson Speech to Text')

    def on_listening(self):
        print('Listening for audio...')

    # def on_data(self, data):
    #     results = data['results'][0]['alternatives'][0]['transcript']
    #     print('User: ', end='')
    #     print(results)

    # def on_hypothesis(self, hypothesis):
    #     print('Hypothesis: ', end='')
    #     print(hypothesis)

    def on_transcription(self, transcript):
        self.transcript = transcript[0]['transcript'].encode('ascii', 'ignore')
        print('User transcript: ', end='')
        print(self.transcript)

    def get_transcript(self):
        return self.transcript

    def on_error(self, error):
        print('Error received: {}'.format(error))

    def on_inactivity_timeout(self, error):
        print('Inactivity timeout: {}'.format(error))
#--------------------------------------------------------------------------------------------
# Module for remote processing of audio data from NAO
#--------------------------------------------------------------------------------------------
class SoundReceiverModule(ALModule):
    """
    Use this object to get callbacks from the ALMemory of the naoqi world.
    Your callback needs to be a method with two parameters (variable name, value).
    """

    def __init__(self, strModuleName, myRecognizeCallback, speech_to_text):
        try:
            ALModule.__init__(self, strModuleName)
            self.BIND_PYTHON(self.getName(), "callback")
            self.myRecognizeCallback = myRecognizeCallback
            self.speech_to_text = speech_to_text
            self.outfile = None
            self.wavfileName = None
            self.transcript = ''
            self.queue = Queue()
            # AudioSource(input, is_recording, is_buffer)
            self.audioSource = AudioSource(self.queue, True, True)
        except BaseException, err:
            print("ERR: SoundReceiverModule: loading error: %s" % str(err))

    def get_transcript(self):
        return self.transcript
    def listen(self):
        audio = ALProxy('ALAudioDevice')
        nNbrChannelFlag = 3  # ALL_Channels: 0, AL::LEFTCHANNEL: 1, AL::RIGHTCHANNEL: 2, AL::FRONTCHANNEL: 3, AL::REARCHANNEL: 4
        nDeinterleave = 0
        nSampleRate = 16000
        audio.setClientPreferences(self.getName(), nSampleRate, nNbrChannelFlag, nDeinterleave)  # setting same as default generates a bug!?
        strFilenameOut = os.path.join(os.getcwd(), 'out.raw')
        self.outfile = open(strFilenameOut, 'wb')
        if self.outfile != None:
            self.outfile.seek(0)
            self.outfile.truncate()
        # start remote processing
        audio.subscribe(self.getName())
        # recognize_using_websocket blocks until the stream ends, so run it
        # on a background thread (Thread is imported above for this purpose)
        recognize_thread = Thread(target=self.speech_to_text.recognize_using_websocket,
                                  kwargs={'audio': self.audioSource,
                                          'content_type': 'audio/l16;rate=16000',
                                          'recognize_callback': self.myRecognizeCallback,
                                          'interim_results': True,
                                          'max_alternatives': 3})
        recognize_thread.start()
        print("INF: SoundReceiver: started!")
        print("INF: Writing sound to '%s'" % strFilenameOut)
    def stop(self):
        print("INF: SoundReceiver: stopping...")
        audio = ALProxy("ALAudioDevice")
        audio.unsubscribe(self.getName())
        # tell the AudioSource that no more audio is coming so the
        # websocket stream can finish cleanly
        self.audioSource.completed_recording()
        print("INF: SoundReceiver: stopped!")
        print('')
        if self.outfile != None:
            self.outfile.close()
            self.wavfileName = self.rawToWav(self.outfile)
            # self.transcript = self.process_raw_audio_data(self.wavfileName)
        else:
            print("outfile not saved properly")
    def processRemote(self, nbOfChannels, nbrOfSamplesByChannel, aTimeStamp, buffer):
        """
        This is THE method that receives all the sound buffers from the "ALAudioDevice" module
        """
        # self.queue.put(buffer)
        aSoundDataInterlaced = np.fromstring(buffer, dtype=np.int16)
        aSoundData = np.reshape(aSoundDataInterlaced, (nbOfChannels, nbrOfSamplesByChannel), 'F')
        # print(aSoundData[0])
        # print('')
        self.queue.put(aSoundData[0])
        aSoundData[0].tofile(self.outfile)
    # convert raw file to wav file
    def rawToWav(self, raw):
        if not os.path.isfile(raw.name):
            print("file not in path...")
            return
        print("Converting .raw file to .wav file...")
        wav = wave.open(raw.name.replace(".raw", ".wav"), "wb")
        wav.setframerate(16000)
        wav.setnchannels(1)
        wav.setsampwidth(2)
        f = open(raw.name, 'rb')
        sample = f.read(4096)
        while sample != "":
            wav.writeframes(sample)
            sample = f.read(4096)
        wav.close()  # flush the wav header before handing the path back
        path = raw.name.replace(".raw", ".wav")
        f.close()
        return path

    def version(self):
        return "0.6"
#---------------------------------------------------------------------------------------------------------------------
# Main function for testing purposes
#---------------------------------------------------------------------------------------------------------------------
def main():
    """ Main entry point """
    NAO_IP = "192.168.20.151"  # Nao IP address
    pip = NAO_IP
    pport = 9559

    # We need this broker to be able to construct
    # NAOqi modules and subscribe to other modules.
    # The broker must stay alive until the program exits.
    myBroker = ALBroker("myBroker",
                        "0.0.0.0",  # listen to anyone
                        0,          # find a free port and use it
                        pip,        # parent broker IP
                        pport)      # parent broker port

    # initialize Watson Speech to Text
    speech_to_text = SpeechToTextV1(
        iam_apikey='xyz',
        url='https://stream.watsonplatform.net/speech-to-text/api')
    myRecognizeCallback = MyRecognizeCallback()

    # initialize SoundReceiver
    global SoundReceiver
    SoundReceiver = SoundReceiverModule("SoundReceiver", myRecognizeCallback, speech_to_text)

    # start SoundReceiver and Watson Speech to Text
    leds = ALProxy('ALLeds')
    leds.setIntensity('EarLeds', 1)
    SoundReceiver.listen()
    time.sleep(4)
    SoundReceiver.stop()
    leds.setIntensity('EarLeds', .5)

if __name__ == "__main__":
    main()
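To rule out problems on the Watson side, I also plan to sanity-check the credentials and service with a plain file-based request, something along these lines ('test.wav' is just a placeholder recording):

from ibm_watson import SpeechToTextV1
import json

speech_to_text = SpeechToTextV1(
    iam_apikey='xyz',
    url='https://stream.watsonplatform.net/speech-to-text/api')

# one-shot HTTP recognition of a finished .wav file (no websocket involved)
with open('test.wav', 'rb') as audio_file:
    result = speech_to_text.recognize(
        audio=audio_file,
        content_type='audio/wav').get_result()
print(json.dumps(result, indent=2))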
I would appreciate any help anyone can provide. Thank you.