I am trying to do some real-time speech-to-text transcription using IBM Watson's Speech to Text (STT) service with the NAO and Pepper robots.
I tried to follow the example that IBM's GitHub provides for the Python SDK (found here: https://github.com/watson-developer-cloud/python-sdk/blob/master/examples/microphone-speech-to-text.py), but I ran into some problems: the service does not correctly accept the data buffer I send to the websocket.
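For reference, the IBM example streams microphone audio through a Queue-backed AudioSource roughly like this (a simplified sketch of the example as I understand it, not my robot code; the pyaudio parameters are the ones the example uses):

from Queue import Queue
from threading import Thread
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource
import pyaudio

speech_to_text = SpeechToTextV1(
    iam_apikey='xyz',
    url='https://stream.watsonplatform.net/speech-to-text/api')

# queue of raw audio chunks; AudioSource(input, is_recording, is_buffer) streams them
q = Queue(maxsize=100)
audio_source = AudioSource(q, True, True)

def pyaudio_callback(in_data, frame_count, time_info, status):
    q.put(in_data)  # in_data is already a raw byte string
    return (None, pyaudio.paContinue)

pa = pyaudio.PyAudio()
stream = pa.open(format=pyaudio.paInt16, channels=1, rate=44100,
                 input=True, frames_per_buffer=1024,
                 stream_callback=pyaudio_callback)
stream.start_stream()

# recognize_using_websocket blocks, so the example runs it on a thread
recognize_thread = Thread(target=speech_to_text.recognize_using_websocket,
                          kwargs={'audio': audio_source,
                                  'content_type': 'audio/l16; rate=44100',
                                  'recognize_callback': MyRecognizeCallback()})  # defined in my code below
recognize_thread.start()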
The IBM websocket documentation (available at https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-audio-formats#audio-formats) states that PCM data must be in 16-bit format.
When I examined the code used to retrieve the robot's audio buffer (taken from the Stack Overflow question "NAO robot remote audio problems", linked in the code below), I found that the buffer returns its data as a string representation of byte data. This data is then converted into 16-bit integer data using numpy. However, the IBM websocket does not handle this data correctly.
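Since the docs ask for 16-bit PCM, my expectation is that the websocket ultimately needs raw 16-bit bytes rather than a numpy array. A conversion along these lines is what I have been experimenting with (a sketch only; the variable names mirror the processRemote() signature below, and the little-endian assumption is mine):

import numpy as np

# 'buffer' arrives from ALAudioDevice as a Python 2 str of raw interleaved samples
samples = np.fromstring(buffer, dtype=np.int16)
channels = np.reshape(samples, (nbOfChannels, nbrOfSamplesByChannel), 'F')
front = channels[0]                      # front-microphone channel only
chunk = front.astype('<i2').tostring()   # back to little-endian 16-bit bytes
queue.put(chunk)                         # hand raw bytes to the AudioSource queue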
The code I am using to retrieve the buffer and send it to the websocket is as follows:
# -*- coding: utf-8 -*-
####################################################################################
# Retrieve robot audio buffer from NAO/Pepper
#
# Audio data from buffer is then processed and converted
# into a .wav file
#
# Wav file is then read by IBM Watson's speech-to-text
# (STT) service
#
# resulting transcription of .wav file is then saved
#
# SoundReceiverModule inspired by the work of Alexandre Mazel
# from https://stackoverflow.com/questions/24243757/nao-robot-remote-audio-problems
####################################################################################
from __future__ import print_function
from naoqi import ALModule, ALBroker, ALProxy
import numpy as np
import time
import sys
import os
import wave
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource
import json
from threading import Thread
from Queue import Queue
#--------------------------------------------------------------------------------------------
# Module for Watson Speech To Text Real Time Streaming
#--------------------------------------------------------------------------------------------
class MyRecognizeCallback(RecognizeCallback):
    def __init__(self):
        RecognizeCallback.__init__(self)
        self.transcript = ''

    def on_connected(self):
        print('Connected to Watson Speech to Text')

    def on_listening(self):
        print('Listening for audio...')

    # def on_data(self, data):
    #     results = data['results'][0]['alternatives'][0]['transcript']
    #     print('User: ', end='')
    #     print(results)

    # def on_hypothesis(self, hypothesis):
    #     print('Hypothesis: ', end='')
    #     print(hypothesis)

    def on_transcription(self, transcript):
        self.transcript = transcript[0]['transcript'].encode('ascii', 'ignore')
        print('User transcript: ', end='')
        print(self.transcript)

    def get_transcript(self):
        return self.transcript

    def on_error(self, error):
        print('Error received: {}'.format(error))

    def on_inactivity_timeout(self, error):
        print('Inactivity timeout: {}'.format(error))
#--------------------------------------------------------------------------------------------
# Module for remote processing of audio data from NAO
#--------------------------------------------------------------------------------------------
class SoundReceiverModule(ALModule):
    """
    Use this object to get callbacks from the ALMemory of the naoqi world.
    Your callback needs to be a method with two parameters (variable name, value).
    """

    def __init__(self, strModuleName, myRecognizeCallback, speech_to_text):
        try:
            ALModule.__init__(self, strModuleName)
            self.BIND_PYTHON(self.getName(), "callback")
            self.myRecognizeCallback = myRecognizeCallback
            self.speech_to_text = speech_to_text
            self.outfile = None
            self.wavfileName = None
            self.transcript = ''
            self.queue = Queue()
            # AudioSource(input, is_recording, is_buffer)
            self.audioSource = AudioSource(self.queue, True, True)
        except BaseException, err:
            print("ERR: SoundReceiverModule: loading error: %s" % str(err))

    def get_transcript(self):
        return self.transcript
    def listen(self):
        audio = ALProxy('ALAudioDevice')
        nNbrChannelFlag = 3  # ALL_Channels: 0, AL::LEFTCHANNEL: 1, AL::RIGHTCHANNEL: 2, AL::FRONTCHANNEL: 3, AL::REARCHANNEL: 4
        nDeinterleave = 0
        nSampleRate = 16000
        audio.setClientPreferences(self.getName(), nSampleRate, nNbrChannelFlag, nDeinterleave)  # setting same as default generates a bug!?
        strFilenameOut = os.path.join(os.getcwd(), 'out.raw')
        self.outfile = open(strFilenameOut, 'wb')
        if self.outfile != None:
            self.outfile.seek(0)
            self.outfile.truncate()
        # start remote processing
        audio.subscribe(self.getName())
        # recognize_using_websocket blocks until the stream ends, so run it
        # on a background thread (Thread is imported above for this purpose)
        recognize_thread = Thread(target=self.speech_to_text.recognize_using_websocket,
                                  kwargs={'audio': self.audioSource,
                                          'content_type': 'audio/l16;rate=16000',
                                          'recognize_callback': self.myRecognizeCallback,
                                          'interim_results': True,
                                          'max_alternatives': 3})
        recognize_thread.start()
        print("INF: SoundReceiver: started!")
        print("INF: Writing sound to '%s'" % strFilenameOut)
    def stop(self):
        print("INF: SoundReceiver: stopping...")
        audio = ALProxy("ALAudioDevice")
        audio.unsubscribe(self.getName())
        # tell the AudioSource that no more audio is coming so the
        # websocket stream can finish cleanly
        self.audioSource.completed_recording()
        print("INF: SoundReceiver: stopped!")
        print('')
        if self.outfile != None:
            self.outfile.close()
            self.wavfileName = self.rawToWav(self.outfile)
            # self.transcript = self.process_raw_audio_data(self.wavfileName)
        else:
            print("outfile not saved properly")
    def processRemote(self, nbOfChannels, nbrOfSamplesByChannel, aTimeStamp, buffer):
        """
        This is THE method that receives all the sound buffers from the "ALAudioDevice" module
        """
        # self.queue.put(buffer)
        aSoundDataInterlaced = np.fromstring(buffer, dtype=np.int16)
        aSoundData = np.reshape(aSoundDataInterlaced, (nbOfChannels, nbrOfSamplesByChannel), 'F')
        # print(aSoundData[0])
        # print('')
        self.queue.put(aSoundData[0])
        aSoundData[0].tofile(self.outfile)
    # convert raw file to wav file
    def rawToWav(self, raw):
        if not os.path.isfile(raw.name):
            print("file not in path...")
            return
        print("Converting .raw file to .wav file...")
        wav = wave.open(raw.name.replace(".raw", ".wav"), "wb")
        wav.setframerate(16000)
        wav.setnchannels(1)
        wav.setsampwidth(2)
        f = open(raw.name, 'rb')
        sample = f.read(4096)
        while sample != "":
            wav.writeframes(sample)
            sample = f.read(4096)
        wav.close()  # flush the wav header before handing the path back
        path = raw.name.replace(".raw", ".wav")
        f.close()
        return path

    def version(self):
        return "0.6"
#---------------------------------------------------------------------------------------------------------------------
# Main function for testing purposes
#---------------------------------------------------------------------------------------------------------------------
def main():
    """ Main entry point """
    NAO_IP = "192.168.20.151"  # Nao IP address
    pip = NAO_IP
    pport = 9559

    # We need this broker to be able to construct
    # NAOqi modules and subscribe to other modules.
    # The broker must stay alive until the program exits.
    myBroker = ALBroker("myBroker",
                        "0.0.0.0",  # listen to anyone
                        0,          # find a free port and use it
                        pip,        # parent broker IP
                        pport)      # parent broker port

    # initialize Watson Speech to Text
    speech_to_text = SpeechToTextV1(
        iam_apikey='xyz',
        url='https://stream.watsonplatform.net/speech-to-text/api')
    myRecognizeCallback = MyRecognizeCallback()

    # initialize SoundReceiver
    global SoundReceiver
    SoundReceiver = SoundReceiverModule("SoundReceiver", myRecognizeCallback, speech_to_text)

    # start SoundReceiver and Watson Speech to Text
    leds = ALProxy('ALLeds')
    leds.setIntensity('EarLeds', 1)
    SoundReceiver.listen()
    time.sleep(4)
    SoundReceiver.stop()
    leds.setIntensity('EarLeds', .5)

if __name__ == "__main__":
    main()
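To rule out problems on the Watson side, I also plan to sanity-check the credentials and service with a plain file-based request, something along these lines ('test.wav' is just a placeholder recording):

from ibm_watson import SpeechToTextV1
import json

speech_to_text = SpeechToTextV1(
    iam_apikey='xyz',
    url='https://stream.watsonplatform.net/speech-to-text/api')

# one-shot HTTP recognition of a finished .wav file (no websocket involved)
with open('test.wav', 'rb') as audio_file:
    result = speech_to_text.recognize(
        audio=audio_file,
        content_type='audio/wav').get_result()
print(json.dumps(result, indent=2))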
I would appreciate any help anyone can provide. Thank you.