I have created a server that sends audio to a client; the client should then forward that audio on to Google Cloud Speech-to-Text.
The problem I'm running into is forwarding the audio data received from the server. I modified the microphone-stream sample code so that it has a getAudioData method, which receives the data the server sends over the socket.
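For reference, here is a minimal sketch of a sender compatible with the client code below. The real server isn't shown in this question, so the 'Hello' handshake, host, and port here are assumptions mirroring the client side:

# Hypothetical sender sketch: capture mic audio and stream it over UDP.
# Values (port 4984, 16 kHz LINEAR16, 100ms chunks) mirror the client below;
# the actual server may differ.
import socket
import pyaudio

RATE = 16000
CHUNK = int(RATE / 10)  # 100ms of 16-bit mono audio
BUFF_SIZE = 65536

server_socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, BUFF_SIZE)
server_socket.bind(('0.0.0.0', 4984))

# Wait for the client's 'Hello' so we learn where to send audio.
msg, client_addr = server_socket.recvfrom(BUFF_SIZE)
print('Streaming to', client_addr)

p = pyaudio.PyAudio()
mic = p.open(format=pyaudio.paInt16, channels=1, rate=RATE,
             input=True, frames_per_buffer=CHUNK)
while True:
    frame = mic.read(CHUNK)  # 3200 bytes, fits in a single UDP datagram
    server_socket.sendto(frame, client_addr)

My client code follows: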
import socket
import threading, wave, pyaudio, time, os
from google.cloud import speech_v1p1beta1 as speech
from six.moves import queue

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "api_key.json"
def transcribe_streaming(stream_file):
    """Streams transcription of the given audio file."""
    import io
    from google.cloud import speech

    client = speech.SpeechClient()

    with io.open(stream_file, "rb") as audio_file:
        content = audio_file.read()

    # In practice, stream should be a generator yielding chunks of audio data.
    stream = [content]
    requests = (
        speech.StreamingRecognizeRequest(audio_content=chunk) for chunk in stream
    )

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    streaming_config = speech.StreamingRecognitionConfig(config=config)

    # streaming_recognize returns a generator.
    responses = client.streaming_recognize(
        config=streaming_config,
        requests=requests,
    )

    for response in responses:
        # Once the transcription has settled, the first result will contain the
        # is_final result. The other results will be for subsequent portions of
        # the audio.
        for result in response.results:
            print("Finished: {}".format(result.is_final))
            print("Stability: {}".format(result.stability))
            alternatives = result.alternatives
            # The alternatives are ordered from most likely to least.
            for alternative in alternatives:
                print("Confidence: {}".format(alternative.confidence))
                print(u"Transcript: {}".format(alternative.transcript))
import re
import sys

# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100ms
def listen_print_loop(responses):
    num_chars_printed = 0
    for response in responses:
        if not response.results:
            continue

        # The `results` list is consecutive. For streaming, we only care about
        # the first result being considered, since once it's `is_final`, it
        # moves on to considering the next utterance.
        result = response.results[0]
        if not result.alternatives:
            continue

        # Display the transcription of the top alternative.
        transcript = result.alternatives[0].transcript

        # Display interim results, but with a carriage return at the end of the
        # line, so subsequent lines will overwrite them.
        #
        # If the previous result was longer than this one, we need to print
        # some extra spaces to overwrite the previous result.
        overwrite_chars = " " * (num_chars_printed - len(transcript))

        if not result.is_final:
            sys.stdout.write(transcript + overwrite_chars + "\r")
            sys.stdout.flush()
            num_chars_printed = len(transcript)
        else:
            print(transcript + overwrite_chars)

            # Exit recognition if any of the transcribed phrases could be
            # one of our keywords.
            if re.search(r"\b(exit|quit)\b", transcript, re.I):
                print("Exiting..")
                break

            num_chars_printed = 0
host_name = socket.gethostname()
host_ip = '0.0.0.0'  # socket.gethostbyname(host_name)
print(host_ip)
port = 4984

# For details visit: www.pyshine.com
q = queue.Queue(maxsize=2000)
class MicrophoneStream(object):
    """Receives audio over a UDP socket as a generator yielding the chunks."""

    def __init__(self, rate, chunk):
        self._rate = rate
        self._chunk = chunk

        # Create a thread-safe buffer of audio data
        self._buff = queue.Queue()
        self.closed = True
        self.host_name = socket.gethostname()
        self.host_ip = '0.0.0.0'  # socket.gethostbyname(self.host_name)
        print(self.host_ip)
        self.port = 4984
    def __enter__(self):
        self.BUFF_SIZE = 65536
        self.client_socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        self.client_socket.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, self.BUFF_SIZE)

        # Note: the PyAudio stream is opened for playback (output=True); the
        # audio itself now arrives over the socket rather than from a mic.
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self._rate,
            output=True,
            frames_per_buffer=self._chunk,
        )
        self.closed = False

        # Announce ourselves so the server starts sending audio, then fill
        # the buffer from the socket on a background thread.
        message = b'Hello'
        self.client_socket.sendto(message, (self.host_ip, self.port))
        self.socket_address = (self.host_ip, self.port)
        t1 = threading.Thread(target=self.getAudioData, args=())
        t1.start()
        time.sleep(5)
        return self
    def __exit__(self, type, value, traceback):
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        # Signal the generator to terminate so that the client's
        # streaming_recognize method will not block the process termination.
        self._buff.put(None)
        self._audio_interface.terminate()

    def getAudioData(self):
        # Pull datagrams off the socket and buffer them for the generator.
        while True:
            frame, _ = self.client_socket.recvfrom(self.BUFF_SIZE)
            self._buff.put(frame)
    def generator(self):
        while not self.closed:
            # Use a blocking get() to ensure there's at least one chunk of
            # data, and stop iteration if the chunk is None, indicating the
            # end of the audio stream.
            chunk = self._buff.get()
            if chunk is None:
                return
            data = [chunk]

            # Now consume whatever other data's still buffered.
            while True:
                try:
                    chunk = self._buff.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break

            yield b"".join(data)
def audio_stream_UDP():
    # Stand-alone playback path (left disabled below; MicrophoneStream is
    # used instead). Creates its own socket, asks the server to start
    # streaming, and plays the received frames.
    BUFF_SIZE = 65536
    client_socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    client_socket.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, BUFF_SIZE)
    message = b'Hello'
    client_socket.sendto(message, (host_ip, port))

    def getAudioData():
        while True:
            frame, _ = client_socket.recvfrom(BUFF_SIZE)
            q.put(frame)
            # print('Queue size...', q.qsize())

    t1 = threading.Thread(target=getAudioData, args=())
    t1.start()
    time.sleep(5)
    print('Now Playing...')

    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=RATE,
                    output=True, frames_per_buffer=CHUNK)
    while True:
        frame = q.get()  # block until a frame is available
        if frame is None:
            break
        stream.write(frame)

    client_socket.close()
    print('Audio closed')
    os._exit(1)


# t1 = threading.Thread(target=audio_stream_UDP, args=())
# t1.start()
def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = "en-US"  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
    )
    streaming_config = speech.StreamingRecognitionConfig(
        config=config, interim_results=True
    )

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in audio_generator
        )
        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)


if __name__ == "__main__":
    main()
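As a sanity check independent of the socket path, the otherwise-unused transcribe_streaming helper above can be pointed at a local recording; the file name here is just a placeholder:

# Hypothetical usage: 'test.raw' stands in for any 16 kHz, 16-bit mono
# LINEAR16 capture saved to disk.
transcribe_streaming("test.raw")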