我注意到 Google Speech API(语音识别)最近的行为有所变化。
例如,在此文件中,说话人用泰语说了“123457”;以前它能被正确转录为“123457”,但由于某些原因,现在 API 只返回“12345”。
flac文件:http://s000.tinyupload.com/index.php?file_id=76277841017264777654
Python代码:
import argparse
import base64
import json
import sys
from googleapiclient import discovery
import httplib2
from oauth2client.client import GoogleCredentials
# URL template handed to discovery.build() as discoveryServiceUrl; the {api}
# and {apiVersion} placeholders are intentionally left unformatted here.
DISCOVERY_URL = "https://{api}.googleapis.com/$discovery/rest?version={apiVersion}"
def get_speech_service():
    """Build an authorized client for the ``speech`` API, version ``v1beta1``.

    Application-default credentials are scoped to ``cloud-platform`` and
    attached to a fresh ``httplib2.Http`` object, which is then passed to
    ``discovery.build`` together with ``DISCOVERY_URL``.

    Returns:
        The service object produced by ``discovery.build``.
    """
    scopes = ['https://www.googleapis.com/auth/cloud-platform']
    scoped_credentials = (
        GoogleCredentials.get_application_default().create_scoped(scopes)
    )

    authorized_http = httplib2.Http()
    scoped_credentials.authorize(authorized_http)

    return discovery.build(
        'speech',
        'v1beta1',
        http=authorized_http,
        discoveryServiceUrl=DISCOVERY_URL,
    )
def english_numeric(number_as_string):
    """Convert spelled-out English digits in a string to numerals.

    Every occurrence of "zero" through "nine" is replaced by the matching
    digit, then all spaces are removed, e.g. ``"one five four three"``
    becomes ``"1543"``.  Matching is a plain case-sensitive substring
    replacement.

    Args:
        number_as_string: text containing lowercase English digit words.

    Returns:
        The text with digit words replaced by digits and spaces removed.
    """
    # The list index is used as the digit value, so "zero" has to be first;
    # with "zero" at the end every word would map to a digit one too small.
    digit_words = [
        "zero", "one", "two", "three", "four",
        "five", "six", "seven", "eight", "nine",
    ]
    converted = number_as_string
    for digit, word in enumerate(digit_words):
        converted = converted.replace(word, str(digit))
    return converted.replace(" ", "")
def transcribe_audio(speech_file, languageCode="th-TH", encoding="FLAC"):
    """Transcribe the given audio file with the Speech API ``syncrecognize`` call.

    The file is read in binary mode, base64-encoded and sent inline in the
    request body.

    Args:
        speech_file: the name of the audio file.
        languageCode: BCP-47 language tag placed in the request config.
        encoding: audio encoding name placed in the request config.

    Returns:
        The API response serialized as a JSON string.  If the request itself
        fails, the error is written to stderr and the serialized empty string
        (``'""'``) is returned.  If the service client cannot be created,
        setup hints are printed and ``None`` is returned.
    """
    with open(speech_file, 'rb') as speech:
        speech_content = base64.b64encode(speech.read())

    try:
        service = get_speech_service()
    except Exception as exc:
        # Best-effort setup check: report and bail out.  ``Exception`` (not a
        # bare ``except``) so Ctrl-C and SystemExit still propagate.
        print("probably didn't do EXPORT")
        print("i.e. export GCLOUD_PROJECT=project-id")
        print("i.e. export GOOGLE_APPLICATION_CREDENTIALS=/path/to/crednetials.json")
        # repr() shows both the exception type and its message.
        print("Error message: {}".format(repr(exc)))
        return None

    service_request = service.speech().syncrecognize(
        body={
            'config': {
                'encoding': encoding,  # audio encoding name, "FLAC" by default
                # 'sampleRate': 16000,  # 16 khz
                'languageCode': languageCode,  # a BCP-47 language tag
                # NOTE(review): sent as the string "true", not a boolean;
                # confirm the API version in use accepts this field/value.
                'enableWordTimeOffsets': "true",
            },
            'audio': {
                'content': speech_content.decode('UTF-8'),
            },
        })

    # An empty string stands in for the response when the request fails, so
    # the return value is still a JSON string in that case.
    response = ""
    try:
        response = service_request.execute()
    except Exception as exc:
        # Keep the best-effort behaviour, but do not hide the failure: the
        # reason goes to stderr so stdout still carries only the JSON result.
        sys.stderr.write("Speech API request failed: {!r}\n".format(exc))

    return json.dumps(response, ensure_ascii=False)
def audio_json_to_text(audio_transcription, include_confidence=True):
    """Extract the first transcript from a speech recognition response.

    Only ``results[0].alternatives[0]`` is read.

    Args:
        audio_transcription: the response as a dict, or as a JSON string
            (which is parsed first).
        include_confidence: when true, append the confidence formatted as
            ``(0.123)`` after the transcript.

    Returns:
        ``"<transcript> (<confidence>)"`` or ``"<transcript>"``.  If the
        confidence is requested but absent or not numeric, only the
        transcript is returned.  If no transcript can be found,
        ``"COULD_NOT_BE_TRANSCRIBED"`` is returned.
    """
    not_transcribed = "COULD_NOT_BE_TRANSCRIBED"

    # Accept the serialized form as well as the already-parsed dict.
    if isinstance(audio_transcription, (str, bytes, bytearray)):
        try:
            audio_transcription = json.loads(audio_transcription)
        except ValueError:
            return not_transcribed

    # EAFP: a missing key, an empty list or a non-mapping value all mean
    # there is nothing to report.
    try:
        best = audio_transcription["results"][0]["alternatives"][0]
        transcript = best["transcript"]
    except (KeyError, IndexError, TypeError):
        return not_transcribed

    if include_confidence:
        try:
            confidence = float(best["confidence"])
        except (KeyError, TypeError, ValueError):
            # No usable confidence: fall back to the bare transcript.
            return "{}".format(transcript)
        return "{} {}".format(transcript, '(%.3f)' % confidence)

    return "{}".format(transcript)
if __name__ == '__main__':
    # Command line: AUDIO_FILE [LANGUAGE_CODE]
    # Same positional order as before; the language code is now optional and
    # a missing audio file yields a usage message instead of an IndexError.
    parser = argparse.ArgumentParser(
        description="Transcribe an audio file and print the JSON response.")
    parser.add_argument("flac", help="path to the audio file to transcribe")
    parser.add_argument(
        "languageCode", nargs="?", default="th-TH",
        help="BCP-47 language tag (default: th-TH)")
    # parse_known_args: surplus arguments are ignored rather than rejected.
    args, _unused = parser.parse_known_args()
    print(transcribe_audio(args.flac, languageCode=args.languageCode))