我正在使用此处提供的示例代码，并实现了以下内容：
# [START import_libraries]
import argparse
import base64
import json
import time
from oauth2client.service_account import ServiceAccountCredentials
import googleapiclient.discovery
import googleapiclient as gac
# [END import_libraries]
# [START authenticating]
# Application default credentials provided by env variable
# GOOGLE_APPLICATION_CREDENTIALS
def get_speech_service(credentials):
    """Build a client object for the 'speech' API, version 'v1beta1'.

    Args:
        credentials: credentials object passed straight through to the
            discovery builder.

    Returns:
        Whatever ``googleapiclient.discovery.build`` returns for the
        requested API name and version.
    """
    api_name, api_version = 'speech', 'v1beta1'
    return googleapiclient.discovery.build(
        api_name, api_version, credentials=credentials)
def main(speech_file):
    """Transcribe the given audio file asynchronously.

    Submits the file to the Speech API ``asyncrecognize`` method, polls the
    resulting long-running operation once per second until it reports
    ``done``, prints the raw response plus every transcript alternative,
    and returns the final operation resource.

    Args:
        speech_file: the name of the audio file. The request declares the
            audio as LINEAR16 sampled at 16 kHz, so the file content has to
            be in that format for the config to match.

    Returns:
        The completed operation as a dict, including its ``response`` entry.

    Raises:
        RuntimeError: if the operation completes with an ``error`` entry
            instead of a ``response``.
    """
    # [START construct_request]
    with open(speech_file, 'rb') as speech:
        # Base64 encode the binary audio file for inclusion in the request.
        speech_content = base64.b64encode(speech.read())

    scopes = ['https://www.googleapis.com/auth/cloud-platform']
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        '/Users/user/Documents/google_cloud/myjson.json', scopes)
    service = get_speech_service(credentials)
    service_request = service.speech().asyncrecognize(
        body={
            'config': {
                # There are a bunch of config options you can specify. See
                # https://cloud.google.com/speech/reference/rest/v1beta1/RecognitionConfig
                # for the full list.
                'encoding': 'LINEAR16',  # raw 16-bit signed LE samples
                'sampleRate': 16000,  # 16 khz
                # See http://g.co/cloud/speech/docs/languages for a list of
                # supported languages.
                'languageCode': 'en-US',  # a BCP-47 language tag
            },
            'audio': {
                # The JSON body needs text, so turn the base64 bytes into str.
                'content': speech_content.decode('UTF-8'),
            },
        })
    # [END construct_request]

    # [START send_request]
    response = service_request.execute()
    print(json.dumps(response))
    # [END send_request]

    name = response['name']
    # Construct a GetOperation request.
    service_request = service.operations().get(name=name)

    while True:
        # Give the server a few seconds to process.
        print('Waiting for server processing...')
        time.sleep(1)
        # Get the long running operation with response.
        response = service_request.execute()
        if response.get('done'):
            break

    # A finished operation carries either 'error' or 'response'. Surface the
    # failure explicitly instead of an opaque KeyError on 'response' below.
    if 'error' in response:
        raise RuntimeError('Speech operation {} failed: {}'.format(
            name, json.dumps(response['error'])))

    # First print the raw json response
    print(json.dumps(response['response'], indent=2))

    # Now print the actual transcriptions
    for result in response['response'].get('results', []):
        print('Result:')
        for alternative in result['alternatives']:
            print(u' Alternative: {}'.format(alternative['transcript']))

    return response
# Run the transcription on a local sample file and keep the returned operation.
# NOTE(review): main() declares the audio as LINEAR16 at 16 kHz, while this
# path has a .flac extension; if the file really is FLAC-encoded it does not
# match that config -- confirm the file's encoding.
r = main("/Users/user/Downloads/brooklyn.flac")
然而我的打印输出如下：
{"name": "3202776140236290963"}
Waiting for server processing...
Waiting for server processing...
{
"@type": "type.googleapis.com/google.cloud.speech.v1beta1.AsyncRecognizeResponse"
}
我返回的对象是：
{u'done': True,
u'metadata': {u'@type': u'type.googleapis.com/google.cloud.speech.v1beta1.AsyncRecognizeMetadata',
u'lastUpdateTime': u'2017-03-25T15:54:46.136925Z',
u'progressPercent': 100,
u'startTime': u'2017-03-25T15:54:44.514614Z'},
u'name': u'2024312474309214820',
u'response': {u'@type': u'type.googleapis.com/google.cloud.speech.v1beta1.AsyncRecognizeResponse'}}
不确定为什么我没有从示例文件中获得正确的转录。
感谢任何输入!
答案 0 :(得分:1)
您的配置选项包含以下内容:
'config': {
# There are a bunch of config options you can specify. See
# https://cloud.google.com/speech/reference/rest/v1beta1/RecognitionConfig for the full list.
'encoding': 'LINEAR16', # raw 16-bit signed LE samples
'sampleRate': 16000, # 16 khz
# See http://g.co/cloud/speech/docs/languages for a list of
# supported languages.
'languageCode': 'en-US', # a BCP-47 language tag
},
但是,您使用的是FLAC
文件:
r = main("/Users/user/Downloads/brooklyn.flac")
引用https://cloud.google.com/speech/reference/rest/v1beta1/RecognitionConfig:
**LINEAR16**
未压缩的16位带符号小端样本(线性PCM)。这是
speech.asyncrecognize
可以使用的唯一编码。
**FLAC**
这是
speech.syncrecognize
和StreamingRecognize
的推荐编码,因为它使用无损压缩;因此,有损编解码器不会影响识别准确度。
换句话说，您无法将 FLAC 与 speech.asyncrecognize 一起使用。您可能需要先将样本转码为线性 PCM（LINEAR16），或者改用 speech.syncrecognize 并将编码选项设置为 FLAC。