借助我在GitHub上找到的示例,我的代码目前能够读取音频文件并使用Azure Speech to Text进行转录。但是,我需要在转录结果中包含每个单词的时间戳。根据文档,此功能已在1.5.0版中添加,可以通过request_word_level_timestamps()方法启用。但是即使我已经调用了该方法,得到的响应仍与之前相同。我无法从文档中弄清楚如何使用它。有谁知道它是如何工作的?
我正在使用Python SDK 1.5.1版。
import json
import os
import time

import azure.cognitiveservices.speech as speechsdk
from allennlp.predictors.predictor import Predictor

inputPath = "(inputlocation)"
outputPath = "(outputlocation)"

# Creates an instance of a speech config with specified subscription key and service region.
# Replace with your own subscription key and service region (e.g., "westus").
speech_key, service_region = "apikey", "region"
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
speech_config.request_word_level_timestamps()
speech_config.output_format = speechsdk.OutputFormat.Detailed

filename = input("Enter filename: ")
print(speech_config)

audio_path = inputPath + filename
# The transcript is named after the audio file, minus its extension (whatever its length).
transcript_path = outputPath + os.path.splitext(filename)[0] + "_MS_recognized.txt"

if not os.path.isfile(audio_path):
    # Check explicitly, so that this message is only printed when it is actually true.
    print("File does not exist")
else:
    try:
        audio_config = speechsdk.audio.AudioConfig(filename=audio_path)
        # Creates a recognizer with the given settings
        speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

        def start():
            """Run continuous recognition on the audio file and write the transcript.

            Each recognized utterance is appended as one line to ``transcript_path``.
            The call blocks until the recognizer reports that the session stopped
            or was canceled.
            """
            done = False

            # Start from an empty transcript file.
            with open(transcript_path, "w", encoding="utf-8"):
                pass

            def stop_callback(evt):
                """Signal the polling loop below that recognition has finished."""
                nonlocal done
                print("Closing on {}".format(evt))
                done = True

            def add_to_res(evt):
                """Append the text of a final recognition result to the transcript."""
                with open(transcript_path, "a", encoding="utf-8") as transcript:
                    transcript.write(evt.result.text + "\n")

            # Connect callbacks to the events fired by the speech recognizer
            speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
            speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
            speech_recognizer.recognized.connect(add_to_res)
            speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
            speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
            speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
            # Leave the polling loop on either session stopped or canceled events
            speech_recognizer.session_stopped.connect(stop_callback)
            speech_recognizer.canceled.connect(stop_callback)

            # Start continuous speech recognition and wait for it to finish
            speech_recognizer.start_continuous_recognition()
            while not done:
                time.sleep(.5)
            # Stop from this thread rather than from inside an SDK callback.
            speech_recognizer.stop_continuous_recognition()

        start()
    except Exception as e:
        # Top-level boundary of the script: report the real cause instead of guessing.
        print("Recognition failed: {}".format(e))
结果仅包含session_id和一个结果对象,其中包括result_id,文本和原因。
答案 0 :(得分:4)
针对评论中关于它如何用于连续识别的问题:如果对SpeechConfig调用了request_word_level_timestamps(),就可以在连续识别模式下使用该功能。您可以通过evt.result.json查看JSON结果。
例如,
# Create the speech configuration and request word-level timestamps in the results.
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
speech_config.request_word_level_timestamps()
然后是您的语音识别器:
# Build the recognizer from the speech configuration and an audio configuration.
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
将回调连接到speech_recognizer触发的事件后,您可以通过以下方式查看单词级别的时间戳:
# Print the JSON payload of every final recognition result.
speech_recognizer.recognized.connect(lambda evt: print('JSON: {}'.format(evt.result.json)))
我的问题是Translation(翻译)对象不包含单词级别的时间戳,因为它不接受speech_config。
答案 1 :(得分:1)
在 Azure SDK 的语音配置中设置
speech_config.request_word_level_timestamps()
之后,您就可以获得转录文本以及每个单词的时间戳。
# Request the detailed result format; the named member replaces the magic number 1.
speech_config.output_format = speechsdk.OutputFormat.Detailed
此语句将允许您从azure sdk获取详细的json对象。
下面是示例代码。请务必替换为您自己的密钥。在语音转文字可能失败的地方,可能需要添加一些错误处理。
def process(self):
    """Transcribe an audio file with continuous recognition and collect word timings.

    Word-level timestamps and the detailed output format are requested from
    the service. For every final result the highest-confidence ``NBest``
    entry is selected, and its display text, ITN text, confidence and word
    list are accumulated. The combined results are printed once recognition
    has finished.

    Returns:
        None. The collected results are printed.
    """
    import json

    logger.debug("Speech to text request received")
    # NOTE(review): not used below; presumably the key/region placeholders are
    # meant to be read from these settings -- confirm.
    speechapi_settings = SpeechAPIConf()
    audio_filepath = "<PATH_TO_AUDIO_FILE>"  # Replace with the path to your audio file
    locale = "en-US"  # Change as per requirement

    logger.debug(audio_filepath)
    audio_config = speechsdk.audio.AudioConfig(filename=audio_filepath)
    # Replace both placeholders with your own subscription key and service region.
    speech_config = speechsdk.SpeechConfig(subscription="<SUBSCRIPTION_KEY>", region="<SERVICE_REGION>")
    speech_config.request_word_level_timestamps()
    speech_config.speech_recognition_language = locale
    speech_config.output_format = speechsdk.OutputFormat.Detailed

    # Creates a recognizer with the given settings
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    # Variable to monitor status
    done = False

    # Accumulators filled by the recognition callback
    transcript_display_list = []
    transcript_ITN_list = []
    confidence_list = []
    words = []

    def parse_azure_result(evt):
        """Collect the best hypothesis of one final recognition result."""
        logger.debug(evt)
        response = json.loads(evt.result.json)
        candidates = response.get('NBest')
        if not candidates:
            # No hypotheses in this result, so there is nothing to collect.
            return
        # max() returns the first entry with the highest confidence.
        best = max(candidates, key=lambda item: item.get('Confidence', 0.0))
        # The detailed payload does not always carry a top-level 'DisplayText';
        # fall back to the display form of the chosen hypothesis.
        transcript_display_list.append(response.get('DisplayText', best.get('Display', '')))
        confidence_list.append(best.get('Confidence'))
        transcript_ITN_list.append(best.get('ITN', ''))
        words.extend(best.get('Words', []))

    def stop_cb(evt):
        """Signal the polling loop below that recognition has finished."""
        nonlocal done
        print('CLOSING on {}'.format(evt))
        done = True

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(lambda evt: logger.debug('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(parse_azure_result)
    speech_recognizer.session_started.connect(lambda evt: logger.debug('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: logger.debug('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: logger.debug('CANCELED {}'.format(evt)))
    # Leave the polling loop on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    logger.debug("Initiating speech to text")
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)
    # Stop from this thread rather than from inside an SDK callback.
    speech_recognizer.stop_continuous_recognition()

    # Do something with the combined responses. Printing here, not in stop_cb,
    # keeps the summary to a single copy even though stop_cb is wired to two events.
    print(transcript_display_list)
    print(confidence_list)
    print(words)
答案 2 :(得分:0)
我参考了您的代码,并按照官方教程Quickstart: Recognize speech with the Speech SDK for Python
编写了以下示例代码,该代码可以为每个单词打印Offset
和Duration
值。我使用了一个名为whatstheweatherlike.wav
的音频文件,该文件来自GitHub仓库Azure-Samples/cognitive-services-speech-sdk
中的示例内容(原文此处的链接已丢失)。
这是我的示例代码及其结果。
import azure.cognitiveservices.speech as speechsdk
# Credentials for the Speech resource; replace both placeholders with your own values.
speech_key = "<your api key>"
service_region = "<your region>"

# The audio to transcribe comes from a local WAV file.
audio_config = speechsdk.audio.AudioConfig(filename='whatstheweatherlike.wav')

# Ask the service to include word-level timestamps in the recognition result.
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
speech_config.request_word_level_timestamps()

# Run a single recognition pass over the file.
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
result = speech_recognizer.recognize_once()
# print(result.json)
# If without `request_word_level_timestamps`, the result:
# {"DisplayText":"What's the weather like?","Duration":13400000,"Offset":400000,"RecognitionStatus":"Success"}
# Enable `request_word_level_timestamps`, the result includes word level timestamps.
# {"Duration":13400000,"NBest":[{"Confidence":0.9761951565742493,"Display":"What's the weather like?","ITN":"What's the weather like","Lexical":"what's the weather like","MaskedITN":"What's the weather like","Words":[{"Duration":3800000,"Offset":600000,"Word":"what's"},{"Duration":1200000,"Offset":4500000,"Word":"the"},{"Duration":2900000,"Offset":5800000,"Word":"weather"},{"Duration":4700000,"Offset":8800000,"Word":"like"}]},{"Confidence":0.9245584011077881,"Display":"what is the weather like","ITN":"what is the weather like","Lexical":"what is the weather like","MaskedITN":"what is the weather like","Words":[{"Duration":2900000,"Offset":600000,"Word":"what"},{"Duration":700000,"Offset":3600000,"Word":"is"},{"Duration":1300000,"Offset":4400000,"Word":"the"},{"Duration":2900000,"Offset":5800000,"Word":"weather"},{"Duration":4700000,"Offset":8800000,"Word":"like"}]}],"Offset":400000,"RecognitionStatus":"Success"}
import json

# Parse the JSON payload of the recognition result.
stt = json.loads(result.json)
nbest = stt.get('NBest', [])
if not nbest:
    # Without any hypotheses there are no word timings to print.
    print("No speech recognized: {}".format(stt.get('RecognitionStatus')))
else:
    # Pick the hypothesis with the highest confidence (the first one on ties).
    best = max(nbest, key=lambda item: item['Confidence'])
    words = best['Words']
    print(words)
    # Offset and Duration are reported by the service in 100-nanosecond ticks.
    print("Word\tOffset\tDuration")
    for word in words:
        print(f"{word['Word']}\t{word['Offset']}\t{word['Duration']}")
上面的脚本的输出是:
[{'Duration': 3800000, 'Offset': 600000, 'Word': "what's"}, {'Duration': 1200000, 'Offset': 4500000, 'Word': 'the'}, {'Duration': 2900000, 'Offset': 5800000, 'Word': 'weather'}, {'Duration': 4700000, 'Offset': 8800000, 'Word': 'like'}]
Word Offset Duration
what's 600000 3800000
the 4500000 1200000
weather 5800000 2900000
like 8800000 4700000
希望有帮助。