我可以使用Pocketsphinx从音频文件中成功提取音素数据,但是如何输出每个音素的持续时间呢?
def phonemes(filename):
    """Decode an audio file in allphone mode and print the recognized phonemes.

    The path is built by joining the module-level ``dir_path`` with
    ``filename``. The file is read as raw bytes in 1024-byte chunks and
    passed to ``Decoder.process_raw`` within a single utterance.

    Args:
        filename: Name of the audio file, relative to ``dir_path``.

    Returns:
        None. The list of ``seg.word`` values is printed to stdout.
    """
    audio_path = os.path.join(dir_path, filename)

    # Create a decoder with the en-us acoustic model and the phone-level
    # language model (allphone mode).
    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(MODELDIR, 'en-us'))
    config.set_string('-allphone', os.path.join(MODELDIR, 'en-us/en-us-phone.lm.dmp'))
    config.set_float('-lw', 2.0)
    config.set_float('-beam', 1e-10)
    config.set_float('-pbeam', 1e-10)

    # Decode streaming data.
    decoder = Decoder(config)
    decoder.start_utt()
    # The context manager guarantees the file handle is released even if
    # decoding raises part-way through the stream.
    with open(audio_path, 'rb') as stream:
        # iter(callable, sentinel) stops once read() returns empty bytes (EOF).
        for buf in iter(lambda: stream.read(1024), b''):
            decoder.process_raw(buf, False, False)
    decoder.end_utt()

    pho = [seg.word for seg in decoder.seg()]
    print('Phoneme:', pho)
打印结果如下: ('Phoneme:', ['SIL', 'HH', 'M', 'W', 'M', 'HH', 'HH', 'HH', 'HH', 'HH', 'HH', 'HH', 'HH', 'HH', 'L', 'HH', 'L', 'M', 'M', 'M', 'HH', 'HH', 'HH', 'HH', 'L', 'HH', 'HH', 'HH', 'HH', 'HH', 'HH', 'M', 'M', 'W', 'M', 'HH', 'HH', 'ER', 'ER', 'HH', 'HH', 'M', 'M', 'M', 'HH', 'M', 'M', 'G', 'M', 'M', 'M', 'SIL', 'M', 'HH', 'M', 'ER', 'SIL', 'W', 'M', 'M', 'NG', 'M', 'M', 'HH', 'L', 'M', 'M', 'SIL', 'W', 'HH', 'L', 'M', 'SIL', 'HH', 'AE', 'V', 'R', 'HH', '+SPN+', 'HH', 'HH', 'SIL', 'V', 'UW', 'L', 'V', 'N', 'HH', 'D', 'V', 'D', '+SPN+', 'D', 'B', 'AA', 'SIL', '+SPN+', 'HH', 'HH', 'AH', 'N', 'DH', 'UW', 'L', 'HH', 'UW', 'V', 'D', 'N', 'M', 'D', 'M', 'UW', 'P'])
答案 0 :(得分:0)
简单
index+1
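或者,除了 print('Phoneme:', decoder.seg()) 之外,您还可以输出每个片段的 seg.start 和 seg.end(在 pocketsphinx 的 Python 绑定中,对应的属性名应为 seg.start_frame 和 seg.end_frame,请以所用版本的文档为准);两者之差即为该音素的持续时间(以帧为单位)。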