如何处理不同音频文件的 MFCC 特征尺寸不一致的问题

时间:2019-01-16 16:11:31

标签: python-3.x librosa

对于不同的音频文件,librosa.feature.mfcc 返回的特征矩阵尺寸各不相同。那么应该如何处理这种情况,以便训练或测试模型?

#test.py
import os
import pickle
import numpy as np 
from scipy.io.wavfile import read
import librosa as mfcc
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore")
def get_MFCC(sr,audio):
    """Compute standardized MFCC features for one audio signal.

    Parameters
    ----------
    sr
        Sampling rate of ``audio``; forwarded to ``librosa.feature.mfcc``.
    audio
        Audio samples; forwarded to ``librosa.feature.mfcc`` as ``y``.

    Returns
    -------
    numpy.ndarray
        The 20-coefficient MFCC matrix with every row that contains a NaN
        removed, then standardized with ``sklearn.preprocessing.scale``
        (default settings).

    Raises
    ------
    ValueError
        If every MFCC row contains a NaN, so nothing is left to scale.

    Notes
    -----
    NOTE(review): librosa documents the MFCC output as
    ``(n_mfcc, n_frames)``, so the number of columns returned here grows
    with clip length. Estimators that require a fixed number of feature
    columns would need the transpose ``(n_frames, n_mfcc)`` instead; the
    orientation is left unchanged here because models trained on the
    current layout would have to be retrained -- confirm before changing.
    """
    # Keyword arguments: newer librosa releases reject positional audio/sr.
    features = mfcc.feature.mfcc(y=audio, sr=sr, n_mfcc=20, dct_type=2)

    # Drop every row holding at least one NaN in a single vectorized step;
    # the result stays 2-D no matter how many rows survive.
    valid_rows = ~np.isnan(features).any(axis=1)
    features = features[valid_rows]
    if features.shape[0] == 0:
        raise ValueError("all MFCC rows contain NaN; nothing left to scale")

    return preprocessing.scale(features)
#path to test data
# NOTE(review): this path contains "PrashuGupta" while modelpath below
# contains "Prashu Gupta" -- confirm which spelling is the real directory.
source   = "C:\\Users\\PrashuGupta\\Downloads\\datasets\\pygender\\test_data\\AudioSet\\female_clips\\"
#path to save trained model
modelpath     = "C:\\Users\\Prashu Gupta\\Downloads\\datasets\\pygender\\"


# Sorted so the model order is deterministic; os.listdir gives no ordering
# guarantee, and models/genders/log_likelihood all share this order.
gmm_files = sorted(os.path.join(modelpath, fname)
                   for fname in os.listdir(modelpath)
                   if fname.endswith('.gmm'))

# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load model files that come from a trusted source.
models = []
for fname in gmm_files:
    with open(fname, 'rb') as model_file:   # closed even if unpickling fails
        models.append(pickle.load(model_file))

# Label of each model = its file name without directory and extension.
genders = [os.path.splitext(os.path.basename(fname))[0]
           for fname in gmm_files]

if not models:
    raise SystemExit("no .gmm model files found in " + modelpath)

files = [os.path.join(source, f) for f in os.listdir(source)
         if f.endswith(".wav")]

for f in files:
    print(os.path.basename(f))
    audio, sr = mfcc.load(f, sr=16000, mono=True)
    features = get_MFCC(sr, audio)

    # Score the clip against each model in turn.
    log_likelihood = np.zeros(len(models))
    for i, gmm in enumerate(models):
        log_likelihood[i] = np.array(gmm.score(features)).sum()

    winner = np.argmax(log_likelihood)

    # Pair every score with the name of the model that produced it, rather
    # than assuming index 0 is female and index 1 is male.
    score_text = ", ".join("{} {}".format(name, score)
                           for name, score in zip(genders, log_likelihood))
    print("\tdetected as - ", genders[winner], "\n\tscores:", score_text, "\n")

错误

  

预期输入数据 X 具有 1800 个特征,但实际得到了 313 个特征(出错的代码行:scores = np.array(gmm.score(features)))

1 个答案:

答案 0 :(得分:1)

要么截断/填充各个文件,使它们具有相同的长度(例如 5 秒);要么把每个文件的特征汇总成不依赖于剪辑长度的固定长度向量(例如取平均值/最小值/最大值);要么让分类器在固定长度的特征窗口流(例如每个窗口 1 秒)上运行。