
Date: 2017-02-02 14:33:48

Tags: python wav librosa

So recently I wanted to do sound classification with a neural network, and I found this tutorial:

aqibsaeed.github.io/2016-09-03-urban-sound-classification-part-1/

Unfortunately, the code does not seem to work properly in some respects.

To start with:

import librosa

def load_sound_files(file_paths):
    raw_sounds = []
    for fp in file_paths:
        # librosa.load downmixes to mono and resamples to 22050 Hz by default
        X, sr = librosa.load(fp)
        raw_sounds.append(X)
    return raw_sounds
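Before going further, a quick sanity check along these lines counts how many files librosa refuses to open (a sketch: the helper name, the fold1 path and the *.wav pattern are assumptions based on the UrbanSound8K layout the tutorial uses, not my exact script):

import glob
import os

import librosa

def count_unreadable(parent_dir, sub_dir='fold1', file_ext='*.wav'):
    # try every wav in the fold and count the ones librosa refuses to load
    total, failed = 0, 0
    for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
        total += 1
        try:
            librosa.load(fn)
        except Exception as e:
            print(fn, e)
            failed += 1
    print("failed to read {} of {} files".format(failed, total))
    return failed, total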

That loader failed on one of the 10 example files that are plotted later in the tutorial. I wrote a short script (along the lines of the sketch above) to check how many files could not be read, and it turned out to be roughly 40% of the first fold! So I looked around online, tried other reading libraries, and ended up with this:

import numpy as np
import soundfile

def load_sound_files(file_paths):
    raw_sounds = []
    sffreq = []
    for fp in file_paths:
        # soundfile.read keeps the file's native sample rate and channel layout
        X, sr = soundfile.read(fp)
        #X = np.ascontiguousarray(np.transpose(X), dtype=np.float64)
        sffreq.append(sr)
        try:
            # if the file is stereo, average the two channels down to mono
            Z = np.compress([False, True], X, axis=1)
            Z += np.compress([True, False], X, axis=1)
            X = (Z / 2.0).flatten()
        except Exception:
            # mono files are 1-D, so the axis=1 compress fails; keep them as-is
            pass
        raw_sounds.append(X)
    return raw_sounds, sffreq

The soundfile-based loader seems to open all of the files, but the values it returns differ slightly from librosa's, and in some cases the plots look very different. Most of the time that is fine, but for the gunshot:

  

(figure: gunshot waveform)

(figure: gunshot specgram)

For the gunshot the two differ a lot.

It looks like librosa opens every wav as mono at 22050 Hz, whereas soundfile opens most of the files as stereo and some as mono, at whatever sample rate the file was recorded with.
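To make the comparison concrete, this is roughly what I am comparing (a sketch; the file name is made up, and the claim that librosa's defaults are sr=22050 and mono=True is my reading of its documentation):

import librosa
import soundfile

fp = 'fold1/some_gunshot.wav'  # hypothetical example file

# librosa's defaults: downmix to mono and resample to 22050 Hz
X_lr, sr_lr = librosa.load(fp)  # same as librosa.load(fp, sr=22050, mono=True)

# soundfile: native sample rate, shape (frames, channels) for stereo files
X_sf, sr_sf = soundfile.read(fp)
if X_sf.ndim == 2:
    X_sf = X_sf.mean(axis=1)                 # average the channels down to mono
X_sf = librosa.resample(X_sf, sr_sf, sr_lr)  # bring it to librosa's 22050 Hz

# conversely, librosa can be asked for the untouched data instead:
# X_native, sr_native = librosa.load(fp, sr=None, mono=False)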

So here is the first set of questions:

Why is that happening?

Why does the data that gets read differ in terms of mono/stereo and sample rate?

How can I tell which of the two readings is more accurate?

Later I tried the feature-extraction code and ended up with the version below, which produces a rather long crash report from the tonnetz function:

import numpy as np
import librosa
import soundfile as sf

def extract_feature(file_name):
    X, sample_rate = sf.read(file_name)
    #X = np.ascontiguousarray(np.transpose(X), dtype=np.float64)
    try:
        print("Freq of sf: ", X.shape)
        # if the file is stereo, average the two channels down to mono
        Z = np.compress([False, True], X, axis=1)
        Z += np.compress([True, False], X, axis=1)
        X = (Z / 2.0).flatten()
    except Exception:
        # mono files are 1-D, so the axis=1 compress fails; keep them as-is
        print("Wow i had one dim")
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz
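For context, the tutorial's parse_audio_files (partially visible in the traceback below) calls extract_feature once per file and stacks the results. A rough reconstruction, with the array bookkeeping simplified and the name suffixed so it is clearly not the tutorial's exact code:

import glob
import os

import numpy as np

def parse_audio_files_sketch(parent_dir, sub_dirs, file_ext='*.wav'):
    # walk every fold, extract one feature row per wav (using extract_feature
    # defined above), then stack the rows and labels
    rows, labels = [], []
    for label, sub_dir in enumerate(sub_dirs):
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            mfccs, chroma, mel, contrast, tonnetz = extract_feature(fn)
            rows.append(np.hstack([mfccs, chroma, mel, contrast, tonnetz]))
            labels.append(label)
    return np.vstack(rows), np.array(labels)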


<ipython-input-14-29f39c0472e8> in <module>()
      2 
      3 sub_dirs = ['fold1','fold2','fold3']
----> 4 features, labels = parse_audio_files(parent_dir,sub_dirs)

<ipython-input-13-97fd78a13282> in parse_audio_files(parent_dir, sub_dirs, file_ext)
     22     for label, sub_dir in enumerate(sub_dirs):
     23         for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
---> 24             mfccs, chroma, mel, contrast,tonnetz = extract_feature(fn)
     25             ext_features = np.hstack([mfccs,chroma,mel,contrast,tonnetz])
     26             features = np.vstack([features,ext_features])

<ipython-input-13-97fd78a13282> in extract_feature(file_name)
     15     mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
     16     contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
---> 17     tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
     18     return mfccs,chroma,mel,contrast,tonnetz
     19 

/usr/local/lib/python2.7/dist-packages/librosa/feature/spectral.pyc in tonnetz(y, sr, chroma)
   1155 
   1156     if chroma is None:
-> 1157         chroma = chroma_cqt(y=y, sr=sr)
   1158 
   1159     # Generate Transformation matrix

/usr/local/lib/python2.7/dist-packages/librosa/feature/spectral.pyc in chroma_cqt(y, sr, C, hop_length, fmin, norm, threshold, tuning, n_chroma, n_octaves, window, bins_per_octave, cqt_mode, mode)
    934                                       bins_per_octave=bins_per_octave,
    935                                       tuning=tuning,
--> 936                                       real=False))
    937 
    938     # Map to chroma

/usr/local/lib/python2.7/dist-packages/librosa/core/constantq.pyc in cqt(y, sr, hop_length, fmin, n_bins, bins_per_octave, tuning, filter_scale, aggregate, norm, sparsity, real, resolution)
    249 
    250         # Compute the cqt filter response and append to the stack
--> 251         cqt_resp.append(__cqt_response(my_y, n_fft, my_hop, fft_basis))
    252 
    253 

/usr/local/lib/python2.7/dist-packages/librosa/core/constantq.pyc in __cqt_response(y, n_fft, hop_length, fft_basis)
    529 
    530     # Compute the STFT matrix
--> 531     D = stft(y, n_fft=n_fft, hop_length=hop_length, window=np.ones)
    532 
    533     # And filter response energy

/usr/local/lib/python2.7/dist-packages/librosa/core/spectrum.pyc in stft(y, n_fft, hop_length, win_length, window, center, dtype)
    165 
    166     # Window the time series.
--> 167     y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length)
    168 
    169     # Pre-allocate the STFT matrix

/usr/local/lib/python2.7/dist-packages/librosa/util/utils.pyc in frame(y, frame_length, hop_length)
    100     # Horizontal stride is `hop_length` samples
    101     y_frames = as_strided(y, shape=(frame_length, n_frames),
--> 102                           strides=(y.itemsize, hop_length * y.itemsize))
    103     return y_frames
    104 

/home/dziobak/.local/lib/python2.7/site-packages/numpy/lib/stride_tricks.pyc in as_strided(x, shape, strides, subok, writeable)
    100         interface['strides'] = tuple(strides)
    101 
--> 102     array = np.asarray(DummyArray(interface, base=x))
    103 
    104     if array.dtype.fields is None and x.dtype.fields is not None:

/home/dziobak/.local/lib/python2.7/site-packages/numpy/core/numeric.pyc in asarray(a, dtype, order)
    529 
    530     """
--> 531     return array(a, dtype, copy=False, order=order)
    532 
    533 

TypeError: 'float' object cannot be interpreted as an index

Honestly, I do not understand what is going on there and would appreciate any help.

0 Answers:

No answers yet.