Question

我正在练习“https://github.com/YerevaNN/Spoken-language-identification”中的口语识别代码。

输入的 CSV 数据集是从 'https://gist.github.com/Harhro94/aa11fe6b454c614cdedea882fd00f8d7' 下载的。

第一步是将输入的音频转换为频谱图。我尝试了下面这段代码，但运行时报错。此示例可以使用任意音频文件（wav 文件）。

来自 GitHub 的原始代码：'https://github.com/YerevaNN/Spoken-language-identification/blob/master/create_spectrograms.py'

import numpy as np

from matplotlib import pyplot as plt

import scipy.io.wavfile as wav

from numpy.lib import stride_tricks

import PIL.Image as Image

import os

def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
    """Compute the short-time Fourier transform of a 1-D signal.

    The signal is padded with half a frame of zeros at the front and one
    full frame of zeros at the end, cut into overlapping frames, multiplied
    by the window, and each frame is transformed with a real FFT.

    Args:
        sig: 1-D sequence of samples.
        frameSize: Number of samples per frame.
        overlapFac: Fraction of each frame shared with the next frame.
        window: Callable that returns the window coefficients for a given
            frame length.

    Returns:
        Complex array with one row per frame and ``frameSize // 2 + 1``
        columns (the output of ``np.fft.rfft`` on each frame).
    """
    win = window(frameSize)
    hopSize = int(frameSize - np.floor(overlapFac * frameSize))

    # Sizes and shapes must be real ints: current NumPy raises TypeError
    # when np.zeros or as_strided receive the floats np.floor/np.ceil return.
    samples = np.append(np.zeros(int(np.floor(frameSize / 2.0))), sig)
    cols = int(np.ceil((len(samples) - frameSize) / float(hopSize))) + 1

    # Trailing padding guarantees the last strided frame stays inside the
    # buffer.
    samples = np.append(samples, np.zeros(frameSize))

    frames = stride_tricks.as_strided(
        samples,
        shape=(cols, frameSize),
        strides=(samples.strides[0] * hopSize, samples.strides[0]),
    ).copy()  # copy so the in-place windowing does not touch shared memory
    frames *= win

    return np.fft.rfft(frames)

def logscale_spec(spec, sr=44100, factor=20, alpha=1.0, f0=0.9, fmax=1):
    """Warp the frequency axis of a spectrogram with a piecewise-linear map.

    Only the first 256 frequency columns of ``spec`` are used. Normalised
    bin positions in [0, 1] are scaled by ``alpha`` up to ``f0`` and by a
    second linear segment above it, then rescaled so the largest position
    maps to the last bin. The first and last bins are copied unchanged;
    every other source bin is split linearly between the two target bins
    surrounding its warped position.

    Args:
        spec: 2-D array indexed as (time, frequency).
        sr: Sample rate used to compute the bin centre frequencies.
        factor: Unused; kept so existing callers keep working.
        alpha: Slope of the warp below ``f0``.
        f0: Normalised position where the warp changes slope.
        fmax: Normalised upper end of the warp.

    Returns:
        Tuple ``(newspec, freqs)``: the warped complex spectrogram and a
        list with the weighted mean source frequency of each target bin.
    """
    spec = spec[:, 0:256]
    timebins, freqbins = np.shape(spec)

    # Build a concrete float array. On Python 3, np.array(map(...)) would
    # wrap the lazy map object itself, breaking max() and the scaling below.
    scale = np.array([
        x * alpha if x <= f0
        else (fmax - alpha * f0) / (fmax - f0) * (x - f0) + alpha * f0
        for x in np.linspace(0, 1, freqbins)
    ])
    scale *= (freqbins - 1) / max(scale)

    newspec = np.complex128(np.zeros([timebins, freqbins]))
    allfreqs = np.abs(np.fft.fftfreq(freqbins * 2, 1. / sr)[:freqbins + 1])
    freqs = [0.0] * freqbins
    totw = [0.0] * freqbins

    for i in range(freqbins):
        if i < 1 or i + 1 >= freqbins:
            # Edge bins keep their position and full weight.
            newspec[:, i] += spec[:, i]
            freqs[i] += allfreqs[i]
            totw[i] += 1.0
            continue

        # Split bin i between the two target bins around scale[i].
        j = int(np.floor(scale[i]))
        w_up = scale[i] - np.floor(scale[i])
        w_down = 1 - w_up

        newspec[:, j] += w_down * spec[:, i]
        freqs[j] += w_down * allfreqs[i]
        totw[j] += w_down

        newspec[:, j + 1] += w_up * spec[:, i]
        freqs[j + 1] += w_up * allfreqs[i]
        totw[j + 1] += w_up

    # Normalise accumulated frequencies; bins with (almost) no weight are
    # left as they are to avoid dividing by ~0.
    freqs = [f / w if w > 1e-6 else f for f, w in zip(freqs, totw)]

    return newspec, freqs

def plotstft(audiopath, binsize=2 ** 10, plotpath=None, colormap="gray",
             channel=0, name='sampleaudio.png', alpha=1, offset=0):
    """Render the spectrogram of a WAV file to a greyscale image file.

    Args:
        audiopath: Path of the WAV file to read.
        binsize: Frame size passed to ``stft``.
        plotpath: Unused; kept so existing callers keep working.
        colormap: Unused; kept so existing callers keep working.
        channel: Channel index used when the file has several channels.
        name: Path of the image file to write.
        alpha: Warp slope forwarded to ``logscale_spec``.
        offset: Unused; kept so existing callers keep working.
    """
    samplerate, samples = wav.read(audiopath)

    # wav.read returns a 1-D array for mono files; indexing a channel
    # column on it would raise IndexError, so only select for 2-D data.
    if samples.ndim > 1:
        samples = samples[:, channel]

    s = stft(samples, binsize)
    sshow, _ = logscale_spec(s, factor=1, sr=samplerate, alpha=alpha)
    sshow = sshow[2:, :]  # drop the first two time frames

    # Log-magnitude: 20 * log10 of the amplitude relative to 10e-6.
    ims = 20. * np.log10(np.abs(sshow) / 10e-6)

    # Put frequency on the rows and keep at most 256 of them.
    ims = np.transpose(ims)
    ims = ims[0:256, :]

    image = Image.fromarray(ims)
    image = image.convert('L')
    image.save(name)

# Driver: convert every audio file listed in trainingData.csv to a WAV file
# with mpg123, then render its spectrogram as a PNG.
import subprocess

# NOTE(review): this directory string is kept exactly as it was written in
# the original script; confirm it is a valid location on the target machine.
data_dir = '/C:/AnacondaProj/sampaudio.wav/'
wavfile = 'sampleaudio.wav'

# Read the listing up front so the file handle is closed before the slow work.
with open('trainingData.csv', 'r') as csv_file:
    # The first line is skipped as a header; blank lines carry no entry.
    rows = [row for row in csv_file.readlines()[1:] if row.strip()]

for count, line in enumerate(rows, start=1):
    filepath = line.split(',')[0]
    filename = filepath[:-4]  # strip the 4-character extension

    # Pass arguments as a list so "-w", the output file and the input path
    # are separate arguments (the concatenated string had no spaces between
    # them) and the path is not interpreted by a shell.
    subprocess.run(['mpg123', '-w', wavfile, data_dir + filepath])

    plotstft(wavfile, channel=0, name=data_dir + filename + '.png', alpha=1)

    # The temporary WAV file is left in place and overwritten on the next
    # iteration.
    print("processed %d files" % count)

三江源

通过在python中创建频谱图来识别口语？

0 个答案: