如何使用 Octave 获取 MFCC 特征

时间:2015-05-31 16:47:03

标签: signal-processing octave speech-recognition mfcc

我的目标是在 Octave 中编写一个程序:加载音频文件(wav、flac),计算其 MFCC 特征并将其作为输出。问题是我没有太多 Octave 使用经验,也无法让 Octave 加载音频文件,因此无法确定我的特征提取算法是否正确。有没有简单的方法可以加载文件并获取其特征?

2 个答案:

答案 0 :(得分:2)

您可以在 Octave 中运行 RASTAMAT 的 MFCC 代码,只需修复几处问题;修复后的版本可在此处(here)下载。

更改是在powspec.m中正确设置窗口

DataContext

并修复与matlab不兼容的specgram函数中的bug

答案 1 :(得分:2)

请参阅 https://github.com/jagdish7908/mfcc-octave 上用于计算 MFCC 的 Octave 函数。

有关计算 MFCC 步骤的详细理论,请参阅 http://practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/

 function frame = create_frames(y, Fs, Fsize, Fstep)
  % CREATE_FRAMES  Split a signal into fixed-length, overlapping frames.
  %
  %   frame = create_frames(y, Fs, Fsize, Fstep)
  %
  %   y      signal samples; any vector orientation, it is flattened to a column
  %   Fs     sampling rate in Hz
  %   Fsize  frame length in seconds
  %   Fstep  hop between the starts of consecutive frames in seconds
  %
  %   Returns a samplesPerFrame-by-numFrames matrix holding one frame per
  %   column. Trailing samples that do not fill a whole frame are dropped.
  %   Returns [] when the signal is shorter than one frame.

  % Force a column so every frame becomes a column regardless of how the
  % caller oriented the input.
  y = y(:);
  N = length(y);

  samplesPerFrame = floor(Fs*Fsize);
  samplesPerFramestep = floor(Fs*Fstep);
  if samplesPerFrame < 1 || samplesPerFramestep < 1
    % A zero-sample hop would never advance through the signal.
    error('create_frames: frame size and frame step must each span at least one sample');
  endif

  if N < samplesPerFrame
    frame = [];
    return
  endif

  % Count every frame that fits completely, including the one whose last
  % sample is exactly y(N).
  numFrames = floor((N - samplesPerFrame)/samplesPerFramestep) + 1;

  % Allocate once instead of growing the matrix on every iteration.
  frame = zeros(samplesPerFrame, numFrames);
  for j = 1:numFrames
    startIdx = (j-1)*samplesPerFramestep + 1;
    frame(:, j) = y(startIdx:(startIdx + samplesPerFrame - 1));
  endfor
 endfunction

function mel = hz2mel(f)
  % HZ2MEL  Convert a frequency in hertz to the mel scale.
  %   Uses mel = 1125*ln(1 + f/700). Only scalar multiplication, scalar
  %   division and log are applied, so f may be a scalar or an array.
  scaled = 1 + f/700;
  mel = 1125*log(scaled);
 endfunction

 function hz = mel2hz(f)
  % MEL2HZ  Convert a mel-scale value back to a frequency in hertz.
  %   Uses hz = 700*(exp(f/1125) - 1). Only scalar arithmetic and exp are
  %   applied, so f may be a scalar or an array.
  growth = exp(f/1125);
  hz = 700*(growth - 1);
 endfunction

function bank = melbank(n, min, max, sr)
  % MELBANK  Build a bank of triangular, mel-spaced filters.
  %
  %   bank = melbank(n, min, max, sr)
  %
  %   n    number of filters
  %   min  lowest band edge in hertz
  %   max  highest band edge in hertz
  %   sr   sampling rate in hertz
  %
  %   Returns an NFFT-by-(n+1) matrix (NFFT = 512). Filter j is stored in
  %   column j+1 and column 1 is left all zeros. That layout is kept on
  %   purpose so callers that skip the first column keep working.
  %
  %   The parameter names min and max hide the built-in functions of the
  %   same name inside this function, so those built-ins are not used here.
  NFFT = 512;

  % bin numbers of the overall band limits
  minBin = floor(NFFT*min/(sr/2));
  maxBin = floor(NFFT*max/(sr/2));

  % n filters need n+2 edge points, equally spaced on the mel scale.
  % linspace returns exactly n+2 points. A colon range with a fractional
  % step is not guaranteed to, and one point short would make fbin(vect+1)
  % run past the end for the last filter.
  min_mel = hz2mel(min);
  max_mel = hz2mel(max);
  m = linspace(min_mel, max_mel, n+2);
  h = mel2hz(m);

  % replace the edge frequencies with their respective bin numbers
  fbin = floor(NFFT*h/(sr/2));

  % NOTE(review): k is a bin number counted from zero but is used directly
  % as a one-based row index. Confirm this lines up with the spectrum rows
  % the caller multiplies the bank with.
  % Row 0 does not exist, so start at row 1 when the lower limit is bin 0.
  if minBin < 1
    firstBin = 1;
  else
    firstBin = minBin;
  endif

  % create the triangular mel filter vectors
  H = zeros(NFFT, n+1);
  for vect = 2:n+1
    for k = firstBin:maxBin
      if k == fbin(vect)
        % Apex of the triangle. Handled separately so that two edges landing
        % in the same bin give 1 instead of 0/0 = NaN.
        H(k,vect) = 1;
      elseif k >= fbin(vect-1) && k < fbin(vect)
        % rising slope
        H(k,vect) = (k-fbin(vect-1))/(fbin(vect)-fbin(vect-1));
      elseif k > fbin(vect) && k <= fbin(vect+1)
        % falling slope
        H(k,vect) = (fbin(vect+1) - k)/(fbin(vect+1)-fbin(vect));
      endif
    endfor
  endfor
  bank = H;
 endfunction

clc;
% Plain `clear` removes variables only. `clear all` also clears functions
% from the symbol table, which would discard helper functions defined
% earlier in the same script file.
clear;
close all;
pkg load signal;

% record audio
Fs = 44100;
y = record(3, Fs);
% OR %
% Load existing file (audioread replaces wavread in current Octave)
%[y, Fs] = audioread('../FILE_PATH/');
%y = y(44100:2*44100);

% create mel filterbanks
minFreq = 500;     % minimum cutoff frequency in Hz
maxFreq = 10000;   % maximum cutoff frequency in Hz
% melbank(number_of_banks, minFreq, maxFreq, sampling_rate)
foo = melbank(30, minFreq, maxFreq, Fs);
nBins = size(foo, 1);        % spectrum rows covered by the filterbank (512)
nfft = 2*nBins;              % periodogram length (1024)
nCoeff = size(foo, 2) - 1;   % column 1 of the bank is all zeros and is skipped

% create frames, one per column
frames = create_frames(y, Fs, 0.025, 0.010);
NF = size(frames, 2);

% one column of coefficients per frame, allocated once
PXX = zeros(nCoeff, NF);
for i = 1:NF
  % periodogram of the frame
  P = periodogram(frames(:,i), [], nfft, Fs);
  % apply the mel filters to the power spectrum and sum the energy per filter
  bandEnergy = sum(foo .* P(1:nBins));
  % drop the first band: it is always zero and its logarithm would be -Inf
  bandEnergy = bandEnergy(2:end);
  % a silent band would also give log(0) = -Inf and poison the DCT, so
  % floor it at the smallest positive normalized double
  bandEnergy(bandEnergy < realmin) = realmin;
  % take the DCT of the log filterbank energies
  coeffs = dct(log(bandEnergy));
  PXX(:, i) = coeffs(:);
endfor
plot(PXX);