我是Matlab的新手,现在遇到了使用PNCC和MFFC实现简单的说话人识别系统的问题。 我的问题实际上是在矩阵维度上,当我运行程序时,它给了我这个错误:
Matrix dimensions must agree.
disteu错误(第43行) d(n,:) = sum((x(:, n +副本-y)。^ 2,1); 测试错误(第22行) d = disteu(v,代码{l}); 主线错误(第4行) 测试('C:\ Users \ Antonio \ Documents \ MATLAB \ test',5,代码);
为清楚起见,我附上了我的代码。 有人可以帮我吗?
function d = disteu(x, y)
% DISTEU Pairwise Euclidean distances between columns of two matrices
%
% Input:
% x, y: Two matrices whose each column is an a vector data.
%
% Output:
% d: Element d(i,j) will be the Euclidean distance between two
% column vectors X(:,i) and Y(:,j)
%
% Note:
% The Euclidean distance D between two vectors X and Y is:
% D = sum((x-y).^2).^0.5
% D = sum((x-y).^2).^0.5
[M, N] = size(x);
[M2, P] = size(y);
if (M ~= M2)
y=padarray(y,0,0,'post');
x=padarray(x,21,0,'post');
[M, N] = size(x)
[M2, P] = size(y)
y=padarray(y,0,0,'post');
[M2, P] = size(y)
end
%error('Matrix dimensions do not match.')
d = zeros(N, P);
if (N < P)
copies = zeros(1,P);
for n = 1:N
d(n,:) = sum((x(:, n+copies) - y) .^2, 1);
end
else
copies = zeros(1,N);
for p = 1:P
d(:,p) = sum((x - y(:, p+copies)) .^2, 1)';
end
end
d = d.^0.5;
function [aadDCT] = PNCC(rawdata, fsamp)
ad_x = rawdata;
%addpath voicebox/; % With Spectral Subtraction - default parameters
%ad_x = specsub(rawdata, fsamp);
dLamda_L = 0.999;
dLamda_S = 0.999;
dSampRate = fsamp;
dLowFreq = 200;% Changed to 40 from 200 as low freq is 40 in gabor as well
dHighFreq = dSampRate / 2;
dPowerCoeff = 1 / 15;
iFiltType = 1;
dFactor = 2.0;
dGammaThreshold = 0.005;
iM = 0; % Changed from 2 to 0 as number of frames coming out to be different due to queue
iN = 4;
iSMType = 0;
dLamda = 0.999;
dLamda2 = 0.5;
dDelta1 = 1;
dLamda3 = 0.85;
dDelta2 = 0.2;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Flags
%
bPreem = 1; % pre-emphasis flag
bSSF = 1;
bPowerLaw = 1;
bDisplay = 0;
dFrameLen = 0.025; % 25.6 ms window length, which is the default setting in CMU Sphinx
dFramePeriod = 0.010; % 10 ms frame period
iPowerFactor = 1;
global iNumFilts;
iNumFilts = 40;
if iNumFilts<30
iFFTSize = 512;
else
iFFTSize = 1024;
end
% For derivatives
deltawindow = 2; % to calculate 1st derivative
accwindow = 2; % to calculate 2nd derivative
% numcoeffs = 13; % number of cepstral coefficients to be used
numcoeffs = 13;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Flags
%
%
% Array Queue Ring-buffer
%
global Queue_aad_P;
global Queue_iHead;
global Queue_iTail;
global Queue_iWindow;
global Queue_iNumElem;
Queue_iWindow = 2 * iM + 1;
Queue_aad_P = zeros(Queue_iWindow, iNumFilts);
Queue_iHead = 0;
Queue_iTail = 0;
Queue_iNumElem = 0;
iFL = floor(dFrameLen * dSampRate);
iFP = floor(dFramePeriod * dSampRate);
iNumFrames = floor((length(ad_x) - iFL) / iFP) + 1;
iSpeechLen = length(ad_x);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Pre-emphasis using H(z) = 1 - 0.97 z ^ -1
%
if (bPreem == 1)
ad_x = filter([1 -0.97], 1, double(ad_x));
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Obtaning the gammatone coefficient.
%
% Based on M. Snelly's auditory toolbox.
% In actual C-implementation, we just use a table
%
bGamma = 1;
[wts,binfrqs] = fft2melmx(iFFTSize, dSampRate, iNumFilts, 1, dLowFreq, dHighFreq, iFiltType);
wts = wts';
wts(size(wts, 1) / 2 + 1 : size(wts, 1), : ) = [];
aad_H = wts;
i_FI = 0;
i_FI_Out = 0;
if bSSF == 1
adSumPower = zeros(1, iNumFrames - 2 * iM);
else
adSumPower = zeros(1, iNumFrames);
end
%dLamda_L = 0.998;
aad_P = zeros(iNumFrames, iNumFilts);
aad_P_Out = zeros(iNumFrames - 2 * iM, iNumFilts);
ad_Q = zeros(1, iNumFilts);
ad_Q_Out = zeros(1, iNumFilts);
ad_QMVAvg = zeros(1, iNumFilts);
ad_w = zeros(1, iNumFilts);
ad_w_sm = zeros(1, iNumFilts);
ad_QMVAvg_LA = zeros(1, iNumFilts);
MEAN_POWER = 1e10;
dMean = 5.8471e+08;
dPeak = 2.7873e+09 / 15.6250;
% (1.7839e8, 2.0517e8, 2.4120e8, 2.9715e8, 3.9795e8) 95, 96, 97, 98, 99
% percentile from WSJ-si84
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
dPeakVal = 4e+07;% % 4.0638e+07 --> Mean from WSJ0-si84 (Important!!!)
%%%%%%%%%%%
dMean = dPeakVal;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Obtaining the short-time Power P(i, j)
%
for m = 0 : iFP : iSpeechLen - iFL
ad_x_st = ad_x(m + 1 : m + iFL) .* hamming(iFL);
adSpec = fft(ad_x_st, iFFTSize);
ad_X = abs(adSpec(1: iFFTSize / 2));
aadX(:, i_FI + 1) = ad_X;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Calculating the Power P(i, j)
%
for j = 1 : iNumFilts
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Squared integration
%
if iFiltType == 2
aad_P(i_FI + 1, j) = sum((ad_X .* aad_H(:, j)) .^ 2);
else
aad_P(i_FI + 1, j) = sum((ad_X .^ 2 .* aad_H(:, j)));
end
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Calculating the Power P(i, j)
%
dSumPower = sum(aad_P(i_FI + 1, : ));
if bSSF == 1
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Ring buffer (using a Queue)
%
if (i_FI >= 2 * iM + 1)
Queue_poll();
end
Queue_offer(aad_P(i_FI + 1, :));
ad_Q = Queue_avg();
if (i_FI == 2 * iM)
ad_QMVAvg = ad_Q.^ (1 / 15);
ad_PBias = (ad_Q) * 0.9;
end
if (i_FI >= 2 * iM)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Bias Update
%
for i = 1 : iNumFilts,
if (ad_Q(i) > ad_PBias(i))
ad_PBias(i) = dLamda * ad_PBias(i) + (1 - dLamda) * ad_Q(i);
else
ad_PBias(i) = dLamda2 * ad_PBias(i) + (1 - dLamda2) * ad_Q(i);
end
end
for i = 1 : iNumFilts,
ad_Q_Out(i) = max(ad_Q(i) - ad_PBias(i), 0) ;
if (i_FI == 2 * iM)
ad_QMVAvg2(i) = 0.9 * ad_Q_Out(i);
ad_QMVAvg3(i) = ad_Q_Out(i);
ad_QMVPeak(i) = ad_Q_Out(i);
end
if (ad_Q_Out(i) > ad_QMVAvg2(i))
ad_QMVAvg2(i) = dLamda * ad_QMVAvg2(i) + (1 - dLamda) * ad_Q_Out(i);
else
ad_QMVAvg2(i) = dLamda2 * ad_QMVAvg2(i) + (1 - dLamda2) * ad_Q_Out(i);
end
dOrg = ad_Q_Out(i);
ad_QMVAvg3(i) = dLamda3 * ad_QMVAvg3(i);
if (ad_Q(i) < dFactor * ad_PBias(i))
ad_Q_Out(i) = ad_QMVAvg2(i);
else
if (ad_Q_Out(i) <= dDelta1 * ad_QMVAvg3(i))
ad_Q_Out(i) = dDelta2 * ad_QMVAvg3(i);
end
end
ad_QMVAvg3(i) = max(ad_QMVAvg3(i), dOrg);
ad_Q_Out(i) = max(ad_Q_Out(i), ad_QMVAvg2(i));
end
ad_w = ad_Q_Out ./ max(ad_Q, eps);
for i = 1 : iNumFilts,
if iSMType == 0
ad_w_sm(i) = mean(ad_w(max(i - iN, 1) : min(i + iN ,iNumFilts)));
elseif iSMType == 1
ad_w_sm(i) = exp(mean(log(ad_w(max(i - iN, 1) : min(i + iN ,iNumFilts)))));
elseif iSMType == 2
ad_w_sm(i) = mean((ad_w(max(i - iN, 1) : min(i + iN ,iNumFilts))).^(1/15))^15;
elseif iSMType == 3
ad_w_sm(i) = (mean( (ad_w(max(i - iN, 1) : min(i + iN ,iNumFilts))).^15 )) ^ (1 / 15);
end
end
aad_P_Out(i_FI_Out + 1, :) = ad_w_sm .* aad_P(i_FI - iM + 1, :);
adSumPower(i_FI_Out + 1) = sum(aad_P_Out(i_FI_Out + 1, :));
if adSumPower(i_FI_Out + 1) > dMean
dMean = dLamda_S * dMean + (1 - dLamda_S) * adSumPower(i_FI_Out + 1);
else
dMean = dLamda_L * dMean + (1 - dLamda_L) * adSumPower(i_FI_Out + 1);
end
aad_P_Out(i_FI_Out + 1, :) = aad_P_Out(i_FI_Out + 1, :) / (dMean) * MEAN_POWER;
i_FI_Out = i_FI_Out + 1;
end
else % if not SSF
adSumPower(i_FI + 1) = sum(aad_P(i_FI + 1, :));
if adSumPower(i_FI_Out + 1) > dMean
dMean = dLamda_S * dMean + (1 - dLamda_S) * adSumPower(i_FI_Out + 1);
else
dMean = dLamda_L * dMean + (1 - dLamda_L) * adSumPower(i_FI_Out + 1);
end
aad_P_Out(i_FI + 1, :) = aad_P(i_FI + 1, :) / (dMean) * MEAN_POWER;
end
i_FI = i_FI + 1;
end
%adSorted = sort(adSumPower);
%dMaxPower = adSorted(round(0.98 * length(adSumPower)));
%aad_P_Out = aad_P_Out / dMaxPower * 1e10;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Apply the nonlinearity
%
%dPowerCoeff
if bPowerLaw == 1
aadSpec = aad_P_Out .^ dPowerCoeff;
else
aadSpec = log(aad_P_Out + eps);
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% DCT
%
aadDCT = dct(aadSpec')';
%aadDCT(:, numcoeffs+1:iNumFilts) = [];
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% MVN
%
% for i = 1 : numcoeffs
% aadDCT( :, i ) = (aadDCT( : , i ) - mean(aadDCT( : , i)))/std(aadDCT(:,i));
% end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Temporal Derivatives
% calculate 1st derivative (velocity)
dt1 = deltacc(aadDCT', deltawindow);
% calculate 2nd derivative (acceleration)
dt2 = deltacc(dt1, accwindow);
% append dt1 and dt2 to mfcco
aadDCT = [aadDCT'; dt2];
% aadDCT = [aadDCT'; dt2];
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Display
%
if bDisplay == 1
figure
aadSpec = idct(aadDCT', iNumFilts);
imagesc(aadSpec); axis xy;
end
aadDCT = aadDCT';
%{
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Writing the feature in Sphinx format
%
[iM, iN] = size(aadDCT);
iNumData = iM * iN;
fid = fopen(szOutFeatFileName, 'wb');
fwrite(fid, iNumData, 'int32');
iCount = fwrite(fid, aadDCT(:), 'float32');
fclose(fid);
%}
end
function dt = deltacc(input, winlen)
% calculates derivatives of a matrix, whose columns are feature vectors
tmp = 0;
for cnt = 1 : winlen
tmp = tmp + cnt*cnt;
end
nrm = 1 / (2*tmp);
dt = zeros(size(input));
rows = size(input,1);
cols = size(input,2);
for col = 1 : cols
for cnt = 1 : winlen
inx1 = col - cnt; inx2 = col + cnt;
if inx1 < 1; inx1 = 1; end
if inx2 > cols; inx2 = cols; end
dt(:, col) = dt(:, col) + (input(:, inx2) - input(:, inx1)) * cnt;
end
end
dt = dt * nrm;
end
function [] = Queue_offer(ad_x)
global Queue_aad_P;
global Queue_iHead;
global Queue_iTail;
global Queue_iWindow;
global Queue_iNumElem;
Queue_aad_P(Queue_iTail + 1, :) = ad_x;
Queue_iTail = mod(Queue_iTail + 1, Queue_iWindow);
Queue_iNumElem = Queue_iNumElem + 1;
if Queue_iNumElem > Queue_iWindow
error ('Queue overflow');
end
end
function [ad_x] = Queue_poll()
global Queue_aad_P;
global Queue_iHead;
global Queue_iTail;
global Queue_iWindow;
global Queue_iNumElem;
if Queue_iNumElem <= 0
error ('No elements');
end
ad_x = Queue_aad_P(Queue_iHead + 1, :);
Queue_iHead = mod(Queue_iHead + 1, Queue_iWindow);
Queue_iNumElem = Queue_iNumElem - 1;
end
function[adMean] = Queue_avg()
global Queue_aad_P;
global Queue_iHead;
global Queue_iTail;
global Queue_iWindow;
global Queue_iNumElem;
global iNumFilts;
adMean = zeros(1, iNumFilts); % Changed from 40 (number of filter banks)
iPos = Queue_iHead;
for i = 1 : Queue_iNumElem
adMean = adMean + Queue_aad_P(iPos + 1 ,: );
iPos = mod(iPos + 1, Queue_iWindow);
end
adMean = adMean / Queue_iNumElem;
end
function test(testdir, n, code)
for k = 1:n % read test sound file of each speaker
file = sprintf('%ss%d.wav', testdir, k);
[s, fs] = audioread(file);
%x = s + 0.01*randn(length(s),1); %AWGN Noise
%[SNR1] = snr(s);
%[SNR2] = snr(x) ;
v = PNCC(s, fs); % Compute MFCC's
distmin = inf;
k1 = 0;
for l = 1:length(code) % each trained codebook, compute distortion
d = disteu(v, code{l});
dist = sum(min(d,[],2)) / size(d,1);
if dist < distmin
distmin = dist;
k1 = l;
end
end
msg = sprintf('speaker%d -->> s%d', k, k1);
disp(msg);
end
function r = vqlbg(d,k)
%
% Inputs: d contains training data vectors (one per column)
% k is number of centroids required
e = .01;
r = mean(d, 2);
dpr = 10000;
for i = 1:log2(k)
r = [r*(1+e), r*(1-e)];
while (1 == 1)
z = interdists(d, r);
[m,ind] = min(z, [], 2);
t = 0;
for j = 1:2^i
r(:, j) = mean(d(:, find(ind == j)), 2);
x = interdists(d(:, find(ind == j)), r(:, j));
for q = 1:length(x)
t = t + x(q);
end
end
if (((dpr - t)/t) < e)
break;
else
dpr = t;
end
end
end %Output: r contains the result VQ codebook (k columns, one for each centroids)