说话人识别系统中的欧几里得距离

时间:2020-08-23 13:06:14

标签: matlab matrix

我是Matlab的新手,现在遇到了使用PNCC和MFFC实现简单的说话人识别系统的问题。 我的问题实际上是在矩阵维度上,当我运行程序时,它给了我这个错误:

Matrix dimensions must agree.

disteu错误(第43行) d(n,:) = sum((x(:, n +副本-y)。^ 2,1); 测试错误(第22行) d = disteu(v,代码{l}); 主线错误(第4行) 测试('C:\ Users \ Antonio \ Documents \ MATLAB \ test',5,代码);

为清楚起见,我附上了我的代码。 有人可以帮我吗?

function d = disteu(x, y)
% DISTEU Pairwise Euclidean distances between columns of two matrices
%
% Input:
%       x, y:   Two matrices whose each column is an a vector data.
%
% Output:
%       d:      Element d(i,j) will be the Euclidean distance between two
%               column vectors X(:,i) and Y(:,j)
%
% Note:
%       The Euclidean distance D between two vectors X and Y is:
%       D = sum((x-y).^2).^0.5

 %  D = sum((x-y).^2).^0.5
 [M, N] = size(x);
[M2, P] = size(y);



if (M ~= M2)
y=padarray(y,0,0,'post');
x=padarray(x,21,0,'post');


[M, N] = size(x)
[M2, P] = size(y)

y=padarray(y,0,0,'post');
[M2, P] = size(y)

    

end

%error('Matrix dimensions do not match.')

d = zeros(N, P);

if (N < P)
    copies = zeros(1,P);
    for n = 1:N
        d(n,:) = sum((x(:, n+copies) - y) .^2, 1);
    end
else
    copies = zeros(1,N);
    for p = 1:P
        
        
        d(:,p) = sum((x - y(:, p+copies)) .^2, 1)';
    end
end

d = d.^0.5;

function [aadDCT] = PNCC(rawdata, fsamp)



    ad_x = rawdata;
    %addpath voicebox/; % With Spectral Subtraction - default parameters
        %ad_x = specsub(rawdata, fsamp);    

    dLamda_L = 0.999;
    dLamda_S = 0.999;

    dSampRate   = fsamp;
    dLowFreq      = 200;% Changed to 40 from 200 as low freq is 40 in gabor as well
    dHighFreq     = dSampRate / 2;
    dPowerCoeff = 1 / 15;

    iFiltType = 1;
    dFactor = 2.0;

    dGammaThreshold = 0.005;

    iM = 0; % Changed from 2 to 0 as number of frames coming out to be different due to queue
    iN = 4;

    iSMType = 0;
    
    dLamda  = 0.999;
    dLamda2 = 0.5;
    dDelta1 = 1;

    dLamda3 = 0.85;
    dDelta2 = 0.2;
  
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % Flags
       %
    bPreem         = 1; % pre-emphasis flag
    bSSF             = 1;
    bPowerLaw    = 1;
    bDisplay        = 0;
     

    dFrameLen     = 0.025;  % 25.6 ms window length, which is the default setting in CMU Sphinx
    dFramePeriod = 0.010;   % 10 ms frame period
    iPowerFactor  = 1;

    global  iNumFilts;
    iNumFilts = 40;
    
        if iNumFilts<30
           iFFTSize  = 512;
        else
           iFFTSize  = 1024;
        end

    % For derivatives
    deltawindow = 2;    % to calculate 1st derivative
    accwindow   = 2;        % to calculate 2nd derivative

       %    numcoeffs = 13;     % number of cepstral coefficients to be used
    numcoeffs = 13;
       %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % Flags
    %
    %
    % Array Queue Ring-buffer
    %
    global Queue_aad_P;
    global Queue_iHead;
    global Queue_iTail;
    global Queue_iWindow;
    global Queue_iNumElem;

    Queue_iWindow  = 2 * iM + 1;
    Queue_aad_P    = zeros(Queue_iWindow, iNumFilts);
    Queue_iHead    = 0;
    Queue_iTail    = 0;
    Queue_iNumElem = 0;
   
    iFL        = floor(dFrameLen    * dSampRate);
    iFP        = floor(dFramePeriod * dSampRate);
    iNumFrames = floor((length(ad_x) - iFL) / iFP) + 1;
    iSpeechLen = length(ad_x);
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % Pre-emphasis using H(z) = 1 - 0.97 z ^ -1
    %
    if (bPreem == 1)
        ad_x = filter([1 -0.97], 1, double(ad_x));
    end

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % Obtaning the gammatone coefficient. 
    %
    % Based on M. Snelly's auditory toolbox. 
    % In actual C-implementation, we just use a table
    %
    bGamma = 1;
    
    [wts,binfrqs]  = fft2melmx(iFFTSize, dSampRate, iNumFilts, 1, dLowFreq, dHighFreq, iFiltType);
    wts = wts';
    wts(size(wts, 1) / 2 + 1 : size(wts, 1), : ) = [];
    aad_H = wts;
    
    i_FI     = 0;
    i_FI_Out = 0;

    if bSSF == 1
        adSumPower = zeros(1, iNumFrames - 2 * iM);
    else
        adSumPower = zeros(1, iNumFrames);
    end
     
    %dLamda_L   = 0.998;
    aad_P      = zeros(iNumFrames,      iNumFilts);
    aad_P_Out  = zeros(iNumFrames - 2 * iM,      iNumFilts);
    ad_Q       = zeros(1,               iNumFilts);
    ad_Q_Out   = zeros(1,               iNumFilts);
    ad_QMVAvg  = zeros(1,               iNumFilts);
    ad_w       = zeros(1,               iNumFilts);
    ad_w_sm    = zeros(1,               iNumFilts);
    ad_QMVAvg_LA = zeros(1,               iNumFilts);

    MEAN_POWER = 1e10;

    dMean  = 5.8471e+08;
    dPeak = 2.7873e+09 / 15.6250;
    % (1.7839e8, 2.0517e8, 2.4120e8, 2.9715e8, 3.9795e8) 95, 96, 97, 98, 99
    % percentile from WSJ-si84
                %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    dPeakVal = 4e+07;% % 4.0638e+07  --> Mean from WSJ0-si84  (Important!!!)
                    %%%%%%%%%%%
    dMean = dPeakVal;
    
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % Obtaining the short-time Power P(i, j)
    %
    for m = 0 : iFP : iSpeechLen  - iFL 
        ad_x_st                = ad_x(m + 1 : m + iFL) .* hamming(iFL);
        adSpec                 = fft(ad_x_st, iFFTSize);
        ad_X                   = abs(adSpec(1: iFFTSize / 2));
        aadX(:, i_FI + 1)      = ad_X; 
        %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        %
        % Calculating the Power P(i, j)
        %
        for j = 1 : iNumFilts
                %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
                %
                % Squared integration
                %
                
                if iFiltType == 2
                    aad_P(i_FI + 1, j)  = sum((ad_X .* aad_H(:, j)) .^ 2);
                else
                    aad_P(i_FI + 1, j)  = sum((ad_X .^ 2 .* aad_H(:, j)));
                end
                
        end
        
            %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
            %
            % Calculating the Power P(i, j)
            %
            
            dSumPower = sum(aad_P(i_FI + 1, : ));
                 
            if bSSF == 1
            %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
            %
            % Ring buffer (using a Queue)
            %
            if (i_FI >= 2 * iM + 1)
                Queue_poll();
                end
                Queue_offer(aad_P(i_FI + 1, :));

                ad_Q = Queue_avg();

            if (i_FI == 2 * iM)
                ad_QMVAvg     = ad_Q.^ (1 / 15);
                ad_PBias  =  (ad_Q) * 0.9;
            end
          
            if (i_FI >= 2 * iM)  
                %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
                %
                % Bias Update
                %
                for i = 1 : iNumFilts,
                    if (ad_Q(i) > ad_PBias(i))
                       ad_PBias(i) = dLamda * ad_PBias(i)  + (1 - dLamda) * ad_Q(i);
                    else
                       ad_PBias(i) = dLamda2 * ad_PBias(i) + (1 - dLamda2) * ad_Q(i);
                    end
                end
                        
                   for i = 1 : iNumFilts,
                ad_Q_Out(i) =   max(ad_Q(i) - ad_PBias(i), 0) ;

                if (i_FI == 2 * iM)
                    ad_QMVAvg2(i)  =  0.9 * ad_Q_Out(i);
                    ad_QMVAvg3(i)  =  ad_Q_Out(i);
                    ad_QMVPeak(i)  =  ad_Q_Out(i);
                end

                if (ad_Q_Out(i) > ad_QMVAvg2(i))
                     ad_QMVAvg2(i) = dLamda * ad_QMVAvg2(i)  + (1 -  dLamda)  *  ad_Q_Out(i);
                else
                     ad_QMVAvg2(i) = dLamda2 * ad_QMVAvg2(i) + (1 -  dLamda2) *  ad_Q_Out(i);
                end

                dOrg =  ad_Q_Out(i);

                ad_QMVAvg3(i) = dLamda3 * ad_QMVAvg3(i);
                      
                if (ad_Q(i) <  dFactor * ad_PBias(i))
                    ad_Q_Out(i) = ad_QMVAvg2(i);
                else
                     if (ad_Q_Out(i) <= dDelta1 *  ad_QMVAvg3(i))
                        ad_Q_Out(i) = dDelta2 * ad_QMVAvg3(i);
                     end
                end
                ad_QMVAvg3(i) = max(ad_QMVAvg3(i),   dOrg);

                ad_Q_Out(i) =  max(ad_Q_Out(i), ad_QMVAvg2(i));
            end
                  ad_w      =   ad_Q_Out ./ max(ad_Q, eps);

            for i = 1 : iNumFilts,
                 if iSMType == 0
                            ad_w_sm(i) = mean(ad_w(max(i - iN, 1) : min(i + iN ,iNumFilts)));
                    elseif iSMType == 1
                            ad_w_sm(i) = exp(mean(log(ad_w(max(i - iN, 1) : min(i + iN ,iNumFilts)))));
                    elseif iSMType == 2
                         ad_w_sm(i) = mean((ad_w(max(i - iN, 1) : min(i + iN ,iNumFilts))).^(1/15))^15;
                    elseif iSMType == 3
                    ad_w_sm(i) = (mean(  (ad_w(max(i - iN, 1) : min(i + iN ,iNumFilts))).^15 )) ^ (1 / 15); 
                    end
            end        
                    
                aad_P_Out(i_FI_Out + 1, :) = ad_w_sm .* aad_P(i_FI - iM + 1, :);
                adSumPower(i_FI_Out + 1)   = sum(aad_P_Out(i_FI_Out + 1, :));

                if  adSumPower(i_FI_Out + 1) > dMean
                     dMean = dLamda_S * dMean + (1 - dLamda_S) * adSumPower(i_FI_Out + 1);
                else
                     dMean = dLamda_L * dMean + (1 - dLamda_L) * adSumPower(i_FI_Out + 1);
                end
                
                aad_P_Out(i_FI_Out + 1, :) = aad_P_Out(i_FI_Out + 1, :) / (dMean)  * MEAN_POWER;
                i_FI_Out = i_FI_Out + 1;
                
        end
           
    else % if not SSF
        adSumPower(i_FI + 1)   = sum(aad_P(i_FI + 1, :));
             
        if  adSumPower(i_FI_Out + 1) > dMean
             dMean = dLamda_S * dMean + (1 - dLamda_S) * adSumPower(i_FI_Out + 1);
        else
             dMean = dLamda_L * dMean + (1 - dLamda_L) * adSumPower(i_FI_Out + 1);
        end

        aad_P_Out(i_FI + 1, :) = aad_P(i_FI + 1, :) / (dMean)  * MEAN_POWER;
        end
        i_FI = i_FI + 1;
    end

    
    %adSorted  = sort(adSumPower);
    %dMaxPower = adSorted(round(0.98 * length(adSumPower)));
    %aad_P_Out = aad_P_Out / dMaxPower * 1e10;

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % Apply the nonlinearity
    %
    %dPowerCoeff
    if bPowerLaw == 1
        aadSpec = aad_P_Out .^ dPowerCoeff;
    else
        aadSpec = log(aad_P_Out + eps);
    end

    
    
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % DCT
    %
    aadDCT                  = dct(aadSpec')';
    
    
    %aadDCT(:, numcoeffs+1:iNumFilts) = [];

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % MVN
    %
%   for i = 1 : numcoeffs
%          aadDCT( :, i ) = (aadDCT( : , i ) - mean(aadDCT( : , i)))/std(aadDCT(:,i));
%   end

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % Temporal Derivatives
    % calculate 1st derivative (velocity)
    dt1 = deltacc(aadDCT', deltawindow);

    % calculate 2nd derivative (acceleration)
    dt2 = deltacc(dt1, accwindow);
    % append dt1 and dt2 to mfcco
    aadDCT = [aadDCT'; dt2];
        % aadDCT = [aadDCT'; dt2];

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % Display
    %
    if bDisplay == 1
        figure
        
        aadSpec = idct(aadDCT', iNumFilts);
        imagesc(aadSpec); axis xy;
    end

    aadDCT = aadDCT';
    
    %{
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %
    % Writing the feature in Sphinx format
    %
    [iM, iN] = size(aadDCT);
    iNumData = iM * iN;
    fid = fopen(szOutFeatFileName, 'wb');
    fwrite(fid, iNumData, 'int32');
    iCount = fwrite(fid, aadDCT(:), 'float32');
    fclose(fid);
    %}
    
end


function dt = deltacc(input, winlen)
% calculates derivatives of a matrix, whose columns are feature vectors

tmp = 0;
for cnt = 1 : winlen
    tmp = tmp + cnt*cnt;
end
nrm = 1 / (2*tmp);

dt   = zeros(size(input));
rows = size(input,1);
cols = size(input,2);
for col = 1 : cols
    for cnt = 1 : winlen
        inx1 = col - cnt; inx2 = col + cnt;
        if inx1 < 1;     inx1 = 1;     end
        if inx2 > cols;  inx2 = cols;  end
        dt(:, col) = dt(:, col) + (input(:, inx2) - input(:, inx1)) * cnt;
    end
end
dt = dt * nrm;
end

function [] = Queue_offer(ad_x)
    global Queue_aad_P;
    global Queue_iHead;
    global Queue_iTail;
    global Queue_iWindow;
    global Queue_iNumElem;
    
    Queue_aad_P(Queue_iTail + 1, :) = ad_x;
    
    Queue_iTail    = mod(Queue_iTail + 1, Queue_iWindow);
    Queue_iNumElem = Queue_iNumElem + 1;
    
    if Queue_iNumElem > Queue_iWindow
       error ('Queue overflow'); 
    end
    
  
end


function [ad_x] = Queue_poll()
    global Queue_aad_P;
    global Queue_iHead;
    global Queue_iTail;
    global Queue_iWindow;
    global Queue_iNumElem;
    
   
    
    if Queue_iNumElem <= 0
       error ('No elements'); 
    end
    
    
    ad_x =  Queue_aad_P(Queue_iHead + 1, :);
    
    Queue_iHead    = mod(Queue_iHead + 1, Queue_iWindow);
    Queue_iNumElem = Queue_iNumElem - 1;
 
end


function[adMean] = Queue_avg()

    global Queue_aad_P;
    global Queue_iHead;
    global Queue_iTail;
    global Queue_iWindow;
    global Queue_iNumElem;
    global iNumFilts;
    
    adMean = zeros(1, iNumFilts);  % Changed from 40 (number of filter banks)

    
    iPos = Queue_iHead;
    
    
    for i = 1 : Queue_iNumElem
        adMean = adMean + Queue_aad_P(iPos + 1 ,: );
        iPos   = mod(iPos + 1, Queue_iWindow);
    end
    
    adMean = adMean / Queue_iNumElem;

end

function test(testdir, n, code)


for k = 1:n                     % read test sound file of each speaker
    file = sprintf('%ss%d.wav', testdir, k);
    [s, fs] = audioread(file); 
    %x = s + 0.01*randn(length(s),1);  %AWGN Noise
    
   
%[SNR1] = snr(s);
%[SNR2] = snr(x) ;


    v = PNCC(s, fs);            % Compute MFCC's
   
    distmin = inf;
    k1 = 0;
   
    for l = 1:length(code)      % each trained codebook, compute distortion
        d = disteu(v, code{l}); 
        dist = sum(min(d,[],2)) / size(d,1);
      
        if dist < distmin
            distmin = dist;
            k1 = l;
            
        end      
    end
   
    msg = sprintf('speaker%d -->> s%d', k, k1);
    disp(msg);
    
end

function r = vqlbg(d,k)
%
% Inputs: d contains training data vectors (one per column)
%         k is number of centroids required

e   = .01;
r   = mean(d, 2);
dpr = 10000;

for i = 1:log2(k)
    r = [r*(1+e), r*(1-e)];
    
    while (1 == 1)
        z = interdists(d, r);
        [m,ind] = min(z, [], 2);
        t = 0;
        for j = 1:2^i
            r(:, j) = mean(d(:, find(ind == j)), 2);
            x = interdists(d(:, find(ind == j)), r(:, j));
            for q = 1:length(x)
                t = t + x(q);
            end
        end
        if (((dpr - t)/t) < e)
            break;
        else
            dpr = t;
        end
   end    
end %Output: r contains the result VQ codebook (k columns, one for each centroids)

0 个答案:

没有答案