Question

我有一个 Matlab mex 函数，它重复调用一个名为 calculate() 的 C 函数。我为该函数创建了两个版本：

版本A：每次 mex() 调用 calculate() 时，只传递输入参数，calculate() 所需的所有内存都在 calculate() 内部分配和释放——每次调用都是如此！

版本B：calculate() 所需的内存在 mex() 的开头分配，并把指针传递给 calculate()。内存仅在 mex() 结束时释放。换句话说，整个过程中分配/释放只进行一次。

考虑到内存分配需要时间，我天真地以为版本B应该能提高速度，于是编写了它。但实际上它的执行时间反而增加到了原来的大约5倍！这到底是怎么回事？

以下是版本B的实际代码——从被注释掉的代码中可以看到我最初在版本A里是在哪里分配内存的。

/* MEX business*/
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]){
   /* mexPrintf("Inside the mex function of CalcMz2.\n");
mexEvalString("drawnow;");*/

/*declare local variables*/
struct InputParameters voxelPars;
InputParameters *point2pars;
struct Zcontrast contrast;
struct Zcontrast* ptr2contrast;
ptr2contrast = &contrast;
mxArray *in, *out;
mxArray *temp; 
double *output;
int nFields;
int i,j,k;

in = mxDuplicateArray(prhs[0]); /*'in' now is a copy of the input struct*/
nFields = mxGetNumberOfFields(prhs[0]); 
if (nFields != 39) mexPrintf("Error: the number of fields in the input struct is incorrect.\n");

 //associate outputs
    out = plhs[0] = mxCreateDoubleScalar(0.0);
    output = mxGetPr(out);

    /*Passed variables memory allocation*/
double **pntrA; 
pntrA = (double **)mxCalloc(18, sizeof(double*));
for (i=0; i <18; i++){
    pntrA[i] = (double *) mxCalloc(18, sizeof(double));
}
double *pntrAinvB;
pntrAinvB = (double *) mxCalloc(18, sizeof(double));


/*used by MatrInv()*/
double **CopyOfMatrix; 
CopyOfMatrix = (double**)mxCalloc(18, sizeof(double*));
for (i=0; i<18; i++){
    CopyOfMatrix[i] = (double*)mxCalloc(18, sizeof(double));
  }

double *vector;
double *col;
int *indx;
vector = (double *)mxCalloc(18, sizeof(double));
indx = (int *)mxCalloc(18, sizeof(int));
col = (double *)mxCalloc(18, sizeof(double));

/*used by CalculateY()*/
double *Aat; 
Aat = (double*)mxCalloc(18*18, sizeof(double));

double *Aate; 
Aate = (double*)mxCalloc(18*18, sizeof(double));

double *sum;
sum = (double*)mxCalloc(18, sizeof(double));
double *product;
product = (double*)mxCalloc(18, sizeof(double));

/*Reassign values passed from input struct*/ 
temp = mxGetFieldByNumber(prhs[0],0,0);
voxelPars.ampl = mxGetPr(temp);
temp = mxGetFieldByNumber(prhs[0],0,1);
voxelPars.phi = mxGetPr(temp);
temp  =  mxGetFieldByNumber(prhs[0],0,2);
voxelPars.count1 = mxGetScalar(temp);
temp = mxGetFieldByNumber(prhs[0],0,3);
voxelPars.pw1 = mxGetScalar(temp);
temp = mxGetFieldByNumber(prhs[0],0,4);
voxelPars.b1 = mxGetScalar(temp);
temp = mxGetFieldByNumber(prhs[0],0,5);
voxelPars.puloffsetppm = mxGetScalar(temp);
temp = mxGetFieldByNumber(prhs[0],0,6);
voxelPars.pw1dc = mxGetScalar(temp);
temp = mxGetFieldByNumber(prhs[0],0,7);
voxelPars.cf = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,8);
voxelPars.M0w = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,9);
voxelPars.T1a = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,10);
voxelPars.T2a = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,11);
voxelPars.offsetappm = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,12);
voxelPars.bwfraction = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,13);
voxelPars.M0a = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,14);
voxelPars.M0bw = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,15);
voxelPars.T1bw = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,16);
voxelPars.T2bw = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,17);
voxelPars.offsetbwppm = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,18);
voxelPars.exratebw = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,19);
voxelPars.M0b = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,20);
voxelPars.T1b = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,21);
voxelPars.T2b = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,22);
voxelPars.offsetbppm = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,23);
voxelPars.exrateb = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,24);
voxelPars.M0c = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,25);
voxelPars.T1c = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,26);
voxelPars.T2c = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,27);
voxelPars.offsetcppm = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,28);
voxelPars.exratec = mxGetScalar(temp);
temp = mxGetFieldByNumber(prhs[0],0,29);
voxelPars.M0d = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,30);
voxelPars.T1d = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,31);
voxelPars.T2d = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,32);
voxelPars.offsetdppm = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,33);
voxelPars.exrated = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,34);
voxelPars.M0e = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,35);
voxelPars.T1e = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,36);
voxelPars.T2e = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,37);
voxelPars.offseteppm = mxGetScalar(temp); 
temp = mxGetFieldByNumber(prhs[0],0,38);
voxelPars.exratee = mxGetScalar(temp); 

point2pars = &voxelPars;


/*Initialize contrast's values to zero*/
ptr2contrast->SimMza = ptr2contrast->SimMzb = ptr2contrast->contrastz = 1.0;

/* call calculate()*/
calculate(point2pars, ptr2contrast, pntrA, pntrAinvB, CopyOfMatrix,vector,  col, indx, Aat, Aate, sum, product);
*output = ptr2contrast->contrastz; 
//mexPrintf("The contrast has been calculated as :%f.\n",     ptr2contrast->contrastz);


/*Free memory
 mxFree(pntrA); mxFree(pntrAinvB);*/
 mxFree(CopyOfMatrix); mxFree(Aat); mxFree(Aate);
 mxFree(vector);mxFree(indx);mxFree(col);
 mxFree(sum); mxFree(product);
}//end mex function   



void calculate (struct InputParameters *voxelPars1, struct Zcontrast       *contrast,  double **pntrA,double *pntrAinvB, double **CopyOfMatrix, double *vector, double *col,int *indx, double *Aat, double *Aate, double *sum, double *product){  

/*Local variables*/
int i, j, k, n1, m; 
double pwms, pw1delay;
int npul; 
double timeStepSize; 
double W;
double Wa, Wbw, Wb, Wc, Wd, We;
double Cbw, Cb, Cc, Cd, Ce;
double M0a, M0bw, M0b, M0c, M0d, M0e; //declaring these as locals just to make life easier
double pa, pbw, pb, pc, pd, pe;
double Cabw, Cab, Cac, Cad, Cae;
double k1a, k1bw, k1b, k1c, k1d, k1e;
double k2a, k2bw, k2b, k2c, k2d, k2e;

//     double **pntrA; 
//     pntrA = (double **)mxCalloc(18, sizeof(double*));
//     for (i=0; i <18; i++){
//         pntrA[i] = (double *) mxCalloc(18, sizeof(double));


  //     }
   //           
    double AdinvB[18];
    double AinvB[18];
    for(i=0; i<18; i++){
          AdinvB[i] = 0.0;
         AinvB[i] = 0.0;
     }
//    
// /*Passed variables memory allocation*/
//     double **CopyOfMatrix; /*used by MatriInv()*/
//     CopyOfMatrix = (double**)mxCalloc(18, sizeof(double*));
//     for (i=0; i<18; i++){
//         CopyOfMatrix[i] = (double*)mxCalloc(18, sizeof(double));
//       }
//     
//     double *vector;
//     double *col;
//     int *indx;
//     vector = (double *)mxCalloc(18, sizeof(double));
//     indx = (int *)mxCalloc(18, sizeof(int));
//     col = (double *)mxCalloc(18, sizeof(double));
//     
//     /*used by CalculateY()*/
//     double *Aat; 
//     Aat = (double*)mxCalloc(18*18, sizeof(double));
//    /* for (i=0; i<18; i++){
//         Aat[i] = (double*)mxCalloc(18, sizeof(double));
//     }*/
//     
//     double *Aate; 
//     Aate = (double*)mxCalloc(18*18, sizeof(double));
//    /* for (i=0; i<18; i++){
//         Aate[i] = (double*)mxCalloc(18, sizeof(double));
//     }   */
//      
//     double *sum;
//     sum = (double*)mxCalloc(18, sizeof(double));
//     double *product;
//     product = (double*)mxCalloc(18, sizeof(double));

    double W1[voxelPars1->count1]; //Now THIS might require a pointer, since count1 is only passed in the argument struct. We'll see.
    double W1x[voxelPars1->count1];
    double W1y[voxelPars1->count1];
   /* mexPrintf("Declared all local variables in calculate().\n");
    mexEvalString("drawnow;");*/

    /*Fill local variables with values from input struct voxelPars*/

……（后面的计算代码从略）

Answer 1

我最近遇到过类似的问题。虽然我对 Matlab 的内存管理机制并不了解，但可以说，就我的情况而言，改用 C 原生的 malloc/free 带来了大约5倍的加速。

似乎在你的情况下，也没有必要在Matlab堆上管理内存（使用mxCalloc和类似的东西） - 因为内存分配仅供mex-function内部使用。建议你尝试一下：只需将mxCalloc替换为malloc，mxFree替换为free。

mex函数执行时间的巨大差异取决于内存分配的位置

1 个答案: