C代码因嵌套for循环而运行缓慢

时间:2019-03-14 21:43:16

标签: performance loops for-loop nested

我的C程序运行缓慢(目前未并行化时大约需要40秒)。我尝试使用OpenMP后耗时大幅减少,但我希望用简单、自然的方法让代码运行得更快,而不是依赖并行for循环。代码的基本结构是:读取若干命令行参数并保存为变量,然后使用math.h和complex.h库递归计算一个名为Rplus1的变量。问题在于,大部分时间都花在了代码底部的嵌套for循环上。我的目标是让整个代码在5秒内运行完毕,但到目前为止,在不使用并行for循环的情况下仍需约40秒。请帮忙!

#include "time.h"
#include "stdio.h"
#include "stdlib.h"
#include "complex.h"
#include "math.h"
#include "string.h"
#include "unistd.h"
#include "omp.h"
#define PI 3.14159265


/* Side length of the square diffraction pattern and number of tabulated elements. */
enum { GRID = 1001, N_ELEMENTS = 10 };

/*
 * Read exactly `count` doubles from the binary file at `path` into `dst`.
 * Returns 0 on success; on failure reports the problem on stderr and
 * returns -1.
 */
static int load_doubles(const char *path, double *dst, size_t count)
{
    FILE *fp = fopen(path, "rb");
    if (fp == NULL) {
        perror(path);
        return -1;
    }

    size_t got = fread(dst, sizeof *dst, count, fp);
    fclose(fp);

    if (got != count) {
        fprintf(stderr, "%s: expected %zu doubles, read %zu\n", path, count, got);
        return -1;
    }
    return 0;
}

/*
 * Command line arguments are aligned in the following order:
 *   [theta] [number of layers in superlattice]
 *   [material_1] [lat const_1] [number of unit cells_1] ...
 *   [material_N] [lat const_N] [number of unit cells_N]
 *   [Log/Linear] [number of repeating superlattice layers] [yes/no]
 *
 * Builds per-layer, per-pixel coefficient tables (g, g_0, phi), applies the
 * recursive Rplus1 update once per unit cell of every layer, and prints how
 * long the set-up and the whole run took.
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE on bad arguments, unreadable
 * coefficient files or allocation failure.
 */
int main (int argc, char *argv[]){
    if (argc < 8) {
        fprintf(stderr,
                "usage: %s theta n_layers {material lat_const n_cells}... "
                "Log|Linear n_repeats yes|no\n",
                argc > 0 ? argv[0] : "program");
        return EXIT_FAILURE;
    }

    int rc = EXIT_FAILURE;

    /* Everything released at `cleanup` is declared up front so that every
       error path can jump there safely (free(NULL) is a no-op). */
    double *unit_cells = NULL;                 /* unit cells per layer           */
    double complex (*g)[GRID][GRID] = NULL;    /* per layer, per pixel g         */
    double complex (*g_0)[GRID][GRID] = NULL;  /* per layer, per pixel g_0       */
    double (*phi)[GRID][GRID] = NULL;          /* per layer, per pixel phi       */
    double complex (*Rplus1)[GRID] = NULL;     /* recursion state, one per pixel */

    const double start1 = omp_get_wtime();

    /* Number of layers in superlattice specified by second input argument. */
    int N;
    if (sscanf(argv[2], "%d", &N) != 1 || N < 1 || N >= argc) {
        fprintf(stderr, "invalid number of layers: %s\n", argv[2]);
        goto cleanup;
    }

    /* If the substrate is included then add one more layer. */
    if (strcmp(argv[argc - 1], "yes") == 0) {
        N = N + 1;
    }

    /* The last layer reads argv[3*N + 2]; make sure that index exists. */
    if (N > (argc - 3) / 3) {
        fprintf(stderr, "too few arguments for %d layers\n", N);
        goto cleanup;
    }

    /* Speed of light, Planck's constant, photon energy, r_0. */
    const double c_light = 299792458, h_pl = 4.135667516e-15, E = 10e3, r_0 = 2.818e-15;
    const double lam = (h_pl * c_light) / E;

    /* First argument is the angle of incidence in degrees; convert to radians. */
    double angle;
    if (sscanf(argv[1], "%lf", &angle) != 1) {
        fprintf(stderr, "invalid angle: %s\n", argv[1]);
        goto cleanup;
    }
    angle = angle * (PI / 180.0);

    /* Coefficients used to calculate the atomic scattering factors below. */
    double table[9][N_ELEMENTS];
    double table2[2][N_ELEMENTS];
    if (load_doubles("/home/vhosts/xraydev.engr.wisc.edu/data/coef_table.bin",
                     &table[0][0], sizeof table / sizeof table[0][0]) != 0 ||
        load_doubles("/home/vhosts/xraydev.engr.wisc.edu/data/dispersioncs.bin",
                     &table2[0][0], sizeof table2 / sizeof table2[0][0]) != 0) {
        goto cleanup;
    }

    /* Calculate scattering factors for all elements.  The angular dependence
       is incorporated through k_z, computed from the single incidence angle. */
    double complex SF_table[N_ELEMENTS];
    const double k_z = (sin(angle) / lam) * 1e-10;
    for (int e = 0; e < N_ELEMENTS; e++) {
        double complex sf = 0;
        for (int t = 0; t < 4; t++) {
            sf = sf + table[2 * t][e] * exp(-table[2 * t + 1][e] * k_z * k_z);
        }
        SF_table[e] = sf + table[8][e] + table2[0][e] + table2[1][e] * I;
    }

    /* calloc: coefficients of a layer with an unrecognised material stay 0
       instead of being read uninitialised, and Rplus1 starts at 0. */
    unit_cells = malloc((size_t)N * sizeof *unit_cells);
    g = calloc((size_t)N, sizeof *g);
    g_0 = calloc((size_t)N, sizeof *g_0);
    phi = calloc((size_t)N, sizeof *phi);
    Rplus1 = calloc(GRID, sizeof *Rplus1);
    if (unit_cells == NULL || g == NULL || g_0 == NULL || phi == NULL || Rplus1 == NULL) {
        fprintf(stderr, "out of memory\n");
        goto cleanup;
    }

    const double mm = 4.0;
    const double l = 4;  /* reciprocal-space l; h and k are 0 */

    /* These factors do not depend on the layer or the pixel. */
    const double complex lattice_sum = 2 + 2 * cexp(I * PI * l);
    const double complex half_shift = cexp(I * PI * l / 2);
    const double polarisation = cos(2 * angle);

    for (int layer = 0; layer < N; layer++) {  /* for each layer of material... */
        const char *material = argv[3 * (layer + 1)];
        double lat_const;

        /* Out-of-plane lattice constant and number of unit cells in the layer. */
        if (sscanf(argv[3 * (layer + 1) + 1], "%lf", &lat_const) != 1 ||
            sscanf(argv[3 * (layer + 1) + 2], "%lf", &unit_cells[layer]) != 1) {
            fprintf(stderr, "invalid lattice constant or unit-cell count for layer %d\n",
                    layer + 1);
            goto cleanup;
        }
        lat_const = lat_const * 1e-10;  /* convert lat const input to meters */

        const double phi_val = 2 * PI * lat_const * sin(angle) / lam;

        /* Unit-cell structure factors for the selected material. */
        double complex F_hkl, F_0;
        int known = 1;
        if (strcmp(material, "GaAs") == 0) {
            F_hkl = lattice_sum * (SF_table[2] + SF_table[3] * half_shift);
            F_0 = 0.5 * 8.0 * (31 + table2[0][2] + table2[1][2] * I)
                + 0.5 * 8.0 * (33 + table2[0][3] + table2[1][3] * I);
        } else if (strcmp(material, "AlGaAs") == 0) {
            F_hkl = lattice_sum * ((0.76 * SF_table[2] + 0.24 * SF_table[4]) + SF_table[3] * half_shift);
            F_0 = 0.24 * 4.0 * (13 + table2[0][4] + table2[1][4] * I)
                + 0.76 * 4.0 * (31 + table2[0][2] + table2[1][2] * I)
                + 4.0 * (33 + table2[0][3] + table2[1][3] * I);
        } else {
            known = 0;
            F_hkl = 0;
            F_0 = 0;
            fprintf(stderr, "warning: unknown material '%s' for layer %d; g and g_0 left at 0\n",
                    material, layer + 1);
        }

        const double complex g_val = 2 * r_0 * F_hkl / mm / lat_const * polarisation;
        const double complex g0_val = 2 * r_0 * F_0 / mm / lat_const;

        /* With one scalar incidence angle every pixel of a layer receives the
           same values, so they are computed once above and only stored here. */
        for (int m = 0; m < GRID; m++) {
            for (int n = 0; n < GRID; n++) {
                phi[layer][m][n] = phi_val;
                if (known) {
                    g[layer][m][n] = g_val;
                    g_0[layer][m][n] = g0_val;
                }
            }
        }
    }

    const double stop1 = omp_get_wtime();

    /*
     * Recursive update of Rplus1, one step per unit cell of each layer.
     * Each pixel's recursion only reads its own previous value, so the
     * unit-cell loop runs innermost: the trigonometric phase and the squared
     * numerator are evaluated once per pixel per layer rather than once per
     * unit cell, and the running value stays in a local.
     *
     * NOTE(review): the loop starts at layer index 1, so the first layer given
     * on the command line never contributes - confirm this is intended.
     * NOTE(review): Rplus1 starts at 0, so the first step of every pixel
     * evaluates phase/0; the outcome depends on how the implementation's
     * complex division handles that - confirm this seed is intended.
     */
    for (int layer = 1; layer < N; layer++) {
        const double cells = unit_cells[layer];

        for (int m = 0; m < GRID; m++) {      /* for each row of the diffraction pattern */
            for (int n = 0; n < GRID; n++) {  /* for each column of the diffraction pattern */
                const double complex gv = g[layer][m][n];
                const double complex one_minus_ig0 = 1 - I * g_0[layer][m][n];
                const double complex numer = one_minus_ig0 * one_minus_ig0;
                const double ph = phi[layer][m][n];
                const double complex phase = cos(-2 * ph) + I * sin(-2 * ph);

                double complex r = Rplus1[m][n];
                for (int j = 0; j < cells; j++) {  /* for each unit cell */
                    r = -I * gv + numer / (I * gv + phase / r);
                }
                Rplus1[m][n] = r;
            }
        }
    }

    const double stop2 = omp_get_wtime();

    /* Both intervals are measured from the start of main(). */
    const double elapsed1 = stop1 - start1;
    const double elapsed2 = stop2 - start1;
    printf("main() through before diffraction function took %f seconds to run\n\n",elapsed1);
    printf("main() through after diffraction function took %f seconds to run\n\n",elapsed2);

    rc = EXIT_SUCCESS;

cleanup:
    free(Rplus1);
    free(phi);
    free(g_0);
    free(g);
    free(unit_cells);
    return rc;
}

0 个答案:

没有答案