从C程序到Cuda C.

时间:2015-11-02 09:24:00

标签: c cuda

我开始将C程序转换为Cuda C代码。我想使用像kernel这样的外部方法。我添加strcmp函数,如here,但程序返回错误__shared__ char myRet[twoBitSz]; expression must have a constant value。我不确定转换是否准确,我接受有关代码的建议。 C源代码是:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <math.h>
#include <stdbool.h>
#include <ctype.h>

#define BUFSIZ 1024
#define ARRAYSIZE(x)  (sizeof(x)/sizeof(*(x)))

double calculateMLpa(const char *Xn_val[], char *traj[], double value[], double alphaxixj, double tauxi, int sz, int dim) {
    double mlx = 0;
    double v;
    double alphaxi;
    char *state;
    int i;
    int p;
    int j;
    int k;
    double trns[sz][sz];
    double m[sz];
    char *trat="-";

    // m[xi] values: the number of transitions leaving the state xi
    printf("%d %d \n",sz,dim);
    int cont=0;
    for (i = 0; i < sz; i++) {
        m[i] = 0.0;
        for (j = 0; j < sz; j++) {
            v = 0.0;
            int newlength = strlen(Xn_val[i])+strlen(trat)+strlen(Xn_val[j])+1;
            state = malloc(sizeof(char)*newlength);
            if(state != NULL){
                state[0] = '\0';
                strcat(state,Xn_val[i]);
                strcat(state,trat);
                strcat(state,Xn_val[j]);
                //  printf("%s ",state);
            }else {
                printf(stderr,"malloc failed!\n");
            }

            if (traj[cont] != NULL ){
                if (strcmp(traj[cont],state)==0){
                    v = value[cont];
                    //      printf("%f \n",v);
                }
            }

            trns[i][j] = v;

            printf("%f - \n",trns[i][j]);
            if (strcmp(Xn_val[i],Xn_val[j])!=0)
                m[i] = m[i] + v;

            cont++;
        }
    }
    for (i=0;i<sz;++i){
        for(j=0;j<sz;++j){

            printf("%f ",trns[i][j]);
        }
        printf("\n");
    }


    for (p=0;p<sz;++p){
        printf("%f - \n",m[p]);
    }
    alphaxi = alphaxixj * (((double) sz) - 1.0);
    alphaxi = alphaxixj;
    //printf("%d ",sz);
    for (i = 0; i < sz; i++) {
        for (j = 0; j < sz; j++) {
            // xi!=xj
            if (strcmp(Xn_val[i], Xn_val[j])!=0) {
                mlx = mlx + lgamma(alphaxixj + trns[i][j]) - lgamma(alphaxixj);
            }
            // xi
            else {
                mlx = mlx + lgamma(alphaxi) - lgamma(alphaxi + m[i]);
                mlx = mlx + lgamma(alphaxi + m[i] + 1.0)+ (alphaxi + 1.0) * log(tauxi);
                mlx = mlx - lgamma(alphaxi + 1.0)- (alphaxi + m[i] + 1.0) * log(tauxi + trns[i][j]);
            }
        }
    }

    return (mlx);
}

void main() {
    printf("inizio\n");
    FILE *pf;
    int N=20;
    char fName[2083];
    char *a[]={"0","1","2"};
    char *traject[]={"0-0","0-1","0-2","1-0","1-1","1-2","2-0","2-1","2-2"};
    //char *str;
    char *trat="-";
    int file,Epoca;
    for (Epoca=0;Epoca<3;Epoca++){
        double bs=0;
        for (file=0;file<4;++file){
            int f,j,kk=0,k=0,i;
            sprintf(fName, "//home//user//prova%d.csv",file);
            pf=fopen(fName,"r");
            char *X;
            char *PaX;
            int Time;
            char *pa;
            char *xixj;
            float val;
            char buffer[BUFSIZ], *ptr;
            if (pf)
            {

                /*
                 * Read each line from the file.
                 */
                while(fgets(buffer, sizeof buffer, pf)){
                    k++;
                }
                fclose(pf);
            }
            double Value[k];
            pf=fopen(fName,"r");
            if(pf)
            {
                printf("k=%d\n",k);
                char *state[k];
                while(fgets(buffer, sizeof buffer, pf))
                {
                    //k=0;
                    printf("start csv \n");


                    char *token;
                    char *ptr = buffer;
                    const char end[2]=",";//fgets(buffer, sizeof buffer, pf);
                    token = strtok(ptr, end);
                    f=0;
                    /* walk through other tokens */
                    while( token != NULL )
                    {

                        if(f==0){
                            X=token;
                            printf( "X %s\n", token );
                        }else if(f==1){
                            PaX=token;
                            printf( "PaX %s\n", token );
                        }
                        else if(f==2){
                            Time=(token);
                            printf( "Time %s \n", token );

                        }
                        else if(f==3){
                            pa=token;
                            printf( "pa %s \n", token );

                        }
                        else if(f==4){
                            xixj=(token);
                            printf( "xixj %s \n", token );

                        }
                        else{
                            //  char *str;
                            Value[kk]=strtod(&token[1], NULL);
                            printf("Value %f \n", Value[kk]);
                            kk++;

                        }
                        token = strtok(NULL, end);
                        f++;

                    }

                }
                //  file0
                double new_value0[(k-1)/3];
                double new_value1[(k-1)/3];
                double new_value2[(k-1)/3];
                //  file1,2
                double new_value00[(k-1)/6];
                double new_value01[(k-1)/6];
                double new_value10[(k-1)/6];
                double new_value11[(k-1)/6];
                double new_value20[(k-1)/6];
                double new_value21[(k-1)/6];
                // file 3
                double new_value000[(k-1)/12];
                double new_value001[(k-1)/12];
                double new_value010[(k-1)/12];
                double new_value011[(k-1)/12];
                double new_value100[(k-1)/12];
                double new_value101[(k-1)/12];
                double new_value110[(k-1)/12];
                double new_value111[(k-1)/12];
                double new_value200[(k-1)/12];
                double new_value201[(k-1)/12];
                double new_value210[(k-1)/12];
                double new_value211[(k-1)/12];
                //
                if (file==0){
                    for (i=0;i<(k-1)/3;++i){
                        new_value0[i]=Value[i+1];
                        new_value1[i]=Value[i + 1+((k-1)/3)];
                        new_value2[i]=Value[i + (1+ 2*(k)/3)];
                        printf(" new_value- %d - %f - %f - %f \n",i,new_value0[i],new_value1[i],new_value2[i]);
                    }
                }else if(file==1 || file==2){
                    for (i=0; i<(k-1)/6;++i)
                    {
                        new_value00[i]=Value[i+1];
                        new_value01[i]=Value[i+ ((k-1)/6)+1];
                        new_value10[i]=Value[i+ (2*(k-1)/6)+1];
                        new_value11[i]=Value[i+ (3*(k-1)/6)+1];
                        new_value20[i]=Value[i+ (4*(k-1)/6)+1];
                        new_value21[i]=Value[i+ (5*(k-1)/6)+1];

                        printf(" new_value- %d - %f - %f - %f - %f - %f - %f \n",i,new_value00[i],new_value01[i],new_value10[i],new_value11[i],new_value20[i],new_value21[i]);
                    }
                }else{
                    for (i=0; i<(k-1)/12;++i)
                    {
                        new_value000[i]=Value[i+1];
                        new_value001[i]=Value[i+ ((k-1)/12)+1];
                        new_value010[i]=Value[i+ (2*(k-1)/12)+1];
                        new_value011[i]=Value[i+ (3*(k-1)/12)+1];
                        new_value100[i]=Value[i+ (4*(k-1)/12)+1];
                        new_value101[i]=Value[i+ (5*(k-1)/12)+1];
                        new_value110[i]=Value[i+ (6*(k-1)/12)+1];
                        new_value111[i]=Value[i+ (7*(k-1)/12)+1];
                        new_value200[i]=Value[i+ (8*(k-1)/12)+1];
                        new_value201[i]=Value[i+ (9*(k-1)/12)+1];
                        new_value210[i]=Value[i+ (10*(k-1)/12)+1];
                        new_value211[i]=Value[i+ (11*(k-1)/12)+1];

                        printf(" new_value- %d - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f \n",i,new_value000[i],new_value001[i],new_value010[i],new_value011[i],new_value100[i],new_value101[i],new_value110[i],new_value111[i],new_value200[i],new_value201[i],new_value210[i],new_value211[i]);
                    }

                }


                printf("\nstart\n");
                int sz=ARRAYSIZE(a);
                int dim=ARRAYSIZE(traject);

                if(Epoca==0 && file==0)
                {
                    bs=bs+calculateMLpa(a,traject,new_value0,1.0,0.1,sz,dim);
                }else if(Epoca==0 && (file==1 || file==2)){
                    bs=bs+calculateMLpa(a,traject,new_value00,1.0,0.1,sz,dim);
                    bs=bs+calculateMLpa(a,traject,new_value01,1.0,0.1,sz,dim);
                }else if(Epoca==0 && file==3){
                    bs=bs+calculateMLpa(a,traject,new_value000,1.0,0.1,sz,dim);
                    bs=bs+calculateMLpa(a,traject,new_value001,1.0,0.1,sz,dim);
                    bs=bs+calculateMLpa(a,traject,new_value010,1.0,0.1,sz,dim);
                    bs=bs+calculateMLpa(a,traject,new_value011,1.0,0.1,sz,dim);
                }

                else if(Epoca==1 && file==0)
                {
                    bs=bs+calculateMLpa(a,traject,new_value1,1.0,0.1,sz,dim);
                }else if(Epoca==1 && (file==1 || file==2)){
                    bs=bs+calculateMLpa(a,traject,new_value10,1.0,0.1,sz,dim);
                    bs=bs+calculateMLpa(a,traject,new_value11,1.0,0.1,sz,dim);
                }else if(Epoca==1 && file==3){
                    bs=bs+calculateMLpa(a,traject,new_value100,1.0,0.1,sz,dim);
                    bs=bs+calculateMLpa(a,traject,new_value101,1.0,0.1,sz,dim);
                    bs=bs+calculateMLpa(a,traject,new_value110,1.0,0.1,sz,dim);
                    bs=bs+calculateMLpa(a,traject,new_value111,1.0,0.1,sz,dim);
                }

                else if(Epoca==2 && file==0)
                {
                    bs=bs+calculateMLpa(a,traject,new_value2,1.0,0.1,sz,dim);
                }else if(Epoca==2 && (file==1 || file==2)){
                    bs=bs+calculateMLpa(a,traject,new_value20,1.0,0.1,sz,dim);
                    bs=bs+calculateMLpa(a,traject,new_value21,1.0,0.1,sz,dim);
                }else if(Epoca==2 && file==3){
                    bs=bs+calculateMLpa(a,traject,new_value200,1.0,0.1,sz,dim);
                    bs=bs+calculateMLpa(a,traject,new_value201,1.0,0.1,sz,dim);
                    bs=bs+calculateMLpa(a,traject,new_value210,1.0,0.1,sz,dim);
                    bs=bs+calculateMLpa(a,traject,new_value211,1.0,0.1,sz,dim);
                }
                printf("\ndone file%d epoca%d\n",file,Epoca);
                printf("%f\n ",bs);
                fclose(pf);
            }

            else /* fopen() returned NULL */
            {
                perror(pf);
            }
        }
    }
}

Cuda C转换代码:

#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include <cuda_runtime_api.h>
#include <string.h>
#include <sys/time.h>
#include <stdbool.h>
#include <ctype.h>

    __device__ int strcmp (const char * src, int lenSrc, const char * dst, int lenDest)
{
   //Get how long the longer of the arrays is.
   int maxLen=(lenSrc>lenDest)?lenSrc:lenDest;

   int twoBitSz = maxLen>>2; // twoBitSz = maxLen/4
   //if not div. by 4, add one
   if (twoBitSz & 0x10 != 0x0) twoBitSz++;

   //Shared mem. for results compilation.
   //__shared__ char myRet[twoBitSz];
   char *myRet = (char*)malloc(twoBitSz*sizeof(char));

   //Thread's idx, subidx
   int el = threadIdx.x + blockDim.x * blockIdx.x;
   int subIdx = el & 0x3;
   int idx = el >> 4;

   //would normally yield pos. as src-null char > 0...
   //So or signed 1 if true, else or unsigned 0
   //Same op, so minimal thread divergence
   if ( el > lenDest && el < maxLen )
      myRet[idx] |= (0x1 << subIdx);
   else
      myRet[0] |= 0x0;

   //would normally yield neg. as null char - dest < 0...
   //So or signed -1 if true, else or unsigned 0
   //Same op, so minimal thread divergence
   if ( el > lenSrc && el < maxLen )
      myRet[idx] |= (0x2 << subIdx);
   else
      myRet[0] |= 0x0;

   //TODO: Code to coallesce bit subtotals
   //for loop
   //warp level coalesce.

   __syncthreads();
   return( myRet[0] );
}




int main (void){
    printf("START");
    FILE *pf;
    const int N=1024;
    char fName[2083];
    char *parents[3]={"0","1","2"};
    char *traject[9]={"0-0","0-1","0-2","1-0","1-1","1-2","2-0","2-1","2-2"};
    size_t parents_len;
    size_t traject_len;
    parents_len=sizeof(char)/sizeof(parents[0]);
    traject_len=sizeof(char)/sizeof(traject[0]);
    //possibile malloc

    //pointer host to memory
    char **parents_dev;
    char **traject_dev;

    //allocate on device
    cudaMalloc((void **)&parents_dev,sizeof(char**)*parents_len);
    cudaMalloc((void **)&traject_dev,sizeof(char**)*traject_len);

    //host to Device
    cudaMemcpy(parents_dev,parents,sizeof(char**)*parents_len,cudaMemcpyHostToDevice);
    cudaMemcpy(traject_dev,traject,sizeof(char**)*traject_len,cudaMemcpyHostToDevice);

    //Loop start
    int file,Epoca;
    for(Epoca=0; Epoca<3; Epoca++){
        float *bs;
        float *bs_dev;
        //size_t size = N * sizeof(float);
        bs=(float *)malloc(N * sizeof(float));
        cudaMalloc((void **)&bs_dev, N * sizeof(float));
        cudaMemcpy(bs_dev,bs,N*sizeof(float), cudaMemcpyHostToDevice);

        for(file=0;file<4;file++){
            int f, i, j, file_size=0, kk=0;
            //file IO
            sprintf(fName, "//home//user//prova%d.csv",file);
            pf=fopen(fName,"r");
            char *X;
            char *PaX;
            int Time;
            char *pa;
            char *xixj;
            float val;
            char buffer[BUFSIZ], *ptr;
            if (pf)
            {

                /*
                 * Read each line from the file.
                 */
                while(fgets(buffer, sizeof buffer, pf)){
                    file_size++;
                }
                fclose(pf);
            }
            //variabile per kernel
            float *Value, *Value_dev;
            Value=(float *)malloc(file_size*N);
            cudaMalloc((void **)&Value_dev, N * file_size);

            //

            pf=fopen(fName,"r");
            if(pf)
            {
                printf("numero righe file = %d\n",file_size);
                char *state[file_size];
                while(fgets(buffer, sizeof buffer, pf))
                {
                    printf("start csv \n");
                    char *token;
                    char *ptr = buffer;
                    const char end[2]=",";//fgets(buffer, sizeof buffer, pf);
                    token = strtok(ptr, end);
                    f=0;
                    /* walk through other tokens */
                    while( token != NULL )
                    {

                        if(f==0){
                            X=token;
                            printf( "X %s\n", token );
                        }else if(f==1){
                            PaX=token;
                            printf( "PaX %s\n", token );
                        }
                        else if(f==2){
                            Time=strtod(token,NULL);
                            printf( "Time %f \n", token );

                        }
                        else if(f==3){
                            pa=token;
                            printf( "pa %s \n", token );

                        }
                        else if(f==4){
                            xixj=(token);
                            printf( "xixj %s \n", token );

                        }
                        else{
                            Value[kk]=strtod(&token[1], NULL);
                            printf("Value %f \n", Value[kk]);
                            kk++;

                        }
                        token = strtok(NULL, end);
                        f++;

                    }

                }
                cudaMemcpy(Value_dev,Value,N*sizeof(file_size), cudaMemcpyHostToDevice);

                //variable of kernel
                //no parent
                float *new_value0, *new_value0_dev;
                new_value0=(float *)malloc(file_size*N/3);
                cudaMalloc((void **)&new_value0_dev, N * file_size/3);
                //
                float *new_value1, *new_value1_dev;
                new_value1=(float *)malloc(file_size*N/3);
                cudaMalloc((void **)&new_value1_dev, N * file_size/3);
                //
                float *new_value2, *new_value2_dev;
                new_value2=(float *)malloc(file_size*N/3);
                cudaMalloc((void **)&new_value2_dev, N * file_size/3);
                //
                //one parent 1,2
                float *new_value00, *new_value00_dev;
                new_value00=(float *)malloc(file_size*N/6);
                cudaMalloc((void **)&new_value0_dev, N * file_size/6);
                //
                float *new_value01, *new_value01_dev;
                new_value01=(float *)malloc(file_size*N/6);
                cudaMalloc((void **)&new_value1_dev, N * file_size/6);
                //
                float *new_value10, *new_value10_dev;
                new_value10=(float *)malloc(file_size*N/6);
                cudaMalloc((void **)&new_value10_dev, N * file_size/6);
                //
                float *new_value11, *new_value11_dev;
                new_value11=(float *)malloc(file_size*N/6);
                cudaMalloc((void **)&new_value11_dev, N * file_size/6);
                //
                float *new_value20, *new_value20_dev;
                new_value20=(float *)malloc(file_size*N/6);
                cudaMalloc((void **)&new_value20_dev, N * file_size/6);
                //
                float *new_value21, *new_value21_dev;
                new_value21=(float *)malloc(file_size*N/6);
                cudaMalloc((void **)&new_value21_dev, N * file_size/6);
                //
                //double parent
                float *new_value000, *new_value000_dev;
                new_value000=(float *)malloc(file_size*N/12);
                cudaMalloc((void **)&new_value000_dev, N * file_size/12);
                //
                float *new_value001, *new_value001_dev;
                new_value001=(float *)malloc(file_size*N/12);
                cudaMalloc((void **)&new_value001_dev, N * file_size/12);
                //
                float *new_value010, *new_value010_dev;
                new_value010=(float *)malloc(file_size*N/12);
                cudaMalloc((void **)&new_value010_dev, N * file_size/12);
                //
                float *new_value011, *new_value011_dev;
                new_value011=(float *)malloc(file_size*N/12);
                cudaMalloc((void **)&new_value011_dev, N * file_size/12);
                //
                float *new_value100, *new_value100_dev;
                new_value100=(float *)malloc(file_size*N/12);
                cudaMalloc((void **)&new_value100_dev, N * file_size/12);
                //
                float *new_value101, *new_value101_dev;
                new_value101=(float *)malloc(file_size*N/12);
                cudaMalloc((void **)&new_value101_dev, N * file_size/12);
                //
                float *new_value110, *new_value110_dev;
                new_value110=(float *)malloc(file_size*N/12);
                cudaMalloc((void **)&new_value110_dev, N * file_size/12);
                //
                float *new_value111, *new_value111_dev;
                new_value111=(float *)malloc(file_size*N/12);
                cudaMalloc((void **)&new_value111_dev, N * file_size/12);
                //
                float *new_value200, *new_value200_dev;
                new_value200=(float *)malloc(file_size*N/12);
                cudaMalloc((void **)&new_value200_dev, N * file_size/12);
                //
                float *new_value201, *new_value201_dev;
                new_value201=(float *)malloc(file_size*N/12);
                cudaMalloc((void **)&new_value201_dev, N * file_size/12);
                //
                float *new_value210, *new_value210_dev;
                new_value210=(float *)malloc(file_size*N/12);
                cudaMalloc((void **)&new_value210_dev, N * file_size/12);
                //
                float *new_value211, *new_value211_dev;
                new_value211=(float *)malloc(file_size*N/12);
                cudaMalloc((void **)&new_value211_dev, N * file_size/12);
                //

                //insert in variable
                if (file==0){
                    for (i=0;i<(file_size-1)/3;++i){
                        new_value0[i]=Value[i+1];
                        cudaMemcpy(new_value0_dev,new_value0,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value1[i]=Value[i + 1+((file_size-1)/3)];
                        cudaMemcpy(new_value1_dev,new_value1,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value2[i]=Value[i + (1+ 2*(file_size)/3)];
                        cudaMemcpy(new_value2_dev,new_value2,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        printf(" new_value- %d - %f - %f - %f \n",i,new_value0[i],new_value1[i],new_value2[i]);
                    }
                }else if(file==1 || file==2){
                    for (i=0; i<(file_size-1)/6;++i)
                    {
                        new_value00[i]=Value[i+1];
                        cudaMemcpy(new_value00_dev,new_value00,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value01[i]=Value[i+ ((file_size-1)/6)+1];
                        cudaMemcpy(new_value01_dev,new_value01,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value10[i]=Value[i+ (2*(file_size-1)/6)+1];
                        cudaMemcpy(new_value10_dev,new_value10,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value11[i]=Value[i+ (3*(file_size-1)/6)+1];
                        cudaMemcpy(new_value11_dev,new_value11,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value20[i]=Value[i+ (4*(file_size-1)/6)+1];
                        cudaMemcpy(new_value20_dev,new_value20,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value21[i]=Value[i+ (5*(file_size-1)/6)+1];
                        cudaMemcpy(new_value21_dev,new_value21,N*sizeof(file_size), cudaMemcpyHostToDevice);

                        printf(" new_value- %d - %f - %f - %f - %f - %f - %f \n",i,new_value00[i],new_value01[i],new_value10[i],new_value11[i],new_value20[i],new_value21[i]);
                    }
                }else{
                    for (i=0; i<(file_size-1)/12;++i)
                    {
                        new_value000[i]=Value[i+1];
                        cudaMemcpy(new_value000_dev,new_value000,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value001[i]=Value[i+ ((file_size-1)/12)+1];
                        cudaMemcpy(new_value001_dev,new_value001,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value010[i]=Value[i+ (2*(file_size-1)/12)+1];
                        cudaMemcpy(new_value010_dev,new_value010,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value011[i]=Value[i+ (3*(file_size-1)/12)+1];
                        cudaMemcpy(new_value011_dev,new_value011,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value100[i]=Value[i+ (4*(file_size-1)/12)+1];
                        cudaMemcpy(new_value100_dev,new_value100,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value101[i]=Value[i+ (5*(file_size-1)/12)+1];
                        cudaMemcpy(new_value101_dev,new_value101,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value110[i]=Value[i+ (6*(file_size-1)/12)+1];
                        cudaMemcpy(new_value110_dev,new_value110,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value111[i]=Value[i+ (7*(file_size-1)/12)+1];
                        cudaMemcpy(new_value111_dev,new_value111,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value200[i]=Value[i+ (8*(file_size-1)/12)+1];
                        cudaMemcpy(new_value200_dev,new_value200,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value201[i]=Value[i+ (9*(file_size-1)/12)+1];
                        cudaMemcpy(new_value201_dev,new_value201,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value210[i]=Value[i+ (10*(file_size-1)/12)+1];
                        cudaMemcpy(new_value210_dev,new_value210,N*sizeof(file_size), cudaMemcpyHostToDevice);
                        new_value211[i]=Value[i+ (11*(file_size-1)/12)+1];
                        cudaMemcpy(new_value211_dev,new_value211,N*sizeof(file_size), cudaMemcpyHostToDevice);

                        printf(" new_value- %d - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f \n",i,new_value000[i],new_value001[i],new_value010[i],new_value011[i],new_value100[i],new_value101[i],new_value110[i],new_value111[i],new_value200[i],new_value201[i],new_value210[i],new_value211[i]);
                    }

                }

                //START computation
                printf("\nPRE KERNEL\n");
                const int sz=(sizeof(parents)/sizeof(*(parents)));
                const int dim=(sizeof(traject)/sizeof(*(traject)));
                printf("%d - %d ",sz, dim);

                //chiamata kernel

                int block_size = 8;
                int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
                float *trns, **trns_dev;
                trns=(float *)malloc(sz*N);
                cudaMalloc((void **)&trns_dev, N * sz);
                float *m, *m_dev;
                m=(float *)malloc(sz*N);
                cudaMalloc((void **)&m_dev, N * sz);
                calculateMLpa<<<n_blocks, block_size >>>(N,bs,parents_dev,traject_dev, new_value0_dev,1.0,0.1,sz,dim,trns_dev,m_dev);



            }
            //cudaMemcpy(trns,trns_dev,N*sz,cudaMemcpyDeviceToHost);



        }

    }
    free(parents_dev);
    free(traject_dev);
    //free(trns_dev);

}

内核:

   __global__  void calculateMLpa(int N, float *bs, char **Xn_val, char **traj, float *value, float alphaxixj, float tauxi, const int sz, int dim, float** trns, float *m){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx<N){
        float mlx = 0;
        float v;
        float alphaxi;
        char *state;
        int i, p, j, k;
        char *trat="-";
        int cont=0;
        printf("%d %d \n",sz,dim);
        for (i = 0; i < sz; i++) {
            m[i] = 0;
            for (j = 0; j < sz; j++) {
                v = 0.0;
                int newlength = strlen(Xn_val[i]) + strlen(trat) + strlen(Xn_val[j])+1;
                state =(char *) malloc(sizeof(char) * newlength);
                if(state != NULL){
                    state[0] = '\0';
                    strcat(state,Xn_val[i]);
                    strcat(state,trat);
                    strcat(state,Xn_val[j]);
                    //  printf("%s ",state);
                }else {
                    perror("failed");
                }

                if (traj[cont] != NULL ){
                    if (strcmp(traj[cont],state)==0){
                        v = value[cont];
                        //      printf("%f \n",v);
                    }
                }

                trns[i][j] = v;

                printf("%f - \n",trns[i][j]);
                if (strcmp(Xn_val[i],Xn_val[j])!=0)
                    atomicAdd(&m[i], v);

                cont++;
            }
        }
        for (i=0;i<sz;++i){
            for(j=0;j<sz;++j){

                printf("%f ",trns[i][j]);
            }
            printf("\n");
        }


        for (p=0;p<sz;++p){
            printf("%f - \n",m[p]);
        }
        alphaxi = alphaxixj * (((double) sz) - 1.0);
        alphaxi = alphaxixj;
        //printf("%d ",sz);
        for (i = 0; i < sz; i++) {
            for (j = 0; j < sz; j++) {
                // xi!=xj
                if (strcmp(Xn_val[i], Xn_val[j])!=0) {
                    mlx = mlx + lgamma(alphaxixj + trns[i][j]) - lgamma(alphaxixj);
                }
                // xi
                else {
                    mlx = mlx + lgamma(alphaxi) - lgamma(alphaxi + m[i]);
                    mlx = mlx + lgamma(alphaxi + m[i] + 1.0)+ (alphaxi + 1.0) * log(tauxi);
                    mlx = mlx - lgamma(alphaxi + 1.0)- (alphaxi + m[i] + 1.0) * log(tauxi + trns[i][j]);
                }
            }
        }
        atomicAdd(&bs[idx], mlx);
    }
}

注意:调用内核的结果仍然没有出现,因为它缺少打印

0 个答案:

没有答案