我开始将C程序转换为Cuda C代码。我想使用像kernel这样的外部方法。我添加strcmp
函数,如here,但程序返回错误__shared__ char myRet[twoBitSz]; expression must have a constant value
。我不确定转换是否准确,我接受有关代码的建议。
C源代码是:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <math.h>
#include <stdbool.h>
#include <ctype.h>
#define BUFSIZ 1024
#define ARRAYSIZE(x) (sizeof(x)/sizeof(*(x)))
double calculateMLpa(const char *Xn_val[], char *traj[], double value[], double alphaxixj, double tauxi, int sz, int dim) {
double mlx = 0;
double v;
double alphaxi;
char *state;
int i;
int p;
int j;
int k;
double trns[sz][sz];
double m[sz];
char *trat="-";
// m[xi] values: the number of transitions leaving the state xi
printf("%d %d \n",sz,dim);
int cont=0;
for (i = 0; i < sz; i++) {
m[i] = 0.0;
for (j = 0; j < sz; j++) {
v = 0.0;
int newlength = strlen(Xn_val[i])+strlen(trat)+strlen(Xn_val[j])+1;
state = malloc(sizeof(char)*newlength);
if(state != NULL){
state[0] = '\0';
strcat(state,Xn_val[i]);
strcat(state,trat);
strcat(state,Xn_val[j]);
// printf("%s ",state);
}else {
printf(stderr,"malloc failed!\n");
}
if (traj[cont] != NULL ){
if (strcmp(traj[cont],state)==0){
v = value[cont];
// printf("%f \n",v);
}
}
trns[i][j] = v;
printf("%f - \n",trns[i][j]);
if (strcmp(Xn_val[i],Xn_val[j])!=0)
m[i] = m[i] + v;
cont++;
}
}
for (i=0;i<sz;++i){
for(j=0;j<sz;++j){
printf("%f ",trns[i][j]);
}
printf("\n");
}
for (p=0;p<sz;++p){
printf("%f - \n",m[p]);
}
alphaxi = alphaxixj * (((double) sz) - 1.0);
alphaxi = alphaxixj;
//printf("%d ",sz);
for (i = 0; i < sz; i++) {
for (j = 0; j < sz; j++) {
// xi!=xj
if (strcmp(Xn_val[i], Xn_val[j])!=0) {
mlx = mlx + lgamma(alphaxixj + trns[i][j]) - lgamma(alphaxixj);
}
// xi
else {
mlx = mlx + lgamma(alphaxi) - lgamma(alphaxi + m[i]);
mlx = mlx + lgamma(alphaxi + m[i] + 1.0)+ (alphaxi + 1.0) * log(tauxi);
mlx = mlx - lgamma(alphaxi + 1.0)- (alphaxi + m[i] + 1.0) * log(tauxi + trns[i][j]);
}
}
}
return (mlx);
}
void main() {
printf("inizio\n");
FILE *pf;
int N=20;
char fName[2083];
char *a[]={"0","1","2"};
char *traject[]={"0-0","0-1","0-2","1-0","1-1","1-2","2-0","2-1","2-2"};
//char *str;
char *trat="-";
int file,Epoca;
for (Epoca=0;Epoca<3;Epoca++){
double bs=0;
for (file=0;file<4;++file){
int f,j,kk=0,k=0,i;
sprintf(fName, "//home//user//prova%d.csv",file);
pf=fopen(fName,"r");
char *X;
char *PaX;
int Time;
char *pa;
char *xixj;
float val;
char buffer[BUFSIZ], *ptr;
if (pf)
{
/*
* Read each line from the file.
*/
while(fgets(buffer, sizeof buffer, pf)){
k++;
}
fclose(pf);
}
double Value[k];
pf=fopen(fName,"r");
if(pf)
{
printf("k=%d\n",k);
char *state[k];
while(fgets(buffer, sizeof buffer, pf))
{
//k=0;
printf("start csv \n");
char *token;
char *ptr = buffer;
const char end[2]=",";//fgets(buffer, sizeof buffer, pf);
token = strtok(ptr, end);
f=0;
/* walk through other tokens */
while( token != NULL )
{
if(f==0){
X=token;
printf( "X %s\n", token );
}else if(f==1){
PaX=token;
printf( "PaX %s\n", token );
}
else if(f==2){
Time=(token);
printf( "Time %s \n", token );
}
else if(f==3){
pa=token;
printf( "pa %s \n", token );
}
else if(f==4){
xixj=(token);
printf( "xixj %s \n", token );
}
else{
// char *str;
Value[kk]=strtod(&token[1], NULL);
printf("Value %f \n", Value[kk]);
kk++;
}
token = strtok(NULL, end);
f++;
}
}
// file0
double new_value0[(k-1)/3];
double new_value1[(k-1)/3];
double new_value2[(k-1)/3];
// file1,2
double new_value00[(k-1)/6];
double new_value01[(k-1)/6];
double new_value10[(k-1)/6];
double new_value11[(k-1)/6];
double new_value20[(k-1)/6];
double new_value21[(k-1)/6];
// file 3
double new_value000[(k-1)/12];
double new_value001[(k-1)/12];
double new_value010[(k-1)/12];
double new_value011[(k-1)/12];
double new_value100[(k-1)/12];
double new_value101[(k-1)/12];
double new_value110[(k-1)/12];
double new_value111[(k-1)/12];
double new_value200[(k-1)/12];
double new_value201[(k-1)/12];
double new_value210[(k-1)/12];
double new_value211[(k-1)/12];
//
if (file==0){
for (i=0;i<(k-1)/3;++i){
new_value0[i]=Value[i+1];
new_value1[i]=Value[i + 1+((k-1)/3)];
new_value2[i]=Value[i + (1+ 2*(k)/3)];
printf(" new_value- %d - %f - %f - %f \n",i,new_value0[i],new_value1[i],new_value2[i]);
}
}else if(file==1 || file==2){
for (i=0; i<(k-1)/6;++i)
{
new_value00[i]=Value[i+1];
new_value01[i]=Value[i+ ((k-1)/6)+1];
new_value10[i]=Value[i+ (2*(k-1)/6)+1];
new_value11[i]=Value[i+ (3*(k-1)/6)+1];
new_value20[i]=Value[i+ (4*(k-1)/6)+1];
new_value21[i]=Value[i+ (5*(k-1)/6)+1];
printf(" new_value- %d - %f - %f - %f - %f - %f - %f \n",i,new_value00[i],new_value01[i],new_value10[i],new_value11[i],new_value20[i],new_value21[i]);
}
}else{
for (i=0; i<(k-1)/12;++i)
{
new_value000[i]=Value[i+1];
new_value001[i]=Value[i+ ((k-1)/12)+1];
new_value010[i]=Value[i+ (2*(k-1)/12)+1];
new_value011[i]=Value[i+ (3*(k-1)/12)+1];
new_value100[i]=Value[i+ (4*(k-1)/12)+1];
new_value101[i]=Value[i+ (5*(k-1)/12)+1];
new_value110[i]=Value[i+ (6*(k-1)/12)+1];
new_value111[i]=Value[i+ (7*(k-1)/12)+1];
new_value200[i]=Value[i+ (8*(k-1)/12)+1];
new_value201[i]=Value[i+ (9*(k-1)/12)+1];
new_value210[i]=Value[i+ (10*(k-1)/12)+1];
new_value211[i]=Value[i+ (11*(k-1)/12)+1];
printf(" new_value- %d - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f \n",i,new_value000[i],new_value001[i],new_value010[i],new_value011[i],new_value100[i],new_value101[i],new_value110[i],new_value111[i],new_value200[i],new_value201[i],new_value210[i],new_value211[i]);
}
}
printf("\nstart\n");
int sz=ARRAYSIZE(a);
int dim=ARRAYSIZE(traject);
if(Epoca==0 && file==0)
{
bs=bs+calculateMLpa(a,traject,new_value0,1.0,0.1,sz,dim);
}else if(Epoca==0 && (file==1 || file==2)){
bs=bs+calculateMLpa(a,traject,new_value00,1.0,0.1,sz,dim);
bs=bs+calculateMLpa(a,traject,new_value01,1.0,0.1,sz,dim);
}else if(Epoca==0 && file==3){
bs=bs+calculateMLpa(a,traject,new_value000,1.0,0.1,sz,dim);
bs=bs+calculateMLpa(a,traject,new_value001,1.0,0.1,sz,dim);
bs=bs+calculateMLpa(a,traject,new_value010,1.0,0.1,sz,dim);
bs=bs+calculateMLpa(a,traject,new_value011,1.0,0.1,sz,dim);
}
else if(Epoca==1 && file==0)
{
bs=bs+calculateMLpa(a,traject,new_value1,1.0,0.1,sz,dim);
}else if(Epoca==1 && (file==1 || file==2)){
bs=bs+calculateMLpa(a,traject,new_value10,1.0,0.1,sz,dim);
bs=bs+calculateMLpa(a,traject,new_value11,1.0,0.1,sz,dim);
}else if(Epoca==1 && file==3){
bs=bs+calculateMLpa(a,traject,new_value100,1.0,0.1,sz,dim);
bs=bs+calculateMLpa(a,traject,new_value101,1.0,0.1,sz,dim);
bs=bs+calculateMLpa(a,traject,new_value110,1.0,0.1,sz,dim);
bs=bs+calculateMLpa(a,traject,new_value111,1.0,0.1,sz,dim);
}
else if(Epoca==2 && file==0)
{
bs=bs+calculateMLpa(a,traject,new_value2,1.0,0.1,sz,dim);
}else if(Epoca==2 && (file==1 || file==2)){
bs=bs+calculateMLpa(a,traject,new_value20,1.0,0.1,sz,dim);
bs=bs+calculateMLpa(a,traject,new_value21,1.0,0.1,sz,dim);
}else if(Epoca==2 && file==3){
bs=bs+calculateMLpa(a,traject,new_value200,1.0,0.1,sz,dim);
bs=bs+calculateMLpa(a,traject,new_value201,1.0,0.1,sz,dim);
bs=bs+calculateMLpa(a,traject,new_value210,1.0,0.1,sz,dim);
bs=bs+calculateMLpa(a,traject,new_value211,1.0,0.1,sz,dim);
}
printf("\ndone file%d epoca%d\n",file,Epoca);
printf("%f\n ",bs);
fclose(pf);
}
else /* fopen() returned NULL */
{
perror(pf);
}
}
}
}
Cuda C转换代码:
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include <cuda_runtime_api.h>
#include <string.h>
#include <sys/time.h>
#include <stdbool.h>
#include <ctype.h>
__device__ int strcmp (const char * src, int lenSrc, const char * dst, int lenDest)
{
//Get how long the longer of the arrays is.
int maxLen=(lenSrc>lenDest)?lenSrc:lenDest;
int twoBitSz = maxLen>>2; // twoBitSz = maxLen/4
//if not div. by 4, add one
if (twoBitSz & 0x10 != 0x0) twoBitSz++;
//Shared mem. for results compilation.
//__shared__ char myRet[twoBitSz];
char *myRet = (char*)malloc(twoBitSz*sizeof(char));
//Thread's idx, subidx
int el = threadIdx.x + blockDim.x * blockIdx.x;
int subIdx = el & 0x3;
int idx = el >> 4;
//would normally yield pos. as src-null char > 0...
//So or signed 1 if true, else or unsigned 0
//Same op, so minimal thread divergence
if ( el > lenDest && el < maxLen )
myRet[idx] |= (0x1 << subIdx);
else
myRet[0] |= 0x0;
//would normally yield neg. as null char - dest < 0...
//So or signed -1 if true, else or unsigned 0
//Same op, so minimal thread divergence
if ( el > lenSrc && el < maxLen )
myRet[idx] |= (0x2 << subIdx);
else
myRet[0] |= 0x0;
//TODO: Code to coallesce bit subtotals
//for loop
//warp level coalesce.
__syncthreads();
return( myRet[0] );
}
int main (void){
printf("START");
FILE *pf;
const int N=1024;
char fName[2083];
char *parents[3]={"0","1","2"};
char *traject[9]={"0-0","0-1","0-2","1-0","1-1","1-2","2-0","2-1","2-2"};
size_t parents_len;
size_t traject_len;
parents_len=sizeof(char)/sizeof(parents[0]);
traject_len=sizeof(char)/sizeof(traject[0]);
//possibile malloc
//pointer host to memory
char **parents_dev;
char **traject_dev;
//allocate on device
cudaMalloc((void **)&parents_dev,sizeof(char**)*parents_len);
cudaMalloc((void **)&traject_dev,sizeof(char**)*traject_len);
//host to Device
cudaMemcpy(parents_dev,parents,sizeof(char**)*parents_len,cudaMemcpyHostToDevice);
cudaMemcpy(traject_dev,traject,sizeof(char**)*traject_len,cudaMemcpyHostToDevice);
//Loop start
int file,Epoca;
for(Epoca=0; Epoca<3; Epoca++){
float *bs;
float *bs_dev;
//size_t size = N * sizeof(float);
bs=(float *)malloc(N * sizeof(float));
cudaMalloc((void **)&bs_dev, N * sizeof(float));
cudaMemcpy(bs_dev,bs,N*sizeof(float), cudaMemcpyHostToDevice);
for(file=0;file<4;file++){
int f, i, j, file_size=0, kk=0;
//file IO
sprintf(fName, "//home//user//prova%d.csv",file);
pf=fopen(fName,"r");
char *X;
char *PaX;
int Time;
char *pa;
char *xixj;
float val;
char buffer[BUFSIZ], *ptr;
if (pf)
{
/*
* Read each line from the file.
*/
while(fgets(buffer, sizeof buffer, pf)){
file_size++;
}
fclose(pf);
}
//variabile per kernel
float *Value, *Value_dev;
Value=(float *)malloc(file_size*N);
cudaMalloc((void **)&Value_dev, N * file_size);
//
pf=fopen(fName,"r");
if(pf)
{
printf("numero righe file = %d\n",file_size);
char *state[file_size];
while(fgets(buffer, sizeof buffer, pf))
{
printf("start csv \n");
char *token;
char *ptr = buffer;
const char end[2]=",";//fgets(buffer, sizeof buffer, pf);
token = strtok(ptr, end);
f=0;
/* walk through other tokens */
while( token != NULL )
{
if(f==0){
X=token;
printf( "X %s\n", token );
}else if(f==1){
PaX=token;
printf( "PaX %s\n", token );
}
else if(f==2){
Time=strtod(token,NULL);
printf( "Time %f \n", token );
}
else if(f==3){
pa=token;
printf( "pa %s \n", token );
}
else if(f==4){
xixj=(token);
printf( "xixj %s \n", token );
}
else{
Value[kk]=strtod(&token[1], NULL);
printf("Value %f \n", Value[kk]);
kk++;
}
token = strtok(NULL, end);
f++;
}
}
cudaMemcpy(Value_dev,Value,N*sizeof(file_size), cudaMemcpyHostToDevice);
//variable of kernel
//no parent
float *new_value0, *new_value0_dev;
new_value0=(float *)malloc(file_size*N/3);
cudaMalloc((void **)&new_value0_dev, N * file_size/3);
//
float *new_value1, *new_value1_dev;
new_value1=(float *)malloc(file_size*N/3);
cudaMalloc((void **)&new_value1_dev, N * file_size/3);
//
float *new_value2, *new_value2_dev;
new_value2=(float *)malloc(file_size*N/3);
cudaMalloc((void **)&new_value2_dev, N * file_size/3);
//
//one parent 1,2
float *new_value00, *new_value00_dev;
new_value00=(float *)malloc(file_size*N/6);
cudaMalloc((void **)&new_value0_dev, N * file_size/6);
//
float *new_value01, *new_value01_dev;
new_value01=(float *)malloc(file_size*N/6);
cudaMalloc((void **)&new_value1_dev, N * file_size/6);
//
float *new_value10, *new_value10_dev;
new_value10=(float *)malloc(file_size*N/6);
cudaMalloc((void **)&new_value10_dev, N * file_size/6);
//
float *new_value11, *new_value11_dev;
new_value11=(float *)malloc(file_size*N/6);
cudaMalloc((void **)&new_value11_dev, N * file_size/6);
//
float *new_value20, *new_value20_dev;
new_value20=(float *)malloc(file_size*N/6);
cudaMalloc((void **)&new_value20_dev, N * file_size/6);
//
float *new_value21, *new_value21_dev;
new_value21=(float *)malloc(file_size*N/6);
cudaMalloc((void **)&new_value21_dev, N * file_size/6);
//
//double parent
float *new_value000, *new_value000_dev;
new_value000=(float *)malloc(file_size*N/12);
cudaMalloc((void **)&new_value000_dev, N * file_size/12);
//
float *new_value001, *new_value001_dev;
new_value001=(float *)malloc(file_size*N/12);
cudaMalloc((void **)&new_value001_dev, N * file_size/12);
//
float *new_value010, *new_value010_dev;
new_value010=(float *)malloc(file_size*N/12);
cudaMalloc((void **)&new_value010_dev, N * file_size/12);
//
float *new_value011, *new_value011_dev;
new_value011=(float *)malloc(file_size*N/12);
cudaMalloc((void **)&new_value011_dev, N * file_size/12);
//
float *new_value100, *new_value100_dev;
new_value100=(float *)malloc(file_size*N/12);
cudaMalloc((void **)&new_value100_dev, N * file_size/12);
//
float *new_value101, *new_value101_dev;
new_value101=(float *)malloc(file_size*N/12);
cudaMalloc((void **)&new_value101_dev, N * file_size/12);
//
float *new_value110, *new_value110_dev;
new_value110=(float *)malloc(file_size*N/12);
cudaMalloc((void **)&new_value110_dev, N * file_size/12);
//
float *new_value111, *new_value111_dev;
new_value111=(float *)malloc(file_size*N/12);
cudaMalloc((void **)&new_value111_dev, N * file_size/12);
//
float *new_value200, *new_value200_dev;
new_value200=(float *)malloc(file_size*N/12);
cudaMalloc((void **)&new_value200_dev, N * file_size/12);
//
float *new_value201, *new_value201_dev;
new_value201=(float *)malloc(file_size*N/12);
cudaMalloc((void **)&new_value201_dev, N * file_size/12);
//
float *new_value210, *new_value210_dev;
new_value210=(float *)malloc(file_size*N/12);
cudaMalloc((void **)&new_value210_dev, N * file_size/12);
//
float *new_value211, *new_value211_dev;
new_value211=(float *)malloc(file_size*N/12);
cudaMalloc((void **)&new_value211_dev, N * file_size/12);
//
//insert in variable
if (file==0){
for (i=0;i<(file_size-1)/3;++i){
new_value0[i]=Value[i+1];
cudaMemcpy(new_value0_dev,new_value0,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value1[i]=Value[i + 1+((file_size-1)/3)];
cudaMemcpy(new_value1_dev,new_value1,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value2[i]=Value[i + (1+ 2*(file_size)/3)];
cudaMemcpy(new_value2_dev,new_value2,N*sizeof(file_size), cudaMemcpyHostToDevice);
printf(" new_value- %d - %f - %f - %f \n",i,new_value0[i],new_value1[i],new_value2[i]);
}
}else if(file==1 || file==2){
for (i=0; i<(file_size-1)/6;++i)
{
new_value00[i]=Value[i+1];
cudaMemcpy(new_value00_dev,new_value00,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value01[i]=Value[i+ ((file_size-1)/6)+1];
cudaMemcpy(new_value01_dev,new_value01,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value10[i]=Value[i+ (2*(file_size-1)/6)+1];
cudaMemcpy(new_value10_dev,new_value10,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value11[i]=Value[i+ (3*(file_size-1)/6)+1];
cudaMemcpy(new_value11_dev,new_value11,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value20[i]=Value[i+ (4*(file_size-1)/6)+1];
cudaMemcpy(new_value20_dev,new_value20,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value21[i]=Value[i+ (5*(file_size-1)/6)+1];
cudaMemcpy(new_value21_dev,new_value21,N*sizeof(file_size), cudaMemcpyHostToDevice);
printf(" new_value- %d - %f - %f - %f - %f - %f - %f \n",i,new_value00[i],new_value01[i],new_value10[i],new_value11[i],new_value20[i],new_value21[i]);
}
}else{
for (i=0; i<(file_size-1)/12;++i)
{
new_value000[i]=Value[i+1];
cudaMemcpy(new_value000_dev,new_value000,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value001[i]=Value[i+ ((file_size-1)/12)+1];
cudaMemcpy(new_value001_dev,new_value001,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value010[i]=Value[i+ (2*(file_size-1)/12)+1];
cudaMemcpy(new_value010_dev,new_value010,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value011[i]=Value[i+ (3*(file_size-1)/12)+1];
cudaMemcpy(new_value011_dev,new_value011,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value100[i]=Value[i+ (4*(file_size-1)/12)+1];
cudaMemcpy(new_value100_dev,new_value100,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value101[i]=Value[i+ (5*(file_size-1)/12)+1];
cudaMemcpy(new_value101_dev,new_value101,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value110[i]=Value[i+ (6*(file_size-1)/12)+1];
cudaMemcpy(new_value110_dev,new_value110,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value111[i]=Value[i+ (7*(file_size-1)/12)+1];
cudaMemcpy(new_value111_dev,new_value111,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value200[i]=Value[i+ (8*(file_size-1)/12)+1];
cudaMemcpy(new_value200_dev,new_value200,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value201[i]=Value[i+ (9*(file_size-1)/12)+1];
cudaMemcpy(new_value201_dev,new_value201,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value210[i]=Value[i+ (10*(file_size-1)/12)+1];
cudaMemcpy(new_value210_dev,new_value210,N*sizeof(file_size), cudaMemcpyHostToDevice);
new_value211[i]=Value[i+ (11*(file_size-1)/12)+1];
cudaMemcpy(new_value211_dev,new_value211,N*sizeof(file_size), cudaMemcpyHostToDevice);
printf(" new_value- %d - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f \n",i,new_value000[i],new_value001[i],new_value010[i],new_value011[i],new_value100[i],new_value101[i],new_value110[i],new_value111[i],new_value200[i],new_value201[i],new_value210[i],new_value211[i]);
}
}
//START computation
printf("\nPRE KERNEL\n");
const int sz=(sizeof(parents)/sizeof(*(parents)));
const int dim=(sizeof(traject)/sizeof(*(traject)));
printf("%d - %d ",sz, dim);
//chiamata kernel
int block_size = 8;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
float *trns, **trns_dev;
trns=(float *)malloc(sz*N);
cudaMalloc((void **)&trns_dev, N * sz);
float *m, *m_dev;
m=(float *)malloc(sz*N);
cudaMalloc((void **)&m_dev, N * sz);
calculateMLpa<<<n_blocks, block_size >>>(N,bs,parents_dev,traject_dev, new_value0_dev,1.0,0.1,sz,dim,trns_dev,m_dev);
}
//cudaMemcpy(trns,trns_dev,N*sz,cudaMemcpyDeviceToHost);
}
}
free(parents_dev);
free(traject_dev);
//free(trns_dev);
}
内核:
__global__ void calculateMLpa(int N, float *bs, char **Xn_val, char **traj, float *value, float alphaxixj, float tauxi, const int sz, int dim, float** trns, float *m){
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx<N){
float mlx = 0;
float v;
float alphaxi;
char *state;
int i, p, j, k;
char *trat="-";
int cont=0;
printf("%d %d \n",sz,dim);
for (i = 0; i < sz; i++) {
m[i] = 0;
for (j = 0; j < sz; j++) {
v = 0.0;
int newlength = strlen(Xn_val[i]) + strlen(trat) + strlen(Xn_val[j])+1;
state =(char *) malloc(sizeof(char) * newlength);
if(state != NULL){
state[0] = '\0';
strcat(state,Xn_val[i]);
strcat(state,trat);
strcat(state,Xn_val[j]);
// printf("%s ",state);
}else {
perror("failed");
}
if (traj[cont] != NULL ){
if (strcmp(traj[cont],state)==0){
v = value[cont];
// printf("%f \n",v);
}
}
trns[i][j] = v;
printf("%f - \n",trns[i][j]);
if (strcmp(Xn_val[i],Xn_val[j])!=0)
atomicAdd(&m[i], v);
cont++;
}
}
for (i=0;i<sz;++i){
for(j=0;j<sz;++j){
printf("%f ",trns[i][j]);
}
printf("\n");
}
for (p=0;p<sz;++p){
printf("%f - \n",m[p]);
}
alphaxi = alphaxixj * (((double) sz) - 1.0);
alphaxi = alphaxixj;
//printf("%d ",sz);
for (i = 0; i < sz; i++) {
for (j = 0; j < sz; j++) {
// xi!=xj
if (strcmp(Xn_val[i], Xn_val[j])!=0) {
mlx = mlx + lgamma(alphaxixj + trns[i][j]) - lgamma(alphaxixj);
}
// xi
else {
mlx = mlx + lgamma(alphaxi) - lgamma(alphaxi + m[i]);
mlx = mlx + lgamma(alphaxi + m[i] + 1.0)+ (alphaxi + 1.0) * log(tauxi);
mlx = mlx - lgamma(alphaxi + 1.0)- (alphaxi + m[i] + 1.0) * log(tauxi + trns[i][j]);
}
}
}
atomicAdd(&bs[idx], mlx);
}
}
注意:调用内核的结果仍然没有出现,因为它缺少打印