I am trying to call the same CUDA kernel several times (each call with a different input argument), but only the first kernel call executes; the other calls never run.
Suppose the input arrays are
new_value0=[123.814935276; 234; 100; 166; 203.0866414; 383; 186; 338; 173.0984233]
and
new_value1=[186.221113; 391; 64; 235; 195.7454998; 275; 218; 121; 118.0333872]
Part of the output is:
entra
entra
entra
334
549
524
alpha1.000000
alpha1.000000
alpha1.000000
in 2 idx-j 0-0 Value 123.814934 - m=334 - k=0
mlx -1618.175171
in 1 idx-j 0-1 Value 234.000000 - m=334 k=1
mlx -571.983032
in 1 idx-j 0-2 Value 100.000000 - m=334 k=2
mlx -208.243652
in 1 idx-j 1-0 Value 166.000000 - m=549 k=3
mlx 477.821777
in 2 idx-j 1-1 Value 203.086639 - m=549 - k=4
mlx -2448.556396
in 1 idx-j 1-2 Value 383.000000 - m=549 k=5
mlx -549.565674
in 1 idx-j 2-0 Value 186.000000 - m=524 k=6
mlx 239.955444
in 1 idx-j 2-1 Value 338.000000 - m=524 k=7
mlx 1873.975708
in 2 idx-j 2-2 Value 173.098419 - m=524 - k=8
mlx -835.600220
mlx =-835.600220
bs = -835.600220 .
esci
esci
esci
This comes from the first kernel call.
This is the kernel:
__global__ void calculateMLpa( int N, float *bs, float *Value, float alphaxixj, float tauxi, const int sz, int dim, int *m){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    printf("entra\n");
    if(idx<N){
        bs[idx]=0;
        int i,k=0;
        float mlx = 0;
        float v;
        float alphaxi;
        m[idx]=0;
        int state[9];
        int p, j, t;
        int cont=0;
        if(idx==0){
            m[idx]=Value[idx+1]+Value[idx+2];
        }
        else if(idx==1){
            m[idx]=Value[idx+2]+Value[idx+4];
        }else{
            m[idx]=Value[idx+4]+Value[idx+5];
        }
        printf("%d \n",m[idx]);
        alphaxi = alphaxixj * (((float) sz) - 1.0);
        alphaxi = alphaxixj;
        printf("alpha%f \n",alphaxi);
        if(idx==0){
            for(i=0;i<sz;i++){
                for (j = 0; j < sz; j++) {
                    // xi!=xj
                    if (i!=j){
                        if(j==0) {
                            k=i*3;
                        }
                        else if(j==1){
                            k=i*3+1;
                        }
                        else if(j==2) {
                            k=i*3+2;
                        }
                        mlx = mlx + lgamma(alphaxixj + Value[k]) - lgamma(alphaxixj);
                        printf("in 1 idx-j %d-%d Value %f - m=%d k=%d \n",i,j,Value[k],m[i],k);
                        printf(" mlx %f \n",mlx);
                        //k++;
                    }
                    // xi
                    else {
                        if(j==0) {
                            k=i*3;
                        }
                        else if(j==1){
                            k=i*3+1;
                        }
                        else if(j==2) {
                            k=i*3+2;
                        }
                        mlx = mlx + lgamma(alphaxi) - lgamma(alphaxi + m[i]);
                        mlx = mlx + lgamma(alphaxi + m[i] + 1.0)+ (alphaxi + 1.0) * log(tauxi);
                        mlx = mlx - lgamma(alphaxi + 1.0)- (alphaxi + m[i] + 1.0) * log(tauxi + Value[k]);
                        printf("in 2 idx-j %d-%d Value %f - m=%d - k=%d \n",i,j,Value[k],m[i],k);
                        printf(" mlx %f \n",mlx);
                        //k++;
                    }
                }
            }
            printf("mlx =%f \n",mlx);
            bs[idx]=mlx;
            printf("bs = %f .\n",bs[idx]);
        }
    }
    printf("esci\n");
}
Here is the host code:
int main (void){
    printf("START");
    FILE *pf;
    const int N=9;
    char fName[2083];
    char *parents[3]={"0","1","2"};
    char *traject[9]={"0-0","0-1","0-2","1-0","1-1","1-2","2-0","2-1","2-2"};
    size_t parents_len;
    size_t traject_len;
    parents_len=sizeof(char)/sizeof(parents[0]);
    traject_len=sizeof(char)/sizeof(traject[0]);
    //possible malloc
    //pointer host to memory
    char **parents_dev;
    char **traject_dev;
    //allocate on device
    cudaMalloc((void **)&parents_dev,sizeof(char**)*parents_len);
    cudaMalloc((void **)&traject_dev,sizeof(char**)*traject_len);
    //host to Device
    cudaMemcpy(parents_dev,parents,sizeof(char**)*parents_len,cudaMemcpyHostToDevice);
    cudaMemcpy(traject_dev,traject,sizeof(char**)*traject_len,cudaMemcpyHostToDevice);
    //Loop start
    int file,Epoca;
    float *bs;
    float *bs_dev;
    int file_size0=28;
    int file_size1=55;
    int file_size3=109;
    //size_t size = N * sizeof(float);
    bs=(float *)malloc(N * sizeof(float));
    cudaMalloc((void **)&bs_dev, N * sizeof(float));
    float *new_value0,*new_value0_dev;
    new_value0=(float *)malloc(file_size0*N/3);
    cudaMalloc((void **)&new_value0_dev, N * file_size0/3);
    //
    float *new_value1,*new_value1_dev;
    new_value1=(float *)malloc(file_size0*N/3);
    cudaMalloc((void **)&new_value1_dev, N * file_size0/3);
    //
    float *new_value2,*new_value2_dev;
    new_value2=(float *)malloc(file_size0*N/3);
    cudaMalloc((void **)&new_value2_dev, N * file_size0/3);
    //
    //one parent 1,2
    float *new_value00,*new_value00_dev;
    new_value00=(float *)malloc(file_size1*N/6);
    cudaMalloc((void **)&new_value00_dev, N * file_size1/6);
    //
    float *new_value01,*new_value01_dev;
    new_value01=(float *)malloc(file_size1*N/6);
    cudaMalloc((void **)&new_value01_dev, N * file_size1/6);
    //
    float *new_value10,*new_value10_dev;
    new_value10=(float *)malloc(file_size1*N/6);
    cudaMalloc((void **)&new_value10_dev, N * file_size1/6);
    //
    float *new_value11,*new_value11_dev;
    new_value11=(float *)malloc(file_size1*N/6);
    cudaMalloc((void **)&new_value11_dev, N * file_size1/6);
    //
    float *new_value20,*new_value20_dev;
    new_value20=(float *)malloc(file_size1*N/6);
    cudaMalloc((void **)&new_value20_dev, N * file_size1/6);
    //
    float *new_value21,*new_value21_dev;
    new_value21=(float *)malloc(file_size1*N/6);
    cudaMalloc((void **)&new_value21_dev, N * file_size1/6);
    //
    //double parent
    float *new_value000,*new_value000_dev;
    new_value000=(float *)malloc(file_size3*N/12);
    cudaMalloc((void **)&new_value000_dev, N * file_size3/12);
    //
    float *new_value001,*new_value001_dev;
    new_value001=(float *)malloc(file_size3*N/12);
    cudaMalloc((void **)&new_value001_dev, N * file_size3/12);
    //
    float *new_value010,*new_value010_dev;
    new_value010=(float *)malloc(file_size3*N/12);
    cudaMalloc((void **)&new_value010_dev, N * file_size3/12);
    //
    float *new_value011,*new_value011_dev;
    new_value011=(float *)malloc(file_size3*N/12);
    cudaMalloc((void **)&new_value011_dev, N * file_size3/12);
    //
    float *new_value100,*new_value100_dev;
    new_value100=(float *)malloc(file_size3*N/12);
    cudaMalloc((void **)&new_value100_dev, N * file_size3/12);
    //
    float *new_value101,*new_value101_dev;
    new_value101=(float *)malloc(file_size3*N/12);
    cudaMalloc((void **)&new_value101_dev, N * file_size3/12);
    //
    float *new_value110,*new_value110_dev;
    new_value110=(float *)malloc(file_size3*N/12);
    cudaMalloc((void **)&new_value110_dev, N * file_size3/12);
    //
    float *new_value111,*new_value111_dev;
    new_value111=(float *)malloc(file_size3*N/12);
    cudaMalloc((void **)&new_value111_dev, N * file_size3/12);
    //
    float *new_value200,*new_value200_dev;
    new_value200=(float *)malloc(file_size3*N/12);
    cudaMalloc((void **)&new_value200_dev, N * file_size3/12);
    //
    float *new_value201,*new_value201_dev;
    new_value201=(float *)malloc(file_size3*N/12);
    cudaMalloc((void **)&new_value201_dev, N * file_size3/12);
    //
    float *new_value210,*new_value210_dev;
    new_value210=(float *)malloc(file_size3*N/12);
    cudaMalloc((void **)&new_value210_dev, N * file_size3/12);
    //
    float *new_value211,*new_value211_dev;
    new_value211=(float *)malloc(file_size3*N/12);
    cudaMalloc((void **)&new_value211_dev, N * file_size3/12);
    //int file;
    for(file=0;file<4;file++){
        int f, i, j, file_size=0, kk=0;
        //file IO
        sprintf(fName, "//home//user//prova%d.csv",file);
        pf=fopen(fName,"r");
        char *X;
        char *PaX;
        int Time;
        char *pa;
        char *xixj;
        float val;
        char buffer[BUFSIZ], *ptr;
        if (pf)
        {
            /*
             * Read each line from the file.
             */
            while(fgets(buffer, sizeof buffer, pf)){
                file_size++;
            }
            fclose(pf);
        }
        //variable for the kernel
        float *Value, *Value_dev;
        Value=(float *)malloc(file_size*N);
        cudaMalloc((void **)&Value_dev, N * file_size);
        //
        pf=fopen(fName,"r");
        if(pf)
        {
            printf("\nnumero righe file %d = %d\n",file,file_size);
            char *state[file_size];
            while(fgets(buffer, sizeof buffer, pf))
            {
                //printf("start csv \n");
                char *token;
                char *ptr = buffer;
                const char end[2]=",";//fgets(buffer, sizeof buffer, pf);
                token = strtok(ptr, end);
                f=0;
                /* walk through other tokens */
                while( token != NULL )
                {
                    if(f==0){
                        X=token;
                        // printf( "X %s\n", token );
                    }else if(f==1){
                        PaX=token;
                        // printf( "PaX %s\n", token );
                    }
                    else if(f==2){
                        Time=strtod(token,NULL);
                        // printf( "Time %f \n", token );
                    }
                    else if(f==3){
                        pa=token;
                        // printf( "pa %s \n", token );
                    }
                    else if(f==4){
                        xixj=(token);
                        // printf( "xixj %s \n", token );
                    }
                    else{
                        Value[kk]=strtod(&token[1], NULL);
                        // printf("Value %f \n", Value[kk]);
                        kk++;
                    }
                    token = strtok(NULL, end);
                    f++;
                }
            }
            //
            //insert in variable
            if (file==0){
                for (i=0;i<(file_size0-1)/3;++i){
                    new_value0[i]=Value[i+1];
                    cudaMemcpy(new_value0_dev,new_value0,N*sizeof(file_size0), cudaMemcpyHostToDevice);
                    new_value1[i]=Value[i + 1+((file_size0-1)/3)];
                    cudaMemcpy(new_value1_dev,new_value1,N*sizeof(file_size0), cudaMemcpyHostToDevice);
                    new_value2[i]=Value[i + (1+ 2*(file_size0-1)/3)];
                    cudaMemcpy(new_value2_dev,new_value2,N*sizeof(file_size0), cudaMemcpyHostToDevice);
                    // printf(" new_value- %d - %f - %f - %f \n",i,new_value0[i],new_value1[i],new_value2[i]);
                }
            }else if(file==1 || file==2){
                for (i=0; i<(file_size1-1)/6;++i)
                {
                    new_value00[i]=Value[i+1];
                    cudaMemcpy(new_value00_dev,new_value00,N*sizeof(file_size0), cudaMemcpyHostToDevice);
                    new_value01[i]=Value[i+ ((file_size0-1)/3)+1];
                    cudaMemcpy(new_value01_dev,new_value01,N*sizeof(file_size1), cudaMemcpyHostToDevice);
                    new_value10[i]=Value[i+ (2*(file_size1-1)/6)+1];
                    cudaMemcpy(new_value10_dev,new_value10,N*sizeof(file_size1), cudaMemcpyHostToDevice);
                    new_value11[i]=Value[i+ (3*(file_size1-1)/6)+1];
                    cudaMemcpy(new_value11_dev,new_value11,N*sizeof(file_size1), cudaMemcpyHostToDevice);
                    new_value20[i]=Value[i+ (4*(file_size1-1)/6)+1];
                    cudaMemcpy(new_value20_dev,new_value20,N*sizeof(file_size1), cudaMemcpyHostToDevice);
                    new_value21[i]=Value[i+ (5*(file_size1-1)/6)+1];
                    cudaMemcpy(new_value21_dev,new_value21,N*sizeof(file_size1), cudaMemcpyHostToDevice);
                    // printf(" new_value- %d - %f - %f - %f - %f - %f - %f \n",i,new_value00[i],new_value01[i],new_value10[i],new_value11[i],new_value20[i],new_value21[i]);
                }
            }else{
                for (i=0; i<(file_size3-1)/12;++i)
                {
                    new_value000[i]=Value[i+1];
                    cudaMemcpy(new_value000_dev,new_value000,N*sizeof(file_size3), cudaMemcpyHostToDevice);
                    new_value001[i]=Value[i+ ((file_size3-1)/12)+1];
                    cudaMemcpy(new_value001_dev,new_value001,N*sizeof(file_size3), cudaMemcpyHostToDevice);
                    new_value010[i]=Value[i+ (2*(file_size3-1)/12)+1];
                    cudaMemcpy(new_value010_dev,new_value010,N*sizeof(file_size3), cudaMemcpyHostToDevice);
                    new_value011[i]=Value[i+ (3*(file_size3-1)/12)+1];
                    cudaMemcpy(new_value011_dev,new_value011,N*sizeof(file_size3), cudaMemcpyHostToDevice);
                    new_value100[i]=Value[i+ (4*(file_size3-1)/12)+1];
                    cudaMemcpy(new_value100_dev,new_value100,N*sizeof(file_size3), cudaMemcpyHostToDevice);
                    new_value101[i]=Value[i+ (5*(file_size3-1)/12)+1];
                    cudaMemcpy(new_value101_dev,new_value101,N*sizeof(file_size3), cudaMemcpyHostToDevice);
                    new_value110[i]=Value[i+ (6*(file_size3-1)/12)+1];
                    cudaMemcpy(new_value110_dev,new_value110,N*sizeof(file_size3), cudaMemcpyHostToDevice);
                    new_value111[i]=Value[i+ (7*(file_size3-1)/12)+1];
                    cudaMemcpy(new_value111_dev,new_value111,N*sizeof(file_size3), cudaMemcpyHostToDevice);
                    new_value200[i]=Value[i+ (8*(file_size3-1)/12)+1];
                    cudaMemcpy(new_value200_dev,new_value200,N*sizeof(file_size3), cudaMemcpyHostToDevice);
                    new_value201[i]=Value[i+ (9*(file_size3-1)/12)+1];
                    cudaMemcpy(new_value201_dev,new_value201,N*sizeof(file_size3), cudaMemcpyHostToDevice);
                    new_value210[i]=Value[i+ (10*(file_size3-1)/12)+1];
                    cudaMemcpy(new_value210_dev,new_value210,N*sizeof(file_size3), cudaMemcpyHostToDevice);
                    new_value211[i]=Value[i+ (11*(file_size3-1)/12)+1];
                    cudaMemcpy(new_value211_dev,new_value211,N*sizeof(file_size3), cudaMemcpyHostToDevice);
                    // printf(" new_value- %d - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f \n",i,new_value000[i],new_value001[i],new_value010[i],new_value011[i],new_value100[i],new_value101[i],new_value110[i],new_value111[i],new_value200[i],new_value201[i],new_value210[i],new_value211[i]);
                }
            }
        }
    }
    //cudaMemcpy(Value_dev,Value,N*sizeof(file_size), cudaMemcpyHostToDevice);
    //variable of kernel
    //no parent
    //START computation
    printf("\nPRE KERNEL\n");
    const int sz=(sizeof(parents)/sizeof(*(parents)));
    const int dim=(sizeof(traject)/sizeof(*(traject)));
    printf("%d - %d \n",sz, dim);
    //kernel call
    int block_size = 3;
    int n_blocks =1 ;
    int *m, *m_dev;
    m=(int *)malloc(sz*N);
    cudaMalloc((void **)&m_dev, N * sz);
    float *trns_dev;
    cudaMalloc((void **)&trns_dev, N * dim);
    int i;
    for(i=0;i<(file_size0-1)/3;i++){
        printf(" new_value- %d - %f - %f - %f \n",i,new_value0[i],new_value1[i],new_value2[i]);
    }
    printf("\n");
    for(i=0;i<(file_size1-1)/6;i++){
        printf(" new_value- %d - %f - %f - %f - %f - %f - %f \n",i,new_value00[i],new_value01[i],new_value10[i],new_value11[i],new_value20[i],new_value21[i]);
    }
    printf("\n");
    for(i=0;i<(file_size3-1)/12;i++){
        printf(" new_value- %d - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f - %f \n",i,new_value000[i],new_value001[i],new_value010[i],new_value011[i],new_value100[i],new_value101[i],new_value110[i],new_value111[i],new_value200[i],new_value201[i],new_value210[i],new_value211[i]);
    }
    for(Epoca=0; Epoca<3; Epoca++){
        bs=0;
        float bf=0;
        cudaMalloc((void **)&bf, N * sz);
        cudaMemcpy(bs_dev,bs,N*sizeof(float), cudaMemcpyHostToDevice);
        if(Epoca==0){
            calculateMLpa<<<n_blocks, block_size >>>(N,bs_dev,new_value0_dev,1.0,0.1,sz,dim,m_dev);
            cudaDeviceSynchronize();
            cudaMemcpy(bs,bs_dev,N*sizeof(float), cudaMemcpyDeviceToHost);
            cudaMemcpy(m,m_dev,N*sizeof(float), cudaMemcpyDeviceToHost);
            bf =+ bs[0];
            printf("score= %f m0 = %d, m1 = %d, m2 = %d \n\n", bf, m[0], m[1], m[2]);
            calculateMLpa<<<n_blocks, block_size >>>(N,bs_dev,new_value00_dev,1.0,0.1,sz,dim,m_dev);
            cudaDeviceSynchronize();
            cudaMemcpy(bs,bs_dev,N*sizeof(float), cudaMemcpyDeviceToHost);
            cudaMemcpy(m,m_dev,N*sizeof(float), cudaMemcpyDeviceToHost);
            bf =+ bs[0];
            printf("score= %f \n", bf);
        }
        printf("score %d= %f \n",Epoca, bf);
    }
    free(bs_dev);
}
I think I could parallelize this with streams, but I have never used them before. I started by looking at this.
Answer 0 (score: 0)
It sounds like you should use concurrent CUDA streams.
One interesting option:
CUDA 7 introduces a new option, the per-thread default stream, that has two effects. First, it gives each host thread its own default stream. This means that commands issued to the default stream by different host threads can run concurrently.
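For reference, here is a minimal sketch of what that looks like in practice. It is not code from the question: it assumes the file is compiled with nvcc --default-stream per-thread, and scaleKernel, worker, d_a and d_b are placeholder names.

// Minimal sketch, assuming compilation with: nvcc --default-stream per-thread
// Each host thread then gets its own default stream, so the two launches
// issued from the two threads below can overlap on the device.
#include <cuda_runtime.h>
#include <thread>

__global__ void scaleKernel(float *data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2.0f;                       // trivial placeholder work
}

void worker(float *d_data, int n) {
    scaleKernel<<<(n + 127) / 128, 128>>>(d_data, n); // goes to this thread's default stream
    cudaStreamSynchronize(cudaStreamPerThread);       // wait only for this thread's stream
}

// usage: std::thread t0(worker, d_a, n), t1(worker, d_b, n); t0.join(); t1.join();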
Also worth noting:
As described in the CUDA C Programming Guide, asynchronous commands return control to the calling host thread before the device has completed the requested task (they are non-blocking). These commands are:
kernel launches; memory copies between two addresses within the same device's memory; memory copies from host to device of a memory block of 64 KB or less; memory copies performed by functions with the Async suffix; memory set function calls.
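Putting this together for the code in the question, below is a minimal sketch of issuing the two calculateMLpa launches on two explicit streams. It assumes the two launches are independent; the names bs0_dev, bs1_dev, m0_dev, m1_dev, bs0 and bs1 are hypothetical per-launch buffers, because launches that run concurrently must not write into the same bs_dev and m_dev as the original code does.

// Sketch only: one explicit stream per launch, with per-launch output buffers.
cudaStream_t s0, s1;
cudaStreamCreate(&s0);
cudaStreamCreate(&s1);

// The fourth launch parameter selects the stream; the launches return immediately.
calculateMLpa<<<n_blocks, block_size, 0, s0>>>(N, bs0_dev, new_value0_dev,  1.0, 0.1, sz, dim, m0_dev);
calculateMLpa<<<n_blocks, block_size, 0, s1>>>(N, bs1_dev, new_value00_dev, 1.0, 0.1, sz, dim, m1_dev);
// printf("launch status: %s\n", cudaGetErrorString(cudaGetLastError()));  // confirms the launches were accepted

// Copy each result back on its own stream, then wait for both streams to finish.
cudaMemcpyAsync(bs0, bs0_dev, N * sizeof(float), cudaMemcpyDeviceToHost, s0);
cudaMemcpyAsync(bs1, bs1_dev, N * sizeof(float), cudaMemcpyDeviceToHost, s1);
cudaStreamSynchronize(s0);
cudaStreamSynchronize(s1);

cudaStreamDestroy(s0);
cudaStreamDestroy(s1);

For the cudaMemcpyAsync copies to actually overlap with other work, the host buffers bs0 and bs1 should be allocated with cudaMallocHost (pinned memory); with pageable memory the copies still work but behave essentially synchronously. Alternatively, issuing each launch from its own host thread and compiling with nvcc --default-stream per-thread, as quoted above, gives a similar result without managing stream handles explicitly.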