我编写了一个CUDA代码,基本上为我总结了一个数组。数组大小N
应该是2
的幂,即2^x
。但是,我的代码无法正常工作。例如,如果输出为150177410
,则我的代码输出150177408
。我一直试图在过去5
小时内调试这个。任何帮助将不胜感激。以下是代码:
//only for array size of 2^x and TPB of 2^y as godata is = num of blocks. But num of blocks 2^sth if previous satisfied
//Works for arbitrary size array of type 2^x
#include<stdio.h>
__global__ void computeAddShared(int *in , int *out, int sizeInput){
//not made parameters gidata and godata to emphasize that parameters get copy of address and are different from pointers in host code
extern __shared__ float temp[];
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int ltid = threadIdx.x;
temp[ltid] = 0;
while(tid < sizeInput){
temp[ltid] += in[tid];
tid+=gridDim.x * blockDim.x; // to handle array of any size
}
__syncthreads();
int offset = 1;
while(offset < blockDim.x){
if(ltid % (offset * 2) == 0){
temp[ltid] = temp[ltid] + temp[ltid + offset];
}
__syncthreads();
offset*=2;
}
if(ltid == 0){
out[blockIdx.x] = temp[0];
}
}
int main(){
int N = 8192;//should be 2^sth
int size = N;
int *a = (int*)malloc(N * sizeof(int));
/* TO create random number
FILE *f;
f = fopen("invertedList.txt" , "w");
a[0] = 1 + (rand() % 8);
fprintf(f, "%d,",a[0]);
for( int i = 1 ; i< N; i++){
a[i] = a[i-1] + (rand() % 8) + 1;
fprintf(f, "%d,",a[i]);
}
fclose(f);
return 0;*/
FILE *f;
f = fopen("invertedList.txt","r");
if( f == NULL){
printf("File not found\n");
system("pause");
exit(1);
}
int count = 0 ;
long actualSum = 0;
for( int i =0 ; i < N ; i++){
fscanf(f, "%d,", &a[count]);
actualSum+=a[count];
count++;
}
fclose(f);
printf("The actual sum is %d\n",actualSum);
int* gidata;
int* godata;
cudaMalloc((void**)&gidata, N* sizeof(int));
cudaMemcpy(gidata,a, size * sizeof(int), cudaMemcpyHostToDevice);
int TPB = 256;
int blocks = 10; //to get things kicked off
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
while(blocks != 1 ){
if(size < TPB){
TPB = size; // size is 2^sth
}
blocks = (size+ TPB -1 ) / TPB;
cudaMalloc((void**)&godata, blocks * sizeof(int));
computeAddShared<<<blocks, TPB,TPB*sizeof(int)>>>(gidata, godata,size);
//cudaFree(gidata);
gidata = godata;
size = blocks;
}
//printf("The error by cuda is %s",cudaGetErrorString(cudaGetLastError()));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime , start, stop);
printf("time is %f ms", elapsedTime);
int *output = (int*)malloc(sizeof(int));
cudaMemcpy(output, gidata,size * sizeof(int), cudaMemcpyDeviceToHost);
//Cant free either earlier as both point to same location
cudaError_t chk = cudaFree(godata);
if(chk!=0){
printf("First chk also printed error. Maybe error in my logic\n");
}
printf("The error by threadsyn is %s", cudaGetErrorString(cudaGetLastError()));
printf("The sum of the array is %d\n", output[0]);
getchar();
return 0;
}
答案 0 :(得分:1)
正如talonmies提前说的那样,内核本身就可以了。基本上,在改进{{1}时使用的布伦特优化意义上改进了算法级联的CUDA SDK的reduce0
内核内核到同一SDK的reduce5
。
可以通过以下测试代码显示内核正常工作,该代码还将reduce6
的性能与代码中名为reduce0
的OP内核的性能进行比较。 reduce0_stackoverflow
内核还报告,评论了reduce0_stackoverflow
的相应代码行。
对于下面的测试用例,与reduce0
相比,reduce0_stackoverflow
在GeForce GT540M卡上执行0.030ms
。
请注意,下面的代码不要求数组大小必须是0.049ms
的幂。
2