Question

我编写了一个CUDA代码，基本上为我总结了一个数组。数组大小N应该是2的幂，即2^x。但是，我的代码无法正常工作。例如，如果输出为150177410，则我的代码输出150177408。我一直试图在过去5小时内调试这个。任何帮助将不胜感激。以下是代码：

//only for array size of 2^x and TPB of 2^y as godata is = num of blocks. But num of blocks 2^sth if previous satisfied
//Works for arbitrary size array of type 2^x


#include<stdio.h>

__global__ void computeAddShared(int *in , int *out, int sizeInput){
    //not made parameters gidata and godata to emphasize that parameters get copy of address and are different from pointers in host code
    extern __shared__ float temp[];

    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int ltid = threadIdx.x;
    temp[ltid] = 0;
    while(tid < sizeInput){
        temp[ltid] += in[tid];
        tid+=gridDim.x * blockDim.x; // to handle array of any size
    }
    __syncthreads();
    int offset = 1;
    while(offset < blockDim.x){
        if(ltid % (offset * 2) == 0){
            temp[ltid] = temp[ltid] + temp[ltid + offset];
        }
        __syncthreads();
        offset*=2;
    }
    if(ltid == 0){
        out[blockIdx.x] = temp[0];
    }

}

int main(){



    int N = 8192;//should be 2^sth
    int size = N;
    int *a = (int*)malloc(N * sizeof(int));
    /* TO create random number
    FILE *f;
        f = fopen("invertedList.txt" , "w");
        a[0] = 1 + (rand() % 8);
        fprintf(f, "%d,",a[0]);
        for( int i = 1 ; i< N; i++){
            a[i] = a[i-1] + (rand() % 8) + 1;
            fprintf(f, "%d,",a[i]);
        }
        fclose(f);
        return 0;*/
    FILE *f;
    f = fopen("invertedList.txt","r");
    if( f == NULL){
            printf("File not found\n");
            system("pause");
            exit(1);
    }
    int count = 0 ;
    long actualSum = 0;
    for( int i =0 ; i < N ; i++){
        fscanf(f, "%d,", &a[count]);
        actualSum+=a[count];
        count++;
    }
    fclose(f);
    printf("The actual sum is %d\n",actualSum);
    int* gidata;
    int* godata;
    cudaMalloc((void**)&gidata, N* sizeof(int));
    cudaMemcpy(gidata,a, size * sizeof(int), cudaMemcpyHostToDevice);
    int TPB  = 256;
    int blocks = 10; //to get things kicked off
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    while(blocks != 1 ){
        if(size < TPB){
            TPB  = size; // size is 2^sth
        }
        blocks  = (size+ TPB -1 ) / TPB;
        cudaMalloc((void**)&godata, blocks * sizeof(int));
        computeAddShared<<<blocks, TPB,TPB*sizeof(int)>>>(gidata, godata,size);
        //cudaFree(gidata);
        gidata = godata;
        size = blocks;
    }
    //printf("The error by cuda is %s",cudaGetErrorString(cudaGetLastError()));


    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime; 
    cudaEventElapsedTime(&elapsedTime , start, stop);
    printf("time is %f ms", elapsedTime);
    int *output = (int*)malloc(sizeof(int));
    cudaMemcpy(output, gidata,size *  sizeof(int), cudaMemcpyDeviceToHost);
    //Cant free either earlier as both point to same location
    cudaError_t chk = cudaFree(godata);
    if(chk!=0){
        printf("First chk also printed error. Maybe error in my logic\n");
    }

    printf("The error by threadsyn is %s", cudaGetErrorString(cudaGetLastError()));
    printf("The sum of the array is %d\n", output[0]);
    getchar();

    return 0;
}

Answer 1

正如talonmies提前说的那样，内核本身就可以了。基本上，在改进{{1}时使用的布伦特优化意义上改进了算法级联的CUDA SDK的reduce0内核内核到同一SDK的reduce5。

可以通过以下测试代码显示内核正常工作，该代码还将reduce6的性能与代码中名为reduce0的OP内核的性能进行比较。 reduce0_stackoverflow内核还报告，评论了reduce0_stackoverflow的相应代码行。

对于下面的测试用例，与reduce0相比，reduce0_stackoverflow在GeForce GT540M卡上执行0.030ms。

请注意，下面的代码不要求数组大小必须是0.049ms的幂。

改进的reduce0 CUDA减少代码不起作用

1 个答案: