I have a simple kernel that should compute the sum of products of the a and b arrays, but __syncthreads() doesn't seem to work at all. When I debug it, temp[i] returns uninitialized values for some elements, and if I omit __syncthreads() the results are the same. (I have checked that all the other parts of the CUDA code, such as initializing the arrays and copying them to device memory, are written correctly, so the problem is in this kernel.) (Note: I don't want to use atomicAdd.)
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define MINREAL -1024.0
#define MAXREAL 1024.0
#define ACCURACY 0.01
#define NUM_OF_GPU_THREADS 256
void checkCUDAError(const char *msg) {
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

void vecFillRand(int N, float *vec) {
    int i;
    for (i = 0; i < N; i++)
        vec[i] = (rand() / (float) RAND_MAX) * (MAXREAL - MINREAL) + MINREAL;
}

float seq_dotProduct(float *a, float *b, int n) {
    int i;
    float dp;
    dp = 0;
    for (i = 0; i < n; i++) {
        dp += a[i] * b[i];
    }
    return dp;
}

// kernel
__global__ void dotProduct(float *a, float *b, float *c) {
    __shared__ float temp[NUM_OF_GPU_THREADS];
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    float t;
    if (idx >= gridDim.x * blockDim.x)
        return;
    temp[threadIdx.x] = a[idx] * b[idx];
    __syncthreads();
    if (threadIdx.x == 0) {
        c[blockIdx.x] = 0.0f;
        for (int i = 0; i < NUM_OF_GPU_THREADS; i++) {
            t = temp[i];
            c[blockIdx.x] = c[blockIdx.x] + t;
        }
    }
}
int main(int argc, char* argv[]) {
    int i, n, ARRAY_BYTES;
    float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
    float sum;
    float seq_sum;
    clock_t t;
    srand(time(NULL));
    if (argc == 2) {
        n = atoi(argv[1]);
    } else {
        printf("N? ");
        fflush(stdout);
        scanf("%d", &n);
    }
    int BLOCKS_PER_GRID = (unsigned int)ceil(n/(float)NUM_OF_GPU_THREADS);
    // arrays on host
    ARRAY_BYTES = n * sizeof(float);
    h_A = (float *) malloc(ARRAY_BYTES);
    h_B = (float *) malloc(ARRAY_BYTES);
    h_C = (float *) malloc(BLOCKS_PER_GRID * sizeof(float));
    printf("\ncreating A and B...\n\n");
    vecFillRand(n, h_A);
    vecFillRand(n, h_B);
    vecFillRand(BLOCKS_PER_GRID, h_C);
    // arrays on device
    cudaMalloc((void**) &d_A, ARRAY_BYTES);
    cudaMalloc((void**) &d_B, ARRAY_BYTES);
    cudaMalloc((void**) &d_C, BLOCKS_PER_GRID * sizeof(float));
    // transfer the arrays to the GPU
    cudaMemcpy(d_A, h_A, ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, h_C, BLOCKS_PER_GRID, cudaMemcpyHostToDevice);
    // TIME START
    // create events for timing execution
    cudaEvent_t start = cudaEvent_t();
    cudaEvent_t stop = cudaEvent_t();
    cudaEventCreate( &start );
    cudaEventCreate( &stop );
    // record time into start event
    cudaEventRecord( start, 0 ); // 0 is the default stream id
    // launch the kernel
    dim3 block(NUM_OF_GPU_THREADS); // 256, 1, 1
    dim3 grid(BLOCKS_PER_GRID);
    printf("computing dotProduct... \n");
    dotProduct<<<grid, block>>>(d_A, d_B, d_C);
    // block until the device has completed
    cudaThreadSynchronize();
    // check if kernel execution generated an error
    // Check for any CUDA errors
    checkCUDAError("kernel invocation");
    // TIME END
    // record time into stop event
    cudaEventRecord( stop, 0 );
    // synchronize stop event to wait for end of kernel execution on stream 0
    cudaEventSynchronize( stop );
    // compute elapsed time (done by CUDA run-time)
    float elapsed_kernel = 0.f;
    cudaEventElapsedTime( &elapsed_kernel, start, stop );
    // release events
    cudaEventDestroy( start );
    cudaEventDestroy( stop );
    // print kernel time
    printf("CUDA TIME: %f \n\n", elapsed_kernel/1000);
    // copy back the result array to the CPU
    cudaMemcpy(h_C, d_C, BLOCKS_PER_GRID * sizeof(float), cudaMemcpyDeviceToHost);
    // Check for any CUDA errors
    checkCUDAError("memcpy");
    // compute sum
    sum = 0;
    for (i = 0; i < BLOCKS_PER_GRID; i++)
        sum += h_C[i];
    // launch sequential
    t = clock();
    printf("computing seq_dotProduct... \n");
    seq_sum = seq_dotProduct(h_A, h_B, n);
    t = clock() - t;
    printf("SEQ TIME: %f \n\n", ((float)t)/CLOCKS_PER_SEC);
    // check sum and seq_sum
    float value = abs(sum - seq_sum);
    if (value > ACCURACY) {
        printf("Test FAILED: %f \n", value);
    }
    else {
        printf("Test PASSED \n");
    }
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
Answer (score: 3)
Your kernel needs to be rewritten a bit.

Your thread check will terminate any thread beyond the length of the input vectors, but the for loop that sums the elements of the "last block" does not properly check that it stays within the length of the vectors. This leads to out-of-bounds read accesses, which will show up when you run the code under cuda-memcheck (assuming the vector size you enter is not a multiple of 256).
Also, __syncthreads() should not be used in conditional code unless the condition evaluates the same across all threads in the block. For vector lengths that are not a multiple of 256, your last block violates this rule.
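The usual fix, and the one the modified kernel below uses, is to let every thread reach the barrier and give out-of-range threads an identity value (0.0f for a sum) instead of returning early. A minimal sketch of the pattern (the kernel name is illustrative; it assumes a launch with NUM_OF_GPU_THREADS threads per block):

__global__ void loadAndReduce(const float *a, const float *b, float *blockSums, int n) {
    __shared__ float temp[NUM_OF_GPU_THREADS];
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // Every thread stores something (0.0f past the end of the data),
    // so every thread reaches the barrier: no divergent __syncthreads().
    temp[threadIdx.x] = (idx < n) ? a[idx] * b[idx] : 0.0f;
    __syncthreads();
    // Because out-of-range slots were zero-filled, thread 0 can safely
    // sum all NUM_OF_GPU_THREADS entries without a bounds check.
    if (threadIdx.x == 0) {
        float s = 0.0f;
        for (int i = 0; i < NUM_OF_GPU_THREADS; i++)
            s += temp[i];
        blockSums[blockIdx.x] = s;
    }
}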
Beyond that, for larger vector sizes you are expecting too much precision (too many significant digits) out of a float quantity. You need to scale your ACCURACY test based on the number of floats being summed.
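One simple way to make the test size-independent, and essentially what the modified code below does, is to compare relative rather than absolute error. A sketch (it reuses sum and seq_sum from your main, uses fabsf to keep the comparison in float, and assumes sum is not vanishingly small):

// relative error: the tolerance no longer depends on n or on the
// magnitude of the inputs (assumes sum != 0)
float rel_err = fabsf((sum - seq_sum) / sum);
if (rel_err > ACCURACY)
    printf("Test FAILED: %f \n", rel_err);
else
    printf("Test PASSED \n");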
Here is a modified version of your code with the changes I mentioned above, which seems to work correctly for me:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
//#include "cuda_runtime.h"
//#include "device_launch_parameters.h"
#define MINREAL -1024.0
#define MAXREAL 1024.0
#define FAST_RED
#define ACCURACY 0.0001
#define NUM_OF_GPU_THREADS 256
void checkCUDAError(const char *msg) {
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

void vecFillRand(int N, float *vec) {
    int i;
    for (i = 0; i < N; i++)
        vec[i] = (rand() / (float) RAND_MAX) * (MAXREAL - MINREAL) + MINREAL;
}

float seq_dotProduct(float *a, float *b, int n) {
    int i;
    float dp;
    dp = 0;
    for (i = 0; i < n; i++) {
        dp += a[i] * b[i];
    }
    return dp;
}

// kernel
__global__ void dotProduct(float *a, float *b, float *c, int n) {
    __shared__ float temp[NUM_OF_GPU_THREADS];
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n)
        temp[threadIdx.x] = a[idx] * b[idx];
    else
        temp[threadIdx.x] = 0.0f;
    __syncthreads();
#ifdef FAST_RED
    // assumes block dimension is a power of 2
    for (int i = blockDim.x >> 1; i > 0; i >>= 1) {
        if (threadIdx.x < i)
            temp[threadIdx.x] += temp[threadIdx.x + i];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        c[blockIdx.x] = temp[0];
#else
    float t;
    if (threadIdx.x == 0) {
        c[blockIdx.x] = 0.0f;
        int j = 0;
        for (int i = blockIdx.x * blockDim.x; ((i < ((blockIdx.x + 1) * blockDim.x)) && (i < n)); i++) {
            t = temp[j++];
            c[blockIdx.x] = c[blockIdx.x] + t;
        }
    }
#endif
}
int main(int argc, char* argv[]) {
    int i, n, ARRAY_BYTES;
    float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
    float sum;
    float seq_sum;
    clock_t t;
    srand(time(NULL));
    if (argc == 2) {
        n = atoi(argv[1]);
    } else {
        printf("N? ");
        fflush(stdout);
        scanf("%d", &n);
    }
    int BLOCKS_PER_GRID = (unsigned int)ceil(n/(float)NUM_OF_GPU_THREADS);
    printf("bpg = %d\n", BLOCKS_PER_GRID);
    // arrays on host
    ARRAY_BYTES = n * sizeof(float);
    h_A = (float *) malloc(ARRAY_BYTES);
    h_B = (float *) malloc(ARRAY_BYTES);
    h_C = (float *) malloc(BLOCKS_PER_GRID * sizeof(float));
    printf("\ncreating A and B...\n\n");
    vecFillRand(n, h_A);
    vecFillRand(n, h_B);
    vecFillRand(BLOCKS_PER_GRID, h_C);
    // arrays on device
    cudaMalloc((void**) &d_A, ARRAY_BYTES);
    cudaMalloc((void**) &d_B, ARRAY_BYTES);
    cudaMalloc((void**) &d_C, BLOCKS_PER_GRID * sizeof(float));
    // transfer the arrays to the GPU
    cudaMemcpy(d_A, h_A, ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, h_C, BLOCKS_PER_GRID * sizeof(float), cudaMemcpyHostToDevice);
    // TIME START
    // create events for timing execution
    cudaEvent_t start = cudaEvent_t();
    cudaEvent_t stop = cudaEvent_t();
    cudaEventCreate( &start );
    cudaEventCreate( &stop );
    // record time into start event
    cudaEventRecord( start, 0 ); // 0 is the default stream id
    // launch the kernel
    dim3 block(NUM_OF_GPU_THREADS); // 256, 1, 1
    dim3 grid(BLOCKS_PER_GRID);
    printf("computing dotProduct... \n");
    dotProduct<<<grid, block>>>(d_A, d_B, d_C, n);
    // block until the device has completed
    cudaDeviceSynchronize();
    // check if kernel execution generated an error
    // Check for any CUDA errors
    checkCUDAError("kernel invocation");
    // TIME END
    // record time into stop event
    cudaEventRecord( stop, 0 );
    // synchronize stop event to wait for end of kernel execution on stream 0
    cudaEventSynchronize( stop );
    // compute elapsed time (done by CUDA run-time)
    float elapsed_kernel = 0.f;
    cudaEventElapsedTime( &elapsed_kernel, start, stop );
    // release events
    cudaEventDestroy( start );
    cudaEventDestroy( stop );
    // print kernel time
    printf("CUDA TIME: %f \n\n", elapsed_kernel/1000);
    // copy back the result array to the CPU
    cudaMemcpy(h_C, d_C, BLOCKS_PER_GRID * sizeof(float), cudaMemcpyDeviceToHost);
    // Check for any CUDA errors
    checkCUDAError("memcpy");
    // compute sum
    sum = 0;
    for (i = 0; i < BLOCKS_PER_GRID; i++)
        sum += h_C[i];
    // launch sequential
    t = clock();
    printf("computing seq_dotProduct... \n");
    seq_sum = seq_dotProduct(h_A, h_B, n);
    t = clock() - t;
    printf("SEQ TIME: %f \n\n", ((float)t)/CLOCKS_PER_SEC);
    // check sum and seq_sum
    float value = abs((sum - seq_sum)/sum);
    if (value > ACCURACY) {
        printf("Test FAILED: err: %f cpu: %f gpu: %f \n", value, seq_sum, sum);
    }
    else {
        printf("Test PASSED \n");
    }
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
EDIT: In response to a question below, I modified the code to demonstrate a basic parallel reduction.
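For reference, the FAST_RED branch is a standard shared-memory tree reduction: each pass adds the upper half of temp[] into the lower half, halving the number of active threads, with a barrier between passes so readers and writers stay in step. A small trace of the same pattern with blockDim.x = 8 for brevity (the real kernel uses 256):

// start: temp = {t0, t1, t2, t3, t4, t5, t6, t7}
// i = 4: temp[k] += temp[k+4] for k < 4 -> lower half = {t0+t4, t1+t5, t2+t6, t3+t7}
// i = 2: temp[k] += temp[k+2] for k < 2 -> lower half = {t0+t4+t2+t6, t1+t5+t3+t7}
// i = 1: temp[0] += temp[1]             -> temp[0] = t0+t1+t2+t3+t4+t5+t6+t7
// (the __syncthreads() after each pass orders the writes before the next reads)

With 256 threads this takes log2(256) = 8 passes instead of the 256-iteration serial loop that thread 0 runs in the #else branch, and because the block size is a power of 2 no extra bounds handling is needed inside the loop. To try both variants, compile the listing with nvcc (for example, nvcc -o dot_product dot_product.cu, where the file name is whatever you saved it as), pass the vector length as the first argument, and comment out the #define FAST_RED line to exercise the serial per-block loop instead.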