Question

我正在做一个cuda教程，其中我必须制作两个向量的点积。在实现本教程中提供的解决方案后，我遇到了this堆栈溢出帖子中解决的一些问题。现在，无论我做什么，我都会得到答案0。贝娄，你可以找到代码！

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_atomic_functions.h"
#include <stdio.h>
#include <stdlib.h>
#define N (2048 * 8)
#define THREADS_PER_BLOCK 512

__global__ void dot(int *a, int *b, int *c)
{
    __shared__ int temp[THREADS_PER_BLOCK];
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    temp[threadIdx.x] = a[index] * b[index];

    __syncthreads();

    if (threadIdx.x == 0)
    {
        int sum = 0;
        for (int i = 0; i < N; i++)
        {
            sum += temp[i];
        }
        atomicAdd(c, sum);
    }
}

int main()
{
    int *a, *b, *c;
    int *dev_a, *dev_b, *dev_c;
    int size = N * sizeof(int);

   //allocate space for the variables on the device
    cudaMalloc((void **)&dev_a, size);
    cudaMalloc((void **)&dev_b, size);
    cudaMalloc((void **)&dev_c, sizeof(int));

   //allocate space for the variables on the host
   a = (int *)malloc(size);
   b = (int *)malloc(size);
   c = (int *)malloc(sizeof(int));

   //this is our ground truth
   int sumTest = 0;
   //generate numbers
   for (int i = 0; i < N; i++)
   {
       a[i] = rand() % 10;
       b[i] = rand() % 10;
       sumTest += a[i] * b[i];
       printf(" %d %d \n",a[i],b[i]);
   }

   *c = 0;

   cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
   cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
   cudaMemcpy(dev_c, c, size, cudaMemcpyHostToDevice);

   dot<<< N / THREADS_PER_BLOCK, THREADS_PER_BLOCK >> >(dev_a, dev_b,    dev_c);

   cudaMemcpy(c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);

   printf("%d ", *c);
   printf("%d ", sumTest);

   free(a);
   free(b);
   free(c);

   cudaFree(a);
   cudaFree(b);
   cudaFree(c);

   system("pause");

   return 0;

 }

Answer 1

首先，请按照this legendary post中的说明在代码中添加CUDA错误检查。

在内核执行调用之前，您将在以下行中将额外内存复制到dev_c：

cudaMemcpy(dev_c, c, size, cudaMemcpyHostToDevice);

应该是：

cudaMemcpy(dev_c, c, sizeof(int), cudaMemcpyHostToDevice);

代码中的另一个错误是内核内部，{for循环中的__shared__内存变量temp被访问超出范围。当循环迭代到THREADS_PER_BLOCK时，共享内存的元素数等于N。只需在循环中将N替换为THREADS_PER_BLOCK。

CUDA点积

1 个答案: