我正在尝试在C / CUDA中实现dotproduct。我主要复制了Nvidias教程中的代码:http://www.nvidia.com/content/gtc-2010/pdfs/2131_gtc2010.pdf
我想要的结果是输出
*c = 44870400
result = 44870400
但我得到
*c = 44608256
result = 44870400
似乎是“511 * 511案例”不是计算结果的一部分。我上下检查了代码,我甚至找不到同步错误。我在这里做错了什么?
编译标志是:
cuda_dotp: ./cuda_dotp.cu
nvcc -arch=sm_13 \
-o cuda_dotp ./cuda_dotp.cu
和文件cuda_dotp.cu
的内容#include <stdio.h>
#include <cuda.h>
#define N 513
#define THREADS_PER_BLOCK 512
__global__ void dot(int *a, int *b, int *c) {
__shared__ int temp[THREADS_PER_BLOCK];
int index = threadIdx.x + blockIdx.x * blockDim.x;
temp[threadIdx.x] = a[index] * b[index];
if (index >= N) return;
__syncthreads();
if(0 == threadIdx.x) {
int sum = 0;
int max = THREADS_PER_BLOCK;
if (N < max) max = N;
for (int i = 0; i < max; i++) {
sum += temp[i];
}
c[0] = sum;
}
}
void random_ints(int *a, int size)
{
int i;
for (i=0; i<size; i++)
a[i] = i;
return;
}
int main(void) {
int i;
int result;
int *a, *b, *c; // host copies of a, b, c
int *dev_a, *dev_b, *dev_c; // device copies of a, b, c
int size = N * sizeof(int); // we need space for N ints
// allocate device copies of a, b, c
cudaMalloc( (void**)&dev_a, size );
cudaMalloc( (void**)&dev_b, size );
cudaMalloc( (void**)&dev_c, sizeof(int) );
a = (int*)malloc( size );
b = (int*)malloc( size );
c = (int*)malloc( sizeof(int) );
random_ints( a, N );
random_ints( b, N );
/*
printf("a = ");
for (i=0; i<N; i++) printf("%d, ", a[i]);
printf("\n");
printf("b = ");
for (i=0; i<N; i++) printf("%d, ", b[i]);
printf("\n");
*/
result = 0;
for (i=0; i<N; i++) result += a[i] * b[i];
*c = 0;
// copy inputs to device
cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice);
int blocks = N/THREADS_PER_BLOCK;
if(blocks<1) blocks=1;
// launch dot() kernel
dot <<< blocks, THREADS_PER_BLOCK >>> (dev_a, dev_b, dev_c);
// copy device result back to host copy of c
cudaMemcpy(c, dev_c, sizeof(int) , cudaMemcpyDeviceToHost);
printf("*c = %d\n", *c);
printf("result = %d\n", result);
free(a); free(b); free(c);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
答案 0 :(得分:2)
有很多错误。 访问已分配空间之外的数组,运行0个块((int)10 /(int)512 = 0),在内核中添加c之前不初始化c。
将您的代码与以下内容进行比较。
#include <stdio.h>
#include <cuda.h>
#define N 10
#define THREADS_PER_BLOCK 512
__global__ void dot(int *a, int *b, int *c) {
__shared__ int temp[THREADS_PER_BLOCK];
int index = threadIdx.x + blockIdx.x * blockDim.x;
temp[threadIdx.x] = a[index] * b[index];
if(index>=N) return;
__syncthreads();
if(0 == threadIdx.x) {
int sum = 0;
int max= THREADS_PER_BLOCK;
if(N<max)max=N;
for(int i = 0; i < max; i++){
sum += temp[i];
}
c[0]=sum;
}
}
void random_ints(int *a, int size)
{
int i;
for (i=0; i<size; i++)
a[i] = i;
return;
}
int main(void) {
int i;
int *a, *b, *c; // host copies of a, b, c
int *dev_a, *dev_b, *dev_c; // device copies of a, b, c
int size = N * sizeof(int); // we need space for N ints
// allocate device copies of a, b, c
cudaMalloc( (void**)&dev_a, size );
cudaMalloc( (void**)&dev_b, size );
cudaMalloc( (void**)&dev_c, sizeof(int) );
a = (int*)malloc( size );
b = (int*)malloc( size );
c = (int*)malloc( sizeof(int) );
random_ints( a, N );
random_ints( b, N );
printf("a = ");
for (i=0; i<N; i++) printf("%d, ", a[i]);
printf("\n");
printf("b = ");
for (i=0; i<N; i++) printf("%d, ", b[i]);
printf("\n");
*c = 0;
// copy inputs to device
cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice);
int blocks = N/THREADS_PER_BLOCK;
if(blocks<1) blocks=1;
// launch dot() kernel
dot<<< blocks,THREADS_PER_BLOCK>>>( dev_a, dev_b, dev_c);
// copy device result back to host copy of c
cudaMemcpy(c, dev_c, sizeof(int) , cudaMemcpyDeviceToHost);
printf("*c = %d\n", *c);
free(a); free(b); free(c);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
答案 1 :(得分:1)
最后这解决了这个问题。这些变化可以在上述帖子的评论中找到。
#include <stdio.h>
#include <cuda.h>
#define N 4096
#define THREADS_PER_BLOCK 512
__global__ void dot(int *a, int *b, int *c) {
__shared__ int temp[THREADS_PER_BLOCK];
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index >= N) return;
temp[threadIdx.x] = a[index] * b[index];
__syncthreads();
if (0 == threadIdx.x) {
int sum = 0;
int max = THREADS_PER_BLOCK;
if (N < max) max = N;
for (int i = 0; i < max; i++) {
sum += temp[i];
}
//c[0] = sum;
atomicAdd(c, sum);
}
}
void random_ints(int *a, int size)
{
int i;
for (i=0; i<size; i++)
a[i] = i;
return;
}
int main(void) {
int i;
int result;
int *a, *b, *c; // host copies of a, b, c
int *dev_a, *dev_b, *dev_c; // device copies of a, b, c
int size = N * sizeof(int); // we need space for N ints
// allocate device copies of a, b, c
cudaMalloc( (void**)&dev_a, size );
cudaMalloc( (void**)&dev_b, size );
cudaMalloc( (void**)&dev_c, sizeof(int) );
a = (int*)malloc( size );
b = (int*)malloc( size );
c = (int*)malloc( sizeof(int) );
random_ints( a, N );
random_ints( b, N );
/*
printf("a = ");
for (i=0; i<N; i++) printf("%d, ", a[i]);
printf("\n");
printf("b = ");
for (i=0; i<N; i++) printf("%d, ", b[i]);
printf("\n");
*/
result = 0;
for (i=0; i<N; i++) result += a[i] * b[i];
*c = 0;
// copy inputs to device
cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice);
int blocks = (int)(N/THREADS_PER_BLOCK) + 1; // ceil(...)
//if(blocks<1) blocks=1;
// launch dot() kernel
dot <<< blocks, THREADS_PER_BLOCK >>> (dev_a, dev_b, dev_c);
// copy device result back to host copy of c
cudaMemcpy(c, dev_c, sizeof(int) , cudaMemcpyDeviceToHost);
printf("*c = %d\n", *c);
printf("result = %d\n", result);
free(a); free(b); free(c);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}