我正在努力做标题所说的。进行此计算:A * X + Y,其中A是int
。
以下是代码:
#include <stdio.h>
#define N 1024
__global__ void calculate(int A, int X, int Y, int S) {
int tID = blockIdx.x;
S[tID] = A*X + Y;
}
int main(int argc, char *argv[]) {
int A, S[N], X[N], Y[N], i;
int dev_A, *dev_S=0, *dev_X=0, *dev_Y=0;
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaMalloc((void **) &dev_S, N*sizeof(int));
cudaMalloc((void **) &dev_X, N*sizeof(int));
cudaMalloc((void **) &dev_Y, N*sizeof(int));
cudaEventRecord(start, 0);
for(i=0; i<N; i++) {
X[i] = i;
Y[i] = i;
}
cudaMemcpy(&dev_A, &A, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(&dev_X, X, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(&dev_Y, Y, N*sizeof(int), cudaMemcpyHostToDevice);
calculate<<<N,1>>>(dev_A, *dev_X, *dev_Y, *dev_S);
cudaMemcpy(S, dev_S, N*sizeof(int), cudaMemcpyDeviceToHost);
printf("Array X:\n");
for(i=0; i<N; i++) {
printf("%d\n", X[i]);
}
printf("Array Y:\n");
for(i=0; i<N; i++) {
printf("%d\n", Y[i]);
}
printf("Array S:\n");
for(i=0; i<N; i++) {
printf("%d\n", S[i]);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed Time: %f\n", time);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaFree(dev_S);
cudaFree(dev_X);
cudaFree(dev_Y);
return 0;
}
一旦我用nvcc编译,我就会收到这个错误:
(7): error: expression must have pointer-to-object type
1 error detected in the compilation of "/tmp/tmpxft_00003d10_00000000-6_saxy.cpp1.ii".
我是CUDA的新手,刚开始。你能帮帮忙吗?非常感谢!
答案 0 :(得分:1)
S[tID]
是一个整数,您无法将其作为数组进行访问。这不是CUDA的错。
答案 1 :(得分:0)
您的内核代码稍有不妥。正如David Kernin所说,你使用的是整数。但是你应该使用整数数组。此外,对于tID
,您必须使用threadIdx.x
和不 blockIdx.x
。或者以更通用的形式:
__global__ void calculate(int A, int *X, int *Y, int *S) {
int tID = blockIdx.x * blockDim.x + threadIdx.x;
S[tID] = A*X[tID] + Y[tID];
}
这将允许您处理大于threadIdx.x
的最大值的矩阵,因为即使使用多个块也可以通过调用
calculate<<<numThreadsPerBlock,numBlocks>>>(A, dev_X, dev_Y, dev_S);