我是CUDA C中的新手......我想在1个块中对数组元素(使用reduce)求和,267个线程使用共享内存。我通过示例读了一本书" CUDA,介绍了GPU编程的通用"。根据她的一些推荐,我写了我的程序版本:
__global__ void
conva(int* a, int* out)
{
__shared__ int cache[534];
int cacheIndex = threadIdx.x;
for(int n=0; n<2;++n) {
cache[cacheIndex+n] = a[cacheIndex+n];
int i = blockDim.x/2;
while (i != 0) {
if (cacheIndex < i)
cache[cacheIndex + n] += cache[cacheIndex + n + i];
__syncthreads();
i /= 2;
}
}
//need or not this __syncthreads(), I don't know
__syncthreads();
if (cacheIndex == 0)
out = &cache[0];
}
int main(int argc, char** argv)
{
//enter array for sum
int convolution[534];
for(int i=0; i<534; ++i)
convolution[i] = 1;
//variable in which we take a sum from device
int summa = 0;
//it we copy on device from host
int* tash;
int* convolution_gpu;
cudaMalloc((void**)(&convolution_gpu), 534*sizeof(int));
cudaMalloc((void**)(&tash), sizeof(int));
cudaMemcpy(convolution_gpu, convolution, 534*sizeof(int), cudaMemcpyHostToDevice );
//call core with 1 block and 267 threads
conva<<<1, 267>>>(convolution_gpu, tash);
cudaMemcpy(&summa, tash, sizeof(int), cudaMemcpyDeviceToHost);
//and here I want 534 but I have garbage(may be)
std::cout<<summa<<std::endl;
cudaFree(convolution_gpu);
cudaFree(tash);
getchar();
}
请告诉我,这里有错误并帮我解决她... (对不起我的英文)
答案 0 :(得分:1)
在你的内核中,这个:
System.out.print("Name: ");
几乎肯定是错的。当然你想要这样的东西:
try (Scanner sc = new Scanner(System.in)) {
System.out.print("Name: ");
while (sc.hasNextLine()) {
String name = sc.nextLine();
System.out.println("Name is \"" + name + "\"");
}
}