atomicAdd(times,1);
而不是*times++
。我这样调用内核函数:
// Launch configuration: a cubic grid with blockSize blocks along each axis,
// and 8 * 8 * 8 = 512 threads in every block.
dim3 Dg(blockSize, blockSize, blockSize);
dim3 Db(8, 8, 8);
voxelize << < Dg, Db >> > ();
// Block the host until the kernel launched above has finished.
cudaDeviceSynchronize();
但是我发现我的程序只能处理一部分问题,因此我在 __global__ 函数 voxelize() 中使用了 printf(),如下面的代码所示:
// Debug kernel: every thread prints the coordinates of the block it belongs to
// and then derives its grid-wide flat thread index.
//
// Launch layout: any 3D grid of 3D blocks; no shared memory is used.
__global__ void voxelize(){
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);

    // Grid-wide coordinates of this thread along each axis.
    const unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    const unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

    // Total number of threads along x and y. Because xIndex/yIndex/zIndex are
    // grid-wide coordinates, the row and slice strides must be the full grid
    // extent (gridDim * blockDim); using blockDim alone would make different
    // threads collide on the same flat index.
    const unsigned int width  = gridDim.x * blockDim.x;
    const unsigned int height = gridDim.y * blockDim.y;
    const unsigned int i = (zIndex * height + yIndex) * width + xIndex;
    (void)i;  // not consumed yet in this debugging version
}
输出仅显示每个维度的最后一部分(即 blockIdx.x 始终为 5,只有一部分 blockIdx.z 从 0 变化到 5)。但我不明白为什么,是我调用此内核函数的方式有什么问题吗?我的计算机装有 GTX 1050 Ti Max-Q 和 CUDA 10。
之后,我向内核传递了一个指针,用来统计内核的运行次数。
// Number of blocks along each grid axis: the cube root of
// (triangle count / 512 threads per block), rounded up so the cubic grid
// contains at least one thread per triangle.
// triangles.size() was 69664 in the reported run.
int blockSize = ceil(pow(triangles.size() /* 69664 */ / 512.0, 1.0 / 3));
dim3 Dg(blockSize, blockSize, blockSize);
dim3 Db(8, 8, 8);

// Host-side counter, initialised to 0, and its device-side copy.
int* times = new int(0);
int* gpu_times;
cudaMalloc((void **)&gpu_times, sizeof(int));
cudaMemcpy(gpu_times, times, sizeof(int), cudaMemcpyHostToDevice);

voxelize << < Dg, Db >> > (gpu_times);
// Wait for the kernel to finish before reading the counter back.
cudaDeviceSynchronize();

cudaMemcpy(times, gpu_times, sizeof(int), cudaMemcpyDeviceToHost);
std::cout << *times << std::endl;

// Release the device and host allocations made above.
cudaFree(gpu_times);
delete times;
内核被修改为
// Debug variant of the kernel: every thread increments a shared counter,
// prints the coordinates of its block, and derives a flat index.
// times: pointer to a single int that every thread increments
//        (presumably device memory; it is dereferenced inside the kernel).
__global__ void voxelize(int* times){
// NOTE(review): this is a plain, non-atomic read-modify-write on one address
// shared by all threads. Concurrent increments can overwrite each other, so
// the final value may be far smaller than the number of threads launched.
// atomicAdd(times, 1) would count every thread exactly once.
(*times)++;
printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);
// Grid-wide coordinates of this thread along each axis.
unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;
// NOTE(review): the strides here are per-block extents (blockDim), while the
// coordinates above are grid-wide, so different threads can produce the same
// i. A unique flat index would use gridDim.x * blockDim.x as the row width
// and gridDim.y * blockDim.y as the number of rows per slice; confirm intent.
unsigned int i = zIndex * blockDim.x*blockDim.y+ yIndex * blockDim.x+ xIndex;
}
输出显示它只运行了 141 次,但实际上这个数字应该远远大于 69664。
应当使用 atomicAdd(times, 1); 而不是 (*times)++。但是为什么 printf() 只输出我之前所述的索引的一部分?
答案 0 :(得分:0)
您需要调用 cudaDeviceSynchronize()(为清楚起见,省略了错误检查);如果您大量使用 printf(本例即是如此),则还需要调用 cudaDeviceSetLimit(...) 来增大设备端 printf 缓冲区:
#include <stdio.h>
// Reproduction kernel: each thread prints its block coordinates and then
// computes its flat index within the whole launch.
//
// Launch layout: any 3D grid of 3D blocks; no shared memory is required.
__global__ void voxelize(){
    printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);

    // Position of this thread inside the whole grid, per axis.
    const unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    const unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

    // Linearise with the full grid extent as stride. The per-block extent
    // (blockDim) is too small for grid-wide coordinates and would map several
    // threads onto the same value.
    const unsigned int threadsAlongX = gridDim.x * blockDim.x;
    const unsigned int threadsAlongY = gridDim.y * blockDim.y;
    const unsigned int i = (zIndex * threadsAlongY + yIndex) * threadsAlongX + xIndex;
    (void)i;  // computed for illustration only; nothing consumes it here
}
int main()
{
    // Device-side printf goes through a FIFO buffer; raise its size to
    // 50 MiB because this launch prints one line per thread.
    const size_t printfFifoBytes = 50 * 1024 * 1024;
    cudaDeviceSetLimit(cudaLimitPrintfFifoSize, printfFifoBytes);

    // 5 x 5 x 5 blocks of 8 x 8 x 8 threads each.
    const dim3 threadsPerBlock(8, 8, 8);
    const dim3 blocksPerGrid(5, 5, 5);
    voxelize<<<blocksPerGrid, threadsPerBlock>>>();

    // Wait for the kernel to complete before the program exits.
    cudaDeviceSynchronize();

    return 0;
}
这将打印如下内容:
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
the thread blockIdx.x 2, blockIdx.y 3 blockIdx.z 4
[...]
然后您可以像这样检查它:
# All threads of a block print the same line, so collapsing duplicates leaves
# one line per block; counting them gives 5*5*5 == 125
$ ./a.out | sort | uniq | wc -l
125
# Counting every line gives one per thread: 5*5*5 blocks * 8*8*8 threads == 64000
$ ./a.out | wc -l
64000
您不能这样做:(*times)++;。多个线程同时递增同一个地址会产生并发问题(数据竞争)。您需要使用原子函数(atomic functions),例如 atomicAdd。