最近我使用CUDA编程,当blockNum超过500时我遇到了一个不可思议的问题。为了简化模式,我编写了以下测试代码:
#include <assert.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
/* Example from "Introduction to CUDA C" from NVIDIA website:
https://developer.nvidia.com/cuda-education
Compile with:
$ nvcc example_intro.cu */
#define num 1000
const int N = num*32*12;
__global__ void add_blocks (int *a, int *c) {
int threadId = blockIdx.x * blockDim.x * blockDim.y
+ threadIdx.y * blockDim.x + threadIdx.x;
int block_id = threadIdx.y;
if(threadId % 2 == 0){
c[threadId] = 1;
}
}
int main(void) {
int *a, *c;
int *d_a, *d_c; /* Device (GPU) copies of a, b, c */
size_t size = N * sizeof(int);
/* Allocate memory in device */
cudaMalloc((void **) &d_a, size);
cudaMalloc((void **) &d_c, size);
/* Allocate memory in host */
a = (int *) malloc(size);
c = (int *) malloc(size);
/* Allocate random data in vectors a and b (inside host) */
for (int i = 0; i < N; ++i) {
a[i] = 0;
c[i] = 0;
}
/* Copy data to device */
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
dim3 threads_per_block(32, 12);
add_blocks<<<num, threads_per_block>>>(d_a,d_c);
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
cudaError_t errSync = cudaGetLastError();
if (errSync != cudaSuccess)
printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
int counter = 0;
for (int i = 0; i < N; ++i) {
if(c[i] == 1){
counter ++;
}
}
printf("%d\n",counter);
/* Clean-up */
free(a);
free(c);
cudaFree(d_a);
cudaFree(d_c);
return 0;
}
当线程数为2的倍数时,我将c数组设置为1,最后我计算数字1,我认为是N / 2。当块数低于500时,它运行良好,例如是num * 32 * 12/2 = 500 * 32 * 12/2 = 96 000.但是当num为1000时,结果是312846,应该是192000.任何人都可以帮助我?谢谢大家。
答案 0 :(得分:2)
问题在于此代码:
int counter = 0;
for (int i = 0; i < N; ++i) {
if(c[i] == 1){
counter ++;
}
}
printf("%d\n",counter);
您隐含地假设c
中的每个值都必须由先前的GPU内核设置。但是,你根本没有设置d_c
中的一半元素的值(因此在程序的这一点上c
),所以不能保证其中一些元素也不会有值为1.读取和使用单位化内存的值并不令人惊讶,这只是糟糕的编程习惯。