I'm just starting out with CUDA and am trying a simple example: I pass two arrays to a __global__ function, copy one into the other, and return the second one.
I have:
__global__
void add(int n, int *tri, int *y)
{
    int index = threadIdx.x;
    int stride = blockDim.x;
    for (int i = index; i < n; i += stride)
        y[i] = tri[i];
}
and
//local copy of data
int *tri2 = tri; // data checked, and is valid
int *y = new int[width * height]; // same size as `tri`
int N = width * height;
// Allocate Unified Memory – accessible from CPU or GPU
cudaMallocManaged(&tri2, N * sizeof(int));
cudaMallocManaged(&y, N * sizeof(int));
// initialize y array on the host
for (int i = 0; i < N; i++) {
    y[i] = 2;
}
// Run kernel on the GPU
add<<<1, 256>>>(N, tri2, y);
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
//copy back to host
int i = 0;
int f = -999; /* CPU copy of value */
cudaMemcpy(&f, &y[i], sizeof(int), cudaMemcpyDeviceToHost);
std::cout << "back: " << f << std::endl;
std::cout << "orig: " << tri[i] << std::endl;
The orig value is 128, the same as it was going in, but the returned f value is always 0. What am I missing?
Answer 0 (score: 1):
The values in the array tri are not the same as the values in the array tri2.
With
cudaMallocManaged(&tri2, N * sizeof(int));
you allocate new memory on the device and overwrite the pointer tri2; I assume this new memory happens to be zero-filled. In the kernel you then copy this zero array into y, and the values of the array tri are never copied at all.
Here is an example of how you could do it instead:
int* tri = ....
int* tri_managed;
//allocate new managed memory and save pointer in tri_managed
cudaMallocManaged(&tri_managed, sizeof(int) * N);
//now copy entries of tri to tri_managed
for (int i = 0; i < N; i++)
    tri_managed[i] = tri[i];
int* y;
cudaMallocManaged(&y, N * sizeof(int));
// initialize y array
for (int i = 0; i < N; i++) {
    y[i] = 2;
}
//copy entries of tri_managed to y
add<<<1, 256>>>(N, tri_managed, y);
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
//copy back to host
int i = 0;
int f = -999; /* CPU copy of value */
//cudaMemcpy(&f, &y[i], sizeof(int), cudaMemcpyDeviceToHost);
//since managed memory is accessible on both host and device, we can just do this
f = y[i];
std::cout << "back: " << f << std::endl;
std::cout << "orig: " << tri[i] << std::endl;
//don't forget to free memory after usage
cudaFree(tri_managed);
cudaFree(y);
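As a side note, instead of the host-side copy loop you could also fill the managed buffer with cudaMemcpy and add basic error checking around the allocations and the kernel launch. The following is only a minimal sketch of that variant: it assumes tri, width and height exist as in the question, and the error handling is only indicated by comments.

int N = width * height;
int *tri_managed = nullptr;
int *y = nullptr;

// allocate managed memory and check the result
cudaError_t err = cudaMallocManaged(&tri_managed, N * sizeof(int));
if (err != cudaSuccess) { /* handle allocation failure */ }
err = cudaMallocManaged(&y, N * sizeof(int));
if (err != cudaSuccess) { /* handle allocation failure */ }

// managed memory can be written directly on the host, but cudaMemcpy
// also works for pulling in the existing host array tri
cudaMemcpy(tri_managed, tri, N * sizeof(int), cudaMemcpyDefault);

add<<<1, 256>>>(N, tri_managed, y);
err = cudaGetLastError();    // reports kernel launch errors
if (err != cudaSuccess) { /* handle launch failure */ }
cudaDeviceSynchronize();     // wait for the kernel before reading y on the host

std::cout << "back: " << y[0] << std::endl;
std::cout << "orig: " << tri[0] << std::endl;

cudaFree(tri_managed);
cudaFree(y);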