I'm new to CUDA and am trying to figure out how to pass a 2D array to a kernel. I have the following working code for a 1-dimensional array:
using System;
using ManagedCuda;

class Program
{
    static void Main(string[] args)
    {
        int N = 10;
        int deviceID = 0;
        CudaContext ctx = new CudaContext(deviceID);
        CudaKernel kernel = ctx.LoadKernel(@"doubleIt.ptx", "DoubleIt");
        kernel.GridDimensions = (N + 255) / 256;
        kernel.BlockDimensions = Math.Min(N, 256);

        // Allocate input vector h_A in host memory
        float[] h_A = new float[N];
        // Initialize input vector h_A
        for (int i = 0; i < N; i++)
        {
            h_A[i] = i;
        }

        // Allocate vectors in device memory and copy vectors from host memory to device memory
        CudaDeviceVariable<float> d_A = h_A;
        CudaDeviceVariable<float> d_C = new CudaDeviceVariable<float>(N);

        // Invoke kernel
        kernel.Run(d_A.DevicePointer, d_C.DevicePointer, N);

        // Copy result from device memory to host memory
        float[] h_C = d_C;
        // h_C contains the result in host memory
    }
}
with the following kernel code:
__global__ void DoubleIt(const float* A, float* C, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N)
        C[i] = A[i] * 2;
}
As I said, everything works fine, but now I want to use a 2D array, like this:
// Allocate input array h_A in host memory
int W = 10;
float[][] h_A = new float[N][];
// Initialize input array h_A
for (int i = 0; i < N; i++)
{
    h_A[i] = new float[W];
    for (int j = 0; j < W; j++)
    {
        h_A[i][j] = i * W + j;
    }
}
I need the whole second dimension to be handled by a single thread, so kernel.BlockDimensions must stay one-dimensional and each kernel thread needs to receive a 1D array of 10 elements.
So my underlying question is: how do I copy this 2D array to the device, and how do I use it inside the kernel? (For the example above there should be 10 threads in total.)
Answer 0 (score: 1)
Short answer: you shouldn't do that...
Long answer: jagged arrays are difficult to handle in general. Instead of one contiguous segment of memory, your data lives in many small segments scattered somewhere around your memory. What happens when you copy the data to the GPU? If you had one large contiguous segment, you would call the cudaMemcpy/CopyToDevice functions and copy the entire block at once. But, just as you allocate a jagged array in a for loop, you would have to copy your data line by line into a CudaDeviceVariable<CUdeviceptr>, where each entry points to a CudaDeviceVariable<float>. In parallel you maintain a host array CudaDeviceVariable<float>[] that manages the CUdeviceptrs on the host side. Copying data is already quite slow in general; doing it this way is probably a real performance killer...
To conclude: if you can, use a flattened array and index the entries with y * DimX + x. Even better on the GPU side, use pitched memory, where the allocation is done so that each line starts at a "nice" address: the index then becomes y * Pitch + x (simplified). The 2D copy methods in CUDA are made for these pitched memory allocations, where each line gets some additional bytes appended.
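As a minimal sketch of the flattened variant (the dimensions and the kernel's parameter list here are assumptions for illustration, not from the question): the host keeps one contiguous float[], so a single copy transfers everything, and the kernel recovers the 2D position via y * dimX + x.
int dimX = 10, dimY = 10;
float[] h_flat = new float[dimY * dimX];
for (int y = 0; y < dimY; y++)
    for (int x = 0; x < dimX; x++)
        h_flat[y * dimX + x] = y * dimX + x;

// One contiguous block, so this is a single fast copy:
CudaDeviceVariable<float> d_flat = h_flat;
// Assumed kernel signature: kernel(float* data, int dimX, int dimY),
// with entries indexed as data[y * dimX + x]:
kernel.Run(d_flat.DevicePointer, dimX, dimY);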
For completeness: in C# you also have two-dimensional arrays like float[,]. You can use those on the host side instead of a flattened 1D array as well. But I wouldn't recommend it: the .net ISO standard does not guarantee that the internal memory is actually contiguous, an assumption that managedCuda has to rely on in order to use these arrays. The current .net framework doesn't do anything weird internally, but who knows whether it will stay that way...
This would implement the jagged-array copy:
float[][] data_h;
CudaDeviceVariable<CUdeviceptr> data_d;
CUdeviceptr[] ptrsToData_h;                   // represents data_d on host side
CudaDeviceVariable<float>[] arrayOfarray_d;   // array of CudaDeviceVariables to manage memory, source for pointers in ptrsToData_h

int sizeX = 512;
int sizeY = 256;

data_h = new float[sizeX][];
arrayOfarray_d = new CudaDeviceVariable<float>[sizeX];
data_d = new CudaDeviceVariable<CUdeviceptr>(sizeX);
ptrsToData_h = new CUdeviceptr[sizeX];
for (int x = 0; x < sizeX; x++)
{
    data_h[x] = new float[sizeY];
    arrayOfarray_d[x] = new CudaDeviceVariable<float>(sizeY);
    ptrsToData_h[x] = arrayOfarray_d[x].DevicePointer;
    //ToDo: init data on host...
}
// Copy the pointers once:
data_d.CopyToDevice(ptrsToData_h);
// Copy data:
for (int x = 0; x < sizeX; x++)
{
    arrayOfarray_d[x].CopyToDevice(data_h[x]);
}
// Call a kernel:
kernel.Run(data_d.DevicePointer /*, other parameters*/);
// kernel in *.cu file:
// __global__ void kernel(float** data_d, ...)
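To connect this back to the question (one thread per row, each thread handling the whole second dimension), here is a sketch of what the kernel hinted at in the last comment could look like; the parameter names, the bounds check, and the doubling operation are my assumptions for illustration.
// Sketch: each thread processes one entire row of the jagged array.
// data_d[row] is a device pointer to that row's CudaDeviceVariable<float> buffer.
extern "C"
__global__ void kernel(float** data_d, int numRows, int rowLength)
{
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    if (row >= numRows)
        return;
    float* line = data_d[row];        // the 1D array this thread works on
    for (int j = 0; j < rowLength; j++)
        line[j] = line[j] * 2;        // e.g. double every element, as in DoubleIt
}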
And this is an example for CudaPitchedDeviceVariable:
int dimX = 512;
int dimY = 512;
float[] array_host = new float[dimX * dimY];
CudaPitchedDeviceVariable<float> arrayPitched_d = new CudaPitchedDeviceVariable<float>(dimX, dimY);
for (int y = 0; y < dimY; y++)
{
    for (int x = 0; x < dimX; x++)
    {
        array_host[y * dimX + x] = x * y;
    }
}
arrayPitched_d.CopyToDevice(array_host);
kernel.Run(arrayPitched_d.DevicePointer, arrayPitched_d.Pitch, dimX, dimY);
// Corresponding kernel:
extern "C"
__global__ void kernel(float* data, size_t pitch, int dimX, int dimY)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= dimX || y >= dimY)
return;
//pointer arithmetic: add y*pitch to char* pointer as pitch is given in bytes,
//which gives the start of line y. Convert to float* and add x, to get the
//value at entry x of line y:
float value = *(((float*)((char*)data + y * pitch)) + x);
*(((float*)((char*)data + y * pitch)) + x) = value + 1;
//Or simpler if you don't like pointers:
float* line = (float*)((char*)data + y * pitch);
float value2 = line[x];
}
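Note that this kernel indexes in two dimensions, so unlike the 1D DoubleIt setup it needs a 2D launch configuration. A possible setup in managedCuda could look like the sketch below; the 16x16 block size is my own arbitrary choice, not part of the answer.
// 2D launch configuration for the pitched kernel; the grid is rounded up
// so that the blocks cover the full dimX x dimY range.
kernel.BlockDimensions = new dim3(16, 16, 1);
kernel.GridDimensions = new dim3((dimX + 15) / 16, (dimY + 15) / 16, 1);
kernel.Run(arrayPitched_d.DevicePointer, arrayPitched_d.Pitch, dimX, dimY);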