我正在尝试实现一个Cuda内核以将其直接与Jcuda一起使用。
我想做两个元素为N的向量的向量点积。我已经在线上关注了一些教程,并且能够编写以下代码。我不知道代码有什么问题,但是我无法将结果从GPU记录到主机内存中。错误在
行中`
int ThreadPerBlock = 256;
int blockPerGrid = Math.max(32 , ((int) n+ThreadPerBlock-1) / ThreadPerBlock );
float hostResult[] = new float[blockPerGrid];
cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, blockPerGrid * Sizeof.FLOAT);
`
extern "C"
__global__ void Vector_Dot_Product (size_t N, float *V1 , float *V2 , float *V3 )
{
const int ThreadPerBlock = 256;
__shared__ float chache[ThreadPerBlock] ;
float temp ;
unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x ;
unsigned int chacheindex = threadIdx.x ;
while ( tid < N )
{
temp += V1[tid] * V2[tid] ;
tid += blockDim.x * gridDim.x ;
}
chache[chacheindex] = temp ;
__syncthreads () ;
int i = blockDim.x / 2 ;
while ( i!=0 )
{
if ( chacheindex < i )
chache[chacheindex] += chache [chacheindex + i] ;
__syncthreads () ;
i/=2 ;
}
if ( chacheindex == 0 )
V3[blockIdx.x] = chache [0] ;
}
这是我在内核上调用的Java函数。
public static float[] VecDotProd(long n, float[] x, float[] y){
// Define the values of blockPerGrid.
int ThreadPerBlock = 256;
int blockPerGrid = Math.max(32 , ((int) n+ThreadPerBlock-1) / ThreadPerBlock );
// Enable exceptions and omit all subsequent error checks
JCudaDriver.setExceptionsEnabled(true);
// Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
// Afterwards, initialize the vector library, which will
// attach to the current context
VecMatOperation.init();
// Allocate the device pointers, and copy the
// host input data to the device
CUdeviceptr deviceX = new CUdeviceptr();
cuMemAlloc(deviceX, n * Sizeof.FLOAT);
cuMemcpyHtoD(deviceX, Pointer.to(x), n * Sizeof.FLOAT);
CUdeviceptr deviceY = new CUdeviceptr();
cuMemAlloc(deviceY, n* Sizeof.FLOAT);
cuMemcpyHtoD(deviceY, Pointer.to(y), n * Sizeof.FLOAT);
CUdeviceptr deviceResult = new CUdeviceptr();
cuMemAlloc(deviceResult, blockPerGrid * Sizeof.FLOAT);
// Calling the add function to perform vector addition.
VecMatOperation.VecDotProd(n, deviceX , deviceY, deviceResult);
// Allocate host output memory and copy the device output
// to the host.
float hostResult[] = new float[blockPerGrid];
// System.out.println(Sizeof(deviceResult));
cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, blockPerGrid * Sizeof.FLOAT);
cuMemFree(deviceX);
cuMemFree(deviceY);
cuMemFree(deviceResult);
VecMatOperation.shutdown();
return hostResult;
}
我创建了一个接口,该接口使用call函数从Cuda内核中调用实际函数。但是,从Java代码中,您可以假定它只是简单的调用,其中包含传递的参数,而没有其他内容。 完整的代码真的很大,在这里我无法显示出来,这就是为什么我在出错时放下内核代码和代码片段的原因。错误所在
cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, blockPerGrid * Sizeof.FLOAT);
它说由于两者的大小不匹配,无法将deviceResult写入hostResult。我尝试从内核代码打印结果值,它具有49个值,这些值不是hostResult的大小。但是我尝试使用49作为hostResult数组的大小,但仍然遇到相同的错误。所以以某种方式我无法将结果写回到CPU。写下
GPU的结果大小应该是多少?如果有什么问题,代码中有什么问题?