Question

我正在尝试实现一个Cuda内核以将其直接与Jcuda一起使用。

我想做两个元素为N的向量的向量点积。我已经在线上关注了一些教程，并且能够编写以下代码。我不知道代码有什么问题，但是我无法将结果从GPU记录到主机内存中。错误在

行中

`

int ThreadPerBlock = 256;
  int blockPerGrid = Math.max(32 , ((int) n+ThreadPerBlock-1) / ThreadPerBlock );
 float hostResult[] = new float[blockPerGrid];
 cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, blockPerGrid * Sizeof.FLOAT);

`

extern "C"
__global__ void Vector_Dot_Product (size_t N, float *V1 , float *V2 , float *V3   )
{
    const int ThreadPerBlock = 256;
 __shared__ float chache[ThreadPerBlock] ;

    float temp ;

unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x ;

unsigned int chacheindex = threadIdx.x ;

 while ( tid < N )
 {
      temp += V1[tid] * V2[tid] ;

      tid += blockDim.x * gridDim.x ;
 }

  chache[chacheindex] = temp ;

  __syncthreads () ;

 int i  = blockDim.x / 2 ;

while ( i!=0 )
 {

  if ( chacheindex < i )
         chache[chacheindex] += chache [chacheindex + i] ;

         __syncthreads () ;

   i/=2 ;
 }

  if ( chacheindex == 0 )
         V3[blockIdx.x] = chache [0] ;


}

这是我在内核上调用的Java函数。

public static float[] VecDotProd(long n, float[] x, float[] y){
        // Define the values of blockPerGrid.
        int ThreadPerBlock = 256;
        int blockPerGrid = Math.max(32 , ((int) n+ThreadPerBlock-1) / ThreadPerBlock );

        // Enable exceptions and omit all subsequent error checks
        JCudaDriver.setExceptionsEnabled(true);

        // Initialize the driver and create a context for the first device.
        cuInit(0);
        CUdevice device = new CUdevice();
        cuDeviceGet(device, 0);
        CUcontext context = new CUcontext();
        cuCtxCreate(context, 0, device);

        // Afterwards, initialize the vector library, which will
        // attach to the current context
        VecMatOperation.init();

        // Allocate the device pointers, and copy the
        // host input data to the device
        CUdeviceptr deviceX = new CUdeviceptr();
        cuMemAlloc(deviceX, n * Sizeof.FLOAT);
        cuMemcpyHtoD(deviceX, Pointer.to(x), n * Sizeof.FLOAT);

        CUdeviceptr deviceY = new CUdeviceptr();
        cuMemAlloc(deviceY, n* Sizeof.FLOAT);
        cuMemcpyHtoD(deviceY, Pointer.to(y), n * Sizeof.FLOAT);

        CUdeviceptr deviceResult = new CUdeviceptr();
        cuMemAlloc(deviceResult, blockPerGrid * Sizeof.FLOAT);

        // Calling the add function to perform vector addition.
        VecMatOperation.VecDotProd(n, deviceX , deviceY, deviceResult);

        // Allocate host output memory and copy the device output
        // to the host.
        float hostResult[] = new float[blockPerGrid];
//        System.out.println(Sizeof(deviceResult));
        cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, blockPerGrid * Sizeof.FLOAT);

        cuMemFree(deviceX);
        cuMemFree(deviceY);
        cuMemFree(deviceResult);
        VecMatOperation.shutdown();

        return hostResult;
    }

我创建了一个接口，该接口使用call函数从Cuda内核中调用实际函数。但是，从Java代码中，您可以假定它只是简单的调用，其中包含传递的参数，而没有其他内容。完整的代码真的很大，在这里我无法显示出来，这就是为什么我在出错时放下内核代码和代码片段的原因。错误所在

cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, blockPerGrid * Sizeof.FLOAT);

它说由于两者的大小不匹配，无法将deviceResult写入hostResult。我尝试从内核代码打印结果值，它具有49个值，这些值不是hostResult的大小。但是我尝试使用49作为hostResult数组的大小，但仍然遇到相同的错误。所以以某种方式我无法将结果写回到CPU。写下

GPU的结果大小应该是多少？如果有什么问题，代码中有什么问题？

代码中结果的大小将是多少？代码有什么问题？

0 个答案: