Question

大家：最近我试图利用cuda 5.5的最新属性来编程，即动态并行。但我有一些非常令人困惑的问题。我的代码在这里：

    /* Includes, system */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <iostream>
    using namespace std;
    /* Includes, cuda */
    #include <cuda_runtime.h>
    #include <cublas_v2.h>

    /* Includes, cuda helper functions */
    #include <helper_cuda.h>

    #include "kernels.cu"
    /* Matrix size */
    #define N  (275)

    #define LengthSignal (64)

    #define AmountSignal (255025)

    #define NBLOCKX (32768)

    #define NTHREADS_PER_BLOCK (128)
    /* Declaration of the function that computes sgemm using CUBLAS device API */

    __global__ void invokeDeviceCublasSgemm(float *d_A, float *Test);

    /* Main */
    int main(int argc, char **argv)
    {
      float *h_A;
      float *d_A = 0;
      int n2 = N * N;

      h_A = (float *)malloc(n2 * sizeof(h_A[0]));
  /* Fill the matrices with test data */
  for (int i = 0; i < n2; i++)
    {
      h_A[i] = rand() / (float)RAND_MAX;
    }

      cudaMalloc((void **)&d_A, n2 * sizeof(h_A[0]));

      /* Initialize the device matrices with the host matrices */
      //  cudaMemcpy(d_A, h_A, sizeof(float) * LengthSignal * AmountSignal, cudaMemcpyHostToDevice);
      cudaMemcpy(d_A, h_A, n2 * sizeof(h_A[0]), cudaMemcpyHostToDevice);

      int Length = 100;
      float *h_Test = (float *) malloc(sizeof(float) * Length);
      float *d_Test;
      cudaMalloc((void **) &d_Test, sizeof(float) * Length);
      cudaMemset(d_Test, 0, sizeof(float) * Length);

  invokeDeviceCublasSgemm<<<NBLOCKX, NTHREADS_PER_BLOCK>>>(d_A, d_Test);
  cudaMemcpy(h_Test, d_Test, sizeof(float) * Length, cudaMemcpyDeviceToHost);

  printf("\n The first 10 elements of d_A in location 1 are: \n");
  for (int j = 0; j < 10; j ++)
    {
      printf("%f ", h_Test[j]);
    }

  printf("\n The first 10 elements of d_A in location 2 are: \n");
  for (int j = 10; j < 20; j ++)
    {
      printf("%f ", h_Test[j]);
    }
  printf("\n");

  free(h_Test);
  cudaFree(d_Test);

  /* Memory clean up */
  free(h_A);
  cudaFree(d_A);
}

#ifndef __GLOBAL__CU__
#define __GLOBAL__CU__

__global__ void invokeDeviceCublasSgemm(float *d_A, float *Test)
{
  // save the first 10 elements of d_A in location 1
  for (int j = 0; j < 10; j ++)
    {
      Test[j] = d_A[j];
    }
  cublasHandle_t cnpHandle;
  cublasCreate(&cnpHandle);

    // save the first 10 elements of d_A in location 2
    for (int j = 10; j < 20; j ++)
      {
        Test[j] = d_A[j - 10];
      }
  cublasDestroy(cnpHandle);
}

#endif

如果我将配置参数设置为＆lt;＆lt;＆lt;＆lt;＆lt; 1＆gt;＆gt;＆gt;＆gt;，一切正常。输出就是这样：

位置1中d_A的前10个元素是：

0.840188 0.394383 0.783099 0.798440 0.911647 0.197551 0.335223 0.768230 0.277775 0.553970

位置2中d_A的前10个元素是：

0.840188 0.394383 0.783099 0.798440 0.911647 0.197551 0.335223 0.768230 0.277775 0.553970

但是，如果我将配置参数设置为＆lt;＆lt;＆lt;＆lt;＆lt;＆lt;＆lt;＆lt;＆lt;＆lt;＆lt;＆lt;＆lt;＆lt;＆lt;＆lt;＆lt;＆quot;＆nbsp;＆nbsp;＆gt;输出就是这样：

位置1中d_A的前10个元素是：

-0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

位置2中d_A的前10个元素是：

0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

我真的不知道为什么！我的代码只是来自“样本”，稍有改动。

然后我只删除最后一个代码“cublasDestroy（cnpHandle）;”，然后它就变得正常了。输出是：

位置1中d_A的前10个元素是：

0.840188 0.394383 0.783099 0.798440 0.911647 0.197551 0.335223 0.768230 0.277775 0.553970

位置2中d_A的前10个元素是：

0.840188 0.394383 0.783099 0.798440 0.911647 0.197551 0.335223 0.768230 0.277775 0.553970

有人遇到同样的问题吗？

谢谢！

Answer 1

执行一些proper cuda error checking您可以在主机API调用以及设备API调用和CUBLAS API调用（以及内核调用）上执行此操作。如果您不确定，请阅读动态并行度documentation。

您可能随时exceeding the number of kernel launches that can be outstanding。有一个（可配置的）2048内核启动限制，可以很出色。由于您的代码因主机内核启动参数<<<32768, 128>>>而失败，这意味着您尝试启动32768x128个线程，每个线程可能尝试启动子内核。如果内核启动次数超过限制，则剩余的内核启动将失败。

“但我没有启动任何子内核？”实际上，使用设备CUBLAS API意味着内核可能会启动。这就是设备CUBLAS系统的工作方式。

为了真正清晰起见，我再次强烈建议您进行可靠的错误检查。

cublas device api输出奇怪的结果

1 个答案: