openCL CL_OUT_OF_RESOURCES error

Date: 2015-06-07 17:46:09

Tags: opencl odroid mali

I am trying to convert code written in CUDA to OpenCL and have run into trouble. My end goal is to implement the code on an Odroid XU3 board with a Mali T628 GPU.

To simplify the conversion and save time debugging OpenCL kernels, I followed these steps:

  1. Implement the code in CUDA and test it on an Nvidia GeForce 760.
  2. Implement the code in OpenCL and test it on an Nvidia GeForce 760.
  3. Test the OpenCL code on the Odroid XU3 board with the Mali T628 GPU.

    I know that different architectures may have different optimizations, but that is not my main concern right now. The OpenCL code runs on my Nvidia GPU with no apparent problems, but I keep getting strange errors when trying to run it on the Odroid board. I know that different architectures handle exceptions and the like differently, but I am not sure how to resolve these issues.

    Since the OpenCL code works on my Nvidia GPU, I assume I made the correct translation from threads/blocks to work-items/work-groups, etc. I have already fixed several issues related to the cl_device_max_work_group_size limit, so that cannot be the cause.
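
    For reference, the kind of host-side check involved looks roughly like this (a minimal sketch only; the function and handle names are placeholders, not my actual host code):

        #include <stdio.h>
        #include <CL/cl.h>

        /* Sketch: verify that the requested local size fits the per-kernel limit.
           'device' and 'kernel' are assumed to be valid handles from the host setup. */
        static void check_work_group_limits(cl_device_id device, cl_kernel kernel,
                                            size_t localW, size_t localH)
        {
            size_t devMax = 0, kernMax = 0;
            clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                            sizeof(devMax), &devMax, NULL);
            clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
                                     sizeof(kernMax), &kernMax, NULL);
            printf("device max: %zu, kernel max: %zu, requested: %zu\n",
                   devMax, kernMax, localW * localH);   /* 8 x 32 = 256 in my case */
            if (localW * localH > kernMax)
                fprintf(stderr, "local work-group size too large for this kernel\n");
        }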

    When running the code I get a "CL_OUT_OF_RESOURCES" error. I have narrowed the cause of the error down to two lines in the code, but I am not sure how to fix these issues.

    The error is caused by the following lines:

    1. lowestDist[pixelNum] = partialDiffSumTemp; Both variables are private to the kernel, so I do not see any potential problem here.
    2. d_disparityLeft[globalMemIdx + TILE_BOUNDARY_WIDTH - WINDOW_RADIUS + 0] = bestDisparity[0]; Here I suspect an out-of-bounds access, but I do not know how to debug it, since the original CUDA code works without any problem.

    My kernel code is:

      #define ALIGN_IMAGE_WIDTH          64
      #define NUM_PIXEL_PER_THREAD        4
      
      #define MIN_DISPARITY               0  
      #define MAX_DISPARITY              55  
      
      #define WINDOW_SIZE                19 
      #define WINDOW_RADIUS              (WINDOW_SIZE / 2)   
      
      #define TILE_SHARED_MEM_WIDTH      96                       
      #define TILE_SHARED_MEM_HEIGHT     32
      #define TILE_BOUNDARY_WIDTH        64
      #define TILE_BOUNDARY_HEIGHT       (2 * WINDOW_RADIUS)
      
      #define BLOCK_WIDTH                (TILE_SHARED_MEM_WIDTH  - TILE_BOUNDARY_WIDTH) 
      #define BLOCK_HEIGHT               (TILE_SHARED_MEM_HEIGHT - TILE_BOUNDARY_HEIGHT)  
      
      #define THREAD_NUM_WIDTH            8
      #define THREADS_NUM_HEIGHT         TILE_SHARED_MEM_HEIGHT
      
       //TODO fix input arguments
      __kernel void hello_kernel( __global unsigned char*  d_leftImage,
                                  __global unsigned char*  d_rightImage,
                                  __global float* d_disparityLeft) {
      
          int blockX      = get_group_id(0);
          int blockY      = get_group_id(1);
          int threadX     = get_local_id(0);
          int threadY     = get_local_id(1);
      
          __local unsigned char leftImage      [TILE_SHARED_MEM_WIDTH * TILE_SHARED_MEM_HEIGHT];
          __local unsigned char rightImage     [TILE_SHARED_MEM_WIDTH * TILE_SHARED_MEM_HEIGHT];
          __local unsigned int  partialDiffSum [BLOCK_WIDTH           * TILE_SHARED_MEM_HEIGHT];
      
          int alignedImageWidth = 640;
          int partialDiffSumTemp;
          float bestDisparity[4] = {0,0,0,0};
          int lowestDist[4];
              lowestDist[0] = 214748364;
              lowestDist[1] = 214748364;
              lowestDist[2] = 214748364;
              lowestDist[3] = 214748364;
      
          // Read image blocks into shared memory. read is done at 32bit integers on a uchar array. each thread reads 3 integers(12byte) 96/12=8threads
          int sharedMemIdx = threadY * TILE_SHARED_MEM_WIDTH + 4 * threadX; 
          int globalMemIdx = (blockY * BLOCK_HEIGHT + threadY) * alignedImageWidth + blockX * BLOCK_WIDTH + 4 * threadX; 
      
          for (int i = 0; i < 4; i++) {
              leftImage [sharedMemIdx                        + i ] = d_leftImage [globalMemIdx                        + i];
              leftImage [sharedMemIdx + 4 * THREAD_NUM_WIDTH + i ] = d_leftImage [globalMemIdx + 4 * THREAD_NUM_WIDTH + i];
              leftImage [sharedMemIdx + 8 * THREAD_NUM_WIDTH + i ] = d_leftImage [globalMemIdx + 8 * THREAD_NUM_WIDTH + i];
              rightImage[sharedMemIdx                        + i ] = d_rightImage[globalMemIdx                        + i];
              rightImage[sharedMemIdx + 4 * THREAD_NUM_WIDTH + i ] = d_rightImage[globalMemIdx + 4 * THREAD_NUM_WIDTH + i];
              rightImage[sharedMemIdx + 8 * THREAD_NUM_WIDTH + i ] = d_rightImage[globalMemIdx + 8 * THREAD_NUM_WIDTH + i];
          }
      
          barrier(CLK_LOCAL_MEM_FENCE);
      
          int imageIdx = sharedMemIdx + TILE_BOUNDARY_WIDTH - WINDOW_RADIUS;
          int partialSumIdx = threadY * BLOCK_WIDTH + 4 * threadX;
      
          for(int dispLevel = MIN_DISPARITY; dispLevel <= MAX_DISPARITY; dispLevel++) {
      
              // horizontal partial sum
              partialDiffSumTemp = 0;
              #pragma unroll
              for(int i = imageIdx - WINDOW_RADIUS; i <= imageIdx + WINDOW_RADIUS; i++) {
                          //partialDiffSumTemp += calcDiff(leftImage [i], rightImage[i - dispLevel]);
                            partialDiffSumTemp += abs(leftImage[i] - rightImage[i - dispLevel]);
              }
              partialDiffSum[partialSumIdx] = partialDiffSumTemp;
      
              barrier(CLK_LOCAL_MEM_FENCE);
      
              for (int pixelNum = 1, i = imageIdx - WINDOW_RADIUS; pixelNum < NUM_PIXEL_PER_THREAD; pixelNum++, i++) {
                  partialDiffSum[partialSumIdx + pixelNum] = partialDiffSum[partialSumIdx + pixelNum - 1] + 
                                                             abs(leftImage[i + WINDOW_SIZE] - rightImage[i - dispLevel + WINDOW_SIZE]) -
                                                             abs(leftImage[i]               - rightImage[i - dispLevel]);
              }
      
              barrier(CLK_LOCAL_MEM_FENCE);
      
              // vertical sum
              if(threadY >= WINDOW_RADIUS && threadY < TILE_SHARED_MEM_HEIGHT - WINDOW_RADIUS) {
      
                  for (int pixelNum = 0; pixelNum < NUM_PIXEL_PER_THREAD; pixelNum++) {
                      int rowIdx = partialSumIdx - WINDOW_RADIUS * BLOCK_WIDTH;
                      partialDiffSumTemp = 0;
      
                          for(int i = -WINDOW_RADIUS; i <= WINDOW_RADIUS; i++,rowIdx += BLOCK_WIDTH) {
                                 partialDiffSumTemp += partialDiffSum[rowIdx + pixelNum];
                          }
      
                          if (partialDiffSumTemp < lowestDist[pixelNum]) {
                              lowestDist[pixelNum]    = partialDiffSumTemp;
                              bestDisparity[pixelNum] = dispLevel - 1;
                          }
      
      
                  }
              }
      
          }
      
          if (threadY >= WINDOW_RADIUS && threadY < TILE_SHARED_MEM_HEIGHT - WINDOW_RADIUS && blockY < 32) {
      
              d_disparityLeft[globalMemIdx + TILE_BOUNDARY_WIDTH - WINDOW_RADIUS + 0] = bestDisparity[0];
              d_disparityLeft[globalMemIdx + TILE_BOUNDARY_WIDTH - WINDOW_RADIUS + 1] = bestDisparity[1];
              d_disparityLeft[globalMemIdx + TILE_BOUNDARY_WIDTH - WINDOW_RADIUS + 2] = bestDisparity[2];
              d_disparityLeft[globalMemIdx + TILE_BOUNDARY_WIDTH - WINDOW_RADIUS + 3] = bestDisparity[3];
          }
      
      }
      

Thanks for all the help,

Yuval

1 Answer:

Answer 0 (score: 0)

In my experience, Nvidia GPUs do not always crash on out-of-bounds accesses; in many cases the kernel still returns the expected results.

Use printf to check the indices. If you have the Nvidia OpenCL 1.2 driver installed, printf should be available as a core function. As far as I can tell, the Mali-T628 supports OpenCL 1.1, so check whether printf is available there as a vendor extension. You can also run your kernel on an AMD/Intel CPU, where printf is available (OpenCL 1.2 / 2.0).
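
For example, a minimal printf-based index check could look like the toy kernel below (not your kernel; the cl_arm_printf pragma may or may not be required depending on the Mali driver, and 'len'/'offset' are illustrative arguments):

    // Toy kernel: report any index that would fall outside the output buffer
    // instead of writing to it.
    #pragma OPENCL EXTENSION cl_arm_printf : enable   // on OpenCL 1.2 printf is core

    __kernel void check_store(__global float* out, int len, int offset)
    {
        int gid = get_global_id(0);
        int idx = gid + offset;   // stands in for globalMemIdx + TILE_BOUNDARY_WIDTH - WINDOW_RADIUS
        if (idx < 0 || idx >= len)
            printf("out-of-bounds: gid=%d idx=%d len=%d\n", gid, idx, len);
        else
            out[idx] = (float)gid;
    }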

An alternative way to check the indices is to pass in a __global int* debug array, store the indices in it, and then inspect them on the host. Make sure to allocate it large enough so that out-of-range indices are still recorded.
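
A minimal sketch of that approach (the kernel, buffer layout and names are only illustrative): each work-item records the index it is about to use, and the host scans the buffer afterwards.

    // Kernel-side sketch: record the computed index into a debug buffer
    // instead of relying on the device to fault on a bad store.
    __kernel void record_indices(__global float* out, __global int* debug,
                                 int len, int offset)
    {
        int gid = get_global_id(0);
        int idx = gid + offset;
        debug[gid] = idx;                  // always record, even when out of range
        if (idx >= 0 && idx < len)
            out[idx] = (float)gid;         // only write when the index is safe
    }

On the host, after the kernel has finished (queue, debugBuf, globalSize and outLen come from the usual setup):

    /* Host-side sketch: read the debug buffer back and flag out-of-range indices.
       Requires <stdio.h>, <stdlib.h>, <CL/cl.h>. */
    int *dbg = (int *)malloc(globalSize * sizeof(int));
    clEnqueueReadBuffer(queue, debugBuf, CL_TRUE, 0,
                        globalSize * sizeof(int), dbg, 0, NULL, NULL);
    for (size_t i = 0; i < globalSize; i++)
        if (dbg[i] < 0 || dbg[i] >= outLen)
            printf("work-item %zu produced index %d (buffer length %d)\n",
                   i, dbg[i], outLen);
    free(dbg);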