NVIDIA上的CL_OUT_OF_RESOURCES错误

时间:2014-12-23 23:13:53

标签: opencl nvidia

我正在尝试在 NVIDIA GPU 上运行代码。在重复执行 clEnqueueReadBuffer 时出现 CL_OUT_OF_RESOURCES 错误。相同的代码在具有 8 个计算单元的 Intel CPU 上可以完美运行。

在CPU上:

  • local size = 1
  • global size = 8
  • DEVICE_GLOBAL_MEM_SIZE = 12431327232

在GPU上:

  • local size = 32(即 warp 大小)
  • global size = 32 * 8
  • DEVICE_GLOBAL_MEM_SIZE = 1072889856

我知道 CL_OUT_OF_RESOURCES 错误通常是由于访问了已分配缓冲区之外的内存。但我不确定为什么只在 GPU 上出现这个错误。可能是什么原因?我遗漏了什么吗?

编辑:

从 h_ref[add] 读取的结构体数据不符合预期。打印 sizeof 时,主机端返回 8,而设备端返回 4。在 Intel CPU 上运行时,主机端和设备端读到的都是 8。如何确保数据被正确传输?

规格GeForce GTX 560 Ti http://www.pastebin.ca/2892740

内核代码:

/*
 * Computes Euclidean distances between 441-dimensional input vectors and a
 * block of centroids, writing each norm into h_res.
 *
 * NOTE(review): struct Reference / struct Node (declared elsewhere) contain
 * raw host pointers on the host side.  Copying such structs into a device
 * buffer transfers the pointer VALUES, which are meaningless device-side —
 * dereferencing temp.centroids / h_ref[add].input_vec below is then an
 * out-of-bounds access and the likely root cause of CL_OUT_OF_RESOURCES on
 * the GPU.  Flatten the data into plain buffers instead of nested structs.
 */
__kernel void product(__global double* h_inputvec,
                      __global struct Reference* h_ref,
                      __global double* h_res,
                      const int num_ipvec,
                      const int N,
                      const int numb)
{
    const int indicator = get_group_id(0);

    const int add = (int)(indicator * num_ipvec) / N;
    unsigned int ipv_size = h_ref[add].num;

    const int fract = (int)(ipv_size * N / num_ipvec);
    /* NOTE(review): if fract < numb then fractn == 0 and the next line
     * divides by zero — confirm the callers guarantee fract >= numb. */
    int fractn = (int)fract / numb;
    const int div = (int)ipv_size / fractn;

    /* NOTE(review): copies host pointers to private memory — invalid on a
     * discrete GPU (see header comment). */
    struct Node temp = h_ref[add].noderef;

    __local double centroid[441];
    double value;
    int tempo;

    const int th_id = get_local_id(0);

    /* Cooperative load of one 441-double centroid block into local memory.
     * Assumes local size == 32, so 14 strided iterations cover all 441
     * entries (some are written twice, harmlessly, with the same value). */
    for (int i = 0; i < 14; i++) {
        int val = 441 * (indicator % numb);
        centroid[(32 * i + th_id) % 441] = temp.centroids[(32 * i + th_id) % 441 + val];
    }
    /* BUG FIX: every work-item must finish writing `centroid` before any
     * work-item reads all 441 entries below; without this barrier the reads
     * race the writes of other lanes. */
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int i = 0; i <= div; i++) {
        value = 0;
        int inc = (int)(indicator / numb);
        tempo = h_ref[add].input_vec[(i * fractn + inc) % ipv_size];

        for (int j = 0; j < 441; j++) {
            double diff = centroid[j] - h_inputvec[441 * tempo + j];
            value += diff * diff;   /* cheaper and exact vs. pow(diff, 2) */
        }
        /* All 32 work-items of a group write the same value to the same
         * slot — redundant but not a data race (identical data). */
        h_res[N * tempo + (indicator % numb)] = sqrt(value);
    }
}

主机代码:

 // Host-side node record.
 // NOTE(review): a struct of raw pointers cannot meaningfully be copied to an
 // OpenCL device buffer — only the host addresses are transferred, not the
 // arrays they point to.  Also, host pointers are 8 bytes while the GPU
 // compiles them as 4 bytes, so host and device disagree on sizeof (the
 // observed 8-vs-4 symptom).  Flatten these arrays into separate buffers.
 struct Node {
 public:
 cl_uint num;           // BUG FIX: original line was missing its semicolon
 cl_double *dic;
 cl_double *centroids;  // BUG FIX(assumed): kernel reads temp.centroids; original member `c` — confirm against the real source
 cl_double *di;
 cl_double *bs;
 Node *next;            // BUG FIX: original redeclared `bs`, which does not compile
 };

 // Host-side reference record passed to the `product` kernel.
 // BUG FIX: the kernel dereferences h_ref[add].input_vec, .num and .noderef,
 // but the original members were named inec/nu/nref — host and device struct
 // declarations must match member-for-member (names, order, and layout).
 // NOTE(review): the pointer member still cannot survive a host->device copy;
 // see the comment on struct Node.
 struct Reference {
 public:
 cl_uint* input_vec;  // was `inec`
 cl_uint num;         // was `nu`
 Node noderef;        // was `nref`
 };

     N=8
     size_t local[1] = {32};
     size_t global[1] = {32*8};

    size_t sz_ipv = 441*num_ipvec*sizeof(double);
    size_t sz_ref = sizeof(temp_ref);
    size_t  sz_res = N*num_ipvec*sizeof(double); 
    size_t sz_minres = num_ipvec*sizeof(int)*(N/2);

    /* create host array */

    double *h_inputvec = (double*) calloc(441*num_ipvec,sizeof(double));
    Reference *h_ref = (Reference*) calloc(1,sizeof(temp_ref)); // change
    double *h_res = (double*) calloc(N*num_ipvec,sizeof(double)); // N,X data forward
    int *h_minres = (int*) calloc(num_ipvec*(N/2),sizeof(int));

            for(int in =0 ; in<N*num_ipvec ; in++) // initialze result to maximum value (padding) 
    {
            h_res[in] = numeric_limits<int>::max();
            h_minres[in%(N/2*num_ipvec)] = in % int(N/2) ;
    }

    cl_mem c_inputvec = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sz_ipv, h_inputvec, &err);
    cl_mem c_ref = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,sz_ref, h_ref, &err);
    cl_mem c_res = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,sz_res, h_res, &err);
    cl_mem c_minres =  clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,sz_minres, h_minres, &err);


    // set arguments    

    clSetKernelArg(kernel, 0, sizeof(cl_mem), &c_inputvec);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &c_ref);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &c_res);
    clSetKernelArg(kernel, 3, sizeof(const int), &num_ipvec);
    clSetKernelArg(kernel, 4, sizeof(const int), &N);
    clSetKernelArg(kernel, 5, sizeof(const int), &numb);
    printf(" arguments to kernel \n");

    //launch kernel         
    clEnqueueNDRangeKernel(queue, kernel, dim, 0, global, local, 0, (cl_event*)NULL, &event_time); 

0 个答案:

没有答案