我正在尝试在NVIDIA GPU上运行代码。重复执行读缓冲区操作(CL_COMMAND_READ_BUFFER)时出现 CL_OUT_OF_RESOURCES
错误。相同的代码在有8个计算单元的英特尔CPU上运行得完美无缺。
在CPU上:
在CPU上:
DEVICE_GLOBAL_MEM_SIZE
= 12431327232 在GPU上:
DEVICE_GLOBAL_MEM_SIZE
= 1072889856 我知道 CL_OUT_OF_RESOURCES
错误通常是因为访问了已分配缓冲区之外的内存。但我不确定为什么只在GPU上出现这个错误。这可能是什么问题?我是不是漏掉了什么?
编辑:
通过 h_ref[add] 读取的结构体数据不符合预期。打印 sizeof(h_ref) 时,主机端返回 8,设备端返回 4。在Intel CPU上运行时,主机端和设备端都返回 8。如何确保数据被正确传输?
规格GeForce GTX 560 Ti http://www.pastebin.ca/2892740
内核代码:
/*
 * For each work-group: stages one 441-element centroid block into local
 * memory, then accumulates the Euclidean norm between that centroid and a
 * strided subset of the 441-wide input vectors, writing one norm per
 * selected vector into h_res.
 *
 * NOTE(review): depends on 'struct Reference' / 'struct Node' declared
 * elsewhere in the kernel program source; their layout must match the
 * host-side definitions byte-for-byte. Structs containing raw pointers do
 * NOT transfer between host and device (pointer size and address space
 * differ) -- see the host-side buffer-creation notes.
 */
__kernel void product(__global double* h_inputvec, __global struct Reference* h_ref, __global double* h_res, const int num_ipvec, const int N, const int numb)
{
    const int indicator = get_group_id(0);
    /* index of the Reference entry this work-group operates on */
    const int add = (int)(indicator * num_ipvec) / N;
    unsigned int ipv_size = h_ref[add].num;
    const int fract = (int)(ipv_size * N / num_ipvec);
    /* NOTE(review): fractn is 0 whenever fract < numb, making the next
       division undefined behavior -- confirm the expected value ranges. */
    int fractn = (int)fract / numb;
    const int div = (int)ipv_size / fractn;
    struct Node temp = h_ref[add].noderef;

    __local double centroid[441];
    const int th_id = get_local_id(0);

    /* Cooperatively load this group's 441 centroid components into local
       memory: 32 work-items x 14 iterations >= 441 slots.
       NOTE(review): there is no barrier(CLK_LOCAL_MEM_FENCE) between this
       write loop and the reads below -- on hardware that does not execute
       the whole work-group in lockstep this is a data race; confirm. */
    for (int i = 0; i < 14; i++)
    {
        int val = 441 * (indicator % numb);
        centroid[(32 * i + th_id) % 441] = temp.centroids[(32 * i + th_id) % 441 + val];
    }

    for (int i = 0; i <= div; i++)
    {
        double value = 0;
        int inc = (int)(indicator / numb);
        /* index of the input vector to compare against this centroid */
        int tempo = h_ref[add].input_vec[(i * fractn + inc) % ipv_size];
        for (int j = 0; j < 441; j++)
        {
            double diff = centroid[j] - h_inputvec[441 * tempo + j];
            value += diff * diff;  /* was pow(diff,2): cheaper, same result */
        }
        h_res[N * tempo + (indicator % numb)] = sqrt(value);
    }
}
主机代码:
/*
 * Host-side mirror of the device 'Node' struct.
 *
 * Fixed vs the posted code: the ';' after 'num' was missing, and two
 * members were both named 'bs' (a cl_double* and a Node*) -- neither
 * compiles. The self-referential pointer has been renamed 'next'.
 *
 * NOTE(review): a struct full of raw host pointers cannot be shipped to
 * the device via clCreateBuffer + CL_MEM_COPY_HOST_PTR: only the pointer
 * values are copied, the GPU cannot dereference host addresses, and the
 * struct layout differs between a 64-bit host and the device (8- vs
 * 4-byte pointers -- matching the reported sizeof 8 vs 4). Flatten each
 * pointed-to array into its own cl_mem buffer instead.
 */
struct Node {
public:
    cl_uint num;
    cl_double *dic;
    cl_double *c;
    cl_double *di;
    cl_double *bs;
    Node *next;   /* renamed from the duplicate 'bs' -- confirm intent */
};
/*
 * Host-side mirror of the device 'Reference' struct.
 *
 * NOTE(review): the member names here (inec / nu / nref) do not match
 * what the kernel reads (input_vec / num / noderef) -- likely
 * transcription damage in the question; confirm against the real source.
 *
 * NOTE(review): 'inec' is a raw host pointer, so copying this struct into
 * a device buffer transfers only the pointer value, not the array behind
 * it -- the device then reads out of bounds (see buffer-creation notes).
 */
struct Reference{
public:
cl_uint* inec;
cl_uint nu;
Node nref;
};
// NOTE(review): fragmentary host-side setup (C++ using the OpenCL C API)
// as posted in the question; 'N=8' is shorthand, not a real declaration,
// and temp_ref / dim / context / queue / kernel are defined elsewhere.
N=8
size_t local[1] = {32};   // work-group size: 32 work-items
size_t global[1] = {32*8};   // global size: 8 work-groups of 32
size_t sz_ipv = 441*num_ipvec*sizeof(double);
// NOTE(review): core bug. sizeof(temp_ref) covers only the top-level
// struct -- the arrays behind its pointer members are NOT included, so
// the device receives raw host addresses it cannot dereference. The
// struct layout also differs between the 64-bit host and the device
// (8-byte vs 4-byte pointers, matching the observed sizeof 8 vs 4).
// The resulting out-of-bounds access is what NVIDIA surfaces as
// CL_OUT_OF_RESOURCES on the next blocking call (the buffer read). The
// Intel CPU device shares the host address space, which is why the same
// code appears to work there. Fix: flatten every pointed-to array into
// its own cl_mem buffer and pass indices, not pointers.
size_t sz_ref = sizeof(temp_ref);
size_t sz_res = N*num_ipvec*sizeof(double);
size_t sz_minres = num_ipvec*sizeof(int)*(N/2);
/* create host arrays */
double *h_inputvec = (double*) calloc(441*num_ipvec,sizeof(double));
Reference *h_ref = (Reference*) calloc(1,sizeof(temp_ref)); // change
double *h_res = (double*) calloc(N*num_ipvec,sizeof(double)); // N,X data forward
int *h_minres = (int*) calloc(num_ipvec*(N/2),sizeof(int));
// NOTE(review): h_res holds doubles but is padded with the *int* maximum;
// presumably intentional sentinel -- confirm.
for(int in =0 ; in<N*num_ipvec ; in++) // initialize results to maximum value (padding)
{
h_res[in] = numeric_limits<int>::max();
h_minres[in%(N/2*num_ipvec)] = in % int(N/2) ;
}
// NOTE(review): 'err' is never checked after any clCreateBuffer call, so
// an allocation failure here would go unnoticed until the kernel runs.
cl_mem c_inputvec = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sz_ipv, h_inputvec, &err);
cl_mem c_ref = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,sz_ref, h_ref, &err);
cl_mem c_res = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,sz_res, h_res, &err);
// NOTE(review): c_minres is created but never bound to a kernel argument
// below -- presumably used by another kernel; verify.
cl_mem c_minres = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,sz_minres, h_minres, &err);
// set kernel arguments (return codes ignored -- should be checked)
clSetKernelArg(kernel, 0, sizeof(cl_mem), &c_inputvec);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &c_ref);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &c_res);
clSetKernelArg(kernel, 3, sizeof(const int), &num_ipvec);
clSetKernelArg(kernel, 4, sizeof(const int), &N);
clSetKernelArg(kernel, 5, sizeof(const int), &numb);
printf(" arguments to kernel \n");
// launch kernel (return code ignored; 'dim' is presumably 1 -- confirm)
clEnqueueNDRangeKernel(queue, kernel, dim, 0, global, local, 0, (cl_event*)NULL, &event_time);