具体来说,是以异步方式将数据从主机移至设备上(我认为最好是 local memory),以获得最佳性能优势。
cl_mem clCreateBuffer( cl_context ctx,
cl_mem flags,
syse_t size,
void* host_ptr,
cl_int errorcode_ret)
/*
 * Blocking copy of `size` bytes from host memory into the buffer object
 * `output`. The destination is a buffer object; it cannot be placed in a
 * kernel's __local memory from the host side.
 */
err = clEnqueueWriteBuffer(
    command_queue, // command queue the write is enqueued on
    output,        // destination buffer object
    CL_TRUE,       // blocking_write: return only once host_ptr has been copied
    0,             // byte offset into the buffer at which writing starts
    size,          // number of bytes to transfer
    host_ptr,      // host memory holding the data to write
    0,             // num_events_in_wait_list: no events to wait on
    NULL,          // event_wait_list: events that must complete before this write starts
    NULL);         // event: would receive an event identifying this write; not requested
在 OpenCL Best Practices Guide 中有如下示例:
cl_context cxGPUContext; // computational context for the current arena
cl_mem cmPinnedBufIn = NULL; // memory buffer on the host-side
cl_mem cmDevBufIn = NULL; // memory buffer on the device-side
unsigned char* cDataIn = NULL; // holder for the data buffer
cmPinnedBufIn = clCreateBuffer(cxGPUContext, // computational context for the current arena
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, // zero-copy I guess??
memSize, // size of the stack of memory to hold the data transfers
NULL, // initializing the host pointer to NULL
NULL); // error code
cmDevBufIn = clCreateBuffer(cxGPUContext, // computational context for the current arena
CL_MEM_READ_ONLY, // read-only mode in the device side
memSize, // size of the stack of memory to hold the transaction, it is in the Global memory I guess?
NULL, // initializing the device pointer to NULL
NULL); // error code
cDataIn = (double8 *)clEnqueueMapBuffer(cqCommandQue, //command queue to manage the data transfer
cmPinnedBufIn, // pinned buffer instantiated avobe
CL_TRUE, // is this a blocking map?
CL_MAP_WRITE, // what we are doing?
0, // offset in the data
memSize, // size of the transfer
0, // number of events in the waiting list, howt to capitalize on this?
NULL, // waiting list
NULL, // event
NULL); // error code
cDataIn = get_data(x);
err = clEnqueueWriteBuffer(
cqCommandQue, // command queue managing the transaction
cmDevBufIn, // buffer object to write to, could it be in local memory?
CL_FALSE, // indicating that this is not a blocking transfer
0, // offset in the output to start writing the data
sizeof(double8), // size of the data transfer
cDataIn, // pointer to the buffer in host memory holding the data
0, // number of event, what could I do with this?
NULL, // number of events that predate the current one? the previous argument, I guess?
NULL, // event object to return after successful completion
6)在GPU设备上运行计算内核。在这一步中,我的应用程序偏离了较为标准的计算模式:完成计算所需传输的数据块取决于此处 x 的取值。
/*
 * For the work-item with global id `index`, computes the sum of squared
 * differences between the five consecutive values starting at
 * pixelInfo[5 * index] and clusterCentres[0..4], then stores an interpolated
 * value derived from that sum in distanceFromClusterCentre[index].
 *
 * NOTE(review): get_data() and interp() are placeholders that are not defined
 * in this fragment. A running kernel cannot ask the host for data, so
 * whatever get_data(x) needs presumably has to be in device memory before the
 * kernel is launched — confirm the intended design.
 */
__kernel void distance_kernel(__global double *pixelInfo,
                              __global double *clusterCentres,
                              __global double *distanceFromClusterCentre)
{
    int index = get_global_id(0);

    // Keep the differences in double: storing them in int would truncate
    // every term before it is squared.
    double dl = pixelInfo[5 * index] - clusterCentres[0];
    dl = dl * dl;
    double da = pixelInfo[5 * index + 1] - clusterCentres[1];
    da = da * da;
    double db = pixelInfo[5 * index + 2] - clusterCentres[2];
    db = db * db;
    double dx = pixelInfo[5 * index + 3] - clusterCentres[3];
    dx = dx * dx;
    double dy = pixelInfo[5 * index + 4] - clusterCentres[4];
    dy = dy * dy;

    double x = dx + dy + dl + da + db;

    // Open question from the author: how to obtain the host data that
    // corresponds to x. get_data(x) stands in for that lookup.
    double8 point = get_data(x);

    // Use point to compute y by interpolation.
    double y = interp(x, point);
    distanceFromClusterCentre[index] = y;
}
转移。正如 Best Practices Guide 所述:
通过非阻塞的读或写传输,控制权会立即返回主机线程,从而允许主机线程中的操作在设备继续运行的同时进行。
全局内存。该存储区允许所有工作组中的所有工作项对其进行读/写访问。工作项可以读取或写入内存对象的任何元素。对全局内存的读写可能会被缓存,具体取决于设备的功能。
但是我还不知道如何从内核触发数据传输,也就是如何实现 get_data(double x)。