I am trying to implement the following loop in an OpenCL kernel:
for(i=0;i<N;i++)
    for(j=0;j<M;j++)
        weights[i*M+j] += gradients[i] * inputs[j];
Here is my kernel. I am currently hardcoding M to 4, and it only works for the first 4 elements:
__kernel
void cwk3( __global float *gradients, __global float *inputs, __global float *weights )
{
    // The global id tells us the index of the vector for this thread.
    int gid1 = get_global_id(0);
    int gid2 = get_global_id(1);

    // Perform the addition.
    weights[(gid1 * 4) + gid2] += gradients[gid1] * inputs[gid2];
}
The relevant C++ code is:
float
    *gradients = (float*) malloc( N*sizeof(float) ),
    *inputs    = (float*) malloc( M*sizeof(float) ),
    *weights   = (float*) malloc( N*M*sizeof(float) );

initialiseArrays( gradients, inputs, weights, N, M );

cl_mem deviceGradients = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                         N*sizeof(float), gradients, &status );
cl_mem deviceInputs    = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                         M*sizeof(float), inputs, &status );
cl_mem deviceWeights   = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                         N*M*sizeof(float), weights, &status );
cl_kernel kernel = compileKernelFromFile( "kernel.cl", "cwk3", context, device );
status = clSetKernelArg( kernel, 0, sizeof(deviceGradients), &deviceGradients );
status = clSetKernelArg( kernel, 1, sizeof(deviceInputs), &deviceInputs );
status = clSetKernelArg( kernel, 2, sizeof(deviceWeights), &deviceWeights );
size_t indexSpaceSize[2], workGroupSize[1];
indexSpaceSize[0] = N;
indexSpaceSize[1] = M;
workGroupSize[0] = 4;

status = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, indexSpaceSize, workGroupSize, 0, NULL, NULL );
if( status != CL_SUCCESS )
{
    printf( "Failure enqueuing kernel: Error %d.\n", status );
    return EXIT_FAILURE;
}
status = clEnqueueReadBuffer( queue, deviceWeights, CL_TRUE, 0, N*M*sizeof(float), weights, 0, NULL, NULL );
if( status != CL_SUCCESS )
{
    printf( "Could not copy device data to host: Error %d.\n", status );
    return EXIT_FAILURE;
}
This just creates the buffers and copies them to the GPU, launches the kernel, and then reads the result back from the GPU to the CPU. N and M are read as command-line arguments; I currently have both set to 4 for testing.
Answer (score: 4):
You seem to be confused about global and local work groups.

The global work size specifies the total number of kernel invocations (work-items) to execute. Launching with

global_work_size=[M,N]

will invoke the kernel M×N times in total. Each work-item can find its position in the index space via get_global_id. Conceptually, OpenCL implements this like so:
for(i=0;i<N;i++)
    for(j=0;j<M;j++)
        call_kernel(set global_id=[j,i])
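If you want to see that mapping concretely, here is a minimal sketch (not part of your code; it assumes a device that supports printf inside kernels, which is standard since OpenCL 1.2) that just reports each work-item's coordinates:

__kernel void show_ids(void)
{
    // Each work-item prints its own coordinates within the 2D index space.
    printf( "work-item (%d, %d) of (%d, %d)\n",
            (int)get_global_id(0), (int)get_global_id(1),
            (int)get_global_size(0), (int)get_global_size(1) );
}

Enqueue it with the same 2D global size and you get one line per work-item.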
Local work groups describe how the launched work-items (created according to the global size) are grouped together, made aware of one another, and given shared local memory. You don't need any of those features here, so just ignore them and pass NULL for the local size (there is a sketch of an explicit local size at the end of this answer, if you are curious).

So, to implement this for loop in OpenCL:
for(i=0;i<N;i++)
    for(j=0;j<M;j++)
        weights[i*M+j] += gradients[i] * inputs[j];
you would use this kernel:
__kernel
void cwk3( __global float *gradients, __global float *inputs, __global float *weights )
{
    int j = get_global_id(0);    // dimension 0 has size M, so this is the inner index j
    int i = get_global_id(1);    // dimension 1 has size N, so this is the outer index i
    int M = get_global_size(0);  // row length of the weights matrix

    weights[(i * M) + j] += gradients[i] * inputs[j];
}
and launch it like this:
size_t global_work[2];
global_work[0] = M;  // dimension 0 runs over the inner loop index j
global_work[1] = N;  // dimension 1 runs over the outer loop index i

// This is a 2D kernel, not 1D: work_dim is 2.
// The global offset is NULL (i.e. 0), the total work size is M×N,
// and the local work size is NULL so the runtime chooses it.
status = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, global_work, NULL, 0, NULL, NULL );
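For completeness, the promised sketch of an explicit local size (the values are only illustrative; on OpenCL 1.x each global size must be evenly divisible by the corresponding local size, or the enqueue fails with CL_INVALID_WORK_GROUP_SIZE):

size_t local_work[2];
local_work[0] = 4;   // illustrative; must evenly divide global_work[0] (M)
local_work[1] = 4;   // illustrative; must evenly divide global_work[1] (N)

status = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, global_work, local_work, 0, NULL, NULL );

As in your original code, check status against CL_SUCCESS afterwards; your existing clEnqueueReadBuffer call will then copy the updated weights back to the host.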