在OpenCL中,如果我想添加两个N维向量,则全局工作组大小(globalSize
)应该满足globalSize = ceil(N/localSize) * localSize
,其中localSize
是本地工作组大小。它是否正确?如果N = 1000
和localSize = 128
,globalSize
应为1024
?我们是否可以将globalSize
设置为localSize
的某个倍数并且大于所需的倍数?
我尝试了很多次,它对一维问题很有效。
但是,当涉及到2d问题时,例如,将两个矩阵m*n
和n*p
相乘,结果矩阵的顺序为m*p
,事情变得更复杂。< / p>
我的设备上的最大工作组大小为128
,因此我设置localSize [2] = {16,8}
和。{
globalSize [2] = {ceil(m/16)*16,ceil(p/8)*8}
。
它类似于1维案例,但结果是错误的!
如果我设置localSize [2] = {1,128}
并相应更改globalSize
,我可以得到正确的结果。那问题出在哪里?谁能告诉我为什么?
另外,我找出矩阵元素错误的索引。
似乎(i,j)
i*p + j = n * some constant (n = 1,2,3...)
为什么?
这是我的核心功能:
kernel void mmult(const int Mdim, const int Ndim, const int Pdim,
global float *A, global float *B, global float *C)
{
int i = get_global_id(1);
int j = get_global_id(0);
if(i < 0 || j < 0 || i > Mdim || j > Pdim) return;
else
{
float tmp = 0;
for(int k = 0; k < Ndim; k++)
tmp += A[i*Ndim+k] * B[k*Pdim+j];
C[i*Pdim + j] = tmp;
}
}
然后是主持人计划:
#define __NO_STD_VECTOR // Use cl::vector instead of STL version
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <cmath>
using namespace cl;
int main()
{
// Create the two input matrices
int m = 1000;
int n = 1000;
int p = 1000;
float *A = new float[m*n];
float *B = new float[n*p];
for(int i = 0; i < m*n; i++)
{
A[i] = i;
}
for(int i = 0; i < n*p; i++)
{
B[i] = i;
}
try
{
// Get available platforms
vector<Platform> platforms;
Platform::get(&platforms);
// Select the default platform and create a context using this platform and the GPU
cl_context_properties cps[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)(platforms[0])(),
0
};
Context context( CL_DEVICE_TYPE_GPU, cps);
// Get a list of devices on this platform
vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
// Create a command queue and use the first device
CommandQueue queue = CommandQueue(context, devices[0]);
// Read source file
std::ifstream sourceFile("mmul.cl");
std::string sourceCode(
std::istreambuf_iterator<char>(sourceFile),
(std::istreambuf_iterator<char>()));
Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
// Make program of the source code in the context
Program program = Program(context, source);
// Build program for these specific devices
program.build(devices);
// Make kernel
Kernel kernel(program, "mmult");
// Create memory buffers
Buffer bufferA = Buffer(context, CL_MEM_READ_ONLY, m*n * sizeof(float));
Buffer bufferB = Buffer(context, CL_MEM_READ_ONLY, p*n * sizeof(float));
Buffer bufferC = Buffer(context, CL_MEM_WRITE_ONLY, m*p * sizeof(float));
// Copy lists A and B to the memory buffers
queue.enqueueWriteBuffer(bufferA, CL_TRUE, 0, m * n * sizeof(float), A);
queue.enqueueWriteBuffer(bufferB, CL_TRUE, 0, p * n * sizeof(float), B);
// Set arguments to kernel
kernel.setArg(0, m);
kernel.setArg(1, n);
kernel.setArg(2, p);
kernel.setArg(3, bufferA);
kernel.setArg(4, bufferB);
kernel.setArg(5, bufferC);
// Run the kernel on specific ND range
NDRange global((ceil((float)(p)/16))*16,(ceil((float)(m)/8))*8);
NDRange local(16,8);
queue.enqueueNDRangeKernel(kernel, NullRange, global, local);
// Read buffer C into a local list
float *C = new float[m*p];
queue.enqueueReadBuffer(bufferC, CL_TRUE, 0, m*p * sizeof(float), C);
// check the correctness of the result
float *c = new float[m*p];
for(int i = 0; i < m; i++)
for(int j = 0; j < p; j++)
{
float z = 0.0;
for(int k = 0; k < n; k++)
{
z += A[i*n+k] * B[k*p+j];
}
c[i*p+j] = z;
}
for(int i = 0; i < m*p; i++)
{
if(fabs(c[i]-C[i])>0.001)
std::cout<<i<<" "<<c[i]<<" "<<C[i]<<std::endl;
}
delete []A;
delete []B;
delete []C;
}
catch(Error error)
{
std::cout << error.what() << "(" << error.err() << ")" << std::endl;
}
return 0;
}
答案 0 :(得分:2)
您的OpenCL内核中的边界检查代码不正确。而不是:
if(i < 0 || j < 0 || i > Mdim || j > Pdim) return;
你应该有这个:
if(i < 0 || j < 0 || i >= Mdim || j >= Pdim) return;
答案 1 :(得分:0)
假设你有一个大小为1000x1000的浮点矩阵:
const int size = 1000;
// Whatever
float* myMatrix = (float*)calloc(size * size, sizeof(*myMatrix));
首先确定本地组的大小:
size_t localSize[] = {16, 8};
然后确定您需要多少本地组:
size_t numLocalGroups[] = {ceil(size/localSize[0]), ceil(size/localSize[1])};
最后,确定NDRange大小:
size_t globalSize[] = {localSize[0] * numLocalGroups[0], localSize[1] * numLocalGroups[1]};
不要忘记在最右边的本地群组中处理越界访问。