我编写了一个用于在GPU上计算矩阵产品的程序。
我的问题是,对于大型矩阵,Catalyst驱动程序会崩溃。我知道长时间计算的超时检测导致potential problem,但我的计算速度相当快,所以我不认为这是一个问题。
本质上,我有三种不同的代码迭代。第一个是矩阵乘法的简单实现,它使用OpenCL中的内置功能来确定工作组大小(WGS)。这个版本工作正常,但由于带宽模仿(我猜),效率很低!
第二个版本手动指定WGS。在这种情况下,工作组的尺寸为细长条{256,1,1}(为了能够乘以具有由素数指定的尺寸的矩阵)。 WGS是 clGetKernelWorkGroupInfo()返回的 CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 的复数形式。当使用大于4000 x 4000的矩阵时,此版本崩溃。它比第一个版本显着慢!
第三个版本与第二个版本类似,只是它使用本地内存。对于大于2000 x 2000的矩阵,此版本崩溃。它实际上是矩阵大小最快的。
我在Windows 8.1上使用带有gcc的MinGW64(如果需要可以检查版本,不记得)。我使用带有CCC 14.9驱动程序的AMD R9 290。
有用的链接
Khronos OpenCL 1.2 Reference pages
My code is roughly based on the code and writings on this blog
内核(本地内存版本):
__kernel void matMult(__global float* A,
__global float* B,
__global float* C,
int m, int p)
{
int a, b, k, group_idx, group_idz, tx, ty;
const int wgsize = get_local_size(0);
float value;
group_idx = get_group_id(0);
group_idz = get_group_id(2);
tx = get_local_id(0);
ty = get_local_id(2);
if(tx >= p) {
//printf("Thread %d exiting ...\n", tx);
return;
}
// Index of the first sub-matrix of A processed
// group_idz the block
int aBegin = m * wgsize * group_idz;
// Index of the last sub-matrix of A processed
// group_idz the block
int aEnd = aBegin + m - 1;
// Step size used to iterate through the
// sub-matrices of A
int aStep = wgsize;
// Index of the first sub-matrix of B processed
// group_idz the block
int bBegin = wgsize * group_idx;
// Step size used to iterate through the
// sub-matrices of B
int bStep = wgsize * p;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (a = aBegin, b = bBegin;
a <= aEnd;
a += aStep, b += bStep)
{
// Declaration of the local memory array As
// used to store the sub-matrix of A
__local float As[256][1];
// Declaration of the local memory array Bs
// used to store the sub-matrix of B
__local float Bs[256][1];
// Load the matrices from global memory
// to local memory; each thread loads
// one element of each matrix
As[ty][tx] = A[a + m * ty + tx];
Bs[ty][tx] = B[b + p * ty + tx];
// Synchronize to make sure the matrices
// are loaded
barrier(CLK_LOCAL_MEM_FENCE);
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
for (k = 0; k < wgsize; k++)
value += As[ty][k] * Bs[k][tx];
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
barrier(CLK_LOCAL_MEM_FENCE);
}
//printf("value: %f\n", value);
int c = p * wgsize * group_idz + wgsize * group_idx;
C[c + p * ty + tx] = value;
}
主机代码:
int main(int argc, const char * argv[])
{
...
//Allocate memory for and generate test data on host for matrix multiplication
float* hostA = allocMatrix(n, m);
float* hostB = allocMatrix(m, p);
//Allocate results array on host
float* hostC = (float *)malloc(sizeof(float) * p * n);
//Setup the objects OpenCL needs in order to function
if(SetupCL(&context, properties, &kernel, &command_queue, &program, &platform_id, &device_id, suppressoutp, usecpu)) {
printf("Failed to setup OpenCL\n");
return -1;
}
//10. Allocate memory on device
cl_mem devA = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(cl_float) * m * n, NULL, &err);
cl_mem devB = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(cl_float) * p * m, NULL, &err);
cl_mem devC = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_float) * p * n, NULL, &err);
//Load data into the input buffer
clEnqueueWriteBuffer(command_queue, devA, CL_TRUE, 0,
sizeof(float) * m * n, hostA, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, devB, CL_TRUE, 0,
sizeof(float) * m * p, hostB, 0, NULL, NULL);
//11. Set the argument list for the kernel command
int wa = m;
int wb = p;
clSetKernelArg(kernel, 0, sizeof(cl_mem), &devA);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &devB);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &devC);
clSetKernelArg(kernel, 3, sizeof(int), &wa);
clSetKernelArg(kernel, 4, sizeof(int), &wb);
//Fetch information about compute device
unsigned int pref_workg_size_mult;
const unsigned int max_workg_size;
const unsigned int max_workit_sizes[3];
clGetKernelWorkGroupInfo(kernel, device_id,
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
sizeof(size_t), (void*) &pref_workg_size_mult, NULL);
clGetDeviceInfo(device_id,
CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(size_t), (void*) &max_workg_size, NULL);
clGetDeviceInfo(device_id,
CL_DEVICE_MAX_WORK_ITEM_SIZES,
sizeof(size_t) * 3, (void*) max_workit_sizes, NULL);
//Determine work group size
int k = 1, s = 1;
if (pref_workg_size_mult == 0)
pref_workg_size_mult = 1;
while(k * pref_workg_size_mult < n && k * pref_workg_size_mult < max_workg_size)
k++;
while(k *s * pref_workg_size_mult < n)
s++;
const size_t work_group_size[3] = {k * pref_workg_size_mult, 1, 1};
const size_t global_work_size[3] = {k * s * pref_workg_size_mult, 1, p};
//12. Enqueue the kernel command for execution
cl_event event0;
cl_int enqueue_error = clEnqueueNDRangeKernel(command_queue, kernel, 3, NULL, global_work_size,
work_group_size, 0, NULL, &event0);
if (enqueue_error != CL_SUCCESS)
{
printf("Kernel launch failed, error %d\n", enqueue_error);
return enqueue_error;
}
clWaitForEvents(1, &event0);
//Call profiling function to obtain starting and ending times of kernel execution
clGetEventProfilingInfo(event0, CL_PROFILING_COMMAND_START,
sizeof(cl_ulong), &start, NULL);
clGetEventProfilingInfo(event0, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &end, NULL);
duration = end - start;
//13. Copy the results from out of the output buffer
clEnqueueReadBuffer(command_queue, devC, CL_TRUE, 0,
sizeof(float) * p * n, hostC, 0, NULL, NULL);
//14. Cleanup - release OpenCL resources
clReleaseMemObject(devA);
clReleaseMemObject(devB);
clReleaseMemObject(devC);
clReleaseEvent(event0);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
//Release host memory
free(hostA);
free(hostB);
free(hostC);
return 0;
}