Question

我编写了一个用于在GPU上计算矩阵产品的程序。

我的问题是，对于大型矩阵，Catalyst驱动程序会崩溃。我知道长时间计算的超时检测导致potential problem，但我的计算速度相当快，所以我不认为这是一个问题。

本质上，我有三种不同的代码迭代。第一个是矩阵乘法的简单实现，它使用OpenCL中的内置功能来确定工作组大小（WGS）。这个版本工作正常，但由于带宽模仿（我猜），效率很低！

第二个版本手动指定WGS。在这种情况下，工作组的尺寸为细长条{256,1,1}（为了能够乘以具有由素数指定的尺寸的矩阵）。 WGS是 clGetKernelWorkGroupInfo（）返回的 CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 的复数形式。当使用大于4000 x 4000的矩阵时，此版本崩溃。它比第一个版本显着慢！

第三个版本与第二个版本类似，只是它使用本地内存。对于大于2000 x 2000的矩阵，此版本崩溃。它实际上是矩阵大小最快的。

我在Windows 8.1上使用带有gcc的MinGW64（如果需要可以检查版本，不记得）。我使用带有CCC 14.9驱动程序的AMD R9 290。

有用的链接

Khronos OpenCL 1.2 Reference pages

My code is roughly based on the code and writings on this blog

内核（本地内存版本）：

    __kernel void matMult(__global float* A,
                    __global float* B,
                    __global float* C,
                    int m, int p)
{
    int a, b, k, group_idx, group_idz, tx, ty;
    const int wgsize = get_local_size(0);
    float value;

    group_idx = get_group_id(0);
    group_idz = get_group_id(2);
    tx = get_local_id(0);
    ty = get_local_id(2);

    if(tx >= p)  {
        //printf("Thread %d exiting ...\n", tx);
        return;
    }

    // Index of the first sub-matrix of A processed 
    // group_idz the block
    int aBegin = m * wgsize * group_idz;

    // Index of the last sub-matrix of A processed 
    // group_idz the block
    int aEnd   = aBegin + m - 1;

    // Step size used to iterate through the 
    // sub-matrices of A
    int aStep  = wgsize;

    // Index of the first sub-matrix of B processed 
    // group_idz the block
    int bBegin = wgsize * group_idx;

    // Step size used to iterate through the 
    // sub-matrices of B
    int bStep  = wgsize * p;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) 
    {

        // Declaration of the local memory array As 
        // used to store the sub-matrix of A
        __local float As[256][1];

        // Declaration of the local memory array Bs 
        // used to store the sub-matrix of B
        __local float Bs[256][1];

        // Load the matrices from global memory
        // to local memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + m * ty + tx];
        Bs[ty][tx] = B[b + p * ty + tx];

        // Synchronize to make sure the matrices 
        // are loaded
        barrier(CLK_LOCAL_MEM_FENCE);

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (k = 0; k < wgsize; k++)
            value += As[ty][k] * Bs[k][tx];

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    //printf("value: %f\n", value);
    int c = p * wgsize * group_idz + wgsize * group_idx;
    C[c + p * ty + tx] = value;
}

主机代码：

int main(int argc, const char * argv[])
{
    ...

    //Allocate memory for and generate test data on host for matrix multiplication
    float* hostA = allocMatrix(n, m);
    float* hostB = allocMatrix(m, p);

    //Allocate results array on host
    float* hostC = (float *)malloc(sizeof(float) * p * n);

    //Setup the objects OpenCL needs in order to function
    if(SetupCL(&context, properties, &kernel, &command_queue, &program, &platform_id, &device_id, suppressoutp, usecpu))  {
        printf("Failed to setup OpenCL\n");
        return -1;
    }


    //10. Allocate memory on device
    cl_mem devA  = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                 sizeof(cl_float) * m * n, NULL, &err);

    cl_mem devB  = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                 sizeof(cl_float) * p * m, NULL, &err);

    cl_mem devC  = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                 sizeof(cl_float) * p * n, NULL, &err);

    //Load data into the input buffer
    clEnqueueWriteBuffer(command_queue, devA, CL_TRUE, 0,
                         sizeof(float) * m * n, hostA, 0, NULL, NULL);
    clEnqueueWriteBuffer(command_queue, devB, CL_TRUE, 0,
                         sizeof(float) * m * p, hostB, 0, NULL, NULL);


    //11. Set the argument list for the kernel command
    int wa = m;
    int wb = p;
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &devA);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &devB);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &devC);
    clSetKernelArg(kernel, 3, sizeof(int), &wa);
    clSetKernelArg(kernel, 4, sizeof(int), &wb);


    //Fetch information about compute device
    unsigned int pref_workg_size_mult;
    const unsigned int max_workg_size;
    const unsigned int max_workit_sizes[3];

    clGetKernelWorkGroupInfo(kernel, device_id,
        CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
        sizeof(size_t), (void*) &pref_workg_size_mult, NULL);

    clGetDeviceInfo(device_id, 
        CL_DEVICE_MAX_WORK_GROUP_SIZE,
        sizeof(size_t), (void*) &max_workg_size, NULL);

    clGetDeviceInfo(device_id, 
        CL_DEVICE_MAX_WORK_ITEM_SIZES,
        sizeof(size_t) * 3, (void*) max_workit_sizes, NULL);


    //Determine work group size
    int k = 1, s = 1;
    if (pref_workg_size_mult == 0)
        pref_workg_size_mult = 1;

    while(k * pref_workg_size_mult < n && k * pref_workg_size_mult < max_workg_size)
        k++;
    while(k *s * pref_workg_size_mult < n)
        s++;

    const size_t work_group_size[3] = {k * pref_workg_size_mult, 1, 1};
    const size_t global_work_size[3] = {k * s * pref_workg_size_mult, 1, p};


    //12. Enqueue the kernel command for execution
    cl_event event0;

    cl_int enqueue_error = clEnqueueNDRangeKernel(command_queue, kernel, 3, NULL, global_work_size,
                           work_group_size, 0, NULL, &event0);
    if (enqueue_error != CL_SUCCESS)
    {
        printf("Kernel launch failed, error %d\n", enqueue_error);
        return enqueue_error;
    }
    clWaitForEvents(1, &event0);


    //Call profiling function to obtain starting and ending times of kernel execution
    clGetEventProfilingInfo(event0, CL_PROFILING_COMMAND_START,
                                sizeof(cl_ulong), &start, NULL);
    clGetEventProfilingInfo(event0, CL_PROFILING_COMMAND_END,
                                sizeof(cl_ulong), &end, NULL);
    duration = end - start;



    //13. Copy the results from out of the output buffer
    clEnqueueReadBuffer(command_queue, devC, CL_TRUE, 0,
                                sizeof(float) * p * n, hostC, 0, NULL, NULL);


    //14. Cleanup - release OpenCL resources
    clReleaseMemObject(devA);
    clReleaseMemObject(devB);
    clReleaseMemObject(devC);
    clReleaseEvent(event0);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);


    //Release host memory
    free(hostA);
    free(hostB);
    free(hostC);

    return 0;
}

OpenCl程序崩溃视频卡驱动程序

0 个答案: