OpenCL implementation of an algorithm is slower than a plain loop

Time: 2014-09-27 14:09:20

Tags: c++ macos opencl gpu amd-processor

I'm new to parallel computing and OpenCL. I'm following the OpenCL Programming Guide, specifically the section on the convolution implementation.

My main.cpp:

#include <iostream>
#include <sstream>
#include <fstream>
#include <string>
#include <ctime>   // for clock()
#include <OpenCL/OpenCL.h>

using namespace std;

const unsigned int inputSignalWidth = 8;
const unsigned int inputSignalHeight = 8;

cl_uint inputSignal[inputSignalWidth][inputSignalHeight] =
{
    {3, 1, 1, 4, 8, 2, 1, 3},
    {4, 2, 1, 1, 2, 1, 2, 3},
    {4, 4, 4, 4, 3, 2, 2, 2},
    {9, 8, 3, 8, 9, 0, 0, 0},
    {9, 3, 3, 9, 0, 0, 0, 0},
    {0, 9, 0, 8, 0, 0, 0, 0},
    {3, 0, 8, 8, 9, 4, 4, 4},
    {5, 9, 8 ,1 ,8, 1, 1, 1}
};

const unsigned int outputSignalWidth = 6;
const unsigned int outputSignalHeight = 6;

cl_uint outputSignal[outputSignalWidth][outputSignalHeight];

const unsigned int maskWidth = 3;
const unsigned int maskHeight = 3;

cl_uint mask[maskWidth][maskHeight] =
{
    {1, 1, 1}, {1, 0, 1}, {1, 1, 1}
};

inline void checkErr(cl_int err, const char* name)
{
    if (err != CL_SUCCESS)
    {
        cerr << "Error: " << name << endl;
        exit(EXIT_FAILURE);
    }
}

void CL_CALLBACK contextCallback(const char * errInfo,
                                 const void * private_info,
                                 size_t cb,
                                 void * user_data)
{
    cout << "Error occurred during contxt use: " << errInfo << endl;
    exit(EXIT_FAILURE);
}

int main(int argc, const char * argv[])
{
    cl_int errNum;
    cl_uint numPlatforms;
    cl_uint numDevices;
    cl_platform_id * platformIDs;
    cl_device_id * deviceIDs;
    cl_context context = NULL;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;
    cl_mem inputSignalBuffer;
    cl_mem outputSignalBuffer;
    cl_mem maskBuffer;

    errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
    checkErr((errNum != CL_SUCCESS)? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");

    platformIDs = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numPlatforms);
    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
    checkErr((errNum != CL_SUCCESS)? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatFormIDs");

    deviceIDs = NULL;
    cl_uint i;
    for (i = 0; i < numPlatforms; i++)
    {
        errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
        if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
        {
            checkErr(errNum, "clGetDeviceIDs");
        } else if (numDevices > 0)
        {
            deviceIDs = (cl_device_id *) alloca(sizeof(cl_device_id) * numDevices);
            errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, numDevices, &deviceIDs[0], NULL);
            checkErr(errNum, "clGetDeviceIDs");
            break;
        }
    }

    if (deviceIDs == NULL)
    {
        cout << "No CPU devices found." << endl;
        exit(-1);
    }

    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM, (cl_context_properties) platformIDs[i], 0
    };

    context = clCreateContext(contextProperties, numDevices, deviceIDs, &contextCallback, NULL, &errNum);
    checkErr(errNum, "clCreateContext");


    ifstream srcFile("Convolution.cl");
    checkErr(srcFile.is_open()?CL_SUCCESS:-1, "reading Convolution.cl");

    string srcProg(istreambuf_iterator<char>(srcFile),
                   (istreambuf_iterator<char>()));

    const char* src = srcProg.c_str();
    size_t length = srcProg.length();

    program = clCreateProgramWithSource(context, 1, &src, &length, &errNum);
    checkErr(errNum, "clCreateProgramWithSource");

    cout << "Device count: " << sizeof(deviceIDs)/sizeof(cl_device_id) << endl;

    errNum = clBuildProgram(program, numDevices, deviceIDs, NULL, NULL, NULL);
    checkErr(errNum, "clBuildProgram");

    kernel = clCreateKernel(program, "convolve", &errNum);
    checkErr(errNum, "clCreateKernel");

    inputSignalBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * inputSignalHeight*inputSignalWidth, static_cast<void*>(inputSignal), &errNum);
    checkErr(errNum, "clCreateBuffer(inputSignal)");

    maskBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * maskHeight * maskWidth, static_cast<void*>(mask), &errNum);
    checkErr(errNum, "clCreateBuffer(mask)");

    outputSignalBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, NULL, &errNum);
    checkErr(errNum, "clCreateBuffer(outputSignal)");

    queue = clCreateCommandQueue(context, deviceIDs[0], 0, &errNum);
    checkErr(errNum, "clCreateCommandQueue");

    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputSignalBuffer);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &maskBuffer);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &outputSignalBuffer);
    errNum |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &inputSignalWidth);
    errNum |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &maskWidth);
    checkErr(errNum, "clSetKernelArg");

    const size_t globalWorkSize[1] =
    {
        outputSignalWidth * outputSignalWidth
    };

    const size_t localWorkSize[1] =
    {
        1
    };

    clock_t start, end;

    clFinish(queue);
    start = clock();
    errNum = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
    clFinish(queue);
    end = clock();
    cout << "time for calculation: " << (float)(end - start) << endl;
    checkErr(errNum, "clEnequeueNDRangeKernel");

    errNum = clEnqueueReadBuffer(queue, outputSignalBuffer, CL_TRUE, 0, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, outputSignal, 0, NULL, NULL);
    checkErr(errNum, "clEnqueueReadBuffer");

    clFinish(queue);
    start = clock();
    for (int y = 0; y < outputSignalHeight; y++)
    {
        for (int x = 0; x < outputSignalWidth; x++)
        {
            uint sum = 0;
            for (int r = 0; r < maskWidth; r++)
            {
                for (int c =0; c < maskWidth; c++)
                {
                    sum += inputSignal[y+r][x+c]*mask[r][c];
                }
            }
            outputSignal[y][x] = sum;

        }
    }
    end = clock();
    cout << "Loop version time: " << (float)(end - start) << endl;


    return 0;
}

And Convolution.cl:

__kernel void convolve(const __global uint * const input,
                       __constant uint * const mask,
                       __global uint * const output,
                       const int inputWidth,
                       const int maskWidth)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    uint sum = 0;
    for (int r = 0; r < maskWidth; r++)
    {
        const int idxIntmp = (y + r) * inputWidth + x;
        for (int c =0; c < maskWidth; c++)
        {
            sum+= mask[r * maskWidth + c] * input[idxIntmp + c];
        }
    }

    output[y * get_global_size(0) + x] = sum;
}

The platform is macOS 10.9 with an AMD 6750M. The OpenCL version is much slower than the loop version (roughly 10x slower), which makes no sense to me. Can you help me point out what is wrong with the code?

1 answer:

Answer 0 (score: 5):

There are two main problems:

const size_t globalWorkSize[1] = { outputSignalWidth * outputSignalWidth };

First, as Basile Starynkevitch pointed out in the comments, your data set is very small: far too small to benefit from any GPU acceleration. Here you are running only 36 work-items in total, so few that they fill only half of a single wavefront on one compute unit.

You should be running thousands of work-items to properly exploit the power of the GPU. As it stands, the overhead of OpenCL makes the GPU version slower than the CPU version. Try a much larger data set and you should see a significant performance improvement.
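To give a rough idea, here is a minimal sketch of what a larger launch could look like. The 1024x1024 size, the bigInput/bigOutput names and the use of std::vector are my own illustrative assumptions, not part of your code:

// Illustrative only: a much larger input signal so the GPU actually has work to do.
// Requires #include <vector>.
const size_t bigSize = 1024;                       // assumed size, not from the question
const size_t bigOutSize = bigSize - maskWidth + 1;
std::vector<cl_uint> bigInput(bigSize * bigSize, 1);
std::vector<cl_uint> bigOutput(bigOutSize * bigOutSize);

// Buffers sized for the larger signal (error checks omitted for brevity).
cl_mem bigInputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                       sizeof(cl_uint) * bigInput.size(),
                                       bigInput.data(), &errNum);
cl_mem bigOutputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                        sizeof(cl_uint) * bigOutput.size(), NULL, &errNum);

// With roughly a million work-items the launch overhead stops dominating the runtime.
// (The kernel's inputWidth argument would also need to be set to bigSize.)
const size_t bigGlobalSize[1] = { bigOutSize * bigOutSize };
errNum = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, bigGlobalSize, NULL, 0, NULL, NULL);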

Also:

const size_t localWorkSize[1] = { 1 };

You are running outputSignalWidth * outputSignalWidth work-groups of 1 work-item each. That is a huge problem.

On AMD GPUs the wavefront size is 64. This means that if you want to make full use of the hardware, you should schedule work-groups of at least 64 work-items (ideally a multiple of 64). Right now you are wasting 63 out of every 64 hardware threads: 98.4% of the GPU is doing nothing at all!
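If you are unsure what the hardware prefers, you can ask the driver. clGetKernelWorkGroupInfo with CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE is a standard OpenCL query; the variable name below is just for illustration:

// Query the preferred work-group size multiple for this kernel on the chosen device.
// On AMD GPUs of that generation this typically reports the wavefront size, 64.
size_t preferredMultiple = 0;
errNum = clGetKernelWorkGroupInfo(kernel, deviceIDs[0],
                                  CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
                                  sizeof(size_t), &preferredMultiple, NULL);
checkErr(errNum, "clGetKernelWorkGroupInfo");
cout << "Preferred work-group size multiple: " << preferredMultiple << endl;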

Adjust your code to use larger work-groups (and change the global work size accordingly), or let the OpenCL driver choose a good size for you by passing NULL instead of localWorkSize.
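A rough sketch of both options. Note that in OpenCL 1.x the global size must be a multiple of the local size, so the 2D variant assumes an output whose width and height are multiples of 8; outWidth and outHeight below are placeholders, not variables from your code:

// Option 1: keep the 1D launch but let the driver pick the local size.
errNum = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);

// Option 2: a 2D launch with 8 x 8 = 64 work-items per work-group, which also
// matches the kernel's use of get_global_id(0) and get_global_id(1).
const size_t globalSize2D[2] = { outWidth, outHeight };   // placeholders for a larger output
const size_t localSize2D[2]  = { 8, 8 };
errNum = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize2D, localSize2D, 0, NULL, NULL);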

In short, you are using a huge backhoe to move a tiny pile of gravel.