我是并行计算和OpenCL的新手。我按照OpenCLProgramming指南。在卷积实现部分。
我的main.cpp:
#include <iostream>
#include <sstream>
#include <fstream>
#include <string>
#include <OpenCL/OpenCL.h>
using namespace std;
const unsigned int inputSignalWidth = 8;
const unsigned int inputSignalHeight = 8;
cl_uint inputSignal[inputSignalWidth][inputSignalHeight] =
{
{3, 1, 1, 4, 8, 2, 1, 3},
{4, 2, 1, 1, 2, 1, 2, 3},
{4, 4, 4, 4, 3, 2, 2, 2},
{9, 8, 3, 8, 9, 0, 0, 0},
{9, 3, 3, 9, 0, 0, 0, 0},
{0, 9, 0, 8, 0, 0, 0, 0},
{3, 0, 8, 8, 9, 4, 4, 4},
{5, 9, 8 ,1 ,8, 1, 1, 1}
};
const unsigned int outputSignalWidth = 6;
const unsigned int outputSignalHeight = 6;
cl_uint outputSignal[outputSignalWidth][outputSignalHeight];
const unsigned int maskWidth = 3;
const unsigned int maskHeight = 3;
cl_uint mask[maskWidth][maskHeight] =
{
{1, 1, 1}, {1, 0, 1}, {1, 1, 1}
};
inline void checkErr(cl_int err, const char* name)
{
if (err != CL_SUCCESS)
{
cerr << "Error: " << name << endl;
exit(EXIT_FAILURE);
}
}
void CL_CALLBACK contextCallback(const char * errInfo,
const void * private_info,
size_t cb,
void * user_data)
{
cout << "Error occurred during contxt use: " << errInfo << endl;
exit(EXIT_FAILURE);
}
int main(int argc, const char * argv[])
{
cl_int errNum;
cl_uint numPlatforms;
cl_uint numDevices;
cl_platform_id * platformIDs;
cl_device_id * deviceIDs;
cl_context context = NULL;
cl_command_queue queue;
cl_program program;
cl_kernel kernel;
cl_mem inputSignalBuffer;
cl_mem outputSignalBuffer;
cl_mem maskBuffer;
errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
checkErr((errNum != CL_SUCCESS)? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");
platformIDs = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numPlatforms);
errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
checkErr((errNum != CL_SUCCESS)? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatFormIDs");
deviceIDs = NULL;
cl_uint i;
for (i = 0; i < numPlatforms; i++)
{
errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
{
checkErr(errNum, "clGetDeviceIDs");
} else if (numDevices > 0)
{
deviceIDs = (cl_device_id *) alloca(sizeof(cl_device_id) * numDevices);
errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, numDevices, &deviceIDs[0], NULL);
checkErr(errNum, "clGetDeviceIDs");
break;
}
}
if (deviceIDs == NULL)
{
cout << "No CPU devices found." << endl;
exit(-1);
}
cl_context_properties contextProperties[] =
{
CL_CONTEXT_PLATFORM, (cl_context_properties) platformIDs[i], 0
};
context = clCreateContext(contextProperties, numDevices, deviceIDs, &contextCallback, NULL, &errNum);
checkErr(errNum, "clCreateContext");
ifstream srcFile("Convolution.cl");
checkErr(srcFile.is_open()?CL_SUCCESS:-1, "reading Convolution.cl");
string srcProg(istreambuf_iterator<char>(srcFile),
(istreambuf_iterator<char>()));
const char* src = srcProg.c_str();
size_t length = srcProg.length();
program = clCreateProgramWithSource(context, 1, &src, &length, &errNum);
checkErr(errNum, "clCreateProgramWithSource");
cout << "Device count: " << sizeof(deviceIDs)/sizeof(cl_device_id) << endl;
errNum = clBuildProgram(program, numDevices, deviceIDs, NULL, NULL, NULL);
checkErr(errNum, "clBuildProgram");
kernel = clCreateKernel(program, "convolve", &errNum);
checkErr(errNum, "clCreateKernel");
inputSignalBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * inputSignalHeight*inputSignalWidth, static_cast<void*>(inputSignal), &errNum);
checkErr(errNum, "clCreateBuffer(inputSignal)");
maskBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * maskHeight * maskWidth, static_cast<void*>(mask), &errNum);
checkErr(errNum, "clCreateBuffer(mask)");
outputSignalBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, NULL, &errNum);
checkErr(errNum, "clCreateBuffer(outputSignal)");
queue = clCreateCommandQueue(context, deviceIDs[0], 0, &errNum);
checkErr(errNum, "clCreateCommandQueue");
errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputSignalBuffer);
errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &maskBuffer);
errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &outputSignalBuffer);
errNum |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &inputSignalWidth);
errNum |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &maskWidth);
checkErr(errNum, "clSetKernelArg");
const size_t globalWorkSize[1] =
{
outputSignalWidth * outputSignalWidth
};
const size_t localWorkSize[1] =
{
1
};
clock_t start, end;
clFinish(queue);
start = clock();
errNum = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
clFinish(queue);
end = clock();
cout << "time for calculation: " << (float)(end - start) << endl;
checkErr(errNum, "clEnequeueNDRangeKernel");
errNum = clEnqueueReadBuffer(queue, outputSignalBuffer, CL_TRUE, 0, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, outputSignal, 0, NULL, NULL);
checkErr(errNum, "clEnqueueReadBuffer");
clFinish(queue);
start = clock();
for (int y = 0; y < outputSignalHeight; y++)
{
for (int x = 0; x < outputSignalHeight; x++)
{
uint sum = 0;
for (int r = 0; r < maskWidth; r++)
{
for (int c =0; c < maskWidth; c++)
{
sum += inputSignal[y+r][x+c]*mask[r][c];
}
}
outputSignal[y][x] = sum;
}
}
end = clock();
cout << "Loop version time: " << (float)(end - start) << endl;
return 0;
}
和Convolution.cl:
__kernel void convolve(const __global uint * const input,
__constant uint * const mask,
__global uint * const output,
const int inputWidth,
const int maskWidth)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
uint sum = 0;
for (int r = 0; r < maskWidth; r++)
{
const int idxIntmp = (y + r) * inputWidth + x;
for (int c =0; c < maskWidth; c++)
{
sum+= mask[r * maskWidth + c] * input[idxIntmp + c];
}
}
output[y * get_global_id(0) + x] = sum;
}
该平台是MacOS 10.9和AMD 6750M。 CL版本比循环版本慢得多(大约慢10倍),这没有任何意义。你能帮助我指出代码有什么问题吗?
答案 0 :(得分:5)
有两个主要问题:
const size_t globalWorkSize[1] = { outputSignalWidth * outputSignalWidth };
首先,正如Basile Starynkevitch的评论所指出的,您的数据集非常小。太小而无法从任何GPU加速中受益。在这里,您只运行了36个工作项:这个非常小,以至于它只能容纳在一个计算单元上的波前的一半。
您应该运行数千个工作项,以正确利用GPU的强大功能。因此,OpenCL的开销使GPU版本比CPU版本慢。尝试使用更多更大的数据集,您应该注意到性能的显着提高。
此外:
const size_t localWorkSize[1] = { 1 };
您正在运行outputSignalWidth * outputSignalWidth
个1
工作项的工作组。这是一个很大的问题。
在AMD GPU上,波前大小为64.这意味着如果要充分利用硬件,则应安排至少 64个工作项的工作组(理想情况下为64的倍数) 。您目前正在浪费64个64个硬件线程中,98.4%的GPU无所事事!
调整代码以使用更大的工作组(并相应地更改全局工作大小),或者让OpenCL驱动程序为您选择最佳大小,方法是传递NULL
而不是localWorkSize
。< / p>
简而言之,你只是使用一个巨大的反铲来移动一个小砾石。