我正在使用 clEnqueueNDRangeKernel 函数,它比 clEnqueueTask 快得多。尽管如此,我仍然无法让耗时低于 16 毫秒,即使增大 global_item_size 也无济于事:当 global_item_size 超过 3 之后,速度就不再提升,而我认为 global_size 越大应该运行得越快。我的理解有误吗?我该如何解决?
/*
 * Reduces 4-byte-per-pixel input to one byte per pixel: each output byte is
 * the integer average of the first three bytes of the corresponding input
 * pixel. The fourth byte of every pixel is skipped (presumably alpha or
 * padding -- confirm against the host-side pixel format).
 *
 * pDataIn  - input bytes, 4 per pixel.
 * pDataOut - output bytes, 1 per pixel; pixel p is written to pDataOut[p].
 * InSize   - number of bytes in pDataIn; a trailing partial pixel is ignored.
 * OutSize  - number of bytes in pDataOut; no write goes past this bound.
 *
 * Work is split by pixel index, not by byte range: work-item `gid` handles
 * pixels gid, gid + gsize, gid + 2*gsize, ... This keeps every work-item on a
 * pixel boundary for any global size and leaves no unprocessed tail when the
 * pixel count is not a multiple of the global size. Neighbouring work-items
 * also touch neighbouring pixels on each pass.
 */
__kernel void red_to_green(__global unsigned char *pDataIn, __global unsigned char *pDataOut, unsigned int InSize, unsigned int OutSize)
{
    const unsigned int gid = get_global_id(0);
    const unsigned int gsize = get_global_size(0);

    /* Number of complete pixels that can be both read and written. */
    unsigned int pixels = InSize / 4;
    if (pixels > OutSize)
    {
        pixels = OutSize;
    }

    for (unsigned int p = gid; p < pixels; p += gsize)
    {
        const unsigned int i = p * 4;
        /* The uchar operands are promoted to int, so the sum cannot wrap. */
        pDataOut[p] = (pDataIn[i] + pDataIn[i + 1] + pDataIn[i + 2]) / 3;
    }
}
vector<unsigned char> pDataIn;
vector<unsigned char> pDataOut;
SizeIn = pDataIn.size();
SizeOut = pDataOut.size();
const size_t cycles_max = 100;
clock_t t4 = clock();
for (int i = 0; i<cycles_max; i++){
double start_time = clock();
double search_time = 0;
//float last_time = 0;
//execute opencl kernel
//ret = clEnqueueTask(command_queue, kernel, 0, NULL, NULL);
size_t global_item_size = 3;
size_t local_item_size = 1;
ret = clEnqueueNDRangeKernel(command_queue,kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
//copy from buffer
ret = clEnqueueReadBuffer(command_queue, memobj1, CL_TRUE, 0, pDataOut.size(), pDataOut.data(), 0, NULL, NULL);
ret = clFinish(command_queue);
double end_time = clock(); // конечное время
search_time = end_time - start_time;
//float last_time = last_time + search_time;
cout << search_time << " ms" << endl;
}
clock_t t5 = clock();
double time_seconds2 = (t5-t4)*CLOCKS_PER_SEC/cycles_max;
cout << "Average time: " << time_seconds2/1000 << " ms" <<endl;
WriteBmpFile(L"3840x2160_ndrange.bmp", iWidth, iHeight, 8, pDataOut.size(), pDataOut.data(), false);
system("PAUSE");
答案 0 :(得分:1)
尽管如此,我仍然无法让耗时低于 16 毫秒,即使增大 global_item_size 也没有帮助。当 global_item_size 超过 3 之后,速度就不再提升,而我认为 global_size 越大应该运行得越快。我错了吗?我该如何解决?
将本地大小(local size)设置为 1 时,增加全局大小的帮助非常有限。这意味着每个工作组只包含 1 个工作项,效率非常低。Nvidia GT 740M GPU 拥有 2 个计算单元,也就是说它可以同时运行 2 个工作组,因此全局大小超过 3 之后,您就看不到任何改进了。
尝试将本地大小增加到至少128以充分利用GPU(或512或1024)。 CUDA Occupancy Calculator有助于确定最佳设置。