我有一台带有 Radeon R7 GPU 的 AMD A10 APU，我相信这个设备支持并发内核执行。但是，当我编写以下代码并获取分析（profiling）信息时，各内核看起来并没有并发执行。我的 OpenCL 代码如下所示（同一次迭代中的内核被添加到同一个队列，不同迭代中的内核被添加到不同的队列，因此不同迭代的内核应该可以并行运行）。
// Enqueue eight four-stage pipelines (three filter kernels followed by the
// aggregate kernel), one pipeline per command queue queue[4] .. queue[11].
// The first stage of every pipeline waits on eventList[0..3]; each later stage
// waits on the event produced by the stage before it.
// Pipeline j stores its four events in eventList[4 + 4*j] .. eventList[7 + 4*j].
// On any failure the loop stops immediately and cl_err holds the failing
// status, so the code after the loop should check cl_err before using events.
for(j = 0; j < 8; j++){
    cl_command_queue pipelineQueue = queue[4 + j];
    cl_event *stageEvents = &eventList[4 + 4 * j];

    //Invoking the first filter kernel
    cl_err = clEnqueueNDRangeKernel(pipelineQueue, kernel[Q6_PROGRAM_ID][FILTER1_KERNEL], 1, NULL, &globalSize, &localSize, 4, eventList, &stageEvents[0]);
    if(cl_err != CL_SUCCESS) break; // a failed enqueue leaves stageEvents[0] invalid; do not wait on it

    //Invoking the second filter kernel
    cl_err = clEnqueueNDRangeKernel(pipelineQueue, kernel[Q6_PROGRAM_ID][FILTER2_KERNEL], 1, NULL, &globalSize, &localSize, 1, &stageEvents[0], &stageEvents[1]);
    if(cl_err != CL_SUCCESS) break;

    //Invoking the third filter kernel
    cl_err = clEnqueueNDRangeKernel(pipelineQueue, kernel[Q6_PROGRAM_ID][FILTER3_KERNEL], 1, NULL, &globalSize, &localSize, 1, &stageEvents[1], &stageEvents[2]);
    if(cl_err != CL_SUCCESS) break;

    //Invoking the aggregate kernel
    cl_err = clEnqueueNDRangeKernel(pipelineQueue, kernel[Q6_PROGRAM_ID][AGGREGATE_KERNEL], 1, NULL, &globalSize, &localSize, 1, &stageEvents[2], &stageEvents[3]);
    if(cl_err != CL_SUCCESS) break;

    // Issue this queue's commands to the device right away. Without a flush
    // the runtime may hold them back until a later blocking call, in which
    // case the pipelines in different queues get no chance to overlap.
    cl_err = clFlush(pipelineQueue);
    if(cl_err != CL_SUCCESS) break;
}
我用于获取分析（profiling）数据的代码是：
// Collect and print profiling data for the eight kernel pipelines.
// Pipeline j owns the four events eventList[4 + 4*j] .. eventList[7 + 4*j].
// For every event this prints "<stage> : <start> <end>" and accumulates
//   final_time        += END - START   (execution time)
//   final_time_queued += END - QUEUED  (execution time plus time spent queued)
// OpenCL reports profiling timestamps in nanoseconds, so dividing by 1000000
// yields milliseconds.
for(j = 0; j < 8; j++){
    //Code for obtaining the profiling data
    cl_event *stageEvents = eventList + 4 + 4 * j;

    // Earlier pipelines were already waited on in previous iterations, so only
    // this pipeline's four events need to be waited on here.
    cl_int profErr = clWaitForEvents(4, stageEvents);
    if(profErr != CL_SUCCESS){
        std::cerr << "\nclWaitForEvents failed for pipeline " << j << " (error " << profErr << ")";
        continue;
    }

    for(int stage = 0; stage < 4; stage++){
        // Query the three timestamps; stop at the first failure so stale
        // values from a previous event are never accumulated.
        profErr = clGetEventProfilingInfo(stageEvents[stage], CL_PROFILING_COMMAND_QUEUED, sizeof(time_start_queued), &time_start_queued, NULL);
        if(profErr == CL_SUCCESS)
            profErr = clGetEventProfilingInfo(stageEvents[stage], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
        if(profErr == CL_SUCCESS)
            profErr = clGetEventProfilingInfo(stageEvents[stage], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
        if(profErr != CL_SUCCESS){
            // Typical cause: the queue was created without CL_QUEUE_PROFILING_ENABLE.
            std::cerr << "\nclGetEventProfilingInfo failed for pipeline " << j << ", stage " << (stage + 1) << " (error " << profErr << ")";
            continue;
        }

        // Subtract as integers first, then convert, so no precision is lost
        // on large timestamps (assumes the timestamp variables are cl_ulong).
        total_time = (double)(time_end - time_start) / 1000000;
        total_time_queued = (double)(time_end - time_start_queued) / 1000000;
        final_time += total_time;
        final_time_queued += total_time_queued;
        cout << "\n" << (stage + 1) << " : " << time_start << " " << time_end;
    }
}
我的分析代码的输出是:
1:3989633359630 3989657015190
2:3989657016860 3989683273450
3:3989683275090 3989708840030
4:3989708841760 3989734915610
1:3989800219990 3989824648510
2:3989824650240 3989850888860
3:3989850890610 3989876392210
4:3989876393890 3989902432920
1:3989954275546 3989978865766
2:3989978867476 3990005037296
3:3990005038976 3990030592876
4:3990030594566 3990056566896
1:3990113144067 3990137315217
2:3990137316937 3990163458337
3:3990163460057 3990189007267
4:3990189008967 3990215129227
1:3990274589700 3990299102730
2:3990299104430 3990325570980
3:3990325572730 3990351050810
4:3990351052550 3990377255070
1:3990424871514 3990448828874
2:3990448830524 3990475309034
3:3990475310744 3990500849914
4:3990500851664 3990526839444
1:3990584574567 3990608802017
2:3990608803727 3990635102497
3:3990635104427 3990660647987
4:3990660649697 3990686716887
1:3990733269328 3990757174868
2:3990757176588 3990783429448
3:3990783431118 3990809003598
4:3990809005298 3990835207128
答案 0 :(得分:1)
我弄清楚了内核没有并发执行的原因。要并发执行多个内核，GPU 必须有足够的空闲资源。在我的代码中，每个内核的工作组数量非常多，单个内核就占满了 GPU，因此同一时刻只能运行 1 个内核。所以，应当减少每个内核的工作组数量，给其他内核留出资源，才能实现并发执行。
答案 1 :(得分:0)
要实现内核并发执行，您需要使用乱序（out-of-order）命令队列（如果驱动程序支持），或者使用多个顺序（in-order）命令队列。