我有一段 OpenCL 代码,在循环中多次调用 clEnqueueWriteBuffer 和 clEnqueueNDRangeKernel。我使用 GetLocalTime 函数测量每次迭代的数据传输时间和内核执行时间。我遇到的问题是:第一次迭代中的 clEnqueueWriteBuffer 和 clEnqueueNDRangeKernel 所花的时间明显长于第二次及之后各次迭代中的同样调用。为什么会这样?
我使用的是搭载 AMD A10 APU 的系统。我的 OpenCL 循环代码如下:
// Processes the input in PARTITION_COUNT partitions. Each iteration:
//   1. uploads one partition of input_A..input_D into buf_A..buf_D,
//   2. runs the filter1 -> filter2 -> filter3 -> aggregate kernel chain,
//   3. maps output_buf_A and accumulates its positive entries into `sum`.
// Wall-clock timestamps (GetLocalTime) are printed around phase 1 ("Time")
// and around phases 2-3 ("Time1"); "Time2" covers the whole loop.
//
// NOTE(review): GetLocalTime is a wall-clock source with coarse resolution;
// for per-command timings consider OpenCL event profiling — TODO confirm the
// queues can be created with CL_QUEUE_PROFILING_ENABLE.
// NOTE(review): the first iteration presumably also pays one-time runtime
// costs (e.g. deferred buffer allocation); a warm-up iteration before timing
// would isolate them — verify on the target driver.
for (j = 0; j < PARTITION_COUNT; j++) {
    // ---- Phase 1: upload partition j (blocking writes, one queue per buffer)
    GetLocalTime(&start);

    // The return code must be captured: previously cl_err was left stale and
    // checkErr could never observe a failed write.
    cl_err = clEnqueueWriteBuffer(queue[0], buf_A, CL_TRUE, 0, PARTITION_SIZE * sizeof(int), input_A + (PARTITION_SIZE * j), 0, NULL, &eventList[0]);
    checkErr(cl_err, "clEnqueueWriteBuffer : buf_A");
    cl_err = clEnqueueWriteBuffer(queue[1], buf_B, CL_TRUE, 0, PARTITION_SIZE * sizeof(int), input_B + (PARTITION_SIZE * j), 0, NULL, &eventList[1]);
    checkErr(cl_err, "clEnqueueWriteBuffer : buf_B");
    cl_err = clEnqueueWriteBuffer(queue[2], buf_C, CL_TRUE, 0, PARTITION_SIZE * sizeof(int), input_C + (PARTITION_SIZE * j), 0, NULL, &eventList[2]);
    checkErr(cl_err, "clEnqueueWriteBuffer : buf_C");
    cl_err = clEnqueueWriteBuffer(queue[3], buf_D, CL_TRUE, 0, PARTITION_SIZE * sizeof(int), input_D + (PARTITION_SIZE * j), 0, NULL, &eventList[3]);
    checkErr(cl_err, "clEnqueueWriteBuffer : buf_D");

    // The writes above are already blocking (CL_TRUE); the clFinish calls are
    // kept so the measured interval is unchanged.
    clFinish(queue[0]);
    clFinish(queue[1]);
    clFinish(queue[2]);
    clFinish(queue[3]);

    GetLocalTime(&end);
    // Start/end timestamps of the transfer phase (minute second millisecond).
    cout<<"\nTime : "<<start.wMinute<<" "<<start.wSecond<<" "<<start.wMilliseconds;
    cout<<"\nTime : "<<end.wMinute<<" "<<end.wSecond<<" "<<end.wMilliseconds;

    // ---- Phase 2: kernel chain, each stage on its own queue and ordered
    // through events: filter1 waits on the four writes (eventList[0..3]),
    // every later stage waits on the event of the stage before it.
    GetLocalTime(&start);

    cl_err = clEnqueueNDRangeKernel(queue[4], kernel[Q6_PROGRAM_ID][FILTER1_KERNEL], 1, NULL, &globalSize, &localSize, 4, eventList, &eventList[4]);
    checkErr(cl_err, "clEnqueueNDRangeKernel : filter1_kernel");

    // Second filter kernel: depends on filter1 (eventList[4]).
    cl_err = clEnqueueNDRangeKernel(queue[5], kernel[Q6_PROGRAM_ID][FILTER2_KERNEL], 1, NULL, &globalSize, &localSize, 1, eventList + 4, &eventList[5]);
    checkErr(cl_err, "clEnqueueNDRangeKernel : filter2_kernel");

    // Third filter kernel: depends on filter2 (eventList[5]).
    cl_err = clEnqueueNDRangeKernel(queue[6], kernel[Q6_PROGRAM_ID][FILTER3_KERNEL], 1, NULL, &globalSize, &localSize, 1, eventList + 5, &eventList[6]);
    checkErr(cl_err, "clEnqueueNDRangeKernel : filter3_kernel");

    // Aggregate kernel: depends on filter3 (eventList[6]). Note it runs on
    // queue[8]; queue[7] is not used in this loop.
    cl_err = clEnqueueNDRangeKernel(queue[8], kernel[Q6_PROGRAM_ID][AGGREGATE_KERNEL], 1, NULL, &globalSize, &localSize, 1, eventList + 6, &eventList[7]);
    checkErr(cl_err, "clEnqueueNDRangeKernel : aggregate kernel");

    // ---- Phase 3: blocking read-map of the result once the aggregate kernel
    // (eventList[7]) has finished.
    output_A = (int *)clEnqueueMapBuffer(queue[9], output_buf_A, CL_TRUE, CL_MAP_READ, 0, rLen * sizeof(int), 1, eventList + 7, &eventList[8], &cl_err);
    // Message corrected: the failing call would be clEnqueueMapBuffer.
    checkErr(cl_err, "clEnqueueMapBuffer : output_A");

    // Accumulate only the strictly positive result entries.
    for (i = 0; i < rLen; i++) {
        if (output_A[i] > 0) {
            sum += output_A[i];
        }
    }

    // Release the mapping before the next iteration enqueues kernels again;
    // previously the buffer stayed mapped for the rest of the run. After this
    // call output_A must not be dereferenced until it is mapped again.
    cl_err = clEnqueueUnmapMemObject(queue[9], output_buf_A, output_A, 0, NULL, NULL);
    checkErr(cl_err, "clEnqueueUnmapMemObject : output_A");

    clFinish(queue[4]);
    clFinish(queue[5]);
    clFinish(queue[6]);
    clFinish(queue[8]);
    clFinish(queue[9]);

    // NOTE(review): eventList[0..8] are overwritten on every iteration without
    // clReleaseEvent; if nothing outside this loop reads them, release them
    // here to avoid leaking event objects — confirm against the caller.

    GetLocalTime(&end);
    // Start/end timestamps of the kernel + readback phase.
    cout<<"\nTime1 : "<<start.wMinute<<" "<<start.wSecond<<" "<<start.wMilliseconds;
    cout<<"\nTime1 : "<<end.wMinute<<" "<<end.wSecond<<" "<<end.wMilliseconds;
}

// Total elapsed wall-clock time for all partitions.
// NOTE(review): start1 is not assigned in this fragment; presumably it is
// captured with GetLocalTime(&start1) just before the loop — verify.
GetLocalTime(&end1);
cout<<"\nTime2 : "<<start1.wMinute<<" "<<start1.wSecond<<" "<<start1.wMilliseconds;
cout<<"\nTime2 : "<<end1.wMinute<<" "<<end1.wSecond<<" "<<end1.wMilliseconds;
我的输出是:
Time : 27 30 404
Time : 27 30 466
Time1 : 27 30 474
Time1 : 27 30 547
Time : 27 30 551
Time : 27 30 555
Time1 : 27 30 561
Time1 : 27 30 582
Time : 27 30 587
Time : 27 30 591
Time1 : 27 30 597
Time1 : 27 30 617
Time : 27 30 622
Time : 27 30 627
Time1 : 27 30 638
Time1 : 27 30 659
Time : 27 30 670
Time : 27 30 675
Time1 : 27 30 679
Time1 : 27 30 699
Time : 27 30 706
Time : 27 30 711
Time1 : 27 30 718
Time1 : 27 30 737
Time2 : 27 30 404
Time2 : 27 30 743
程序执行