循环中 clEnqueueWriteBuffer 的执行时间

时间:2016-02-10 08:31:08

标签: opencl amd

我有一个OpenCL代码,我在循环中多次调用clEnqueueWriteBuffer和clEnqueueNDRangeKernel。我使用GetLocalTime函数测量每个循环的数据传输时间和内核执行时间。我面临的问题是第一次迭代中的clEnqueueWriteBuffer和clEnqueueNDRangeKernel需要比第二次迭代中的clEnqueueWriteBuffer和clEnqueueNDRangeKernel更长的时间。为什么会这样?

我正在使用搭载 AMD A10 APU 的系统。我的 OpenCL 循环代码是:

// Processes the input in PARTITION_COUNT partitions. Each iteration:
//   1. uploads one partition of input_A..input_D into buf_A..buf_D,
//   2. runs filter1 -> filter2 -> filter3 -> aggregate, chained through events,
//   3. maps output_buf_A and adds its positive entries to `sum`.
// The upload phase and the kernel phase are timestamped separately with
// GetLocalTime and printed as "Time" and "Time1" respectively.
//
// NOTE(review): the first pass through this loop presumably also pays one-time
// runtime costs (e.g. deferred buffer allocation), so its timings are expected
// to be higher than later passes — confirm with event profiling.
for(j = 0; j< PARTITION_COUNT; j ++){

    // ---- Phase 1: upload the j-th partition of every input array ----
    GetLocalTime(&start);

    // The return status must be stored in cl_err; otherwise checkErr would
    // test whatever value an earlier call left behind.
    cl_err = clEnqueueWriteBuffer(queue[0], buf_A, CL_TRUE, 0, PARTITION_SIZE * sizeof(int), input_A + (PARTITION_SIZE * j), 0, NULL, &eventList[0]);
    checkErr(cl_err, "clEnqueueWriteBuffer : buf_A");

    cl_err = clEnqueueWriteBuffer(queue[1], buf_B, CL_TRUE, 0, PARTITION_SIZE * sizeof(int), input_B + (PARTITION_SIZE * j), 0, NULL, &eventList[1]);
    checkErr(cl_err, "clEnqueueWriteBuffer : buf_B");

    cl_err = clEnqueueWriteBuffer(queue[2], buf_C, CL_TRUE, 0, PARTITION_SIZE * sizeof(int), input_C + (PARTITION_SIZE * j), 0, NULL, &eventList[2]);
    checkErr(cl_err, "clEnqueueWriteBuffer : buf_C");

    cl_err = clEnqueueWriteBuffer(queue[3], buf_D, CL_TRUE, 0, PARTITION_SIZE * sizeof(int), input_D + (PARTITION_SIZE * j), 0, NULL, &eventList[3]);
    checkErr(cl_err, "clEnqueueWriteBuffer : buf_D");

    // The writes above are blocking (CL_TRUE); the clFinish calls are kept so
    // the measured interval covers exactly the same work as before.
    clFinish(queue[0]);
    clFinish(queue[1]);
    clFinish(queue[2]);
    clFinish(queue[3]);

    //getting end time
    GetLocalTime(&end);
    //displaying final time
    cout<<"\nTime : "<<start.wMinute<<" "<<start.wSecond<<" "<<start.wMilliseconds;
    cout<<"\nTime : "<<end.wMinute<<" "<<end.wSecond<<" "<<end.wMilliseconds;

    // ---- Phase 2: kernel pipeline and result read-back ----
    GetLocalTime(&start);

    // First filter kernel: waits on the four upload events (eventList[0..3]).
    cl_err = clEnqueueNDRangeKernel(queue[4],kernel[Q6_PROGRAM_ID][FILTER1_KERNEL],1,NULL,&globalSize,&localSize,4,eventList,&eventList[4]);
    checkErr(cl_err, "clEnqueueNDRangeKernel : filter1_kernel");

    //Invoking the second filter kernel (waits on filter1's event)
    cl_err = clEnqueueNDRangeKernel(queue[5],kernel[Q6_PROGRAM_ID][FILTER2_KERNEL],1,NULL,&globalSize,&localSize,1,eventList + 4,&eventList[5]);
    checkErr(cl_err, "clEnqueueNDRangeKernel : filter2_kernel");

    //Invoking the third filter kernel (waits on filter2's event)
    cl_err = clEnqueueNDRangeKernel(queue[6],kernel[Q6_PROGRAM_ID][FILTER3_KERNEL],1,NULL,&globalSize,&localSize,1,eventList + 5,&eventList[6]);
    checkErr(cl_err, "clEnqueueNDRangeKernel : filter3_kernel");

    //Invoking the aggregate kernel (waits on filter3's event)
    cl_err = clEnqueueNDRangeKernel(queue[8],kernel[Q6_PROGRAM_ID][AGGREGATE_KERNEL],1,NULL,&globalSize,&localSize,1,eventList + 6,&eventList[7]);
    checkErr(cl_err, "clEnqueueNDRangeKernel : aggregate kernel");

    // Blocking map of the result buffer for reading; waits on the aggregate
    // kernel's event. The error label now names the call actually made.
    output_A = (int *)clEnqueueMapBuffer(queue[9],output_buf_A,CL_TRUE, CL_MAP_READ, 0, rLen * sizeof(int), 1, eventList + 7, &eventList[8], &cl_err);
    checkErr(cl_err, "clEnqueueMapBuffer : output_A");

    // Accumulate only the strictly positive results.
    for(i = 0; i < rLen; i++){
        if(output_A[i] > 0){
            sum += output_A[i];
        }
    }

    // Release the mapping obtained above; without this every iteration leaves
    // an outstanding mapping on output_buf_A.
    cl_err = clEnqueueUnmapMemObject(queue[9], output_buf_A, output_A, 0, NULL, NULL);
    checkErr(cl_err, "clEnqueueUnmapMemObject : output_A");

    clFinish(queue[4]);
    clFinish(queue[5]);
    clFinish(queue[6]);
    clFinish(queue[8]);
    clFinish(queue[9]);

    GetLocalTime(&end);

    //displaying final time
    cout<<"\nTime1 : "<<start.wMinute<<" "<<start.wSecond<<" "<<start.wMilliseconds;
    cout<<"\nTime1 : "<<end.wMinute<<" "<<end.wSecond<<" "<<end.wMilliseconds;

    // eventList[0..8] were all created in this iteration and are overwritten
    // by the next one; release them here (outside the timed region) so the
    // event objects do not accumulate.
    for(int k = 0; k < 9; k++){
        cl_err = clReleaseEvent(eventList[k]);
        checkErr(cl_err, "clReleaseEvent");
    }

}

// Take the closing timestamp once every partition has been processed.
GetLocalTime(&end1);

// Print the overall start and end timestamps (minute, second, millisecond),
// each on its own "Time2" line.
cout << "\nTime2 : " << start1.wMinute << " " << start1.wSecond << " " << start1.wMilliseconds
     << "\nTime2 : " << end1.wMinute << " " << end1.wSecond << " " << end1.wMilliseconds;

我的输出是:

Time : 27 30 404

Time : 27 30 466

Time1 : 27 30 474

Time1 : 27 30 547

Time : 27 30 551

Time : 27 30 555

Time1 : 27 30 561

Time1 : 27 30 582

Time : 27 30 587

Time : 27 30 591

Time1 : 27 30 597

Time1 : 27 30 617

Time : 27 30 622

Time : 27 30 627

Time1 : 27 30 638

Time1 : 27 30 659

Time : 27 30 670

Time : 27 30 675

Time1 : 27 30 679

Time1 : 27 30 699

Time : 27 30 706

Time : 27 30 711

Time1 : 27 30 718

Time1 : 27 30 737

Time2 : 27 30 404

Time2 : 27 30 743

程序执行

0 个答案:

没有答案