Question

我有一段简单的OpenCL主机代码，它先把一些数据写入支持OpenCL的设备上的缓冲区，然后再将其读回主机内存。程序会针对不同的缓冲区大小重复这一实验。下面是我使用的代码：

#include <cstddef>
#include <cstdio>
#include <iostream>
#include "support.h"
#include "Event.h"
#include "ResultDatabase.h"
#include "OptionParser.h"

using namespace std;

// Registers the command-line option that is specific to this benchmark:
// the boolean flag "nopinned", declared with an empty default value and the
// help text shown below.
//
//   op - option parser the flag is added to.
void addBenchmarkSpecOptions(OptionParser &op)
{
    const char *const optionName = "nopinned";
    const char *const defaultValue = "";
    const char *const helpText = "disable usage of pinned (pagelocked) memory";

    op.addOption(optionName, OPT_BOOL, defaultValue, helpText);
}

// Modifications:
//    Jeremy Meredith, Wed Dec  1 17:05:27 EST 2010
//    Added calculation of latency estimate.
void RunBenchmark(cl_device_id id,
                  cl_context ctx,
                  cl_command_queue queue,
                  ResultDatabase &resultDB,
                  OptionParser &op)
{
    bool verbose = op.getOptionBool("verbose");
    bool pinned = !op.getOptionBool("nopinned");
    int  npasses = op.getOptionInt("passes");
    const bool waitForEvents = true;

    // Sizes are in kb
    int nSizes  = 20;
    int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,
                     32768,65536,131072,262144,524288};

    // Max sure we don't surpass the OpenCL limit.
    cl_long maxAllocSizeBytes = 0;
    clGetDeviceInfo(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                    sizeof(cl_long), &maxAllocSizeBytes, NULL);
    while (sizes[nSizes-1]*1024 > 0.90 * maxAllocSizeBytes)
    {
        --nSizes;
        if (verbose) cout << " - dropping allocation size to keep under reported limit.\n";
        if (nSizes < 1)
        {
            cerr << "Error: OpenCL reported a max allocation size less than 1kB.\b";
            return;
        }
    }

    // Create some host memory pattern
    if (verbose) cout << ">> creating host mem pattern\n";
    int err;
    float *hostMem1;
    float *hostMem2;
    cl_mem hostMemObj1;
    cl_mem hostMemObj2;
    long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
    if (pinned)
    {
        int err1, err2;
        hostMemObj1 = clCreateBuffer(ctx,
                                     CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                     sizeof(float)*numMaxFloats, NULL, &err1);
        if (err1 == CL_SUCCESS)
        {
            hostMem1 = (float*)clEnqueueMapBuffer(queue, hostMemObj1, true,
                                                  CL_MAP_READ|CL_MAP_WRITE,
                                                  0,sizeof(float)*numMaxFloats,0,
                                                  NULL,NULL,&err1);
        }
        hostMemObj2 = clCreateBuffer(ctx,
                                     CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                     sizeof(float)*numMaxFloats, NULL, &err2);
        if (err2 == CL_SUCCESS)
        {
            hostMem2 = (float*)clEnqueueMapBuffer(queue, hostMemObj2, true,
                                                  CL_MAP_READ|CL_MAP_WRITE,
                                                  0,sizeof(float)*numMaxFloats,0,
                                                  NULL,NULL,&err2);
        }
        while (err1 != CL_SUCCESS || err2 != CL_SUCCESS)
        {
            // free the first buffer if only the second failed
            if (err1 == CL_SUCCESS)
                clReleaseMemObject(hostMemObj1);

            // drop the size and try again
            if (verbose) cout << " - dropping size allocating pinned mem\n";
            --nSizes;
            if (nSizes < 1)
            {
                cerr << "Error: Couldn't allocated any pinned buffer\n";
                return;
            }
            numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
            hostMemObj1 = clCreateBuffer(ctx,
                                         CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                         sizeof(float)*numMaxFloats, NULL, &err1);
            if (err1 == CL_SUCCESS)
            {
                hostMem1 = (float*)clEnqueueMapBuffer(queue, hostMemObj1, true,
                                                      CL_MAP_READ|CL_MAP_WRITE,
                                                      0,sizeof(float)*numMaxFloats,0,
                                                      NULL,NULL,&err1);
            }
            hostMemObj2 = clCreateBuffer(ctx,
                                         CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                         sizeof(float)*numMaxFloats, NULL, &err2);
            if (err2 == CL_SUCCESS)
            {
                hostMem2 = (float*)clEnqueueMapBuffer(queue, hostMemObj2, true,
                                                      CL_MAP_READ|CL_MAP_WRITE,
                                                      0,sizeof(float)*numMaxFloats,0,
                                                      NULL,NULL,&err2);
            }
        }
    }
    else
    {
        hostMem1 = new float[numMaxFloats];
        hostMem2 = new float[numMaxFloats];
    }

    for (int i=0; i<numMaxFloats; i++) {
        hostMem1[i] = i % 77;
        hostMem2[i] = -1;
    }

    // Allocate some device memory
    if (verbose) cout << ">> allocating device mem\n";
    cl_mem mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
                                 sizeof(float)*numMaxFloats, NULL, &err);
    while (err != CL_SUCCESS)
    {
        // drop the size and try again
        if (verbose) cout << " - dropping size allocating device mem\n";
        --nSizes;
        if (nSizes < 1)
        {
            cerr << "Error: Couldn't allocated any device buffer\n";
            return;
        }
        numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
        mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
                              sizeof(float)*numMaxFloats, NULL, &err);
    }
    if (verbose) cout << ">> filling device mem to force allocation\n";
    Event evDownloadPrime("DownloadPrime");
    err = clEnqueueWriteBuffer(queue, mem1, false, 0,
                               numMaxFloats*sizeof(float), hostMem1,
                               0, NULL, &evDownloadPrime.CLEvent());
    CL_CHECK_ERROR(err);
    if (verbose) cout << ">> waiting for download to finish\n";
    err = clWaitForEvents(1, &evDownloadPrime.CLEvent());
    CL_CHECK_ERROR(err);

    // Three passes, forward and backward both
    for (int pass = 0; pass < npasses; pass++)
    {
        // store the times temporarily to estimate latency
        //float times[nSizes];
        // Step through sizes forward on even passes and backward on odd
        for (int i = 0; i < nSizes; i++)
        {
            int sizeIndex;
            if ((pass%2) == 0)
                sizeIndex = i;
            else
                sizeIndex = (nSizes-1) - i;

            // Read memory back from the device
            if (verbose) cout << ">> reading from device "<<sizes[sizeIndex]<<"kB\n";
            Event evReadback("Readback");
            err = clEnqueueReadBuffer(queue, mem1, false, 0,
                                       sizes[sizeIndex]*1024, hostMem2,
                                       0, NULL, &evReadback.CLEvent());
            CL_CHECK_ERROR(err);

            // Wait for event to finish
            if (verbose) cout << ">> waiting for readback to finish\n";
            err = clWaitForEvents(1, &evReadback.CLEvent());
            CL_CHECK_ERROR(err);

            if (verbose) cout << ">> finish!";
            if (verbose) cout << endl;

            // Get timings
            err = clFlush(queue);
            CL_CHECK_ERROR(err);
            evReadback.FillTimingInfo();
            if (verbose) evReadback.Print(cerr);

            double t = evReadback.SubmitEndRuntime() / 1.e6; // in ms
            //times[sizeIndex] = t;

            // Add timings to database
            double speed = (double(sizes[sizeIndex] * 1024.) /  (1000.*1000.)) / t;
            char sizeStr[256];
            sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]);
            resultDB.AddResult("ReadbackSpeed", sizeStr, "GB/sec", speed);

            // Add timings to database
            double delay = evReadback.SubmitStartDelay() / 1.e6;
            resultDB.AddResult("ReadbackDelay", sizeStr, "ms", delay);
            resultDB.AddResult("ReadbackTime", sizeStr, "ms", t);
        }
        //resultDB.AddResult("ReadbackLatencyEstimate", "1-2kb", "ms", times[0]-(times[1]-times[0])/1.);
        //resultDB.AddResult("ReadbackLatencyEstimate", "1-4kb", "ms", times[0]-(times[2]-times[0])/3.);
        //resultDB.AddResult("ReadbackLatencyEstimate", "2-4kb", "ms", times[1]-(times[2]-times[1])/1.);
    }

    // Cleanup
    err = clReleaseMemObject(mem1);
    CL_CHECK_ERROR(err);
    if (pinned)
    {
        err = clReleaseMemObject(hostMemObj1);
        CL_CHECK_ERROR(err);
        err = clReleaseMemObject(hostMemObj2);
        CL_CHECK_ERROR(err);
    }
    else
    {
        delete[] hostMem1;
        delete[] hostMem2;
    }
}

在运行代码并根据缓冲区大小绘制速度时，我看到：

如您所见，当数据大小超过1024KB时，读回速度明显下降，之后在达到某个大小后基本保持不变。

我的设备是Tesla K40，使用的是NVIDIA的OpenCL驱动程序。但我仍然不明白造成这种奇怪行为的主要原因。

OpenCL Read Back总线速度大幅下降

0 个答案: