Question

我一直在尝试在Mac、Ubuntu和其他平台上对用于FIR滤波的OpenCL主机代码进行性能分析（profiling）。我的主机代码和内核如下所示。

问题在于，无论我为FIR滤波器提供多少个采样点，clEnqueueNDRangeKernel花费的时间都相同。此外，我也测量了clEnqueueReadBuffer和clEnqueueWriteBuffer，不知为何，它们花费的时间同样不随采样点数量变化。在Mac上我使用mach计时以及OpenCL事件进行性能分析，在Ubuntu上我使用PAPI进行性能分析。我无法理解为什么会出现这种情况：理想情况下，随着采样点数量的增加，clEnqueueReadBuffer和clEnqueueWriteBuffer应该花费更多时间，内核执行时间也应该随之增加。

内核： -

// 4-tap FIR filter with fixed coefficients {5, 7, 5, 7}.
// Work-item i computes
//     output[i] = sum over j in [0, 4) of coeff[j] * input[i + 3 - j]
//
// input  - samples to filter; work-item i reads input[i] .. input[i + 3], so
//          the buffer must hold at least (global size + 3) elements.
// output - one filtered sample per work-item; fully overwritten.
__kernel void fir4(
    __global float* input,
    __global float* output)
{
   int i = get_global_id(0);
   int coeff[4] = {5,7,5,7};

   // Sum into a private accumulator and store once. Accumulating directly
   // with `output[i] +=` would fold the previous (possibly uninitialized)
   // contents of the output buffer into the result.
   float acc = 0.0f;

   // Unrolled by hand; same term order as a j = 0..3 loop.
   acc += coeff[0]*(input[i+4-0-1]);
   acc += coeff[1]*(input[i+4-1-1]);
   acc += coeff[2]*(input[i+4-2-1]);
   acc += coeff[3]*(input[i+4-3-1]);

   output[i] = acc;
}


// 8-tap FIR filter with fixed coefficients {5, 7, 5, 7, 5, 7, 5, 7}.
// Work-item i computes
//     output[i] = sum over j in [0, 8) of coeff[j] * input[i + 7 - j]
//
// input  - samples to filter; work-item i reads input[i] .. input[i + 7], so
//          the buffer must hold at least (global size + 7) elements.
// output - one filtered sample per work-item; fully overwritten.
__kernel void fir8(
    __global float* input,
    __global float* output)
{
   int i = get_global_id(0);
   int j = 0;
   int coeff[8] = {5,7,5,7,5,7,5,7};

   // Sum into a private accumulator and store once. Accumulating directly
   // with `output[i] +=` would fold the previous (possibly uninitialized)
   // contents of the output buffer into the result.
   float acc = 0.0f;
   for(j=0;j<8;j++)
   {
    acc += coeff[j]*(input[i+8-j-1]);
   }

   output[i] = acc;
}


// 12-tap FIR filter with fixed coefficients alternating 5, 7.
// Work-item i computes
//     output[i] = sum over j in [0, 12) of coeff[j] * input[i + 11 - j]
//
// input  - samples to filter; work-item i reads input[i] .. input[i + 11], so
//          the buffer must hold at least (global size + 11) elements.
// output - one filtered sample per work-item; fully overwritten.
__kernel void fir12(
    __global float* input,
    __global float* output)
{
   int i = get_global_id(0);
   int j = 0;
   int coeff[12] = {5,7,5,7,5,7,5,7,5,7,5,7};

   // Sum into a private accumulator and store once. Accumulating directly
   // with `output[i] +=` would fold the previous (possibly uninitialized)
   // contents of the output buffer into the result.
   float acc = 0.0f;
   for(j=0;j<12;j++)
   {
    acc += coeff[j]*(input[i+12-j-1]);
   }

   output[i] = acc;
}

主机代码： -

// Use a static data size for simplicity
//
#define DATA_SIZE (48000)
#define NUM_COEFF (4)

int main(int argc, char** argv)
{
    uint64_t        start;
    uint64_t        end;
    uint64_t        elapsed;
    double        elapsedmilli;

    int err;                            // error code returned from api calls

    float data[DATA_SIZE];              // original data set given to device
    float coeff[NUM_COEFF];
    float results_host[DATA_SIZE] = {};
    float results[DATA_SIZE];           // results returned from device
    unsigned int correct;               // number of correct results returned

    size_t global;                      // global domain size for our calculation
    size_t local;                       // local domain size for our calculation

    cl_event event;                     //Linking event to kernel for profiling
    cl_platform_id platform_id = NULL;  // compute device platform id
    cl_device_id device_id;             // compute device id 
    cl_context context;                 // compute context
    cl_command_queue commands;          // compute command queue
    cl_program program;                 // compute program
    cl_kernel kernel;                   // compute kernel

    cl_mem input;                       // device memory used for the input array
    cl_mem output;                      // device memory used for the output array

    // Fill our data set with random float values
    //
    int i,j = 0;
    unsigned int count = DATA_SIZE;
    unsigned int taps = NUM_COEFF;
    for(i = 0; i < count; i++)
        data[i] = rand() / (float)RAND_MAX;

    for(i=0; i < taps; i++)
    {
        if(!(i%2))
            coeff[i] = 5;
        else
            coeff[i] = 7;
    }



    //Connect to a platform on device
    err = clGetPlatformIDs(1, &platform_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to locate opencl platform!\n");
        return EXIT_FAILURE;
    }

    // Connect to a compute device
    //
    int gpu = 0;
    err = clGetDeviceIDs(platform_id, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }

    // Create a compute context 
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

    // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }

    //Use function and load the kernel source from .cl files in the same folder
    //
    char *KernelSource = load_program_source("fir.cl");

    // Create the compute program from the source buffer
    //
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }

    // Build the program executable
    //
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }

    // Create the compute kernel in the program we wish to run
    //
    switch(taps)
    {
        case(4):
            {
                kernel = clCreateKernel(program, "fir4", &err);
                break;
            }
        case(8):
            {
                kernel = clCreateKernel(program, "fir8", &err);
                break;
            }
        case(12):
            {
                kernel = clCreateKernel(program, "fir12", &err);
                break;
            }
        default:
            {
                kernel = clCreateKernel(program, "fir4", &err);
                break;
            }
    }
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel! - %d\n",err);
        exit(1);
    }

    // Create the input and output arrays in device memory for our calculation
    //
    input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
    if (!input || !output)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }    
    // Write our data set into the input array in device memory 
    //
    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        exit(1);
    }


    // Set the arguments to our compute kernel
    //
    err = 0;
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }

    // Get the maximum work group size for executing the kernel on the device
    //
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }

    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    //
    global = count;
    local = 48;
    start = mach_absolute_time();
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, &event);
    if (err)
    {
        printf("Error: Failed to execute kernel!-%d\n",err);
        return EXIT_FAILURE;
    }

    // Wait for the command commands to get serviced before reading back results
    //
    clWaitForEvents(1, &event);
    clFinish(commands);
    end = mach_absolute_time();

    cl_ulong time_start, time_end;
    double total_time;
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
    total_time = time_end - time_start;
    printf("cl:main timing:opencl clEnqueueNDRangeKernel %0.3f us\n", total_time / 1000.0);

    elapsed = end - start;
    struct mach_timebase_info info;
    mach_timebase_info(&info);
    double t = 1e-9 * (elapsed) * info.numer / info.denom;
    elapsedmilli = 1e-6 * (elapsed) * info.numer / info.denom;
    printf("cl:main timing:MACH clEnqueueNDRangeKernel %f ms, %d elapsed\n",elapsedmilli,elapsed);

    // Read back the results from the device to verify the output
    //
    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );  
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        exit(1);
    }

    // Validate our results
    //
    correct = 0;
    for(i=0; i<DATA_SIZE; i++)
    {
        for(j=0;j<NUM_COEFF;j++)
        {
            results_host[i]+=coeff[j]*(data[i+NUM_COEFF-j-1]);
        }
        //printf("Host Output[%d]-%f\n",i,results_host[i]);
    }
    for(i = 0; i < count; i++)
    {
        if(results[i] == results_host[i])
            correct++;
        //printf("CL Output[%d]-%f\n",i,results[i]);
    }

    // Print a brief summary detailing the results
    //
    printf("Computed '%d/%d' correct values! Samples-%d,Taps-%d\n", correct, count, DATA_SIZE, NUM_COEFF);

    // Shutdown and cleanup
    //
    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    return 0;
}

Answer 1

每个工作项（work-item）只执行10–20次乘法和加法，这点计算量与内核启动的固定开销相比微不足道。请尝试使用100或1000个抽头的系数数组。

以这种方式让每个工作项读取更多输入元素，只会提高缓存命中次数（以及命中率），因为会有更多线程读取相同的位置。

如果DATA_SIZE达到几百万，数据就无法全部放入缓存，耗时会随数据长度线性增长。48000个采样点意味着数据量不到200kB。例如，HD5850具有512kB的二级高速缓存（带宽约为显存的3倍），每个计算单元还有8kB的L1缓存（速度非常快）。

无论样本大小如何，OPENCL API几乎都需要相同的时间

1 个答案: