在IMX8上的Vivante G7200L上测量FLOPS

时间:2019-07-16 14:58:27

标签: gpu opencl imx6 nxp-microcontroller

您好,我是 OpenCL 的新手,我正尝试在 IMX8 上的 GPU Vivante 7200 Lite 上测量 GFLOPS。

使用下面的代码测得的结果是 0.127989 GFLOPS,而官方公布的性能是 32 GFLOPS。那么这段代码可能出了什么问题?

这是内核代码

/*
 * Benchmark kernel: each work-item reads the single float at input[0],
 * multiplies a running value by it 62,500,000 times, and stores the final
 * value at output[get_global_id(0)].
 *
 * input  - buffer whose first element is the multiplier.
 * output - buffer that must provide one float slot per work-item, because
 *          every work-item writes to its own global index.
 *
 * The loop bound is the per-work-item operation count the host uses when it
 * converts the measured time into FLOPS; keep the two in sync.
 */
__kernel void flops(__global float* input , __global float* output) {
  const int gid = get_global_id(0);
  const float x = *input;

  /* Float literal (f suffix) so the constant is not a double. */
  float scale = 2.35f;

  /* Each multiply depends on the previous result. */
  for (int i = 0; i < 62500000; ++i) {
    scale = scale * x;
  }

  output[gid] = scale;
}

这是主机程序

  #include "gpu_test.h"


 int  main ()
 {


gpu_intialize();

cl_int errNum;
cl_platform_id      platform;            /* OpenCL platform. */
cl_device_id        device;              /* OpenCL device. */
cl_context          context;             /* OpenCL context. */
cl_command_queue    commandQueue;        /* OpenCL command queue. */
cl_program          program;                 /* OpenCL program. */
cl_kernel           kernel;              /* OpenCL kernel. */
cl_mem              memObject_input;         /* OpenCL memory buffer objects. */
cl_mem              memObject_output;        /* OpenCL memory buffer objects. */
cl_event            eventFlops;          /* OpenCL event. */
cl_event        eventWrite;


long time0=0;
long time1=0;
size_t nSize=0;
char *buf;
size_t wg_size=0;
size_t wg_multiple=0;
size_t globalWorkSize[1] = {16};
size_t localWorkSize[1] = {1};

double flopCount=0;  
double consumed_time_in_nano_seconds=0;
double consumed_time_in_seconds=0;
double flop=0;
double Gflop=0;

cl_long maxAllocSizeBytes = 0;
cl_long maxComputeUnits = 0;
cl_long maxWorkGroupSize = 0;
cl_ulong end=0;
cl_ulong start=0;
cl_ulong private_usage=0;
cl_ulong local_usage=0;

printf("Initializing OpenCL...\n");

/* Get the available platform. */
errNum = clGetPlatformIDs(1, &platform, NULL);
clmCHECKERROR(errNum, CL_SUCCESS);

/* Get a GPU device. */
errNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
clmCHECKERROR(errNum, CL_SUCCESS);

/* Create the context. */
context = clCreateContext(0, 1, &device, NULL, NULL, &errNum);
clmCHECKERROR(errNum, CL_SUCCESS);

/* Create a command-queue. */
commandQueue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &errNum);
clmCHECKERROR(errNum, CL_SUCCESS);

printf("Creating program...\n");
printf("Creating program with binary...\n");

nSize = kernel_binary_program_size;
buf = (char *)&kernel_binary_program;

program = clCreateProgramWithBinary(context, 1, &device, &nSize, (const unsigned char**)&buf, NULL, NULL);
if (program == NULL) 
{
    printf("program creation failed \n");
}

printf("Program created successfully! \n");

kernel = clCreateKernel (program, "flops", &errNum);
clmCHECKERROR(errNum, CL_SUCCESS);

printf("Kernel created successfully! \n");


// Identify maximum size of the global memory on the device side


clGetDeviceInfo (device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                     sizeof(cl_long), &maxAllocSizeBytes, NULL);
clGetDeviceInfo (device, CL_DEVICE_MAX_COMPUTE_UNITS,
                     sizeof(cl_long), &maxComputeUnits, NULL);
clGetDeviceInfo (device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                     sizeof(cl_long), &maxWorkGroupSize, NULL);


printf("Max compute unit is %d \n", maxComputeUnits);
printf("Max Work Group size is %d \n", maxWorkGroupSize);



// Creating buffer on the device side



float *input_buffer =(float *) malloc (sizeof(float));
float *output_buffer =(float *) malloc (sizeof(float));

input_buffer[0] = 5.36;
output_buffer[0] = 0;

printf("Host buffer been prepared! \n");

memObject_input = clCreateBuffer (context, CL_MEM_READ_ONLY,
                            sizeof(float), NULL, &errNum);
clmCHECKERROR(errNum,CL_SUCCESS);


memObject_output = clCreateBuffer (context, CL_MEM_WRITE_ONLY,
                            sizeof(float), NULL, &errNum);
clmCHECKERROR(errNum,CL_SUCCESS);

if ((memObject_input == NULL) || (memObject_output == NULL)) {
    printf ("Error creating memory objects \n");
    return false;
}

errNum = clEnqueueWriteBuffer (commandQueue, memObject_input, CL_FALSE, 0,  sizeof(float),
            input_buffer, 0, NULL, &eventWrite);
clmCHECKERROR(errNum,CL_SUCCESS);
    errNum = clWaitForEvents (1, &eventWrite);
    clmCHECKERROR(errNum,CL_SUCCESS);
    clReleaseEvent(eventWrite);

errNum = clEnqueueWriteBuffer (commandQueue, memObject_output, CL_FALSE, 0,  sizeof(float),
                output_buffer, 0, NULL, &eventWrite);
clmCHECKERROR(errNum,CL_SUCCESS);
errNum = clWaitForEvents (1, &eventWrite);
clmCHECKERROR(errNum,CL_SUCCESS);
clReleaseEvent(eventWrite);


errNum = clSetKernelArg (kernel, 0, sizeof(cl_mem), &memObject_input);
    clmCHECKERROR(errNum, CL_SUCCESS);

errNum = clSetKernelArg (kernel, 1, sizeof(cl_mem), &memObject_output);
    clmCHECKERROR(errNum, CL_SUCCESS);



errNum = clGetKernelWorkGroupInfo (kernel, device,
                                   CL_KERNEL_WORK_GROUP_SIZE,
                                   sizeof (wg_size), &wg_size, NULL);
    clmCHECKERROR (errNum, CL_SUCCESS);

errNum = clGetKernelWorkGroupInfo (kernel, device,
                                   CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
                                  sizeof (wg_multiple), &wg_multiple, NULL);
clmCHECKERROR (errNum, CL_SUCCESS);

errNum = clGetKernelWorkGroupInfo (kernel, device,
                                   CL_KERNEL_LOCAL_MEM_SIZE,
                                   sizeof (local_usage), &local_usage, NULL);
clmCHECKERROR (errNum, CL_SUCCESS);

errNum = clGetKernelWorkGroupInfo (kernel, device,
                                   CL_KERNEL_PRIVATE_MEM_SIZE,
                                   sizeof (private_usage), &private_usage, NULL);
clmCHECKERROR (errNum, CL_SUCCESS);


printf("Work Group size is %d \n",wg_size); 
printf("Preferred Work Group size is %d \n",wg_multiple);
printf("Local memory size is %d \n",local_usage);
printf("Private memory size is %d \n",private_usage);       


errNum = clEnqueueNDRangeKernel (commandQueue, kernel, 1, NULL,
                                globalWorkSize, localWorkSize,
                                0, NULL, &eventFlops);
         clmCHECKERROR (errNum, CL_SUCCESS);

printf("Waiting for execution to finish \n");
errNum = clWaitForEvents(1, &eventFlops);
clmCHECKERROR(errNum, CL_SUCCESS);

errNum = clGetEventProfilingInfo(eventFlops, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, 0);
errNum |= clGetEventProfilingInfo(eventFlops, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, 0);
clmCHECKERROR(errNum, CL_SUCCESS);

printf("Kernel execution terminated successfully! \n");

flopCount = 62500000  * 16; // 16 work item
consumed_time_in_nano_seconds = (double)(end - start);
consumed_time_in_seconds = consumed_time_in_nano_seconds/1000000000;

printf("consumed_time_in_nano_seconds : %f \n",consumed_time_in_nano_seconds);
printf("consumed_time_in_seconds : %f \n",consumed_time_in_seconds);

flop = flopCount / consumed_time_in_seconds;
Gflop = flop / 1000000000;

printf("Floating Operations per second : %f \n",flop);
printf("Giga Floating Operations per second : %f \n",Gflop);

cleanup(context,program,kernel,memObject_input,memObject_output,eventFlops);
printf ("Program executed successfully! \n");
 }


/*
 * Releases the OpenCL objects handed over by main(). NULL handles are
 * skipped. The event, buffers, kernel and program are released first and the
 * context last, so the context is no longer leaked.
 *
 * context          - context the other objects were created in.
 * program          - program the kernel was created from.
 * kernel           - kernel object.
 * memObject_input  - input device buffer.
 * memObject_output - output device buffer.
 * eventFlops       - event returned by the kernel launch.
 */
void cleanup (cl_context context,
      cl_program program,
      cl_kernel kernel,
      cl_mem memObject_input,
      cl_mem memObject_output,
      cl_event eventFlops)
{
    if (eventFlops != NULL) {
        clReleaseEvent(eventFlops);
    }

    if (memObject_input != NULL) {
        clReleaseMemObject(memObject_input);
    }

    if (memObject_output != NULL) {
        clReleaseMemObject(memObject_output);
    }

    if (kernel != NULL) {
        clReleaseKernel(kernel);
    }

    if (program != NULL) {
        clReleaseProgram(program);
    }

    /* Previously accepted but never released. */
    if (context != NULL) {
        clReleaseContext(context);
    }
}



/*
 * Compares a status code with the expected one. When they differ, prints the
 * code together with the reporting line and file, then terminates the process
 * with EXIT_FAILURE. Returns normally when they are equal.
 *
 * Value      - status code to check.
 * Reference  - expected status code.
 * FileName   - file name included in the error message.
 * LineNumber - line number included in the error message.
 */
void checkError(cl_int Value, cl_int Reference, const char *FileName,
                const int LineNumber)
{
    if (Value == Reference) {
        return;
    }

    printf("\n !!! Error # %i at line %i , in file %s !!!\n\n",
           Value, LineNumber, FileName);
    printf("Exiting...\n");
    exit(EXIT_FAILURE);
}

 /*
  * Calls GalCore_TaskInit() and then VivanteInit() (both defined outside this
  * file), logging each step to stdout.
  *
  * A non-zero result from GalCore_TaskInit() is reported but does not stop the
  * sequence. The VivanteInit "successful" message is printed unconditionally
  * because no result of VivanteInit() is inspected here.
  */
 void gpu_intialize(void)
 {
     Error error;

     printf("[GalCore] GalCore_TaskInit\n");
     error = GalCore_TaskInit();

     if (error == 0) {
         /* Removed the stray 'e' that followed the newline in this message. */
         printf("[GalCore] GalCore_TaskInit successful \n");
     } else {
         printf("[GalCore] GalCore_TaskInit Failed \n");
     }

     printf("[GalCore] VivanteInit\n");
     VivanteInit();
     printf("[GalCore] VivanteInit successful \n");
 }

所以我想知道这个实现可能存在什么问题。

1 个答案:

答案 0 :(得分:1)

无特殊顺序:

  • 请勿使用主机计时器(getTime())来衡量设备上代码的性能,而应对 eventFlops 事件调用 clGetEventProfilingInfo() 来获取内核的执行时间。
  • 您需要释放 eventFlops 事件,以免发生内存泄漏。
  • 32 GFLOPS 是针对算术密集型代码的理论峰值性能。您的代码每执行 1 次浮点运算就伴随 1 次加载和 1 次存储,换句话说,它受内存带宽限制。如果您希望这样的内核以 32 GFLOPS 运行,则需要 2 × 128 GB/s 的内存带宽。
  • 许多低功耗芯片上的 GPU 具有矢量化单元,为了获得最佳性能,代码必须使用矢量类型(例如 float4)。有时编译器能够自动对代码进行矢量化,但很多时候并不能。