OpenCL transfer rate exceeds PCIe bandwidth

Date: 2013-12-06 03:22:25

Tags: opencl

I wrote an OpenCL program and used pinned memory (CL_MEM_ALLOC_HOST_PTR) to increase the host-device transfer rate.

The transfer rate increased as I expected (the transfer rates were obtained with AMD APP Profiler 2.4). The problem is that for a 4096 x 4096 matrix (64 MB) the transfer rate is higher than the PCIe bandwidth (93703 GB/s).

The same thing happened when I used a zero-copy buffer (CL_MEM_ALLOC_HOST_PTR + clEnqueueMapBuffer). From what I have read, pinned memory and zero-copy buffers do give high transfer rates, but for a discrete GPU they should still be limited by the PCIe bandwidth. So is it normal for the transfer rate to exceed the PCIe bandwidth (I am on PCIe 2.0 x16)?

My operating system is Windows 7 64-bit. I am using AMD APP SDK 2.6 and a discrete AMD HD 6630M GPU.
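For reference, the profiler's figure can be cross-checked by timing a single transfer with OpenCL profiling events. This is only a sketch, assuming it runs inside the Naive function from the code below, where queue (created with CL_QUEUE_PROFILING_ENABLE), d_A, matrixA and err are in scope:

cl_event evt;
size_t bytes = sizeof(cl_float) * 4096 * 4096;   // 64 MB
err = clEnqueueWriteBuffer(queue, d_A, CL_TRUE, 0, bytes, matrixA, 0, NULL, &evt);

cl_ulong t_start = 0, t_end = 0;
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &t_start, NULL);
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &t_end, NULL);

// Timestamps are in nanoseconds; PCIe 2.0 x16 tops out at roughly 8 GB/s.
double seconds = (t_end - t_start) * 1e-9;
printf("Host-to-device: %.2f GB/s\n", (bytes / seconds) / 1e9);
clReleaseEvent(evt);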

Edit: Here is the code:

#include <Windows.h>
#include <iostream>
#include <fstream>
#include <string>
using namespace std;

#ifdef __APPLE__   
   #include <OpenCL/opencl.h>   
#else  
   #include <CL/cl.h>   
#endif 

#define MAX_SOURCE_SIZE (0x100000)

cl_context context = NULL; 
cl_command_queue queue = NULL; 
cl_program program = NULL; 
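
// Note: fillMatrix and printMatrix are called below but were not included in the
// question; these declarations (signatures inferred from the call sites) are
// assumed to be defined elsewhere.
void fillMatrix(cl_float *matrix, int size);
void printMatrix(cl_float *matrix, int totalsize, int size);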

void MatrixMul(cl_mem d_A, cl_mem d_B, cl_mem d_C, int size)
{
cl_int err;
cl_kernel naive;

// Create Kernel Object Bound To Kernel Function 
naive = clCreateKernel(program, "naiveAlgorithm", &err);

//Set the size of the global work items and the work items in each work group
int globalsize = size;
int localsize;

if(globalsize >= 16)
{
    localsize = 16;
}
else
{
    localsize = globalsize;
}

size_t global_work_items [2] = {globalsize, globalsize};
size_t local_work_items  [2] = {localsize, localsize};

// Setup Kernel Argument
err = clSetKernelArg(naive, 0, sizeof(cl_mem), (void *)&d_A);
err = clSetKernelArg(naive, 1, sizeof(cl_mem), (void *)&d_B);
err = clSetKernelArg(naive, 2, sizeof(cl_mem), (void *)&d_C);
err = clSetKernelArg(naive, 3, sizeof(cl_int), (void *)&size);



// Execute OpenCL kernel for Naive Algorithm
err = clEnqueueNDRangeKernel(queue, naive, 2, NULL, global_work_items, local_work_items, 0, NULL, NULL);
clFinish(queue);

//Release Kernel
err = clReleaseKernel(naive);
}

void Naive(cl_float* matrixA, cl_float* matrixB, cl_float* matrixC, int size)
{
int err;
// OpenCL device memory for matrices
cl_mem d_A;
cl_mem d_B;
cl_mem d_C;

// Allocate Device Memory For Input And Output
d_A = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float)*size*size, 0, &err);
d_B = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float)*size*size, 0, &err);
d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float)*size*size, 0, &err);

// Copy Host Memory To Memory Device
err = clEnqueueWriteBuffer(queue, d_A, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixA, 0, NULL, NULL); 
err = clEnqueueWriteBuffer(queue, d_B, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixB, 0, NULL, NULL); 

MatrixMul(d_A, d_B, d_C, size);

err = clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, sizeof(cl_float)*size*size, matrixC, 0, NULL, NULL);

err = clReleaseMemObject(d_A);
err = clReleaseMemObject(d_B);
err = clReleaseMemObject(d_C);
}



//Main Function
int main(int argc, char **argv)
{
//Size of the input and output matrices
cl_int size = 4096; 

//Matrix for input and output
cl_float * matrixA;
cl_float * matrixB;
cl_float * matrixC;

//Allocate  and init memory for the host
matrixA = (cl_float *) malloc(size*size*sizeof(cl_float));
matrixB = (cl_float *) malloc(size*size*sizeof(cl_float));
matrixC = (cl_float *) malloc(size*size*sizeof(cl_float));

//Fill matrix
fillMatrix(matrixA,size);
fillMatrix(matrixB,size);

//print input for matrix A and B
cout<<"Input for matrix A :"<<endl;
printMatrix(matrixA, size*size, size);
cout<<"Input for matrix B :"<<endl;
printMatrix(matrixB, size*size, size);

cl_int err;     // error code   

cl_platform_id* platforms;
cl_uint platformCount;

cl_device_id device;

int platformtype = 0; //0 = AMD APP SDK platform, 1 = Intel SDK platform

clGetPlatformIDs(0, NULL, &platformCount); //get number of platform
platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount); 
clGetPlatformIDs(platformCount, platforms, NULL);  //get list of platform
clGetDeviceIDs (platforms [platformtype], CL_DEVICE_TYPE_GPU, 1, &device, NULL); //get list of devices

const cl_context_properties contextProperties [] =
{CL_CONTEXT_PLATFORM,
     reinterpret_cast<cl_context_properties> (platforms [platformtype]),
     0, 0
};


context = clCreateContext(contextProperties, 1, &device, NULL, NULL, &err);
queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);


//Load Kernel Source 
FILE *fp;
const char fileName[] = "./MatMul_Kernel.cl";
size_t source_size;
char *source_str;

fp = fopen(fileName, "r");
if (!fp) 
{
    fprintf(stderr, "Failed to load kernel.\n");
    exit(1);
}
source_str = (char *)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);

// Create Program Object 
program = clCreateProgramWithSource(context, 1, (const char **) &source_str,
    (const size_t *) &source_size, &err); 

// Build Program 
err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);

Naive(matrixA, matrixB, matrixC, size);

//Cleanup all memory
err = clFlush(queue);
err = clFinish(queue);
err = clReleaseProgram(program);
err = clReleaseCommandQueue(queue);
err = clReleaseContext(context);

// Display result of matrix multiplication
cout<<"Output for matrix C :"<<endl;
printMatrix(matrixC, size*size, size);
cout<<endl;

free(matrixA);
free(matrixB);
free(matrixC);
free(source_str);

return 0;
}

Here is the kernel code:

 __kernel void naiveAlgorithm(__global float *A, __global float *B, __global float *C, int size) {

 int tx = get_global_id(0); //2D Thread IDx
 int ty = get_global_id(1); //2D Thread IDy

 float sum = 0;

 //Calculate result of one element of Matrix C
 for (int k = 0; k < size; k++) {
    sum += A[ty*size+k] * B[k*size+tx];
 }
  C[ty*size+tx] = sum;
 }

Here is the image:

[screenshot not reproduced]

1 Answer:

Answer 0 (score: 1):

I see that your output array is actually located in host memory, because of the CL_MEM_ALLOC_HOST_PTR flag in the following line:

d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float)*size*size, 0, &err);

This means that you should use clEnqueueMapBuffer, then use the matrix however you see fit, and then call clEnqueueUnmapMemObject. Since d_C is already in host memory, the matrixC array is not needed.
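A minimal sketch of that pattern, reusing the queue, d_C, size and printMatrix from the question's code and leaving error checking out:

cl_int mapErr;
cl_float *mappedC = (cl_float *) clEnqueueMapBuffer(queue, d_C, CL_TRUE, CL_MAP_READ, 0,
    sizeof(cl_float) * size * size, 0, NULL, NULL, &mapErr);

// Read the result directly through the mapped pointer instead of copying into matrixC.
printMatrix(mappedC, size * size, size);

clEnqueueUnmapMemObject(queue, d_C, mappedC, 0, NULL, NULL);
clFinish(queue);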

The data transfer from the GPU to the host actually happens while the kernel is running, and the map call only ensures that all of the data has finished moving from the GPU to the CPU. That is why the reported transfer time is so small.

I could not find any documentation on whether clEnqueueReadBuffer works with pinned memory. I also see that you retrieve the error code of every operation but never check those error codes, so your code may be failing silently.
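A simple way to make such failures visible, as a sketch (the checkErr helper is a name assumed here, not part of the question's code):

void checkErr(cl_int err, const char *name)
{
    // Abort with a message as soon as any OpenCL call reports a failure.
    if (err != CL_SUCCESS)
    {
        fprintf(stderr, "OpenCL error %d at %s\n", err, name);
        exit(1);
    }
}

// Usage, e.g. right after creating the output buffer:
// checkErr(err, "clCreateBuffer(d_C)");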

Regarding the big difference between the time spent in clEnqueueReadBuffer and the time spent transferring the data: note that enqueued operations are not all sent to the GPU immediately. One source of delay is the Windows Display Driver Model (WDDM) used for graphics cards. The roughly ±20 microseconds reported for clEnqueueReadBuffer sounds about right for that kind of latency (I have actually seen longer delays).