如何使用OpenCL在多个GPU上运行clblasSgemm?

时间:2018-08-24 11:43:38

标签: opencl gpu

            #include <sys/types.h>
            #include <string.h>
            #include <stdio.h>
            #include <stdlib.h>
            #include <CL/cl.h>
            #include <clBLAS.h>

            /*
             * Problem definition for the SGEMM example.
             * M, N and K are the dimensions of the full host matrices:
             * A holds M*K elements, B holds K*N elements, C holds M*N elements.
             */
            #define M  4
            #define N  3
            #define K  5
            /* Storage order passed to clblasSgemm; the initializers below are laid out row by row. */
            static const clblasOrder order = clblasRowMajor;
            /* Scalar passed as the alpha argument of clblasSgemm. */
            static const cl_float alpha = 10;
            /* A is passed without transposition. */
            static const clblasTranspose transA = clblasNoTrans;
            /* Host copy of A: M rows of K values each. */
            static const cl_float A[M*K] = {
                11, 12, 13, 14, 15,
                21, 22, 23, 24, 25,
                31, 32, 33, 34, 35,
                41, 42, 43, 44, 45,
            };
            /* Leading dimension of A: the number of elements per stored row. */
            static const size_t lda = K;        /* i.e. lda = K */
            /* B is passed without transposition. */
            static const clblasTranspose transB = clblasNoTrans;
            /* Host copy of B: K rows of N values each. */
            static const cl_float B[K*N] = {
                11, 12, 13,
                21, 22, 23,
                31, 32, 33,
                41, 42, 43,
                51, 52, 53,
            };
            /* Leading dimension of B: the number of elements per stored row. */
            static const size_t ldb = N;        /* i.e. ldb = N */
            /* Scalar passed as the beta argument of clblasSgemm. */
            static const cl_float beta = 20;
            /* Host copy of C: M rows of N values each; uploaded to the device before the SGEMM call. */
            static cl_float C[M*N] = {
                11, 12, 13,
                21, 22, 23,
                31, 32, 33,
                41, 42, 43,
            };
            /* Leading dimension of C; printResult also uses it as the row width of 'result'. */
            static const size_t ldc = N;        /* i.e. ldc = N */
            /* Host buffer that receives the device copy of C after the computation; printed by printResult. */
            static cl_float result[M*N];
            /*
             * The SGEMM call works on sub-matrices: 'off' is subtracted from M, N and K
             * in the call, and each offX below is the element offset of row 1, column 1
             * (one full row plus one element) inside the corresponding buffer.
             */
            static const size_t off  = 1;
            static const size_t offA = K + 1;   /* K + off */
            static const size_t offB = N + 1;   /* N + off */
            static const size_t offC = N + 1;   /* N + off */
            static void
            printResult(const char* str)
            {
                size_t i, j, nrows;
                printf("%s:\n", str);
                nrows = (sizeof(result) / sizeof(cl_float)) / ldc;
                for (i = 0; i < nrows; i++) {
                    for (j = 0; j < ldc; j++) {
                        printf("%d ", (int)result[i * ldc + j]);
                    }
                    printf("\n");
                }
            }

            int main(int argc,char** argv){

              printf("Let's see what devices are there in this Node\n\n");

              cl_int errNum=0,errCPU=0,errGPU=0,errNum1=0,errNum2=0;
              cl_uint numPlatforms;
              cl_platform_id *platformIds,cpuPlatform,gpuPlatform;
              cl_context cpuContext=NULL,gpuContext=NULL;
              cl_context_properties cpuProps[3] = {CL_CONTEXT_PLATFORM,0,0},gpuProps[3]={CL_CONTEXT_PLATFORM,0,0};
              cl_command_queue cpuQueue = 0,gpuQueue=0;
              cl_mem bufA, bufB, bufC;
              cl_event cpuevent = NULL,gpuevent=NULL;

              char dname[500];
              int i,dc,dg;
              cl_device_id *cpuDevices,*gpuDevices;
              cl_uint numCPUDevices,numGPUDevices,entries;
              cl_ulong long_entries;
              size_t p_size;

              errNum = clGetPlatformIDs(0,NULL,&numPlatforms);
              if(errNum == CL_SUCCESS){
                   printf("Number of Platforms on this Node: %d\n\n",numPlatforms);
                   platformIds = (cl_platform_id *)malloc(sizeof(cl_platform_id)*numPlatforms);
                   errNum = clGetPlatformIDs(numPlatforms,platformIds,NULL);
              }else{printf("Error in getting number of platforms, Error code:%d\n",errNum);}


              if(errNum == CL_SUCCESS){
                for(i=0;i<numPlatforms;i++){

                   printf("Platform Information on %d Platform\n",i+1);
                   /*Obtain information about platform*/
                   clGetPlatformInfo(platformIds[i],CL_PLATFORM_NAME,500,dname,NULL);
                   printf("\tCL_PLATFORM_NAME = %s\n",dname);
                   clGetPlatformInfo(platformIds[i],CL_PLATFORM_VERSION,500,dname,NULL);
                   printf("\tCL_PLATFORM_VERSION = %s\n",dname);

                   /*obtain list of devices available on platform*/
                   errCPU = clGetDeviceIDs(platformIds[i],CL_DEVICE_TYPE_CPU,0,NULL,&numCPUDevices);
                   if(errCPU == CL_SUCCESS && numCPUDevices>0){
                       printf("\t%d CPUs found\n",numCPUDevices);
                       cpuDevices = (cl_device_id *)malloc(sizeof(cl_device_id)*numCPUDevices);
                       errCPU = clGetDeviceIDs(platformIds[i],CL_DEVICE_TYPE_CPU,numCPUDevices,cpuDevices,NULL);
                       if(errCPU = CL_SUCCESS){
                          cpuPlatform =  platformIds[i];
                          cpuProps[1] = (cl_context_properties)cpuPlatform;
                          cpuContext  = clCreateContext(cpuProps,numCPUDevices,cpuDevices,NULL,NULL,&errCPU);
                       }else{printf("clCreateContext() for cpu failed with %d\n",errCPU);return 1;}

                       cpuQueue = clCreateCommandQueue(cpuContext,cpuDevices,0,&errCPU);
                       if(errCPU != CL_SUCCESS) {
                          printf( "clCreateCommandQueue() for cpu failed with %d\n", errCPU);
                          clReleaseContext(cpuContext);
                          return 1;
                       }
                       /* Setup clblas. */
                       errCPU = clblasSetup();
                       if(errNum1 != CL_SUCCESS){
                          printf("clblasSetup() failed with %d\n",errCPU);
                          clReleaseCommandQueue(cpuQueue);
                          clReleaseContext(cpuContext);
                          return 1;
                       }

                   }else{printf("\tZero CPUs found\n");}

                   errGPU = clGetDeviceIDs(platformIds[i],CL_DEVICE_TYPE_GPU,0,NULL,&numGPUDevices);
                   if(errGPU == CL_SUCCESS && numGPUDevices>0){

                       printf("\t%d GPUs found\n",numGPUDevices);
                       gpuDevices = (cl_device_id *)malloc(sizeof(cl_device_id)*numGPUDevices);
                       errGPU = clGetDeviceIDs(platformIds[i],CL_DEVICE_TYPE_GPU,numGPUDevices,gpuDevices,NULL);

                       if (errGPU != CL_SUCCESS) {
                           gpuPlatform =  platformIds[i];
                           gpuProps[1] = (cl_context_properties)gpuPlatform;
                           gpuContext  = clCreateContext(gpuProps,numGPUDevices,gpuDevices,NULL,NULL,&errGPU);
                       }else{printf("clCreateContext() for GPU failed with %d\n",errGPU);return 1;}

                       gpuQueue = clCreateCommandQueue(gpuContext,gpuDevices,0,&errGPU);
                       if(errGPU != CL_SUCCESS) {
                          printf("clCreateCommandQueue() for GPU failed with %d\n",errGPU);
                          clReleaseContext(gpuContext);
                          return 1;
                       }
                       /* Setup clblas. */
                       errGPU = clblasSetup();
                       if(errGPU != CL_SUCCESS){
                          printf("clblasSetup() failed with %d\n",errGPU);
                          clReleaseCommandQueue(gpuQueue);
                          clReleaseContext(gpuContext);
                          return 1;
                       }

                   }else{printf("\tZero GPUs found\n");}       
                }    
              }
              else{printf("Error:Failure in clGetPlatformIDs,error code = %d\n",errNum);}

              /* Prepare OpenCL memory objects and place matrices inside them. */
                bufA     = clCreateBuffer(cpuContext,CL_MEM_READ_ONLY,M * K * sizeof(*A),NULL,&errNum1);
                bufB     = clCreateBuffer(cpuContext,CL_MEM_READ_ONLY,K * N * sizeof(*B),NULL,&errNum1);
                bufC     = clCreateBuffer(cpuContext,CL_MEM_READ_WRITE,M * N * sizeof(*C),NULL,&errNum1);
                errNum1  = clEnqueueWriteBuffer(cpuQueue, bufA, CL_TRUE, 0,M * K * sizeof(*A), A, 0, NULL, NULL);
                errNum1  = clEnqueueWriteBuffer(cpuQueue, bufB, CL_TRUE, 0,K * N * sizeof(*B), B, 0, NULL, NULL);
                errNum1  = clEnqueueWriteBuffer(cpuQueue, bufC, CL_TRUE, 0,M * N * sizeof(*C), C, 0, NULL, NULL);
              /* Call clblas extended function. Perform gemm for the lower right sub-matrices */
                errNum1 = clblasSgemm(order,transA,transB,M-off,N-off,K-off,alpha,bufA,offA,lda,bufB,offB,ldb,beta,bufC,offC,ldc,1,&cpuQueue,0,NULL,&cpuevent);
                if (errNum1 != CL_SUCCESS) {
                    printf("clblasSgemmEx() failed with %d\n",errNum1);
                    ret = 1;
                }
                else {
                    /* Wait for calculations to be finished. */
                    err = clWaitForEvents(1,&cpuevent);
                    /* Fetch results of calculations from GPU memory. */
                    err = clEnqueueReadBuffer(cpuQueue,bufC,CL_TRUE,0,M * N * sizeof(*result),result,0,NULL,NULL);
                    /* At this point you will get the result of SGEMM placed in 'result' array. */
                    puts("");
                    printResult("clblasSgemmEx result");
                }
                /* Release OpenCL events. */
                clReleaseEvent(cpuevent);
                /* Release OpenCL memory objects. */
                clReleaseMemObject(bufC);
                clReleaseMemObject(bufB);
                clReleaseMemObject(bufA);
                /* Finalize work with clblas. */
                clblasTeardown();
                /* Release OpenCL working objects. */
                clReleaseCommandQueue(cpuQueue);
                clReleaseContext(cpuContext);

               /* Prepare OpenCL memory objects and place matrices inside them. */
                bufA     = clCreateBuffer(gpuContext,CL_MEM_READ_ONLY,M * K * sizeof(*A),NULL,&errNum1);
                bufB     = clCreateBuffer(gpuContext,CL_MEM_READ_ONLY,K * N * sizeof(*B),NULL,&errNum1);
                bufC     = clCreateBuffer(gpuContext,CL_MEM_READ_WRITE,M * N * sizeof(*C),NULL,&errNum1);
                errNum1  = clEnqueueWriteBuffer(gpuQueue, bufA, CL_TRUE, 0,M * K * sizeof(*A), A, 0, NULL, NULL);
                errNum1  = clEnqueueWriteBuffer(gpuQueue, bufB, CL_TRUE, 0,K * N * sizeof(*B), B, 0, NULL, NULL);
                errNum1  = clEnqueueWriteBuffer(gpuQueue, bufC, CL_TRUE, 0,M * N * sizeof(*C), C, 0, NULL, NULL);
              /* Call clblas extended function. Perform gemm for the lower right sub-matrices */
                errNum1 = clblasSgemm(order,transA,transB,M-off,N-off,K-off,alpha,bufA,offA,lda,bufB,offB,ldb,beta,bufC,offC,ldc,1,&gpuQueue,0,NULL,&gpuevent);
                if (errNum1 != CL_SUCCESS) {
                    printf("clblasSgemmEx() failed with %d\n",errNum1);
                    ret = 1;
                }
                else {
                    /* Wait for calculations to be finished. */
                    err = clWaitForEvents(1,&gpuevent);
                    /* Fetch results of calculations from GPU memory. */
                    err = clEnqueueReadBuffer(gpuQueue,bufC,CL_TRUE,0,M * N * sizeof(*result),result,0,NULL,NULL);
                    /* At this point you will get the result of SGEMM placed in 'result' array. */
                    puts("");
                    printResult("clblasSgemmEx result");
                }
                /* Release OpenCL events. */
                clReleaseEvent(gpuevent);
                /* Release OpenCL memory objects. */
                clReleaseMemObject(bufC);
                clReleaseMemObject(bufB);
                clReleaseMemObject(bufA);
                /* Finalize work with clblas. */
                clblasTeardown();
                /* Release OpenCL working objects. */
                clReleaseCommandQueue(gpuQueue);
                clReleaseContext(gpuContext);

              return 0;
            }

我想修改这段代码,使其能够在多个GPU上运行,但不知道需要对clblasSgemm()调用做哪些更改。假设上下文中包含三个GPU,我应该如何修改clblasSgemm()调用?我尤其不清楚该调用中位于&gpuQueue之前的那个参数(即命令队列的数量)应该如何更改。此代码是在AMD提供的在线示例代码基础上修改而来的。

0 个答案:

没有答案