#include <sys/types.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <clBLAS.h>
/* Problem dimensions: A holds M*K elements, B holds K*N, C and result hold M*N. */
#define M 4
#define N 3
#define K 5

/* Layout and transpose flags handed to clblasSgemm(). */
static const clblasOrder order = clblasRowMajor;
static const clblasTranspose transA = clblasNoTrans;
static const clblasTranspose transB = clblasNoTrans;

/* Scalar factors handed to clblasSgemm() as alpha and beta. */
static const cl_float alpha = 10.0f;
static const cl_float beta = 20.0f;

/* Input matrix A, written out as M rows of K values. */
static const cl_float A[M * K] = {
    11.0f, 12.0f, 13.0f, 14.0f, 15.0f,
    21.0f, 22.0f, 23.0f, 24.0f, 25.0f,
    31.0f, 32.0f, 33.0f, 34.0f, 35.0f,
    41.0f, 42.0f, 43.0f, 44.0f, 45.0f
};

/* Input matrix B, written out as K rows of N values. */
static const cl_float B[K * N] = {
    11.0f, 12.0f, 13.0f,
    21.0f, 22.0f, 23.0f,
    31.0f, 32.0f, 33.0f,
    41.0f, 42.0f, 43.0f,
    51.0f, 52.0f, 53.0f
};

/* Initial contents of C, written out as M rows of N values. */
static cl_float C[M * N] = {
    11.0f, 12.0f, 13.0f,
    21.0f, 22.0f, 23.0f,
    31.0f, 32.0f, 33.0f,
    41.0f, 42.0f, 43.0f
};

/* Host-side destination for the C buffer read back after the multiplication. */
static cl_float result[M * N];

/* Leading dimensions: each equals the row length of its matrix. */
static const size_t lda = K;
static const size_t ldb = N;
static const size_t ldc = N;

/*
 * Sub-matrix selection. `off` is subtracted from M, N and K in the
 * clblasSgemm() call, and each buffer offset equals one full row plus one
 * element. The offsets are spelled out numerically because `off` is not a
 * constant expression in C and cannot appear in a static initializer.
 */
static const size_t off = 1;
static const size_t offA = K + 1; /* K + off */
static const size_t offB = N + 1; /* N + off */
static const size_t offC = N + 1; /* N + off */
/*
 * Print the file-level `result` array under a heading.
 *
 * str: heading text, printed followed by a colon and a newline.
 *
 * The array is printed as rows of `ldc` values; each value is truncated to
 * int and followed by a space, and each row ends with a newline.
 */
static void
printResult(const char* str)
{
    const size_t total = sizeof(result) / sizeof(cl_float);
    const size_t rows = total / ldc;
    const size_t count = rows * ldc;
    size_t idx;

    printf("%s:\n", str);

    for (idx = 0; idx < count; idx++) {
        printf("%d ", (int)result[idx]);
        /* Close the row once its last column has been written. */
        if ((idx + 1) % ldc == 0) {
            printf("\n");
        }
    }
}
int main(int argc,char** argv){
printf("Let's see what devices are there in this Node\n\n");
cl_int errNum=0,errCPU=0,errGPU=0,errNum1=0,errNum2=0;
cl_uint numPlatforms;
cl_platform_id *platformIds,cpuPlatform,gpuPlatform;
cl_context cpuContext=NULL,gpuContext=NULL;
cl_context_properties cpuProps[3] = {CL_CONTEXT_PLATFORM,0,0},gpuProps[3]={CL_CONTEXT_PLATFORM,0,0};
cl_command_queue cpuQueue = 0,gpuQueue=0;
cl_mem bufA, bufB, bufC;
cl_event cpuevent = NULL,gpuevent=NULL;
char dname[500];
int i,dc,dg;
cl_device_id *cpuDevices,*gpuDevices;
cl_uint numCPUDevices,numGPUDevices,entries;
cl_ulong long_entries;
size_t p_size;
errNum = clGetPlatformIDs(0,NULL,&numPlatforms);
if(errNum == CL_SUCCESS){
printf("Number of Platforms on this Node: %d\n\n",numPlatforms);
platformIds = (cl_platform_id *)malloc(sizeof(cl_platform_id)*numPlatforms);
errNum = clGetPlatformIDs(numPlatforms,platformIds,NULL);
}else{printf("Error in getting number of platforms, Error code:%d\n",errNum);}
if(errNum == CL_SUCCESS){
for(i=0;i<numPlatforms;i++){
printf("Platform Information on %d Platform\n",i+1);
/*Obtain information about platform*/
clGetPlatformInfo(platformIds[i],CL_PLATFORM_NAME,500,dname,NULL);
printf("\tCL_PLATFORM_NAME = %s\n",dname);
clGetPlatformInfo(platformIds[i],CL_PLATFORM_VERSION,500,dname,NULL);
printf("\tCL_PLATFORM_VERSION = %s\n",dname);
/*obtain list of devices available on platform*/
errCPU = clGetDeviceIDs(platformIds[i],CL_DEVICE_TYPE_CPU,0,NULL,&numCPUDevices);
if(errCPU == CL_SUCCESS && numCPUDevices>0){
printf("\t%d CPUs found\n",numCPUDevices);
cpuDevices = (cl_device_id *)malloc(sizeof(cl_device_id)*numCPUDevices);
errCPU = clGetDeviceIDs(platformIds[i],CL_DEVICE_TYPE_CPU,numCPUDevices,cpuDevices,NULL);
if(errCPU = CL_SUCCESS){
cpuPlatform = platformIds[i];
cpuProps[1] = (cl_context_properties)cpuPlatform;
cpuContext = clCreateContext(cpuProps,numCPUDevices,cpuDevices,NULL,NULL,&errCPU);
}else{printf("clCreateContext() for cpu failed with %d\n",errCPU);return 1;}
cpuQueue = clCreateCommandQueue(cpuContext,cpuDevices,0,&errCPU);
if(errCPU != CL_SUCCESS) {
printf( "clCreateCommandQueue() for cpu failed with %d\n", errCPU);
clReleaseContext(cpuContext);
return 1;
}
/* Setup clblas. */
errCPU = clblasSetup();
if(errNum1 != CL_SUCCESS){
printf("clblasSetup() failed with %d\n",errCPU);
clReleaseCommandQueue(cpuQueue);
clReleaseContext(cpuContext);
return 1;
}
}else{printf("\tZero CPUs found\n");}
errGPU = clGetDeviceIDs(platformIds[i],CL_DEVICE_TYPE_GPU,0,NULL,&numGPUDevices);
if(errGPU == CL_SUCCESS && numGPUDevices>0){
printf("\t%d GPUs found\n",numGPUDevices);
gpuDevices = (cl_device_id *)malloc(sizeof(cl_device_id)*numGPUDevices);
errGPU = clGetDeviceIDs(platformIds[i],CL_DEVICE_TYPE_GPU,numGPUDevices,gpuDevices,NULL);
if (errGPU != CL_SUCCESS) {
gpuPlatform = platformIds[i];
gpuProps[1] = (cl_context_properties)gpuPlatform;
gpuContext = clCreateContext(gpuProps,numGPUDevices,gpuDevices,NULL,NULL,&errGPU);
}else{printf("clCreateContext() for GPU failed with %d\n",errGPU);return 1;}
gpuQueue = clCreateCommandQueue(gpuContext,gpuDevices,0,&errGPU);
if(errGPU != CL_SUCCESS) {
printf("clCreateCommandQueue() for GPU failed with %d\n",errGPU);
clReleaseContext(gpuContext);
return 1;
}
/* Setup clblas. */
errGPU = clblasSetup();
if(errGPU != CL_SUCCESS){
printf("clblasSetup() failed with %d\n",errGPU);
clReleaseCommandQueue(gpuQueue);
clReleaseContext(gpuContext);
return 1;
}
}else{printf("\tZero GPUs found\n");}
}
}
else{printf("Error:Failure in clGetPlatformIDs,error code = %d\n",errNum);}
/* Prepare OpenCL memory objects and place matrices inside them. */
bufA = clCreateBuffer(cpuContext,CL_MEM_READ_ONLY,M * K * sizeof(*A),NULL,&errNum1);
bufB = clCreateBuffer(cpuContext,CL_MEM_READ_ONLY,K * N * sizeof(*B),NULL,&errNum1);
bufC = clCreateBuffer(cpuContext,CL_MEM_READ_WRITE,M * N * sizeof(*C),NULL,&errNum1);
errNum1 = clEnqueueWriteBuffer(cpuQueue, bufA, CL_TRUE, 0,M * K * sizeof(*A), A, 0, NULL, NULL);
errNum1 = clEnqueueWriteBuffer(cpuQueue, bufB, CL_TRUE, 0,K * N * sizeof(*B), B, 0, NULL, NULL);
errNum1 = clEnqueueWriteBuffer(cpuQueue, bufC, CL_TRUE, 0,M * N * sizeof(*C), C, 0, NULL, NULL);
/* Call clblas extended function. Perform gemm for the lower right sub-matrices */
errNum1 = clblasSgemm(order,transA,transB,M-off,N-off,K-off,alpha,bufA,offA,lda,bufB,offB,ldb,beta,bufC,offC,ldc,1,&cpuQueue,0,NULL,&cpuevent);
if (errNum1 != CL_SUCCESS) {
printf("clblasSgemmEx() failed with %d\n",errNum1);
ret = 1;
}
else {
/* Wait for calculations to be finished. */
err = clWaitForEvents(1,&cpuevent);
/* Fetch results of calculations from GPU memory. */
err = clEnqueueReadBuffer(cpuQueue,bufC,CL_TRUE,0,M * N * sizeof(*result),result,0,NULL,NULL);
/* At this point you will get the result of SGEMM placed in 'result' array. */
puts("");
printResult("clblasSgemmEx result");
}
/* Release OpenCL events. */
clReleaseEvent(cpuevent);
/* Release OpenCL memory objects. */
clReleaseMemObject(bufC);
clReleaseMemObject(bufB);
clReleaseMemObject(bufA);
/* Finalize work with clblas. */
clblasTeardown();
/* Release OpenCL working objects. */
clReleaseCommandQueue(cpuQueue);
clReleaseContext(cpuContext);
/* Prepare OpenCL memory objects and place matrices inside them. */
bufA = clCreateBuffer(gpuContext,CL_MEM_READ_ONLY,M * K * sizeof(*A),NULL,&errNum1);
bufB = clCreateBuffer(gpuContext,CL_MEM_READ_ONLY,K * N * sizeof(*B),NULL,&errNum1);
bufC = clCreateBuffer(gpuContext,CL_MEM_READ_WRITE,M * N * sizeof(*C),NULL,&errNum1);
errNum1 = clEnqueueWriteBuffer(gpuQueue, bufA, CL_TRUE, 0,M * K * sizeof(*A), A, 0, NULL, NULL);
errNum1 = clEnqueueWriteBuffer(gpuQueue, bufB, CL_TRUE, 0,K * N * sizeof(*B), B, 0, NULL, NULL);
errNum1 = clEnqueueWriteBuffer(gpuQueue, bufC, CL_TRUE, 0,M * N * sizeof(*C), C, 0, NULL, NULL);
/* Call clblas extended function. Perform gemm for the lower right sub-matrices */
errNum1 = clblasSgemm(order,transA,transB,M-off,N-off,K-off,alpha,bufA,offA,lda,bufB,offB,ldb,beta,bufC,offC,ldc,1,&gpuQueue,0,NULL,&gpuevent);
if (errNum1 != CL_SUCCESS) {
printf("clblasSgemmEx() failed with %d\n",errNum1);
ret = 1;
}
else {
/* Wait for calculations to be finished. */
err = clWaitForEvents(1,&gpuevent);
/* Fetch results of calculations from GPU memory. */
err = clEnqueueReadBuffer(gpuQueue,bufC,CL_TRUE,0,M * N * sizeof(*result),result,0,NULL,NULL);
/* At this point you will get the result of SGEMM placed in 'result' array. */
puts("");
printResult("clblasSgemmEx result");
}
/* Release OpenCL events. */
clReleaseEvent(gpuevent);
/* Release OpenCL memory objects. */
clReleaseMemObject(bufC);
clReleaseMemObject(bufB);
clReleaseMemObject(bufA);
/* Finalize work with clblas. */
clblasTeardown();
/* Release OpenCL working objects. */
clReleaseCommandQueue(gpuQueue);
clReleaseContext(gpuContext);
return 0;
}
我想修改这段代码,使其能够在多个GPU上运行,但不清楚clblasSgemm()调用需要做哪些更改。假设上下文中包含三个GPU,我应该如何修改clblasSgemm()调用?我尤其想知道,在我的clblasSgemm()函数调用中,位于&gpuQueue之前的那个参数应如何更改。此代码可以通过修改AMD提供的在线示例代码得到。