The following code is supposed to perform matrix-matrix multiplication using OpenCL. When I execute it, the result matrix C is filled with zeros. That cannot be right, because I am multiplying two diagonal matrices with non-zero diagonal entries.
/*
Taken from http://gpgpu-computing4.blogspot.com/2009/09/matrix-multiplication-2-opencl.html
We have made some modifications, but need to understand what is going on.
*/
// Multiply two matrices A * B = C
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <CL/cl.h>
#include "OpenCLUtils.h"
/* WA : Width of A, HA : Height of A, etc. */
#define WA 1024
#define HA 1024
#define WB 1024
#define LOCAL_WORK_SIZE 16
#define HB WA
#define WC WB
#define HC HA
void identityMultipliedByCoeff(float* data, int width, float coeff)
{
for (int i = 0; i < width*width; ++i)
{
data[i] = 0.0;
}
for (int i = 0; i < width; ++i)
{
data[i*width + i] = 1.0*coeff;
}
}
/////////////////////////////////////////////////////////
// Program main
/////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
// set seed for rand()
srand(2006);
// 1. allocate host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*)malloc(mem_size_A);
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float* h_B = (float*)malloc(mem_size_B);
// 2. initialize host memory
identityMultipliedByCoeff(h_A, WA, 12.0);
identityMultipliedByCoeff(h_B, WB, -9.0);
// 4. allocate host memory for the result C
unsigned int size_C = WC * HC;
unsigned int mem_size_C = sizeof(float) * size_C;
float* h_C = (float*)malloc(mem_size_C);
// 5. Initialize OpenCL
// OpenCL specific variables
cl_device_id device;
cl_context clGPUContext;
cl_command_queue clCommandQue;
cl_program clProgram;
cl_kernel clKernel;
size_t dataBytes;
size_t kernelLength;
cl_int errcode;
// OpenCL device memory for matrices
cl_mem d_A;
cl_mem d_B;
cl_mem d_C;
/*****************************************/
/* Initialize OpenCL */
/*****************************************/
/* Create a device and context */
device = create_device();
clGPUContext = clCreateContext(NULL, 1, &device, NULL, NULL, &errcode);
if (errcode < 0) {
perror("Couldn't create a context");
exit(1);
}
// get the list of GPU devices associated
// with context
errcode = clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, 0, NULL,
&dataBytes);
cl_device_id *clDevices = (cl_device_id *)
malloc(dataBytes);
errcode |= clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, dataBytes,
clDevices, NULL);
//Create a command-queue
clCommandQue = clCreateCommandQueue(clGPUContext,clDevices[0], 0, &errcode);
// Setup device memory
/* We are passing the host memory as an argument. This is where
the device memory obtains the data from the host memory. */
d_C = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size_C, NULL, &errcode);
d_A = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,mem_size_A, h_A, &errcode);
d_B = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,mem_size_B, h_B, &errcode);
// 6. Load and build OpenCL kernel
// Open the .cl file and load it
// into a char* buffer
FILE* fp = fopen("MatMul.cl", "r");
fseek(fp, 0, SEEK_END);
const size_t lSize = ftell(fp);
rewind(fp);
unsigned char* buffer;
buffer = (unsigned char*)malloc(lSize);
fread(buffer, 1, lSize, fp);
fclose(fp);
cl_int status;
clProgram = clCreateProgramWithBinary(clGPUContext,
1, (const cl_device_id *)clDevices,
&lSize, (const unsigned char**)&buffer,
&status, &errcode);
errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL);
clKernel = clCreateKernel(clProgram,
"matrixMul", &errcode);
// 7. Launch OpenCL kernel
size_t localWorkSize[2], globalWorkSize[2];
int wA = WA;
int wC = WC;
/* Set the arguments for the kernel. */
errcode = clSetKernelArg(clKernel, 0,sizeof(cl_mem), (void *)&d_C);
errcode |= clSetKernelArg(clKernel, 1,sizeof(cl_mem), (void *)&d_A);
errcode |= clSetKernelArg(clKernel, 2,sizeof(cl_mem), (void *)&d_B);
errcode |= clSetKernelArg(clKernel, 3,sizeof(int), (void *)&wA);
errcode |= clSetKernelArg(clKernel, 4,sizeof(int), (void *)&wC);
/* Remember that in OpenCL we need to express the
globalWorkSize in terms of the total number of work-items.
The OpenCL runtime will look at the
globalWorkSize and divide by the localWorkSize to
arrive at a 64 by 64 grid of 16 by 16 work-groups. */
localWorkSize[0] = LOCAL_WORK_SIZE;
localWorkSize[1] = LOCAL_WORK_SIZE;
globalWorkSize[0] = WA;
globalWorkSize[1] = HA;
errcode = clEnqueueNDRangeKernel(clCommandQue,clKernel, 2, NULL, globalWorkSize,localWorkSize, 0, NULL, NULL);
//
// The calculation has now been carried out
//
// 8. Retrieve result from device
errcode = clEnqueueReadBuffer(clCommandQue,
d_C, CL_TRUE, 0, mem_size_C,
h_C, 0, NULL, NULL);
// We must check the result
for (int i = 0; i < WA; i++)
{
for (int j = 0; j < WA; j++)
{
float prod = 0;
for (int k = 0; k < WA;k++)
{
prod += h_A[i*WA + k] * h_B[k*WA + j];
}
if (fabs(h_C[i*WA+j] - prod) > 0.01)
{
printf("The indices where the comparison failed, i = %d, j = %d\n", i,j);
printf("C[i*WA+j] should equal %f\n", prod);
printf("C[i*WA+j] = %f\n", h_C[i*WA + j]);
perror("The matrix check has failed");
exit(1);
}
}
}
printf("The matrix check has been successfull!\n");
// 10. clean up memory
free(h_A);
free(h_B);
free(h_C);
clReleaseMemObject(d_A);
clReleaseMemObject(d_C);
clReleaseMemObject(d_B);
free(clDevices);
clReleaseContext(clGPUContext);
clReleaseKernel(clKernel);
clReleaseProgram(clProgram);
clReleaseCommandQueue(clCommandQue);
}
The .cpp file behind the header OpenCLUtils.h is given below:
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <CL/cl.h>
cl_device_id create_device() {
cl_platform_id* platforms;
cl_uint num_platforms,num_devices;
cl_device_id* devices;
char platform_name_data[50];
char name_data[50];
cl_int i,err,platformchoice;
platformchoice =1;
/* Find out how many platforms there are */
err = clGetPlatformIDs(1, NULL, &num_platforms);
if (err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
/* Reserve memory for platforms*/
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms);
/* Obtain the available platforms and store them in the array platforms */
clGetPlatformIDs(num_platforms, platforms, NULL);
/* We want to know the names of the platforms.
This will then inform us and lead to a
canonical choice for 'platformchoice'. */
for (i = 0; i < num_platforms; i++)
{
err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(platform_name_data), platform_name_data, NULL);
if (err < 0)
{
perror("Unable to obtain information about platform");
}
printf("%s\n", platform_name_data);
}
printf("\nSearching %s for available devices...\n", platform_name_data);
/* Obtain the number of GPUS available on this platform */
err = clGetDeviceIDs(platforms[platformchoice], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
if (err == CL_DEVICE_NOT_FOUND)
{
perror("No GPU devices available");
exit(1);
}
if (err < 0) {
perror("Could not access any devices. Not as a result of the device not being found. Debug for error code");
exit(1);
}
/* Reserve memory for devices */
devices = (cl_device_id*)malloc(sizeof(cl_device_id)*num_devices);
/* Populate devices with devices compatible with the chosen platform */
clGetDeviceIDs(platforms[platformchoice], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL);
for (i = 0; i < num_devices; i++)
{
err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(name_data), name_data, NULL);
if (err < 0)
{
perror("Unable to obtain information about device");
}
printf("%s\n", name_data);
}
printf("\n");
return devices[0];
}
I am using an AMD GPU, an R9 380. The "utility" code above lets me pick a platform and then a device that supports that platform. The result is the same whether I choose the Intel platform or the AMD platform.
The matrix multiplication kernel is given in http://gpgpu-computing4.blogspot.com/2009/10/matrix-multiplication-3-opencl.html
A screenshot from the Visual Studio debugger (not reproduced here) shows that status equals -42, which according to https://streamhpc.com/blog/2013-04-28/opencl-error-codes/ means CL_INVALID_BINARY.
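For what it is worth, clCreateProgramWithBinary also reports a per-device load status through its binary_status argument (the status variable above) in addition to the overall error code, so a check along these lines makes the -42 visible immediately. This is only a sketch reusing the variables from the listing; CL_INVALID_BINARY here usually means the buffer holds OpenCL C source text rather than a previously compiled device binary, which is exactly what a plain .cl file is.
if (errcode != CL_SUCCESS || status != CL_SUCCESS) {
    /* status is the per-device binary_status filled in by clCreateProgramWithBinary */
    printf("clCreateProgramWithBinary failed: errcode = %d, binary status = %d\n", errcode, status);
}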
EDIT:
I have tried
...
// 6. Load and build OpenCL kernel
// Obtain size of source file
FILE* fp = fopen("MatMul.cl", "r");
fseek(fp, 0, SEEK_END);
const size_t lSize = ftell(fp);
rewind(fp);
// Read file content into buffer
unsigned char* buffer = (unsigned char*)malloc(lSize+1);
buffer[lSize] = '\0';
fread(buffer, sizeof(char), lSize, fp);
fclose(fp);
//create program from buffer
clProgram = clCreateProgramWithSource(clGPUContext,1,(const char**)&buffer,&lSize, &errcode);
errcode = clBuildProgram(clProgram, 1,&device, NULL,NULL, NULL);
clKernel = clCreateKernel(clProgram,
"matrixMul", &errcode);
...
But now I get error code -11, CL_BUILD_PROGRAM_FAILURE.
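When clBuildProgram comes back with CL_BUILD_PROGRAM_FAILURE, the compiler log usually says exactly why. A minimal sketch of pulling it out with clGetProgramBuildInfo, reusing clProgram and device from the code above:
if (errcode == CL_BUILD_PROGRAM_FAILURE) {
    size_t logSize;
    /* First query the size of the build log, then fetch and print it */
    clGetProgramBuildInfo(clProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    char* log = (char*)malloc(logSize + 1);
    clGetProgramBuildInfo(clProgram, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
    log[logSize] = '\0';
    printf("Build log:\n%s\n", log);
    free(log);
}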
FINAL EDIT:
I got it working.
Here is the corrected kernel:
/* Matrix multiplication: C = A * B.
* Device code.
*/
// Thread block size
#define BLOCK_SIZE 16
//////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! wA is A's width and wB is B's width
//////////////////////////////////////////////////////
__kernel void
matrixMul(__global float* C,
__global float* A,
__global float* B, int wA, int wB)
{
float Csub = 0; // <<<<<<< THIS WAS MISSING IN THE BLOG
// Block index
int bx = get_group_id(0);
int by = get_group_id(1);
// Thread index
int tx = get_local_id(0);
int ty = get_local_id(1);
// Index of the first sub-matrix of A processed
// by the block
int aBegin = wA * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed
// by the block
int aEnd = aBegin + wA - 1;
// Step size used to iterate through the
// sub-matrices of A
int aStep = BLOCK_SIZE;
// Index of the first sub-matrix of B processed
// by the block
int bBegin = BLOCK_SIZE * bx;
// Step size used to iterate through the
// sub-matrices of B
int bStep = BLOCK_SIZE * wB;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int a = aBegin, b = bBegin;
a <= aEnd;
a += aStep, b += bStep)
{
// Declaration of the local memory array As
// used to store the sub-matrix of A
__local float As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the local memory array Bs
// used to store the sub-matrix of B
__local float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from global memory
// to local memory; each thread loads
// one element of each matrix
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
// Synchronize to make sure the matrices
// are loaded
barrier(CLK_LOCAL_MEM_FENCE);
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
for (int k = 0; k < BLOCK_SIZE; ++k)
Csub += As[ty][k] * Bs[k][tx];
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
barrier(CLK_LOCAL_MEM_FENCE);
}
// Write the block sub-matrix to device memory;
// each thread writes one element
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
}
Answer (score: 1):
I think the error is in this line:
d_C = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size_A, NULL, &errcode);
The third argument should be mem_size_C.
EDIT: Also, for performance reasons, I suggest you copy matrices A and B with clEnqueueWriteBuffer. Further reading here.
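A rough sketch of what that would look like with the variable names from the question, assuming d_A and d_B are created without CL_MEM_COPY_HOST_PTR:
d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_ONLY, mem_size_A, NULL, &errcode);
d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_ONLY, mem_size_B, NULL, &errcode);
/* Blocking writes (CL_TRUE) so h_A and h_B can be reused immediately afterwards */
errcode  = clEnqueueWriteBuffer(clCommandQue, d_A, CL_TRUE, 0, mem_size_A, h_A, 0, NULL, NULL);
errcode |= clEnqueueWriteBuffer(clCommandQue, d_B, CL_TRUE, 0, mem_size_B, h_B, 0, NULL, NULL);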