我有一台MSI Radeon R9 390X 8GB视频卡(如下所示,名为“夏威夷”)。我在Windows 10桌面上安装了OpenCL,我使用Cygwin编译并运行程序。
我正在尝试运行一个示例OpenCL程序,我从大学时代的课程中保留了一些,修改了一点。
它不会在我的显卡上运行。这是我得到的:
$ ./ex26.exe -v 30 40
Bw=30 Bn=40 n=1200
OpenCL Platform 0: AMD Accelerated Parallel Processing
----- OpenCL Device # 0: Hawaii-----
Gflops: 47.520000
Max Compute Units: 44
Max Clock Frequency: 1080
Total Memory of Device (bytes): 8589934592
Max Size of Memory Object Allocation (bytes): 4244635648
Max Work Group Size: 256
Fastest OpenCL Device: Hawaii
Cannot create OpenCL command cue: CL_OUT_OF_HOST_MEMORY
winnerPlatform: 140717488209200
您可以看到下面的代码,以查看此错误语句的打印位置。无论出于何种原因,clCreateCommandQueueWithProperties都返回CL_OUT_OF_HOST_MEMORY。我不明白我的CPU侧内存是如何接近耗尽内存来运行的。我真的不知道。特别是因为所有这些方法都在创建队列。
实际上,如果我将CL_DEVICE_TYPE_GPU切换到CL_DEVICE_TYPE_CPU,那么程序在CPU上执行时没有问题。
它只是在.cpp文件上。我真的找不到任何可以切割的东西,因为它已经是一个例子所以你可以看到我所拥有的代码。
以下是以下所有代码:
#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <CL/opencl.h>
#include <windows.h>
#include <sys/time.h>
/*
* Return elapsed wall time since last call (seconds)
*/
static double t0=0;
float Elapsed(void)
{
#ifdef _WIN32
// Windows version of wall time
LARGE_INTEGER tv,freq;
QueryPerformanceCounter((LARGE_INTEGER*)&tv);
QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
double t = tv.QuadPart/(double)freq.QuadPart;
#else
// Unix/Linux/OSX version of wall time
struct timeval tv;
gettimeofday(&tv,NULL);
double t = tv.tv_sec+1e-6*tv.tv_usec;
#endif
float s = t-t0;
t0 = t;
return s;
}
/*
* Print message to stderr and exit
*/
void Fatal(const char* format , ...)
{
va_list args;
va_start(args,format);
vfprintf(stderr,format,args);
va_end(args);
exit(1);
}
/*
* Initialize matrix with random values
*/
void RandomInit(float x[],const unsigned int n)
{
for (unsigned int i=0;i<n*n;i++)
x[i] = rand() / (float)RAND_MAX;
}
/*
* OpenCL notify callback (echo to stderr)
*/
void Notify(const char* errinfo,const void* private_info,size_t cb,void* user_data)
{
fprintf(stderr,"%s\n",errinfo);
}
class ErrorReader {
public:
private:
};
/*
* Initialize fastest OpenCL device
*/
cl_device_id _DEV_ID;
cl_context _CONTEXT;
cl_command_queue _QUEUE;
int InitGPU(int verbose)
{
cl_uint Nplat;
cl_int err;
char name[1024];
int MaxGflops = -1;
cl_platform_id winnerPlatform = 0;
// Get platforms
cl_platform_id platforms[8];
if (clGetPlatformIDs(8, platforms, &Nplat)) Fatal("Cannot get number of OpenCL platforms\n");
else if (Nplat<1) Fatal("No OpenCL platforms found\n");
// Loop over platforms
for (unsigned int platform = 0; platform < Nplat; platform++) {
if (clGetPlatformInfo(platforms[platform], CL_PLATFORM_NAME, sizeof(name), name, NULL)) Fatal("Cannot get OpenCL platform name\n");
if (verbose) printf("OpenCL Platform %d: %s\n", platform, name);
// Get GPU device IDs
cl_uint Ndev;
cl_device_id id[1024];
if (clGetDeviceIDs(platforms[platform], CL_DEVICE_TYPE_GPU, 1024, id, &Ndev))
Fatal("Cannot get number of OpenCL devices\n");
else if (Ndev<1)
Fatal("No OpenCL devices found\n");
// Find the fastest device
for (unsigned int devId = 0; devId < Ndev; devId++) {
cl_uint compUnits, freq;
cl_ulong memSize, maxAlloc;
size_t maxWorkGrps;
if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compUnits), &compUnits, NULL)) Fatal("Cannot get OpenCL device units\n");
if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(freq), &freq, NULL)) Fatal("Cannot get OpenCL device frequency\n");
if (clGetDeviceInfo(id[devId], CL_DEVICE_NAME, sizeof(name), name, NULL)) Fatal("Cannot get OpenCL device name\n");
if (clGetDeviceInfo(id[devId], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(memSize), &memSize, NULL)) Fatal("Cannot get OpenCL memory size.\n");
if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(memSize), &maxAlloc, NULL)) Fatal("Cannot get OpenCL memory size.\n");
if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(maxWorkGrps), &maxWorkGrps, NULL)) Fatal("Cannot get OpenCL max work group size\n");
int Gflops = compUnits * freq;
if (verbose) printf(" ----- OpenCL Device # %d: %s-----\n"
"Gflops: %f\n"
"Max Compute Units: %d\n"
"Max Clock Frequency: %d\n"
"Total Memory of Device (bytes): %lu\n"
"Max Size of Memory Object Allocation (bytes): %lu\n"
"Max Work Group Size: %zu\n\n",
devId,
name,
1e-3*Gflops,
compUnits,
freq,
memSize,
maxAlloc,
maxWorkGrps);
if (Gflops > MaxGflops)
{
_DEV_ID = id[devId];
MaxGflops = Gflops;
winnerPlatform = platforms[platform];
}
}
}
// Print fastest device info
if (clGetDeviceInfo(_DEV_ID,CL_DEVICE_NAME,sizeof(name),name,NULL)) Fatal("Cannot get OpenCL device name\n");
printf("Fastest OpenCL Device: %s\n",name);
// Check thread count
size_t mwgs;
if (clGetDeviceInfo(_DEV_ID,CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(mwgs),&mwgs,NULL)) Fatal("Cannot get OpenCL max work group size\n");
printf("winnerPlatform: %zu", winnerPlatform);
// cl_platform_id platform = NULL;
// int retValue = GetPlatform(&platform, winnerPlatform, true);
// Create OpenCL _CONTEXT for fastest device
// _CONTEXT = clCreateContext(0,1,&_DEV_ID,Notify,NULL,&err);
cl_context_properties cps[3] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)winnerPlatform,
(cl_context_properties)0
};
_CONTEXT = clCreateContextFromType(cps,
CL_DEVICE_TYPE_GPU, NULL, NULL, &err);
if (!_CONTEXT || err) Fatal("Cannot create OpenCL Context\n");
cl_command_queue_properties *propers;
cl_command_queue_properties prop = 0;
//prop |= CL_QUEUE_PROFILING_ENABLE;
//prop |= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
propers = ∝
_QUEUE = clCreateCommandQueueWithProperties(_CONTEXT, _DEV_ID, propers, &err); // Create OpenCL command _QUEUE for fastest device
if (err) {
if (err == CL_INVALID_CONTEXT) Fatal("Cannot create OpenCL command cue: CL_INVALID_CONTEXT\n");
else if (err == CL_INVALID_DEVICE) Fatal("Cannot create OpenCL command cue: CL_INVALID_DEVICE\n");
else if (err == CL_INVALID_VALUE) Fatal("Cannot create OpenCL command cue: CL_INVALID_VALUE\n");
else if (err == CL_INVALID_QUEUE_PROPERTIES) Fatal("Cannot create OpenCL command cue: CL_INVALID_QUEUE_PROPERTIES\n");
else if (err == CL_OUT_OF_HOST_MEMORY) Fatal("Cannot create OpenCL command cue: CL_OUT_OF_HOST_MEMORY\n");
else Fatal("Cannot create OpenCL command cue: ???????????? Unknown Error\n");
} else if (!_QUEUE) {
Fatal("Cannot create OpenCL command cue: NULL\n");
}
return mwgs;
}
/*
* C = A * B -- host
*/
void AxBh(float C[], const float A[], const float B[], unsigned int n)
{
for (unsigned int i=0;i<n;i++)
for (unsigned int j=0;j<n;j++)
{
double sum=0;
for (unsigned int k=0;k<n;k++)
sum += (double)A[i*n+k] * (double)B[k*n+j];
C[i*n+j] = (float)sum;
}
}
/*
* Compute one element of A * B
*/
const char* source =
"__kernel void AxB(__global float C[],__global const float A[],__global const float B[],const unsigned int n)\n"
"{\n"
" unsigned int j = get_global_id(0);\n"
" unsigned int i = get_global_id(1);\n"
" float sum =0;\n"
" for (int k=0;k<n;k++)\n"
" sum += A[i*n+k] * B[k*n+j];\n"
" C[i*n+j] = sum;\n"
"}\n";
/*
* C = A * B -- device
*/
void AxBd(float Ch[],float Ah[],float Bh[],const unsigned int Bw,const unsigned int Bn)
{
// Calculate matrix dimensions
int n = Bw*Bn;
int N = n*n*sizeof(float);
// Allocate device memory and copy A&B from host to device
cl_int err;
cl_mem Ad = clCreateBuffer(_CONTEXT, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD, N, Ah, &err);
if (err) Fatal("Cannot create and copy A from host to device\n");
cl_mem Bd = clCreateBuffer(_CONTEXT, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD, N, Bh, &err);
if (err) Fatal("Cannot create and copy B from host to device\n");
// Allocate device memory for C on device
cl_mem Cd = clCreateBuffer(_CONTEXT,CL_MEM_WRITE_ONLY,N,NULL,&err);
if (err) Fatal("Cannot create C on device\n");
// Compile kernel
cl_program prog = clCreateProgramWithSource(_CONTEXT,1,&source,0,&err);
if (err) Fatal("Cannot create program\n");
if (clBuildProgram(prog,0,NULL,NULL,NULL,NULL))
{
char log[1048576];
if (clGetProgramBuildInfo(prog,_DEV_ID,CL_PROGRAM_BUILD_LOG,sizeof(log),log,NULL))
Fatal("Cannot get build log\n");
else
Fatal("Cannot build program\n%s\n",log);
}
cl_kernel kernel = clCreateKernel(prog,"AxB",&err);
if (err) Fatal("Cannot create kernel\n");
// Set parameters for kernel
if (clSetKernelArg(kernel,0,sizeof(cl_mem),&Cd)) Fatal("Cannot set kernel parameter Cd\n");
if (clSetKernelArg(kernel,1,sizeof(cl_mem),&Ad)) Fatal("Cannot set kernel parameter Ad\n");
if (clSetKernelArg(kernel,2,sizeof(cl_mem),&Bd)) Fatal("Cannot set kernel parameter Bd\n");
if (clSetKernelArg(kernel,3,sizeof(int),&n)) Fatal("Cannot set kernel parameter n\n");
// Run kernel
size_t Global[2] = {(size_t)n, (size_t)n};
size_t Local[2] = {(size_t)Bw, (size_t)Bw};
if (clEnqueueNDRangeKernel(_QUEUE,kernel,2,NULL,Global,Local,0,NULL,NULL)) Fatal("Cannot run kernel\n");
// Release kernel and program
if (clReleaseKernel(kernel)) Fatal("Cannot release kernel\n");
if (clReleaseProgram(prog)) Fatal("Cannot release program\n");
// Copy C from device to host (block until done)
if (clEnqueueReadBuffer(_QUEUE,Cd,CL_TRUE,0,N,Ch,0,NULL,NULL)) Fatal("Cannot copy C from device to host\n");
// Free device memory
clReleaseMemObject(Ad);
clReleaseMemObject(Bd);
clReleaseMemObject(Cd);
}
/*
* main
*/
int main(int argc, char* argv[])
{
// Process options
int opt;
int verbose=0;
while ((opt=getopt(argc,argv,"v"))!=-1)
{
if (opt=='v')
verbose++;
else
Fatal("Usage: [-v] <block width> <number of blocks>\n");
}
argc -= optind;
argv += optind;
// Get width and number of blocks
if (argc!=2) Fatal("Usage: [-v] <block width> <number of blocks>\n");
int Bw = atoi(argv[0]);
if (Bw<1) Fatal("Block width out of range %d\n",Bw);
int Bn = atoi(argv[1]);
if (Bn<1) Fatal("Number of blocks out of range %d\n",Bn);
// Total width is block times number of blocks
int n = Bw*Bn;
int N = n*n*sizeof(float);
printf("Bw=%d Bn=%d n=%d\n",Bw,Bn,n);
// Initialize GPU
int Mw = InitGPU(verbose);
if (Mw<Bw*Bw) Fatal("Thread count %d exceeds max work group size of %d\n",Bw*Bw,Mw);
// Allocate host matrices A/B/C/R
float* Ah = (float*)malloc(N);
float* Bh = (float*)malloc(N);
float* Ch = (float*)malloc(N);
float* Rh = (float*)malloc(N);
if (!Ah || !Bh || !Ch || !Rh) Fatal("Cannot allocate host memory\n");
// Initialize A & B
srand(9999);
RandomInit(Ah,n);
RandomInit(Bh,n);
// Compute R = AB on host
Elapsed();
AxBh(Rh,Ah,Bh,n);
float Th = Elapsed();
// Compute C = AB on device
Elapsed();
AxBd(Ch,Ah,Bh,Bw,Bn);
float Td = Elapsed();
// Compute difference between R and C
double r2=0;
for (int i=0;i<n*n;i++)
r2 += fabs(Ch[i]-Rh[i]);
r2 /= n*n;
// Free host memory
free(Ah);
free(Bh);
free(Ch);
free(Rh);
// Print results
printf("Host Time = %6.3f s\n",Th);
printf("Device Time = %6.3f s\n",Td);
printf("Speedup = %.1f\n",Th/Td);
printf("Difference = %.2e\n",r2);
// Done
return 0;
}
我使用它编译它(你显然必须改变一点):
g ++ -Wall -o exMatrixMult -I“/ cygdrive / c / Program Files(x86)/ AMD APP SDK / 3.0 / include”-L“/ cygdrive / c / Program Files(x86)/ AMD APP SDK / 3.0 / lib / x86_64“exMatrixMult.cpp -lOpenCL
然而,我的“夏威夷”显卡可以运行OpenCL SDK附带的示例程序(在“AMD APP SDK \ 3.0 \ samples \ opencl \ bin \ x86_64”中)。我花了大部分时间来看看他们的源代码与我的源代码有什么不同,到目前为止还没有成功。
如果有用的话,发生错误的代码看起来像这样(仍然发生同样的问题):
// Print fastest device info
if (clGetDeviceInfo(devid,CL_DEVICE_NAME,sizeof(name),name,NULL)) Fatal("Cannot get OpenCL device name\n");
printf("Fastest OpenCL Device: %s\n",name);
// Check thread count
size_t mwgs;
if (clGetDeviceInfo(devid,CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(mwgs),&mwgs,NULL)) Fatal("Cannot get OpenCL max work group size\n");
// Create OpenCL context for fastest device
context = clCreateContext(0,1,&devid,Notify,NULL,&err);
if(!context || err) Fatal("Cannot create OpenCL context\n");
// Create OpenCL command queue for fastest device
queue = clCreateCommandQueueWithProperties(context, devid, 0, &err);
if (err) Fatal("Cannot create OpenCL command cue\n");
如果您没有类似的显卡,现在问题可能无法验证。但我不知道。
答案 0 :(得分:0)
当我更新显卡的驱动程序时,问题就消失了。