在 Visual Studio 2015 中,我创建了一个面向 CPU 的“适用于 Windows 的 Code Builder 项目”。项目自带一份模板代码,我完全没有改动过。它实现的基本上就是向量加法,不过加法运算本身是在 Template.cl 文件中完成的。当我尝试编译这个项目时,出现了以下错误:
错误 MSB3722:命令 ""C:\Program Files (x86)\Intel\OpenCL SDK\bin\x86\ioc32.exe" -cmd=build -input="blahblah\user\visual studio 2015\Projects\OpenCLProject3\OpenCLProject3\Template.cl" -output="Debug\Template.out" -VS -device=CPU_2_0 -simd=default -targetos=current -bo=""" 已退出,代码为 5。请确认您有足够的权限运行此命令。 OpenCLProject3 C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\V140\BuildCustomizations\IntelOpenCL.targets 98
但是,如果我把内核代码复制到 cpp 文件中并以字符串的形式保存,程序就能正常执行。该字符串如下所示:
// Source of the "Add" kernel embedded as a string literal, used instead of reading Template.cl from disk.
const char* prog1 = "__kernel void Add(__global int* pA, __global int* pB, __global int* pC){const int x = get_global_id(0);const int y = get_global_id(1);const int width = get_global_size(0);const int id = y * width + x;pC[id] = pA[id] + pB[id];}";
此外,在函数 CreateAndBuildProgram 中,我没有从源文件读取内核,而是在函数调用里直接传入 &prog1 作为源码地址。
以下是Visual Studio项目树的结构:
--References
--External
--Headers
--OpenCL
--Template.cl
--Source Files
--OpenCLProject3.cpp
--utils.cpp
请注意,我已删除了错误代码。如果您在visual studio 2015中生成代码构建项目,您将获得完全相同的代码和结构。
这是主机代码(OpenCLProject3.cpp)。
#include <stdio.h>
#include <stdlib.h>
#include <tchar.h>
#include <memory.h>
#include <vector>
#include "CL\cl.h"
#include "utils.h"
//for perf. counters
#include <Windows.h>
// Macros for OpenCL versions
#define OPENCL_VERSION_1_2 1.2f
#define OPENCL_VERSION_2_0 2.0f
// Bundle of every OpenCL handle and version number used by this sample.
// The destructor releases the handles it holds, so an instance must never be
// copied: a copy would release the same handles a second time. Copying is
// therefore disabled below; pass the object by pointer or reference instead.
struct ocl_args_d_t
{
    ocl_args_d_t();
    ~ocl_args_d_t();

    // Regular OpenCL objects:
    cl_context context;             // hold the context handler
    cl_device_id device;            // hold the selected device handler
    cl_command_queue commandQueue;  // hold the commands-queue handler
    cl_program program;             // hold the program handler
    cl_kernel kernel;               // hold the kernel handler
    float platformVersion;          // hold the OpenCL platform version (default 1.2)
    float deviceVersion;            // hold the OpenCL device version (default 1.2)
    float compilerVersion;          // hold the device OpenCL C version (default 1.2)

    // Objects that are specific for algorithm implemented in this sample
    cl_mem srcA;    // hold first source buffer
    cl_mem srcB;    // hold second source buffer
    cl_mem dstMem;  // hold destination buffer

private:
    // Declared but intentionally never defined: any attempt to copy fails at
    // compile time (outside the struct) or at link time (inside it).
    ocl_args_d_t(const ocl_args_d_t&);
    ocl_args_d_t& operator=(const ocl_args_d_t&);
};
// Start with every OpenCL handle empty (NULL) and every version field at the
// OpenCL 1.2 default. Initializers are listed in member declaration order.
ocl_args_d_t::ocl_args_d_t()
    : context(NULL), device(NULL), commandQueue(NULL), program(NULL), kernel(NULL),
      platformVersion(OPENCL_VERSION_1_2),
      deviceVersion(OPENCL_VERSION_1_2),
      compilerVersion(OPENCL_VERSION_1_2),
      srcA(NULL), srcB(NULL), dstMem(NULL)
{
}
// Release every OpenCL handle that was created. A handle that is still NULL is
// skipped; a release that fails is logged and the remaining handles are still
// released. Order: kernel, program, the three buffers, queue, device, context.
ocl_args_d_t::~ocl_args_d_t()
{
    if (NULL != kernel)
    {
        const cl_int status = clReleaseKernel(kernel);
        if (status != CL_SUCCESS)
        {
            LogError("Error: clReleaseKernel returned '%s'.\n", TranslateOpenCLError(status));
        }
    }

    if (NULL != program)
    {
        const cl_int status = clReleaseProgram(program);
        if (status != CL_SUCCESS)
        {
            LogError("Error: clReleaseProgram returned '%s'.\n", TranslateOpenCLError(status));
        }
    }

    // The three memory objects are released the same way, in declaration order.
    cl_mem buffers[] = { srcA, srcB, dstMem };
    for (size_t slot = 0; slot < sizeof(buffers) / sizeof(buffers[0]); ++slot)
    {
        if (NULL == buffers[slot])
        {
            continue;
        }
        const cl_int status = clReleaseMemObject(buffers[slot]);
        if (status != CL_SUCCESS)
        {
            LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(status));
        }
    }

    if (NULL != commandQueue)
    {
        const cl_int status = clReleaseCommandQueue(commandQueue);
        if (status != CL_SUCCESS)
        {
            LogError("Error: clReleaseCommandQueue returned '%s'.\n", TranslateOpenCLError(status));
        }
    }

    if (NULL != device)
    {
        const cl_int status = clReleaseDevice(device);
        if (status != CL_SUCCESS)
        {
            LogError("Error: clReleaseDevice returned '%s'.\n", TranslateOpenCLError(status));
        }
    }

    if (NULL != context)
    {
        const cl_int status = clReleaseContext(context);
        if (status != CL_SUCCESS)
        {
            LogError("Error: clReleaseContext returned '%s'.\n", TranslateOpenCLError(status));
        }
    }
}
/*
 * Report whether the name of `platform` contains `preferredPlatform` as a sub-string.
 *
 * platform          - platform whose CL_PLATFORM_NAME string is queried
 * preferredPlatform - sub-string to look for; must not be NULL (it is handed to strstr)
 *
 * Returns true when the platform name contains preferredPlatform. Returns false when
 * it does not, or when either clGetPlatformInfo query fails (the failure is logged).
 */
bool CheckPreferredPlatformMatch(cl_platform_id platform, const char* preferredPlatform)
{
    // First query: param_value is NULL, so only the length of the name comes back.
    size_t stringLength = 0;
    cl_int err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, NULL, &stringLength);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_NAME length returned '%s'.\n", TranslateOpenCLError(err));
        return false;
    }

    // One spare zero-filled byte keeps the buffer non-empty (so &platformName[0] is
    // always valid) and guarantees the text handed to strstr is NUL-terminated.
    std::vector<char> platformName(stringLength + 1, '\0');

    // Second query: read the name itself into platformName.
    err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, platformName.size(), &platformName[0], NULL);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_NAME returned %s.\n", TranslateOpenCLError(err));
        return false;
    }

    // The platform matches when its name contains the requested sub-string.
    return strstr(&platformName[0], preferredPlatform) != NULL;
}
/*
 * Return the first available platform that (a) passes the optional name filter and
 * (b) exposes at least one device of deviceType; NULL when no such platform exists
 * or when enumerating the platforms fails.
 *
 * preferredPlatform - sub-string the platform name must contain; NULL or "" means
 *                     any platform is acceptable
 * deviceType        - device type the platform must offer
 */
cl_platform_id FindOpenCLPlatform(const char* preferredPlatform, cl_device_type deviceType)
{
    // Ask only for the platform count (no output array is supplied).
    cl_uint platformCount = 0;
    cl_int status = clGetPlatformIDs(0, NULL, &platformCount);
    if (status != CL_SUCCESS)
    {
        LogError("Error: clGetplatform_ids() to get num platforms returned %s.\n", TranslateOpenCLError(status));
        return NULL;
    }

    LogInfo("Number of available platforms: %u\n", platformCount);
    if (platformCount == 0)
    {
        LogError("Error: No platforms found!\n");
        return NULL;
    }

    // Fetch the platform handles themselves.
    std::vector<cl_platform_id> candidates(platformCount);
    status = clGetPlatformIDs(platformCount, &candidates[0], NULL);
    if (status != CL_SUCCESS)
    {
        LogError("Error: clGetplatform_ids() to get platforms returned %s.\n", TranslateOpenCLError(status));
        return NULL;
    }

    // The name filter applies only when a non-empty preferred name was given.
    const bool filterByName = (preferredPlatform != NULL) && (strlen(preferredPlatform) > 0);

    for (cl_uint idx = 0; idx < platformCount; ++idx)
    {
        if (filterByName && !CheckPreferredPlatformMatch(candidates[idx], preferredPlatform))
        {
            continue;
        }

        // Count the devices of the requested type. A failure here is logged but does
        // not abort the search: deviceCount stays zero and the platform is skipped.
        cl_uint deviceCount = 0;
        status = clGetDeviceIDs(candidates[idx], deviceType, 0, NULL, &deviceCount);
        if (status != CL_SUCCESS)
        {
            LogError("clGetDeviceIDs() returned %s.\n", TranslateOpenCLError(status));
        }

        if (deviceCount != 0)
        {
            // At least one suitable device: this platform is the answer.
            return candidates[idx];
        }
    }

    return NULL;
}
/*
 * Read the OpenCL platform version, the device version and the device OpenCL C
 * version (via clGetPlatformInfo / clGetDeviceInfo) and record them in the ocl
 * structure, so the program can support both OpenCL 1.2 and 2.0 platforms and devices.
 *
 * platformId - platform whose CL_PLATFORM_VERSION string is queried
 * ocl        - ocl->device must already be set. platformVersion, deviceVersion and
 *              compilerVersion are set to OPENCL_VERSION_2_0 when the corresponding
 *              string contains "OpenCL 2.0" / "OpenCL C 2.0"; otherwise they are
 *              left untouched.
 *
 * Returns CL_SUCCESS, or the error code of the first query that failed (after
 * logging it). Fields updated before the failing query keep their new values.
 *
 * Every string buffer below gets one spare zero-filled byte: the buffer is then never
 * empty (so &buffer[0] is always valid) and the text given to strstr is always
 * NUL-terminated.
 */
int GetPlatformAndDeviceVersion (cl_platform_id platformId, ocl_args_d_t *ocl)
{
    cl_int err = CL_SUCCESS;

    // --- Platform version -------------------------------------------------------
    // Length query first (param_value is NULL), then the string itself.
    size_t stringLength = 0;
    err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, 0, NULL, &stringLength);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_VERSION length returned '%s'.\n", TranslateOpenCLError(err));
        return err;
    }

    std::vector<char> platformVersion(stringLength + 1, '\0');
    err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, platformVersion.size(), &platformVersion[0], NULL);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_VERSION returned %s.\n", TranslateOpenCLError(err));
        return err;
    }

    if (strstr(&platformVersion[0], "OpenCL 2.0") != NULL)
    {
        ocl->platformVersion = OPENCL_VERSION_2_0;
    }

    // --- Device version ---------------------------------------------------------
    err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, 0, NULL, &stringLength);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION length returned '%s'.\n", TranslateOpenCLError(err));
        return err;
    }

    std::vector<char> deviceVersion(stringLength + 1, '\0');
    err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, deviceVersion.size(), &deviceVersion[0], NULL);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION returned %s.\n", TranslateOpenCLError(err));
        return err;
    }

    if (strstr(&deviceVersion[0], "OpenCL 2.0") != NULL)
    {
        ocl->deviceVersion = OPENCL_VERSION_2_0;
    }

    // --- Device OpenCL C (compiler) version -------------------------------------
    err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &stringLength);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION length returned '%s'.\n", TranslateOpenCLError(err));
        return err;
    }

    std::vector<char> compilerVersion(stringLength + 1, '\0');
    err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, compilerVersion.size(), &compilerVersion[0], NULL);
    if (CL_SUCCESS != err)
    {
        LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION returned %s.\n", TranslateOpenCLError(err));
        return err;
    }

    if (strstr(&compilerVersion[0], "OpenCL C 2.0") != NULL)
    {
        ocl->compilerVersion = OPENCL_VERSION_2_0;
    }

    return err;
}
/*
 * Fill inputArray with arrayWidth * arrayHeight pseudo-random values.
 * The generator is re-seeded with the same constant on every call, so every call
 * produces the same sequence.
 */
void generateInput(cl_int* inputArray, cl_uint arrayWidth, cl_uint arrayHeight)
{
    srand(12345);

    const cl_uint elementCount = arrayWidth * arrayHeight;
    cl_uint filled = 0;
    while (filled < elementCount)
    {
        inputArray[filled] = rand();
        ++filled;
    }
}
/*
 * Initialise the basic OpenCL objects stored in ocl: context, device and command queue.
 *
 * ocl        - receives the created context, the device and the command queue
 * deviceType - type of device (e.g. CPU or GPU) the context is created for
 *
 * Returns CL_SUCCESS, CL_INVALID_VALUE when no suitable platform is found, or the
 * error reported by the failing OpenCL call (after logging it).
 */
int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType)
{
    // Holds the return code of each OpenCL call in turn.
    cl_int status = CL_SUCCESS;

    // Pick the first platform whose name contains "Intel" and that offers deviceType.
    cl_platform_id chosenPlatform = FindOpenCLPlatform("Intel", deviceType);
    if (chosenPlatform == NULL)
    {
        LogError("Error: Failed to find OpenCL platform.\n");
        return CL_INVALID_VALUE;
    }

    // Create a context for devices of the requested type on that platform.
    // No notification callback and no user data are registered.
    cl_context_properties ctxProps[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)chosenPlatform, 0};
    ocl->context = clCreateContextFromType(ctxProps, deviceType, NULL, NULL, &status);
    if ((ocl->context == NULL) || (status != CL_SUCCESS))
    {
        LogError("Couldn't create a context, clCreateContextFromType() returned '%s'.\n", TranslateOpenCLError(status));
        return status;
    }

    // Retrieve the device the context was created with (room for one device id).
    status = clGetContextInfo(ocl->context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &ocl->device, NULL);
    if (status != CL_SUCCESS)
    {
        LogError("Error: clGetContextInfo() to get list of devices returned %s.\n", TranslateOpenCLError(status));
        return status;
    }

    // Record platform / device / OpenCL C versions; the result is not checked here,
    // so on failure the version fields keep whatever values they already had.
    GetPlatformAndDeviceVersion(chosenPlatform, ocl);

    // Create an in-order command queue with profiling enabled. The 2.0 entry point is
    // used only when the headers provide it and the device reports OpenCL 2.0;
    // otherwise the OpenCL 1.2 entry point is used.
    bool queueRequested = false;
#ifdef CL_VERSION_2_0
    if (OPENCL_VERSION_2_0 == ocl->deviceVersion)
    {
        const cl_command_queue_properties queueProps[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
        ocl->commandQueue = clCreateCommandQueueWithProperties(ocl->context, ocl->device, queueProps, &status);
        queueRequested = true;
    }
#endif
    if (!queueRequested)
    {
        // default behavior: OpenCL 1.2
        cl_command_queue_properties queueFlags = CL_QUEUE_PROFILING_ENABLE;
        ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, queueFlags, &status);
    }

    if (status != CL_SUCCESS)
    {
        LogError("Error: clCreateCommandQueue() returned %s.\n", TranslateOpenCLError(status));
        return status;
    }

    return CL_SUCCESS;
}
/*
 * Create the OpenCL program from the source in "Template.cl" and build it for
 * ocl->device. The program handle is stored in ocl->program.
 *
 * Returns CL_SUCCESS, or the error code of the first step that failed (reading the
 * file, creating the program, or building it). Every failure is logged; for
 * CL_BUILD_PROGRAM_FAILURE the build log is logged as well.
 */
int CreateAndBuildProgram(ocl_args_d_t *ocl)
{
    char* kernelText = NULL;
    size_t kernelTextSize = 0;

    // Step 1: load the kernel source text and its size from the file.
    cl_int status = ReadSourceFromFile("Template.cl", &kernelText, &kernelTextSize);
    if (CL_SUCCESS != status)
    {
        LogError("Error: ReadSourceFromFile returned %s.\n", TranslateOpenCLError(status));
    }
    else
    {
        // Step 2: wrap the source text in a program object.
        ocl->program = clCreateProgramWithSource(ocl->context, 1, (const char**)&kernelText, &kernelTextSize, &status);
        if (CL_SUCCESS != status)
        {
            LogError("Error: clCreateProgramWithSource returned %s.\n", TranslateOpenCLError(status));
        }
        else
        {
            // Step 3: creating a program does not build it; build explicitly for the
            // single device, with empty options and no completion callback.
            status = clBuildProgram(ocl->program, 1, &ocl->device, "", NULL, NULL);
            if (CL_SUCCESS != status)
            {
                LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(status));

                if (CL_BUILD_PROGRAM_FAILURE == status)
                {
                    // Query the log size, then fetch and print the log itself.
                    size_t logBytes = 0;
                    clGetProgramBuildInfo(ocl->program, ocl->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logBytes);
                    std::vector<char> logText(logBytes);
                    clGetProgramBuildInfo(ocl->program, ocl->device, CL_PROGRAM_BUILD_LOG, logBytes, &logText[0], NULL);
                    LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &logText[0]);
                }
            }
        }
    }

    // The source text is released on every path; delete[] on NULL does nothing.
    delete[] kernelText;
    return status;
}
/*
 * Create the three OpenCL buffers of the sample on top of caller-supplied host arrays
 * and store them in ocl->srcA, ocl->srcB and ocl->dstMem.
 *
 * inputA, inputB - host arrays backing the two read-only source buffers
 * outputC        - host array backing the write-only destination buffer
 * arrayWidth, arrayHeight - dimensions; each buffer spans width * height elements
 *
 * All three buffers are created with CL_MEM_USE_HOST_PTR, i.e. directly on top of the
 * host arrays, which may save unnecessary copies depending on the runtime. Access
 * flags are kept minimal (read-only inputs, write-only output).
 *
 * Returns CL_SUCCESS, or the error of the first clCreateBuffer call that failed.
 */
int CreateBufferArguments(ocl_args_d_t *ocl, cl_int* inputA, cl_int* inputB, cl_int* outputC, cl_uint arrayWidth, cl_uint arrayHeight)
{
    const cl_mem_flags sourceFlags = CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR;
    const cl_mem_flags resultFlags = CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR;
    const size_t bufferBytes = sizeof(cl_uint) * arrayWidth * arrayHeight;

    cl_int status = CL_SUCCESS;

    ocl->srcA = clCreateBuffer(ocl->context, sourceFlags, bufferBytes, inputA, &status);
    if (status != CL_SUCCESS)
    {
        LogError("Error: clCreateBuffer for srcA returned %s\n", TranslateOpenCLError(status));
        return status;
    }

    ocl->srcB = clCreateBuffer(ocl->context, sourceFlags, bufferBytes, inputB, &status);
    if (status != CL_SUCCESS)
    {
        LogError("Error: clCreateBuffer for srcB returned %s\n", TranslateOpenCLError(status));
        return status;
    }

    ocl->dstMem = clCreateBuffer(ocl->context, resultFlags, bufferBytes, outputC, &status);
    if (status != CL_SUCCESS)
    {
        LogError("Error: clCreateBuffer for dstMem returned %s\n", TranslateOpenCLError(status));
        return status;
    }

    return CL_SUCCESS;
}
/*
 * Bind the three buffers to the kernel: argument 0 = srcA, 1 = srcB, 2 = dstMem.
 * Returns CL_SUCCESS, or the error of the first clSetKernelArg call that failed
 * (after logging it).
 */
cl_uint SetKernelArguments(ocl_args_d_t *ocl)
{
    cl_int status = clSetKernelArg(ocl->kernel, 0, sizeof(cl_mem), (void *)&ocl->srcA);
    if (status != CL_SUCCESS)
    {
        LogError("error: Failed to set argument srcA, returned %s\n", TranslateOpenCLError(status));
        return status;
    }

    status = clSetKernelArg(ocl->kernel, 1, sizeof(cl_mem), (void *)&ocl->srcB);
    if (status != CL_SUCCESS)
    {
        LogError("Error: Failed to set argument srcB, returned %s\n", TranslateOpenCLError(status));
        return status;
    }

    status = clSetKernelArg(ocl->kernel, 2, sizeof(cl_mem), (void *)&ocl->dstMem);
    if (status != CL_SUCCESS)
    {
        LogError("Error: Failed to set argument dstMem, returned %s\n", TranslateOpenCLError(status));
    }

    // Either CL_SUCCESS or the error from the last call.
    return status;
}
/*
 * Enqueue the kernel over a two-dimensional width x height global range and block
 * until the command queue has finished.
 * Returns CL_SUCCESS, or the error from clEnqueueNDRangeKernel / clFinish (after
 * logging it).
 */
cl_uint ExecuteAddKernel(ocl_args_d_t *ocl, cl_uint width, cl_uint height)
{
    // Global iteration space: dimension 0 is the width, dimension 1 the height.
    // No offset and no explicit local work size are given.
    size_t ndRange[2] = {width, height};

    cl_int status = clEnqueueNDRangeKernel(ocl->commandQueue, ocl->kernel, 2, NULL, ndRange, NULL, 0, NULL, NULL);
    if (status != CL_SUCCESS)
    {
        LogError("Error: Failed to run kernel, return %s\n", TranslateOpenCLError(status));
        return status;
    }

    // Enqueueing does not wait for completion; clFinish does.
    status = clFinish(ocl->commandQueue);
    if (status != CL_SUCCESS)
    {
        LogError("Error: clFinish return %s\n", TranslateOpenCLError(status));
        return status;
    }

    return CL_SUCCESS;
}
/*
 * Map the destination buffer into host memory and check every element against
 * inputA[k] + inputB[k].
 *
 * Returns true when all width * height elements match. Returns false when the map
 * operation fails or when at least one element differs (each mismatch is logged).
 * Failures of clFinish and of the final unmap are logged but do not change the result.
 */
bool ReadAndVerify(ocl_args_d_t *ocl, cl_uint width, cl_uint height, cl_int *inputA, cl_int *inputB)
{
    cl_int status = CL_SUCCESS;

    // Blocking map of the whole destination buffer for reading.
    cl_int *mapped = (cl_int *)clEnqueueMapBuffer(ocl->commandQueue, ocl->dstMem, true, CL_MAP_READ, 0, sizeof(cl_uint) * width * height, 0, NULL, NULL, &status);
    if (status != CL_SUCCESS)
    {
        LogError("Error: clEnqueueMapBuffer returned %s\n", TranslateOpenCLError(status));
        return false;
    }

    // Make sure the mapped region is up to date before it is read.
    status = clFinish(ocl->commandQueue);
    if (status != CL_SUCCESS)
    {
        LogError("Error: clFinish returned %s\n", TranslateOpenCLError(status));
    }

    // Compare every element; keep going after a mismatch so all of them are reported.
    bool allMatch = true;
    const unsigned int elementCount = width * height;
    for (unsigned int idx = 0; idx != elementCount; ++idx)
    {
        if (mapped[idx] == inputA[idx] + inputB[idx])
        {
            continue;
        }
        LogError("Verification failed at %d: (%d + %d = %d)\n", idx, inputA[idx], inputB[idx], mapped[idx]);
        allMatch = false;
    }

    // Unmap the output buffer before it gets released.
    status = clEnqueueUnmapMemObject(ocl->commandQueue, ocl->dstMem, mapped, 0, NULL, NULL);
    if (status != CL_SUCCESS)
    {
        LogError("Error: clEnqueueUnmapMemObject returned %s\n", TranslateOpenCLError(status));
    }

    return allMatch;
}
/*
* main execution routine
* Basically it consists of three parts:
* - generating the inputs
* - running OpenCL kernel
* - reading results of processing
*/
int _tmain(int argc, TCHAR* argv[])
{
cl_int err;
ocl_args_d_t ocl;
cl_device_type deviceType = CL_DEVICE_TYPE_CPU;
LARGE_INTEGER perfFrequency;
LARGE_INTEGER performanceCountNDRangeStart;
LARGE_INTEGER performanceCountNDRangeStop;
cl_uint arrayWidth = 1024;
cl_uint arrayHeight = 1024;
//initialize Open CL objects (context, queue, etc.)
if (CL_SUCCESS != SetupOpenCL(&ocl, deviceType))
{
return -1;
}
// allocate working buffers.
// the buffer should be aligned with 4K page and size should fit 64-byte cached line
cl_uint optimizedSize = ((sizeof(cl_int) * arrayWidth * arrayHeight - 1)/64 + 1) * 64;
cl_int* inputA = (cl_int*)_aligned_malloc(optimizedSize, 4096);
cl_int* inputB = (cl_int*)_aligned_malloc(optimizedSize, 4096);
cl_int* outputC = (cl_int*)_aligned_malloc(optimizedSize, 4096);
if (NULL == inputA || NULL == inputB || NULL == outputC)
{
LogError("Error: _aligned_malloc failed to allocate buffers.\n");
return -1;
}
//random input
generateInput(inputA, arrayWidth, arrayHeight);
generateInput(inputB, arrayWidth, arrayHeight);
// Create OpenCL buffers from host memory
// These buffers will be used later by the OpenCL kernel
if (CL_SUCCESS != CreateBufferArguments(&ocl, inputA, inputB, outputC, arrayWidth, arrayHeight))
{
return -1;
}
// Create and build the OpenCL program
if (CL_SUCCESS != CreateAndBuildProgram(&ocl))
{
return -1;
}
// Program consists of kernels.
// Each kernel can be called (enqueued) from the host part of OpenCL application.
// To call the kernel, you need to create it from existing program.
ocl.kernel = clCreateKernel(ocl.program, "Add", &err);
if (CL_SUCCESS != err)
{
LogError("Error: clCreateKernel returned %s\n", TranslateOpenCLError(err));
return -1;
}
// Passing arguments into OpenCL kernel.
if (CL_SUCCESS != SetKernelArguments(&ocl))
{
return -1;
}
// Regularly you wish to use OpenCL in your application to achieve greater performance results
// that are hard to achieve in other ways.
// To understand those performance benefits you may want to measure time your application spent in OpenCL kernel execution.
// The recommended way to obtain this time is to measure interval between two moments:
// - just before clEnqueueNDRangeKernel is called, and
// - just after clFinish is called
// clFinish is necessary to measure entire time spending in the kernel, measuring just clEnqueueNDRangeKernel is not enough,
// because this call doesn't guarantees that kernel is finished.
// clEnqueueNDRangeKernel is just enqueue new command in OpenCL command queue and doesn't wait until it ends.
// clFinish waits until all commands in command queue are finished, that suits your need to measure time.
bool queueProfilingEnable = true;
if (queueProfilingEnable)
QueryPerformanceCounter(&performanceCountNDRangeStart);
// Execute (enqueue) the kernel
if (CL_SUCCESS != ExecuteAddKernel(&ocl, arrayWidth, arrayHeight))
{
return -1;
}
if (queueProfilingEnable)
QueryPerformanceCounter(&performanceCountNDRangeStop);
// The last part of this function: getting processed results back.
// use map-unmap sequence to update original memory area with output buffer.
ReadAndVerify(&ocl, arrayWidth, arrayHeight, inputA, inputB);
// retrieve performance counter frequency
if (queueProfilingEnable)
{
QueryPerformanceFrequency(&perfFrequency);
LogInfo("NDRange performance counter time %f ms.\n",
1000.0f*(float)(performanceCountNDRangeStop.QuadPart - performanceCountNDRangeStart.QuadPart) / (float)perfFrequency.QuadPart);
}
_aligned_free(inputA);
_aligned_free(inputB);
_aligned_free(outputC);
return 0;
}
这是内核代码(Template.cl):
// Element-wise integer addition over a 2-D global range: each work-item computes
// pC[i] = pA[i] + pB[i], where i is its row-major linear index
// (global id in dimension 1 * global size of dimension 0 + global id in dimension 0).
__kernel void Add(__global int* pA, __global int* pB, __global int* pC)
{
    const int rowWidth = get_global_size(0);
    const int column = get_global_id(0);
    const int row = get_global_id(1);

    const int linearIndex = row * rowWidth + column;
    pC[linearIndex] = pA[linearIndex] + pB[linearIndex];
}
答案 0 :(得分:1)
这可能与此重复:
OpenCL code 'Error MSB3721' for Intel OpenCL SDK on Visual Studio 2010
一种可能的解决办法是把 '.cl' 文件从项目中移除。