我正在尝试在 OpenCL 中使用 double 类型,但无论怎么做都不起作用。我想用 double 来获得更高的精度;如果还有其他可用的类型,也请告诉我。
如果你没有时间阅读我的代码,概括来说就是:我想在 OpenCL 中使用 double(或其他类型)来更精确地计算 pi。
我的代码:
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>
////////////////////////////////////////////////////////////////////////////////
// Use a static data size for simplicity.
// DATA_SIZE is the number of elements in the host results array and in both
// device buffers, and is also the global work size of the kernel launch.
//
#define DATA_SIZE (1000000)
////////////////////////////////////////////////////////////////////////////////
// Element type of the host-side arrays and of the device buffer sizes.
// The kernel source spells out `double` literally, so the two have to be
// kept in sync by hand.
#define TIPO double
// OpenCL C source of the `calcpi` kernel. [1]
//
// Work-item i writes 4 / (2*i + 1) to output[i], i.e. the magnitude of the
// i-th term of the Leibniz series for pi; the host applies the alternating
// sign and sums the terms. `input` is accepted but not read by the kernel.
//
// Double precision is an optional OpenCL feature. The source enables the
// standard cl_khr_fp64 extension when the compiler advertises it, falls back
// to the AMD-specific cl_amd_fp64, and otherwise stops the build with an
// explicit #error instead of silently miscompiling.
const char *KernelSource = "\n"
"#if defined(cl_khr_fp64) \n"
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n"
"#elif defined(cl_amd_fp64) \n"
"#pragma OPENCL EXTENSION cl_amd_fp64 : enable \n"
"#else \n"
"#error \"Double precision floating point not supported by OpenCL implementation.\" \n"
"#endif \n"
"__kernel void calcpi( \n"
" __global double* input, \n"
" __global double* output, \n"
" const unsigned int count) \n"
"{ \n"
" unsigned int i = get_global_id(0); \n"
" double z = 2.0 * i + 1.0; \n"
" if(i < count) \n"
" output[i] = 4.0/z; \n"
"} \n"
"\n";
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
int err; // error code returned from api calls
//printf("%d",sizeof(TIPO));
//scanf("%d",&err);
TIPO data[2]; // original data set given to device
TIPO *results = malloc(sizeof(TIPO)*DATA_SIZE); // results returned from device
//unsigned int correct; // number of correct results returned
//printf("TESTE");
size_t global; // global domain size for our calculation
size_t local; // local domain size for our calculation
cl_device_id device_id; // device ID
cl_context context; // context
cl_command_queue queue; // command queue
cl_program program; // program
cl_kernel kernel; // kernel
cl_mem input; // device memory used for the input array
cl_mem output; // device memory used for the output array
// Get data on which to operate
//
//int i = 0;
//int n = 3;
unsigned int count = DATA_SIZE;
//for(i = 0; i < count; i+=2) {
//data[i] = n;
//n += 2;
//}
//printf("TESTE");
// Get an ID for the device [2]
int gpu = 1;
err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1,&device_id, NULL);
if (err != CL_SUCCESS)
printf("ERROR CLGETDEVICEIDS!\n"); // [3]
// Create a context [4]
//
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context) {
printf("ERROR CONTEXT\n");
}
// Create a command queue [5]
//
queue = clCreateCommandQueue(context, device_id, 0, &err);
if (!queue) {
printf("ERROR QUEUE\n");
}
// Create the compute program from the source buffer [6]
//
program = clCreateProgramWithSource(context, 1,(const char **) & KernelSource, NULL, &err);
if ( !program) {
printf("ERROR PROGRAM\n");
}
// Build the program executable [7]
//
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable\n"); //[8]
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
// Create the compute kernel in the program we wish to run [9]
//
kernel = clCreateKernel(program, "calcpi", &err);
if (!kernel || err != CL_SUCCESS) {
printf("ERROR KERNEL OR CL_SUCESS\n");
}
// Create the input and output arrays in device memory for our calculation
// [10]
input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(TIPO) *count,NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(TIPO) *count,NULL, NULL);
if (!input || !output) {
printf("ERROR !INPUT OR !OUTPUT\n");
}
// Write our data set into the input array in device memory [11]
//
err = clEnqueueWriteBuffer(queue, input, CL_TRUE, 0,sizeof(TIPO) *2, data, 0, NULL, NULL);
if (err != CL_SUCCESS) {
printf("ERROR WRITE OUR DATA\n");
}
// Set the arguments to our compute kernel [12]
//
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
if (err != CL_SUCCESS) {
printf("ERROR ARGUMENTS COMPUTE KERNEL - ERROR NUMBER: %d\n",err);
exit(1);
}
// Get the maximum work-group size for executing the kernel on the device
// [13]
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE,sizeof(size_t), &local, NULL);
if (err != CL_SUCCESS) {
printf("ERROR MAXIMUM WORK-GROUP - ERROR NUMBER: %d\n",err);
exit(1);
}
// Execute the kernel over the entire range of the data set [14]
//
global = count;
//printf("TESTE");
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL,0, NULL, NULL);
if (err) {
printf("ERROR EXECUTE KERNEL - ERROR NUMBER: %d\n",err);
exit(1);
}
// Wait for the command queue to get serviced before reading back results
// [15]
clFinish(queue);
// Read the results from the device [16]
//
err = clEnqueueReadBuffer(queue, output, CL_TRUE, 0,sizeof(TIPO) *count, results, 0, NULL, NULL );
if (err != CL_SUCCESS) {
printf("ERROR READ RESULTS - ERROR NUMBER: %d\n",err);
}
//printf("TESTE");
TIPO pi = 0.0;
int i;
for (i=0;i<count-1;i++) {
//printf("%f",results[i]);
pi += (pow(-1.0,i)) * (TIPO) results[i];
//pi = (TIPO) results[i];
//printf("casa %d deu: %1.50f\n",i,pi);
//printf("%f",(pow(-1,i)));
//pi += (pow(-1.0,i));
}
printf("PI: %1.50f",pi);
// Shut down and clean up
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseContext(context);
scanf("%d",&i);
return 0;
}
当我在 KernelSource 中只写:
output[i] = 4.0;
时,得到的结果是 512.000123023986816406250000000000000000000000000;
如果把 4.0 换成 1.0,结果则是 0.00781250184809323400259017944335937500000000000。
答案 0 :(得分:10)
您是否在 AMD 的 OpenCL 实现上运行?如果不是,cl_amd_fp64 这个双精度扩展可能无法被识别。
如果您可以(或希望)同时支持这两种扩展,可以这样写:
// Enable whichever double-precision extension the OpenCL compiler advertises.
// The Khronos extension macro is tested first, the AMD-specific one second.
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
// Neither macro is defined: stop compilation with an explicit message.
#error "Double precision floating point not supported by OpenCL implementation."
#endif
但请注意,cl_khr_fp64 支持的某些功能在 cl_amd_fp64 下并不受支持。