从文件读取两个float4向量然后计算相反数字的总和是简单的程序。 我无法找到问题: MAIN文件:
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <array>
#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include <iterator>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#include <time.h>
#endif
const int number_of_points = 16; // number of points in Both A and B files (number of rows)
const int number_of_axis = 4; // number of points axis in Both A and B files (number of Columns)
using namespace std;
void checkError(cl_int err, const char *operation)
{
if (err != CL_SUCCESS)
{
fprintf(stderr, "Error during operation '%s': %d\n", operation, err);
exit(1);
}
}
int main(int argc, char *argv[]) {
clock_t tStart = clock();
// Create the two input vectors
// working variables
int i;
ifstream input_fileA, input_fileB; // input files
string line; // transfer row from file to array
float x; // transfer word from file to array
int row = 0; // number of rows of file A,B (= array)
int col = 0; // number of rows of file A,B (= array)
// working arrays
// working arrays
// int mem_size_TempA = number_of_points * number_of_axis * sizeof(cl_float);
// int mem_size_TempB = number_of_points * number_of_axis * sizeof(cl_float);
float tempAArray[number_of_points][number_of_axis]={{0}}; // array contains file A data
float tempBArray[number_of_points][number_of_axis]={{0}}; // array contains file B data
int mem_size_InputA = number_of_points * number_of_axis ;
int mem_size_InputB = number_of_points * number_of_axis ;
int mem_size_Output = number_of_points * number_of_axis ;
float *inputAArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file A data
float *inputBArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file B data
float *outputArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file B data
// import input files
input_fileA.open(argv[1]);
input_fileB.open(argv[2]);
// transfer input files data to array
// input file A to arrayA
row = 0;
while (getline(input_fileA, line))
{
istringstream streamA(line);
col = 0;
while(streamA >> x){
tempAArray[row][col] = x;
col++;
}
row++;
}
// input file B to arrayB
row = 0;
while (getline(input_fileB, line))
{
istringstream streamB(line);
col = 0;
while(streamB >> x){
tempBArray[row][col] = x;
col++;
}
row++;
}
// switch columns of B array
for(int row_of_arrayB = 0; row_of_arrayB < number_of_points; row_of_arrayB++ )
{
float temporary = tempBArray[row_of_arrayB][2];
tempBArray[row_of_arrayB][2] = tempBArray[row_of_arrayB][1];
tempBArray[row_of_arrayB][1] = temporary;
}
// from Array to 3d vectors
// for (int row_of_array = 0; row_of_array<number_of_points; row_of_array++)
// {
// inputAArray[row_of_array] = (tempAArray[row_of_array][0], tempAArray[row_of_array][1], tempAArray[row_of_array][2],0);
// inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0);
// }
for (int row_of_array=0; row_of_array < number_of_points; row_of_array++)
{
inputAArray[row_of_array*number_of_points+0] = tempAArray[row_of_array][0];
inputAArray[row_of_array*number_of_points+1] = tempAArray[row_of_array][1];
inputAArray[row_of_array*number_of_points+2] = tempAArray[row_of_array][2];
inputAArray[row_of_array*number_of_points+3] = 0.0f;
inputBArray[row_of_array*number_of_points+0] = tempBArray[row_of_array][0];
inputBArray[row_of_array*number_of_points+1] = tempBArray[row_of_array][1];
inputBArray[row_of_array*number_of_points+2] = tempBArray[row_of_array][2];
inputBArray[row_of_array*number_of_points+3] = tempBArray[row_of_array][3];
outputArray[row_of_array*number_of_points+0] = 0.0f;
outputArray[row_of_array*number_of_points+1] = 0.0f;
outputArray[row_of_array*number_of_points+2] = 0.0f;
outputArray[row_of_array*number_of_points+3] = 0.0f;
// inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0);
}
// for (int row_of_array=0; row_of_array < number_of_points; row_of_array++)
// {
// printf("0: %f, 1: %f, 2: %f, 3: %f \n", inputAArray[row_of_array*number_of_points+0], inputAArray[row_of_array*number_of_points+1],
// inputAArray[row_of_array*number_of_points+2], inputAArray[row_of_array*number_of_points+3]);
// }
// close input files
input_fileA.close();
input_fileB.close();
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("calculate_bottom_SNM_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
fseek(fp, 0, SEEK_END);
size_t programLength = ftell(fp);
rewind(fp);
source_str = (char*)malloc(programLength+1);
source_size = fread( source_str, 1, programLength, fp);
source_str[programLength] = '\0';
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem inputa_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
mem_size_InputA*sizeof(cl_float4) , NULL, &ret);
cl_mem inputb_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
mem_size_InputB*sizeof(cl_float4), NULL, &ret);
cl_mem output_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
mem_size_Output*sizeof(cl_float4), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, inputa_mem_obj, CL_TRUE, 0,
mem_size_InputA*sizeof(cl_float4), inputAArray, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, inputb_mem_obj, CL_TRUE, 0,
mem_size_InputB*sizeof(cl_float4), inputBArray, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if (ret == CL_BUILD_PROGRAM_FAILURE)
{
// Get size of build log
size_t logSize;
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
0, NULL, &logSize);
checkError(ret, "getting build log size");
// Get build log
char log[logSize];
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
logSize, log, NULL);
checkError(ret, "getting build log");
printf("OpenCL program build log:\n%s\n", log);
exit(1);
}
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "calculate_bottom_SNM", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputa_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputb_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&output_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = number_of_points; // Process the entire lists
size_t local_item_size = 4; // Process in groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
// int *C = (int*)malloc(sizeof(int)*number_of_points);
// float *C = (float*)malloc(sizeof(float)*number_of_points);
clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0,
mem_size_Output, outputArray, 0, NULL, NULL);
// Display the result to the screen
// float buttomSNM = 0;
// for(i = 0; i < number_of_points; i++)
// {
// for (int t=0; t<4; t++)
// {
// cout << "h" ;
//// printf("%f, \n", outputArray[i*number_of_points+t]);
// }
// }
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(inputa_mem_obj);
ret = clReleaseMemObject(inputb_mem_obj);
ret = clReleaseMemObject(output_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free (inputAArray);
free (inputBArray);
free (outputArray);
printf("ALL Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
return 0;
}
内核:
__kernel void calculate_bottom_SNM(__global float4 *inputAArray, __global float4 *inputBArray,
__global float4 *outputArray) {
// Get the index of the current element
int i = get_global_id(0);
int number_of_points = 16;
outputArray[i*number_of_points+0] = inputAArray[i*number_of_points+0] + inputBArray[i*number_of_points+0];
outputArray[i*number_of_points+1] = inputAArray[i*number_of_points+1] + inputBArray[i*number_of_points+1];
outputArray[i*number_of_points+2] = inputAArray[i*number_of_points+2] + inputBArray[i*number_of_points+2];
outputArray[i*number_of_points+3] = inputAArray[i*number_of_points+3] + inputBArray[i*number_of_points+3];
}
第一个输入文件:A.txt
0 0.000000e+00 9.998994e-01
1 1.000000e-03 9.998981e-01
2 2.000000e-03 9.998967e-01
3 3.000000e-03 9.998953e-01
4 4.000000e-03 9.998939e-01
5 5.000000e-03 9.998925e-01
6 6.000000e-03 9.998911e-01
7 7.000000e-03 9.998896e-01
8 8.000000e-03 9.998881e-01
9 9.000000e-03 9.998865e-01
10 1.000000e-02 9.998850e-01
11 1.100000e-02 9.998834e-01
12 1.200000e-02 9.998817e-01
13 1.300000e-02 9.998800e-01
14 1.400000e-02 9.998783e-01
15 1.500000e-02 9.998766e-01
第二个输入文件B:
0 0.000000e+00 9.998966e-01
1 1.000000e-03 9.998953e-01
2 2.000000e-03 9.998939e-01
3 3.000000e-03 9.998925e-01
4 4.000000e-03 9.998911e-01
5 5.000000e-03 9.998896e-01
6 6.000000e-03 9.998881e-01
7 7.000000e-03 9.998866e-01
8 8.000000e-03 9.998850e-01
9 9.000000e-03 9.998834e-01
10 1.000000e-02 9.998818e-01
11 1.100000e-02 9.998801e-01
12 1.200000e-02 9.998785e-01
13 1.300000e-02 9.998767e-01
14 1.400000e-02 9.998750e-01
15 1.500000e-02 9.998732e-01
提前致谢
答案 0 :(得分:1)
您正在以相当奇怪的方式在内核中计算数组索引:
i*number_of_points+0
i*number_of_points+1
i*number_of_points+2
i*number_of_points+3
考虑这实际转换为i
的不同值的假设(假设为number_of_points=16
):
i array indices (i*16 + (0,1,2,3))
--------------------------------------
0 0, 1, 2, 3
1 16, 17, 18, 19
2 32, 33, 34, 35
...
etc
这肯定不是你想要的!您的示例代码似乎只是尝试执行向量化矢量添加。如果是这种情况,您的内核代码只需要看起来像这样:
__kernel void vecadd(__global float4 *inputA,
__global float4 *inputB,
__global float4 *output)
{
int i = get_global_id(0);
output[i] = inputA[i] + inputB[i];
}
这是有效的,因为它们正在对向量的每个元素执行相同的操作。如果你有一个需要单独使用这些元素的内核,你可以编写如下代码:
float4 valueA = inputA[i];
float4 valueB = inputB[i];
float4 result;
result.x = valueA.x + valueB.x; // Do something with first component
result.y = valueA.y * valueB.y; // Do something with second component
result.z = valueA.z / valueB.z; // Do something with third component
result.w = valueA.w - valueB.w; // Do something with fourth component