我正从一个拥有超过一百万个元素的数组中读取大整数值。获取的值来自使用libsndfile库的wav文件。现在,如果我不使用内核,我可以将原始数组写入我的输出文件并听取音频没有问题。但是,当我决定使用内核做同样的事情时,它只能写不到一秒的歌曲。
起初,我以为这是内存问题,所以尝试调整了缓冲区大小,但仍然没有效果。然后我想可能是内核里的循环有问题,于是又调整了循环的取值,结果还是一样(仍然不起作用)。我现在被难住了,不知道该怎么办。我的代码如下。其中一部分代码是我自己写的,但主要结构是我在网上找到的,用来帮助我设置内核。
在此代码的最底部,如果我将OutputData更改为Array,我会得到完全相同的音频。我很确定内核有问题,这就是为什么它不会写回整首歌。
我知道这段代码很乱,但如果你想测试它,只需复制并粘贴这段代码,然后修改输入wav文件和输出wav文件的路径即可。
为了明确目标,我将尝试修改wav文件中的每个值,看看会发生什么。到目前为止,如果我将内核中的输出值乘以2,则会使其失真。但同样,只持续1秒钟,剪辑的其余部分为空。请注意,输入和输出文件的大小都相同。
我的for循环总共执行120万次迭代,因为这正是我的示例wav文件中的数据项个数。
const char* prog = "__kernel void exchange(__global int *Array, __global int *Output) { for(int j = 0; j < 100000; j++){ for(int i = 0; i < 12; i++){ Output[j+i] = (Array[j+i]);} } }";
int main() {
// This code executes on the OpenCL host
SNDFILE *sf;
SF_INFO info;
int num_channels;
int num, num_items;
//input and output data
int *Array;
int *OutputData;
int f, sr, c;
int i, j;
FILE *out;
/* Open the WAV file. */
info.format = 0;
sf = sf_open("Yourwavfilepathhere", SFM_READ, &info);
if (sf == NULL)
{
printf("Failed to open the file.\n");
perror("Error");
exit(-1);
}
/* Print some of the info, and figure out how much data to read. */
f = info.frames;
sr = info.samplerate;
int format = info.format;
c = info.channels;
printf("frames=%d\n", f);
printf("samplerate=%d\n", sr);
printf("channels=%d\n", c);
printf("format %i\n", format);
num_items = f*c;
printf("num_items=%d\n", num_items);
/* Allocate space for the data to be read, then read it. */
Array = (int *)malloc(num_items*sizeof(int));
OutputData = (int*)malloc(num_items*sizeof(int));;
num = sf_read_int(sf, Array, num_items);
sf_close(sf);
printf("Read %d items\n", num);
//Time variables for performance execution. Event variable needed for timing constraint
cl_event someEvent;
cl_ulong start = (cl_ulong)0;
cl_ulong end = (cl_ulong)0;
cl_ulong finalTime = (cl_ulong)0;
//Number of sampling points
int sampleSize = 100;
float h = 0;
//Coefficient used to multiply the values entering the FIFO buffer implemented inside the kernel
float coefficient = 1 / sampleSize;
//Signal Frequency in Hz
float signalFreq = 10;
//Number of points between 0 and max val (T_Sample)
float freqSample = sampleSize*signalFreq;
//Step = max value or T_Sample. ******Either 1/freqSample or 1/sampleSize for the stepSize******
float stepSize = 1.0 / freqSample;
/*
This is a different Example
*/
// Use this to check the output of each API call
cl_int status;
//-----------------------------------------------------
// STEP 1: Discover and initialize the platforms
//-----------------------------------------------------
cl_uint numPlatforms = 0;
cl_platform_id *platforms = NULL;
// Use clGetPlatformIDs() to retrieve the number of
// platforms
status = clGetPlatformIDs(0, NULL, &numPlatforms);
// Allocate enough space for each platform
platforms =
(cl_platform_id*)malloc(
numPlatforms*sizeof(cl_platform_id));
// Fill in platforms with clGetPlatformIDs()
status = clGetPlatformIDs(numPlatforms, platforms,
NULL);
//-----------------------------------------------------
// STEP 2: Discover and initialize the devices
//-----------------------------------------------------
cl_uint numDevices = 0;
cl_device_id *devices = NULL;
// Use clGetDeviceIDs() to retrieve the number of
// devices present
status = clGetDeviceIDs(
platforms[0],
CL_DEVICE_TYPE_CPU,
0,
NULL,
&numDevices);
// Allocate enough space for each device
devices =
(cl_device_id*)malloc(
numDevices*sizeof(cl_device_id));
// Fill in devices with clGetDeviceIDs()
status = clGetDeviceIDs(
platforms[0],
CL_DEVICE_TYPE_CPU,
numDevices,
devices,
NULL);
//-----------------------------------------------------
// STEP 3: Create a context
//-----------------------------------------------------
cl_context context = NULL;
// Create a context using clCreateContext() and
// associate it with the devices
context = clCreateContext(
NULL,
numDevices,
devices,
NULL,
NULL,
&status);
//-----------------------------------------------------
// STEP 4: Create a command queue
//-----------------------------------------------------
cl_command_queue cmdQueue;
// Create a command queue using clCreateCommandQueue(),
// and associate it with the device you want to execute
// on
cmdQueue = clCreateCommandQueue(
context,
devices[0],
CL_QUEUE_PROFILING_ENABLE,
&status);
//-----------------------------------------------------
// STEP 5: Create device buffers
//-----------------------------------------------------
cl_mem input;
cl_mem output;
cl_float coeff;
input = clCreateBuffer(
context,
CL_MEM_READ_ONLY,
num_items,
NULL,
&status);
output = clCreateBuffer(
context,
CL_MEM_WRITE_ONLY,
num_items,
NULL,
&status);
//-----------------------------------------------------
// STEP 6: Write host data to device buffers
//-----------------------------------------------------
// Use clEnqueueWriteBuffer() to write input array Array to
// the device buffer input
status = clEnqueueWriteBuffer(
cmdQueue,
input,
CL_FALSE,
0,
num_items,
Array,
0,
NULL,
NULL);
printf("status %i \n", status);
//-----------------------------------------------------
// STEP 7: Create and compile the program
//-----------------------------------------------------
// Create a program using clCreateProgramWithSource()
cl_program program = clCreateProgramWithSource(
context,
1,
(const char**)&prog,
NULL,
&status);
printf("status %i \n", status);
// Build (compile) the program for the devices with
// clBuildProgram()
status = clBuildProgram(
program,
numDevices,
devices,
NULL,
NULL,
NULL);
//-----------------------------------------------------
// STEP 8: Create the kernel
//-----------------------------------------------------
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "exchange", &status);
//-----------------------------------------------------
// STEP 9: Set the kernel arguments
//-----------------------------------------------------
// Associate the input and output buffers with the
// kernel
// using clSetKernelArg()
status = clSetKernelArg(
kernel,
0,
sizeof(cl_mem),
&input);
printf("Status %i \n",status);
status |= clSetKernelArg(
kernel,
1,
sizeof(cl_mem),
&output);
//-----------------------------------------------------
// STEP 10: Configure the work-item structure
//-----------------------------------------------------
// Define an index space (global work size) of work
// items for
// execution. A workgroup size (local work size) is not
// required,
// but can be used.
size_t globalWorkSize[1];
// There are 'elements' work-items
globalWorkSize[0] = sampleSize;
//-----------------------------------------------------
// STEP 11: Enqueue the kernel for execution
//-----------------------------------------------------
// Execute the kernel by using
// clEnqueueNDRangeKernel().
// 'globalWorkSize' is the 1D dimension of the
// work-items
status = clEnqueueNDRangeKernel(
cmdQueue,
kernel,
1,
NULL,
globalWorkSize,
NULL,
0,
NULL,
&someEvent);
clFinish(cmdQueue);
clGetEventProfilingInfo(someEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
clGetEventProfilingInfo(someEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
double totalTime = end - start;
printf("Total time is: %f ms \n", totalTime / 1000000.0);
//-----------------------------------------------------
// STEP 12: Read the output buffer back to the host
//-----------------------------------------------------
// Use clEnqueueReadBuffer() to read the OpenCL output
// buffer (bufferC)
// to the host output array (C)
printf("Made it here! %i \n", status);
clEnqueueReadBuffer(
cmdQueue,
output,
CL_TRUE,
0,
num_items,
OutputData,
0,
NULL,
NULL);
printf("Made it here2! %i \n", status);
SNDFILE * outfile = sf_open("outputwavfilepathhere", SFM_WRITE, &info);
sf_count_t count = sf_write_int(outfile, OutputData, num_items);
sf_write_sync(outfile);
sf_close(outfile);
//-----------------------------------------------------
// STEP 13: Release OpenCL resources
//-----------------------------------------------------
// Free OpenCL resources
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseContext(context);
// Free host resources
free(OutputData);
free(platforms);
free(devices);
free(Array);
}
答案 0 :(得分:1)
试试这个:
/*
 * Copies elements of Array into Output.
 *
 * Each work-item starts at its own global id and then advances by the total
 * number of work-items, so together the work-items cover every index below
 * the upper bound exactly once.
 *
 * NOTE: the upper bound 1200000 is hard-coded. Both buffers must hold at
 * least that many ints; ideally the bound would be passed in as an extra
 * kernel argument.
 */
__kernel void exchange(__global int *Array, __global int *Output)
{
    int globalSize = get_global_size(0);
    int globalId = get_global_id(0);
    for (int i = globalId; i < 1200000; i += globalSize) {
        Output[i] = Array[i];
    }
}
确保在for循环中使用正确的上限。理想情况下,您将此作为另一个参数传递。
你最初的写法有问题:每个工作项都只是在重复写入前100011个元素(最大下标为 99999 + 11 = 100010)。请重新阅读工作项函数(work-item functions)的文档,弄清楚这些变量的含义。 OpenCL 1.2 reference here.