Question

我正从一个拥有超过一百万个元素的数组中读取大整数值。获取的值来自使用libsndfile库的wav文件。现在，如果我不使用内核，我可以将原始数组写入我的输出文件并听取音频没有问题。但是，当我决定使用内核做同样的事情时，它只能写不到一秒的歌曲。

起初，我以为这是内存问题，于是尝试调整缓冲区大小，但仍然没有效果。然后我怀疑问题出在内核中的循环上，又尝试修改循环的取值，结果还是一样（仍然不起作用）。我现在被难住了，不知道该怎么办。我的代码如下：其中一部分是我自己写的，但用于设置内核的主体结构是我在网上找到的。

在此代码的最底部，如果我将OutputData更改为Array，我会得到完全相同的音频。我很确定内核有问题，这就是为什么它不会写回整首歌。

我知道这段代码很乱，但要测试它，你只需复制并粘贴代码，然后修改输入 wav 文件和输出 wav 文件的路径即可。

为了明确目标，我将尝试修改wav文件中的每个值，看看会发生什么。到目前为止，如果我将内核中的输出值乘以2，则会使其失真。但同样，只持续1秒钟，剪辑的其余部分为空。请注意，输入和输出文件的大小都相同。

我的 for 循环总共执行 120 万次迭代，因为我的示例 wav 文件中正好有这么多个元素。

const char* prog = "__kernel void exchange(__global int *Array, __global int *Output) { for(int j = 0; j < 100000; j++){ for(int i = 0; i < 12; i++){ Output[j+i] = (Array[j+i]);}  }  }";

int main() {
// This code executes on the OpenCL host
SNDFILE *sf;
SF_INFO info;
int num_channels;
int num, num_items;
//input and output data
int *Array;
int *OutputData;

int f, sr, c;
int i, j;
FILE *out;

/* Open the WAV file. */
info.format = 0;
sf = sf_open("Yourwavfilepathhere", SFM_READ, &info);
if (sf == NULL)
{
    printf("Failed to open the file.\n");
    perror("Error");
    exit(-1);
}
/* Print some of the info, and figure out how much data to read. */
f = info.frames;
sr = info.samplerate;
int format = info.format;
c = info.channels;
printf("frames=%d\n", f);
printf("samplerate=%d\n", sr);
printf("channels=%d\n", c);
printf("format %i\n", format);
num_items = f*c;
printf("num_items=%d\n", num_items);
/* Allocate space for the data to be read, then read it. */
Array = (int *)malloc(num_items*sizeof(int));
OutputData = (int*)malloc(num_items*sizeof(int));;
num = sf_read_int(sf, Array, num_items);
sf_close(sf);
printf("Read %d items\n", num);

//Time variables for performance execution. Event variable needed for timing constraint 
cl_event someEvent;
cl_ulong start = (cl_ulong)0;
cl_ulong end = (cl_ulong)0;
cl_ulong finalTime = (cl_ulong)0;

//Number of sampling points 
int sampleSize = 100;
float h = 0;

//Coefficient used to multiply the values entering the FIFO buffer implemented inside the kernel
float coefficient = 1 / sampleSize;

//Signal Frequency in Hz
float signalFreq = 10;

//Number of points between 0 and max val (T_Sample)
float freqSample = sampleSize*signalFreq;

//Step = max value or T_Sample. ******Either 1/freqSample or 1/sampleSize for the stepSize******
float stepSize = 1.0 / freqSample;

/*
  This is a different Example
*/


// Use this to check the output of each API call
cl_int status;

//-----------------------------------------------------
// STEP 1: Discover and initialize the platforms
//-----------------------------------------------------

cl_uint numPlatforms = 0;

cl_platform_id *platforms = NULL;

// Use clGetPlatformIDs() to retrieve the number of 
// platforms
status = clGetPlatformIDs(0, NULL, &numPlatforms);

// Allocate enough space for each platform
platforms =
    (cl_platform_id*)malloc(
        numPlatforms*sizeof(cl_platform_id));

// Fill in platforms with clGetPlatformIDs()
status = clGetPlatformIDs(numPlatforms, platforms,
    NULL);

//-----------------------------------------------------
// STEP 2: Discover and initialize the devices
//----------------------------------------------------- 

cl_uint numDevices = 0;
cl_device_id *devices = NULL;

// Use clGetDeviceIDs() to retrieve the number of 
// devices present
status = clGetDeviceIDs(
    platforms[0],
    CL_DEVICE_TYPE_CPU,
    0,
    NULL,
    &numDevices);

// Allocate enough space for each device
devices =
    (cl_device_id*)malloc(
        numDevices*sizeof(cl_device_id));

// Fill in devices with clGetDeviceIDs()
status = clGetDeviceIDs(
    platforms[0],
    CL_DEVICE_TYPE_CPU,
    numDevices,
    devices,
    NULL);


//-----------------------------------------------------
// STEP 3: Create a context
//----------------------------------------------------- 

cl_context context = NULL;

// Create a context using clCreateContext() and 
// associate it with the devices
context = clCreateContext(
    NULL,
    numDevices,
    devices,
    NULL,
    NULL,
    &status);

//-----------------------------------------------------
// STEP 4: Create a command queue
//----------------------------------------------------- 

cl_command_queue cmdQueue;

// Create a command queue using clCreateCommandQueue(),
// and associate it with the device you want to execute 
// on
cmdQueue = clCreateCommandQueue(
    context,
    devices[0],
    CL_QUEUE_PROFILING_ENABLE,
    &status);

//-----------------------------------------------------
// STEP 5: Create device buffers
//----------------------------------------------------- 

cl_mem input;
cl_mem output;
cl_float coeff;

input = clCreateBuffer(
    context,
    CL_MEM_READ_ONLY,
    num_items,
    NULL,
    &status);

output = clCreateBuffer(
    context,
    CL_MEM_WRITE_ONLY,
    num_items,
    NULL,
    &status);

//-----------------------------------------------------
// STEP 6: Write host data to device buffers
//----------------------------------------------------- 

// Use clEnqueueWriteBuffer() to write input array Array to
// the device buffer input
status = clEnqueueWriteBuffer(
    cmdQueue,
    input,
    CL_FALSE,
    0,
    num_items,
    Array,
    0,
    NULL,
    NULL);

printf("status %i \n", status);

//-----------------------------------------------------
// STEP 7: Create and compile the program
//----------------------------------------------------- 

// Create a program using clCreateProgramWithSource()
cl_program program = clCreateProgramWithSource(
    context,
    1,
    (const char**)&prog,
    NULL,
    &status);
printf("status %i \n", status);

// Build (compile) the program for the devices with
// clBuildProgram()
status = clBuildProgram(
    program,
    numDevices,
    devices,
    NULL,
    NULL,
    NULL);

//-----------------------------------------------------
// STEP 8: Create the kernel
//----------------------------------------------------- 

cl_kernel kernel = NULL;

kernel = clCreateKernel(program, "exchange", &status);

//-----------------------------------------------------
// STEP 9: Set the kernel arguments
//----------------------------------------------------- 

// Associate the input and output buffers with the 
// kernel 
// using clSetKernelArg()
status = clSetKernelArg(
    kernel,
    0,
    sizeof(cl_mem),
    &input);
printf("Status %i \n",status);

status |= clSetKernelArg(
    kernel,
    1,
    sizeof(cl_mem),
    &output);


//-----------------------------------------------------
// STEP 10: Configure the work-item structure
//----------------------------------------------------- 

// Define an index space (global work size) of work 
// items for 
// execution. A workgroup size (local work size) is not 
// required, 
// but can be used.
size_t globalWorkSize[1];
// There are 'elements' work-items 
globalWorkSize[0] = sampleSize;

//-----------------------------------------------------
// STEP 11: Enqueue the kernel for execution
//----------------------------------------------------- 

// Execute the kernel by using 
// clEnqueueNDRangeKernel().
// 'globalWorkSize' is the 1D dimension of the 
// work-items
status = clEnqueueNDRangeKernel(
    cmdQueue,
    kernel,
    1,
    NULL,
    globalWorkSize,
    NULL,
    0,
    NULL,
    &someEvent);

clFinish(cmdQueue);

clGetEventProfilingInfo(someEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
clGetEventProfilingInfo(someEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);

double totalTime = end - start;

printf("Total time is: %f ms \n", totalTime / 1000000.0);
//-----------------------------------------------------
// STEP 12: Read the output buffer back to the host
//----------------------------------------------------- 

// Use clEnqueueReadBuffer() to read the OpenCL output  
// buffer (bufferC) 
// to the host output array (C)
printf("Made it here! %i \n", status);
clEnqueueReadBuffer(
    cmdQueue,
    output,
    CL_TRUE,
    0,
    num_items,
    OutputData,
    0,
    NULL,
    NULL);
printf("Made it here2! %i \n", status);


SNDFILE * outfile = sf_open("outputwavfilepathhere", SFM_WRITE, &info);
sf_count_t count = sf_write_int(outfile, OutputData, num_items);
sf_write_sync(outfile);
sf_close(outfile);

//-----------------------------------------------------
// STEP 13: Release OpenCL resources
//----------------------------------------------------- 

// Free OpenCL resources
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseContext(context);

// Free host resources
free(OutputData);
free(platforms);
free(devices);
free(Array);
}

Answer 1

试试这个：

/*
 * Copies Array into Output.
 *
 * Each work-item starts at its own global id and advances by the global
 * work size, so the work-items share the index range between them instead
 * of all writing the same elements.
 *
 * The upper bound 1200000 is hard-coded and must match the number of
 * elements in both buffers.
 */
__kernel void exchange(__global int *Array, __global int *Output)
{
    int globalSize = get_global_size(0);  /* total number of work-items */
    int globalId = get_global_id(0);      /* index of this work-item */

    for(int i = globalId; i < 1200000; i += globalSize){
        Output[i] = (Array[i]);
    }
}

确保在for循环中使用正确的上限。理想情况下，您将此作为另一个参数传递。

你最初的写法有问题：每个工作项都只是在重复写入前 100011 个元素（下标 j+i 最大为 99999+11=100010）。请复习一下工作项函数（work-item functions，例如 get_global_id 和 get_global_size），以了解这些变量的含义。参见 OpenCL 1.2 参考文档。

OpenCL内核只部分写入输出缓冲区

1 个答案: