原来的问题已经解决了(如果你感兴趣,可以看分隔线下面的第二部分)。现在我有一个新问题:为什么 #define BLOCK_DIM 16;
会导致下面的函数出错?如果直接使用字面量 16,
就没有问题。
以下是错误
expected a "]"
__local float2 block[BLOCK_DIM * (BLOCK_DIM + 1)] ;
^
line 110: error:
expected a ")"
__local float2 block[BLOCK_DIM * (BLOCK_DIM + 1)] ;
^
line 110: error: operand
of "*" must be a pointer
__local float2 block[BLOCK_DIM * (BLOCK_DIM + 1)] ;
error:
expected a ";"
int Idout = get_local_id(0)*(BLOCK_DIM+1)+get_local_id(1);
^
以下是函数:
/*
 * Transposes a matrix of float2 elements, staging one tile per work-group
 * in local memory between the read and the write.
 *
 * dataout - destination buffer, written at dstRow * height + dstCol
 * datain  - source buffer, read at srcRow * width + srcCol
 * width   - N (signal length)
 * height  - batch_size (number of signals in a batch)
 *
 * NOTE(review): the tile indexing presumes a 32 x 32 work-group; confirm
 * against the host-side launch configuration.
 */
__kernel void transpose(
    __global float2* dataout,
    __global float2* datain,
    int width, int height)
{
    /* Local-memory tile; each tile row is 32 + 1 elements long.
       NOTE(review): the extra column is presumably padding against
       local-memory bank conflicts; confirm on the target device. */
    __local float2 block[32 * (32 + 1)];

    const unsigned int srcCol = get_global_id(0);
    const unsigned int srcRow = get_global_id(1);

    /* Stage this work-item's element when it lies inside the matrix. */
    if ((srcCol < width) && (srcRow < height))
    {
        const unsigned int srcOffset = srcRow * width + srcCol;
        const int loadSlot = get_local_id(1) * (32 + 1) + get_local_id(0);
        block[loadSlot] = datain[srcOffset];
    }

    /* The barrier sits outside the bounds checks, so every work-item
       reaches it before any tile element is read back. */
    barrier(CLK_LOCAL_MEM_FENCE);

    /* The group ids are swapped here to address the transposed tile. */
    const unsigned int dstCol = get_group_id(1) * 32 + get_local_id(0);
    const unsigned int dstRow = get_group_id(0) * 32 + get_local_id(1);

    if ((dstCol < height) && (dstRow < width))
    {
        const unsigned int dstOffset = dstRow * height + dstCol;
        const int storeSlot = get_local_id(0) * (32 + 1) + get_local_id(1);
        dataout[dstOffset] = block[storeSlot];
    }
}
===============================
我正在努力提高图像二维 FFT 的性能。经过基准测试,我发现转置函数是拖慢程序的原因,所以我用一个更优化的版本替换了它。
但在那之后,之前工作正常的所有函数都返回了 CL_INVALID_KERNEL_NAME
错误码。除了转置函数和主机代码中的 clSetKernelArg
之外,我没有改动任何其他东西,所以我不知道问题出在哪里。希望大家能帮帮我 :))
更新:下面是错误信息。请不要在意行号 :) 这些代码行在我看来都很正常。哪里有问题吗?
错误:
expected a "]"
__local float2 block[BLOCK_DIM * (BLOCK_DIM + 1)] ;
^
line 110: error:
expected a ")"
__local float2 block[BLOCK_DIM * (BLOCK_DIM + 1)] ;
^
line 110: error: operand
of "*" must be a pointer
__local float2 block[BLOCK_DIM * (BLOCK_DIM + 1)] ;
error:
expected a ";"
int Idout = get_local_id(0)*(BLOCK_DIM+1)+get_local_id(1);
^
以下是新的内核文件(kernel file):
/* Tile edge length used by the transpose kernel. There is no trailing
   semicolon: the preprocessor would paste it into every use of BLOCK_DIM. */
#define BLOCK_DIM 16

/*
 * Transposes a matrix of float2 elements, staging one tile per work-group
 * in local memory between the read and the write.
 *
 * dataout - destination buffer, written at dstRow * height + dstCol
 * datain  - source buffer, read at srcRow * width + srcCol
 * width   - N (signal length)
 * height  - batch_size (number of signals in a batch)
 *
 * NOTE(review): the tile indexing presumes a BLOCK_DIM x BLOCK_DIM
 * work-group; confirm against the host-side launch configuration.
 */
__kernel void transpose(
    __global float2* dataout,
    __global float2* datain,
    int width, int height)
{
    /* Local-memory tile; each tile row is BLOCK_DIM + 1 elements long.
       NOTE(review): the extra column is presumably padding against
       local-memory bank conflicts; confirm on the target device. */
    __local float2 block[BLOCK_DIM * (BLOCK_DIM + 1)];

    const unsigned int srcCol = get_global_id(0);
    const unsigned int srcRow = get_global_id(1);

    /* Stage this work-item's element when it lies inside the matrix. */
    if ((srcCol < width) && (srcRow < height))
    {
        const unsigned int srcOffset = srcRow * width + srcCol;
        const int loadSlot = get_local_id(1) * (BLOCK_DIM + 1) + get_local_id(0);
        block[loadSlot] = datain[srcOffset];
    }

    /* The barrier sits outside the bounds checks, so every work-item
       reaches it before any tile element is read back. */
    barrier(CLK_LOCAL_MEM_FENCE);

    /* The group ids are swapped here to address the transposed tile. */
    const unsigned int dstCol = get_group_id(1) * BLOCK_DIM + get_local_id(0);
    const unsigned int dstRow = get_group_id(0) * BLOCK_DIM + get_local_id(1);

    if ((dstCol < height) && (dstRow < width))
    {
        const unsigned int dstOffset = dstRow * height + dstCol;
        const int storeSlot = get_local_id(0) * (BLOCK_DIM + 1) + get_local_id(1);
        dataout[dstOffset] = block[storeSlot];
    }
}
答案 0 :(得分:0)
问题出在你的 #define 上:宏定义的末尾不需要分号。基本上,#define X Y 会在编译之前把代码中所有出现的“X”替换成“Y”;如果你在末尾加了分号,分号就会成为“Y”的一部分,从而产生大量语法错误。例如,block[BLOCK_DIM * (BLOCK_DIM + 1)] 会被展开成 block[16; * (16; + 1)]。#define 是预处理指令,不是语句。
当然,这只是一个简化的解释,但对于这个问题来说已经足够了(如果你想了解更多,建议你看一下预处理器的教程和文档)。