大家, 我最近尝试在OpenCL中实现快速傅里叶变换,并尝试了Book" OpenCL in Action"中提供的一些代码。以下代码旨在通过在位反转后执行4-compoent-wise FFT来初始化输入序列(或离散信号):
fft.cl:
__kernel void fft_init( __global float2* g_data, __global float2* l_data, uint points_per_group, uint size, int dir )
{
uint g_addr, l_addr, points_per_item;
points_per_item = points_per_group / get_local_size( 0 );
l_addr = get_local_id( 0 ) * points_per_item;
g_addr = get_group_id( 0 ) * points_per_group + l_addr;
uint4 index;
uint mask_left, mask_right, shift_pos;
uint4 br;
float2 x1, x2, x3, x4;
float2 sum12, diff12, sum34, diff34;
for (int i = 0; i < points_per_item; i += 4)
{
index = (uint4){ g_addr, g_addr + 1, g_addr + 2, g_addr + 3 };
mask_left = size / 2;
mask_right = 1;
shift_pos = log2( ( float ) size ) - 1;
while( shift_pos > 1 )
{
br = ( index << shift_pos ) & mask_left;
br |= ( index >> shift_pos ) & mask_right;
mask_left >>= 1;
mask_right <<= 1;
shift_pos -= 2;
}
x1 = g_data[ br.s0 ];
x2 = g_data[ br.s1 ];
x3 = g_data[ br.s2 ];
x4 = g_data[ br.s3 ];
sum12 = x1 + x2;
diff12 = x1 - x2;
sum34 = x3 + x4;
diff34 = ( float2 ){
x3.s1 - x4.s1,
x4.s0 - x3.s1
};
l_data[ l_addr ] = sum12 + sum34;
l_data[ l_addr + 1 ] = diff12 + diff34;
l_data[ l_addr + 2 ] = sum12 - sum34;
l_data[ l_addr + 3 ] = diff12 - diff34;
g_addr += 4;
l_addr += 4;
}
}
当我设置参数&#34; points_per_group&#34;时,我尝试了一个非常短的4个点(4个复数)序列。和&#34;尺寸&#34;到4,GPU耗尽了资源;虽然我将这两个参数设置为8,这显然与输入序列不一致,但它没有崩溃;我还尝试了另一个大小为8的序列,其中包含&#34; points_per_group&#34;和&#34;尺寸&#34;设置为8,它也运行良好。 这是主机端代码的一部分;我用Python编写,使用pyopencl:
def setInputs( self ):
self.numPoints = 8
self.points = ( 1.0, 1.0 ) * self.numPoints
self.pointsArray = numpy.array( self.points, dtype = numpy.float32 )
self.resultArray = numpy.array( ( -1.0, -1.0 ) * self.numPoints, dtype = numpy.float32 )
self.pointsBuffer = cl.Buffer( self.context,
cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
hostbuf = self.pointsArray
)
self.resultBuffer = cl.Buffer( self.context,
cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
hostbuf = self.resultArray
)
def runKernel( self ):
globalWorkSize = ( 1, )
localWorkSize = ( 1, )
print 'Input points : ', self.pointsArray
event = self.program.fft_init( self.commandQueues[ 0 ],
globalWorkSize,
localWorkSize,
self.pointsBuffer,
self.resultBuffer,
numpy.uint32( 8 ),
numpy.uint32( 8 ),
numpy.int32( 1 )
)
event.wait()
print 'Time Consumption : ', ( event.profile.end - event.profile.start ) * 1e-9, ' seconds'
cl.enqueue_copy( self.commandQueues[ 0 ],
self.resultArray,
self.resultBuffer
).wait()
print 'Result : ', self.resultArray
对这个奇怪的问题有什么看法吗? :-o