我想在大型2D复杂阵列上运行一个函数(最终2 * 12x2 * 12数据点)。但是,pycuda不能按预期工作。 ElementWise函数在2d数组中不起作用,因此我使用了具有块大小的SourceModule函数。
现在的问题是GPU上的C代码与CPU上的numpy计算结果不一致。结果非常大而奇怪。
我正在使用以下代码。出了什么问题?
#!/usr/bin/env python
#https://github.com/lebedov/scikits.cuda/blob/master/demos/indexing_2d_demo.py
"""
Demonstrates how to access 2D arrays within a PyCUDA kernel in a
numpy-consistent manner.
"""
from string import Template
import pycuda
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from matplotlib import pyplot as plt
# Set size
A = 2**3
B = 2**3
N = A*B
x_gpu = gpuarray.to_gpu(np.fromfunction(lambda x,y: (1.+x)*np.exp(1.j*y*np.pi/10), (A,B)) )
y_gpu = gpuarray.to_gpu(np.fromfunction(lambda x,y: 1.*x, (A,B)).astype(
x_gpu.dtype))
d_gpu = gpuarray.to_gpu(np.zeros_like(x_gpu.get()))#.astype(np.float32))
func_mod_template = Template("""
// Macro for converting subscripts to linear index:
#define INDEX(a, b) a*${B}+b
#include <pycuda-complex.hpp>
//__global__ void func(double *d,double *x,double *y, unsigned int N) {
__global__ void func(pycuda::complex<float> *d,pycuda::complex<float> *x,
pycuda::complex<float> *y)
{
// Obtain the linear index corresponding to the current thread:
// unsigned int idx = blockIdx.y*blockDim.y*gridDim.x +
blockIdx.x*blockDim.x*gridDim.y +threadIdx.x+threadIdx.y;
unsigned int block_num = blockIdx.x + blockIdx.y * gridDim.x;
unsigned int thread_num = threadIdx.y * blockDim.x + threadIdx.x;
unsigned int threads_in_block = blockDim.x * blockDim.y;
unsigned int idx = (threads_in_block * block_num + thread_num);
// Convert the linear index to subscripts:
unsigned int a = idx/${B};
unsigned int b = idx%${B};
// Use the subscripts to access the array:
// d[INDEX(a,b)] = x[INDEX(a,b)]+y[INDEX(a,b)];
pycuda::complex<float> j(0,arg(x[idx]));
pycuda::complex<float> i(abs(x[idx]),0);
d[idx] = i * exp(j);
}
""")
max_threads_per_block = pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_THREADS_PER_BLOCK)
max_block_dim = (pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_BLOCK_DIM_X),
pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_BLOCK_DIM_Y),
pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_BLOCK_DIM_Z))
max_grid_dim = (pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_GRID_DIM_X),
pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_GRID_DIM_Y),
pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_GRID_DIM_Z))
max_blocks_per_grid = max(max_grid_dim)
block_dim = max_block_dim
block_dim = (max_block_dim[0],1,1)
grid_dim = (int(np.ceil(1.*x_gpu.shape[0]/block_dim[0])),
int(np.ceil(1.*x_gpu.shape[1]/block_dim[1])))
print block_dim,grid_dim, N
func_mod = \
SourceModule(func_mod_template.substitute(max_threads_per_block=max_threads_per_block,
max_blocks_per_grid=max_blocks_per_grid,
A=A, B=B))
func = func_mod.get_function('func')
func(d_gpu,x_gpu,y_gpu,
block=block_dim,
grid=grid_dim)
print d_gpu.get()/x_gpu.get()
#print 'Success status: ', np.allclose(x_np, x_gpu.get())
plt.imshow((d_gpu.get()/x_gpu.get()).real)
plt.colorbar()
plt.show()
答案 0 :(得分:2)
作为实际答案:将x_gpu
行更改为
x_gpu = gpuarray.to_gpu(np.fromfunction(
lambda x,y: (1.+x)*np.exp(1.j*y*np.pi/10), (A,B)).astype(np.complex64) )
似乎解决了这个问题。此外,虽然ElementwiseKernel
不适用于2D阵列,但无论如何你都在使用2d-> 1d转换,所以没有什么能阻止你写作
func = ElementwiseKernel(
"pycuda::complex<float> *d, pycuda::complex<float> *x, pycuda::complex<float> *y",
Template("""
// Convert the linear index to subscripts:
unsigned int a = i/${B};
unsigned int b = i%${B};
// Use the subscripts to access the array:
//d[INDEX(a,b)] = x[INDEX(a,b)]+y[INDEX(a,b)];
pycuda::complex<float> angle(0,arg(x[i]));
pycuda::complex<float> module(abs(x[i]),0);
d[i] = module * exp(angle);
""").substitute(A=A, B=B),
preamble=Template("""
#define INDEX(a, b) a*${B}+b
""").substitute(A=A, B=B))
func(d_gpu, x_gpu, y_gpu)
这样您就不需要处理块/网格大小,因为PyCUDA
会为您处理此问题。