我正在使用pyOpenCL优化我的Mandelbrot渲染器,并希望将迭代分割成块,以便我可以更好地利用我的GPU。
max iterations = 1000和2“chunks”的示例:
1.对迭代0-500运行mandelbrot转义算法
2.为迭代<的每个点保存所需的迭代。 500并再次运行所有其他点,迭代次数为500 - 1000。
第一个循环的工作方式与预期相同,但之后的每个块都会导致错误的结果。我真的很想更具体,但我不知道真正的问题在哪里(现在代码为>主演2天)。
我怀疑从内核复制旧的x,y(真实的,虚构的)部分时会出现问题,但我不知道如何调试它。
我在GPU和CPU上运行的结果相同,所以我猜它没有特定的GPU。
迭代= 2000和10个块的示例图像:
这几乎只是第一块(加上一些“错误”的像素)
全部在一个块中完成(迭代= 200和1块):
iterations = 2000和chunks = 1 的预期结果:
import pyopencl as cl
import numpy as np
from PIL import Image
from decimal import Decimal
def mandel(ctx, x, y, zoom, max_iter=1000, iter_steps=1, width=500, height=500, use_double=False):
mf = cl.mem_flags
cl_queue = cl.CommandQueue(ctx)
# build program
code = """
#if real_t == double
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
kernel void mandel(
__global real_t *coords,
__global uint *output,
__global real_t *output_coord,
const uint max_iter,
const uint start_iter
){
uint id = get_global_id(0);
real_t2 my_coords = vload2(id, coords);
real_t x = my_coords.x;
real_t y = my_coords.y;
uint iter = 0;
for(iter=start_iter; iter<max_iter; ++iter){
if(x*x + y*y > 4.0f){
break;
}
real_t xtemp = x*x - y*y + my_coords.x;
y = 2*x*y + my_coords.y;
x = xtemp;
}
// copy the current x,y pair back
real_t2 val = (real_t2){x, y};
vstore2(val, id, output_coord);
output[id] = iter;
}
"""
_cltype, _nptype = ("double",np.float64) if use_double else ("float", np.float32)
prg = cl.Program(ctx, code).build("-cl-opt-disable -D real_t=%s -D real_t2=%s2" % (_cltype, _cltype))
# Calculate the "viewport".
x0 = x - ((Decimal(3) * zoom)/Decimal(2.))
y0 = y - ((Decimal(2) * zoom)/Decimal(2.))
x1 = x + ((Decimal(3) * zoom)/Decimal(2.))
y1 = y + ((Decimal(2) * zoom)/Decimal(2.))
# Create index map in x,y pairs
xx = np.arange(0, width, 1, dtype=np.uint32)
yy = np.arange(0, height, 1, dtype=np.uint32)
index_map = np.dstack(np.meshgrid(xx, yy))
# and local "coordinates" (real, imaginary parts)
coord_map = np.ndarray(index_map.shape, dtype=_nptype)
coord_map[:] = index_map
coord_map[:] *= (_nptype((x1-x0)/Decimal(width)), _nptype((y1-y0)/Decimal(height)))
coord_map[:] += (_nptype(x0), _nptype(y0))
coord_map = coord_map.flatten()
index_map = index_map.flatten().astype(dtype=np.uint32)
# Create input and output buffer
buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes)
buffer_out = np.zeros(width*height, dtype=np.uint32) # This will contain the iteration values of that run
buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes)
buffer_out_coords = np.zeros(width*height*2, dtype=_nptype) # This the last x,y values
buffer_out_coords_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out_coords.nbytes)
# 2D Buffer to collect the iterations needed per pixel
#iter_map = np.zeros(width*height, dtype=np.uint32).reshape((width, height)) #.reshape((height, width))
iter_map = np.zeros(width*height, dtype=np.uint32).reshape((height, width))
start_max_iter = 0
to_do = coord_map.size / 2
steps_size = int(max_iter / float(iter_steps))
while to_do > 0 and start_max_iter < max_iter:
end_max_iter = min(max_iter, start_max_iter + steps_size )
print "Iterations from iteration %i to %i for %i numbers" % (start_max_iter, end_max_iter, to_do)
# copy x/y pairs to device
cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do*2]).wait()
# and finally call the ocl function
prg.mandel(cl_queue, (to_do,), None,
buffer_in_cl,
buffer_out_cl,
buffer_out_coords_cl,
np.uint32(end_max_iter),
np.uint32(start_max_iter)
).wait()
# Copy the output back
cl.enqueue_copy(cl_queue, buffer_out_coords, buffer_out_coords_cl).wait()
cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait()
# Get indices of "found" escapes
done = np.where(buffer_out[:to_do]<end_max_iter)[0]
# and write the iterations to the coresponding cell
index_reshaped = index_map[:to_do*2].reshape((to_do, 2))
tmp = index_reshaped[done]
iter_map[tmp[:,1], tmp[:,0]] = buffer_out[done]
#iter_map[tmp[:,0], tmp[:,1]] = buffer_out[done]
# Get the indices of non escapes
undone = np.where(buffer_out[:to_do]==end_max_iter)[0]
# and write them back to our "job" maps for the next loop
tmp = buffer_out_coords[:to_do*2].reshape((to_do, 2))
coord_map[:undone.size*2] = tmp[undone].flatten()
index_map[:undone.size*2] = index_reshaped[undone].flatten()
to_do = undone.size
start_max_iter = end_max_iter
print "%i done. %i unknown" % (done.size, undone.size)
# simple coloring by modulo 255 on the iter_map
return (iter_map % 255).astype(np.uint8).reshape((height, width))
if __name__ == '__main__':
ctx = cl.create_some_context(interactive=True)
img = mandel(ctx,
x=Decimal("-0.7546546453361122021732941811"),
y=Decimal("0.05020518634419688663435986387"),
zoom=Decimal("0.0002046859427855630601247281079"),
max_iter=2000,
iter_steps=1,
width=500,
height=400,
use_double=False
)
Image.fromarray(img).show()
编辑:Here是另一个版本,其中实/虚部件永远不会离开GPU内存。
结果是一样的
我完全没有想法。
答案 0 :(得分:2)
在进行buffer_out_coords
计算时,您使用Z squared plus c
中更新的“co-ords”而不是原始坐标作为c值。 buffer_out_coords包含当前Z
值,而不是原始的c
值,因此这些是您想要开始的值,但您还需要原始的co-ords。
Theres只需要4次更改:
我没有获得与你提供的代码相同的输出,所以我不确定这里是否有其他错误但是这个改变给了我一致的输出:
1 chunk = 153052 unknown
PYOPENCL_COMPILER_OUTPUT=1 PYOPENCL_CTX='0' oclgrind python testmand.py
Iterations from iteration 0 to 500 for 200000 numbers
46948 done. 153052 unknown
5个块= 153052未知
PYOPENCL_COMPILER_OUTPUT=1 PYOPENCL_CTX='0' oclgrind python testmand.py
Iterations from iteration 0 to 100 for 200000 numbers
0 done. 200000 unknown
Iterations from iteration 100 to 200 for 200000 numbers
11181 done. 188819 unknown
Iterations from iteration 200 to 300 for 188819 numbers
9627 done. 179192 unknown
Iterations from iteration 300 to 400 for 179192 numbers
16878 done. 162314 unknown
Iterations from iteration 400 to 500 for 162314 numbers
9262 done. 153052 unknown
下面是代码:
import pyopencl as cl
import numpy as np
from PIL import Image
from decimal import Decimal
def mandel(ctx, x, y, zoom, max_iter=1000, iter_steps=1, width=500, height=500, use_double=False):
mf = cl.mem_flags
cl_queue = cl.CommandQueue(ctx)
# build program
code = """
#if real_t == double
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
kernel void mandel(
__global real_t *coords,
__global uint *output,
__global real_t *output_coord,
const uint max_iter,
const uint start_iter
){
uint id = get_global_id(0);
real_t2 my_coords = vload2(id, coords);
real_t2 my_value_coords = vload2(id, output_coord);
real_t x = my_value_coords.x;
real_t y = my_value_coords.y;
uint iter = 0;
for(iter=start_iter; iter<max_iter; ++iter){
if(x*x + y*y > 4.0f){
break;
}
real_t xtemp = x*x - y*y + my_coords.x;
y = 2*x*y + my_coords.y;
x = xtemp;
}
// copy the current x,y pair back
real_t2 val = (real_t2){x, y};
vstore2(val, id, output_coord);
output[id] = iter;
}
"""
_cltype, _nptype = ("double",np.float64) if use_double else ("float", np.float32)
prg = cl.Program(ctx, code).build("-cl-opt-disable -D real_t=%s -D real_t2=%s2" % (_cltype, _cltype))
# Calculate the "viewport".
x0 = x - ((Decimal(3) * zoom)/Decimal(2.))
y0 = y - ((Decimal(2) * zoom)/Decimal(2.))
x1 = x + ((Decimal(3) * zoom)/Decimal(2.))
y1 = y + ((Decimal(2) * zoom)/Decimal(2.))
# Create index map in x,y pairs
xx = np.arange(0, width, 1, dtype=np.uint32)
yy = np.arange(0, height, 1, dtype=np.uint32)
index_map = np.dstack(np.meshgrid(xx, yy))
# and local "coordinates" (real, imaginary parts)
coord_map = np.ndarray(index_map.shape, dtype=_nptype)
coord_map[:] = index_map
coord_map[:] *= (_nptype((x1-x0)/Decimal(width)), _nptype((y1-y0)/Decimal(height)))
coord_map[:] += (_nptype(x0), _nptype(y0))
coord_map = coord_map.flatten()
index_map = index_map.flatten().astype(dtype=np.uint32)
# Create input and output buffer
buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes)
buffer_out = np.zeros(width*height, dtype=np.uint32) # This will contain the iteration values of that run
buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes)
buffer_out_coords = np.zeros(width*height*2, dtype=_nptype) # This the last x,y values
buffer_out_coords_cl = cl.Buffer(ctx, mf.READ_WRITE, size=buffer_out_coords.nbytes)
# 2D Buffer to collect the iterations needed per pixel
#iter_map = np.zeros(width*height, dtype=np.uint32).reshape((width, height)) #.reshape((height, width))
iter_map = np.zeros(width*height, dtype=np.uint32).reshape((height, width))
start_max_iter = 0
to_do = coord_map.size / 2
steps_size = int(max_iter / float(iter_steps))
while to_do > 0 and start_max_iter < max_iter:
end_max_iter = min(max_iter, start_max_iter + steps_size )
print "Iterations from iteration %i to %i for %i numbers" % (start_max_iter, end_max_iter, to_do)
# copy x/y pairs to device
cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do*2]).wait()
cl.enqueue_copy(cl_queue, buffer_out_coords_cl, buffer_out_coords[:to_do*2]).wait()
# and finally call the ocl function
prg.mandel(cl_queue, (to_do,), None,
buffer_in_cl,
buffer_out_cl,
buffer_out_coords_cl,
np.uint32(end_max_iter),
np.uint32(start_max_iter)
).wait()
# Copy the output back
cl.enqueue_copy(cl_queue, buffer_out_coords, buffer_out_coords_cl).wait()
cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait()
# Get indices of "found" escapes
done = np.where(buffer_out[:to_do]<end_max_iter)[0]
# and write the iterations to the coresponding cell
index_reshaped = index_map[:to_do*2].reshape((to_do, 2))
tmp = index_reshaped[done]
iter_map[tmp[:,1], tmp[:,0]] = buffer_out[done]
#iter_map[tmp[:,0], tmp[:,1]] = buffer_out[done]
# Get the indices of non escapes
undone = np.where(buffer_out[:to_do]==end_max_iter)[0]
# and write them back to our "job" maps for the next loop
tmp = buffer_out_coords[:to_do*2].reshape((to_do, 2))
buffer_out_coords[:undone.size*2] = tmp[undone].flatten()
tmp = coord_map[:to_do*2].reshape((to_do, 2))
coord_map[:undone.size*2] = tmp[undone].flatten()
index_map[:undone.size*2] = index_reshaped[undone].flatten()
to_do = undone.size
start_max_iter = end_max_iter
print "%i done. %i unknown" % (done.size, undone.size)
# simple coloring by modulo 255 on the iter_map
return (iter_map % 255).astype(np.uint8).reshape((height, width))
if __name__ == '__main__':
ctx = cl.create_some_context(interactive=True)
img = mandel(ctx,
x=Decimal("-0.7546546453361122021732941811"),
y=Decimal("0.05020518634419688663435986387"),
zoom=Decimal("0.0002046859427855630601247281079"),
max_iter=2000,
iter_steps=1,
width=500,
height=400,
use_double=False
)
Image.fromarray(img).show()