Question

我正在使用pyOpenCL优化我的Mandelbrot渲染器，并希望将迭代分割成块，以便我可以更好地利用我的GPU。
max iterations = 1000和2“chunks”的示例：
1.对迭代0-500运行mandelbrot转义算法 2.为迭代＆lt;的每个点保存所需的迭代。 500并再次运行所有其他点，迭代次数为500 - 1000。

第一个循环的工作方式与预期相同，但之后的每个块都会导致错误的结果。我真的很想更具体，但我不知道真正的问题在哪里（现在代码为＆gt;主演2天）。
我怀疑从内核复制旧的x，y（真实的，虚构的）部分时会出现问题，但我不知道如何调试它。
我在GPU和CPU上运行的结果相同，所以我猜它没有特定的GPU。

迭代= 2000和10个块的示例图像：
enter image description here

这几乎只是第一块（加上一些“错误”的像素） 全部在一个块中完成（迭代= 200和1块）：
enter image description here

iterations = 2000和chunks = 1 的预期结果：

import pyopencl as cl import numpy as np from PIL import Image from decimal import Decimal def mandel(ctx, x, y, zoom, max_iter=1000, iter_steps=1, width=500, height=500, use_double=False): mf = cl.mem_flags cl_queue = cl.CommandQueue(ctx) # build program code = """ #if real_t == double #pragma OPENCL EXTENSION cl_khr_fp64 : enable #endif kernel void mandel( __global real_t *coords, __global uint *output, __global real_t *output_coord, const uint max_iter, const uint start_iter ){ uint id = get_global_id(0); real_t2 my_coords = vload2(id, coords); real_t x = my_coords.x; real_t y = my_coords.y; uint iter = 0; for(iter=start_iter; iter<max_iter; ++iter){ if(x*x + y*y > 4.0f){ break; } real_t xtemp = x*x - y*y + my_coords.x; y = 2*x*y + my_coords.y; x = xtemp; } // copy the current x,y pair back real_t2 val = (real_t2){x, y}; vstore2(val, id, output_coord); output[id] = iter; } """ _cltype, _nptype = ("double",np.float64) if use_double else ("float", np.float32) prg = cl.Program(ctx, code).build("-cl-opt-disable -D real_t=%s -D real_t2=%s2" % (_cltype, _cltype)) # Calculate the "viewport". x0 = x - ((Decimal(3) * zoom)/Decimal(2.)) y0 = y - ((Decimal(2) * zoom)/Decimal(2.)) x1 = x + ((Decimal(3) * zoom)/Decimal(2.)) y1 = y + ((Decimal(2) * zoom)/Decimal(2.)) # Create index map in x,y pairs xx = np.arange(0, width, 1, dtype=np.uint32) yy = np.arange(0, height, 1, dtype=np.uint32) index_map = np.dstack(np.meshgrid(xx, yy)) # and local "coordinates" (real, imaginary parts) coord_map = np.ndarray(index_map.shape, dtype=_nptype) coord_map[:] = index_map coord_map[:] *= (_nptype((x1-x0)/Decimal(width)), _nptype((y1-y0)/Decimal(height))) coord_map[:] += (_nptype(x0), _nptype(y0)) coord_map = coord_map.flatten() index_map = index_map.flatten().astype(dtype=np.uint32) # Create input and output buffer buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes) buffer_out = np.zeros(width*height, dtype=np.uint32) # This will contain the iteration values of that run buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes) buffer_out_coords = np.zeros(width*height*2, dtype=_nptype) # This the last x,y values buffer_out_coords_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out_coords.nbytes) # 2D Buffer to collect the iterations needed per pixel #iter_map = np.zeros(width*height, dtype=np.uint32).reshape((width, height)) #.reshape((height, width)) iter_map = np.zeros(width*height, dtype=np.uint32).reshape((height, width)) start_max_iter = 0 to_do = coord_map.size / 2 steps_size = int(max_iter / float(iter_steps)) while to_do > 0 and start_max_iter < max_iter: end_max_iter = min(max_iter, start_max_iter + steps_size ) print "Iterations from iteration %i to %i for %i numbers" % (start_max_iter, end_max_iter, to_do) # copy x/y pairs to device cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do*2]).wait() # and finally call the ocl function prg.mandel(cl_queue, (to_do,), None, buffer_in_cl, buffer_out_cl, buffer_out_coords_cl, np.uint32(end_max_iter), np.uint32(start_max_iter) ).wait() # Copy the output back cl.enqueue_copy(cl_queue, buffer_out_coords, buffer_out_coords_cl).wait() cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait() # Get indices of "found" escapes done = np.where(buffer_out[:to_do]<end_max_iter)[0] # and write the iterations to the coresponding cell index_reshaped = index_map[:to_do*2].reshape((to_do, 2)) tmp = index_reshaped[done] iter_map[tmp[:,1], tmp[:,0]] = buffer_out[done] #iter_map[tmp[:,0], tmp[:,1]] = buffer_out[done] # Get the indices of non escapes undone = np.where(buffer_out[:to_do]==end_max_iter)[0] # and write them back to our "job" maps for the next loop tmp = buffer_out_coords[:to_do*2].reshape((to_do, 2)) coord_map[:undone.size*2] = tmp[undone].flatten() index_map[:undone.size*2] = index_reshaped[undone].flatten() to_do = undone.size start_max_iter = end_max_iter print "%i done. %i unknown" % (done.size, undone.size) # simple coloring by modulo 255 on the iter_map return (iter_map % 255).astype(np.uint8).reshape((height, width)) if __name__ == '__main__': ctx = cl.create_some_context(interactive=True) img = mandel(ctx, x=Decimal("-0.7546546453361122021732941811"), y=Decimal("0.05020518634419688663435986387"), zoom=Decimal("0.0002046859427855630601247281079"), max_iter=2000, iter_steps=1, width=500, height=400, use_double=False ) Image.fromarray(img).show()

编辑：Here是另一个版本，其中实/虚部件永远不会离开GPU内存。
结果是一样的我完全没有想法。

Answer 1

在进行buffer_out_coords计算时，您使用Z squared plus c中更新的“co-ords”而不是原始坐标作为c值。 buffer_out_coords包含当前Z值，而不是原始的c值，因此这些是您想要开始的值，但您还需要原始的co-ords。

Theres只需要4次更改：

make buffer_out_coords_cl READ_WRITE
每次运行前将buffer_out_coords复制到buffer_out_coords_cl
通过“撤消”过滤buffer_out_coords和coord_map
在opencl代码中，从output_coord而不是coords

我没有获得与你提供的代码相同的输出，所以我不确定这里是否有其他错误但是这个改变给了我一致的输出：

1 chunk = 153052 unknown

PYOPENCL_COMPILER_OUTPUT=1 PYOPENCL_CTX='0' oclgrind python testmand.py
Iterations from iteration 0 to 500 for 200000 numbers
46948 done. 153052 unknown

5个块= 153052未知

PYOPENCL_COMPILER_OUTPUT=1 PYOPENCL_CTX='0' oclgrind python testmand.py
Iterations from iteration 0 to 100 for 200000 numbers
0 done. 200000 unknown
Iterations from iteration 100 to 200 for 200000 numbers
11181 done. 188819 unknown
Iterations from iteration 200 to 300 for 188819 numbers
9627 done. 179192 unknown
Iterations from iteration 300 to 400 for 179192 numbers
16878 done. 162314 unknown
Iterations from iteration 400 to 500 for 162314 numbers
9262 done. 153052 unknown

下面是代码：

import pyopencl as cl
import numpy as np
from PIL import Image
from decimal import Decimal

def mandel(ctx, x, y, zoom, max_iter=1000, iter_steps=1, width=500, height=500, use_double=False):
    mf = cl.mem_flags
    cl_queue = cl.CommandQueue(ctx)
    # build program
    code = """
    #if real_t == double
        #pragma OPENCL EXTENSION cl_khr_fp64 : enable
    #endif
    kernel void mandel(
        __global real_t *coords,
        __global uint *output,
        __global real_t *output_coord,
        const uint max_iter,
        const uint start_iter    
    ){
        uint id = get_global_id(0);         
        real_t2 my_coords = vload2(id, coords);           
        real_t2 my_value_coords = vload2(id, output_coord);           
        real_t x = my_value_coords.x;
        real_t y = my_value_coords.y;
        uint iter = 0;
        for(iter=start_iter; iter<max_iter; ++iter){
            if(x*x + y*y > 4.0f){
                break;
            }
            real_t xtemp = x*x - y*y + my_coords.x;
            y = 2*x*y + my_coords.y;
            x = xtemp;
        }
        // copy the current x,y pair back
        real_t2 val = (real_t2){x, y};
        vstore2(val, id, output_coord);
        output[id] = iter;
    }        
    """
    _cltype, _nptype = ("double",np.float64) if use_double else ("float", np.float32)
    prg = cl.Program(ctx, code).build("-cl-opt-disable -D real_t=%s -D real_t2=%s2" % (_cltype, _cltype))

    # Calculate the "viewport".
    x0 = x - ((Decimal(3) * zoom)/Decimal(2.))
    y0 = y - ((Decimal(2) * zoom)/Decimal(2.))
    x1 = x + ((Decimal(3) * zoom)/Decimal(2.))
    y1 = y + ((Decimal(2) * zoom)/Decimal(2.))

    # Create index map in x,y pairs
    xx = np.arange(0, width, 1, dtype=np.uint32)
    yy = np.arange(0, height, 1, dtype=np.uint32)
    index_map = np.dstack(np.meshgrid(xx, yy))
    # and local "coordinates" (real, imaginary parts)
    coord_map = np.ndarray(index_map.shape, dtype=_nptype)
    coord_map[:] = index_map
    coord_map[:] *= (_nptype((x1-x0)/Decimal(width)), _nptype((y1-y0)/Decimal(height)))
    coord_map[:] += (_nptype(x0), _nptype(y0))
    coord_map = coord_map.flatten()
    index_map = index_map.flatten().astype(dtype=np.uint32)
    # Create input and output buffer
    buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes)
    buffer_out = np.zeros(width*height, dtype=np.uint32) # This will contain the iteration values of that run
    buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes)
    buffer_out_coords = np.zeros(width*height*2, dtype=_nptype) # This the last x,y values
    buffer_out_coords_cl = cl.Buffer(ctx, mf.READ_WRITE, size=buffer_out_coords.nbytes)
    # 2D Buffer to collect the iterations needed per pixel 
    #iter_map = np.zeros(width*height, dtype=np.uint32).reshape((width, height)) #.reshape((height, width))
    iter_map = np.zeros(width*height, dtype=np.uint32).reshape((height, width))

    start_max_iter = 0
    to_do = coord_map.size / 2
    steps_size = int(max_iter / float(iter_steps))
    while to_do > 0 and start_max_iter < max_iter:
        end_max_iter = min(max_iter, start_max_iter + steps_size )
        print "Iterations from iteration %i to %i for %i numbers" % (start_max_iter, end_max_iter, to_do)

        # copy x/y pairs to device 
        cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do*2]).wait()        
        cl.enqueue_copy(cl_queue, buffer_out_coords_cl, buffer_out_coords[:to_do*2]).wait()        
        # and finally call the ocl function
        prg.mandel(cl_queue, (to_do,), None,
            buffer_in_cl,                   
            buffer_out_cl,
            buffer_out_coords_cl,
            np.uint32(end_max_iter),
            np.uint32(start_max_iter)
        ).wait()
        # Copy the output back
        cl.enqueue_copy(cl_queue, buffer_out_coords, buffer_out_coords_cl).wait()
        cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait()

        # Get indices of "found" escapes
        done = np.where(buffer_out[:to_do]<end_max_iter)[0]
        # and write the iterations to the coresponding cell
        index_reshaped = index_map[:to_do*2].reshape((to_do, 2))
        tmp = index_reshaped[done]
        iter_map[tmp[:,1], tmp[:,0]] = buffer_out[done]        
        #iter_map[tmp[:,0], tmp[:,1]] = buffer_out[done]        

        # Get the indices of non escapes
        undone = np.where(buffer_out[:to_do]==end_max_iter)[0]
        # and write them back to our "job" maps for the next loop
        tmp = buffer_out_coords[:to_do*2].reshape((to_do, 2))
        buffer_out_coords[:undone.size*2] = tmp[undone].flatten()
        tmp = coord_map[:to_do*2].reshape((to_do, 2))
        coord_map[:undone.size*2] = tmp[undone].flatten()
        index_map[:undone.size*2] = index_reshaped[undone].flatten()

        to_do = undone.size
        start_max_iter = end_max_iter
        print "%i done. %i unknown" % (done.size, undone.size)                            

    # simple coloring by modulo 255 on the iter_map
    return (iter_map % 255).astype(np.uint8).reshape((height, width))


if __name__ == '__main__':
    ctx = cl.create_some_context(interactive=True)
    img = mandel(ctx,
          x=Decimal("-0.7546546453361122021732941811"),
          y=Decimal("0.05020518634419688663435986387"),
          zoom=Decimal("0.0002046859427855630601247281079"),
          max_iter=2000,
          iter_steps=1,
          width=500,
          height=400,
          use_double=False
    )
    Image.fromarray(img).show()

未能在pyopencl中呈现mandelbrot

1 个答案: