Halide: computing a result image (2D buffer) from a stack of input images (3D buffer)

Date: 2017-06-28 18:44:02

Tags: c++ halide

I am a beginner with Halide. I want to compute a result image from a stack of input images, and I want to accelerate it on the GPU (currently I use CUDA as the target). For example, I want to compute the atan2 of 4 images as follows:

Func result;
result(x,y) = Halide::atan2(images(x,y,3) - images(x,y,1), images(x,y,0) - images(x,y,2));

I am running into a problem: the pipeline produces either an empty result array or a result array filled with garbage values.

Note that if I modify the function to process only a single image and pass in a two-dimensional input buffer, it produces the correct result; but as soon as I make the input array three-dimensional and pass in a stack of N images, something goes wrong.

Is there an error in my code? Do I need to do anything special when I pass in a 3D buffer (holding N images) but only iterate over x and y in the Halide function?

See the full code below:

#include "Halide.h"
#include <stdio.h>
#include <math.h>
#include <string>
#include <fstream>
#include <sstream>

using namespace Halide;

void generatePhaseImage(Halide::Buffer<float> image, float phi)
{
    for (int y=0; y<image.height(); y++)
    {
        for (int x=0; x<image.width(); x++)
        {
            image(x,y) = 128.0f + 128.0f * sin((2.0f*M_PI * static_cast<float>(x) / 64.0f) + phi);
        }
    }
}    

void writeBinaryFile(const std::string& filename, const Buffer<float>& image)
{
    std::ofstream f(filename.c_str(), std::ios::binary);

    std::cout << "Writing image of height " << image.height() << " ";
    std::cout << "and width " << image.width() << std::endl;

    for (int i=0; i<image.height(); i++)
    {
        for (int j=0; j<image.width(); j++)
        {
            f.write(reinterpret_cast<const char*>(&image(i,j)), sizeof(float));
        }
    }
}

int main(int argc, char **argv) 
{
    Var x, y, c, i, ii, xo, yo, xi, yi;

    int h = 100;
    int w = 100;

    Buffer<float> images(w, h, 4);

    for (int i=0; i<4; i++)
    {
        float phi = i * (2*M_PI / 4.);        
        generatePhaseImage(images.sliced(2, i), phi);
    }

    Func phaseStepping;
    phaseStepping(x,y) = Halide::atan2( images(x,y,3) - images(x,y,1), images(x,y,0) - images(x,y,2));

    for (int i=0; i<4; i++)
    {
        std::stringstream ss;
        ss << "image" << i <<".bin";
        writeBinaryFile(ss.str(), images.sliced(2, i));
    }

    Target target = get_host_target();
    target.set_feature(Target::CUDA);
    target.set_feature(Target::Debug);

    phaseStepping.gpu_tile(x, y, xo, yo, xi, yi, 8, 8);
    phaseStepping.compile_jit(target);

    Buffer<float> result(w,h);
    phaseStepping.realize(result);

    result.copy_to_host();  
    writeBinaryFile("result.bin", result);
}

The output of the code looks like this:

Writing image of height 100 and width 100
Writing image of height 100 and width 100
Writing image of height 100 and width 100
Writing image of height 100 and width 100
Entering Pipeline phaseStepping
 Input Buffer images: buffer(0, 0x0, 0x564812eefd80, 0, float32, {0, 100, 1}, {0, 100, 100}, {0, 4, 10000})
 Input (void *) __user_context: 0x7ffff380ac68
 Output Buffer phaseStepping: buffer(0, 0x0, 0x56481325b200, 0, float32, {0, 100, 1}, {0, 100, 100})
CUDA: halide_cuda_initialize_kernels (user_context: 0x0, state_ptr: 0x7f76cbb2b000, ptx_src: 0x7f76cbb28340, size: 6919
    load_libcuda (user_context: 0x0)
    Loaded CUDA runtime library: libcuda.so
    Got device 0
      GeForce GTX 560
      total memory: 959 MB
      max threads per block: 1024
      warp size: 32
      max block size: 1024 1024 64
      max grid size: 65535 65535 65535
      max shared memory per block: 49152
      max constant memory per block: 65536
      compute capability 2.1
      cuda cores: 7 x 48 = 48
    cuCtxCreate 0 -> 0x5648134ab8d0(3020)
    cuModuleLoadData 0x7f76cbb28340, 6919 -> 0x56481386f850
    Time: 1.857735e+00 ms
halide_copy_to_device 0x564812e02a68, host: 0x56481325b200, dev: 0, host_dirty: 0, dev_dirty: 0
halide_device_malloc: 0x564812e02a68 interface 0x7f76cbb340f0 host: 0x56481325b200, dev: 0, host_dirty: 0, dev_dirty:0 buf current interface: 0x0
CUDA: halide_cuda_device_malloc (user_context: 0x7ffff380ac68, buf: 0x564812e02a68)
    allocating buffer(0, 0x0, 0x56481325b200, 0, float32, {0, 100, 1}, {0, 100, 100})
    cuMemAlloc 40000 -> 0x501700000
    Time: 1.549260e-01 ms
halide_copy_to_device 0x564812dc6418, host: 0x564812eefd80, dev: 0, host_dirty: 0, dev_dirty: 0
halide_device_malloc: 0x564812dc6418 interface 0x7f76cbb340f0 host: 0x564812eefd80, dev: 0, host_dirty: 0, dev_dirty:0 buf current interface: 0x0
CUDA: halide_cuda_device_malloc (user_context: 0x7ffff380ac68, buf: 0x564812dc6418)
    allocating buffer(0, 0x0, 0x564812eefd80, 0, float32, {0, 100, 1}, {0, 100, 100}, {0, 4, 10000})
    cuMemAlloc 160000 -> 0x501800000
    Time: 1.099330e-01 ms
CUDA: halide_cuda_run (user_context: 0x7ffff380ac68, entry: kernel_phaseStepping_s0_y_yo___block_id_y, blocks: 13x13x1, threads: 8x8x1, shmem: 0
Got context.
Got module 0x56481386f850
Got function 0x56481387b2b0
    halide_cuda_run 0 4 [0x6400000064 ...] 0
    halide_cuda_run 1 4 [0x64 ...] 0
    halide_cuda_run 2 4 [0x0 ...] 0
    halide_cuda_run 3 4 [0x6400000000 ...] 0
    halide_cuda_run 4 4 [0x4000000064 ...] 0
    halide_cuda_run 5 8 [0x501800000 ...] 1
    halide_cuda_run 6 8 [0x501700000 ...] 1
    halide_cuda_run translated arg5 [0x501800000 ...]
    halide_cuda_run translated arg6 [0x501700000 ...]
    Time: 4.394600e-02 ms
Exiting Pipeline phaseStepping
halide_copy_to_host 0x564812e02a68
copy_to_host_already_locked 0x564812e02a68 dev_dirty is true
CUDA: halide_cuda_copy_to_host (user_context: 0x0, buf: 0x564812e02a68)
c.extent[0] = 100
c.extent[1] = 100
    cuMemcpyDtoH 0x501700000 -> 0x56481325b200, 40000 bytes
    Time: 2.062520e-01 ms
Writing image of height 100 and width 100
halide_device_free: 0x564812e02a68 buf dev 21498953728 interface 0x7f76cbb340f0
CUDA: halide_cuda_device_free (user_context: 0x0, buf: 0x564812e02a68)
    cuMemFree 0x501700000
    Time: 7.846700e-02 ms
halide_device_free: 0x564812dc6418 buf dev 21500002304 interface 0x7f76cbb340f0
CUDA: halide_cuda_device_free (user_context: 0x0, buf: 0x564812dc6418)
    cuMemFree 0x501800000
    Time: 8.416100e-02 ms

1 Answer:

Answer 0 (score: 0)

Based on the comments under the updated question, I'm fairly sure the problem is that the input buffer is not being marked host dirty, and hence it never gets copied to the GPU. This is somewhat of a bug in the buffer logic, but I'm not sure we can fix it. Calling "set_host_dirty" on the buffer explicitly should resolve the issue.
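
A minimal sketch of what that change might look like in the question's main(), using the names from the code above (the exact placement, right before realizing the pipeline, is an assumption):

// Assumption: the host-side writes done by generatePhaseImage() are not tracked,
// so mark the input buffer dirty; Halide then copies it to the GPU before the kernel runs.
images.set_host_dirty();

phaseStepping.realize(result);
result.copy_to_host();

If this is indeed the issue, the debug output should then also show a host-to-device copy of the 160000-byte input allocation before the kernel launch, rather than just the cuMemAlloc seen above.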