在相互独立的 Pipeline Func 之间调度公共循环

时间:2017-09-06 19:47:23

标签: halide

我有许多 Halide 管道(pipeline,小写 p),它们都读取同一张输入图像,并各自产生不同的输出。有些输出的维度相同,有些则不同。每个管道都会把源图像中的每个像素读取一次。需要哪些输出图像可能会在运行时根据用户输入而变化。

我正在使用 Pipeline 把所有这些输出一次性计算成一个 Realization。有没有办法调度(schedule)这些不同的 Func,使它们在 Pipeline 中共用同一个外层循环?

看来我可以创建一个包装 Func,把这些 Func 打包成一个 Tuple,但这要求它们的输出维度全都相同。

我错过了其他选择吗?

已修改为添加示例代码

// Example: two per-scanline histogram Funcs that read the same input image
// and are realized together through a single Pipeline.
//
// The input is set up roughly as follows (left commented out in the question):
//Buffer<> input = Buffer<uint8_t>::make_interleaved(width, height, 4);
//fill buffer with image data

// Pure variables: x and y index the 2-D domain, c indexes the channel.
Var x("x"), y("y"), c("c");

// rgb is a saturating cast of the input to uint8.
// NOTE(review): input appears to already be uint8 (see the commented-out
// declaration above), so this presumably only pins the type -- confirm.
Func rgb("rgb");
rgb(x,y,c) = ConciseCasts::u8_sat(input(x,y,c));

// Define a one-dimensional reduction domain over x
// (it spans [0, input.width()), i.e. one full scanline).
RDom r(0, input.width());

// hist1 counts channel 0 only; hist2 counts each channel c separately.
Func hist1("hist1");
Func hist2("hist2");

// Histogram buckets start as zero.
hist1(x,y) = 0;
hist2(x,y,c) = 0;

// Make a histogram for every scanline of input
// The pixel value rgb(r, y, ...) selects the bucket (first coordinate) to increment.
hist1(rgb(r, y, 0), y ) += 1;
hist2(rgb(r, y, c), y, c) += 1;

// clamp1 / clamp2 are the pipeline outputs: the histogram counts
// saturated to uint8.
Func clamp1("clamp1");
clamp1(x,y) = ConciseCasts::u8_sat(hist1(x,y));

Func clamp2("clamp2");
clamp2(x,y,c) = ConciseCasts::u8_sat(hist2(x,y,c));


//use clamp1 as a wrapper
// hist1 is computed inside clamp1's y loop, one scanline at a time.
hist1.compute_at(clamp1, y);

//schedule hist2 the same way (but unroll c)
hist2.compute_at(clamp2, y);

// Fix the channel extent to 3 and make c the innermost, fully unrolled loop
// of the output stage.
clamp2.bound(c,0,3).reorder(c, x, y).unroll(c);

// Same treatment for hist2: the pure (initialisation) stage first, then
// update stage 0, where the reduction variable r takes the place of x.
hist2.bound(c,0,3).reorder(c, x, y).unroll(c);
hist2.update(0).reorder(c, r, y).unroll(c);

// Both outputs have 256 buckets along x (one per possible uint8 value)
// and one row per input scanline.
clamp1
.bound(x, 0, 256)
.bound(y, 0, input.height());

clamp2
.bound(x, 0, 256)
.bound(y, 0, input.height());


// Realize both outputs through one Pipeline. With this schedule each output
// still gets its own separate outer y loop.
Pipeline pipe = Pipeline({clamp1, clamp2});

查看 lowering 之后的语句(lowered statement),我看到的是:

produce clamp1 {
    for (clamp1.s0.y, 0, 2160) {
      allocate hist1[int32 * 256 * 1]
      produce hist1 {
        for (hist1.s0.x, 0, 256) {
          hist1[hist1.s0.x] = 0
        }
        for (hist1.s1.r4$x, 0, 4096) {
          hist1[int32(b0[((hist1.s1.r4$x*4) + (clamp1.s0.y*16384))])] = (hist1[int32(b0[((hist1.s1.r4$x*4) + (clamp1.s0.y*16384))])] + 1)
        }
      }
      for (clamp1.s0.x, 0, 256) {
        clamp1[((clamp1.s0.x + (clamp1.s0.y*clamp1.stride.1)) - (clamp1.min.0 + (clamp1.min.1*clamp1.stride.1)))] = uint8(max(min(hist1[clamp1.s0.x], 255), 0))
      }
      free hist1
    }
  }

  produce clamp2 {
    for (clamp2.s0.y, 0, 2160) {
      allocate hist2[int32 * 256 * 1 * 3]
      produce hist2 {
        for (hist2.s0.x, 0, 256) {
          hist2[hist2.s0.x] = 0
          hist2[(hist2.s0.x + 256)] = 0
          hist2[(hist2.s0.x + 512)] = 0
        }
        for (hist2.s1.r4$x, 0, 4096) {
          hist2[int32(b0[((hist2.s1.r4$x*4) + (clamp2.s0.y*16384))])] = (hist2[int32(b0[((hist2.s1.r4$x*4) + (clamp2.s0.y*16384))])] + 1)
          hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 1)]) + 256)] = (hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 1)]) + 256)] + 1)
          hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 2)]) + 512)] = (hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 2)]) + 512)] + 1)
        }
      }
      for (clamp2.s0.x, 0, 256) {
        clamp2[((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[clamp2.s0.x], 255), 0))
        clamp2[(((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) + clamp2.stride.2) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[(clamp2.s0.x + 256)], 255), 0))
        clamp2[(((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) + (clamp2.stride.2*2)) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[(clamp2.s0.x + 512)], 255), 0))
      }
      free hist2
    }
  }
}

我希望得到的是下面这样的 lowered statement(这只是我手工剪切、粘贴拼出来的):

produce clamps {
    for (clamp1.s0.y, 0, 2160) {
      allocate hist1[int32 * 256 * 1]
      allocate hist2[int32 * 256 * 1 * 3]
      produce hists {
        for (hist1.s0.x, 0, 256) {
          hist1[hist1.s0.x] = 0
          hist2[hist2.s0.x] = 0
          hist2[(hist2.s0.x + 256)] = 0
          hist2[(hist2.s0.x + 512)] = 0
        }
        for (hist1.s1.r4$x, 0, 4096) {
          hist1[int32(b0[((hist1.s1.r4$x*4) + (clamp1.s0.y*16384))])] = (hist1[int32(b0[((hist1.s1.r4$x*4) + (clamp1.s0.y*16384))])] + 1)
          hist2[int32(b0[((hist2.s1.r4$x*4) + (clamp2.s0.y*16384))])] = (hist2[int32(b0[((hist2.s1.r4$x*4) + (clamp2.s0.y*16384))])] + 1)
          hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 1)]) + 256)] = (hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 1)]) + 256)] + 1)
          hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 2)]) + 512)] = (hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 2)]) + 512)] + 1)
        }
      }
      for (clamp1.s0.x, 0, 256) {
        clamp1[((clamp1.s0.x + (clamp1.s0.y*clamp1.stride.1)) - (clamp1.min.0 + (clamp1.min.1*clamp1.stride.1)))] = uint8(max(min(hist1[clamp1.s0.x], 255), 0))
         clamp2[((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[clamp2.s0.x], 255), 0))
        clamp2[(((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) + clamp2.stride.2) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[(clamp2.s0.x + 256)], 255), 0))
        clamp2[(((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) + (clamp2.stride.2*2)) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[(clamp2.s0.x + 512)], 255), 0))
      }
      free hist1
      free hist2
    }
  }

但是,如果我尝试添加

clamp2.compute_with(clamp1, y); 

在 JIT 编译时会出现以下内部错误:

Internal error at /Halide/src/ScheduleFunctions.cpp:2228
Condition failed: injector.found_store_level && injector.found_compute_level

1 个答案:

答案 0 :(得分:0)

这很可能是 compute_with 的又一个用例,不过该功能目前尚未合并到主分支。你可以试试 compute_with_directive 分支,看看它是否能满足你的需求。希望它很快就能合并。