我有许多 Halide 管道(小写 p 的 pipeline),它们都读取同一幅输入图像,并各自产生不同的输出。有些管道的输出维度相同,有些则不同。每个管道都会对源图像中的每个像素读取一次。需要哪些输出图像可能会在运行时根据用户输入而变化。
我正在使用 Pipeline 将所有这些输出计算为一个 Realization。有没有办法调度(schedule)这些不同的 Func,使它们在管道中共用同一个外层循环?
看来我可以创建一个包装 Func,把这些 Func 打包成一个 Tuple,但这要求它们的输出维度都相同。
我错过了其他选择吗?
编辑:补充了示例代码。
// Example: two per-scanline histogram Funcs that read the same input image
// and are realized together as the two outputs of a single Pipeline.
//
// The input buffer is assumed to be created and filled elsewhere, e.g.:
//Buffer<> input = Buffer<uint8_t>::make_interleaved(width, height, 4);
//fill buffer with image data
Var x("x"), y("y"), c("c");
// rgb is the input passed through a saturating cast to uint8.
Func rgb("rgb");
rgb(x,y,c) = ConciseCasts::u8_sat(input(x,y,c));
// Define a one-dimensional reduction domain covering [0, input.width()),
// i.e. every pixel position along a scanline.
RDom r(0, input.width());
Func hist1("hist1");
Func hist2("hist2");
// Histogram buckets start as zero (pure definitions).
hist1(x,y) = 0;
hist2(x,y,c) = 0;
// Make a histogram for every scanline of input (update definitions):
// hist1 counts the values of channel 0 only; hist2 counts each channel c
// separately. The pixel value selects the bucket along the first dimension.
hist1(rgb(r, y, 0), y ) += 1;
hist2(rgb(r, y, c), y, c) += 1;
// clamp1/clamp2 are the pipeline outputs: the histogram counts saturated
// to uint8.
Func clamp1("clamp1");
clamp1(x,y) = ConciseCasts::u8_sat(hist1(x,y));
Func clamp2("clamp2");
clamp2(x,y,c) = ConciseCasts::u8_sat(hist2(x,y,c));
// Use clamp1 as a wrapper: hist1 is computed inside clamp1's y loop, so one
// scanline's histogram is produced per iteration of that loop.
hist1.compute_at(clamp1, y);
// Schedule hist2 the same way inside clamp2's y loop (but unroll c).
hist2.compute_at(clamp2, y);
// Fix c to the range [0, 3), make it the innermost loop and unroll it, for
// the output, for the histogram's pure definition, and for its update.
clamp2.bound(c,0,3).reorder(c, x, y).unroll(c);
hist2.bound(c,0,3).reorder(c, x, y).unroll(c);
hist2.update(0).reorder(c, r, y).unroll(c);
// Both outputs have 256 buckets in x and one row per input scanline in y.
clamp1
.bound(x, 0, 256)
.bound(y, 0, input.height());
clamp2
.bound(x, 0, 256)
.bound(y, 0, input.height());
// A single Pipeline with both Funcs as outputs. As scheduled here, each
// output still gets its own separate loop nest over y.
Pipeline pipe = Pipeline({clamp1, clamp2});
查看 lowering 之后的语句(lowered statement),我看到:
produce clamp1 {
for (clamp1.s0.y, 0, 2160) {
allocate hist1[int32 * 256 * 1]
produce hist1 {
for (hist1.s0.x, 0, 256) {
hist1[hist1.s0.x] = 0
}
for (hist1.s1.r4$x, 0, 4096) {
hist1[int32(b0[((hist1.s1.r4$x*4) + (clamp1.s0.y*16384))])] = (hist1[int32(b0[((hist1.s1.r4$x*4) + (clamp1.s0.y*16384))])] + 1)
}
}
for (clamp1.s0.x, 0, 256) {
clamp1[((clamp1.s0.x + (clamp1.s0.y*clamp1.stride.1)) - (clamp1.min.0 + (clamp1.min.1*clamp1.stride.1)))] = uint8(max(min(hist1[clamp1.s0.x], 255), 0))
}
free hist1
}
}
produce clamp2 {
for (clamp2.s0.y, 0, 2160) {
allocate hist2[int32 * 256 * 1 * 3]
produce hist2 {
for (hist2.s0.x, 0, 256) {
hist2[hist2.s0.x] = 0
hist2[(hist2.s0.x + 256)] = 0
hist2[(hist2.s0.x + 512)] = 0
}
for (hist2.s1.r4$x, 0, 4096) {
hist2[int32(b0[((hist2.s1.r4$x*4) + (clamp2.s0.y*16384))])] = (hist2[int32(b0[((hist2.s1.r4$x*4) + (clamp2.s0.y*16384))])] + 1)
hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 1)]) + 256)] = (hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 1)]) + 256)] + 1)
hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 2)]) + 512)] = (hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 2)]) + 512)] + 1)
}
}
for (clamp2.s0.x, 0, 256) {
clamp2[((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[clamp2.s0.x], 255), 0))
clamp2[(((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) + clamp2.stride.2) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[(clamp2.s0.x + 256)], 255), 0))
clamp2[(((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) + (clamp2.stride.2*2)) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[(clamp2.s0.x + 512)], 255), 0))
}
free hist2
}
}
}
我希望得到的 lowered 语句如下(只是我手工剪切粘贴拼出来的):
produce clamps {
for (clamp1.s0.y, 0, 2160) {
allocate hist1[int32 * 256 * 1]
allocate hist2[int32 * 256 * 1 * 3]
produce hists {
for (hist1.s0.x, 0, 256) {
hist1[hist1.s0.x] = 0
hist2[hist2.s0.x] = 0
hist2[(hist2.s0.x + 256)] = 0
hist2[(hist2.s0.x + 512)] = 0
}
for (hist1.s1.r4$x, 0, 4096) {
hist1[int32(b0[((hist1.s1.r4$x*4) + (clamp1.s0.y*16384))])] = (hist1[int32(b0[((hist1.s1.r4$x*4) + (clamp1.s0.y*16384))])] + 1)
hist2[int32(b0[((hist2.s1.r4$x*4) + (clamp2.s0.y*16384))])] = (hist2[int32(b0[((hist2.s1.r4$x*4) + (clamp2.s0.y*16384))])] + 1)
hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 1)]) + 256)] = (hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 1)]) + 256)] + 1)
hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 2)]) + 512)] = (hist2[(int32(b0[(((hist2.s1.r4$x*4) + (clamp2.s0.y*16384)) + 2)]) + 512)] + 1)
}
}
for (clamp1.s0.x, 0, 256) {
clamp1[((clamp1.s0.x + (clamp1.s0.y*clamp1.stride.1)) - (clamp1.min.0 + (clamp1.min.1*clamp1.stride.1)))] = uint8(max(min(hist1[clamp1.s0.x], 255), 0))
clamp2[((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[clamp2.s0.x], 255), 0))
clamp2[(((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) + clamp2.stride.2) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[(clamp2.s0.x + 256)], 255), 0))
clamp2[(((clamp2.s0.x + (clamp2.s0.y*clamp2.stride.1)) + (clamp2.stride.2*2)) - ((clamp2.min.0 + (clamp2.min.1*clamp2.stride.1)) + (clamp2.min.2*clamp2.stride.2)))] = uint8(max(min(hist2[(clamp2.s0.x + 512)], 255), 0))
}
free hist1
free hist2
}
}
但是,如果我尝试添加下面这行调度:
// Attempted schedule: request that clamp2 be computed together with clamp1,
// fused at clamp1's y loop. This is the line that triggers the internal error.
clamp2.compute_with(clamp1, y);
JIT 编译时会出现以下错误:
Internal error at /Halide/src/ScheduleFunctions.cpp:2228
Condition failed: injector.found_store_level && injector.found_compute_level
答案 0 :(得分:0)
这可能是 compute_with 的又一个用例,但该功能目前尚未合并。您可以试用 compute_with_directive 分支,看看它是否满足您的需求。希望它能很快合并。