我尝试在Halide中实现光流算法,但在更新u和v向量时遇到了一些问题。这是我的C++版本:
for(int i= 0; i<h; i++) {
for(int j= 0; j<bpl; j++) {
float iix = Ix[i*bpl+j];
float iiy = Iy[i*bpl+j];
float iit = It[i*bpl+j];
for(int k=0; k<40; k++) {
float Uav = (u[(i-1)*bpl+j] + u[(i+1)*bpl+j] + u[i*bpl+j-1] + u[i*bpl+j+1])/4;
float Vav = (v[(i-1)*bpl+j] + v[(i+1)*bpl+j] + v[i*bpl+j-1] + v[i*bpl+j+1])/4;
float P = iix*Uav + iiy*Vav + iit;
float D = iix*iix + iiy*iiy + lambda*lambda;
float tmp = P/D;
float utmp = Uav - iix*tmp;
float vtmp = Vav - iiy*tmp;
u[i*bpl+j] = utmp;
v[i*bpl+j] = vtmp;
}
}
}
这是我的Halide实现:
// First Halide attempt at the flow update: u and v start at zero and the C++
// loop below tries to apply the update step 40 times.
Func u("u"), v("v");
Func Uav("Uav"), Vav("Vav"), P("P"), D("D"), tmp("tmp"), utmp("utmp"), vtmp("vtmp");
// Reduction domain covering the whole input image (width x height).
RDom r_0(0, input_1.width(), 0, input_1.height());
// Pure (initial) definitions of the flow fields.
u(x, y, c) = 0;
v(x, y, c) = 0;
// NOTE(review): this loop runs while the pipeline is being built. Every pass
// issues definitions on the same Func objects (Uav, Vav, P, D, tmp, utmp, vtmp,
// u, v) with the same variables; no new stage is created per iteration.
for(int k=0; k<40; k++) {
// Four-neighbour averages of u and v.
Uav (x, y, c) = (u(x, y-1, c) + u(x, y+1, c) + u(x-1, y, c) + u(x+1, y, c))/4;
Vav (x, y, c) = (v(x, y-1, c) + v(x, y+1, c) + v(x-1, y, c) + v(x+1, y, c))/4;
// Numerator and denominator of the correction term.
P (x, y, c) = Ix(x, y, c) * Uav(x, y, c) + Iy(x, y, c) * Vav(x, y, c) + It(x, y, c);
D (x, y, c) = Ix(x, y, c) * Ix(x, y, c) + Iy(x, y, c) * Iy(x, y, c) + lambda * lambda;
tmp (x, y, c) = P(x, y, c)/D(x, y, c);
utmp(x, y, c) = Uav(x, y, c) - Ix(x, y, c) * tmp(x, y, c);
vtmp(x, y, c) = Vav(x, y, c) - Iy(x, y, c) * tmp(x, y, c);
// Update definitions on u and v. At this point u and v are already referenced
// by Uav and Vav above, which is the situation the reported runtime error
// describes. Also note the left-hand side is indexed by r_0.x, r_0.y while the
// right-hand side still uses the pure variables x, y.
u(r_0.x, r_0.y, c) = utmp(x, y, c);
v(r_0.x, r_0.y, c) = vtmp(x, y, c);
}
当我运行程序时,出现以下运行时错误:
错误:Func u不能再添加新的更新定义,因为它已经被实现(realize),或已在另一个Func的定义中被使用。已中止(核心已转储)
感谢您的回复。
正如AhiyaHiya所说,变量x、y、c被声明为:
// Pure Halide variables: x and y index the image plane, c the channel.
Var x("x"), y("y"), c("c");
正如您所建议的,我使用了一个extern C++函数来更新我的Halide函数。这是我的外部函数:
// Extern function intended to run the 40-step flow update on Halide buffers.
//
// Ix, Iy, It : buffers holding the image derivatives
// u, v       : buffers holding the flow fields (read and written in place)
// bpl, h     : offsets added to each buffer's min[0] / min[1] to form the
//              coordinate that is accessed
// lambda     : regularisation weight (enters as lambda*lambda)
// IsU        : non-zero selects the u result, zero the v result
//
// As written, the function touches exactly one location per buffer, namely
// (min[0] + bpl, min[1] + h); there is no loop over pixels.
//
// NOTE(review): Halide's define_extern documentation describes extern stages
// as returning an int status and receiving the output buffer as a trailing
// buffer_t* argument. This signature returns a buffer_t by value and has no
// output parameter - confirm against the documentation.
// NOTE(review): all offsets are added to `host` directly with element strides;
// this presumably assumes 1-byte elements (legacy buffer_t exposes host as a
// byte pointer plus an elem_size field) - TODO confirm for float data.
extern "C" DLLEXPORT buffer_t compute_flow(buffer_t *Ix, buffer_t *Iy, buffer_t *It, buffer_t *u, buffer_t *v,
const int32_t bpl, const int32_t h, const float lambda, const uint8_t IsU) {
// Ix: mins, strides and the accessed coordinate (min + bpl, min + h)
const auto min0_ix = Ix->min[0];
const auto internalX_ix = min0_ix;
const auto min1_ix = Ix->min[1];
const auto internalY_ix = min1_ix;
const auto stride0_ix = Ix->stride[0];
const auto stride1_ix = Ix->stride[1];
const auto x_ix = bpl + internalX_ix;
const auto y_ix = h + internalY_ix;
// Iy: same layout bookkeeping as for Ix
const auto min0_iy = Iy->min[0];
const auto internalX_iy = min0_iy;
const auto min1_iy = Iy->min[1];
const auto internalY_iy = min1_iy;
const auto stride0_iy = Iy->stride[0];
const auto stride1_iy = Iy->stride[1];
const auto x_iy = bpl + internalX_iy;
const auto y_iy = h + internalY_iy;
// It: same layout bookkeeping as for Ix
const auto min0_it = It->min[0];
const auto internalX_it = min0_it;
const auto min1_it = It->min[1];
const auto internalY_it = min1_it;
const auto stride0_it = It->stride[0];
const auto stride1_it = It->stride[1];
const auto x_it = bpl + internalX_it;
const auto y_it = h + internalY_it;
// Single derivative samples; the offset from host works out to
// bpl * stride0 + h * stride1 for each buffer.
const auto iix = *(Ix->host + (x_ix - min0_ix) * stride0_ix + (y_ix - min1_ix) * stride1_ix);
const auto iiy = *(Iy->host + (x_iy - min0_iy) * stride0_iy + (y_iy - min1_iy) * stride1_iy);
const auto iit = *(It->host + (x_it - min0_it) * stride0_it + (y_it - min1_it) * stride1_it);
// u: mins, strides and the accessed coordinate
const auto min0_u = u->min[0];
const auto internalX_u = min0_u;
const auto min1_u = u->min[1];
const auto internalY_u = min1_u;
const auto stride0_u = u->stride[0];
const auto stride1_u = u->stride[1];
const auto x_u = bpl + internalX_u;
const auto y_u = h + internalY_u;
// v: mins, strides and the accessed coordinate
const auto min0_v = v->min[0];
const auto internalX_v = min0_v;
const auto min1_v = v->min[1];
const auto internalY_v = min1_v;
const auto stride0_v = v->stride[0];
const auto stride1_v = v->stride[1];
const auto x_v = bpl + internalX_v;
const auto y_v = h + internalY_v;
// NOTE(review): these pointers are never assigned anywhere in this function,
// yet they are dereferenced inside the loop and in the return statements.
buffer_t *uResult, *vResult;
// 40 update steps, all applied to the same single location.
for(int k=0; k<40; k++) {
// Four neighbours of the accessed location in u and v.
const auto u0 = *(u->host + (x_u - min0_u) * stride0_u + (y_u - 1 - min1_u) * stride1_u); //u[(i-1)*bpl+j]
const auto u1 = *(u->host + (x_u - min0_u) * stride0_u + (y_u + 1 - min1_u) * stride1_u); //u[(i+1)*bpl+j]
const auto u2 = *(u->host + (x_u - 1 - min0_u) * stride0_u + (y_u - min1_u) * stride1_u); //u[i*bpl+j-1]
const auto u3 = *(u->host + (x_u + 1 - min0_u) * stride0_u + (y_u - min1_u) * stride1_u); //u[i*bpl+j+1]
const auto v0 = *(v->host + (x_v - min0_v) * stride0_v + (y_v - 1 - min1_v) * stride1_v); //v[(i-1)*bpl+j]
const auto v1 = *(v->host + (x_v - min0_v) * stride0_v + (y_v + 1 - min1_v) * stride1_v); //v[(i+1)*bpl+j]
const auto v2 = *(v->host + (x_v - 1 - min0_v) * stride0_v + (y_v - min1_v) * stride1_v); //v[i*bpl+j-1]
const auto v3 = *(v->host + (x_v + 1 - min0_v) * stride0_v + (y_v - min1_v) * stride1_v); //v[i*bpl+j+1]
// Neighbour averages and the correction term, mirroring the reference C++ loop.
const auto Uav = (u0 + u1 + u2 + u3)/4;
const auto Vav = (v0 + v1 + v2 + v3)/4;
const auto P = iix*Uav + iiy*Vav + iit;
const auto D = iix*iix + iiy*iiy + lambda*lambda;
const auto tmp = P/D;
const auto utmp = Uav - iix*tmp;
const auto vtmp = Vav - iiy*tmp;
// Write the new estimates back into the input flow buffers.
*(u->host + (x_u - min0_u) * stride0_u + (y_u - min1_u) * stride1_u) = utmp; //u[i*bpl+j]
*(v->host + (x_v - min0_v) * stride0_v + (y_v - min1_v) * stride1_v) = vtmp; //v[i*bpl+j]
// Store into the selected result buffer (see the note on uResult/vResult above).
if(IsU)
*(uResult->host + (x_u - min0_u) * stride0_u + (y_u - min1_u) * stride1_u) = utmp;
else
*(vResult->host + (x_v - min0_v) * stride0_v + (y_v - min1_v) * stride1_v) = vtmp;
}
// Return a copy of the selected result descriptor.
if(IsU) return *uResult;
else return *vResult;
}
在我的main函数中,我这样调用它:
// Regularisation weight used by the flow update.
const float lambda = 0.05;
// The two input frames.
// NOTE(review): both images are loaded from argv[1]; a second frame would
// presumably come from a different argument - confirm.
Image<uint8_t> input_1 = load_image(argv[1]);
Image<uint8_t> input_2 = load_image(argv[1]);
Var x("x"); // image index in x direction
Var y("y"); // image index in y direction
Var c("c"); // image channel index
// Clamp reads outside the image to the nearest edge pixel.
Func clamped_1("clamped_1"), clamped_2("clamped_2");
clamped_1 = BoundaryConditions::repeat_edge(input_1);
clamped_2 = BoundaryConditions::repeat_edge(input_2);
// Convert the RGB images to grayscale. The right-hand side does not depend on
// c, so every channel of f_1 / f_2 holds the same value.
Func f_1("f_1"), f_2("f_2");
f_1(x,y,c) = min(0.299f * clamped_1(x,y,0) + 0.587f * clamped_1(x,y,1) + 0.114f * clamped_1(x,y,2), 255.0f);
f_2(x,y,c) = min(0.299f * clamped_2(x,y,0) + 0.587f * clamped_2(x,y,1) + 0.114f * clamped_2(x,y,2), 255.0f);
// 5x5 Gaussian blur kernel. The first entry of row 0 is kernel(0, 0); it was
// previously written to kernel(0, 1) and immediately overwritten, which left
// kernel(0, 0) unset.
Image<float> kernel(5, 5);
kernel(0, 0) = 0.000067; kernel(0, 1) = 0.001663; kernel(0, 2) = 0.004706; kernel(0, 3) = 0.001663; kernel(0, 4) = 0.000067;
kernel(1, 0) = 0.001663; kernel(1, 1) = 0.041482; kernel(1, 2) = 0.117381; kernel(1, 3) = 0.041482; kernel(1, 4) = 0.001663;
kernel(2, 0) = 0.004706; kernel(2, 1) = 0.117381; kernel(2, 2) = 0.332152; kernel(2, 3) = 0.117381; kernel(2, 4) = 0.004706;
kernel(3, 0) = 0.001663; kernel(3, 1) = 0.041482; kernel(3, 2) = 0.117381; kernel(3, 3) = 0.041482; kernel(3, 4) = 0.001663;
kernel(4, 0) = 0.000067; kernel(4, 1) = 0.001663; kernel(4, 2) = 0.004706; kernel(4, 3) = 0.001663; kernel(4, 4) = 0.000067;
// Blur both grayscale frames by summing kernel-weighted taps.
// NOTE(review): the taps are read at x + r.x, y + r.y with r ranging over the
// kernel's own indices (presumably 0..4), so the window is not centred on
// (x, y) - confirm whether an offset of -2 was intended.
RDom r(kernel);
Func I1("I1"), I2("I2");
I1(x, y, c) = sum(f_1(x+r.x, y+r.y, c) * kernel(r.x, r.y));
I2(x, y, c) = sum(f_2(x+r.x, y+r.y, c) * kernel(r.x, r.y));
// Derivatives of the inputs, built from 2x2 neighbourhoods of both blurred frames.
Func Ix("Ix"), Iy("Iy"), It("It");
Ix(x, y, c) = (-I1(x-1, y-1, c) + I1(x, y-1, c) - I1(x-1, y, c) + I1(x, y, c)) +
              (-I2(x-1, y-1, c) + I2(x, y-1, c) - I2(x-1, y, c) + I2(x, y, c));
Iy(x, y, c) = (-I1(x-1, y-1, c) - I1(x, y-1, c) + I1(x-1, y, c) + I1(x, y, c)) +
              (-I2(x-1, y-1, c) - I2(x, y-1, c) + I2(x-1, y, c) + I2(x, y, c));
// NOTE(review): It negates both the I1 sum and the I2 sum; a frame-to-frame
// difference would subtract one sum from the other - confirm the intended sign.
It(x, y, c) = (-I1(x-1, y-1, c) - I1(x, y-1, c) - I1(x-1, y, c) - I1(x, y, c)) -
              ( I2(x-1, y-1, c) + I2(x, y-1, c) + I2(x-1, y, c) + I2(x, y, c));
// Flow fields, initialised to zero everywhere.
Func u("u"), v("v");
u(x, y, c) = 0; v(x, y, c) = 0;
// Two extern stages that both call compute_flow; they differ only in the last
// argument (1 selects u, 0 selects v).
Func callU("callU"), callV("callV");
// Arguments for the u variant: derivative Funcs, flow Funcs, image size,
// lambda and the IsU flag.
vector<ExternFuncArgument> argsU(9);
argsU[0] = Ix; argsU[1] = Iy; argsU[2] = It;
argsU[3] = u; argsU[4] = v; argsU[5] = input_1.width();
argsU[6] = input_1.height(); argsU[7] = lambda; argsU[8] = 1;
// Arguments for the v variant.
vector<ExternFuncArgument> argsV(9);
argsV[0] = Ix; argsV[1] = Iy; argsV[2] = It;
argsV[3] = u; argsV[4] = v; argsV[5] = input_1.width();
argsV[6] = input_1.height(); argsV[7] = lambda; argsV[8] = 0;
// One type per compute_flow argument.
// NOTE(review): in Func::define_extern the type list describes the OUTPUT
// type(s) of the extern Func and the final integer its dimensionality. Here
// nine types and a dimensionality of 1 are passed, while callU / callV are
// called with three coordinates below - confirm against the define_extern
// documentation.
vector<Type> types(9);
types[0] = Ix.output_types()[0]; types[1] = Iy.output_types()[0]; types[2] = It.output_types()[0];
types[3] = u.output_types()[0]; types[4] = v.output_types()[0]; types[5] = Int(32);
types[6] = Int(32); types[7] = Float(32); types[8] = UInt(8);
callU.define_extern("compute_flow", argsU, types, 1);
callV.define_extern("compute_flow", argsV, types, 1);
// Wrapper Funcs that forward to the extern stages.
Func outputU("outputU"), outputV("outputV");
outputU(x, y, c) = callU(x, y, c);
outputV(x, y, c) = callV(x, y, c);
// Schedule: the derivative stages are computed at root level. u and v are not
// scheduled here.
Ix.compute_root();
Iy.compute_root();
It.compute_root();
// JIT-compile both pipelines and realize them over the full input extent.
outputU.compile_jit();
outputV.compile_jit();
Image<uint8_t> out_u = outputU.realize(input_1.width(), input_1.height(), input_1.channels());
Image<uint8_t> out_v = outputV.realize(input_1.width(), input_1.height(), input_1.channels());
当我不对u和v进行调度时,编译一切正常,但是我得到了这个运行时错误:
错误:Func u不能被调度为内联计算,因为它被外部计算的函数callU所使用。已中止(核心已转储)
但是,当我将u和v调度为:
// Schedule u and v at root level instead of leaving them inlined.
u.compute_root();
v.compute_root();
我收到以下运行时错误:
/home/rokiatou/Documents/Thèse/halide/Halide-master/src/BoundsInference.cpp:283 内部错误,条件失败:b.empty() || b.size() == func_args.size() 已中止(核心已转储)
我不确定我的外部函数compute_flow是否定义良好。我无法解决调度问题。
欢迎任何帮助。谢谢。
答案 0 :(得分:2)
我假设变量x、y、c被声明为Halide::Var;如果是这种情况,那么上面列出的错误信息实际上是准确的。
您可以使用C++的for循环向Halide::Func添加更新定义,但是需要使用常规的C或C++风格的变量来完成,至少在其中一个坐标位置上如此;上面的代码只是一遍又一遍地引用相同的变量。
关于访问您列出的像素“(x-1, y, c)、(x+1, y, c)、(x, y, c)、(x, y-1, c) [...]”,下面是一个在外部Halide函数中访问buffer_t*内部值的示例:
extern "C" void
auto get_something_done_in_c(buffer_t* my_buffer, const int32_t dx, const int32_t dy)
{
const auto min0 = my_buffer->min[0];
const auto internal_x = min0;
const auto min1 = my_buffer->min[1];
const auto internal_y = min1;
const auto stride0 = my_buffer->stride[0];
const auto stride1 = my_buffer->stride[1];
const auto x1 = dx + internal_x;
const auto y1 = dy + internal_y;
const auto value = *(my_buffer->host + (x1 - min0) * stride0 + (y1 - min1) * stride1);
return value;
}
HalideExtern_3(int32_t, get_something_done_in_c, buffer_t, int32_t, int32_t);
稍微解释一下如何获取这个值(value):我使用my_buffer变量来访问名为host的数据指针。host成员给出了您所关心的数据的内存地址。由于底层数据存放在一维缓冲区中,因此需要把x和y坐标分别乘以stride0和stride1,得到相对于数据指针的地址偏移量,从而取得您关注的值。