我正在尝试创建一个小的ffmpeg“hack”,它可以并行执行yadif过滤器。
我想我找到了一个解决方案,但它只支持一个并发实例。这是因为“scalable_yadif_context”是函数“scalable_yadif_filter_line1”内部的局部静态变量,而该函数取代了原始的yadif“filter_line”函数。我可以把“scalable_yadif_context”改成线程本地(thread-local)变量,但由于此函数被调用得非常频繁,这样做的开销会非常高。
有关如何解决此问题的任何想法?
// Local redeclaration of the yadif filter's private context.
//
// We need the context description in order to access the original filter_line
// function. It is redefined here in the hope that it is not changed inside of
// libavfilter: make_scalable_yadif casts AVFilterContext::priv to this type, so
// the member order and types below must stay exactly as they are.
// NOTE(review): nothing in this file verifies that this layout matches the
// libavfilter build actually linked - confirm whenever libavfilter is upgraded.
typedef struct {
int mode;
int parity;
int frame_pending;
int auto_enable;
AVFilterBufferRef *cur;
AVFilterBufferRef *next;
AVFilterBufferRef *prev;
AVFilterBufferRef *out;
// Per-line filter callback; this is the member that make_scalable_yadif
// saves into org_yadif_filter_line and then overwrites.
void (*filter_line)(uint8_t *dst,
uint8_t *prev, uint8_t *cur, uint8_t *next,
int w, int prefs, int mrefs, int parity, int mode);
const AVPixFmtDescriptor *csp;
} YADIFContext;
// Batching state used by scalable_yadif_filter_line.
//
// calls     - deferred invocations of the original filter_line, queued until
//             the batch is flushed.
// end_prefs - the prefs value that marks the final call of the current batch;
//             std::numeric_limits<int>::max() means "no batch in progress".
struct scalable_yadif_context
{
    std::vector<std::function<void()>> calls;
    int end_prefs;

    // Starts with an empty queue and no batch in progress.
    scalable_yadif_context()
        : end_prefs(std::numeric_limits<int>::max())
    {
    }
};
// The filter_line implementation that was installed in the yadif context
// before it was replaced; null until make_scalable_yadif has stored it.
void (*org_yadif_filter_line)(
    uint8_t *dst,
    uint8_t *prev, uint8_t *cur, uint8_t *next,
    int w, int prefs, int mrefs, int parity, int mode) = nullptr;
// Queues one filter_line invocation and, once the batch is complete, runs the
// whole batch in parallel through org_yadif_filter_line.
//
// ctx   - batching state; must be the same object for every call belonging to
//         one batch.
// other - the unmodified filter_line arguments, captured by value and replayed
//         later against org_yadif_filter_line.
//
// The first call of a batch records -prefs as the end marker. The batch is
// flushed by the call whose prefs equals that marker, after which ctx is reset
// so the next call starts a new batch.
// NOTE(review): nothing is executed until a call with the negated prefs value
// arrives - confirm the caller always ends a batch that way.
void scalable_yadif_filter_line(scalable_yadif_context& ctx, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode)
{
    if(ctx.end_prefs == std::numeric_limits<int>::max())
        ctx.end_prefs = -prefs; // Last call to filter_line will have negative pref

    // Defer the real work; only plain values are captured here.
    ctx.calls.push_back([=]
    {
        org_yadif_filter_line(dst, prev, cur, next, w, prefs, mrefs, parity, mode);
    });

    if(prefs == ctx.end_prefs)
    {
        // Capture ctx by reference: a by-copy capture would deep-copy the whole
        // call queue into the body object (and into every copy TBB makes of
        // it). parallel_for does not return until all iterations have run, so
        // the reference cannot dangle.
        tbb::parallel_for(tbb::blocked_range<size_t>(0, ctx.calls.size()), [&ctx](const tbb::blocked_range<size_t>& r)
        {
            for(auto n = r.begin(); n != r.end(); ++n)
                ctx.calls[n]();
        });

        // Reset for the next batch.
        ctx.calls.clear();
        ctx.end_prefs = std::numeric_limits<int>::max();
    }
}
void scalable_yadif_filter_line1(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode)
{
// local to the current function, making this thread local would incur heavy overhead.
static scalable_yadif_context ctx;
scalable_yadif_filter_line(ctx, dst, prev, cur, next, w, prefs, mrefs, parity, mode);
}
// Redirects the given yadif filter instance to the batching/parallel
// filter_line replacement.
//
// ctx - filter context whose priv member is treated as a YADIFContext.
void make_scalable_yadif(AVFilterContext* ctx)
{
    YADIFContext* const priv = static_cast<YADIFContext*>(ctx->priv);

    // Remember the original implementation so the replacement can call it.
    // A data race here should not be a problem since we are always writing
    // the same value.
    org_yadif_filter_line = priv->filter_line;

    // Only works for one concurrent instance: every context patched here ends
    // up sharing the single static state inside scalable_yadif_filter_line1.
    // A unique replacement function would be needed per instance.
    priv->filter_line = scalable_yadif_filter_line1;
}
我创建了一个极其丑陋的解决方案,适用于多达18个并发实例。
// RENAME(3) yields the identifier f3.
#define RENAME(suffix) f ## suffix

// ff(N) defines a function fN with the filter_line signature. Each generated
// function owns its own static scalable_yadif_context and forwards to
// scalable_yadif_filter_line, so separate fN functions never share batch state.
#define ff(index) \
void RENAME(index)(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode) \
{\
    static scalable_yadif_context batch_state;\
    scalable_yadif_filter_line(batch_state, dst, prev, cur, next, w, prefs, mrefs, parity, mode);\
}
// Generate the eighteen replacement functions f0 .. f17.
ff(0)  ff(1)  ff(2)  ff(3)  ff(4)  ff(5)
ff(6)  ff(7)  ff(8)  ff(9)  ff(10) ff(11)
ff(12) ff(13) ff(14) ff(15) ff(16) ff(17)

// Table of the generated functions; the index into this table is the "tag"
// handed out to a filter instance.
void (*fs[])(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode) =
{
    f0,  f1,  f2,  f3,  f4,  f5,
    f6,  f7,  f8,  f9,  f10, f11,
    f12, f13, f14, f15, f16, f17
};
namespace caspar {
void init()
{
for(int n = 0; n < 18; ++n)
tags.push(n);
}
// Tries to switch the given yadif filter instance to one of the pooled
// parallel filter_line replacements.
//
// ctx - filter context whose priv member is treated as a YADIFContext.
//
// Returns the tag (index into fs) that was claimed, to be handed back through
// release_scalable_yadif, or -1 when no tag was free; in that case the
// instance keeps its original filter_line.
int make_scalable_yadif(AVFilterContext* ctx)
{
    // Populate the tag pool exactly once.
    static boost::once_flag pool_ready = BOOST_ONCE_INIT;
    boost::call_once(&init, pool_ready);

    YADIFContext* const priv = static_cast<YADIFContext*>(ctx->priv);

    // Remember the original implementation for the replacements to call.
    org_yadif_filter_line = priv->filter_line;

    int slot;
    if(tags.try_pop(slot))
    {
        priv->filter_line = fs[slot];
        return slot;
    }

    LOG(warning) << "Not enough scalable-yadif instances. Running non-scalable";
    return -1;
}
// Returns a tag obtained from make_scalable_yadif to the pool.
//
// tag - value returned by make_scalable_yadif; -1 (no tag was claimed) is
//       ignored.
void release_scalable_yadif(int tag)
{
    if(tag == -1)
        return;

    tags.push(tag);
}
答案 0 :(得分:1)
为什么不将每线程缓冲区传递给scalable_yadif_filter_line1函数?这可能需要一些重构,但比使用静态变量或线程本地变量更好(毕竟,如果线程转去做其他事情,你的线程本地缓冲区会怎样?)
如果你不能将缓冲区传递给函数(由于固定的ffmpeg API),TLS可能是你唯一的选择。开销并不像你想象的那么糟糕,但它仍然不是那么好。我强烈建议您修改ffmpeg以添加上下文参数。