我尝试使用OpenMP来并行OpenHEVC的去块过滤器。 但是,使用openMP比串行更慢。甚至,我试图在for循环中空白代码。 然而,它比串行花了四倍的时间。我不知道为什么会这样。
串行代码
for (y = y0; y < y_end; y += 8) {
for (x = x0 ? x0 : 8; x < x_end; x += 8) {
const int bs0 = s->vertical_bs[(x >> 3) + (y >> 2) * s->bs_width];
const int bs1 = s->vertical_bs[(x >> 3) + ((y + 4) >> 2) * s->bs_width];
int c_tc[2], beta[2], tc[2];
uint8_t no_p[2] = { 0 };
uint8_t no_q[2] = { 0 };
if (bs0 || bs1) {
const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1;
const int qp1 = (get_qPy(s, x - 1, y + 4) + get_qPy(s, x, y + 4) + 1) >> 1;
beta[0] = betatable[av_clip(qp0 + (beta_offset >> 1 << 1), 0, MAX_QP)];
beta[1] = betatable[av_clip(qp1 + (beta_offset >> 1 << 1), 0, MAX_QP)];
tc[0] = bs0 ? TC_CALC(qp0, bs0) : 0;
tc[1] = bs1 ? TC_CALC(qp1, bs1) : 0;
src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->sps->pixel_shift)];
if (pcmf) {
no_p[0] = get_pcm(s, x - 1, y);
no_p[1] = get_pcm(s, x - 1, y + 4);
no_q[0] = get_pcm(s, x, y);
no_q[1] = get_pcm(s, x, y + 4);
omp_set_lock(&writelock);
s->hevcdsp.hevc_v_loop_filter_luma_c(src,
s->frame->linesize[LUMA],
beta, tc, no_p, no_q);
omp_unset_lock(&writelock);
} else{
omp_set_lock(&writelock);
s->hevcdsp.hevc_v_loop_filter_luma(src,
s->frame->linesize[LUMA],
beta, tc, no_p, no_q);
}
}
}
}
Openmp代码
omp_set_num_threads(4);
#pragma omp parallel shared(s) private(src)
{
#pragma omp for
for (y = y0; y < y_end; y += 8) {
for (x = x0 ? x0 : 8; x < x_end; x += 8) {
const int bs0 = s->vertical_bs[(x >> 3) + (y >> 2) * s->bs_width];
const int bs1 = s->vertical_bs[(x >> 3) + ((y + 4) >> 2) * s->bs_width];
int c_tc[2], beta[2], tc[2];
uint8_t no_p[2] = { 0 };
uint8_t no_q[2] = { 0 };
if (bs0 || bs1) {
const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1;
const int qp1 = (get_qPy(s, x - 1, y + 4) + get_qPy(s, x, y + 4) + 1) >> 1;
beta[0] = betatable[av_clip(qp0 + (beta_offset >> 1 << 1), 0, MAX_QP)];
beta[1] = betatable[av_clip(qp1 + (beta_offset >> 1 << 1), 0, MAX_QP)];
tc[0] = bs0 ? TC_CALC(qp0, bs0) : 0;
tc[1] = bs1 ? TC_CALC(qp1, bs1) : 0;
src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->sps->pixel_shift)];
if (pcmf) {
no_p[0] = get_pcm(s, x - 1, y);
no_p[1] = get_pcm(s, x - 1, y + 4);
no_q[0] = get_pcm(s, x, y);
no_q[1] = get_pcm(s, x, y + 4);
s->hevcdsp.hevc_v_loop_filter_luma_c(src,
s->frame->linesize[LUMA],
beta, tc, no_p, no_q);
} else{
s->hevcdsp.hevc_v_loop_filter_luma(src,
s->frame->linesize[LUMA],
beta, tc, no_p, no_q);
}
}
}
}
}
时间(最长)
序列号:1004ns
openMP:4150ns
答案 0 :(得分:0)
空白循环 并行时间比串行时间长。你在循环中没有足够的工作来使它对你有益。产生和关闭线程所需的开销占用了大部分时间。
尝试在那里投入非常繁重的工作量,看看会发生什么!例如,我在Fortran代码中使用OpenMP,循环每个花费5分钟。
你甚至可以进行5秒钟的睡眠,以测试它们实际上并行运行。