Question

我尝试使用 OpenMP 并行化 OpenHEVC 的去块滤波器，但使用 OpenMP 的版本反而比串行版本更慢。我甚至试过把 for 循环体清空，结果它的耗时仍然是串行版本的四倍。我不知道为什么会这样。

串行代码

/*
 * Luma vertical-edge pass.
 *
 * Walks [x0, x_end) x [y0, y_end) in steps of 8 (x starts at 8 when x0 is 0)
 * and calls the luma vertical loop filter at every position where at least
 * one of the two boundary strengths (bs0 for row y, bs1 for row y + 4) is
 * non-zero.
 *
 * Every filter call is bracketed by omp_set_lock / omp_unset_lock on
 * writelock. Both branches must release the lock: a branch that returns to
 * the loop while still holding it makes the next omp_set_lock block forever.
 */
for (y = y0; y < y_end; y += 8) {
    for (x = x0 ? x0 : 8; x < x_end; x += 8) {
        const int bs0 = s->vertical_bs[(x >> 3) + (y       >> 2) * s->bs_width];
        const int bs1 = s->vertical_bs[(x >> 3) + ((y + 4) >> 2) * s->bs_width];

        int c_tc[2], beta[2], tc[2];
        /* Stay all-zero unless pcmf is set, in which case get_pcm() fills them. */
        uint8_t no_p[2] = { 0 };
        uint8_t no_q[2] = { 0 };

        if (bs0 || bs1) {
            /* Rounded-up mean of get_qPy() on the two sides of the edge. */
            const int qp0 = (get_qPy(s, x - 1, y)     + get_qPy(s, x, y)     + 1) >> 1;
            const int qp1 = (get_qPy(s, x - 1, y + 4) + get_qPy(s, x, y + 4) + 1) >> 1;

            /* (beta_offset >> 1 << 1) drops the low bit of beta_offset. */
            beta[0] = betatable[av_clip(qp0 + (beta_offset >> 1 << 1), 0, MAX_QP)];
            beta[1] = betatable[av_clip(qp1 + (beta_offset >> 1 << 1), 0, MAX_QP)];
            /* A zero boundary strength yields tc = 0 for that half. */
            tc[0]   = bs0 ? TC_CALC(qp0, bs0) : 0;
            tc[1]   = bs1 ? TC_CALC(qp1, bs1) : 0;
            src     = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->sps->pixel_shift)];
            if (pcmf) {
                no_p[0] = get_pcm(s, x - 1, y);
                no_p[1] = get_pcm(s, x - 1, y + 4);
                no_q[0] = get_pcm(s, x, y);
                no_q[1] = get_pcm(s, x, y + 4);

                omp_set_lock(&writelock);
                s->hevcdsp.hevc_v_loop_filter_luma_c(src,
                    s->frame->linesize[LUMA],
                    beta, tc, no_p, no_q);
                omp_unset_lock(&writelock);
            } else {
                omp_set_lock(&writelock);
                s->hevcdsp.hevc_v_loop_filter_luma(src,
                    s->frame->linesize[LUMA],
                    beta, tc, no_p, no_q);
                /* Release the lock here too; this unset was missing. */
                omp_unset_lock(&writelock);
            }
        }
    }
}

Openmp代码

omp_set_num_threads(4);

/*
 * OpenMP version of the luma vertical-edge pass: the rows (y) are split
 * across the threads of the team, each thread scanning its rows over x.
 *
 * Data sharing:
 *   - y is the counter of the loop bound to "omp for", so OpenMP makes it
 *     private automatically.
 *   - x is the counter of the nested loop and is not declared inside the
 *     region, so in C it would be shared by default; every thread would then
 *     read and write the same x. It must be listed as private.
 *   - src is assigned per position and is therefore private as well.
 *   - beta, tc, no_p, no_q and the bs and qp values are declared inside the
 *     loop body, so each thread already has its own copies.
 */
#pragma omp parallel shared(s) private(src, x)
{
    #pragma omp for
    for (y = y0; y < y_end; y += 8) {
        for (x = x0 ? x0 : 8; x < x_end; x += 8) {
            const int bs0 = s->vertical_bs[(x >> 3) + (y       >> 2) * s->bs_width];
            const int bs1 = s->vertical_bs[(x >> 3) + ((y + 4) >> 2) * s->bs_width];

            int c_tc[2], beta[2], tc[2];
            /* Stay all-zero unless pcmf is set, in which case get_pcm() fills them. */
            uint8_t no_p[2] = { 0 };
            uint8_t no_q[2] = { 0 };

            if (bs0 || bs1) {
                /* Rounded-up mean of get_qPy() on the two sides of the edge. */
                const int qp0 = (get_qPy(s, x - 1, y)     + get_qPy(s, x, y)     + 1) >> 1;
                const int qp1 = (get_qPy(s, x - 1, y + 4) + get_qPy(s, x, y + 4) + 1) >> 1;

                /* (beta_offset >> 1 << 1) drops the low bit of beta_offset. */
                beta[0] = betatable[av_clip(qp0 + (beta_offset >> 1 << 1), 0, MAX_QP)];
                beta[1] = betatable[av_clip(qp1 + (beta_offset >> 1 << 1), 0, MAX_QP)];
                /* A zero boundary strength yields tc = 0 for that half. */
                tc[0]   = bs0 ? TC_CALC(qp0, bs0) : 0;
                tc[1]   = bs1 ? TC_CALC(qp1, bs1) : 0;
                src     = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->sps->pixel_shift)];
                if (pcmf) {
                    no_p[0] = get_pcm(s, x - 1, y);
                    no_p[1] = get_pcm(s, x - 1, y + 4);
                    no_q[0] = get_pcm(s, x, y);
                    no_q[1] = get_pcm(s, x, y + 4);

                    s->hevcdsp.hevc_v_loop_filter_luma_c(src,
                        s->frame->linesize[LUMA],
                        beta, tc, no_p, no_q);
                } else {
                    s->hevcdsp.hevc_v_loop_filter_luma(src,
                        s->frame->linesize[LUMA],
                        beta, tc, no_p, no_q);
                }
            }
        }
    }
}

时间（最长）

串行：1004ns

openMP：4150ns

Answer 1

空循环并行运行的时间比串行运行更长：循环里没有足够的工作量让并行化带来收益，创建和销毁线程的开销占据了大部分时间。

试着在循环里放入非常繁重的工作量，看看会发生什么！例如，我在 Fortran 代码中使用 OpenMP，其中每次循环迭代要花 5 分钟。

你甚至可以在循环里让每次迭代休眠 5 秒，以验证它们确实在并行运行。

在Android中Openmp比串行慢

1 个答案: