_mm_storeu_si128花费太多时间?

时间:2016-01-29 10:04:10

标签: c sse intrinsics

这是一个C函数,它对src中的值进行加权,并将结果存储到dst中。

/*
 * Scalar weighting of 16 coefficients.
 *
 * For each position k in 0..15 this reads src[k] and threshold[index][k].
 * When the outer unsigned range test passes, a value scaled by factor[k]
 * is written to dst[k]; otherwise dst[k] is left untouched.
 *
 * src   - input coefficients (16 are read)
 * index - row of the threshold table to use
 * dst   - output array (up to 16 ints are written)
 *
 * Always returns 0.
 */
static int _medium_c( DCTELEM * src, int index, int *dst )
{
    int k;

    for( k = 0; k < 16; k++ )
    {
        const unsigned int t1 = threshold[index][k]; /* threshold contains constant values */
        const unsigned int t2 = t1 << 1;
        const int coef = src[k];
        int shifted;

        /* Outer test: the sum is evaluated in unsigned arithmetic, so a
         * negative coef + t1 wraps to a large value and passes the test. */
        if( !( ( unsigned )( coef + t1 ) > t2 ) )
        {
            continue; /* dst[k] is not written */
        }

        /* Wider test passed: plain weighting by factor[k]. */
        if( ( unsigned )( coef + 2*t1 ) > 2*t2 )
        {
            dst[k] = coef * factor[k];
            continue;
        }

        /* Otherwise move the coefficient towards zero by t1, then apply
         * twice the factor. */
        shifted = ( coef > 0 ) ? coef - ( int )t1 : coef + ( int )t1;
        dst[k] = 2*shifted * factor[k];
    }
    return 0;
}

使用intrinsics(内建函数)的版本是:

/*
 * SSE variant of the coefficient weighting routine (uses _mm_shuffle_epi8
 * and _mm_blendv_epi8, i.e. SSSE3 and SSE4.1 intrinsics).
 *
 * The 16 inputs are handled as two batches of eight 16-bit lanes; each
 * batch produces eight 32-bit results that are stored to dst.
 *
 * NOTE(review): the loads assume DCTELEM is a 16-bit type and that factor
 * and threshold hold 32-bit elements whose low 16 bits carry the value --
 * confirm against their declarations.
 *
 * src   - input coefficients (16 are read)
 * index - row of the threshold table to use
 * dst   - output array; all 16 entries are written (lanes that fail the
 *         outer threshold test are written as 0)
 *
 * Always returns 0.
 */
int medium_intrinsic16( DCTELEM * src, int index, int* dst )
{
    /* Loop-invariant constants, built once instead of per iteration. */
    const __m128i zero128 = _mm_setzero_si128();
    /* Shuffle control: gathers bytes 0,1,4,5,8,9,12,13 (the low two bytes of
     * each 4-byte group) into the low 64 bits; 0x80 zeroes the upper bytes. */
    const __m128i mask = _mm_set_epi8( 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,0x0d, 0x0c,0x09,0x08,0x05,0x04,0x01,0x00 );
    int j;

    for( j = 0; j < 2; j++ )
    {
        /* Pack factor[8*j .. 8*j+7] into eight 16-bit lanes. */
        __m128i factor_a = _mm_loadu_si128( (__m128i*)&factor[8*j] );
        factor_a = _mm_shuffle_epi8( factor_a, mask );
        __m128i factor_b = _mm_loadu_si128( (__m128i*)&factor[8*j+4] );
        factor_b = _mm_shuffle_epi8( factor_b, mask );
        factor_a = _mm_unpacklo_epi64( factor_a, factor_b );

        __m128i level_a = _mm_loadu_si128( (__m128i*)&src[8*j] );

        /* Pack threshold[index][8*j .. 8*j+7] the same way. */
        __m128i threshold1_a = _mm_loadu_si128( (__m128i*)&threshold[index][8*j] );
        threshold1_a = _mm_shuffle_epi8( threshold1_a, mask );
        __m128i threshold1_b = _mm_loadu_si128( (__m128i*)&threshold[index][8*j+4] );
        threshold1_b = _mm_shuffle_epi8( threshold1_b, mask );
        threshold1_a = _mm_unpacklo_epi64( threshold1_a, threshold1_b );

        /* The data is now in 16-bit lanes, so shift per 16-bit lane; a 32-bit
         * shift would carry bit 15 of one lane into bit 0 of its neighbour. */
        __m128i threshold2_a = _mm_slli_epi16( threshold1_a, 1 );

        __m128i is_positive = _mm_cmpgt_epi16( level_a, zero128 );
        __m128i level_minus_t1 = _mm_sub_epi16( level_a, threshold1_a ); /* level - threshold1 */
        __m128i level_plus_t1 = _mm_add_epi16( level_a, threshold1_a );  /* level + threshold1 */
        __m128i factor_x2 = _mm_slli_epi16( factor_a, 1 );

        /* 2*(level -/+ threshold1)*factor as low/high 16-bit product halves;
         * the "minus" form is chosen where level > 0, the "plus" form elsewhere. */
        __m128i near_lo = _mm_blendv_epi8( _mm_mullo_epi16( level_plus_t1, factor_x2 ),
                                           _mm_mullo_epi16( level_minus_t1, factor_x2 ),
                                           is_positive );
        __m128i near_hi = _mm_blendv_epi8( _mm_mulhi_epi16( level_plus_t1, factor_x2 ),
                                           _mm_mulhi_epi16( level_minus_t1, factor_x2 ),
                                           is_positive );

        /* Wider test: (level + 2*threshold1) >= 2*threshold2 as unsigned
         * 16-bit values, expressed as max(a, b) == a. */
        __m128i wide_sum = _mm_add_epi16( level_a, threshold2_a );
        __m128i wide_limit = _mm_slli_epi16( threshold2_a, 1 );
        __m128i is_far = _mm_cmpeq_epi16( _mm_max_epu16( wide_sum, wide_limit ), wide_sum );

        /* level*factor where the wider test passes, the shifted form otherwise. */
        __m128i res_lo = _mm_blendv_epi8( near_lo, _mm_mullo_epi16( level_a, factor_a ), is_far );
        __m128i res_hi = _mm_blendv_epi8( near_hi, _mm_mulhi_epi16( level_a, factor_a ), is_far );

        /* Outer test: (level + threshold1) >= threshold2, unsigned; lanes
         * that fail it are cleared to zero. */
        __m128i keep = _mm_cmpeq_epi16( _mm_max_epu16( level_plus_t1, threshold2_a ), level_plus_t1 );
        res_lo = _mm_and_si128( keep, res_lo );
        res_hi = _mm_and_si128( keep, res_hi );

        /* Interleave low/high halves into 32-bit results and store them. */
        _mm_storeu_si128( (__m128i*)&dst[8*j], _mm_unpacklo_epi16( res_lo, res_hi ) );
        _mm_storeu_si128( (__m128i*)&dst[8*j+4], _mm_unpackhi_epi16( res_lo, res_hi ) );
    }
    return 0;
}

使用intrinsics的版本并不比C版本快。问题是:如果删除for循环的最后两行,即代码中的 _mm_storeu_si128((__m128i*)&dst[8*j], m2) 和 _mm_storeu_si128((__m128i*)&dst[8*j+4], m3),intrinsics版本就会比C版本快(大约快4倍)。谁能解释为什么会这样?_mm_storeu_si128()真的会花费这么多时间吗?谢谢。

1 个答案:

答案 0 :(得分:0)

如果它与C版本的速度相同,那么瓶颈可能在内存带宽上。在这种情况下,是的,写入内存就是该算法中开销最大的操作。

或者,当结果没有被存储到任何地方时,编译器可能会把大量代码优化掉!你必须查看汇编(asm),以确认删除之后只是少了store(存储)指令,而不是函数的大部分都被优化掉了。

请参阅http://agner.org/optimize/以及https://stackoverflow.com/tags/x86/info上的其他链接(尤其是Ulrich Drepper关于缓存的文章。)

可以了解一下缓存分块(cache blocking),也就是循环分块(loop tiling)。