Question

我正在使用 intrinsics（内在函数）来加速 OpenCV 代码的运行。但在我用 intrinsics 替换原有代码之后，运行耗时几乎相同，甚至更差。我无法弄清楚这是为什么。我在这个问题上花了很长时间，但没有任何进展。如果有人可以帮助我，我将不胜感激。非常感谢！这是我的代码：

      // When useSSE is true the SSE-intrinsics path runs (about 1.45 ms on the author's
      // machine); otherwise the scalar path runs and takes about the same time.
    //
    // Normalizes every (x, y) entry of `shape` relative to the bounding box:
    //   x' = (x - bbox.center_x) / (bbox.width  / 2)
    //   y' = (y - bbox.center_y) / (bbox.height / 2)
    // `start`, `end` and `diff` are timing variables declared outside this snippet.
    cv::Mat_<float> results(shape.rows, 2);
    if (useSSE) {
        // NOTE(review): raw pointer access assumes `shape` is a continuous float
        // matrix storing x,y interleaved (2 columns) -- confirm against the caller.
        const float* pshape = reinterpret_cast<const float*>(shape.data);
        results = shape.clone();
        float* presults = reinterpret_cast<float*>(results.data);

        // _mm_set_ps lists lanes from highest to lowest, so the in-memory lane
        // order is x, y, x, y -- matching two consecutive points.
        const __m128 xyxy_center = _mm_set_ps(bbox.center_y, bbox.center_x, bbox.center_y, bbox.center_x);

        // Divide by 2.0f (not 2) so the half-size matches the scalar branch even
        // if the bbox fields are integers.
        const float half_width = bbox.width / 2.0f;
        const float half_height = bbox.height / 2.0f;
        const __m128 xyxy_size = _mm_set_ps(half_height, half_width, half_height, half_width);
        gettimeofday(&start, NULL); // timing starts here: only the loop is measured

        const int shape_size = shape.rows * shape.cols;
        // Only whole groups of four floats go through SSE; running the vector
        // loop up to shape_size would read/write past the buffer when
        // shape_size is not a multiple of 4 (e.g. an odd number of rows).
        const int vector_end = shape_size & ~3;
        int i = 0;
        for (; i < vector_end; i += 4) {
            const __m128 a = _mm_loadu_ps(pshape + i);
            const __m128 result = _mm_div_ps(_mm_sub_ps(a, xyxy_center), xyxy_size);
            _mm_storeu_ps(presults + i, result);
        }
        // Scalar tail: even offsets hold x, odd offsets hold y.
        for (; i < shape_size; ++i) {
            const bool is_x = (i % 2 == 0);
            const float center = is_x ? bbox.center_x : bbox.center_y;
            const float half_size = is_x ? half_width : half_height;
            presults[i] = (pshape[i] - center) / half_size;
        }
    } else {
        // Scalar reference implementation.
        gettimeofday(&start, NULL); // start the clock in this branch too
        for (int i = 0; i < shape.rows; i++) {
            results(i, 0) = (shape(i, 0) - bbox.center_x) / (bbox.width / 2.0);
            results(i, 1) = (shape(i, 1) - bbox.center_y) / (bbox.height / 2.0);
        }
    }
    gettimeofday(&end, NULL);
    // Elapsed microseconds: whole seconds scaled to us plus the microsecond remainder.
    diff = 1000000*(end.tv_sec-start.tv_sec)+end.tv_usec-start.tv_usec;
    std::cout<<diff<<"-----"<<std::endl;
    return results;

Answer 1

如果 shape.rows % 2 == 1，元素总数就不是 4 的倍数，每次处理 4 个 float 的循环会在末尾越界读写。
尝试避免在循环中使用 i 变量，直接使用指针。编译器可能会优化掉额外的加法操作，也可能不会。

使用乘法而不是除法：

float bbox_width_inv = 2./bbox.width;
float bbox_height_inv = 2./bbox.height;
__m128 xyxy_size = _mm_set_ps(bbox_height, bbox_width, bbox_height, bbox_width);
float* p_shape_end = p_shape + shape.rows*shape.cols;
float* p_shape_end_batch = p_shape + shape.rows*shape.cols & (~3);
for (; p_shape<p_shape_end_batch; p_shape+=4, presults+=4) {
    __m128 a = _mm_loadu_ps(pshape);
    __m128 result = _mm_mul_ps(_mm_sub_ps(a, xyxy_center),  xyxy_size_inv);
    _mm_storeu_ps(presults, result);
}
while (p_shape < p_shape_end) {
    presults++ = (p_shape++ - bbox.center_x) * bbox_width_inv;
    presults++ = (p_shape++ - bbox.center_y) * bbox_height_inv;
}

尝试反汇编内在函数生成的代码，并确保有足够的寄存器来执行操作，并且不会将临时结果存储到RAM中

使用英特尔内在函数时，代码不会加快速度

1 个答案: