在使用 ARM NEON 进行优化之前,我先用 C++ 实现了一维(可分离)高斯核,并直接在移动环境(Android + NDK)中将其性能与 OpenCV 的 GaussianBlur() 方法进行比较。这样可以得到更简单、更便于后续优化的代码。

然而结果是,我的实现比 OpenCV4Android 版本慢了 10 倍。我读到过 OpenCV for Tegra 对 GaussianBlur 的实现做了优化,但我不认为标准的 OpenCV4Android 也有那种优化,那为什么我的代码会这么慢?


// Separable 7-tap Gaussian blur (sigma = 2), plain floating-point version.
//
// The image is filtered along the columns first (result stored in `temp`),
// then along the rows (result stored in `dst`). Out-of-range taps are mapped
// back into the image by reflect101(), which is defined elsewhere.
//
// Assumes `src` is a single-channel 8-bit image, since pixels are read as
// uchar -- TODO confirm against callers.
// Returns a new CV_8UC1 image with the same size as `src`.
Mat myGaussianBlur(Mat src){
    Mat dst(src.rows, src.cols, CV_8UC1);
    Mat temp(src.rows, src.cols, CV_8UC1);
    float sum;
    int x1, y1;  // row/column indices, so integral rather than float

    // coefficients of 1D gaussian kernel with sigma = 2
    double coeffs[] = {0.06475879783, 0.1209853623, 0.1760326634, 0.1994711402, 0.1760326634, 0.1209853623, 0.06475879783};
    // Normalize coeffs so that the 7 taps sum to 1
    float coeffs_sum = 0.9230247873f;
    for (int i = 0; i < 7; i++){
        coeffs[i] /= coeffs_sum;
    }

    // filter vertically
    for(int y = 0; y < src.rows; y++){
        for(int x = 0; x < src.cols; x++){
            sum = 0.0f;
            for(int i = -3; i <= 3; i++){
                y1 = reflect101(src.rows, y - i);
                sum += coeffs[i + 3]*src.at<uchar>(y1, x);
            }
            // Conversion to uchar truncates the fractional part.
            temp.at<uchar>(y,x) = static_cast<uchar>(sum);
        }
    }

    // filter horizontally
    for(int y = 0; y < src.rows; y++){
        for(int x = 0; x < src.cols; x++){
            sum = 0.0f;
            for(int i = -3; i <= 3; i++){
                // Column index: reflect against the image width, not its height.
                x1 = reflect101(src.cols, x - i);
                sum += coeffs[i + 3]*temp.at<uchar>(y, x1);
            }
            dst.at<uchar>(y,x) = static_cast<uchar>(sum);
        }
    }

    return dst;
}

采用两遍(可分离)滤波时,通常会对中间数据进行转置——也就是说,输入的一列变成输出的一行。

这是因为 CPU 很不擅长从输入图像的许多不同行中各取少量数据。由于缓存的工作方式,读取一段水平方向上连续的像素并对其进行滤波会高效得多。如果把临时缓冲区转置,第二遍处理的同样是一段水平连续的像素(在原图中对应垂直方向),然后再把输出转置一次,结果就恢复为正确的方向。


// Separable 7-tap Gaussian blur (sigma = 2) using integer fixed-point
// arithmetic instead of floating point. Lines marked `<<<` are the ones that
// differ from the floating-point version.
//
// Kernel weights are scaled by 256. The column pass keeps its scaled sum in a
// 16-bit intermediate image, and the row pass divides by 256*256 to undo both
// scalings at once. Out-of-range taps are mapped back into the image by
// reflect101(), which is defined elsewhere.
//
// Assumes `src` is a single-channel 8-bit image, since pixels are read as
// uchar -- TODO confirm against callers.
// Returns a new CV_8UC1 image with the same size as `src`.
Mat myGaussianBlur(Mat src){
    Mat dst(src.rows, src.cols, CV_8UC1);
    Mat temp(src.rows, src.cols, CV_16UC1); // <<<
    int sum, x1, y1;  // <<<

    // coefficients of 1D gaussian kernel with sigma = 2
    double coeffs[] = {0.06475879783, 0.1209853623, 0.1760326634, 0.1994711402, 0.1760326634, 0.1209853623, 0.06475879783};
    int coeffs_i[7] = { 0 }; // <<<
    // Normalize coeffs and convert them to fixed point (scale factor 256).
    // The conversion truncates, so the integer weights sum to slightly less
    // than 256 and the output comes out marginally darker than the input.
    float coeffs_sum = 0.9230247873f;
    for (int i = 0; i < 7; i++){
        coeffs_i[i] = static_cast<int>(coeffs[i] / coeffs_sum * 256); // <<<
    }

    // filter vertically
    for(int y = 0; y < src.rows; y++){
        for(int x = 0; x < src.cols; x++){
            sum = 0; // <<<
            for(int i = -3; i <= 3; i++){
                y1 = reflect101(src.rows, y - i);
                sum += coeffs_i[i + 3]*src.at<uchar>(y1, x); // <<<
            }
            // temp is CV_16UC1, so it must be accessed as a 16-bit element.
            // The scaled sum is at most 255 * 256 and therefore fits.
            temp.at<unsigned short>(y,x) = static_cast<unsigned short>(sum);
        }
    }

    // filter horizontally
    for(int y = 0; y < src.rows; y++){
        for(int x = 0; x < src.cols; x++){
            sum = 0; // <<<
            for(int i = -3; i <= 3; i++){
                // Column index: reflect against the image width, not its height.
                x1 = reflect101(src.cols, x - i);
                sum += coeffs_i[i + 3]*temp.at<unsigned short>(y, x1); // <<<
            }
            // Remove the 256 scale factor contributed by each of the two passes.
            dst.at<uchar>(y,x) = static_cast<uchar>(sum / (256 * 256)); // <<<
        }
    }

    return dst;
}

这是采纳了 @Paul R 和 @sh1 的所有建议之后的代码,改动总结如下:





5)另外,根据我个人的观察,为了在不调用(较慢的)函数“round”或“cvRound”的情况下改进舍入,我给中间结果和最终像素结果都加上了 0.5f(在整数定点表示中即 32768),从而减小与 OpenCV 结果之间的误差/差异。



Mat myGaussianBlur(Mat src){

Mat dst(src.rows, src.cols, CV_8UC1);
Mat temp(src.rows, src.cols, CV_8UC1);
int sum;
int x1;

double coeffs[] = {0.070159, 0.131075, 0.190713, 0.216106, 0.190713, 0.131075, 0.070159};
int coeffs_i[7] = { 0 };
for (int i = 0; i < 7; i++){
        coeffs_i[i] = (int)(coeffs[i] * 65536); //65536

// filter horizontally - inside the image
for(int y = 0; y < src.rows; y++){
    uchar *ptr = src.ptr<uchar>(y);
    for(int x = 3; x < (src.cols - 3); x++){
        sum = ptr[x] * coeffs_i[3];
        for(int i = -3; i < 0; i++){
            int tmp = ptr[x+i] + ptr[x-i];
            sum += coeffs_i[i + 3]*tmp;
        temp.at<uchar>(y,x) = (sum + 32768) / 65536;
// filter horizontally - edges - needs reflect
for(int y = 0; y < src.rows; y++){
    uchar *ptr = src.ptr<uchar>(y);
    for(int x = 0; x <= 2; x++){
        sum = 0;
        for(int i = -3; i <= 3; i++){
            x1 = x + i;
            if(x1 < 0){
                x1 = -x1;
            sum += coeffs_i[i + 3]*ptr[x1];
        temp.at<uchar>(y,x) = (sum + 32768) / 65536;
for(int y = 0; y < src.rows; y++){
    uchar *ptr = src.ptr<uchar>(y);
    for(int x = (src.cols - 3); x < src.cols; x++){
        sum = 0;
        for(int i = -3; i <= 3; i++){
            x1 = x + i;
            if(x1 >= src.cols){
                x1 = 2*src.cols - x1 - 2;
            sum += coeffs_i[i + 3]*ptr[x1];
        temp.at<uchar>(y,x) = (sum + 32768) / 65536;

// transpose to apply again horizontal filter - better cache data locality
transpose(temp, temp);

// filter horizontally - inside the image
for(int y = 0; y < src.rows; y++){
    uchar *ptr = temp.ptr<uchar>(y);
    for(int x = 3; x < (src.cols - 3); x++){
        sum = ptr[x] * coeffs_i[3];
        for(int i = -3; i < 0; i++){
            int tmp = ptr[x+i] + ptr[x-i];
            sum += coeffs_i[i + 3]*tmp;
        dst.at<uchar>(y,x) = (sum + 32768) / 65536;
// filter horizontally - edges - needs reflect
for(int y = 0; y < src.rows; y++){
    uchar *ptr = temp.ptr<uchar>(y);
    for(int x = 0; x <= 2; x++){
        sum = 0;
        for(int i = -3; i <= 3; i++){
            x1 = x + i;
            if(x1 < 0){
                x1 = -x1;
            sum += coeffs_i[i + 3]*ptr[x1];
        dst.at<uchar>(y,x) = (sum + 32768) / 65536;
for(int y = 0; y < src.rows; y++){
    uchar *ptr = temp.ptr<uchar>(y);
    for(int x = (src.cols - 3); x < src.cols; x++){
        sum = 0;
        for(int i = -3; i <= 3; i++){
            x1 = x + i;
            if(x1 >= src.cols){
                x1 = 2*src.cols - x1 - 2;
            sum += coeffs_i[i + 3]*ptr[x1];
        dst.at<uchar>(y,x) = (sum + 32768) / 65536;

transpose(dst, dst);

return dst;

根据 Google 的文档,在 Android 设备上使用 float / double 大约比使用 int / uchar 慢两倍。

您可以在这篇 Android 文档中找到一些加快 C++ 代码速度的方法: https://developer.android.com/training/articles/perf-tips