Question

使用OpenCV和C ++计算混淆矩阵的首选方法是什么？

开始于：

// Confusion-matrix counters, all starting from zero.
int TP = 0;
int FP = 0;
int FN = 0;
int TN = 0;

// Two 60x60 single-channel 8-bit matrices: the reference and the detector output.
cv::Mat truth(60, 60, CV_8UC1);
cv::Mat detections(60, 60, CV_8UC1);

// Fill both matrices; signature: loadResults(cv::Mat& t, cv::Mat& d);
this->loadResults(truth, detections);

我尝试了几种不同的选择，例如：

直接调用：

// Visit every pixel once and classify the (detection, truth) pair.
// Any non-zero value counts as "positive". The four conditions are mutually
// exclusive, so each pixel increments exactly one counter.
for(int r = 0; r < detections.rows; ++r)
for(int c = 0; c < detections.cols; ++c)
{
    // Read from `detections`, the matrix declared above and used for the loop bounds.
    const int d = detections.at<unsigned char>(r,c);
    const int t = truth.at<unsigned char>(r,c);
    if(d&&t)    ++TP;
    if(d&&!t)   ++FP;
    if(!d&&t)   ++FN;
    if(!d&&!t)  ++TN;
}

占用大量内存（RAM-heavy）的矩阵逻辑：

// Each counter is obtained by multiplying one matrix element-wise with the
// other (or with its bitwise complement) and counting the non-zero products.
// Every temporary lives in its own scope so it is released before the next one.
// NOTE(review): `~` is a bitwise NOT, so this presumably expects masks that
// contain only 0 and 255 — confirm against loadResults().
{
    cv::Mat truePos = detections.mul(truth);
    TP = cv::countNonZero(truePos);
}
{
    cv::Mat falsePos = detections.mul(~truth);
    FP = cv::countNonZero(falsePos);
}
{
    cv::Mat falseNeg = truth.mul(~detections);
    FN = cv::countNonZero(falseNeg);
}
{
    cv::Mat trueNeg = (~truth).mul(~detections);
    TN = cv::countNonZero(trueNeg);
}

forEach：

auto lambda = [&, truth,TP,FP,FN,TN](unsigned char d, const int pos[]){
    cv::Point2i pt(pos[1], pos[0]);
    char t = truth.at<unsigned char>(pt);
    if(d&&t)    ++TP;
    if(d&&!t)   ++FP;
    if(!d&&t)   ++FN;
    if(!d&&!t)  ++TN;
};
detection.forEach(lambda);

但是有标准的方法吗？我可能会错过OpenCV文档中的一个简单功能。

p.s。我使用的是VS2010 x64；

Answer 1

简而言之，三个都不是。

在开始之前，我们先定义一个简单的结构来保存结果：

// Plain aggregate holding the four cells of a binary confusion matrix.
// Kept free of constructors and default initializers so that it can be
// brace-initialized, e.g. `result_t r = { 0 };`.
struct result_t
{
    int TP; // true positives
    int FP; // false positives
    int FN; // false negatives
    int TN; // true negatives
};

这样我们就可以把每个实现包装在具有以下签名的函数中，以简化测试和性能评估。（请注意，我使用cv::Mat1b来明确表示我们只需要CV_8UC1类型的矩阵。）

// Signature template shared by the conf_mat_* variants: takes the reference
// mask and the detection mask (both cv::Mat1b, passed by value) and returns
// the four confusion-matrix counts.
result_t conf_mat_x(cv::Mat1b truth, cv::Mat1b detections);

我将使用4096 x 4096大小的随机生成的数据来衡量性能。

我在此处将MSVS2013 64位与OpenCV 3.1一起使用。抱歉，尚未使用OpenCV设置MSVS2010进行测试，而时序代码使用c ++ 11，因此您可能需要对其进行修改才能编译。

变体1-“直接调用”

代码的更新版本如下：

// Variant 1a: straightforward per-pixel classification.
// Any non-zero pixel is treated as "positive"; each pixel lands in exactly
// one of the four counters. Asserts that both masks have the same size.
result_t conf_mat_1a(cv::Mat1b truth, cv::Mat1b detections)
{
    CV_Assert(truth.size == detections.size);

    result_t counts = { 0 };
    for (int row = 0; row < detections.rows; ++row) {
        for (int col = 0; col < detections.cols; ++col) {
            const bool detected = detections.at<uchar>(row, col) != 0;
            const bool expected = truth.at<uchar>(row, col) != 0;

            if (detected) {
                if (expected) {
                    ++counts.TP;
                } else {
                    ++counts.FP;
                }
            } else {
                if (expected) {
                    ++counts.FN;
                } else {
                    ++counts.TN;
                }
            }
        }
    }
    return counts;
}

性能和结果：

#0:     min=120.017     mean=123.258    TP=4192029      FP=4195489      TN=4195118      FN=4194580      Total=16777216

这里的主要问题是，这（尤其是VS2010）不太可能自动矢量化，因此速度会很慢。利用SIMD可能潜在地使速度提高一个数量级。另外，重复调用cv::Mat::at也会增加一些开销。

这里确实没有什么值得保留的，我们应该能够做得更好。

变体2-“ RAM重”

代码：

// Variant 2a: whole-matrix arithmetic.
// Every counter is the number of non-zero elements in the element-wise
// product of one mask with the other mask or with its bitwise complement.
// Each product is held in its own scoped temporary.
// NOTE(review): because `~` is a bitwise NOT, this presumably matches the
// per-pixel variant only for masks holding 0 and 255 — confirm for other data.
result_t conf_mat_2a(cv::Mat1b truth, cv::Mat1b detections)
{
    CV_Assert(truth.size == detections.size);

    result_t result = { 0 };

    { // detected, and present in the reference
        const cv::Mat truePos = detections.mul(truth);
        result.TP = cv::countNonZero(truePos);
    }

    { // detected, but absent from the reference
        const cv::Mat falsePos = detections.mul(~truth);
        result.FP = cv::countNonZero(falsePos);
    }

    { // present in the reference, but not detected
        const cv::Mat falseNeg = truth.mul(~detections);
        result.FN = cv::countNonZero(falseNeg);
    }

    { // neither detected nor present
        const cv::Mat trueNeg = (~truth).mul(~detections);
        result.TN = cv::countNonZero(trueNeg);
    }

    return result;
}

性能和结果：

#1:     min=63.993      mean=68.674     TP=4192029      FP=4195489      TN=4195118      FN=4194580      Total=16777216

这已经快了两倍，即使有很多不必要的工作正在做。

（饱和）乘法似乎有些大材小用——bitwise_and同样可以完成这项工作，并且可能节省一点时间。

许多冗余矩阵分配带来了巨大的开销。不必为truePos，falsePos，falseNeg和trueNeg中的每一个分配新的矩阵，我们可以对所有4种情况重用相同的cv::Mat。由于形状和数据类型将始终相同，这意味着将只分配1次而不是4次。

代码：

// Variant 2b: bitwise AND instead of multiplication, with a single scratch
// matrix reused as the output of all four AND operations.
result_t conf_mat_2b(cv::Mat1b truth, cv::Mat1b detections)
{
    CV_Assert(truth.size == detections.size);

    // Shared output buffer for every AND below.
    cv::Mat temp;

    // ANDs the two inputs into `temp` and returns how many elements are non-zero.
    auto count_overlap = [&temp](cv::InputArray lhs, cv::InputArray rhs) -> int {
        cv::bitwise_and(lhs, rhs, temp);
        return cv::countNonZero(temp);
    };

    result_t result = { 0 };
    result.TP = count_overlap(detections, truth);
    result.FP = count_overlap(detections, ~truth);
    result.FN = count_overlap(~detections, truth);
    result.TN = count_overlap(~detections, ~truth);

    return result;
}

性能和结果：

#2:     min=50.995      mean=52.440     TP=4192029      FP=4195489      TN=4195118      FN=4194580      Total=16777216

与conf_mat_2a相比，所需时间减少了约20％。

接下来，请注意您正在计算~truth和~detections两次。因此，我们也可以通过重用它们来消除这些操作以及2个额外的分配。

注意：内存使用情况不会改变-我们之前需要3个临时数组，现在仍然如此。

代码：

// Variant 2c: like 2b, but each complement is computed once and reused
// instead of being re-evaluated for every AND.
result_t conf_mat_2c(cv::Mat1b truth, cv::Mat1b detections)
{
    CV_Assert(truth.size == detections.size);

    // Complements, materialized once.
    const cv::Mat inv_truth = ~truth;
    const cv::Mat inv_detections = ~detections;

    // Shared output buffer for every AND below.
    cv::Mat temp;

    // ANDs the two inputs into `temp` and returns how many elements are non-zero.
    auto count_overlap = [&temp](cv::InputArray lhs, cv::InputArray rhs) -> int {
        cv::bitwise_and(lhs, rhs, temp);
        return cv::countNonZero(temp);
    };

    result_t result = { 0 };
    result.TP = count_overlap(detections, truth);
    result.FP = count_overlap(detections, inv_truth);
    result.FN = count_overlap(inv_detections, truth);
    result.TN = count_overlap(inv_detections, inv_truth);

    return result;
}

性能和结果：

#3:     min=37.997      mean=38.569     TP=4192029      FP=4195489      TN=4195118      FN=4194580      Total=16777216

与conf_mat_2a相比，所需时间减少了约40％。

仍有改进的潜力。让我们观察一下。

element_count == rows * cols，其中rows和cols代表cv::Mat的高度和宽度（我们可以使用cv::Mat::total()）。
TP + FP + FN + TN == element_count，因为每个元素恰好属于4组中的1组。
positive_count是detections中非零元素的数量。
negative_count是detections中零元素的数量。
positive_count + negative_count == element_count，因为每个元素恰好属于2组中的1组。
TP + FP == positive_count
TN + FN == negative_count

利用这些信息，我们可以用简单的算术运算来计算TN，从而省去一次bitwise_and和一次countNonZero。类似地，我们也可以通过算术计算FP，再省去一次bitwise_and，并改用第二次countNonZero来计算positive_count。

由于我们取消了inv_truth的两种使用，因此我们也可以将其删除。

注意：内存使用量已减少-我们现在只有2个临时阵列。

代码：

// Variant 2d: only TP and FN are measured with AND + countNonZero; FP and TN
// follow arithmetically from the number of positive / negative detections.
result_t conf_mat_2d(cv::Mat1b truth, cv::Mat1b detections)
{
    CV_Assert(truth.size == detections.size);

    // Totals over the detection mask: TP + FP and TN + FN respectively.
    const int positive_count = cv::countNonZero(detections);
    const int negative_count = static_cast<int>(truth.total()) - positive_count;

    const cv::Mat1b inv_detections(~detections);
    cv::Mat1b temp;

    cv::bitwise_and(truth, detections, temp);
    const int true_positives = cv::countNonZero(temp);

    cv::bitwise_and(truth, inv_detections, temp);
    const int false_negatives = cv::countNonZero(temp);

    result_t result = { 0 };
    result.TP = true_positives;
    result.FP = positive_count - true_positives;
    result.FN = false_negatives;
    result.TN = negative_count - false_negatives;
    return result;
}

性能和结果：

#4:     min=22.494      mean=22.831     TP=4192029      FP=4195489      TN=4195118      FN=4194580      Total=16777216

与conf_mat_2a相比，所需时间减少了约65％。

最后，由于我们只需要inv_detections一次，因此我们可以重复使用temp来存储它，省去了另外的分配，并进一步减少了内存占用。

注意：内存使用量已减少-我们现在只有1个临时数组。

代码：

// Variant 2e: same arithmetic as 2d, but the inverted detections are written
// into `temp` itself, so a single temporary matrix serves the whole function.
result_t conf_mat_2e(cv::Mat1b truth, cv::Mat1b detections)
{
    CV_Assert(truth.size == detections.size);

    // Totals over the detection mask: TP + FP and TN + FN respectively.
    const int positive_count = cv::countNonZero(detections);
    const int negative_count = static_cast<int>(truth.total()) - positive_count;

    cv::Mat1b temp;

    // temp = truth & detections
    cv::bitwise_and(truth, detections, temp);
    const int true_positives = cv::countNonZero(temp);

    // temp = truth & ~detections, built in place in two steps
    cv::bitwise_not(detections, temp);
    cv::bitwise_and(truth, temp, temp);
    const int false_negatives = cv::countNonZero(temp);

    result_t result = { 0 };
    result.TP = true_positives;
    result.FP = positive_count - true_positives;
    result.FN = false_negatives;
    result.TN = negative_count - false_negatives;
    return result;
}

性能和结果：

#5:     min=16.999      mean=17.391     TP=4192029      FP=4195489      TN=4195118      FN=4194580      Total=16777216

与conf_mat_2a相比，所需时间减少了约72％。

变体3-“使用lambda的forEach”

这又遇到了与变体1相同的问题，即它不太可能被矢量化，因此它相对较慢。

实现的主要问题是forEach在多个输入片段上并行运行该函数，并且缺少任何同步。当前的实现返回错误的结果。

但是，并行化的思想同样可以应用到变体2中最好的版本上。

变体4-“并行”

让我们利用cv::parallel_for_来改善conf_mat_2e。在工作线程之间分配负载的最简单方法是逐行执行。

我们可以通过共享一个中间cv::Mat3i来避免同步，它保存每一行的TP、FP和FN（回想一下，TN可以在最后由其他3个计算得出）。由于每一行仅由单个工作线程处理，因此我们不需要同步。处理完所有行后，一次简单的cv::sum就能给出总的TP、FP和FN，然后再计算TN。

NB：我们可以再次减少内存需求-每个工作线程只需要一个跨越单行的缓冲区。此外，我们需要3 * rows个整数来存储中间结果。

代码：

// Loop body for cv::parallel_for_ that computes, for every row in the range
// it is handed, the row's TP / FP / FN counts and stores them in the matching
// element of a shared cv::Mat3i. Each row writes only its own element, so the
// rows can be processed without synchronization.
// NOTE(review): the bitwise AND / NOT presumably require masks holding only
// 0 and 255 to agree with a per-pixel logical test — confirm for other data.
class ParallelConfMat : public cv::ParallelLoopBody
{
public:
    // Channel indices inside each per-row cv::Vec3i.
    enum
    {
        TP = 0
        , FP = 1
        , FN = 2
    };

    // Stores references only; all three matrices must outlive this object.
    ParallelConfMat(cv::Mat1b& truth, cv::Mat1b& detections, cv::Mat3i& result)
        : truth_(truth)
        , detections_(detections)
        , result_(result)
    {
    }

    // No-op assignment: the members are references and cannot be reseated.
    // NOTE(review): presumably defined only to silence the compiler warning
    // about an assignment operator that cannot be generated — confirm.
    ParallelConfMat& operator=(ParallelConfMat const&)
    {
        return *this;
    }

    // Processes rows [range.start, range.end).
    virtual void operator()(cv::Range const& range) const
    {
        // Scratch row, reused for every row handled by this invocation.
        cv::Mat1b temp;
        for (int r(range.start); r < range.end; r++) {
            cv::Mat1b detections(detections_.row(r));
            cv::Mat1b truth(truth_.row(r));
            cv::Vec3i& result(result_.at<cv::Vec3i>(r));

            // TP + FP for this row; TN is derived later from the grand totals,
            // so no per-row negative count is needed here.
            const int positive_count(cv::countNonZero(detections));

            cv::bitwise_and(truth, detections, temp);
            result[TP] = cv::countNonZero(temp);
            result[FP] = positive_count - result[TP];

            // temp = truth & ~detections, built in place.
            cv::bitwise_not(detections, temp);
            cv::bitwise_and(truth, temp, temp);
            result[FN] = cv::countNonZero(temp);
        }
    }

private:
    cv::Mat1b& truth_;
    cv::Mat1b& detections_;
    cv::Mat3i& result_; // TP, FP, FN per row
};

// Variant 4: row-parallel version. ParallelConfMat fills one (TP, FP, FN)
// triple per row; the triples are then summed and TN is derived from the
// total element count.
result_t conf_mat_4(cv::Mat1b truth, cv::Mat1b detections)
{
    CV_Assert(truth.size == detections.size);

    // One cv::Vec3i per image row, written by the parallel body.
    cv::Mat3i per_row(truth.rows, 1);
    const ParallelConfMat body(truth, detections, per_row);
    cv::parallel_for_(cv::Range(0, truth.rows), body);

    // Channel-wise sum over all rows.
    const cv::Scalar totals = cv::sum(per_row);

    result_t result = { 0 };
    result.TP = static_cast<int>(totals[ParallelConfMat::TP]);
    result.FP = static_cast<int>(totals[ParallelConfMat::FP]);
    result.FN = static_cast<int>(totals[ParallelConfMat::FN]);

    // Whatever is not TP, FP or FN must be TN.
    const int element_count = static_cast<int>(truth.total());
    result.TN = element_count - result.TP - result.FP - result.FN;

    return result;
}

性能和结果：

#6: min=1.496 mean=1.966 TP=4192029 FP=4195489 TN=4195118 FN=4194580 Total=16777216

以上结果是在启用HT（即12个线程）的6核CPU上运行得到的。

与conf_mat_2a相比，运行时间减少了约97.5％。

对于很小的输入，这种方法可能不是最优的。理想的实现可能是这些方法中的一些方法的组合，并根据输入大小进行委派。

测试代码（原文此处另有“MSVS2015，Win64，OpenCV 3.4.3的性能和结果”一节，但其结果数据在本文中缺失）：

#include <opencv2/opencv.hpp>

#include <algorithm>
#include <chrono>
#include <cstddef>
#include <iomanip>
#include <iostream>
#include <vector>

using std::chrono::high_resolution_clock;
using std::chrono::duration_cast;
using std::chrono::microseconds;

// Plain aggregate holding the four cells of a binary confusion matrix.
// Kept free of constructors and default initializers so that it can be
// brace-initialized, e.g. `result_t r = { 0 };`.
struct result_t
{
    int TP; // true positives
    int FP; // false positives
    int FN; // false negatives
    int TN; // true negatives
};

/******** PASTE all the conf_mat_xx functions here *********/

// Benchmark driver: builds two random 0/255 masks, runs every conf_mat_*
// variant ITERS times, and prints per-variant minimum and mean wall-clock
// time in milliseconds together with the counts from the last run.
int main()
{
    const int ROWS = 4 * 1024;
    const int COLS = 4 * 1024;
    const int ITERS = 32;

    // Random values from cv::randu(…, 0, 2), then scaled by 255.
    cv::Mat1b truth(ROWS, COLS);
    cv::randu(truth, 0, 2);
    truth *= 255;

    cv::Mat1b detections(ROWS, COLS);
    cv::randu(detections, 0, 2);
    detections *= 255;

    typedef result_t(*conf_mat_fn)(cv::Mat1b, cv::Mat1b);
    struct test_info
    {
        conf_mat_fn fn;         // implementation under test
        std::vector<double> d;  // one duration per iteration, in milliseconds
        result_t r;             // counts returned by the most recent run
    };
    std::vector<test_info> info;
    info.push_back({ conf_mat_1a });
    info.push_back({ conf_mat_2a });
    info.push_back({ conf_mat_2b });
    info.push_back({ conf_mat_2c });
    info.push_back({ conf_mat_2d });
    info.push_back({ conf_mat_2e });
    info.push_back({ conf_mat_4 });

    // Warm-up: one untimed call per implementation.
    for (test_info& test : info) {
        test.fn(truth, detections);
    }

    // Implementations are interleaved within each iteration.
    for (int i = 0; i < ITERS; ++i) {
        for (test_info& test : info) {
            const high_resolution_clock::time_point t1 = high_resolution_clock::now();
            test.r = test.fn(truth, detections);
            const high_resolution_clock::time_point t2 = high_resolution_clock::now();
            // Microsecond resolution, reported in milliseconds.
            test.d.push_back(static_cast<double>(duration_cast<microseconds>(t2 - t1).count()) / 1000.0);
        }
    }

    // Index with the container's unsigned size type to avoid a signed/unsigned comparison.
    for (std::size_t n = 0; n < info.size(); ++n) {
        const test_info& test = info[n];
        std::cout << "#" << n << ":"
            << std::fixed << std::setprecision(3)
            << "\tmin=" << *std::min_element(test.d.begin(), test.d.end())
            << "\tmean=" << cv::mean(test.d)[0]
            << "\tTP=" << test.r.TP
            << "\tFP=" << test.r.FP
            << "\tTN=" << test.r.TN
            << "\tFN=" << test.r.FN
            << "\tTotal=" << (test.r.TP + test.r.FP + test.r.TN + test.r.FN)
            << "\n";
    }
}

OpenCV C ++。快速计算混淆矩阵

1 个答案:

变体1-“直接致电”

变体2-“ RAM重”

变体3-“ for lambda”

Variant 4-“平行”