Question

我的系统：

系统规格：Intel Core 2 Duo E4500，3700g 内存，二级缓存 2M，x64，Fedora 17

我如何测量 FLOPs / MFLOPS

我使用 PAPI 库（读取硬件性能计数器）来测量代码的浮点运算次数（flpops）和 MFLOPS。它返回实际时间、处理时间、浮点运算次数，最后用浮点运算次数除以处理时间得到 MFLOPS。该库利用硬件计数器统计浮点指令（或浮点运算）数以及总周期数，从而得到包含浮点运算次数和 MFLOPS 的最终结果。

我的计算内核

我使用了两个内核：一个是三重循环的矩阵-矩阵乘法（方阵），另一个是三重嵌套循环，它在最内层循环中对一维数组做一些运算。

First Kernel MM

    // First kernel: naive square matrix-matrix multiply, accumulating a * b
    // into c. `size` is the matrix dimension and is declared elsewhere.
    float a[size][size];
    float b[size][size];
    float c[size][size];

    start_calculate_MFlops();

    // Every innermost iteration performs one multiply and one add, so the
    // statement executes 2 * size^3 floating-point operations in total.
    // b is walked down a column (b[k][j] with k varying), i.e. with a stride
    // of one full row between consecutive accesses.
    for (int i = 0; i < size; ++i) {
        for (int j = 0; j < size; ++j) {
            for (int k = 0; k < size; k += 1) {
                c[i][j] = c[i][j] + a[i][k] * b[k][j];
            }
        }
    }

    stop_calculate_MFlops();

使用一维数组的第二个内核

    // Second kernel: element-wise accumulation over four 1-D arrays.
    // `size` is the array length and is declared elsewhere.
    float d[size];
    float e[size];
    float f[size];
    float g[size];
    float r = 3.6541;

 start_calculate_MFlops();

// The i and j loops do not index anything; they only repeat the same sweep
// over k size * size times.
for (int i = 0; i < size; ++i) {
    for (int j = 0; j < size; ++j) {
        for (int k = 0; k < size; ++k) {
            // Four additions per iteration, evaluated left to right:
            // (((d[k] + e[k]) + f[k]) + g[k]) + r  ->  4 * size^3 in total.
            d[k]=d[k]+e[k]+f[k]+g[k]+r;
        }
    }
}    

stop_calculate_MFlops();

我对 FLOPs 的理解

矩阵-矩阵乘法（MM）在最内层循环中执行 2 次浮点运算；由于有 3 层各迭代 size 次的循环，理论上 MM 的总浮点运算次数为 2 * n ^ 3。

在第二个内核中我们同样有 3 层循环，最内层循环对一维数组做一些计算，其中有 4 次浮点运算。因此理论上总浮点运算次数为 4 * n ^ 3。

我知道上面计算出的浮点运算次数与真实机器上的并不完全相同。真实机器上还有加载、存储等其他操作，它们会使实测值高于理论浮点运算次数。

问题？：

当我像第二个内核那样使用一维数组时，理论浮点运算次数与执行代码实测得到的次数相同或相近。实际上，使用一维数组时，浮点运算次数等于最内层循环中的运算数乘以 n ^ 3；但当我使用基于二维数组的第一个内核 MM 时，理论浮点运算次数是 2n ^ 3，而运行代码后的实测值远高于理论值，大约是（4 + 矩阵乘法最内层循环中的 2 次运算）* n ^ 3 = 6n ^ 3。我把最内层循环中的矩阵乘法语句换成了下面这行代码：

// Stand-in for the multiply-add statement of the innermost loop: a single
// increment per iteration.
A[i][j]++;

三重嵌套循环中这行代码的理论浮点运算次数是 1 次运算 * n ^ 3 = n ^ 3，但运行代码后结果同样远高于预期，为（2 + 最内层循环的 1 次运算）* n ^ 3 = 3 * n ^ 3。

尺寸为512X512的矩阵的样本结果：

Real_time：1.718368 Proc_time：1.227672总flpops： 807,107,072 MFLOPS：657.429016

Real_time：3.608078 Proc_time：3.042272总flpops： 807,024,448 MFLOPS：265.270355

理论浮点运算次数：2 * 512 * 512 * 512 = 268,435,456

测量的浮点运算次数：807,107,072 ≈ 6 * 512 ^ 3（6 * 512 ^ 3 = 805,306,368）

3嵌套循环中的1d数组操作的示例结果

Real_time：1.282257 Proc_time：1.155990总flpops： 536,872,000 MFLOPS：464.426117

理论浮点运算次数：4n ^ 3 = 536,870,912

测量的浮点运算次数：4n ^ 3 = 4 * 512 ^ 3 + 开销（其他操作？）= 536,872,000

我找不到上述行为的原因。我的假设正确吗？

希望这样比之前的描述简单得多。

实际上我的意思是通过执行代码来测量实际的浮点运算次数。

代码：

 /**
  * Runs the matrix-multiply kernel once between two PAPI_flops calls and
  * prints the real time, process time, floating-point operation count and
  * MFLOPS that the second call reports.
  *
  * Takes no parameters and returns nothing. Calls exit(1) when either
  * PAPI_flops call returns a value below PAPI_OK.
  */
 void countFlops() {

    // A constant expression keeps the arrays below ordinary fixed-size
    // arrays; with a plain `int` they would be variable-length arrays, which
    // standard C++ does not have.
    const int size = 512;
    float a[size][size];
    float b[size][size];
    float c[size][size];
/*  float d[size];
    float e[size];
    float f[size];
    float g[size];*/
    // Only referenced by the commented-out 1-D kernel further down.
    float r = 3.6541;
    (void) r;

    float real_time, proc_time, mflops;
    long long flpops;
    float ireal_time, iproc_time, imflops;
    long long iflpops;
    int retval;

    // Give every element a defined value. Indexing with [j][j] here would
    // touch only the diagonals and leave the rest of the matrices
    // indeterminate when the kernel reads them.
    for (int i = 0; i < size; ++i) {
        for (int j = 0; j < size; ++j) {
            a[i][j] = b[i][j] = c[i][j] = 1.0125;
        }
    }

/*  for (int i = 0; i < size; ++i) {
                d[i]=e[i]=f[i]=g[i]=10.235;
        }*/

    // First PAPI_flops call, placed before the measured region; its results
    // (the i* variables) are not used afterwards.
    if ((retval = PAPI_flops(&ireal_time, &iproc_time, &iflpops, &imflops))
            < PAPI_OK) {
        printf("Could not initialise PAPI_flops \n");
        printf("Your platform may not support floating point operation event.\n");
        printf("retval: %d\n", retval);
        exit(1);
    }

    // Measured region. k advances by 16, so the innermost statement runs
    // size / 16 times per (i, j) pair rather than size times.
    // NOTE(review): the kernel quoted earlier in the post steps k by 1 --
    // confirm which stride the reported numbers were taken with.
    for (int i = 0; i < size; ++i) {
        for (int j = 0; j < size; ++j) {
            for (int k = 0; k < size; k+=16) {
                c[i][j]=c[i][j]+a[i][k] * b[k][j];
            }
        }
    }

/*  for (int i = 0; i < size; ++i) {
    for (int j = 0; j < size; ++j) {
        for (int k = 0; k < size; ++k) {
            d[k]=d[k]+e[k]+f[k]+g[k]+r;
        }
    }
    }*/

    // Second PAPI_flops call: these are the values that get printed.
    if ((retval = PAPI_flops(&real_time, &proc_time, &flpops, &mflops))
            < PAPI_OK) {
        printf("retval: %d\n", retval);
        exit(1);
    }
    string flpops_tmp;
    flpops_tmp = output_formatted_string(flpops);
    printf(
            "calculation: Real_time: %f Proc_time: %f Total flpops: %s MFLOPS: %f\n",
            real_time, proc_time, flpops_tmp.c_str(), mflops);

}

谢谢

Answer 1

如果需要计算操作次数 - 可以创建一个类似浮点值的简单类，并收集统计信息。它可与内置类型互换。

LIVE DEMO：

#include <boost/numeric/ublas/matrix.hpp>
#include <boost/operators.hpp>
#include <cstddef>
#include <iostream>
#include <ostream>
#include <type_traits>
#include <utility>
#include <vector>

using namespace boost;
using namespace std;

// Plain operation counter: starts at zero and only ever goes up by one.
class Statistic
{
public:
    // Records one more operation; returns *this so calls can be chained.
    Statistic &increment()
    {
        tally += 1;
        return *this;
    }

    // Number of operations recorded so far.
    size_t count() const { return tally; }

private:
    size_t tally = 0;
};

// True only when the argument pack is exactly one argument whose decayed type
// is Self. Used to keep Profiled's forwarding constructor from competing with
// its copy and move constructors.
template<typename Self, typename ...Args>
struct ProfiledIsSelfArg : std::false_type {};
template<typename Self, typename Arg>
struct ProfiledIsSelfArg<Self, Arg>
    : std::is_same<Self, typename std::decay<Arg>::type> {};

// Wraps a value of type Domain and counts every compound arithmetic
// assignment (+=, -=, *=, /=) applied to it. The binary operators come from
// the field_operators base, so they are counted through the compound forms.
//
// Counts go to the last element of a per-Domain stack of Statistic objects;
// StatisticScope pushes a fresh counter for its lifetime.
template<typename Domain>
class Profiled: field_operators<Profiled<Domain>>
{
    Domain value;
    // Stack of counters; defined below with one initial element, so back()
    // is valid as long as scopes are created and destroyed in pairs.
    static vector<Statistic> stat;
    void stat_increment()
    {
        stat.back().increment();
    }
public:
    // Guard object: pushes a new counter on construction and pops the last
    // counter on destruction.
    struct StatisticScope
    {
        StatisticScope()
        {
            stat.emplace_back();
        }
        // A copy would pop the stack a second time, so copying is disabled.
        StatisticScope(const StatisticScope&) = delete;
        StatisticScope& operator=(const StatisticScope&) = delete;
        // Returns the counter currently on top of the stack. With nested
        // scopes alive this is the innermost scope's counter.
        Statistic &current()
        {
            return stat.back();
        }
        ~StatisticScope()
        {
            stat.pop_back();
        }
    };
    // Forwards its arguments to Domain's initialiser; construction is not
    // counted. The constraint removes this overload when the sole argument is
    // itself a Profiled: unconstrained, it would be a better match than the
    // copy constructor for non-const lvalues and then fail to compile while
    // initialising `value` from a Profiled. std:: is spelled out because the
    // file also has `using namespace boost`, which may provide an enable_if.
    template<typename ...Args,
             typename = typename std::enable_if<
                 !ProfiledIsSelfArg<Profiled, Args...>::value>::type>
    Profiled(Args&& ...args)
        : value{forward<Args>(args)...}
    {}
    // Each compound assignment records one operation, then applies it.
    Profiled& operator+=(const Profiled& x)
    {
        stat_increment();
        value+=x.value;
        return *this;
    }
    Profiled& operator-=(const Profiled& x)
    {
        stat_increment();
        value-=x.value;
        return *this;
    }
    Profiled& operator*=(const Profiled& x)
    {
        stat_increment();
        value*=x.value;
        return *this;
    }
    Profiled& operator/=(const Profiled& x)
    {
        stat_increment();
        value/=x.value;
        return *this;
    }
};
// One counter is always present, so operations outside any StatisticScope
// still have somewhere to be recorded.
template<typename Domain>
vector<Statistic> Profiled<Domain>::stat{1};

int main()
{
    typedef Profiled<double> Float;
    {
        Float::StatisticScope s;
        Float x = 1.0, y = 2.0, res = 0.0;
        res = x+y*x+y;
        cout << s.current().count() << endl;
    }
    {
        using namespace numeric::ublas;
        Float::StatisticScope s;
        matrix<Float> x{10, 20},y{20,5},res{10,5};
        res = prod(x,y);
        cout << s.current().count() << endl;
    }
}

输出是：

3
2000

P.S. 你的矩阵循环对缓存不友好，因此非常低效。

P.P.S

// `size` is a plain int, not a constant expression ...
int size = 512;
// ... so this array bound makes `a` a variable-length array.
float a[size][size];

这不是合法的 C++ 代码。C++ 不支持 VLA（变长数组）。

理论和实践矩阵乘法FLOP

1 个答案: