Question

我正在尝试创建一个行为与内置 int 类型完全相同的 C++ 类，但有一个例外：凡是调用 operator*（或 operator*=）的地方，实际执行的是加法。

起初，我这个类的性能非常差（只有内置 int 类型的 1/2），但我注意到这是因为我忘了包含下面的复制构造函数：

// Integer wrapper meant to behave like a built-in int, with one exception:
// operator* and operator*= perform addition instead of multiplication.
struct AlmostInt {

  // Leaves val uninitialized; no initial value is assigned here.
  AlmostInt () { }
  // User-provided copy constructor. The author reports that forgetting this
  // killed performance.
  AlmostInt (const AlmostInt &a) : val(a.val) { }

  // Returns a new object holding this->val + a.val.
  AlmostInt operator+(const AlmostInt &a) const { AlmostInt result = *this;
                                          result.val += a.val;
                                          return result; }
  // Returns a new object holding this->val - a.val.
  AlmostInt operator-(const AlmostInt &a) const { AlmostInt result = *this;
                                          result.val -= a.val;
                                          return result; }
  // Deliberately adds rather than multiplies (the one intended difference
  // from int).
  AlmostInt operator*(const AlmostInt &a) const { AlmostInt result = *this;
                                          result.val  = result.val + a.val;
                                          return result; }
  // In-place addition; returns *this so calls can be chained.
  AlmostInt &operator+=(const AlmostInt &a) { this->val += a.val;
                                              return *this; }
  // In-place subtraction; returns *this so calls can be chained.
  AlmostInt &operator-=(const AlmostInt &a) { this->val -= a.val;
                                              return *this; }
  // In-place "multiplication", which deliberately adds. The original line
  // had a stray ')' that prevented compilation.
  AlmostInt &operator*=(const AlmostInt &a) { this->val = this->val + a.val;
                                              return *this; }

private:
  int val;
};

不幸的是，我的程序仍然比应有的速度慢 25%。检查程序两个不同版本（一个使用 int，另一个使用 AlmostInt）生成的汇编代码，我看到 + 和 - 操作的数量相同，所以从某种程度上说，事情是“正常工作”的。

问题在于，使用 AlmostInt 类的代码比使用原生 int 的代码多出了明显更多的加载和存储操作。

有没有人知道这些开销可能来自哪里？我唯一能猜到的原因是，编译器不理解 AlmostInt 具有与 int 完全相同的性质（例如结合律、交换律）；但如果这真的是问题所在，我预期代码中 “+” 或 “-” 指令的数量会有所不同，而实际上并非如此。

我怀疑额外的加载和存储与额外的栈活动有关，但我目前能确定的是，这不仅仅是每个函数开头和结尾多出的几次栈加载和存储——额外的加载和存储遍布整段代码。

有什么想法吗？我想知道是否有人能推荐一个能让自定义类达到 int 性能水平的编译器。

更新：

这是一个简单的函数，你可以直接复制粘贴，亲自看看发生了什么。在 x86-64 Linux（g++ 4.3、4.4）、AIX6 xlC 和其他几个平台上，切换下面 “CHOOSE ONE ...” 处的两行应该生成相同的代码（或者至少是性能相同的代码），但实际上代码明显膨胀。有谁能解释发生了什么（针对任何特定的平台/编译器），或者如何解决它？

// Minimal wrapper around a single int that exposes only the compound
// assignment operators. It declares no constructors, so copying and
// default-initialization are compiler-generated.
class AlmostInt
{
    int value;

public:

    // Adds that.value into this object and returns *this.
    AlmostInt& operator+=(AlmostInt that)
    {
        this->value = this->value + that.value;
        return *this;
    }

    // Subtracts that.value from this object and returns *this.
    AlmostInt& operator-=(AlmostInt that)
    {
        this->value = this->value - that.value;
        return *this;
    }

    // Multiplies this object by that.value and returns *this.
    AlmostInt& operator*=(AlmostInt that)
    {
        this->value = this->value * that.value;
        return *this;
    }
};

// Binary addition built on operator+=. lhs is received by value, so it is
// modified in place and handed back as the result.
AlmostInt operator+(AlmostInt lhs, AlmostInt rhs)
{
    return lhs += rhs;
}

// Binary subtraction built on operator-=. lhs is received by value, so it is
// modified in place and handed back as the result.
AlmostInt operator-(AlmostInt lhs, AlmostInt rhs)
{
    return lhs -= rhs;
}

// Binary operator* built on operator*=. lhs is received by value, so it is
// modified in place and handed back as the result.
AlmostInt operator*(AlmostInt lhs, AlmostInt rhs)
{
    return lhs *= rhs;
}

// CHOOSE ONE OF THE FOLLOWING TWO LINES:
// (Exactly one typedef for `real` must be active. It selects whether the
// benchmark below is compiled with the built-in int or with AlmostInt.)
//typedef int real;
typedef AlmostInt real;

// Pair of `real` values holding a real part (re) and an imaginary part (im).
// rpass() reads its coefficients through pointers to this type.
typedef struct {
  real re;
  real im;
} complex;

// R(a0,a1,b0,b1,wre,wim): in-place update of four lvalues using the
// coefficient pair (wre, wim). With da = a0 - a1 and db = b0 - b1 taken from
// the values on entry, the macro stores:
//   a0 <- a0 + a1
//   a1 <- b0 + b1
//   b0 <- da * wre - db * wim
//   b1 <- db * wre + da * wim
// It expands to a braced block and needs assignable variables t1..t6 in the
// enclosing scope. Arguments are substituted textually and evaluated more
// than once, so they should be free of side effects.
#define R(a0,a1,b0,b1,wre,wim) { \
  t1 = a0 - a1;  t2 = b0 - b1; \
  t5 = t1 * wim; t6 = t2 * wim; \
  t3 = a0;  t1 *= wre; \
  t3 += a1; t2 *= wre; \
  t1 -= t6; t4 = b0; \
  t2 += t5; t4 += b1; \
  a0 = t3;  b1 = t2; \
  a1 = t4;  b0 = t1; \
}

// RZERO(a0,a1,b0,b1): variant of the update that uses no coefficient and
// performs no multiplication. From the values on entry it stores:
//   a0 <- a0 + a1
//   a1 <- b0 + b1
//   b0 <- a0 - a1
//   b1 <- b0 - b1
// It expands to a braced block and needs assignable variables t1..t4 in the
// enclosing scope. Arguments are substituted textually and evaluated more
// than once, so they should be free of side effects.
#define RZERO(a0,a1,b0,b1) { \
  t1 = a0 - a1; t2 = b0 - b1; \
  t3 = a0 + a1; t4 = b0 + b1; \
  b0 = t1; a0 = t3; \
  b1 = t2; a1 = t4; \
}

// Applies one pass of RZERO/R updates in place over the array `a`.
//
// a : array of `real`; elements are read and written both at `a` and at
//     `b = a + 4 * n`.
// w : coefficient pairs; w[0..2] are used before the loop and w[3..6] in
//     each loop iteration, after which w advances by 4.
// n : controls the iteration count. The loop counter starts at n - 2 and is
//     decremented by 2 until it reaches exactly zero. Because the counter is
//     unsigned, the loop only stops as intended when n - 2 is a positive
//     even number; other values (e.g. n == 2 or odd n) make the counter wrap
//     around instead of reaching zero.
//
// The temporaries t1..t6 are required by the R and RZERO macros, which
// assign to them by name.
void rpass(real *a, const complex *w, unsigned int n)
{
  real t1, t2, t3, t4, t5, t6;
  real *b;
  unsigned int k;

  b = a + 4 * n;
  k = n - 2;

  // First group of eight elements: the leading pair needs no coefficient.
  RZERO(a[0],a[1],b[0],b[1]);
  R(a[2],a[3],b[2],b[3],w[0].re,w[0].im);
  R(a[4],a[5],b[4],b[5],w[1].re,w[1].im);
  R(a[6],a[7],b[6],b[7],w[2].re,w[2].im);

  // Each iteration handles the next eight elements of a and b with four
  // coefficients, then slides all three pointers forward.
  for (;;) {
    R(a[8],a[9],b[8],b[9],w[3].re,w[3].im);
    R(a[10],a[11],b[10],b[11],w[4].re,w[4].im);
    R(a[12],a[13],b[12],b[13],w[5].re,w[5].im);
    R(a[14],a[15],b[14],b[15],w[6].re,w[6].im);
    if (!(k -= 2)) break;
    a += 8;
    b += 8;
    w += 4;
  }
}

（应当注明出处：这个小基准测试来自 Dan Bernstein 的 djbfft 库）

Answer 1

在这类情况下，性能损失最常见的原因之一是从函数返回值。从理论上讲，编译器应该能够对此进行优化，并执行与返回 int 时相同的操作（前提是所有相关函数都是内联的）；但在实践中，我所知道的所有编译器都会在寄存器中返回 int，而对于类类型，则会传递一个额外的隐藏参数（一个临时对象的地址），并在该地址处的内存中返回值。原因在于，复制构造函数或赋值之类的操作需要一个地址（this 指针、对被复制对象的引用），而编译器似乎没有意识到，一旦所有函数都被内联，就不再需要这个地址了。（另外，二进制 API 也规定要这样做，不过二进制 API 通常只涉及结构体，而不涉及带有非平凡构造函数、析构函数和赋值运算符的类型。）

Answer 2

我会去掉构造函数，把按 const 引用传递的参数改为按值传递（因为 AlmostInt 对象非常小），并将不修改对象的运算符实现为自由函数：

// Constructor-free wrapper around one int. Only the compound assignment
// operators are members, and each takes its operand by value.
class AlmostInt
{
    int value;

public:

    // Adds that.value into this object and returns *this.
    AlmostInt& operator+=(AlmostInt that)
    {
        this->value = this->value + that.value;
        return *this;
    }

    // Subtracts that.value from this object and returns *this.
    AlmostInt& operator-=(AlmostInt that)
    {
        this->value = this->value - that.value;
        return *this;
    }

    // Multiplies this object by that.value and returns *this.
    AlmostInt& operator*=(AlmostInt that)
    {
        this->value = this->value * that.value;
        return *this;
    }
};

// Non-modifying addition as a free function. The by-value copy of lhs is
// updated through operator+= and returned.
AlmostInt operator+(AlmostInt lhs, AlmostInt rhs)
{
    return lhs += rhs;
}

// Non-modifying subtraction as a free function. The by-value copy of lhs is
// updated through operator-= and returned.
AlmostInt operator-(AlmostInt lhs, AlmostInt rhs)
{
    return lhs -= rhs;
}

// Non-modifying operator* as a free function. The by-value copy of lhs is
// updated through operator*= and returned.
AlmostInt operator*(AlmostInt lhs, AlmostInt rhs)
{
    return lhs *= rhs;
}

这应该有可能摆脱一些不必要的开销。

Answer 3

也许您可以用一组 #define 来代替自己编写的类，例如：

// For normal operations
// (each macro expands to the ordinary operator token)
#define specialplus +
#define specialmultiple *

// And a separate compilation with
// (these redefine the same two names, so they are an alternative to the
// pair above, not an addition to it)
// NOTE(review): `min` is substituted as a bare token; presumably it cannot
// be used infix the way `+` can, so call sites would need a different
// shape. Confirm against the intended usage.
#define specialplus min
#define specialmultiple +

也许更好：

// normalmath.c
// Ordinary arithmetic: plus is addition, star is multiplication.
// Each expansion is wrapped in outer parentheses so it stays a single
// operand inside a larger expression (e.g. 2 * plus(a, b)).
#define plus(a,b) ((a)+(b))
#define star(a,b) ((a)*(b))
// Tag naming this variant; yourcode.c is compiled once per variant.
#define FUNCTYPE normal
#include "yourcode.c"

// tropicalmath.c
// Alternative arithmetic: plus is min, star is addition.
// star is wrapped in outer parentheses so it stays a single operand inside
// a larger expression (e.g. 2 * star(a, b)).
// NOTE(review): min is not defined in this snippet; it presumably has to be
// supplied before yourcode.c is included.
#define plus(a,b) min((a),(b))
#define star(a,b) ((a)+(b))
// Tag naming this variant; yourcode.c is compiled once per variant.
#define FUNCTYPE tropical
#include "yourcode.c"

// yourcode.c
// Shared source, included once per arithmetic variant. The including file
// must define plus(a,b), star(a,b) and FUNCTYPE first.
//
// The function name is built by pasting FUNCTYPE onto "_opp". Writing
// FUNCTYPE_opp directly would be one identifier and FUNCTYPE would never be
// expanded. The two-level macro forces FUNCTYPE to expand before ## applies.
#define PASTE_TOKENS_(a, b) a##b
#define PASTE_TOKENS(a, b) PASTE_TOKENS_(a, b)

int PASTE_TOKENS(FUNCTYPE, _opp)(int x, int y)
{
    // for example
   return star(plus(x,y),52);
}

（前提是我没有把 C 预处理器代码写错）。

或者至少是类似这种形式的做法？也许再配合一些巧妙的函数命名，就可以同时使用两种类型？

Answer 4

根据Fumiyo的评论：

测试AlmostInt在普通机器上的运行速度是否与普通int一样快：

这是我运行的代码：它是一个矩阵乘法（或者说试图实现矩阵乘法，我不确定是否写对了）。但无论如何，它会进行大量的乘法和加法，而这正是我们想要测试的。编译时只使用了 -O3。

在main（）函数中，我运行乘法10,000次。

结果：

AlmostInt:    10093876
int           10320511

在我看来，考虑到测量误差，这两个结果是相同的。

#include <vector>
#include <iostream>
#include <time.h>

#ifdef ALMOST_INT
// int wrapper used when ALMOST_INT is defined. It can be built from an int
// only explicitly, default-constructs to zero, and provides the three
// compound assignment operators.
class AlmostInt
{
    int value;

    public:
    // Default construction stores zero.
    AlmostInt() : value(0) {}
    // Explicit, so an int is never converted implicitly.
    explicit AlmostInt(int i) : value(i) {}

    // Adds that.value into this object and returns *this.
    AlmostInt& operator+=(AlmostInt const& that)
    {
        this->value = this->value + that.value;
        return *this;
    }

    // Subtracts that.value from this object and returns *this.
    AlmostInt& operator-=(AlmostInt const& that)
    {
        this->value = this->value - that.value;
        return *this;
    }

    // Multiplies this object by that.value and returns *this.
    AlmostInt& operator*=(AlmostInt const& that)
    {
        this->value = this->value * that.value;
        return *this;
    }

};

// Binary operators built on the compound assignments. lhs is taken by value,
// updated in place, and returned as the result.
AlmostInt operator+(AlmostInt lhs, AlmostInt const& rhs)
{
    lhs += rhs;
    return lhs;
}
AlmostInt operator-(AlmostInt lhs, AlmostInt const& rhs)
{
    lhs -= rhs;
    return lhs;
}
AlmostInt operator*(AlmostInt lhs, AlmostInt const &rhs)
{
    lhs *= rhs;
    return lhs;
}

#else
// Baseline build (ALMOST_INT not defined): AlmostInt is just the built-in
// int, so the rest of the file compiles unchanged against either type.
typedef int AlmostInt;

#endif


// Vector is a sequence of AlmostInt; Matrix is a vector of Vectors, indexed
// as m[i][j] by mult() below.
typedef std::vector<AlmostInt>      Vector;
typedef std::vector<Vector>         Matrix;

// Fills r[x][y] with the sum over s of a[s][y] * b[x][s], for x, y and s
// each ranging over 0..99. The dimension is fixed at 100, so all three
// matrices must be at least 100 x 100.
void mult(Matrix const& a, Matrix const& b, Matrix& r)
{
    int const dim = 100;

    for (int x = 0; x < dim; ++x)
    {
        for (int y = 0; y < dim; ++y)
        {
            // Accumulate the dot product for one output cell.
            AlmostInt acc(0);
            for (int s = 0; s < dim; ++s)
            {
                acc += a[s][y] * b[x][s];
            }
            r[x][y] = acc;
        }
    }
}

// Benchmark driver: runs mult() 10000 times on 100 x 100 matrices and
// accumulates the clock() ticks spent inside mult() only.
int main()
{
    Matrix      a(100, Vector(100));
    Matrix      b(100, Vector(100));
    Matrix      r(100, Vector(100));

    clock_t total = 0;
    for (int loop = 0; loop < 10000; ++loop)
    {
        // Progress report every 100 iterations, printed before the run. The
        // last line therefore shows the total after 9900 runs.
        if (loop % 100 == 0)
        {
            std::cout << "Time: " << total << "\n";
        }

        clock_t const begin = clock();
        mult(a, b, r);
        clock_t const finish = clock();
        total += (finish - begin);
    }
}

自定义C ++类可以复制内置类型的性能吗？

4 个答案: