Question

我遇到了与下面这个问题相同的情况：Expression templates: improving performance in evaluating expressions?

我的目标是展开这个表达式的循环

// Pseudo-code sketch: the loop keeps extending the expression bound to `intermediate`.
auto && intermediate = A+D*C
for(int i= 0; i<10 ;i++)
    intermediate = intermediate + B
Vector result = intermediate * E

我希望 intermediate 中保存整棵二元表达式树，最后在 Vector 类的 operator=(Expression) 中对这张表达式图进行检查。我目前的代码只有在没有循环的情况下才能工作（我使用的是表达式模板的经典实现，即 Joel Falcou 在 CppCon 2015 上介绍的那一种）。

编辑：由于循环而导致的代码编译问题

如果取消 main 中那段循环的注释，就会出现编译错误。代码需要按 C++11 标准编译运行：

g++ -std=c++11 -O3 -fopenmp -Wall -pedantic -pthread main.cpp && ./a.out

#include <cstddef>
#include <functional>
#include <iostream>
#include <utility>
#include <vector>

/// CRTP base shared by every node of an expression tree.
///
/// @tparam TBase   element type produced by indexing
/// @tparam Derived concrete expression type deriving from this base
template <typename TBase, typename Derived>
struct BaseExpression
{
   /// Downcast to the concrete expression (read-only view).
   const Derived & self() const
   {
      return static_cast<const Derived &>(*this);
   }

   /// Downcast to the concrete expression (mutable view).
   Derived & self()
   {
      return static_cast<Derived &>(*this);
   }

   /// Element access, forwarded to Derived::operator[].
   TBase operator[](size_t szIdx) const
   {
      const Derived & concrete = self();
      return concrete[szIdx];
   }

   /// Element count, forwarded to Derived::size().
   size_t size() const
   {
      const Derived & concrete = self();
      return concrete.size();
   }
};

/// Lazy node representing `Operator(op1[i], op2[i])` for two operands.
///
/// The node stores only references to its operands, so both operands must
/// stay alive for as long as the node is indexed.
///
/// @tparam TBase    element type produced by indexing
/// @tparam Operator binary functor applied element by element
/// @tparam OP1      type of the left operand
/// @tparam OP2      type of the right operand
template <typename TBase, typename Operator, typename OP1, typename OP2>
class Binary_Expression : public BaseExpression<TBase, Binary_Expression<TBase, Operator, OP1, OP2> >
{
public:
   /// Bind the node to its two operands; no copy of either is taken.
   Binary_Expression(OP1 const & a, OP2 const & b)
      : op1(a)
      , op2(b)
   {
   }

   /// Apply the functor to the idx-th element of each operand.
   TBase operator[] (size_t idx) const
   {
      return op(op1[idx], op2[idx]);
   }

   /// Size of the left operand, or of the right one when the left reports zero.
   size_t size() const
   {
      if (op1.size() != 0)
      {
         return op1.size();
      }
      return op2.size();
   }

private:
   const OP1 & op1;
   const OP2 & op2;
   Operator op;
};


/// Dense, owning vector that also acts as a leaf of the expression tree.
///
/// Constructing a Vector from any BaseExpression evaluates that expression
/// element by element into newly allocated storage.
///
/// @tparam TBase element type
template <typename TBase >
class Vector : public BaseExpression<TBase, Vector<TBase> >
{

public:
   /// Create a vector holding szSizeN value-initialised elements.
   explicit Vector(size_t szSizeN) : m_xMemory(szSizeN){}

   /// Copy construction: std::vector already copies its elements correctly.
   Vector(const Vector &orig) = default;

   /// Move construction. Declared explicitly because the user-declared copy
   /// operations would otherwise suppress it, turning every move into a copy.
   Vector(Vector &&orig) = default;

   /// Copy assignment via copy-and-swap: *this is untouched if the copy throws.
   Vector & operator=(const Vector &orig)
   {
      if (&orig != this)
      {
         Vector temp(orig);
         this->swap(temp);
      }

      return *this;
   }

   /// Move assignment. With it, `v = a + b` evaluates the expression into a
   /// temporary Vector and then steals its buffer instead of copying it again.
   Vector & operator=(Vector &&orig) = default;

   /// Set every element to `factor`; the size is left unchanged.
   Vector & operator=(TBase factor)
   {
      size_t szSizeN = size();
#pragma omp parallel for
      for (size_t idx = 0; idx < szSizeN; idx++)
      {
         m_xMemory[idx] = factor;
      }

      return *this;
   }

   /// Evaluate an expression into a new vector of b.size() elements.
   /// Intentionally implicit so that `Vector<T> r = a + b;` works.
   template <typename Expression>
   Vector(const BaseExpression<TBase, Expression> &b) :m_xMemory(b.size())
   {
      size_t szSizeN = size();
#pragma omp parallel for
      for (size_t idx = 0; idx < szSizeN; idx++)
      {
         m_xMemory[idx] = b[idx];
      }

   }

   /// Exchange the storage of two vectors.
   void swap(Vector &orig) noexcept
   {
      using std::swap;
      swap(m_xMemory, orig.m_xMemory);
   }

   /// Read the idx-th element (returned by value; no bounds check).
   TBase operator[] (size_t idx) const { return m_xMemory[idx]; }

   /// Mutable access to the idx-th element (no bounds check).
   TBase & operator[] (size_t idx) { return m_xMemory[idx]; }

   /// Number of stored elements.
   size_t size() const { return m_xMemory.size(); }

   /// Write one "Index=<i>\tValue=<v>" line per element to std::cout.
   /// Const so that read-only vectors can be printed as well.
   void print() const
   {
      const size_t szSizeN = size();
      for (size_t idx = 0; idx < szSizeN; idx++)
      {
         // '\n' instead of std::endl: no need to flush the stream on every line.
         std::cout << "Index=" << idx << "\t" << "Value=" << m_xMemory[idx] << '\n';
      }
   }

private:
   std::vector<TBase> m_xMemory;
};


/// Build a lazy element-wise sum node for two expressions sharing the element
/// type TBase. Nothing is computed here; the returned node refers to both
/// operands by reference.
template <typename TBase, typename E1, typename E2>
Binary_Expression<TBase, std::plus<TBase>, E1, E2> operator+(const BaseExpression<TBase, E1> & xE1, const BaseExpression< TBase, E2> & xE2)
{
   typedef Binary_Expression<TBase, std::plus<TBase>, E1, E2> SumNode;

   const E1 & lhs = xE1.self();
   const E2 & rhs = xE2.self();
   return SumNode(lhs, rhs);
}


int main()
{
   // Three operands of ten elements each, every one filled with a constant.
   Vector<double> x1(10);
   x1 = 7.5;

   Vector<double> x2(10);
   x2 = 8.;

   Vector<double> x3(10);
   x3 = 4.2;

   // Lazy node for x1 + x2; no element is computed at this point.
   auto && intermediate = x1 + x2;

   // Enabling the loop below is a compile error: `intermediate + x3` has a
   // different Binary_Expression type than `intermediate` itself.
   //
   //    for (int i = 0; i< 10; i++)
   //    {
   //        intermediate = intermediate + x3;
   //    }

   // inspection of the graph here
   Vector<double> result = intermediate + x2;

   result.print();

   return 0;
}

事实上，在我的最终设计中，我想写下以下内容：

   // Desired usage sketch; the Vector class shown earlier has no evaluate() member.
   Vector<double> x1(10);
   Vector<double> x2(10);
   Vector<double> x3(10);

   x1 = 7.5;
   x2 = 8.;
   x3 = 4.2;

   // Here `intermediate` is declared as a Vector rather than bound with auto&&.
   Vector<double> intermediate = x1 + x2;
   for (int i = 0; i < 5; ++i)
       intermediate = intermediate + x3;

   Vector<double> result = x1 + x3 + intermediate;
   // finally into result I have the expression tree, and evaluate method which will make the graph inspection
   result.evaluate();

提前致谢。Jonathan

Answer 1

恐怕这行不通，因为这种链式（表达式模板）技术依赖于用 intermediate 变量的类型来记录整个表达式，所以它的类型看起来类似 Sum<Mult<Vector,Vector>>（此处作了简化）。但是变量的类型在 for 循环的各次迭代之间是不能改变的。

我看到了替代方案：

不要把表达式记录在类型里，而是把它保存为运行时结构，其类型比如可以叫 VectorExpression。这会带来性能上的影响，因为您必须在运行时分析表达式图，并且可以进行的优化种类也会受到限制。

第二种选择是使用模板元编程（每个步骤都有一个新类型）编写自己的for循环。

下面是 fold（折叠）函数的示例（这正是您需要的）。由于函数模板不支持偏特化，我们必须借助一个 fold 仿函数（类模板）来实现：

#include <utility>

/// Compile-time loop body: applies `f` N times, letting the value type change
/// at every step (each step instantiates a new foldf specialisation).
///
/// A class template is used because function templates cannot be partially
/// specialised.
///
/// @tparam N number of applications still to perform; 0 is handled by the
///           partial specialisation below
/// @tparam V type of the value entering this step
/// @tparam F callable type; may be an lvalue reference type
template <int N, class V, class F>
struct foldf {
    // Without this check a negative count would recurse until the compiler's
    // template instantiation depth limit is hit.
    static_assert(N > 0, "fold requires a non-negative repeat count");

    /// Apply `f` once to `v` and hand the result to the next step.
    decltype(auto) operator()(V v, F&& f) {
        auto next = f(v);
        // std::forward rather than std::move: when F is an lvalue reference,
        // F&& collapses to an lvalue reference and an rvalue cannot bind to it.
        return foldf<N - 1, decltype(next), F>()(std::move(next), std::forward<F>(f));
    }
};

/// End of the recursion: no application left, return the value unchanged.
template <class V, class F>
struct foldf<0, V, F> {
    V operator()(V v, F&&) {
        return v;
    }
};

// just a helper to make usage simpler
template <int N>
class Repeat{};

/// Apply `f` to `v` N times and return the final value.
///
/// @param v initial value
/// @param f callable invoked as f(value); accepted as lvalue or rvalue
/// @return  result of the N-th application, or `v` itself when N is 0
template <int N, class V, class F>
decltype(auto) fold(Repeat<N>, V v, F&& f) {
    return foldf<N, V, F>()(std::move(v), std::forward<F>(f));
}

为了证明它符合我们的要求，请添加以下代码：

/// Empty marker template; every call to wrap() adds one Test<> layer.
template <class T>
struct Test {};

/// Unrelated empty type, used only to provoke a conversion error.
struct Other {};

/// Returns a Test<T> for an argument of type T; the argument's value is ignored.
template <class T>
Test<T> wrap(T) {
    return Test<T>{};
}

int main() {  // Demo: the value returned by fold() carries one Test<> layer per step.
    auto v = fold(Repeat<3>(), 0, [](auto t){  // apply wrap() three times, starting from the int 0
        return wrap(t); 
    });
    Other x = v;  // intentionally ill-formed: the diagnostic spells out the type of v
}

编译时应得到类似 tmp.cpp:42:11: error: no viable conversion from 'Test<Test<Test<int> > >' to 'Other' 的错误（具体行号取决于源文件的排版），这表明每一步产生的类型都被保留了下来。

表达式模板：展开循环

1 个答案: