Question

代码是临时写的，中途还改了命名约定，如果显得混乱，还请见谅。我在这里把问题重新表述一下，使其更加清晰。

在编译时已知一些数据：两个整数数组 D 和 E，长度均为 L。D 的每个元素都是 0 或 1。E 的每个元素的取值都在 [0,L] 范围内。

然后我有一个向量X，它在运行时已知，长度为L。

我想使用D，E和X构建一个计算特定值的函数，例如：

// Reference (fully generic) version of the computation.
// Returns 10 when D[i] is 0; otherwise returns the weighted sum
// X[0]*1 + X[1]*2 + ... taken over the first E[i] entries of X.
int comp_rt(int i, array<int, L> X) {
    if (D[i] == 0) // D[i] known at compile-time
        return 10;

    const int count = E[i]; // E[i] known at compile-time
    int weighted_sum = 0;
    for (int j = 0; j < count; ++j) {
        const int weight = j + 1;
        weighted_sum += X[j] * weight; // X[j] known at run-time
    }
    return weighted_sum;
}

由于这个计算会执行很多次，我想减少开销；我认为如果能在编译时完成对 D 和 E 的检查和循环，那就太好了。

通常，为了更快，我不会使用 comp_rt 这个通用函数，而是为每个 i 编写模板特化函数，其中只保留数学运算。例如：

// Example data (pseudo-code), using the question's notation:
//   D = [0, 1, 1, 0, 1]   // Values in {0, 1}
//   E = [1, 0, 3, 2, 4]   // Values in [0, L-1]
//   X = [1, 3, 5, 7, 9]   // Any integer, known only at run-time
constexpr int L = 5; // common length of D, E and X (written "N" in the first draft)

// Primary template: declared only; every index i gets a hand-written
// specialization that contains nothing but the arithmetic for that i.
template <int i> int comp_tpl(std::array<int, L> X);

// D[0] == 0
template <> int comp_tpl<0>(std::array<int, L> /*X*/) { return 10; }

// E[1] == 0, skip loop
template <> int comp_tpl<1>(std::array<int, L> /*X*/) { return 0; }

// D[2] == 1, E[2] == 3
template <> int comp_tpl<2>(std::array<int, L> X) { return X[0] + 2 * X[1] + 3 * X[2]; }

// D[3] == 0
template <> int comp_tpl<3>(std::array<int, L> /*X*/) { return 10; }

// D[4] == 1, E[4] == 4: the first three terms are exactly comp_tpl<2>
template <> int comp_tpl<4>(std::array<int, L> X) { return comp_tpl<2>(X) + 4 * X[3]; }

我的问题是：是否可以使用模板和/或常量表达式，在编译时根据 D 和 E 构建函数，并且执行速度与 comp_tpl 一样快？也就是说，在编译时构建出“留待运行时计算的表达式”，只把涉及 X 的计算留给运行时。

并且，如果有可能，它是如何完成的？哪些一般原则可用于解决这类问题？

我尝试使用模板来做到这一点，但结果代码没有comp_tpl那么快......我认为有些递归调用是在运行时进行评估的。

Answer 1

修改：根据相关说明进行更新：
Edit2 ：删除了Conditional。

这和之前的做法很像，用于计算总和；它是尾递归的（*）：

// Compile-time-unrolled weighted sum.
// Sum<T, Length>::comp(x) evaluates
//     1*x[0] + 2*x[1] + ... + Length*x[Length-1]
// by recursing on Length; `add` carries the running total, so each level
// ends in a call to the next one. x must be indexable with operator[] and
// hold at least Length elements.
template<class T, size_t Length> struct Sum {
    template<class Array>
    static T comp(const Array &x, T add = 0) {
        // Convert the weight to T before multiplying: multiplying by the
        // unsigned Length directly would convert a negative x[...] to
        // size_t and produce a huge positive term.
        return Sum<T, Length - 1>::comp(x, add + static_cast<T>(Length) * x[Length - 1]);
    }
};

// Recursion terminator: no terms left, the accumulated total is the result.
template<class T> struct Sum<T, 0> {
    template<class Array>
    static T comp(const Array &, T add = 0) {
        return add;
    }
};

这是把各部分组合在一起的代码，它依赖于 d 和 e。你也许可以把它们参数化，但我认为那样得不偿失。

// Compile-time tables: d[N] switches the computation on or off for index N,
// e[N] is the number of leading elements of x that enter the sum.
constexpr int d[5] = {0, 1, 1, 0, 1};
constexpr int e[5] = {1, 0, 3, 2, 4};

// Comp<N>::comp(x) yields 10 when d[N] is zero, otherwise the weighted sum
// Sum<int, e[N]>::comp(x).
template<int N> struct Comp {
    template<class Array>
    static int comp(const Array &x) {
        if (d[N] == 0)
            return 10;
        return Sum<int, e[N]>::comp(x);
    }
};

用法：

int x[] = { 1, 3, 5, 7, 9 };
Comp<3>::comp(x); // d[3] == 0, so this call returns 10

http://ideone.com/PmFBhU

（*）不是真的，但足够接近。

Answer 2

（更新：最后讨论了使用clang ++和g ++的计时实验。另外，为简单起见，我在问题中使用comp_rt的确切正文，证明它可以完全优化而不需要我们重写函数体。）

是的，这可以做到。但是，g ++似乎在没有你意识到的情况下为你做了很多这样的事情，最后看到实验。但是对于clang ++，你真的可以看到运行时版本更慢。

在下面的程序中，除X之外的所有参数都作为模板参数传递。因此，将为您使用的每个参数组合构建不同的comp_rt模板函数。如果L很大，这可能会导致二进制文件变大。

我处理 D[i]==0 的方式起初可能不太好理解：我把它放在了 enable_if 里面。这里 comp_tpl 有两个定义，一个在 D[i]==0 时使用，另一个在 D[i]==1 时使用。说实话，这可能是不必要的；我猜即使你只是在单个 comp_rt 函数模板中直接使用函数的原始主体，代码仍然会被编译成最优形式。（我后来去掉了这个复杂之处。）

我在函数中包含了这样的一行：

    // Compile-time sanity check only: array's size argument must be a constant expression.
    using confirm_Ei_is_known_at_compile_time = array<char,E[i]>;

这确认编译器在编译时已知E[i]。这相当于typedef，并且必须在编译时知道array中的元素数。例如，如果您尝试使用X[i]而不是E[i]作为array的大小，编译器将拒绝该代码。注意：这行没有任何作用，只是在编译时进行完整性检查。

最后，鉴于 E[i] 在编译时已知，编译器能够展开循环（如果它凭自己的判断认为这样会更快的话）。请务必启用所有优化——gcc 有一个选项 -funroll-all-loops。

通过将相关参数作为模板参数传递，编译器能够进行更多优化。但我不确定它会选择这样做！实验是必需的。

这是我用于计时实验的完整程序。

#include<array>
#include<iostream>
#include<type_traits>
using namespace std;

/*
 * L is a positive integer
 * D is vector of booleans of length L
 * E is a vector of ints [0,L) of length L
 * i will be in [0,L) also, therefore it is small enough that we can
 *         treat it as if it's known at compile time also
 *
 * The only thing that is *not* known at compile time is:
 * X is a vector of ints of length L
 *
 * Therefore, our goal is something like:
 *
 *   template<int L, int i, int D[L], int E[L]>
 *   int compute(int X[L]);
 */

// Overload chosen through enable_if when D[i] is 0: the input array is
// ignored and the constant 10 is returned.
// D and E are passed by reference as template arguments.
template<int L, int i, const bool (&D)[L], const int (&E)[L]>
typename std::enable_if<D[i] == 0, int>::type comp_tpl(int (&)[L]) {
    constexpr int result_when_switched_off = 10;
    return result_when_switched_off;
}
// Overload chosen through enable_if when D[i] is 1: returns the weighted sum
// X[0]*1 + X[1]*2 + ... over the first E[i] elements of X.
// D and E are passed by reference as template arguments.
template<int L, int i, const bool (&D)[L], const int (&E)[L]>
typename std::enable_if<D[i] == 1, int>::type comp_tpl(int (&X)[L]) {
    // The optional compile-time check
    //   using confirm_Ei_is_known_at_compile_time = array<char,E[i]>;
    // is left disabled in this overload.
    const int count = E[i]; // E[i] known at compile-time
    int weighted_sum = 0;
    for (int j = 0; j < count; ++j) {
        const int weight = j + 1;
        weighted_sum += X[j] * weight; // X[j] known at run-time
    }
    return weighted_sum;
}

// Single-function variant: the body is the plain run-time algorithm, but i, D
// and E arrive as template arguments, so D[i] and E[i] are constant
// expressions inside it.
// Returns 10 when D[i] is false, otherwise X[0]*1 + X[1]*2 + ... over the
// first E[i] elements of X.
template<int L, int i, const bool (&D)[L], const int (&E)[L]>
int comp_tpl_simple(int (&X)[L]) {
    if (!D[i]) // D[i] known at compile-time
        return 10;

    // Compiles only if E[i] is a constant expression (it is used as an array size).
    using confirm_Ei_is_known_at_compile_time = std::array<char, E[i]>;

    const int count = E[i]; // E[i] known at compile-time
    int weighted_sum = 0;
    for (int j = 0; j < count; ++j) {
        const int weight = j + 1;
        weighted_sum += X[j] * weight; // X[j] known at run-time
    }
    return weighted_sum;
}

// Run-time baseline: i, D and E are ordinary function arguments here; only
// the array length L is a template parameter.
// Returns 10 when D[i] is false, otherwise X[0]*1 + X[1]*2 + ... over the
// first E[i] elements of X.
template<int L>
int comp_rt(int i, const bool (&D)[L], const int (&E)[L], int (&X)[L]) {
    if (!D[i])
        return 10;

    const int count = E[i];
    int weighted_sum = 0;
    for (int j = 0; j < count; ++j) {
        const int weight = j + 1;
        weighted_sum += X[j] * weight;
    }
    return weighted_sum;
}


// Common length of D, E and X.
constexpr int L = 5;
// NOTE(review): `extern` gives these constexpr arrays external linkage; presumably this is
// so they can be bound to the reference template parameters of comp_tpl / comp_tpl_simple — confirm.
extern constexpr bool D[L] {0, 1, 1, 0, 1};  // Values in {0, 1}
extern constexpr int  E[L] {1, 0, 3, 2, 4}; // Values in [0, L-1]

// Perturbs the run-time input between timing rounds by adding one to every
// element of X.
void change_X_arbitrarily(int (&X)[L]) {
    for (int &element : X)
        ++element;
}

// Timing driver. Build with -DUSE_RUNTIME or -DUSE_TEMPLATE to select which
// implementation the `comp` macro expands to, then run under `time`.
// Prints the accumulated checksum so the two builds can be compared.
int main() {
    int X[L] {1, 3, 5, 7, 9}; // Any integer

#ifdef USE_RUNTIME
    #define comp(L,i,D,E,X) comp_rt<L>(i,D,E,X)
#endif
#ifdef USE_TEMPLATE
    #define comp(L,i,D,E,X) comp_tpl_simple<L,i,D,E>(X)
#endif

    // The true sum (about 8.0e13) does not fit in an int, and signed overflow
    // is undefined behaviour. Accumulating in an unsigned integer makes the
    // wrap-around well defined; with a 32-bit unsigned the value printed is
    // the sum modulo 2^32, i.e. the same checksum as before.
    unsigned total=0;
    for(int outer_reps=0; outer_reps<10000; ++outer_reps) {
        for(int inner_reps=0; inner_reps<100000; ++inner_reps) {
            total += comp(L,0,D,E,X);
            total += comp(L,1,D,E,X);
            total += comp(L,2,D,E,X);
            total += comp(L,3,D,E,X);
            total += comp(L,4,D,E,X);
        }
        // Change the input after every outer round so X is not a fixed constant.
        change_X_arbitrarily(X);
    }

    cout << total << endl; // should be 39798784
}

请注意我如何使用#define选择要使用的函数。我编译并运行：

$ clang++ SO.cpp -std=gnu++0x -O3 -DUSE_TEMPLATE -o SO && time -p ./SO
39798784  // the total value from all the calls, as a check
real 0.00
user 0.00
sys 0.00

计算1,000,000,000次需要零秒！但运行时版本需要2.7秒

$ clang++ SO.cpp -std=gnu++0x -O3 -DUSE_RUNTIME -o SO && time -p ./SO
39798784  // the total value from all the calls, as a check
real 2.70
user 2.68
sys 0.00

我在那里使用了clang3.3和-O3。

使用 g++ 4.8.2 时，我在 -O3 下收到了有关未定义行为的警告，但奇怪的是，无论是运行时版本还是模板版本，运行时间都是零秒！也许 g++ 即使在“运行时”模式下也在为我们施展编译时技巧。这里的教训是：编译器真的可能比我们更懂优化！

无论如何，如果我回到g++-4.8.2 -O2那么在任何一种情况下运行时间都是6.8秒。相当奇怪！有时添加更多O可以减慢它的速度！

解释： 在这种情况下，X实际上是在编译时知道的。它是此代码中的局部变量，并且是确定性更新的，因此编译器能够完全预测它并在编译时计算答案！看起来g ++正在这样做（非常令人印象深刻！）。因此，在我最近的实验中，我将X移到main之外作为全局变量，现在优化的行为'按预期'。 comp_tpl始终比comp_rt快得多。

Answer 3

（抱歉添加其他答案）

（我会把示例代码放在最后）

我的实验以及进一步的思考使我确信：原始代码只需稍加修改即可使用。编译器非常擅长优化，我发现有时想让它慢下来都很难！只有把 X 标记为 volatile，或者不断用 rand() 产生的随机数据修改它，才能真正使“运行时”版本变慢。

首先，如果您只有一个D向量且只有一个E向量，那么您只需将constexpr放在数组声明之前。

// Compile-time lookup tables (same values as in the question's example).
constexpr int D[] = { 0, 1, 1, 0, 1 };
constexpr int E[] = { 1, 0, 3, 2, 4 };

（如果你有多个这样的向量，并且你想为每个向量准备'pre-partial-compiled'函数，我们可以通过模板参数传递它们，正如我在其他（冗长的）答案中所讨论的那样。）

我们还需要处理原始函数中的i索引：int comp_rt(int i, array<int, L> X);。它应该是模板参数：

// The index i becomes a template parameter instead of a run-time argument.
template<size_t i>
int comp_rt(array<int, L> X);

不需要更改函数的主体。编译器现在知道 i、D[i] 和 E[i] 都是常量表达式。涉及 D[i] 和 E[i] 的表达式将被替换为它们的常量值。在编译时，测试 if(D[i]==0) 会视情况被替换为 if (true) 或 if (false)。此外，由于编译器确切地知道 E[i] 的值，循环会被展开。循环展开之后，编译器可以看出 v 只是一个很长的求和式。这时它会用显式的求和式替换循环，删除所有为零的项，并把所有常数项合并，依此类推。所有这些都是在编译时完成的。在这里我们几乎不需要再为编译器做什么。其效果等同于其他可用的、更复杂的模板解决方案。

我使用 g++ 和 clang++，并分别以 -O2 和 -O3 进行了测试。

在我的一些实验中，gcc的程序在零秒内运行，无论我需要多少次迭代。这是因为算法是确定性的，gcc可以预先计算X发生的所有事情（即使我经常更换X！）。在这种情况下，问题的一部分是我使X成为局部变量，但教训是编译器可以看到确定性程序并提前计算所有内容，即使您不想要它！ clang似乎没有在这里积极优化。

如果你有一些比你希望的慢的代码，并且你可以整理一段完整的代码来演示慢代码，那么也许我们可以建议其他小的改变。但我相信constexpr对数据的简单使用以及i的模板参数都可以解决问题。

在这个示例代码中，我还做了另一处更改：我间接地使用 tuple_size<array<char, D[i]> >::value，而不是直接使用 D[i]。这并没有真正改变含义，但我认为它会促使旧编译器在编译时求值。我的目标是尽可能保持代码原有的可读性，例如把整个函数保留在一处，而不是把它拆分成多个模板。

// Compile-time data: L is the common length, D[i] switches the computation
// on or off for index i, E[i] is the number of leading elements of X summed.
constexpr int L   = 5;
constexpr int D[] = { 0, 1, 1, 0, 1 };
constexpr int E[] = { 1, 0, 3, 2, 4 };

// comp_rt<i>(X) returns 10 when D[i] is 0, otherwise the weighted sum
// X[0]*1 + X[1]*2 + ... over the first E[i] elements of X.
// D[i] and E[i] are routed through std::array sizes so that they have to be
// constant expressions; using them as array sizes also rejects negative values.
template<int i>
int comp_rt(std::array<int, L> X) {
    using D_i_type = std::array<char, D[i]>;
    using E_i_type = std::array<char, E[i]>;
    // tuple_size yields a size_t; convert once to int so that the loop below
    // does not compare a signed index against an unsigned bound.
    constexpr int D_i = static_cast<int>(std::tuple_size<D_i_type>::value);
    constexpr int E_i = static_cast<int>(std::tuple_size<E_i_type>::value);

    if (D_i == 0) // D[i] known at compile-time
        return 10;

    int v = 0;
    for (int j = 0; j < E_i; ++j) { // E[i] known at compile-time
        v += X[j] * (j + 1); // X[j] known at run-time
    }
    return v;
}

Answer 4

使用constexpr函数，可以在编译时完成

#include <iostream>
#include <array>
#include <utility>

#include <cstddef>
#include <type_traits>

  /// Compile-time list of zero or more integral constants of type T.
  template<typename T, T... Values>
    struct integer_sequence
    {
      static_assert( std::is_integral<T>::value, "Integral type" );

      typedef T type;

      /// Number of constants held in the pack.
      static constexpr T size = sizeof...(Values);

      /// The same sequence with the constant N added at the end.
      template<T N>
        using append = integer_sequence<T, Values..., N>;

      /// The sequence extended by its own current length.
      typedef append<size> next;
    };

  // Namespace-scope definition of the static data member declared above.
  template<typename T, T... Values>
    constexpr T integer_sequence<T, Values...>::size;

  /// integer_sequence whose element type is std::size_t.
  template<std::size_t... Indices>
    using index_sequence = integer_sequence<std::size_t, Indices...>;

  namespace detail
  {
    // Recursive metafunction: builds the integer_sequence of T holding [0, N)
    // by taking the sequence for N-1 and appending its length.
    template<typename T, T Remaining, std::size_t Count>
      struct iota
      {
        static_assert( Remaining >= 0, "N cannot be negative" );

        typedef typename iota<T, Remaining-1, Count-1>::type::next type;
      };

    // Base case of the recursion: a count of zero gives the empty sequence.
    template<typename T, T Remaining>
      struct iota<T, Remaining, 0ul>
      {
        typedef integer_sequence<T> type;
      };
  }


  // make_integer_sequence<T, N> names the type integer_sequence<T, 0, ..., N-1>.
  template<typename T, T Count>
    using make_integer_sequence = typename detail::iota<T, Count, Count>::type;

  // make_index_sequence<N> names the type index_sequence<0, ..., N-1>.
  template<int Count>
    using make_index_sequence = make_integer_sequence<std::size_t, Count>;


  // index_sequence_for<A, B, C> names index_sequence<0, 1, 2>: one index per
  // type in the pack.
  template<typename... Types>
    using index_sequence_for = make_index_sequence<sizeof...(Types)>;
//--------------------My part starts here-------------------------------------------------------
// Recursive constexpr weighted sum: adds X[k]*(k+1) for every index k in
// [j, bound) and returns 0 once j has reached bound.
template <size_t N> constexpr int computebis(int bound,std::array<int,N> X,int j)
{
  return (j>=bound) ? 0
                    : X[j]*(j+1) + computebis(bound,X,j+1);
}

// Value for one index: 10 when D[index] is 0, otherwise the weighted sum of
// the first E[index] elements of X (see computebis).
template <size_t N> constexpr int compute2(std::array<int,N> D,
                                            std::array<int,N> E,
                                            std::array<int,N> X,int index)
{
  return (D[index]!=0) ? computebis(E[index],X,0) : 10;
}


// Builds the result array by expanding compute2 once per index in the pack.
template <size_t N,std::size_t... Indices> constexpr std::array<int,N> mfill(std::array<int,N> D,
                                                                            std::array<int,N> E,
                                                                            std::array<int,N> X,
                                                                            index_sequence<Indices...>)
{
  return std::array<int,N>{{ compute2(D,E,X,Indices)... }};
}

template <size_t N> constexpr std::array<int,N> mfill(std::array<int,N> D,std::array<int,N> E,std::array<int,N> X)
{
  return mfill(D,E,X,make_index_sequence<N>{});
}


// Demo driver: evaluates the computation for every index with the example
// data and prints one result per line (10, 0, 22, 10, 50).
int main(int argc, char *argv[])
{

  std::array<int,5> D= {0,1,1,0,1};
  std::array<int,5> E= {1,0,3,2,4};
  std::array<int,5> X= {1,3,5,7,9};
  // Intended to be evaluated at compile time.
  // NOTE(review): D, E and X are not constexpr and X2 is only `const`, so
  // nothing here forces compile-time evaluation — confirm whether `constexpr`
  // was intended (it would need a constexpr std::array::operator[]).
  const auto X2 =  mfill(D,E,X);

  for(auto e:X2){
    std::cout<<e<<'\n';
  }
  return 0;
}

编辑：代码已更新；第一部分是受“Create N-element constexpr array in C++11”这一问题的启发而完成的。

使用模板和/或constexpr在编译时构建函数

4 个答案: