OP的问题

Question

我喜欢D的某些功能，但想知道这些功能是否会带来运行时的性能损失？

为了比较，我实现了一个简单的程序，用C ++和D计算许多短向量的标量积。结果令人惊讶：

D：18.9秒[见下文最终运行时间]
C ++：3.8 s

C++真的快了将近五倍吗？还是我在D程序中犯了错误？

我在一台较新的Linux台式机上用g++ -O3（gcc-snapshot 2011-02-19）编译了C++程序，用dmd -O（dmd 2.052）编译了D程序。结果在多次运行中可以重现，标准差可以忽略不计。

这里是C ++程序：

#include <iostream>
#include <random>
#include <chrono>
#include <string>

#include <vector>
#include <array>

// Millisecond duration stored in a long; every timing in this program is
// reported in this unit.
typedef std::chrono::duration<long, std::ratio<1, 1000>> millisecs;

// Returns the whole milliseconds elapsed since `time` and resets `time` to
// the current instant, so consecutive calls measure consecutive intervals.
//
// Clock is deduced from the time_point and is also the clock that gets
// sampled, so the function works with any clock type (system_clock,
// steady_clock, ...) rather than only std::chrono::system_clock.
template <typename Clock>
long time_since(std::chrono::time_point<Clock>& time) {
  // Sample the clock once: the same instant ends this interval and starts
  // the next one, so no time is lost between the two.
  const std::chrono::time_point<Clock> now = Clock::now();
  const long elapsed = std::chrono::duration_cast<millisecs>(now - time).count();
  time = now;
  return elapsed;
}

// Benchmark dimensions: N vectors, each holding `size` elements.
const long N = 20000;
const int size = 10;

// Element type, the wider type used for the running total in main, and the
// container used for one short vector.
using value_type = int;
using result_type = long long;
using vector_t = std::vector<value_type>;
using size_type = vector_t::size_type;

// Dot product of x and y taken over the first x.size() positions.
// y must hold at least x.size() elements. The sum is accumulated in
// value_type.
inline value_type scalar_product(const vector_t& x, const vector_t& y) {
  value_type sum = 0;
  vector_t::const_iterator rhs = y.begin();
  for (const value_type lhs : x) {
    sum += lhs * *rhs;
    ++rhs;
  }
  return sum;
}

int main() {
  auto tm_before = std::chrono::system_clock::now();

  // 1. allocate and fill randomly many short vectors
  vector_t* xs = new vector_t [N];
  for (int i = 0; i < N; ++i) {
    xs[i] = vector_t(size);
      }
  std::cerr << "allocation: " << time_since(tm_before) << " ms" << std::endl;

  std::mt19937 rnd_engine;
  std::uniform_int_distribution<value_type> runif_gen(-1000, 1000);
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < size; ++j)
      xs[i][j] = runif_gen(rnd_engine);
  std::cerr << "random generation: " << time_since(tm_before) << " ms" << std::endl;

  // 2. compute all pairwise scalar products:
  time_since(tm_before);
  result_type avg = 0;
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < N; ++j) 
      avg += scalar_product(xs[i], xs[j]);
  avg = avg / N*N;
  auto time = time_since(tm_before);
  std::cout << "result: " << avg << std::endl;
  std::cout << "time: " << time << " ms" << std::endl;
}

这里有D版本：

import std.stdio;
import std.datetime;
import std.random;

// Benchmark dimensions: N vectors, each holding `size` elements.
const long N = 20000;
const int size = 10;

alias int value_type;
alias long result_type;
alias value_type[] vector_t;
// Array lengths have type size_t, whose width depends on the target.
// A fixed uint cannot receive x.length on 64-bit builds without a cast.
alias size_t size_type;

// Dot product of x and y taken over the first x.length positions.
// y must hold at least x.length elements. The sum is accumulated in
// value_type.
value_type scalar_product(const ref vector_t x, const ref vector_t y) {
  value_type res = 0;
  size_type siz = x.length;
  for (size_type i = 0; i < siz; ++i)
    res += x[i] * y[i];
  return res;
}

// Benchmark driver: allocates N vectors of `size` ints, fills them with
// random values, sums scalar_product over every ordered pair of vectors,
// and prints the elapsed time of each phase.
int main() {
  auto tm_before = Clock.currTime();

  // 1. allocate and fill randomly many short vectors
  vector_t[] xs;
  xs.length = N;
  for (int i = 0; i < N; ++i) {
    xs[i].length = size;
  }
  // The elapsed time is a Duration, which is formatted with %s; %i is not
  // a std.format specifier.
  writefln("allocation: %s ", (Clock.currTime() - tm_before));
  tm_before = Clock.currTime();

  for (int i = 0; i < N; ++i)
    for (int j = 0; j < size; ++j)
      xs[i][j] = uniform(-1000, 1000);
  writefln("random: %s ", (Clock.currTime() - tm_before));
  tm_before = Clock.currTime();

  // 2. compute all pairwise scalar products:
  result_type avg = cast(result_type) 0;
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < N; ++j)
      avg += scalar_product(xs[i], xs[j]);
  // Parenthesised on purpose: `avg / N*N` parses as `(avg / N) * N` and
  // would not divide by the number of pairs.
  avg = avg / (N * N);
  // Read the clock before printing so the reported time excludes the I/O.
  auto time = Clock.currTime() - tm_before;
  writefln("result: %d", avg);
  writefln("scalar products: %s ", time);

  return 0;
}

Answer 1

要启用所有优化并禁用所有安全检查，请使用以下DMD标记编译D程序：

-O -inline -release -noboundscheck

编辑：我用g ++，dmd和gdc尝试了你的程序。 dmd确实落后了，但gdc的性能非常接近g ++。我使用的命令行是gdmd -O -release -inline（gdmd是gdc的一个包装器，接受dmd选项）。

查看汇编列表，dmd和gdc似乎都没有内联scalar_product，但g++/gdc确实生成了MMX指令，因此它们可能对循环做了自动向量化。

Answer 2

拖慢D的一个重要因素是质量欠佳的垃圾收集实现。不对GC造成很大压力的基准测试，其性能会与使用相同编译器后端编译的C和C++代码非常接近；而对GC造成很大压力的基准测试则会显示D表现糟糕。但请放心，这只是一个（尽管很严重的）实现质量问题，而不是注定缓慢。此外，D允许你选择不使用GC，并在性能关键的部分自行调整内存管理，同时在其余95％对性能不太敏感的代码中继续使用GC。

我最近在改进GC性能方面投入了一些精力，结果相当显著，至少在综合基准测试中如此。希望这些改动能被集成到接下来的某个版本中，从而缓解这个问题。

Answer 3

这是一个非常有启发性的帖子，感谢OP和各位回答者所做的工作。

一个注意事项 - 此测试不是评估抽象/特征惩罚的一般问题，甚至不评估后端质量的问题。它侧重于几乎一个优化（循环优化）。我认为gcc的后端比dmd更精致，这是公平的，但假设它们之间的差距对于所有任务来说都是一样大是错误的。

Answer 4

绝对看起来像是一个实施质量问题。

我使用OP的代码进行了一些测试并做了一些更改。在假设数组必须动态分配（xs及其关联的标量）的前提下，我实际上让D在LDC与clang++的对比中跑得更快。具体数字见下文。

OP的问题

是否有意在C ++的每次迭代中使用相同的种子，而对D来说却不是这样？

设置

我调整了原始D源（称为scalar.d），使其可以在平台之间移植。这只涉及更改用于访问和修改数组大小的数字类型。

在此之后，我做了以下更改：

使用uninitializedArray来避免对xs中的标量进行默认初始化（这可能是造成最大差异的改动）。这很重要，因为D通常会悄悄地对所有内容进行默认初始化，而C++不会。
将打印代码提取为单独的函数，并将writefln替换为writeln
将导入更改为有选择性
使用pow运算符（^^）而不是手动乘法来计算平均值
删除size_type并使用新的index_type别名

...因此产生scalar2.d（pastebin）：

    import std.stdio : writeln;
    import std.datetime : Clock, Duration;
    import std.array : uninitializedArray;
    import std.random : uniform;

    // Type aliases shared by the whole benchmark.
    alias result_type = long;
    alias value_type = int;
    alias vector_t = value_type[];
    alias index_type = typeof(vector_t.init.length);// Make index integrals portable - Linux is ulong, Win8.1 is uint

    // Benchmark dimensions: N vectors, each holding `size` elements.
    immutable long N = 20000;
    immutable int size = 10;

    // Dot product of x and y over indices 0 .. size-1.
    // NOTE: the loop bound is the module-level constant `size`, not x.length,
    // so both slices must hold at least `size` elements. This function uses
    // a plain indexed for loop.
    value_type scalar_product(in ref vector_t x, in ref vector_t y) { // "in" is the same as "const" here
      value_type res = 0;
      for(index_type i = 0; i < size; ++i)
        res += x[i] * y[i];
      return res;
    }

    // Benchmark driver: allocates N vectors of `size` ints, fills them with
    // random values, sums scalar_product over every ordered pair of vectors,
    // and prints the elapsed time of each phase.
    int main() {
      auto tm_before = Clock.currTime;
      // Prints the time elapsed since tm_before under the given label, then
      // restarts the clock by resetting tm_before.
      auto countElapsed(in string taskName) { // Factor out printing code
        writeln(taskName, ": ", Clock.currTime - tm_before);
        tm_before = Clock.currTime;
      }

      // 1. allocate and fill randomly many short vectors
      vector_t[] xs = uninitializedArray!(vector_t[])(N);// Avoid default inits of inner arrays
      for(index_type i = 0; i < N; ++i)
        xs[i] = uninitializedArray!(vector_t)(size);// Avoid more default inits of values
      countElapsed("allocation");

      // Every element of every inner array is assigned here before the
      // pairwise loop below reads it.
      for(index_type i = 0; i < N; ++i)
        for(index_type j = 0; j < size; ++j)
          xs[i][j] = uniform(-1000, 1000);
      countElapsed("random");

      // 2. compute all pairwise scalar products:
      result_type avg = 0;
      for(index_type i = 0; i < N; ++i)
        for(index_type j = 0; j < N; ++j)
          avg += scalar_product(xs[i], xs[j]);
      avg /= N ^^ 2;// Replace manual multiplication with pow operator
      // The result is printed before the timer is read, so the time reported
      // for this phase includes the writeln call.
      writeln("result: ", avg);
      countElapsed("scalar products");

      return 0;
    }

在测试scalar2.d（优先考虑速度优化）之后，出于好奇，我将main中的循环替换为等效的foreach，并将其称为scalar3.d（pastebin）：

    import std.stdio : writeln;
    import std.datetime : Clock, Duration;
    import std.array : uninitializedArray;
    import std.random : uniform;

    // Type aliases shared by the whole benchmark.
    alias result_type = long;
    alias value_type = int;
    alias vector_t = value_type[];
    alias index_type = typeof(vector_t.init.length);// Make index integrals portable - Linux is ulong, Win8.1 is uint

    // Benchmark dimensions: N vectors, each holding `size` elements.
    immutable long N = 20000;
    immutable int size = 10;

    // Dot product of x and y over indices 0 .. size-1.
    // NOTE: the loop bound is the module-level constant `size`, not x.length,
    // so both slices must hold at least `size` elements. The foreach
    // conversions of this variant are in main; this function keeps the
    // indexed for loop.
    value_type scalar_product(in ref vector_t x, in ref vector_t y) { // "in" is the same as "const" here
      value_type res = 0;
      for(index_type i = 0; i < size; ++i)
        res += x[i] * y[i];
      return res;
    }

    // Benchmark driver, foreach variant: allocates N vectors of `size` ints,
    // fills them with random values, sums scalar_product over every ordered
    // pair of vectors, and prints the elapsed time of each phase.
    int main() {
      auto tm_before = Clock.currTime;
      // Prints the time elapsed since tm_before under the given label, then
      // restarts the clock by resetting tm_before.
      auto countElapsed(in string taskName) { // Factor out printing code
        writeln(taskName, ": ", Clock.currTime - tm_before);
        tm_before = Clock.currTime;
      }

      // 1. allocate and fill randomly many short vectors
      vector_t[] xs = uninitializedArray!(vector_t[])(N);// Avoid default inits of inner arrays
      // `ref` is required so the assignment stores into xs itself.
      foreach(ref x; xs)
        x = uninitializedArray!(vector_t)(size);// Avoid more default inits of values
      countElapsed("allocation");

      // Every element of every inner array is assigned here before the
      // pairwise loop below reads it.
      foreach(ref x; xs)
        foreach(ref val; x)
          val = uniform(-1000, 1000);
      countElapsed("random");

      // 2. compute all pairwise scalar products:
      result_type avg = 0;
      foreach(const ref x; xs)
        foreach(const ref y; xs)
          avg += scalar_product(x, y);
      avg /= N ^^ 2;// Replace manual multiplication with pow operator
      // The result is printed before the timer is read, so the time reported
      // for this phase includes the writeln call.
      writeln("result: ", avg);
      countElapsed("scalar products");

      return 0;
    }

我使用基于LLVM的编译器编译了每个测试，因为在性能方面，LDC似乎是D编译的最佳选择。在我的x86_64 Arch Linux安装中，我使用了以下软件包：

clang 3.6.0-3
ldc 1:0.15.1-4
dtools 2.067.0-2

我使用以下命令编译每个程序：

C ++：clang++ scalar.cpp -o"scalar.cpp.exe" -std=c++11 -O3
D：rdmd --compiler=ldc2 -O3 -boundscheck=off <sourcefile>

结果

每个版本的源的结果（pastebin）如下：

scalar.cpp（原始C ++）：

allocation: 2 ms

random generation: 12 ms

result: 29248300000

time: 2582 ms

C ++将标准设置为 2582 ms 。

scalar.d（已修改的OP来源）：

allocation: 5 ms, 293 μs, and 5 hnsecs 

random: 10 ms, 866 μs, and 4 hnsecs 

result: 53237080000

scalar products: 2 secs, 956 ms, 513 μs, and 7 hnsecs

这是 ~2957 ms 。比C ++实现慢，但不是太多。

scalar2.d（索引/长度类型更改和uninitializedArray优化）：

allocation: 2 ms, 464 μs, and 2 hnsecs

random: 5 ms, 792 μs, and 6 hnsecs

result: 59

scalar products: 1 sec, 859 ms, 942 μs, and 9 hnsecs

换句话说， ~1860 ms 。到目前为止，这是领先的。

scalar3.d（foreaches）：

allocation: 2 ms, 911 μs, and 3 hnsecs

random: 7 ms, 567 μs, and 8 hnsecs

result: 189

scalar products: 2 secs, 182 ms, and 366 μs

~2182 ms 比scalar2.d慢，但比C ++版本更快。

结论

通过正确的优化，D实现比使用可用的基于LLVM的编译器的等效C ++实现更快。目前大多数应用程序的D和C ++之间的差距似乎只是基于当前实现的限制。

Answer 5

dmd是该语言的参考实现，因此大多数工作都放在前端修复错误而不是优化后端。

“in”在您的情况下更快，因为您正在使用作为引用类型的动态数组。使用ref引入另一级别的间接（通常用于更改数组本身而不仅仅是内容）。

向量通常用结构实现，其中const ref非常有意义。请参阅smallptD与smallpt的相关内容，了解包含向量运算和随机性负载的实际示例。

请注意，64位也可能造成差异。我曾经忽略了一点：在x64上gcc编译的是64位代码，而dmd仍默认生成32位代码（当64位代码生成成熟后会改变）。使用“dmd -m64 ...”后速度有了显著提升。

Answer 6

C ++或D是否更快可能高度依赖于您正在做的事情。我认为，在将编写良好的C ++与编写良好的D代码进行比较时，它们通常要么具有相似的速度，要么C ++会更快，但是特定的编译器设法优化可能会产生很大的影响，除了语言本身。

然而，在几种情况下，D很有可能在速度上击败C++。首先想到的是字符串处理。由于D的数组切片功能，字符串（以及一般的数组）可以比在C++中更快地处理。对于D1，Tango的XML处理器速度极快，这主要得益于D的数组切片功能（希望目前正在为Phobos开发的解析器完成后，D2也会有同样快速的XML解析器）。因此，最终D或C++哪个更快将取决于你正在做什么。

现在，我很惊讶你在这个特定情况下看到了如此大的速度差异，但我预计随着dmd的改进，这种情况会有所改善。使用gdc可能会产生更好的结果，并且由于它基于gcc，很可能是对语言本身（而不是后端）更贴近的比较。不过，如果有许多办法可以加速dmd生成的代码，我一点也不会感到惊讶。毫无疑问，目前gcc比dmd更成熟，而代码优化正是代码成熟的主要成果之一。

最重要的是，重要的是dmd对你的特定应用程序的表现如何，但我确实同意，知道C ++和D在一般情况下的比较肯定会很好。从理论上讲，它们应该基本相同，但它实际上取决于实现。我认为需要一套全面的基准才能真正测试两者目前的比较情况。

Answer 7

你可以在D中编写C风格的代码，所以至于哪个更快，将取决于很多因素：

您使用的编译器
您使用的功能
你如何积极地优化

第一点的差异不应拿来比较。第二点可能会让C++占优，因为它的重量级功能更少。第三点很有趣：D代码在某些方面更容易优化，因为通常它更容易理解。此外，它还能进行大量的生成式编程，使冗长、重复但快速的代码可以用更简短的形式编写。

Answer 8

似乎是一个实施质量问题。例如，以下是我一直在测试的内容：

import std.datetime, std.stdio, std.random;

// Selects the hand-inlined dot product in main (the version(ManualInline)
// branch there); without it main calls scalar_product instead.
version = ManualInline;

// Benchmark dimensions: N vectors, each holding Size elements.
immutable N = 20000;
immutable Size = 10;

alias int value_type;
alias long result_type;
alias value_type[] vector_type;

// Dot product of x and y, accumulated in result_type.
// Precondition (in-contract): both slices have the same length.
result_type scalar_product(in vector_type x, in vector_type y)
in
{
    assert(x.length == y.length);
}
body
{
    result_type result = 0;

    foreach(i; 0 .. x.length)
        result += x[i] * y[i];

    return result;
}

// Benchmark driver: allocates N vectors of Size ints, fills them with random
// values, sums the dot product of every ordered pair of vectors, and prints
// the elapsed time of each phase.
void main()
{   
    auto startTime = Clock.currTime();

    // 1. allocate vectors
    vector_type[] vectors = new vector_type[N];
    foreach(ref vec; vectors)
        vec = new value_type[Size];

    auto time = Clock.currTime() - startTime;
    writefln("allocation: %s ", time);
    startTime = Clock.currTime();

    // 2. randomize vectors
    foreach(ref vec; vectors)
        foreach(ref e; vec)
            e = uniform(-1000, 1000);

    time = Clock.currTime() - startTime;
    writefln("random: %s ", time);
    startTime = Clock.currTime();

    // 3. compute all pairwise scalar products
    result_type avg = 0;

    foreach(vecA; vectors)
        foreach(vecB; vectors)
        {
            // The two branches compute the same sum: the first is the body of
            // scalar_product written out in place, the second calls it.
            version(ManualInline)
            {
                result_type result = 0;

                foreach(i; 0 .. vecA.length)
                    result += vecA[i] * vecB[i];

                avg += result;
            }
            else
            {
                avg += scalar_product(vecA, vecB);
            }
        }

    avg = avg / (N * N);

    // The timer is read before the result is printed, so the reported time
    // excludes the output calls below.
    time = Clock.currTime() - startTime;
    writefln("scalar products: %s ", time);
    writefln("result: %s", avg);
}

定义ManualInline时我得到28秒，不定义时得到32秒。所以编译器甚至没有内联这个简单的函数，而我认为它显然应该被内联。

（我的命令行是dmd -O -noboundscheck -inline -release ...。）

D与C ++相比有多快？

8 个答案:

OP的问题

设置

结果

结论