我正在尝试使用valarray,因为它在操作矢量和矩阵时非常类似于MATLAB。我首先进行了一些性能检查,发现valarray无法达到Stroustrup在 C++ programming language 一书中声明的性能。
测试程序实际上执行了 5M(5×1024×1024)次 double 类型的乘法。我原以为 c = a * b 的速度至少可以与用 for
循环逐元素做 double 乘法相媲美,但我完全错了。我在几台计算机上分别用 Microsoft Visual C++ 6.0 和 Visual Studio 2008 做了尝试。
顺便说一下,我使用以下代码在MATLAB上测试:
% Reference timing of an element-wise product in MATLAB.
% Element count: 5*1024*1024.
len = 5*1024*1024;
% Two len-by-1 column vectors of random numbers.
a = rand(len, 1);
b = rand(len, 1);
% Result vector, created before the timer is started.
c = zeros(len, 1);
% Only the element-wise product sits between tic and toc.
tic;
c = a.*b;
toc;
结果是46毫秒。这个时间精度不高;它只作为参考。
代码是:
#include <iostream>
#include <valarray>
#include <iostream>
#include "windows.h"
using namespace std;
SYSTEMTIME stime;
LARGE_INTEGER sys_freq;
double gettime_hp();
// Benchmark driver: runs eight passes, each timing three ways of computing
// the element-wise product of two arrays of N doubles:
//   1. a plain loop over raw double pointers,
//   2. the whole-array expression c = a*b,
//   3. a plain loop using valarray's operator[].
// Every timing is printed in milliseconds.
int main()
{
    enum { N = 5*1024*1024 };

    valarray<double> a(N);
    valarray<double> b(N);
    valarray<double> c(N);

    // gettime_hp() divides by sys_freq, so it is filled in before any timing.
    QueryPerformanceFrequency(&sys_freq);

    int pass;
    int k;
    for (pass = 0; pass < 8; ++pass)
    {
        // Fresh pseudo-random operands for every pass.
        for (k = 0; k < N; ++k)
        {
            a[k] = rand();
            b[k] = rand();
        }

        double* lhs = &a[0];
        double* rhs = &b[0];
        double* out = &c[0];

        // 1) raw pointers
        double started = gettime_hp();
        for (k = 0; k < N; ++k)
        {
            out[k] = lhs[k] * rhs[k];
        }
        double elapsed = gettime_hp() - started;
        cout << "double operator* " << elapsed << " ms\n";

        // 2) whole-array multiplication and assignment
        started = gettime_hp();
        c = a*b;
        elapsed = gettime_hp() - started;
        cout << "valarray operator* " << elapsed << " ms\n";

        // 3) element access through valarray::operator[]
        started = gettime_hp();
        for (k = 0; k < N; ++k)
        {
            c[k] = a[k] * b[k];
        }
        elapsed = gettime_hp() - started;
        cout << "valarray[i] operator* " << elapsed << " ms\n";

        cout << "------------------------------------------------------\n";
    }
}
double gettime_hp()
{
LARGE_INTEGER tick;
extern LARGE_INTEGER sys_freq;
QueryPerformanceCounter(&tick);
return (double)tick.QuadPart * 1000.0 / sys_freq.QuadPart;
}
运行结果(Release 模式,开启最大速度优化):
double operator* 52.3019 ms
valarray operator* 128.338 ms
valarray[i] operator* 43.1801 ms
------------------------------------------------------
double operator* 43.4036 ms
valarray operator* 145.533 ms
valarray[i] operator* 44.9121 ms
------------------------------------------------------
double operator* 43.2619 ms
valarray operator* 158.681 ms
valarray[i] operator* 43.4871 ms
------------------------------------------------------
double operator* 42.7317 ms
valarray operator* 173.164 ms
valarray[i] operator* 80.1004 ms
------------------------------------------------------
double operator* 43.2236 ms
valarray operator* 158.004 ms
valarray[i] operator* 44.3813 ms
------------------------------------------------------
Debug 模式(使用相同的优化设置):
double operator* 41.8123 ms
valarray operator* 201.484 ms
valarray[i] operator* 41.5452 ms
------------------------------------------------------
double operator* 40.2238 ms
valarray operator* 215.351 ms
valarray[i] operator* 40.2076 ms
------------------------------------------------------
double operator* 40.5859 ms
valarray operator* 232.007 ms
valarray[i] operator* 40.8803 ms
------------------------------------------------------
double operator* 40.9734 ms
valarray operator* 234.325 ms
valarray[i] operator* 40.9711 ms
------------------------------------------------------
double operator* 41.1977 ms
valarray operator* 234.409 ms
valarray[i] operator* 41.1429 ms
------------------------------------------------------
double operator* 39.7754 ms
valarray operator* 234.26 ms
valarray[i] operator* 39.6338 ms
------------------------------------------------------
答案 0 :(得分:23)
我刚刚在Linux x86-64系统(Sandy Bridge CPU)上尝试过它:
gcc 4.5.0:
double operator* 9.64185 ms
valarray operator* 9.36987 ms
valarray[i] operator* 9.35815 ms
英特尔ICC 12.0.2:
double operator* 7.76757 ms
valarray operator* 9.60208 ms
valarray[i] operator* 7.51409 ms
在这两种情况下,我只使用了-O3
而没有其他与优化相关的标志。
看起来MS C ++编译器和/或valarray实现很糟糕。
以下是针对Linux修改的OP代码:
#include <iostream>
#include <valarray>
#include <iostream>
#include <ctime>
using namespace std ;
double gettime_hp();
int main()
{
enum { N = 5*1024*1024 };
valarray<double> a(N), b(N), c(N) ;
int i,j;
for( j=0 ; j<8 ; ++j )
{
for( i=0 ; i<N ; ++i )
{
a[i]=rand();
b[i]=rand();
}
double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0] ;
double dtime=gettime_hp();
for( i=0 ; i<N ; ++i ) c1[i] = a1[i] * b1[i] ;
dtime=gettime_hp()-dtime;
cout << "double operator* " << dtime << " ms\n" ;
dtime=gettime_hp();
c = a*b ;
dtime=gettime_hp()-dtime;
cout << "valarray operator* " << dtime << " ms\n" ;
dtime=gettime_hp();
for( i=0 ; i<N ; ++i ) c[i] = a[i] * b[i] ;
dtime=gettime_hp()-dtime;
cout << "valarray[i] operator* " << dtime<< " ms\n" ;
cout << "------------------------------------------------------\n" ;
}
}
// Returns a timestamp in milliseconds for measuring elapsed time.
//
// Only the difference between two calls is meaningful.  CLOCK_MONOTONIC is
// used so that an adjustment of the wall clock between two readings cannot
// distort (or make negative) a measured interval.
// NOTE: on glibc older than 2.17 clock_gettime() requires linking with -lrt.
double gettime_hp()
{
    struct timespec timestamp;
    // The address-of operator had been mangled into a multiplication sign
    // ("×tamp") by HTML-entity decoding; the argument is &timestamp.
    clock_gettime(CLOCK_MONOTONIC, &timestamp);
    // seconds -> ms plus nanoseconds -> ms
    return timestamp.tv_sec * 1000.0 + timestamp.tv_nsec * 1.0e-6;
}
答案 1 :(得分:12)
我怀疑 c = a*b
比逐个元素执行运算慢得多的原因在于:
template<class T> valarray<T> operator*
(const valarray<T>&, const valarray<T>&);
运算符必须分配内存以将结果放入,然后按值返回。
即使使用“交换优化”(swaptimization)来执行复制,该函数仍然具有以下开销:
- 为结果 valarray 分配新的内存块
- 初始化新的 valarray(这一步可能会被优化掉)
- 将结果写入新的 valarray
- 在新的 valarray 被初始化或写入结果值时,将其内存分页调入
- 释放被结果替换掉的旧 valarray
答案 2 :(得分:4)
valarray 的意义在于能在向量机上快速运行,而 x86 机器并不是向量机。
在非向量机上,一个好的实现应该能够达到与下面这样的代码相当的性能:
for (i=0; i < N; ++i)
c1[i] = a1[i] * b1[i];
而糟糕的实现当然做不到。除非硬件中有某些可以加速并行处理的机制,否则这已经非常接近你所能达到的最佳性能了。
答案 3 :(得分:3)
我最终通过使用延迟求值(delayed evaluation)解决了这个问题。代码可能很难看,因为我刚开始学习这些 C++ 高级概念。
以下是代码:
#include <iostream>
#include <valarray>
#include <iostream>
#include "windows.h"
using namespace std;
SYSTEMTIME stime;
LARGE_INTEGER sys_freq;
double gettime_hp();
// Delayed evaluation (a minimal expression template) for c = a*b.
//
// With a plain valarray, `c = a*b` first builds a temporary holding the
// product, assigns it to `c` and then destroys the temporary, which makes the
// statement much slower than a hand-written loop.  Here `a*b` only creates a
// small VecMul proxy that remembers its two operands; the multiplication is
// performed by Vector::operator=(const VecMul&), which writes directly into
// the destination, so no temporary array is created.
class Vector;

// Unevaluated element-wise product va * vb.
// Stores references only: both operands must outlive the proxy, so it should
// be consumed within the expression that created it.
class VecMul
{
public:
    const Vector& va;
    const Vector& vb;

    VecMul(const Vector& v1, const Vector& v2): va(v1), vb(v2) {}

    // Evaluates the product into a new Vector.  Defined after Vector, because
    // it needs the complete type.
    operator Vector();
};

// Array of doubles that supports delayed evaluation of element-wise products.
//
// The elements live in the public std::valarray<double> base sub-object.
// (Previously the base was left empty while the data sat in a second valarray
// allocated with `new` and never deleted; that leaked the storage, made the
// inherited valarray operations see an empty array, and made copies of a
// Vector share one buffer.)  Copy construction and copy assignment now copy
// the elements.
// std::valarray has no virtual destructor: do not delete a Vector through a
// pointer to std::valarray<double>.
class Vector : public std::valarray<double>
{
    typedef std::valarray<double> Base;

public:
    // Creates n elements; std::valarray value-initialises them, i.e. 0.0.
    explicit Vector(int n) : Base(n) {}

    // Evaluates the delayed product m.va[i] * m.vb[i] straight into *this.
    // Precondition (unchecked): *this, m.va and m.vb have the same size.
    // Aliasing such as `a = a*b` is fine, because element i is read before
    // element i is written.
    Vector& operator=(const VecMul& m)
    {
        const int count = m.va.size();
        for (int i = 0; i < count; ++i)
            (*this)[i] = m.va[i] * m.vb[i];
        return *this;
    }

    // Element access.  The signature (a mutable reference from a const member)
    // is kept for compatibility with existing callers.  It goes through the
    // base class's non-const operator[] because the const overload only
    // yields a read-only element.
    double& operator[](int i) const
    {
        return const_cast<Vector*>(this)->Base::operator[](i);
    }

    // Number of elements, as int to match the loops that use it.
    int size() const { return static_cast<int>(Base::size()); }
};

// Materialises the delayed product, e.g. for `Vector d = a*b;`.
inline VecMul::operator Vector()
{
    Vector result(va.size());
    result = *this;
    return result;
}

// Builds the proxy for a delayed element-wise product; no arithmetic is done
// here.
inline VecMul operator*(const Vector& v1, const Vector& v2)
{
    return VecMul(v1, v2);
}
int main()
{
enum {N = 5*1024*1024};
Vector a(N), b(N), c(N);
QueryPerformanceFrequency(&sys_freq);
int i, j;
for (j=0 ; j<8 ; ++j)
{
for (i=0 ; i<N ; ++i)
{
a[i] = rand();
b[i] = rand();
}
double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0];
double dtime = gettime_hp();
for (i=0 ; i<N ; ++i)
c1[i] = a1[i] * b1[i];
dtime = gettime_hp()-dtime;
cout << "double operator* " << dtime << " ms\n";
dtime = gettime_hp();
c = a*b;
dtime = gettime_hp()-dtime;
cout << "valarray operator* " << dtime << " ms\n";
dtime = gettime_hp();
for (i=0 ; i<N ; ++i)
c[i] = a[i] * b[i];
dtime = gettime_hp() - dtime;
cout << "valarray[i] operator* " << dtime << " ms\n";
cout << "------------------------------------------------------\n";
}
}
double gettime_hp()
{
LARGE_INTEGER tick;
extern LARGE_INTEGER sys_freq;
QueryPerformanceCounter(&tick);
return (double)tick.QuadPart*1000.0/sys_freq.QuadPart;
}
Visual Studio上的运行结果是:
double operator* 41.2031 ms
valarray operator* 43.8407 ms
valarray[i] operator* 42.49 ms
答案 4 :(得分:1)
我正在x64版Visual Studio 2010中编译。我稍微改变了你的代码:
// Excerpt from the benchmark loop body, changed to multiply in place (a *= b)
// so that no separate result array is assigned.
// c1 is still initialised here but is not used in this excerpt.
double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0];
// 1) in-place multiply through raw pointers
double dtime = gettime_hp();
for (i=0 ; i<N ; ++i)
a1[i] *= b1[i];
dtime = gettime_hp() - dtime;
cout << "double operator* " << dtime << " ms\n";
// 2) whole-array compound assignment
dtime = gettime_hp();
a *= b;
dtime = gettime_hp() - dtime;
cout << "valarray operator* " << dtime << " ms\n";
// 3) in-place multiply through valarray's operator[]
dtime = gettime_hp();
for (i=0 ; i<N ; ++i)
a[i] *= b[i];
dtime = gettime_hp() - dtime;
cout << "valarray[i] operator* " << dtime<< " ms\n";
cout << "------------------------------------------------------\n" ;
在这里你可以看到我使用了 *= 而不是 c = a * b
。在更现代的数学库中,会使用非常复杂的表达式模板机制来消除这个问题。在这种情况下,我实际上从 valarray 获得了非常快的结果,尽管这可能只是因为内容已经在缓存中。你看到的开销只是冗余的临时对象,它并不是 valarray 所特有的——使用 std::string
之类的类型时你也会看到相同的行为。
答案 5 :(得分:-1)
嗯..我测试了Blitz++,它与valarray相同......而且,Blitz ++ []
运算符非常慢。
#include <blitz/array.h>
#include <iostream>
// Platform selection for the timer.
// Windows builds (WIN32 defined) use the performance counter; every other
// build, including one that defines LINUX, uses POSIX clock_gettime().
#ifdef WIN32
#include "windows.h"
LARGE_INTEGER sys_freq;   // counter ticks per second, set in main()
SYSTEMTIME stime;         // Win32-only type, so it belongs inside the guard
#else
#include <ctime>          // this line had lost its "#include"
#endif
using namespace std;

// Returns a timestamp in milliseconds; only the difference between two calls
// is meaningful.
//
// Windows: sys_freq must already have been filled in by
// QueryPerformanceFrequency(), which main() does under the same WIN32 guard.
// Elsewhere: CLOCK_MONOTONIC is read, so wall-clock adjustments cannot
// distort a measured interval.
//
// Plain `inline` replaces the MSVC-only `__forceinline`, which does not
// compile on other toolchains.
inline double gettime_hp()
{
#ifdef WIN32
    LARGE_INTEGER tick;
    QueryPerformanceCounter(&tick);
    return static_cast<double>(tick.QuadPart) * 1000.0 / sys_freq.QuadPart;
#else
    struct timespec timestamp;
    // The address-of operator had been mangled into "×tamp" by HTML-entity
    // decoding; the argument is &timestamp.
    clock_gettime(CLOCK_MONOTONIC, &timestamp);
    return timestamp.tv_sec * 1000.0 + timestamp.tv_nsec * 1.0e-6;
#endif
}
BZ_USING_NAMESPACE(blitz)
// Blitz++ variant of the benchmark driver.  Eight passes; each pass refills
// the operands with rand() and then times, in milliseconds, a raw-pointer
// loop, the array expression c = a*b, and a loop through the array's
// operator[].
int main()
{
    int N = 5*1024*1024;

    // Three rank-1 arrays of double, N elements each.
    Array<double, 1> a(N);
    Array<double, 1> b(N);
    Array<double, 1> c(N);

    int pass;
    int k;

#ifdef WIN32
    // gettime_hp() divides by sys_freq on Windows.
    QueryPerformanceFrequency(&sys_freq);
#endif

    for (pass = 0; pass < 8; ++pass)
    {
        for (k = 0; k < N; ++k)
        {
            a[k] = rand();
            b[k] = rand();
        }

        double* lhs = a.data();
        double* rhs = b.data();
        double* out = c.data();

        // Raw-pointer loop.
        double started = gettime_hp();
        for (k = 0; k < N; ++k)
        {
            out[k] = lhs[k] * rhs[k];
        }
        double elapsed = gettime_hp() - started;
        cout << "double operator* " << elapsed << " ms\n";

        // Whole-array expression.
        started = gettime_hp();
        c = a*b;
        elapsed = gettime_hp() - started;
        cout << "blitz operator* " << elapsed << " ms\n";

        // Element-wise loop through operator[].
        started = gettime_hp();
        for (k = 0; k < N; ++k)
        {
            c[k] = a[k] * b[k];
        }
        elapsed = gettime_hp() - started;
        cout << "blitz[i] operator* " << elapsed << " ms\n";

        cout << "------------------------------------------------------\n";
    }
}
答案 6 :(得分:-1)
我认为 Michael Burr 的回答是正确的。也许您可以创建一个虚拟(占位)类型作为 operator+
的返回值类型,并为这个虚拟类型重载另一个 operator=
,如 operator=(virtual type& v){&valarray=&v;v=NULL;}
(只是粗略的示意)。
当然,很难在valarray上实现这个想法。但是当你创建一个新类时,你可以尝试这个想法。然后,operator+
的效率几乎与operator+=
相同。