我是 C++ 编程的新手。我想评估把所有 MATLAB 软件迁移到 C++ 的好处。我在做一些有限元方面的工作,主要是非线性问题,因此需要大量执行的操作之一就是两个向量的叉积。我已经在 MATLAB 和 C++ 中分别测试了两种实现,C++ 似乎要快得多。但在 C++ 中,两种不同的实现给出了不同的计时结果。我使用的是英特尔 MKL。
以下是代码:
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <iostream>
#include <mkl.h>
// Cross product of two 3-component vectors: stores vg1 x vg2 in vgr.
// Declared here so main() can call it; the definition follows main().
void vprod( double vgr[3], double vg1[3], double vg2[3]);
// Benchmark driver. Times LC repetitions of a 3-vector cross product written
// out directly in the loop body, then LC repetitions of the same computation
// made through vprod(), printing the elapsed time and vr[0] after each run.
int main() {
double v1[3]={1.22, 2.65, 3.65}, v2[3]={6.98, 98.159, 54.65}, vr[3];
// LC is the number of repetitions in each timed loop.
int LC=1000000;
// NOTE(review): j and k are declared but never used.
int i,j,k;
// tinicial holds the dsecnd() timestamp taken before a loop; tiempo is the
// difference between the timestamp taken after the loop and tinicial.
double tiempo=0.0, tinicial;
//------------------------------------------------------------------------
std::cout << "INLINE METHOD: " << std::endl;
tinicial = dsecnd();
// v1 and v2 are never modified, so every iteration stores the same three
// values into vr.
for (i=0; i<LC; i++){
vr[0] = v1[1]*v2[2]-v1[2]*v2[1];
vr[1] =-(v1[0]*v2[2]-v1[2]*v2[0]);
vr[2] = v1[0]*v2[1]-v1[1]*v2[0];
};
tiempo = (dsecnd() - tinicial);
std::cout << "Tiempo Total: " << tiempo << std::endl;
std::cout << "Resultado: " << vr[0] << std::endl;
//------------------------------------------------------------------------
//------------------------------------------------------------------------
std::cout << "FUNCTION METHOD: " << std::endl;
tinicial = dsecnd();
// Same computation as above, but performed by calling vprod() each time.
for (i=0; i<LC; i++){
vprod (vr,v1,v2);
};
tiempo = (dsecnd() - tinicial);
std::cout << "Tiempo Total: " << tiempo << std::endl;
std::cout << "Resultado: " << vr[0] << std::endl;
//------------------------------------------------------------------------
// Extracts and discards one character from standard input before returning
// (presumably to keep a console window open until a key is pressed).
std::cin.ignore();
return 0;
}
// Computes the cross product vg1 x vg2 of two 3-component vectors and stores
// the result in vgr. The operands are named through references, so each
// component is still read at the point where it is used.
inline void vprod( double vgr[3], double vg1[3], double vg2[3]){
    const double &ax = vg1[0], &ay = vg1[1], &az = vg1[2];
    const double &bx = vg2[0], &by = vg2[1], &bz = vg2[2];

    vgr[0] =   ay*bz - az*by;
    vgr[1] = -(ax*bz - az*bx);
    vgr[2] =   ax*by - ay*bx;
}
我的问题是:为什么第一种实现比第二种快 3 倍?这是函数调用开销造成的吗?谢谢!
编辑:我修改了代码,以避免编译器利用常量向量在编译期“猜出”循环的结果。正如 @phonetagger 所示,结果大不相同。不使用 vprod 函数时耗时 28500 微秒,使用 vprod 函数时耗时 29000 微秒。这些数字是在 Ox 优化下得到的。只要使用了 inline 关键字,改变优化级别不会影响两者的对比,只是数值会略有增加。此外,如果不使用 inline 关键字(并且关闭优化),不使用 vprod 函数时耗时 32000 微秒,使用该函数时耗时 37000 微秒。因此函数调用开销大约是 5000 微秒。
新代码是:
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <iostream>
#include <mkl.h>
//#include <mkl_lapack.h>
// Cross product of the 3-component vectors that start at index ploc in vg1
// and vg2; the result is stored in vgr[ploc], vgr[ploc+1] and vgr[ploc+2].
// Declared here so main() can call it; the definition follows main().
void vprod( double *vgr, int ploc, double *vg1, double *vg2);
// Benchmark driver. Evaluates nv cross products over packed arrays of
// 3-component vectors, first with the formula written out in the loop body
// and then through vprod(), printing the elapsed time and vr[0] after each
// pass. Both passes run on freshly allocated, identically filled arrays.
int main() {
    int nv=1000000;          // number of 3-component vectors
    int dim=3*nv;            // number of doubles in each array
    double *v1, *v2, *vr;    // Declare Pointers
    int ploc, i;
    double tiempo=0.0, tinicial;

    v1 = new double [dim];   //Allocate block of memory
    v2 = new double [dim];
    vr = new double [dim];
    // Fill vectors with something
    for (i = 0; i < dim; i++) {
        v1[i] =1.25 + (double)(i+1);
        v2[i] =2.62+ 2*(double)(i+7);
    }
    //------------------------------------------------------------------------
    std::cout << "RUTINA CON CODIGO INLINE: \n" ;
    tinicial = dsecnd();
    ploc = 0; // ploc points to an intermediate location.
    for (i=0; i<nv; i++){
        vr[ploc] = v1[ploc+1]*v2[ploc+2]-v1[ploc+2]*v2[ploc+1];
        vr[ploc+1] =-(v1[ploc]*v2[ploc+2]-v1[ploc+2]*v2[ploc]);
        vr[ploc+2] = v1[ploc]*v2[ploc+1]-v1[ploc+1]*v2[ploc];
        ploc +=3;
    }
    tiempo = (dsecnd() - tinicial);
    std::cout << "Tiempo Total: " << tiempo << ".\n";
    std::cout << "Resultado: " << vr[0] << ".\n";

    // Storage obtained with new[] must be released with delete[], one array
    // per statement. (`delete v1,v2,vr;` is a comma expression: it applies
    // scalar delete to v1 only and leaves v2 and vr allocated.)
    delete[] v1;
    delete[] v2;
    delete[] vr;
    v1 = new double [dim];   //Allocate block of memory
    v2 = new double [dim];
    vr = new double [dim];
    // Refill the new input arrays with the same data as the first pass;
    // without this the second pass would read uninitialized doubles.
    for (i = 0; i < dim; i++) {
        v1[i] =1.25 + (double)(i+1);
        v2[i] =2.62+ 2*(double)(i+7);
    }
    //------------------------------------------------------------------------
    //------------------------------------------------------------------------
    std::cout << "RUTINA LLAMANDO A FUNCION: \n" ;
    ploc=0;
    tinicial = dsecnd();
    for (i=0; i<nv; i++){
        vprod ( vr, ploc, v1, v2);
        ploc +=3;
    }
    tiempo = (dsecnd() - tinicial);
    std::cout << "Tiempo Total: " << tiempo << ".\n";
    std::cout << "Resultado: " << vr[0] << ".\n";
    //------------------------------------------------------------------------
    // Release the second set of arrays before leaving.
    delete[] v1;
    delete[] v2;
    delete[] vr;
    // Extracts and discards one character from standard input before returning
    // (presumably to keep a console window open until a key is pressed).
    std::cin.ignore();
    return 0;
}
// Cross product of the 3-component vectors that start at index ploc in vg1
// and vg2. The three result components are written to vgr[ploc],
// vgr[ploc+1] and vgr[ploc+2].
inline void vprod( double *vgr, int ploc, double *vg1, double *vg2) {
    double       *out = vgr + ploc;   // destination triple
    const double *a   = vg1 + ploc;   // left operand triple
    const double *b   = vg2 + ploc;   // right operand triple

    out[0] =   a[1]*b[2] - a[2]*b[1];
    out[1] = -(a[0]*b[2] - a[2]*b[0]);
    out[2] =   a[0]*b[1] - a[1]*b[0];
}
答案 0 :(得分:3)
我不知道你用的是什么编译器(“MKL”是一个编译器套件吗?),但无论使用哪种编译器,优化级别都会对代码性能产生巨大影响,有时相差多个数量级,具体取决于你的编码风格,以及你是否试图“耍小技巧”来让代码跑得更快。通常(尽管并非总是如此)更好的做法是让编译器替你施展这些技巧,你只需专注于编写高效的算法,而不是自己去写各种技巧。
无论如何,我以各种方式在我的系统上运行你的代码,结果显示在下面的代码注释中......
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <iostream>
//#include <mkl.h>
// My standin for dsecnd() since I don't have "mkl.h"...
#include <sys/time.h>
// Stand-in for dsecnd() built on gettimeofday(): returns the current
// wall-clock time in MICROSECONDS. If gettimeofday() reports an error, a
// message is written to stderr and the process exits with status 1.
double dsecnd()
{
    struct timeval tv;
    if (gettimeofday(&tv,NULL))
    {
        fprintf(stderr,"\ngettimeofday() error\n\n");
        exit(1);
    }
    // Convert to double before scaling: tv_sec*1000000 evaluated in tv_sec's
    // integer type overflows when that type is only 32 bits wide.
    return static_cast<double>(tv.tv_sec)*1000000.0
         + static_cast<double>(tv.tv_usec); // ...returns MICROSECONDS
    //return tv.tv_sec + ((double)tv.tv_usec)/1000000; // ...returns SECONDS
}
//---------------------------------
// Uncomment one or both of these to test variations....
//#define USE_INLINE_KEYWORD
//#define DEFINE_vprod_AT_TOP
//
// Using g++ (GCC) 4.1.2 20080704 (Red Hat 4.1.2-52) on an x86 machine...
//
// microseconds microseconds
// "hardcoded inline" "via vprod() function"
// [i]=inlined, [-]=not
// ------------------ ----------------------
// inline keyword, at top
// no optimization 9501 17797 [-]
// optimization -O1 2 (see NOTE) 1 [i]
// optimization -O2 1 1 [i]
// optimization -O3 0 0 [i]
//
// no inline keyword, at top
// no optimization 9630 18203 [-]
// optimization -O1 1257 10681 [-]
// optimization -O2 1272 10694 [-]
// optimization -O3 0 1 [i]
//
// inline keyword, at bottom
// no optimization 9763 18333 [-]
// optimization -O1 1 0 [i]
// optimization -O2 2 1 [i]
// optimization -O3 0 0 [i]
//
// no inline keyword, at bottom
// no optimization 9900 18387 [-]
// optimization -O1 1289 10714 [-]
// optimization -O2 795 6740 [-]
// optimization -O3 1 0 [i]
//
// Note that in all cases, both results were reported as -213.458.
//
// NOTE: Especially since I'm using gettimeofday() instead of something
// that returns process (CPU) time, all results may include some
// time that the CPU spent processing other stuff, but even if
// that weren't the case (i.e. even if I used a function that
// returned only CPU time spent on this particular process), there
// would still be the quantization error of +/-1 microsecond on
// each end of the interval, meaning +/-2 microseconds overall.
//
/* My cut & paste "build & test script" to run on the Linux command prompt...
echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""
rm -f a.out; g++ so.cpp
echo ""; echo "No optimization:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O1 so.cpp
echo ""; echo "Optimization -O1:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O2 so.cpp
echo ""; echo "Optimization -O2:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O3 so.cpp
echo ""; echo "Optimization -O3:---------------"; objdump -d a.out | grep call | grep vprod; a.out
...if the "objdump -d a.out | grep call | grep vprod" command returns something
like "call 8048754 <_Z5vprodPdS_S_>", then I know that the call to vprod() is
NOT inlined, whereas if it returns nothing, I know the call WAS inlined. There
is only one caller of vprod(), so the results can't be confusing.
*/
//
//---------------------------------
#ifdef DEFINE_vprod_AT_TOP
#ifdef USE_INLINE_KEYWORD
inline
#endif
// Computes the cross product vg1 x vg2 of two 3-component vectors and stores
// the result in vgr. The operands are named through references, so each
// component is still read at the point where it is used.
// Alternative pointer-style signature:
//void vprod( double *vgr, double *vg1, double *vg2){
void vprod( double vgr[3], double vg1[3], double vg2[3]){
    const double &ax = vg1[0], &ay = vg1[1], &az = vg1[2];
    const double &bx = vg2[0], &by = vg2[1], &bz = vg2[2];

    vgr[0] =   ay*bz - az*by;
    vgr[1] = -(ax*bz - az*bx);
    vgr[2] =   ax*by - ay*bx;
}
#else
// Declare (prototype) the function only if NOT defining it at the top...
void vprod( double vgr[3], double vg1[3], double vg2[3]);
#endif
// Benchmark driver. Times LC repetitions of a 3-vector cross product written
// out directly in the loop body, then LC repetitions of the same computation
// made through vprod(), printing the elapsed time and vr[0] after each run.
int main() {
double v1[3]={1.22, 2.65, 3.65}, v2[3]={6.98, 98.159, 54.65}, vr[3];
// LC is the number of repetitions in each timed loop.
int LC=1000000L;
// NOTE(review): j and k are declared but never used.
int i,j,k;
// tinicial holds the dsecnd() timestamp taken before a loop; tiempo is the
// difference between the timestamp taken after the loop and tinicial.
double tiempo=0.0, tinicial;
//------------------------------------------------------------------------
std::cout << "INLINE METHOD: " << std::endl;
tinicial = dsecnd();
// v1 and v2 are never modified, so every iteration stores the same three
// values into vr.
for (i=0; i<LC; i++){
vr[0] = v1[1]*v2[2]-v1[2]*v2[1];
vr[1] =-(v1[0]*v2[2]-v1[2]*v2[0]);
vr[2] = v1[0]*v2[1]-v1[1]*v2[0];
};
tiempo = (dsecnd() - tinicial);
std::cout << "Tiempo Total: " << tiempo << std::endl;
std::cout << "Resultado: " << vr[0] << std::endl;
//------------------------------------------------------------------------
//------------------------------------------------------------------------
std::cout << "FUNCTION METHOD: " << std::endl;
tinicial = dsecnd();
// Same computation as above, but performed by calling vprod() each time.
for (i=0; i<LC; i++){
vprod (vr,v1,v2);
};
tiempo = (dsecnd() - tinicial);
std::cout << "Tiempo Total: " << tiempo << std::endl;
std::cout << "Resultado: " << vr[0] << std::endl;
//------------------------------------------------------------------------
// The wait-for-input call is disabled in this version.
// std::cin.ignore();
return 0;
}
#ifndef DEFINE_vprod_AT_TOP
#ifdef USE_INLINE_KEYWORD
inline
#endif
// Computes the cross product vg1 x vg2 of two 3-component vectors and stores
// the result in vgr. The operands are named through references, so each
// component is still read at the point where it is used.
// Alternative pointer-style signature:
//void vprod( double *vgr, double *vg1, double *vg2){
void vprod( double vgr[3], double vg1[3], double vg2[3]){
    const double &ax = vg1[0], &ay = vg1[1], &az = vg1[2];
    const double &bx = vg2[0], &by = vg2[1], &bz = vg2[2];

    vgr[0] =   ay*bz - az*by;
    vgr[1] = -(ax*bz - az*bx);
    vgr[2] =   ax*by - ay*bx;
}
#endif
编译器所用的优化技巧并不是随着优化级别的提高而线性地逐步出现的;不同的技巧在不同的优化级别才会启用,而且可能取决于你是否使用了 inline 关键字。除了函数内联之外,编译器还可能采用(我的结果也表明确实存在)其他类型的优化。有趣的是,据我所读到的,inline 关键字实际上只是对编译器的一个建议,表示你希望某个函数被内联;在开启优化时,它可能只是调整了某些阈值,用来决定是否内联一个本来就有可能被内联的函数。看起来,在关闭优化的情况下,即使使用了 inline 关键字,函数也从未被内联。还值得注意的是,vprod() 定义在 main() 之前还是之后,似乎对函数是否被内联没有影响。
答案 1 :(得分:1)
我把测试代码重新整理成三个独立的文件(一个头文件和两个源文件),并把计算及其循环拆到单独的函数中,以防止编译器做出过于聪明的优化。现在编译器无法再把循环优化成编译期计算了。以下是我的新结果。请注意,我在原来的 0 到 1000000 的循环外面又加了一层循环(0 到 50),然后再除以 50。这样做有两个原因:一是让我们可以把今天的数字与之前的数字进行比较,二是可以平均掉测试过程中因换页(swapping)而产生的不规则波动。这一点对你可能并不重要,因为我猜 dsecnd() 只报告其所在进程的 CPU 时间?
无论如何,这是我的新结果.......
(是的,“使用 inline 关键字、-O1 优化”比 -O2 或 -O3 更快这一奇怪结果是可以重复出现的,“不使用 inline 关键字、-O1 优化”的奇怪结果也是如此。我没有深入查看汇编代码来弄清原因。)
//========================================================================================
// File: so.h
// Repeats the cross product vg1 x vg2 -> vgr LC times with the formula
// written out directly inside the loop body.
void loop_inline( const int LC, double vgr[3], double vg1[3], double vg2[3]);
// Repeats the cross product vg1 x vg2 -> vgr LC times by calling vprod()
// on every iteration.
void loop_func( const int LC, double vgr[3], double vg1[3], double vg2[3]);
//---------------------------------
// Comment or uncomment to test both ways...
#define USE_INLINE_KEYWORD
//
// Using g++ (GCC) 4.1.2 20080704 (Red Hat 4.1.2-52) on an x86 machine...
//
// microseconds microseconds
// "hardcoded inline" "via vprod() function"
// [i]=inlined, [-]=not
// ------------------ ----------------------
// inline keyword
// no optimization 11734 14598 [-]
// optimization -O1 4617 4616 [i]
// optimization -O2 7754 7838 [i]
// optimization -O3 7777 7673 [i]
//
// no inline keyword
// no optimization 11807 14602 [-]
// optimization -O1 4651 7691 [-]
// optimization -O2 7755 7383 [-]
// optimization -O3 7921 7432 [-]
//
// Note that in all cases, both results were reported as -213.458.
//
/* My cut & paste "build & test script" to run on the Linux command prompt...
echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""; echo ""
rm -f a.out; g++ -c so.cpp so2.cpp; g++ so.o so2.o;
echo ""; echo "No optimization:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O1 -c so.cpp so2.cpp; g++ so.o so2.o;
echo ""; echo "Optimization -O1:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O2 -c so.cpp so2.cpp; g++ so.o so2.o;
echo ""; echo "Optimization -O2:---------------"; objdump -d a.out | grep call | grep vprod; a.out
rm -f a.out; g++ -O3 -c so.cpp so2.cpp; g++ so.o so2.o;
echo ""; echo "Optimization -O3:---------------"; objdump -d a.out | grep call | grep vprod; a.out
...if the "objdump -d a.out | grep call | grep vprod" command returns something
like "call 8048754 <_Z5vprodPdS_S_>", then I know that the call to vprod() is
NOT inlined, whereas if it returns nothing, I know the call WAS inlined.
*/
//========================================================================================
// File: so.cpp
// Sorry so messy, I didn't bother to clean up the #includes.......
#include <stdint.h>
#include <inttypes.h>
#include <stddef.h> // for NULL
#include <stdlib.h> // for exit()
#include <stdio.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <iostream>
//#include <mkl.h>
#include "so.h"
// My standin for dsecnd() since I don't have "mkl.h"...
#include <sys/time.h>
// Stand-in for dsecnd() built on gettimeofday(): returns the current
// wall-clock time in MICROSECONDS. If gettimeofday() reports an error, a
// message is written to stderr and the process exits with status 1.
double dsecnd()
{
    struct timeval tv;
    if (gettimeofday(&tv,NULL))
    {
        fprintf(stderr,"\ngettimeofday() error\n\n");
        exit(1);
    }
    // Convert to double before scaling: tv_sec*1000000 evaluated in tv_sec's
    // integer type overflows when that type is only 32 bits wide.
    return static_cast<double>(tv.tv_sec)*1000000.0
         + static_cast<double>(tv.tv_usec); // ...returns MICROSECONDS
    //return tv.tv_sec + ((double)tv.tv_usec)/1000000; // ...returns SECONDS
}
//---------------------------------
#ifndef USE_INLINE_KEYWORD
// We're NOT using the 'inline' keyword, so define vprod() in this
// file so it can't possibly be inlined where it's called (in the
// other source file).
// Computes the cross product vg1 x vg2 of two 3-component vectors and stores
// the result in vgr. The operands are named through references, so each
// component is still read at the point where it is used.
// Alternative pointer-style signature:
//void vprod( double *vgr, double *vg1, double *vg2){
void vprod( double vgr[3], double vg1[3], double vg2[3]){
    const double &ax = vg1[0], &ay = vg1[1], &az = vg1[2];
    const double &bx = vg2[0], &by = vg2[1], &bz = vg2[2];

    vgr[0] =   ay*bz - az*by;
    vgr[1] = -(ax*bz - az*bx);
    vgr[2] =   ax*by - ay*bx;
}
#endif
// Benchmark driver. Calls loop_inline() N times and then loop_func() N times,
// each with LC repetitions per call, and prints the elapsed time divided by N
// together with vr[0] after each series.
int main() {
double v1[3]={1.22, 2.65, 3.65}, v2[3]={6.98, 98.159, 54.65}, vr[3];
// LC is the repetition count handed to each loop_* call.
int LC=1000000L;
// N is the number of loop_* calls per measurement; the elapsed time is
// divided by N below.
int i, N=100;
// tinicial holds the dsecnd() timestamp taken before a series of calls.
double tiempo=0.0, tinicial;
//------------------------------------------------------------------------
std::cout << "INLINE METHOD: " << std::endl;
tinicial = dsecnd();
for (i=0; i<N; ++i)
loop_inline(LC,vr,v1,v2);
// Average elapsed time per loop_inline() call.
tiempo = (dsecnd() - tinicial)/N;
std::cout << "Tiempo Total: " << tiempo << std::endl;
std::cout << "Resultado: " << vr[0] << std::endl;
//------------------------------------------------------------------------
//------------------------------------------------------------------------
std::cout << "FUNCTION METHOD: " << std::endl;
tinicial = dsecnd();
for (i=0; i<N; ++i)
loop_func(LC,vr,v1,v2);
// Average elapsed time per loop_func() call.
tiempo = (dsecnd() - tinicial)/N;
std::cout << "Tiempo Total: " << tiempo << std::endl;
std::cout << "Resultado: " << vr[0] << std::endl;
//------------------------------------------------------------------------
// The wait-for-input call is disabled in this version.
// std::cin.ignore();
return 0;
}
//========================================================================================
// File: so2.cpp
#include "so.h"
#ifdef USE_INLINE_KEYWORD
// Computes the cross product vg1 x vg2 of two 3-component vectors and stores
// the result in vgr. The operands are named through references, so each
// component is still read at the point where it is used.
// Alternative pointer-style signature:
//void vprod( double *vgr, double *vg1, double *vg2){
inline void vprod( double vgr[3], double vg1[3], double vg2[3]){
    const double &ax = vg1[0], &ay = vg1[1], &az = vg1[2];
    const double &bx = vg2[0], &by = vg2[1], &bz = vg2[2];

    vgr[0] =   ay*bz - az*by;
    vgr[1] = -(ax*bz - az*bx);
    vgr[2] =   ax*by - ay*bx;
}
#else
// Not using 'inline' keyword, so just declare (prototype) the
// function here and define it in the other source file (so it
// can't possibly be inlined).
void vprod( double vgr[3], double vg1[3], double vg2[3]);
#endif
// Evaluates the cross product vg1 x vg2 into vgr LC times, with the formula
// written out directly in the loop body. Nothing is written when LC <= 0.
void loop_inline( const int LC, double vgr[3], double vg1[3], double vg2[3]){
    // References only name the operands; each component is read when used.
    const double &ax = vg1[0], &ay = vg1[1], &az = vg1[2];
    const double &bx = vg2[0], &by = vg2[1], &bz = vg2[2];

    int pass = 0;
    while (pass < LC) {
        vgr[0] =   ay*bz - az*by;
        vgr[1] = -(ax*bz - az*bx);
        vgr[2] =   ax*by - ay*bx;
        ++pass;
    }
}
// Calls vprod(vgr, vg1, vg2) LC times; makes no call when LC <= 0.
void loop_func( const int LC, double vgr[3], double vg1[3], double vg2[3]){
    int pass = 0;
    while (pass < LC) {
        vprod(vgr, vg1, vg2);
        ++pass;
    }
}