我正在尝试并行化自己设计的FIR滤波器代码,为此选用了TBB的parallel_reduce。同一份代码在Windows上运行需要约15秒,而在Linux上只需要约2.5秒。在Windows上,我使用VS 2010编译并启用了英特尔TBB库;在Linux上,我在终端中用g++编译并链接TBB库。两次运行使用的是同一个处理器,为什么不同的操作系统会产生这样的差异?
我使用的代码是:
#include <fstream>
#include <iostream>
#include <math.h>
#include <vector>

#include "tbb/blocked_range.h"
#include "tbb/compat/thread"
#include "tbb/parallel_for.h"
#include "tbb/parallel_reduce.h"
#include "tbb/task_scheduler_init.h"
#include "tbb/tick_count.h"
using namespace tbb;
using namespace std;
// Approximation of pi used when generating the test sine wave in main().
#define pi 3.141593
// Number of FIR filter taps; also the declared length of the coeffs table.
#define FILTER_LEN 265
// FIR filter tap weights, FILTER_LEN (265) entries, indexed by tap number k.
// The table appears to be symmetric about its centre entry (index 132,
// value 0.0904453420042807): the first and last entries are equal, and so on
// inwards. NOTE(review): the filter's design parameters (pass band, window)
// are not stated anywhere in this file.
double coeffs[ FILTER_LEN ] =
{
0.0033473431384214393,0.000032074683390218124,0.0033131082058404943,0.0024777666109278788,
-0.0008968429179843104,-0.0031973449396977684,-0.003430943381749411,-0.0029796565504781646,
-0.002770673157048994,-0.0022783059845596586,-0.0008531818129514857,0.001115432556294998,
0.0026079871108133294,0.003012423848769931,0.002461420635709332,0.0014154004589753215,
0.00025190669718400967,-0.0007608257014963959,-0.0013703600874774068,-0.0014133823230551277,
-0.0009759556503342884,-0.00039687498737139273,-0.00007527524701314324,-0.00024181463305012626,
-0.0008521761947454302,-0.00162618205097997,-0.002170446498273018,-0.002129903305507943,
-0.001333859049002249,0.00010700092934983156,0.0018039564602637683,0.0032107930896349583,
0.0038325849735515363,0.003416201274366522,0.002060848732332109,0.00017954815260431595,
-0.0016358832300944531,-0.0028402136847527387,-0.0031256650498727384,-0.0025374271571154713,
-0.001438370315670195,-0.00035115295209013755,0.0002606730012030533,0.0001969569787142967,
-0.00039635535951198597,-0.0010886127490608972,-0.0013530057243606405,-0.0008123200399262436,
0.0005730271959526784,0.0024419465938120906,0.004133717273258681,0.0049402122577746265,
0.0043879285604252714,0.002449549610687005,-0.00040283102645093463,-0.003337730734820209,
-0.0054508346511294775,-0.006093057767824609,-0.005117609782189977,-0.0029293645861970417,
-0.0003251033117661085,0.0018074390555649442,0.0028351284091668164,0.002623563404428517,
0.0015692864792199496,0.0004127664681096788,-0.00009249878881824428,0.0004690173244168184,
0.001964334172374759,0.0037256715492873485,0.004809640399145206,0.004395274594482053,
0.0021650921193604,-0.0014888595443799124,-0.005534807968511709,-0.008642334104607624,
-0.009668950651149259,-0.008104732391434574,-0.004299972815463919,0.0006184612821881392,
0.005136551428636121,0.007907786753766152,0.008241212326068366,0.00634786595941524,
0.003235610213062744,0.00028882736660937287,-0.001320994685952108,-0.0011237433853145615,
0.00044213409507615003,0.0022057106517524255,0.00277593527678719,0.0011909915058737617,
-0.0025807757230413447,-0.007497632882437637,-0.011739520895818884,-0.013377018279057393,
-0.011166543231844196,-0.005133056165990026,0.0032948631959114935,0.011673660427968408,
0.017376415708412904,0.018548938130314566,0.014811760899506572,0.007450782505155853,
-0.001019540069785369,-0.007805775815783898,-0.010898333714715424,-0.00985364043415772,
-0.005988406030111452,-0.001818560524968024,0.000028552677472614846,-0.0019938756495376363,
-0.007477684025727061,-0.013989430449615033,-0.017870518868849213,-0.015639422062597726,
-0.005624959109456065,0.010993528170353541,0.03001263681283932,0.04527492462846608,
0.050581340787164114,0.041949186532860346,0.019360612460662185,-0.012644336735920483,
-0.0458782599058412,-0.07073838953156347,-0.0791205623455818,-0.06709535677423759,
-0.03644544574795176,0.005505370370858695,0.04780486657828151,0.07898800597378192,
0.0904453420042807,0.07898800597378192,0.04780486657828151,0.005505370370858695,
-0.03644544574795176,-0.06709535677423759,-0.0791205623455818,-0.07073838953156347,
-0.0458782599058412,-0.012644336735920483,0.019360612460662185,0.041949186532860346,
0.050581340787164114,0.04527492462846608,0.03001263681283932,0.010993528170353541,
-0.005624959109456065,-0.015639422062597726,-0.017870518868849213,-0.013989430449615033,
-0.007477684025727061,-0.0019938756495376363,0.000028552677472614846,-0.001818560524968024,
-0.005988406030111452,-0.00985364043415772,-0.010898333714715424,-0.007805775815783898,
-0.001019540069785369,0.007450782505155853,0.014811760899506572,0.018548938130314566,
0.017376415708412904,0.011673660427968408,0.0032948631959114935,-0.005133056165990026,
-0.011166543231844196,-0.013377018279057393,-0.011739520895818884,-0.007497632882437637,
-0.0025807757230413447,0.0011909915058737617,0.00277593527678719,0.0022057106517524255,
0.00044213409507615003,-0.0011237433853145615,-0.001320994685952108,0.00028882736660937287,
0.003235610213062744,0.00634786595941524,0.008241212326068366,0.007907786753766152,
0.005136551428636121,0.0006184612821881392,-0.004299972815463919,-0.008104732391434574,
-0.009668950651149259,-0.008642334104607624,-0.005534807968511709,-0.0014888595443799124,
0.0021650921193604,0.004395274594482053,0.004809640399145206,0.0037256715492873485,
0.001964334172374759,0.0004690173244168184,-0.00009249878881824428,0.0004127664681096788,
0.0015692864792199496,0.002623563404428517,0.0028351284091668164,0.0018074390555649442,
-0.0003251033117661085,-0.0029293645861970417,-0.005117609782189977,-0.006093057767824609,
-0.0054508346511294775,-0.003337730734820209,-0.00040283102645093463,0.002449549610687005,
0.0043879285604252714,0.0049402122577746265,0.004133717273258681,0.0024419465938120906,
0.0005730271959526784,-0.0008123200399262436,-0.0013530057243606405,-0.0010886127490608972,
-0.00039635535951198597,0.0001969569787142967,0.0002606730012030533,-0.00035115295209013755,
-0.001438370315670195,-0.0025374271571154713,-0.0031256650498727384,-0.0028402136847527387,
-0.0016358832300944531,0.00017954815260431595,0.002060848732332109,0.003416201274366522,
0.0038325849735515363,0.0032107930896349583,0.0018039564602637683,0.00010700092934983156,
-0.001333859049002249,-0.002129903305507943,-0.002170446498273018,-0.00162618205097997,
-0.0008521761947454302,-0.00024181463305012626,-0.00007527524701314324,-0.00039687498737139273,
-0.0009759556503342884,-0.0014133823230551277,-0.0013703600874774068,-0.0007608257014963959,
0.00025190669718400967,0.0014154004589753215,0.002461420635709332,0.003012423848769931,
0.0026079871108133294,0.001115432556294998,-0.0008531818129514857,-0.0022783059845596586,
-0.002770673157048994,-0.0029796565504781646,-0.003430943381749411,-0.0031973449396977684,
-0.0008968429179843104,0.0024777666109278788,0.0033131082058404943,0.000032074683390218124,
0.0033473431384214393
};
/**
 * Reduction body for tbb::parallel_reduce that accumulates the partial dot
 * product of the FIR coefficients with a window of the input signal.
 *
 * For a sub-range [r.begin(), r.end()) of tap indices k it adds
 *     coeffs[k] * a[(FILTER_LEN - 1 + count) - k]
 * to `sum`. `count` is a static shared by every instance; the caller must set
 * it before starting a reduction and must not change it while one is running.
 *
 * The caller is responsible for starting each reduction with `sum == 0`
 * (for example by constructing a fresh body), otherwise the previous result
 * is carried over into the new one.
 */
class SumFoo
{
    double* my_a;           // input sample buffer; not owned by this object

public:
    double sum;             // partial result accumulated by this body
    static int count;       // offset of the current output sample (set by the caller)
    int ip, nip;            // scratch: window end index / last sample index touched

    /** Accumulates the contribution of taps [r.begin(), r.end()) into `sum`. */
    void operator()(const blocked_range<size_t>& r)
    {
        const double* a = my_a;

        // Index of the newest input sample that contributes to this output.
        ip = FILTER_LEN - 1 + SumFoo::count;

        // Work on locals inside the loop so the compiler can keep the running
        // total and index in registers instead of storing to members on every
        // iteration; the members are written back once afterwards.
        double acc = sum;
        int last = nip;
        for (size_t k = r.begin(); k != r.end(); ++k)
        {
            last = static_cast<int>(ip - k);
            acc += coeffs[k] * a[last];
        }
        nip = last;
        sum = acc;
    }

    /** Splitting constructor required by parallel_reduce: same input, empty sum. */
    SumFoo(SumFoo& x, split) : my_a(x.my_a), sum(0), ip(0), nip(0)
    {
    }

    /** Merges the partial sum of another body into this one. */
    void join(const SumFoo& y)
    {
        sum += y.sum;
    }

    /** Creates a body reading samples from `a`, with an empty sum. */
    SumFoo(double a[]) : my_a(a), sum(0), ip(0), nip(0)
    {
    }
};
void ParallelSumFoo(double *a, size_t n ,ofstream &o)
{
SumFoo sf(a);
for(int j=264;j<150264;j++)
{
SumFoo::count=j-264;
parallel_reduce(blocked_range<size_t>(0,265), sf,auto_partitioner() );
o<<j<<","<<sf.sum<<endl;
}
}
// Out-of-class definition of the static member shared by every SumFoo body;
// ParallelSumFoo assigns it before each parallel_reduce call.
int SumFoo::count=0;
/**
 * Generates a 400 Hz sine wave (5000 samples per unit of i / 5000.0), writes
 * it to "400hzreduce.csv", filters it with ParallelSumFoo (which appends its
 * rows to the same file) and prints the elapsed wall-clock time.
 *
 * Note that the measured interval includes all file output, not only the
 * parallel reduction.
 *
 * @return 0 on success, 1 if the output file cannot be opened.
 */
int main()
{
    const int kFirstSample = 264;      // FILTER_LEN - 1 leading zeros of history
    const int kSampleCount = 150264;   // total buffer length

    ofstream o("400hzreduce.csv");
    if (!o)
    {
        std::cerr << "Cannot open 400hzreduce.csv for writing" << '\n';
        return 1;
    }

    // Zero-initialised and released automatically (the raw new[] was leaked).
    std::vector<double> buffer(kSampleCount, 0.0);

    tick_count t0=tick_count::now();
    for (int i = kFirstSample; i < kSampleCount; i++)
    {
        buffer[i] = sin(400 * (2 * pi) * (i / 5000.0));
        // '\n' rather than endl avoids flushing the file on every row.
        o << i << "," << buffer[i] << '\n';
    }
    cout<<fixed;
    ParallelSumFoo(&buffer[0], kSampleCount, o);
    tick_count t1=tick_count::now();
    double t9=(t1-t0).seconds();
    cout<<"Time Taken for parallel execution is \t"<<t9<<"seconds"<<endl;
    return 0;
}
请帮我找出问题出在哪里。
答案 0 :(得分:2)
你在两个操作系统上使用了相近的编译器优化选项吗?对gcc来说,开启-O3与不开启任何优化相比,就足以产生这种程度的差异。Visual Studio的具体选项我不太确定,但你应该可以在GUI的项目设置中找到它们。
不使用parallel_reduce时,两个系统上的运行时间分别是多少?这样可以先排除一层复杂性。
您是否尝试过对代码进行性能分析?在Linux上,我建议使用valgrind --tool=callgrind进行分析,
并用kcachegrind查看结果。这应该有助于缩小大家回答的范围。
答案 1 :(得分:0)
这段代码会把数据写入文件,而这对执行时间有很大影响。在Linux和Windows上写文件所花的时间不同,这才是耗时不同的原因;TBB本身并没有造成差异。