我很好奇为什么以下代码在编译为x64目标时运行速度明显变慢。
#include <iostream>
#include <cstdio>
#include <chrono>
#include <random>
#include <memory>
#include <climits>
#include "Timer.h"
/// 32-bit pixel that can be accessed either as one packed word (`c`) or as
/// four 8-bit channels (`b`, `g`, `r`, `x`) that overlay the same storage.
///
/// The channel view uses an anonymous struct inside an anonymous union, which
/// is a compiler extension, and the class is used by writing one view and
/// reading the other. How the channels map onto the bits of `c` depends on the
/// target's byte order.
class Color
{
public:
    union
    {
        unsigned int c;         // whole pixel as a single word
        struct
        {
            unsigned char b;
            unsigned char g;
            unsigned char r;
            unsigned char x;
        };
    };
public:
    /// Creates a pixel whose packed word is zero, so a default-constructed
    /// Color never holds an indeterminate value.
    Color()
        :
        c( 0u )
    {}
    /// Creates a pixel from a packed word. Intentionally implicit so that
    /// unsigned values convert to Color.
    Color( unsigned int c )
        :
        c( c )
    {}
    /// Creates a pixel from three channels; `x` is set to 0.
    Color( unsigned char r,unsigned char g,unsigned char b )
        :
        Color( 0u,r,g,b )
    {}
    /// Creates a pixel from all four channels.
    Color( unsigned char x,unsigned char r,unsigned char g,unsigned char b )
        :
        // Listed in declaration order (b, g, r, x), which is the order in
        // which members are actually initialized.
        b( b ),g( g ),r( r ),x( x )
    {}
    /// Overwrites the whole pixel with a packed word and returns *this.
    Color& operator =( unsigned int c )
    {
        this->c = c;
        return *this;
    }
    /// Returns the pixel as a packed word.
    operator unsigned int() const
    {
        return c;
    }
};
// Fills the first width * height pixels of buf with random 32-bit values.
// The generator is a function-local static seeded once from
// std::random_device on the first call, so successive calls continue the
// same random sequence.
void RandomizeBuffer( Color* buf,unsigned int width,unsigned int height )
{
    static std::random_device rd;
    static std::mt19937 rng( rd() );
    static std::uniform_int_distribution<unsigned int> dist( 0u,UINT_MAX );

    const unsigned int pixelCount = width * height;
    Color* const end = buf + pixelCount;
    for( Color* pixel = buf; pixel != end; ++pixel )
    {
        // Goes through Color::operator =( unsigned int ).
        *pixel = dist( rng );
    }
}
void ReduceBuffer( const Color* bufIn,Color* bufOut,unsigned int width,unsigned int height )
{
for( unsigned int y = 0u; y < height / 4u; y++ )
{
for( unsigned int x = 0u; x < width / 4u; x++ )
{
const Color p0 = bufIn[( y * 4u ) * width + x * 4u];
const Color p1 = bufIn[( y * 4u ) * width + x * 4u + 1u];
const Color p2 = bufIn[( y * 4u ) * width + x * 4u + 2u];
const Color p3 = bufIn[( y * 4u ) * width + x * 4u + 3u];
const Color p4 = bufIn[( y * 4u + 1u ) * width + x * 4u];
const Color p5 = bufIn[( y * 4u + 1u ) * width + x * 4u + 1u];
const Color p6 = bufIn[( y * 4u + 1u ) * width + x * 4u + 2u];
const Color p7 = bufIn[( y * 4u + 1u ) * width + x * 4u + 3u];
const Color p8 = bufIn[( y * 4u + 2u ) * width + x * 4u];
const Color p9 = bufIn[( y * 4u + 2u ) * width + x * 4u + 1u];
const Color p10 = bufIn[( y * 4u + 2u ) * width + x * 4u + 2u];
const Color p11 = bufIn[( y * 4u + 2u ) * width + x * 4u + 3u];
const Color p12 = bufIn[( y * 4u + 3u ) * width + x * 4u];
const Color p13 = bufIn[( y * 4u + 3u ) * width + x * 4u + 1u];
const Color p14 = bufIn[( y * 4u + 3u ) * width + x * 4u + 2u];
const Color p15 = bufIn[( y * 4u + 3u ) * width + x * 4u + 3u];
bufOut[x + y * ( width / 4u )] = Color {
static_cast<unsigned char>(
( p0.r * p0.x + p1.r * p1.x + p2.r * p2.x + p3.r * p3.x +
p4.r * p4.x + p5.r * p5.x + p6.r * p6.x + p7.r * p7.x +
p8.r * p8.x + p9.r * p9.x + p10.r * p10.x + p11.r * p11.x +
p12.r * p12.x + p13.r * p13.x + p14.r * p14.x + p15.r * p15.x )
/ ( 16u * 255u ) ),
static_cast<unsigned char>(
( p0.g * p0.x + p1.g * p1.x + p2.g * p2.x + p3.g * p3.x +
p4.g * p4.x + p5.g * p5.x + p6.g * p6.x + p7.g * p7.x +
p8.g * p8.x + p9.g * p9.x + p10.g * p10.x + p11.g * p11.x +
p12.g * p12.x + p13.g * p13.x + p14.g * p14.x + p15.g * p15.x )
/ ( 16u * 255u ) ),
static_cast<unsigned char>(
( p0.b * p0.x + p1.b * p1.x + p2.b * p2.x + p3.b * p3.x +
p4.b * p4.x + p5.b * p5.x + p6.b * p6.x + p7.b * p7.x +
p8.b * p8.x + p9.b * p9.x + p10.b * p10.x + p11.b * p11.x +
p12.b * p12.x + p13.b * p13.x + p14.b * p14.x + p15.b * p15.x )
/ ( 16u * 255u ) )
};
}
}
}
int main()
{
const unsigned int width = 800u;
const unsigned int height = 600u;
std::unique_ptr<Color[]> bufFull = std::make_unique<Color[]>( width * height );
std::unique_ptr<Color[]> bufReduce = std::make_unique<Color[]>( (width / 4u) * (height / 4u) );
Timer t;
for( int i = 0; i < 20; i++ )
{
RandomizeBuffer( bufFull.get(),width,height );
t.StartWatch();
ReduceBuffer( bufFull.get(),bufReduce.get(),width,height );
t.StopWatch();
std::cout << t.GetTimeMilli() << std::endl;
}
std::cout << "Press enter to continue..." << std::endl;
std::getchar();
}
上面的代码在编译为32位可执行文件时,单次调用ReduceBuffer()大约耗时0.76 ms;在编译为64位可执行文件时,大约耗时1.26 ms。
编译器是VS2013,优化设置为“最大化速度”并“偏好快速代码”。程序在运行Win7-x64的Haswell处理器上执行。
Timer类是QueryPerformanceCounter的简单包装器,之所以使用它,是因为VS2013对high_resolution_clock的实现分辨率不足。
欢迎任何关于出现这种差异的原因的见解,也欢迎提供其他编译器下的测试结果。