为什么这个普通数组实现比 std::vector 实现性能慢?
在分析我正在研究的一些结果时,我决定编写一个简化的测试来比较 std::vector
与普通数组的效率。
我有一个结构,我以两种方式实现,
// Segment whose samples are stored inline in a fixed-size plain array.
struct a_segment_t {
    uint16_t index;     // position of the segment in the segment list
    uint16_t nvals;     // number of leading entries of vals that are in use
    uint16_t vals[50];  // inline sample storage, capacity 50
    double   mean;      // result slot written by the averaging operation
};
// Segment whose samples are stored in a dynamically sized std::vector.
struct b_segment_t {
    uint16_t index;              // position of the segment in the segment list
    uint16_t nvals;              // divisor used when the mean is computed
    std::vector<uint16_t> vals;  // sample storage
    uint32_t mean;               // result slot written by the averaging operation
};
我并不关心在内存中创建此对象的开销(所以我不介意使用 push_back()),
我要分析的是对象已经在内存中之后、用它进行操作时的效率。vals
中填充了一些随机数据。
操作遍历存储在每个段中的val,在这种情况下是简单的平均计算。测试如下:
using namespace std;
#include <stdint.h>
#include <stdlib.h> // srand, rand
#include <time.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <array>
#define NSEGMENTS 100
#define MAX_NPXS 50
#define N 10000
// plain array approach
// Plain-array flavour of a segment: samples are stored inline in a C array
// with room for MAX_NPXS entries.
struct a_segment_t {
    uint16_t index;           // position of the segment in the segment list
    uint16_t nvals;           // number of leading entries of vals that are in use
    uint16_t vals[MAX_NPXS];  // inline sample storage
    double   mean;            // result slot written by operation()
};

// Prototypes of the plain-array overloads defined further down.
uint16_t operation(uint16_t nsegments, a_segment_t* p_segments);
uint16_t print(uint16_t nsegments, a_segment_t* p_segments);
// stl vector approach
// Vector flavour of a segment: samples are stored in a std::vector.
struct b_segment_t {
    uint16_t index;              // position of the segment in the segment list
    uint16_t nvals;              // divisor used by operation() for the mean
    std::vector<uint16_t> vals;  // sample storage
    uint32_t mean;               // result slot written by operation()
};

// Prototypes of the vector overloads defined further down.
uint16_t operation(uint16_t nsegments, std::vector<b_segment_t>* p_segments);
uint16_t print(uint16_t nsegments, std::vector<b_segment_t>* p_segments);

// Writes the difference t2 - t1 into dt.
void delta_time(struct timespec* t1, struct timespec* t2, struct timespec* dt);
uint16_t operation(uint16_t nsegments, a_segment_t* p_segments) {
// the operation (plain array approach)
uint64_t sum;
for( uint16_t nsegment = 0; nsegment < nsegments; ++nsegment ) {
sum = 0;
for(uint16_t nval = 0; nval < p_segments[nsegment].nvals; ++nval){
sum = sum + p_segments[nsegment].vals[nval];
}
p_segments[nsegment].mean = sum/p_segments[nsegment].nvals;
}
return nsegments;
}
uint16_t print(uint16_t nsegments, a_segment_t* p_segments) {
// print data (plain array approach)
for( uint16_t nsegment = 0; nsegment < nsegments; ++nsegment ) {
cout << "index : " << setfill('0') << setw(3) << p_segments[nsegment].index;
cout << "\tnval : " << setfill('0') << setw(3) << p_segments[nsegment].nvals;
cout << "\tvals : [";
for(uint16_t nval = 0; nval < p_segments[nsegment].nvals; ++nval){
cout << p_segments[nsegment].vals[nval] << ",";
}
cout << "\b]" << endl;
}
return nsegments;
}
uint16_t operation(uint16_t nsegments, vector<b_segment_t>* p_segments) {
// the operation (stl vector approach)
uint32_t sum;
for (vector<b_segment_t>::iterator p_segment = p_segments->begin(); p_segment<p_segments->end(); ++p_segment) {
sum = 0;
for (vector<uint16_t>::iterator p_val = (p_segment->vals).begin(); p_val<(p_segment->vals).end(); ++p_val) {
sum = sum + (*p_val);
}
p_segment->mean = sum/(p_segment->nvals);
}
return nsegments;
}
uint16_t print(uint16_t nsegments, vector<b_segment_t>* p_segments) {
// print data (stl vector approach)
for (vector<b_segment_t>::iterator p_segment = p_segments->begin(); p_segment<p_segments->end(); ++p_segment) {
cout << "index : " << setfill('0') << setw(3) << p_segment->index;
cout << "\tnval : " << setfill('0') << setw(3) << p_segment->nvals;
cout << "\tvals : [";
for (vector<uint16_t>::iterator p_val = (p_segment->vals).begin(); p_val<(p_segment->vals).end(); ++p_val) {
cout << *p_val << ",";
}
cout << "\b]" << endl;
}
return nsegments;
}
// Stores t2 - t1 in *dt.
// When the nanosecond difference is negative, one second is borrowed so that
// dt->tv_nsec ends up in [0, 1e9) for normalized inputs.
void delta_time(struct timespec* t1, struct timespec* t2, struct timespec* dt) {
    const bool borrow = (t2->tv_nsec - t1->tv_nsec) < 0;
    const long nsec_diff = t2->tv_nsec - t1->tv_nsec;
    dt->tv_sec = t2->tv_sec - t1->tv_sec - (borrow ? 1 : 0);
    dt->tv_nsec = borrow ? nsec_diff + 1000000000 : nsec_diff;
}
int main(int argc, char const *argv[]) {
uint16_t nsegments = NSEGMENTS;
uint16_t nsegment = 0;
uint16_t i = 0;
//create an populate the segments with dummy data (plain array approach)
a_segment_t* a_segments = new a_segment_t[nsegments];
for( nsegment = 0; nsegment < nsegments; ++nsegment ) {
a_segments[nsegment].index = nsegment;
srand(nsegment);
a_segments[nsegment].nvals = rand() % MAX_NPXS + 1;
for(uint16_t nval = 0; nval < a_segments[nsegment].nvals; ++nval){
a_segments[nsegment].vals[nval] = nval;
}
}
//create an populate the segments with dummy data (stl vector approach)
nsegment = 0;
vector<b_segment_t> b_segments(nsegments);
for (vector<b_segment_t>::iterator p_segment = b_segments.begin(); p_segment<b_segments.end(); ++p_segment) {
p_segment->index = nsegment;
srand(nsegment);
p_segment->nvals = rand() % MAX_NPXS + 1;
for(uint16_t nval = 0; nval < p_segment->nvals; ++nval){
p_segment->vals.push_back(nval);
}
nsegment++;
}
// print(nsegments, a_segments);
// cout << "===================================" << endl;
// print(nsegments, &b_segments);
// cout << "===================================" << endl;
// ======================= plain array timing measure ========================
struct timespec a_times[N];
for(i = 0; i < N; i++) {
nsegments = operation(nsegments, a_segments);
clock_gettime(CLOCK_REALTIME, &(a_times[i]));
}
// ===========================================================================
// ========================= vector timing measure ===========================
struct timespec b_times[N];
for(i = 0; i < N; i++) {
nsegments = operation(nsegments, &b_segments);
clock_gettime(CLOCK_REALTIME, &(b_times[i]));
}
// ===========================================================================
// =========================== timing console log ============================
struct timespec a_deltatime[N], a_elapsedtime[N], b_deltatime[N], b_elapsedtime[N];
cout << "\t\t plain array\t\t stl vector" << endl;
cout << "frame #\telapsedtime\tdeltatime\telapsedtime\tdeltatime" << endl;
for(i = 0; i < N-1; i=i+1000) {
delta_time(&(a_times[0]), &(a_times[i]), &(a_elapsedtime[i]));
delta_time(&(a_times[i]), &(a_times[i+1]), &(a_deltatime[i]));
delta_time(&(b_times[0]), &(b_times[i]), &(b_elapsedtime[i]));
delta_time(&(b_times[i]), &(b_times[i+1]), &(b_deltatime[i]));
cout << i << ",\t"
<< a_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << a_elapsedtime[i].tv_nsec << ",\t"
<< a_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << a_deltatime[i].tv_nsec << ",\t"
<< b_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << b_elapsedtime[i].tv_nsec << ",\t"
<< b_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << b_deltatime[i].tv_nsec << endl;
}
// ===========================================================================
}
online version。注意:所有测试都是使用 -O3 编译的。
有人可以指出为什么普通数组实现比 std::vector
实现慢吗?
普通数组实现不应该更快吗?
我该怎么做才能提高普通数组实现的速度?
答案 0 :(得分:1)
如果您根据迭代器表示算法,编译器将在优化代码方面做得更好。其中一个原因是它可以对数组索引的大小和溢出特性做出假设(它转换为带有机器码中偏移量的索引寻址)。
将 operation() 和 print() 重构为基于迭代器(可以是指针)的形式,
可以得到预期的结果:
#include <stdint.h>
#include <stdlib.h> // srand, rand
#include <time.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <array>
#include <numeric>
using namespace std;
#define NSEGMENTS 100
#define MAX_NPXS 50
#define N 10000
// plain array approach
// Plain-array flavour of a segment: samples are stored inline in a C array
// with room for MAX_NPXS entries.
struct a_segment_t {
    uint16_t index;           // position of the segment in the segment list
    uint16_t nvals;           // number of leading entries of vals that are in use
    uint16_t vals[MAX_NPXS];  // inline sample storage
    double   mean;            // result slot written by operation()
};
// stl vector approach
// Vector flavour of a segment: samples are stored in a std::vector.
struct b_segment_t {
    uint16_t index;              // position of the segment in the segment list
    uint16_t nvals;              // number of leading entries of vals that are used
    std::vector<uint16_t> vals;  // sample storage
    uint32_t mean;               // result slot written by operation()
};

// Writes the difference t2 - t1 into dt.
void delta_time(struct timespec* t1, struct timespec* t2, struct timespec* dt);
// Computes the mean of every segment in [first, last).
//
// Works for any iterator (including a raw pointer) whose element exposes
// `nvals`, `vals` and `mean`; `vals` may be a built-in array or a container
// with random-access iterators. For each segment the first `nvals` entries of
// `vals` are summed in a 64-bit accumulator and the integer quotient
// sum / nvals is stored in `mean`. A segment with `nvals == 0` gets a mean of
// 0 instead of dividing by zero.
//
// Returns the number of segments in the range, narrowed to uint16_t.
template<class Iter>
uint16_t operation(Iter first, Iter last) {
    const auto result = static_cast<uint16_t>(std::distance(first, last));
    for( ; first != last ; ++first ) {
        if (first->nvals == 0) {
            first->mean = 0;
            continue;
        }
        const auto vals_begin = std::begin(first->vals);
        // std::accumulate adds with operator+ by default, so no function
        // object (and no <functional>) is needed.
        const uint64_t sum = std::accumulate(vals_begin, vals_begin + first->nvals, uint64_t(0));
        first->mean = sum / first->nvals;
    }
    return result;
}
// Dumps every segment in [first, last) to cout, one line per segment.
//
// Works for any iterator (including a raw pointer) whose element exposes
// `index`, `nvals` and `vals`. Each line shows the zero-padded index and nvals
// (width 3) followed by the first `nvals` entries of `vals`, separated by
// commas; the trailing "\b]" emits a backspace before the closing bracket.
//
// Returns the number of segments in the range, narrowed to uint16_t.
template<class Iter>
uint16_t print(Iter first, Iter last) {
    const auto result = static_cast<uint16_t>(std::distance(first, last));
    for( ; first != last ; ++first ) {
        cout << "index : " << setfill('0') << setw(3) << first->index;
        cout << "\tnval : " << setfill('0') << setw(3) << first->nvals;
        cout << "\tvals : [";
        // A plain iterator loop avoids depending on <algorithm>, which this
        // file never includes.
        auto val = std::begin(first->vals);
        const auto vals_end = val + first->nvals;
        for( ; val != vals_end ; ++val ) {
            cout << *val << ",";
        }
        cout << "\b]" << endl;
    }
    return result;
}
// Stores t2 - t1 in *dt.
// When the nanosecond difference is negative, one second is borrowed so that
// dt->tv_nsec ends up in [0, 1e9) for normalized inputs.
void delta_time(struct timespec* t1, struct timespec* t2, struct timespec* dt) {
    const bool borrow = (t2->tv_nsec - t1->tv_nsec) < 0;
    const long nsec_diff = t2->tv_nsec - t1->tv_nsec;
    dt->tv_sec = t2->tv_sec - t1->tv_sec - (borrow ? 1 : 0);
    dt->tv_nsec = borrow ? nsec_diff + 1000000000 : nsec_diff;
}
int main(int argc, char const *argv[]) {
uint16_t nsegments = NSEGMENTS;
uint16_t nsegment = 0;
uint16_t i = 0;
//create an populate the segments with dummy data (plain array approach)
a_segment_t* a_segments = new a_segment_t[nsegments];
for( nsegment = 0; nsegment < nsegments; ++nsegment ) {
a_segments[nsegment].index = nsegment;
srand(nsegment);
a_segments[nsegment].nvals = rand() % MAX_NPXS + 1;
for(uint16_t nval = 0; nval < a_segments[nsegment].nvals; ++nval){
a_segments[nsegment].vals[nval] = nval;
}
}
//create an populate the segments with dummy data (stl vector approach)
nsegment = 0;
vector<b_segment_t> b_segments(nsegments);
for (vector<b_segment_t>::iterator p_segment = b_segments.begin(); p_segment<b_segments.end(); ++p_segment) {
p_segment->index = nsegment;
srand(nsegment);
p_segment->nvals = rand() % MAX_NPXS + 1;
for(uint16_t nval = 0; nval < p_segment->nvals; ++nval){
p_segment->vals.push_back(nval);
}
nsegment++;
}
// print(a_segments, a_segments + nsegments);
// cout << "===================================" << endl;
// print(b_segments.begin(), b_segments.end());
// cout << "===================================" << endl;
// ======================= plain array timing measure ========================
struct timespec a_times[N];
for(i = 0; i < N; i++) {
nsegments = operation(a_segments, a_segments + nsegments);
clock_gettime(CLOCK_REALTIME, &(a_times[i]));
}
// ===========================================================================
// ========================= vector timing measure ===========================
struct timespec b_times[N];
for(i = 0; i < N; i++) {
nsegments = operation(b_segments.begin(), b_segments.begin() + nsegments);
clock_gettime(CLOCK_REALTIME, &(b_times[i]));
}
// ===========================================================================
// =========================== timing console log ============================
struct timespec a_deltatime[N], a_elapsedtime[N], b_deltatime[N], b_elapsedtime[N];
cout << "\t\t plain array\t\t stl vector" << endl;
cout << "frame #\telapsedtime\tdeltatime\telapsedtime\tdeltatime" << endl;
for(i = 0; i < N-1; i=i+1000) {
delta_time(&(a_times[0]), &(a_times[i]), &(a_elapsedtime[i]));
delta_time(&(a_times[i]), &(a_times[i+1]), &(a_deltatime[i]));
delta_time(&(b_times[0]), &(b_times[i]), &(b_elapsedtime[i]));
delta_time(&(b_times[i]), &(b_times[i+1]), &(b_deltatime[i]));
cout << i << ",\t"
<< a_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << a_elapsedtime[i].tv_nsec << ",\t"
<< a_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << a_deltatime[i].tv_nsec << ",\t"
<< b_elapsedtime[i].tv_sec << "." << setfill('0') << setw(9) << b_elapsedtime[i].tv_nsec << ",\t"
<< b_deltatime[i].tv_sec << "." << setfill('0') << setw(9) << b_deltatime[i].tv_nsec << endl;
}
// ===========================================================================
}
答案 1 :(得分:1)
这两个版本实际上并不相同。
首先,您的“数组版本”中 mean 的类型为 double,而“STL版本”中 mean
的类型为 uint32_t。要使两个函数大致等效,mean 的计算必须相同。
其次,您的“数组版本”使用数组下标,而STL版本增加和解引用迭代器。由于编译器/优化器需要在数组版本中允许更多关注点(例如指针别名),因此可能无法优化性能。
尝试将数组版本转换为类似下面基于指针的 operation() 的形式(代码见后文):
除非我在改写成这种形式时出了错(我没有测试过),这将给出相同的结果,并且至少让编译器有机会对“数组版本”应用与“STL版本”相同类型的性能优化。
这种事情是(有几个)C ++标准算法与迭代器一起工作的原因之一,而不是像
// Pointer-based form of the plain-array operation.
// Walks the first `nsegments` entries of `p_segments` with a pointer, sums the
// first `nvals` samples of each segment through a second pointer and stores
// the integer quotient sum / nvals in `mean`. Returns `nsegments` unchanged.
uint16_t operation(uint16_t nsegments, a_segment_t* p_segments)
{
    for(a_segment_t *pseg = p_segments, *eseg = p_segments + nsegments; pseg < eseg; ++pseg)
    {
        uint64_t sum = 0;
        for(uint16_t *val = pseg->vals, *eval = pseg->vals + pseg->nvals; val < eval; ++val)
        {
            sum = sum + (*val);
        }
        // Write through the loop pointer `pseg`.
        pseg->mean = sum/(pseg->nvals);
    }
    return nsegments;
}
这样的容器上的数组索引。编译器更有可能优化性能。请注意,指针是一种迭代器。