I tried to replace std::multiset with std::priority_queue, but I was disappointed with the speed results: the run time of the algorithm increased by about 50%...
Here are the corresponding operations:
top() = begin();
pop() = erase(knn.begin());
push() = insert();
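In other words, each multiset call is swapped for its priority_queue counterpart. As a rough self-contained illustration of that mapping (using plain ints instead of the kd-tree types below):

#include <queue>
#include <set>
#include <iostream>

int main() {
    std::multiset<int> ms{5, 1, 9};
    std::priority_queue<int> pq;
    for (int v : {5, 1, 9}) pq.push(v);   // push() takes the role of insert()

    // top() takes the role of *begin(); note that with the default
    // comparators they pick opposite ends (multiset begin() is the smallest
    // element, priority_queue top() is the largest), so the comparator has
    // to be chosen consistently in both versions.
    std::cout << *ms.begin() << " " << pq.top() << "\n";   // prints "1 9"

    ms.erase(ms.begin());                 // pop() takes the role of erase(begin())
    pq.pop();
    return 0;
}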
I am surprised by the speed of the priority_queue implementation; I expected the opposite result (PQ faster)...
Average of ten runs, MSVS 2010, Win XP, 32 bit, method findAllKNN2() (see below):

MS (multiset):
N           time [s]
100 000     0.5
1 000 000   8

PQ (priority_queue):
N           time [s]
100 000     0.8
1 000 000   12
What could cause these results? No other changes were made to the source code... Thanks for your help.
MS (multiset) implementation:
template <typename Point>
struct TKDNodePriority
{
    KDNode <Point> *node;
    typename Point::Type priority;

    TKDNodePriority() : node ( NULL ), priority ( 0 ) {}
    TKDNodePriority ( KDNode <Point> *node_, typename Point::Type priority_ ) : node ( node_ ), priority ( priority_ ) {}

    bool operator < ( const TKDNodePriority <Point> &n1 ) const
    {
        return priority > n1.priority;
    }
};

template <typename Point>
struct TNNeighboursList
{
    typedef std::multiset < TKDNodePriority <Point> > Type;
};
Method:
template <typename Point>
template <typename Point2>
void KDTree2D <Point>::findAllKNN2 ( const Point2 * point, typename TNNeighboursList <Point>::Type & knn, unsigned int k, KDNode <Point> *node, const unsigned int depth ) const
{
    if ( node == NULL )
    {
        return;
    }

    if ( point->getCoordinate ( depth % 2 ) <= node->getData()->getCoordinate ( depth % 2 ) )
    {
        findAllKNN2 ( point, knn, k, node->getLeft(), depth + 1 );
    }
    else
    {
        findAllKNN2 ( point, knn, k, node->getRight(), depth + 1 );
    }

    typename Point::Type dist_q_node = ( node->getData()->getX() - point->getX() ) * ( node->getData()->getX() - point->getX() ) +
                                       ( node->getData()->getY() - point->getY() ) * ( node->getData()->getY() - point->getY() );

    if ( knn.size() == k )
    {
        if ( dist_q_node < knn.begin()->priority )
        {
            knn.erase ( knn.begin() );
            knn.insert ( TKDNodePriority <Point> ( node, dist_q_node ) );
        }
    }
    else
    {
        knn.insert ( TKDNodePriority <Point> ( node, dist_q_node ) );
    }

    typename Point::Type dist_q_node_straight = ( point->getCoordinate ( node->getDepth() % 2 ) - node->getData()->getCoordinate ( node->getDepth() % 2 ) ) *
                                                ( point->getCoordinate ( node->getDepth() % 2 ) - node->getData()->getCoordinate ( node->getDepth() % 2 ) );

    typename Point::Type top_priority = knn.begin()->priority;
    if ( knn.size() < k || dist_q_node_straight < top_priority )
    {
        if ( point->getCoordinate ( node->getDepth() % 2 ) < node->getData()->getCoordinate ( node->getDepth() % 2 ) )
        {
            findAllKNN2 ( point, knn, k, node->getRight(), depth + 1 );
        }
        else
        {
            findAllKNN2 ( point, knn, k, node->getLeft(), depth + 1 );
        }
    }
}
PQ (priority_queue) implementation (slower, why?):
template <typename Point>
struct TKDNodePriority
{
    KDNode <Point> *node;
    typename Point::Type priority;

    TKDNodePriority() : node ( NULL ), priority ( 0 ) {}
    TKDNodePriority ( KDNode <Point> *node_, typename Point::Type priority_ ) : node ( node_ ), priority ( priority_ ) {}

    bool operator < ( const TKDNodePriority <Point> &n1 ) const
    {
        return priority > n1.priority;
    }
};

template <typename Point>
struct TNNeighboursList
{
    typedef std::priority_queue< TKDNodePriority <Point> > Type;
};
Method:
template <typename Point>
template <typename Point2>
void KDTree2D <Point>::findAllKNN2 ( const Point2 * point, typename TNNeighboursList <Point>::Type & knn, unsigned int k, KDNode <Point> *node, const unsigned int depth ) const
{
    if ( node == NULL )
    {
        return;
    }

    if ( point->getCoordinate ( depth % 2 ) <= node->getData()->getCoordinate ( depth % 2 ) )
    {
        findAllKNN2 ( point, knn, k, node->getLeft(), depth + 1 );
    }
    else
    {
        findAllKNN2 ( point, knn, k, node->getRight(), depth + 1 );
    }

    typename Point::Type dist_q_node = ( node->getData()->getX() - point->getX() ) * ( node->getData()->getX() - point->getX() ) +
                                       ( node->getData()->getY() - point->getY() ) * ( node->getData()->getY() - point->getY() );

    if ( knn.size() == k )
    {
        if ( dist_q_node < knn.top().priority )
        {
            knn.pop();
            knn.push ( TKDNodePriority <Point> ( node, dist_q_node ) );
        }
    }
    else
    {
        knn.push ( TKDNodePriority <Point> ( node, dist_q_node ) );
    }

    typename Point::Type dist_q_node_straight = ( point->getCoordinate ( node->getDepth() % 2 ) - node->getData()->getCoordinate ( node->getDepth() % 2 ) ) *
                                                ( point->getCoordinate ( node->getDepth() % 2 ) - node->getData()->getCoordinate ( node->getDepth() % 2 ) );

    typename Point::Type top_priority = knn.top().priority;
    if ( knn.size() < k || dist_q_node_straight < top_priority )
    {
        if ( point->getCoordinate ( node->getDepth() % 2 ) < node->getData()->getCoordinate ( node->getDepth() % 2 ) )
        {
            findAllKNN2 ( point, knn, k, node->getRight(), depth + 1 );
        }
        else
        {
            findAllKNN2 ( point, knn, k, node->getLeft(), depth + 1 );
        }
    }
}
Answer 0 (score: 5)
First of all, the author did not provide a minimal code example that reproduces the performance drop. Secondly, the question was asked 8 years ago, and I'm sure compilers have improved performance dramatically since then.
I made a benchmark example in which I take the top element of the queue and push it back with another priority (simulating a push of a new element without actually creating one), running a loop of kRunsCount iterations over the kNodesCount elements of the array. I am comparing priority_queue with multiset and multimap; I decided to add multimap for a more thorough comparison. This simple test is very close to the author's use case, and I also tried to reproduce the structures he used in his code samples.
#include <set>
#include <type_traits>
#include <vector>
#include <chrono>
#include <queue>
#include <map>
#include <iostream>
#include <cstdlib>   // std::rand
#include <cstdint>   // std::uint64_t

template<typename T>
struct Point {
    static_assert(std::is_integral<T>::value || std::is_floating_point<T>::value, "Incompatible type");
    using Type = T;
    T x;
    T y;
};

template<typename T>
struct Node {
    using Type = T;
    Node<T> * left;
    Node<T> * right;
    T data;
};

template <typename T>
struct NodePriority {
    using Type = T;
    using DataType = typename T::Type;
    Node<T> * node = nullptr;
    DataType priority = static_cast<DataType>(0);

    bool operator < (const NodePriority<T> & n1) const noexcept {
        return priority > n1.priority;
    }
    bool operator > (const NodePriority<T> & n1) const noexcept {
        return priority < n1.priority;
    }
};

// descending order by default
template <typename T>
using PriorityQueueList = std::priority_queue<T>;

// greater used because of ascending order by default
template <typename T>
using MultisetList = std::multiset<T, std::greater<T>>;

// greater used because of ascending order by default
template <typename T>
using MultimapList = std::multimap<typename T::DataType, T, std::greater<typename T::DataType>>;

struct Inner {
    template<template <typename> class C, typename T>
    static void Operate(C<T> & list, std::size_t priority);

    template<typename T>
    static void Operate(PriorityQueueList<T> & list, std::size_t priority) {
        if (list.size() % 2 == 0) {
            auto el = std::move(list.top());
            el.priority = priority;
            list.push(std::move(el));
        }
        else {
            list.pop();
        }
    }

    template<typename T>
    static void Operate(MultisetList<T> & list, std::size_t priority) {
        if (list.size() % 2 == 0) {
            auto el = std::move(*list.begin());
            el.priority = priority;
            list.insert(std::move(el));
        }
        else {
            list.erase(list.begin());
        }
    }

    template<typename T>
    static void Operate(MultimapList<T> & list, std::size_t priority) {
        if (list.size() % 2 == 0) {
            auto el = std::move(*list.begin());
            auto & elFirst = const_cast<int&>(el.first);
            elFirst = priority;
            el.second.priority = priority;
            list.insert(std::move(el));
        }
        else {
            list.erase(list.begin());
        }
    }
};

template<typename T>
void doOperationOnPriorityList(T & list) {
    for (std::size_t pos = 0, len = list.size(); pos < len; ++pos) {
        // move top element and update priority
        auto priority = std::rand() % 10;
        Inner::Operate(list, priority);
    }
}

template<typename T>
void measureOperationTime(T & list, std::size_t runsCount) {
    std::chrono::system_clock::time_point t1, t2;
    std::uint64_t totalTime(0);
    for (std::size_t i = 0; i < runsCount; ++i) {
        t1 = std::chrono::system_clock::now();
        doOperationOnPriorityList(list);
        t2 = std::chrono::system_clock::now();
        auto castedTime = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
        std::cout << "Run " << i << " time: " << castedTime << "\n";
        totalTime += castedTime;
    }
    std::cout << "Average time is: " << totalTime / runsCount << " ms" << std::endl;
}

int main() {
    // consts
    const int kNodesCount = 10'000'000;
    const int kRunsCount = 10;

    // prepare data
    PriorityQueueList<NodePriority<Point<int>>> neighboursList1;
    MultisetList<NodePriority<Point<int>>> neighboursList2;
    MultimapList<NodePriority<Point<int>>> neighboursList3;
    std::vector<Node<Point<int>>> nodes;
    nodes.reserve(kNodesCount);
    for (auto i = 0; i < kNodesCount; ++i) {
        nodes.emplace_back(decltype(nodes)::value_type{ nullptr, nullptr, { 0,0 } });
        auto priority = std::rand() % 10;
        neighboursList1.emplace(decltype(neighboursList1)::value_type{ &nodes.back(), priority });
        neighboursList2.emplace(decltype(neighboursList2)::value_type{ &nodes.back(), priority });
        neighboursList3.emplace(decltype(neighboursList3)::value_type{ priority, { &nodes.back(), priority } });
    }

    // do operation on data
    std::cout << "\nPriority queue\n";
    measureOperationTime(neighboursList1, kRunsCount);
    std::cout << "\nMultiset\n";
    measureOperationTime(neighboursList2, kRunsCount);
    std::cout << "\nMultimap\n";
    measureOperationTime(neighboursList3, kRunsCount);
    return 0;
}
I built it in release mode with VS v15.8.9 using /Ox. Here are the results for kNodesCount = 10,000,000 items over 10 runs:
Priority queue
Run 0 time: 764
Run 1 time: 933
Run 2 time: 920
Run 3 time: 813
Run 4 time: 991
Run 5 time: 862
Run 6 time: 902
Run 7 time: 1277
Run 8 time: 774
Run 9 time: 771
Average time is: 900 ms
Multiset
Run 0 time: 2235
Run 1 time: 1811
Run 2 time: 1755
Run 3 time: 1535
Run 4 time: 1475
Run 5 time: 1388
Run 6 time: 1482
Run 7 time: 1431
Run 8 time: 1347
Run 9 time: 1347
Average time is: 1580 ms
Multimap
Run 0 time: 2197
Run 1 time: 1885
Run 2 time: 1725
Run 3 time: 1671
Run 4 time: 1500
Run 5 time: 1403
Run 6 time: 1411
Run 7 time: 1420
Run 8 time: 1409
Run 9 time: 1362
Average time is: 1598 ms
Well, as you can see, multiset performs about the same as multimap, while priority_queue is the fastest (about 43% faster). So why does that happen?
Let's start with priority_queue. The C++ standard does not tell us how one container or another has to be implemented, but in most cases priority_queue is built on a binary heap (look at the msvc and gcc implementations)! With a priority_queue you have no access to any element other than the top: you cannot iterate over the elements, fetch one by index, or even get the last one (which leaves some room for optimization). Insertion into a binary heap is O(1) on average and O(log n) only in the worst case, and removal is O(log n), because we take an element from the bottom and then search for the next highest priority.
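A rough standalone illustration of those heap operations:

#include <queue>
#include <iostream>

int main() {
    // std::priority_queue is an adapter over a random-access container
    // (std::vector by default) maintained as a binary max-heap.
    std::priority_queue<int> heap;

    heap.push(3);    // sift-up: O(log n) worst case, O(1) on average
    heap.push(7);
    heap.push(1);

    std::cout << heap.top() << "\n";  // largest element, O(1), prints 7
    heap.pop();                       // sift-down: O(log n)
    std::cout << heap.top() << "\n";  // prints 3
    return 0;
}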
What about multimap and multiset? Both are usually implemented on a red-black binary tree (look at the msvc and gcc implementations), where the average insertion is O(log n) and removal is O(log n) as well.
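And the same top/pop/insert pattern emulated on a multiset, where every operation walks (and possibly rebalances) the tree:

#include <set>
#include <functional>
#include <iostream>

int main() {
    // std::multiset stores each element in its own node of a red-black tree.
    // With std::greater the largest element sorts first, so *begin()
    // plays the role of priority_queue::top().
    std::multiset<int, std::greater<int>> tree;

    tree.insert(3);   // O(log n): descend the tree, allocate a node, rebalance
    tree.insert(7);
    tree.insert(1);

    std::cout << *tree.begin() << "\n";  // largest element, prints 7
    tree.erase(tree.begin());            // unlink and free the node, may rebalance
    std::cout << *tree.begin() << "\n";  // prints 3
    return 0;
}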
From this point of view, priority_queue can never be slower than multiset or multimap. So, coming back to your question, a multiset used as a priority queue is not faster than priority_queue itself. There may be many reasons for your result, including a primitive priority_queue implementation in the old compiler or misuse of the structure (the question does not contain a minimal working example); besides, the author did not mention the compilation flags or the compiler version, and sometimes missing optimizations make a significant difference.
Update 1, at the request of @noɥʇʎԀʎzɐɹƆ
Unfortunately I don't have access to a Linux environment right now, but I have mingw-w64 installed, version info: g++.exe (x86_64-posix-seh, built by the Strawberryperl.com project) 8.3.0. The processor is the same as for Visual Studio: Intel(R) Core(TM) i7-8550U CPU @ 1.80GHz, 2001 MHz, 4 cores, 8 logical processors.
So the results for g++ -O2 are:
Priority queue
Run 0 time: 775
Run 1 time: 995
Run 2 time: 901
Run 3 time: 807
Run 4 time: 930
Run 5 time: 765
Run 6 time: 799
Run 7 time: 1151
Run 8 time: 760
Run 9 time: 780
Average time is: 866 ms
Multiset
Run 0 time: 2280
Run 1 time: 1942
Run 2 time: 1607
Run 3 time: 1344
Run 4 time: 1319
Run 5 time: 1210
Run 6 time: 1129
Run 7 time: 1156
Run 8 time: 1244
Run 9 time: 992
Average time is: 1422 ms
Multimap
Run 0 time: 2530
Run 1 time: 1958
Run 2 time: 1670
Run 3 time: 1390
Run 4 time: 1391
Run 5 time: 1235
Run 6 time: 1088
Run 7 time: 1198
Run 8 time: 1071
Run 9 time: 963
Average time is: 1449 ms
You may notice that the picture is almost the same as with msvc.
Update 2, thanks to @JorgeBellon
Here is a link to an online quick-bench.com benchmark, check it out yourself! Additions to my post are welcome, cheers!
Answer 1 (score: 2)
Your compiler's optimization settings seem to have a large impact on performance. In the code below, the multiset easily beats the vector-based and deque-based priority_queue without optimization. With '-O3' optimization, however, the vector-based priority queue beats everything. These experiments were run on Linux with GCC, so perhaps you would get different results on Windows. I believe that enabling optimization may remove much of the error-checking behaviour in the STL vector.
Without optimization:

pq-w-vector: 79.2997ms
pq-w-deque: 362.366ms
pq-w-multiset: 34.649ms

With -O2 optimization:

pq-w-vector: 8.88154ms
pq-w-deque: 17.5233ms
pq-w-multiset: 12.5539ms

With -O3 optimization:

pq-w-vector: 7.92462ms
pq-w-deque: 16.8028ms
pq-w-multiset: 12.3208ms
Test harness (don't forget to link with -lrt):
#include <iostream>
#include <queue>
#include <deque>
#include <vector>
#include <set>
#include <ctime>
#include <cstdlib>
#include <unistd.h>

using namespace std;

template <typename T>
double run_test(T& pq, int size, int iterations)
{
    struct timespec start, end;
    for(int i = 0; i < size; ++i)
        pq.push(rand());
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
    for(int i = 0; i < iterations; ++i)
    {
        if(rand()%2)
            pq.pop();
        else
            pq.push(rand());
    }
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
    end.tv_sec -= start.tv_sec;
    end.tv_nsec -= start.tv_nsec;
    if (end.tv_nsec < 0)
    {
        --end.tv_sec;
        end.tv_nsec += 1000000000ULL;
    }
    return (end.tv_sec*1e3 + end.tv_nsec/1e6);
}

template <class T>
class multiset_pq: public multiset<T>
{
public:
    multiset_pq(): multiset<T>() {};
    void push(T elm) { this->insert(elm); }
    void pop() { if(!this->empty()) this->erase(this->begin()); }
    const T& top() { return *this->begin(); }
};

int main(void)
{
    const int size = 5000;
    const int iterations = 100000;

    priority_queue<int, vector<int> > pqv;
    priority_queue<int, deque<int> > pqd;
    multiset_pq<int> pqms;

    srand(time(0));

    cout<<"pq-w-vector: "<<run_test(pqv, size, iterations)<<"ms"<<endl;
    cout<<"pq-w-deque: "<<run_test(pqd, size, iterations)<<"ms"<<endl;
    cout<<"pq-w-multiset: "<<run_test(pqms, size, iterations)<<"ms"<<endl;

    return 0;
}
Answer 2 (score: 0)
This non-synthetic benchmark is derived from real-world usage of priority_queue. Run the benchmark feeding this file to standard input.
// TOPOSORT 2
// This short function computes the lexicographically smallest toposort.
// priority_queue vs multiset benchmark
#include <vector>
#include <queue>
#include <deque>
#include <set>
#include <unordered_set>
#include <cstdio>
#include <ctime>
#include <chrono>
#include <iostream>
// https://stackoverflow.com/a/13772771/1459669
#ifdef _WIN32
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

using namespace std;

constexpr int MAXN = 100001;

struct Tail {
    int observation, number;
};

typedef vector<vector<Tail>> AdjList;

int N, M;

void computeNumIncomingEdges(int observationID, AdjList adjacency_list,
                             int *numIncomingEdges) {
    for (int node = 0; node <= N; ++node) {
        numIncomingEdges[node] = 0;
    }
    for (int node = 1; node <= N; ++node) {
        for (Tail tail : adjacency_list[node]) {
            if (tail.observation <= observationID) {
                numIncomingEdges[tail.number]++;
            }
        }
    }
}

template<class T>
vector<int> toposort2_PQ(int observationID, AdjList adjacency_list) {
    vector<int> sortedElements;
    priority_queue<int, T, std::greater<int>> S;
    static int numIncomingEdges[MAXN];
    computeNumIncomingEdges(observationID, adjacency_list, numIncomingEdges);
    for (int node = 1; node <= N; ++node) {
        if (numIncomingEdges[node] == 0)
            S.push(node);
    }
    while (!S.empty()) {
        auto n = S.top();
        S.pop();
        sortedElements.push_back(n);
        for (int _ = adjacency_list[n].size() - 1; _ >= 0; --_) {
            Tail m = adjacency_list[n][_];
            if (m.observation <= observationID) {
                adjacency_list[n].pop_back();
                numIncomingEdges[m.number]--;
                if (numIncomingEdges[m.number] == 0)
                    S.push(m.number);
            }
        }
    }
    bool graphStillHasEdges = false;
    for (int node = 1; node <= N; ++node)
        if (numIncomingEdges[node] > 0) {
            graphStillHasEdges = true;
            break;
        }
    return sortedElements;
}

vector<int> toposort2_multiset(int observationID, AdjList adjacency_list) {
    vector<int> sortedElements;
    multiset<int, std::greater<int>> S;
    static int numIncomingEdges[MAXN];
    computeNumIncomingEdges(observationID, adjacency_list, numIncomingEdges);
    for (int node = 1; node <= N; ++node) {
        if (numIncomingEdges[node] == 0)
            S.insert(node);
    }
    while (!S.empty()) {
        int n = *S.begin();
        S.erase(S.begin());
        sortedElements.push_back(n);
        for (int _ = adjacency_list[n].size() - 1; _ >= 0; --_) {
            Tail m = adjacency_list[n][_];
            if (m.observation <= observationID) {
                adjacency_list[n].pop_back();
                numIncomingEdges[m.number]--;
                if (numIncomingEdges[m.number] == 0)
                    S.insert(m.number);
            }
        }
    }
    bool graphStillHasEdges = false;
    for (int node = 1; node <= N; ++node)
        if (numIncomingEdges[node] > 0) {
            graphStillHasEdges = true;
            break;
        }
    return sortedElements;
}

int main() {
    scanf("%d %d", &N, &M);
    AdjList adjacency_list(MAXN);
    for (int observation = 0; observation < M; ++observation) {
        int observationSize;
        scanf("%d", &observationSize);
        int head;
        scanf("%d", &head);
        for (int i = 0; i < observationSize - 1; ++i) {
            int tail;
            scanf("%d", &tail);
            Tail to_insert;
            to_insert.observation = observation;
            to_insert.number = tail;
            adjacency_list[head].push_back(to_insert);
            head = tail;
        }
    }
    for (int i = 0; i < 5; ++i) {
        auto start_pq = std::chrono::high_resolution_clock::now();
        toposort2_PQ<vector<int>>(3182, adjacency_list);
        auto end_pq = std::chrono::high_resolution_clock::now();
        auto start_pq_dq = std::chrono::high_resolution_clock::now();
        toposort2_PQ<deque<int>>(3182, adjacency_list);
        auto end_pq_dq = std::chrono::high_resolution_clock::now();
        auto start_ms = std::chrono::high_resolution_clock::now();
        toposort2_multiset(3182, adjacency_list);
        auto end_ms = std::chrono::high_resolution_clock::now();
        std::cout << std::chrono::duration_cast<std::chrono::microseconds>(end_pq-start_pq).count() << ' '
                  << std::chrono::duration_cast<std::chrono::microseconds>(end_pq_dq-start_pq_dq).count() << ' '
                  << std::chrono::duration_cast<std::chrono::microseconds>(end_ms-start_ms).count() << endl;
    }
}
Using clang++ with -O2 gives me:
31622 37891 54884
27092 33919 54878
27324 35870 51427
27961 35348 53170
26746 34753 54191
In summary, the priority_queue with a vector always wins, the priority_queue with a deque comes second, and the multiset comes last.
Answer 3 (score: -1)
As I understand it, the implementation of priority_queue is the culprit. priority_queue is implemented (underneath) as a specialized vector or deque, because priority_queue needs random-access iterators. When you pop or push an item on a priority_queue, the remaining items in the queue need to be copied around to fill the gap, and the same happens on insertion. multi_set, by contrast, is based on keys.
The way multi_set stores its elements, it

"...has the important property that inserting a new element into a multi_set does not invalidate iterators that point to existing elements. Erasing an element from a multi_set also does not invalidate any iterators, except, of course, for iterators that actually point to the element that is being erased."

- quoted from the SGI documentation.
This means that multi_set's storage is not linear, hence its better performance.
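For illustration, a minimal sketch of the difference: std::priority_queue is a container adapter over a random-access container, while multi_set is node-based, so its iterators stay valid across insertions:

#include <queue>
#include <set>
#include <vector>
#include <iostream>

int main() {
    // Adapter over a contiguous buffer (std::vector by default);
    // elements are rearranged inside that buffer on every push/pop.
    std::priority_queue<int, std::vector<int>> pq;
    pq.push(42);

    // Node-based container: inserting further elements does not
    // invalidate iterators to existing elements.
    std::multiset<int> ms{42};
    auto it = ms.begin();        // points at the node holding 42
    ms.insert(7);
    ms.insert(100);
    std::cout << *it << "\n";    // still valid, prints 42
    return 0;
}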