我想使用二叉树(实际上是VP Tree)来实现我的搜索算法。我需要先使用输入点构造树,然后对每个输入点执行一次查询。
在单线程版本中,瓶颈在搜索过程中,但经过一些优化后,搜索过程可以在多核平台上获得满意的加速,因此瓶颈变为构造部分。
在构造过程中,我需要先将所有点分成两部分,然后递归地构建左子树和右子树。进行分区时,我以原地(in-place)方式重新排列输入数组,因此我怀疑可能存在伪共享(false sharing)。
我尝试使用 C++11 的 std::thread 来加速构造。也就是说,使用 std::atomic<int> 作为线程计数器:当需要构建两棵子树、并且当前线程总数小于我设置的最大线程数时,就为左子树另外创建一个线程。分支部分见下文的构造代码。
但是,上述方法在多核平台上的扩展性很差。例如,在我的80核、160线程的平台上,搜索(search)部分获得了100倍的加速,而构造部分只获得了4倍的加速,导致整个程序的加速比降到大约30倍。
所以,一般来说,我可以利用什么技术来高效地构建二叉树?
以下是构造代码的简化版本:
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <limits>
#include <memory>
#include <random>
#include <stdexcept>
#include <system_error>
#include <thread>
#include <vector>
// File-wide shorthands for the timing and threading facilities used below.
using namespace std::chrono;
using std::thread;

// Pseudo-random engine with the fixed seed 0, so every run draws the same
// sequence. Standard engines are not thread-safe: do not call this object
// from several threads at once without external synchronization.
std::default_random_engine g{0};

// Uniform distribution of doubles over [0, 100).
std::uniform_real_distribution<double> distr{0.0, 100.0};
/// Vantage-point tree over one-dimensional points, built in place on a
/// caller-owned array of doubles.
///
/// Construction recursively picks a random vantage point, partitions the
/// remaining points of the range around the median distance to it and builds
/// the near ("left") and far ("right") halves as subtrees. Up to `maxThread`
/// threads (counting the constructing thread) may take part in the build.
///
/// The class is neither copyable nor movable (it holds a std::atomic member).
class Tree
{
public:
    /// One tree node. All nodes live in a single array owned by the Tree; the
    /// subtree built from `k` points occupies `k` consecutive array slots.
    struct Node
    {
        int idx;          // position in the point array that holds this node's vantage point
        double threshold; // distance from the vantage point to the median element of its partition
        Node* left;       // points no farther than `threshold` from the vantage point (nullptr if none)
        Node* right;      // points at least `threshold` away from the vantage point (nullptr if none)
    };

    /// Builds the tree over `points[0, size)`.
    ///
    /// @param points    Caller-owned array of `size` doubles. Its elements are
    ///                  reordered in place; their values are never modified.
    ///                  The Tree does not take ownership. May be null only
    ///                  when `size` is 0.
    /// @param size      Number of points.
    /// @param maxThread Upper bound on the number of threads used for the
    ///                  build, including the calling thread. Values below 2
    ///                  give a purely sequential build.
    /// @throws std::length_error if `size` does not fit in `int` (node and
    ///         point indices are stored as `int`).
    Tree(double* points, size_t size, int maxThread)
        : _size(checkedSize(size)),
          _nodes(new Node[size]),
          _items(points),
          _maxThread(maxThread),
          _currentThread(1)
    {
        // Fixed seed: a sequential build is fully reproducible.
        std::default_random_engine rng(0);
        buildFromPoints(0, 0, _size, rng);
    }

    /// @return The root node, or nullptr when the tree was built from zero points.
    const Node* root() const noexcept
    {
        return _size > 0 ? _nodes.get() : nullptr;
    }

private:
    // Declaration order matters: it is the member initialization order, and
    // _size must be validated before the node array is allocated.
    const int _size;
    std::unique_ptr<Node[]> _nodes; // owning storage for all nodes
    double* _items;                 // non-owning view of the caller's points
    const int _maxThread;
    // Number of threads started so far for this build (the constructing
    // thread counts as 1). It is a budget: it is never decremented, so late,
    // tiny subtrees do not pay for thread start-up.
    std::atomic<int> _currentThread;

    /// Validates that `size` is representable as `int` and converts it.
    static int checkedSize(size_t size)
    {
        if (size > static_cast<size_t>(std::numeric_limits<int>::max()))
            throw std::length_error("Tree: point count does not fit in int");
        return static_cast<int>(size);
    }

    /// Atomically claims one thread from the budget.
    /// @return true if the caller may start a new thread.
    bool tryAcquireThreadSlot()
    {
        int current = _currentThread.load();
        // A failed compare-exchange reloads `current`, so the bound is
        // re-checked on every retry and the limit cannot be overshot.
        while (current < _maxThread)
        {
            if (_currentThread.compare_exchange_weak(current, current + 1))
                return true;
        }
        return false;
    }

    /// Builds the subtree for points `[lower, upper)` into the node slots
    /// `[nodeIdx, nodeIdx + (upper - lower))`, rearranging the points of that
    /// range in place.
    ///
    /// @param rng Engine used for pivot selection. It must be used by the
    ///            calling thread only; worker threads get their own engine.
    void buildFromPoints(int nodeIdx, int lower, int upper, std::default_random_engine& rng)
    {
        if (lower >= upper)
            return;

        Node& node = _nodes[nodeIdx];
        node.idx = lower;
        node.threshold = 0;
        node.left = nullptr;
        node.right = nullptr;
        if (upper - lower < 2)
            return; // a single point is a leaf

        // Move a randomly chosen vantage point to the front of the range.
        std::uniform_int_distribution<int> pick(lower, upper - 1);
        std::swap(_items[lower], _items[pick(rng)]);
        const double vantage = _items[lower];

        // Same value as (upper + lower - 1) / 2, but cannot overflow.
        const int median = lower + (upper - lower - 1) / 2;

        // With exactly two points median == lower, i.e. the "nth" position
        // would lie before the partitioned range; nth_element requires nth to
        // be inside [first, last], so skip it (there is nothing to partition).
        if (median > lower)
        {
            std::nth_element(
                _items + lower + 1,
                _items + median,
                _items + upper,
                [vantage](double p1, double p2)
                {
                    return std::fabs(p1 - vantage) < std::fabs(p2 - vantage);
                });
        }
        node.threshold = std::fabs(vantage - _items[median]);

        // Near half: (lower, median]; far half: (median, upper).
        const int leftIdx = nodeIdx + 1;
        const int rightIdx = leftIdx + (median - lower);
        const bool hasLeft = lower < median;
        const bool hasRight = median + 1 < upper;
        if (hasLeft)
            node.left = _nodes.get() + leftIdx;
        if (hasRight)
            node.right = _nodes.get() + rightIdx;

        if (hasLeft && hasRight && tryAcquireThreadSlot())
        {
            // The worker owns a private engine seeded by this thread, so no
            // random engine is ever shared between threads.
            const std::default_random_engine::result_type childSeed = rng();
            std::thread worker;
            try
            {
                worker = std::thread(
                    [this, leftIdx, lower, median, childSeed]
                    {
                        std::default_random_engine childRng(childSeed);
                        buildFromPoints(leftIdx, lower + 1, median + 1, childRng);
                    });
            }
            catch (const std::system_error&)
            {
                // The system refused to start a thread. The slot stays
                // consumed so the failing call is not retried at every split;
                // this subtree is built sequentially below.
            }
            if (worker.joinable())
            {
                buildFromPoints(rightIdx, median + 1, upper, rng);
                worker.join();
                return;
            }
        }

        if (hasLeft)
            buildFromPoints(leftIdx, lower + 1, median + 1, rng);
        if (hasRight)
            buildFromPoints(rightIdx, median + 1, upper, rng);
    }
};
/// Benchmark driver: fills an array with random points, builds a Tree over
/// it and prints the construction time.
///
/// Usage: prog [pointsNum [threadNum]]
///   pointsNum  number of points to generate (default 10, must be >= 0)
///   threadNum  thread limit passed to the Tree constructor (default 1)
///
/// @return 0 on success, 1 if pointsNum is negative.
int main(int argNum, char** args)
{
    int pointsNum = 10;
    int threadNum = 1;
    if (argNum > 1)
        pointsNum = std::atoi(args[1]);
    if (argNum > 2)
        threadNum = std::atoi(args[2]);
    if (pointsNum < 0)
    {
        // A negative count must never reach the allocation below.
        std::fprintf(stderr, "pointsNum must be non-negative, got %d\n", pointsNum);
        return 1;
    }

    // The vector owns the points and releases them on every exit path.
    std::vector<double> arr(static_cast<size_t>(pointsNum));
    for (double& value : arr)
        value = distr(g);

    // Preview at most the first 10 points; never read past the end when
    // fewer were requested.
    const int previewCount = std::min(pointsNum, 10);
    for (int i = 0; i < previewCount; i++)
        printf("arr[%d] = %g\n", i, arr[static_cast<size_t>(i)]);

    const auto t1 = std::chrono::high_resolution_clock::now();
    Tree t(arr.data(), arr.size(), threadNum);
    const auto t2 = std::chrono::high_resolution_clock::now();
    const double timeSpan =
        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count();
    printf("time used: %g s\n", timeSpan);
    return 0;
}