我有一个自上而下的程序,可以从3D对象的高级描述中构建一个线性八叉树(例如,叶子排列在一个数组中并按照Morton编码排序)。问题在于,对于我的预期应用,得到的八叉树必须是2:1平衡,即不得有任何一对相邻的块,其中一个是另一块的两倍以上。
我唯一能找到的是文章" Bottom Up Construction和2:1平行线性八边形的平衡细化" (你从多个来源找到它,但版权不明确,不确定在这个网站上链接的东西的政策是什么),这解释了一个算法来做到这一点。问题是所提出的算法在并行消息传递架构中工作,并且它对我的应用程序来说太过分了。另一个问题是(自下而上)构造和平衡算法似乎捆绑在一起,我不知道如何在用我自己的方法构造树之后如何仅平衡它。
那么2:1平衡线性八叉树的(有希望简单且有效)方法是什么?并行算法也很棒,但是使用共享内存模型,而不是像链接算法那样传递消息。
答案 0 :(得分:4)
最简单的顺序算法可能是保持包含未处理的八叉树节点的队列,并通过确保其三个级别 - 1个非兄弟邻居存在来依次处理每个节点。处理期间创建的其他节点进入队列。
-----------------
| | | | | |
--------- |
| | | | | |
--------- |
| | |d| | |
--b------ |
| | |a|e| |
-----------------
| | |
| | |
| | |
| c | |
| | |
| | |
| | |
-----------------
这里是两个(四叉树)级别 - 一个非兄弟姐妹的邻居是b,而一个尚未存在的孩子是c。节点d和e是(同级)兄弟邻居。
该算法的复杂部分是确定如何找到这些邻居。这可以通过计算节点及其非兄弟邻居的坐标,Morton对它们进行编码,然后依次在节点及其每个邻居的编码的XOR中寻找最重要的设置位来实现。这个位超过三加1的索引是需要遍历的父链接数。
例如,让我们使用yxyxyx作为四叉树图的Morton交错。节点a是从(2,4)到(3,5)的正方形,并且具有莫顿坐标100100.节点b是从(0,4)到(2,6)的正方形,并且具有莫顿坐标100000.节点c是从(0,0)到(4,4)的正方形并且具有莫顿坐标000000.节点d是从(2,5)到(3,6)的正方形并且具有莫顿坐标100110.节点e是从(3)的正方形,4)到(4,5)并且莫顿坐标为100101。
为了找到邻居,我们编码(2 + 1,4)和(2-1,4)和(2,4 + 1)和(2,4-1)。我们做(2,4-1)=(2,3)。 (2,3)的莫顿编码是001110.与莫顿编码相比,
001110
100100
XOR ------
101010
\/\/\/
由于两位的第三最低有效组是非零的,我们根据Morton代码上升到叶级别减去三(即,来自a的三个父链接)然后下降两次(三次减去一次)。当遇到不存在的子节点时,与第二个下降步骤一样,我们根据需要制作它们并将它们添加到队列中。
并行版
由于我已经在顺序算法中避免了一些复杂的优化,因此只要在分割八叉树节点时没有比赛,它对处理顺序的变化甚至是并行性都是稳健的。对于共享内存并行性,给定比较和交换原语,很容易构建子锁无关(从null分配一个节点然后CAS适当的子指针;如果CAS失败,那么只需读取获胜儿童)。鉴于CAS可用,队列的有效共享集合不应太难(它不必是FIFO)。我不知道在这里会有什么样的加速并行性。
答案 1 :(得分:4)
您引用的论文是我小组以前的一篇文章。由于您询问了版权问题,因此该代码可在我们的论坛网页GPL2下找到:http://padas.ices.utexas.edu/software
新代码(pvfmm,我正在处理)也可以从上面的链接下载,可以在GPL3下找到。这个算法有一个更简单,更有效的平衡算法。 该算法在本文中描述:http://padas.ices.utexas.edu/static/papers/pvfmm.pdf
我已经调整了下面的代码来删除分布式内存并行性。
函数balanceOctree
实现了算法。
#include <vector>
#include <set>
#include <cstdlib>
#include <cmath>
#include <stdint.h>
#include <omp.h>
#include <algorithm>
#include <cstring>
#ifndef MAX_DEPTH
#define MAX_DEPTH 30
#endif
#if MAX_DEPTH < 7
#define UINT_T uint8_t
#define INT_T int8_t
#elif MAX_DEPTH < 15
#define UINT_T uint16_t
#define INT_T int16_t
#elif MAX_DEPTH < 31
#define UINT_T uint32_t
#define INT_T int32_t
#elif MAX_DEPTH < 63
#define UINT_T uint64_t
#define INT_T int64_t
#endif
namespace omp_par{
template <class T,class StrictWeakOrdering>
void merge(T A_,T A_last,T B_,T B_last,T C_,int p,StrictWeakOrdering comp){
typedef typename std::iterator_traits<T>::difference_type _DiffType;
typedef typename std::iterator_traits<T>::value_type _ValType;
_DiffType N1=A_last-A_;
_DiffType N2=B_last-B_;
if(N1==0 && N2==0) return;
if(N1==0 || N2==0){
_ValType* A=(N1==0? &B_[0]: &A_[0]);
_DiffType N=(N1==0? N2 : N1 );
#pragma omp parallel for
for(int i=0;i<p;i++){
_DiffType indx1=( i *N)/p;
_DiffType indx2=((i+1)*N)/p;
memcpy(&C_[indx1], &A[indx1], (indx2-indx1)*sizeof(_ValType));
}
return;
}
//Split both arrays ( A and B ) into n equal parts.
//Find the position of each split in the final merged array.
int n=10;
_ValType* split=new _ValType[p*n*2];
_DiffType* split_size=new _DiffType[p*n*2];
#pragma omp parallel for
for(int i=0;i<p;i++){
for(int j=0;j<n;j++){
int indx=i*n+j;
_DiffType indx1=(indx*N1)/(p*n);
split [indx]=A_[indx1];
split_size[indx]=indx1+(std::lower_bound(B_,B_last,split[indx],comp)-B_);
indx1=(indx*N2)/(p*n);
indx+=p*n;
split [indx]=B_[indx1];
split_size[indx]=indx1+(std::lower_bound(A_,A_last,split[indx],comp)-A_);
}
}
//Find the closest split position for each thread that will
//divide the final array equally between the threads.
_DiffType* split_indx_A=new _DiffType[p+1];
_DiffType* split_indx_B=new _DiffType[p+1];
split_indx_A[0]=0;
split_indx_B[0]=0;
split_indx_A[p]=N1;
split_indx_B[p]=N2;
#pragma omp parallel for
for(int i=1;i<p;i++){
_DiffType req_size=(i*(N1+N2))/p;
int j=std::lower_bound(&split_size[0],&split_size[p*n],req_size,std::less<_DiffType>())-&split_size[0];
if(j>=p*n)
j=p*n-1;
_ValType split1 =split [j];
_DiffType split_size1=split_size[j];
j=(std::lower_bound(&split_size[p*n],&split_size[p*n*2],req_size,std::less<_DiffType>())-&split_size[p*n])+p*n;
if(j>=2*p*n)
j=2*p*n-1;
if(abs(split_size[j]-req_size)<abs(split_size1-req_size)){
split1 =split [j];
split_size1=split_size[j];
}
split_indx_A[i]=std::lower_bound(A_,A_last,split1,comp)-A_;
split_indx_B[i]=std::lower_bound(B_,B_last,split1,comp)-B_;
}
delete[] split;
delete[] split_size;
//Merge for each thread independently.
#pragma omp parallel for
for(int i=0;i<p;i++){
T C=C_+split_indx_A[i]+split_indx_B[i];
std::merge(A_+split_indx_A[i],A_+split_indx_A[i+1],B_+split_indx_B[i],B_+split_indx_B[i+1],C,comp);
}
delete[] split_indx_A;
delete[] split_indx_B;
}
template <class T,class StrictWeakOrdering>
void merge_sort(T A,T A_last,StrictWeakOrdering comp){
typedef typename std::iterator_traits<T>::difference_type _DiffType;
typedef typename std::iterator_traits<T>::value_type _ValType;
int p=omp_get_max_threads();
_DiffType N=A_last-A;
if(N<2*p){
std::sort(A,A_last,comp);
return;
}
//Split the array A into p equal parts.
_DiffType* split=new _DiffType[p+1];
split[p]=N;
#pragma omp parallel for
for(int id=0;id<p;id++){
split[id]=(id*N)/p;
}
//Sort each part independently.
#pragma omp parallel for
for(int id=0;id<p;id++){
std::sort(A+split[id],A+split[id+1],comp);
}
//Merge two parts at a time.
_ValType* B=new _ValType[N];
_ValType* A_=&A[0];
_ValType* B_=&B[0];
for(int j=1;j<p;j=j*2){
for(int i=0;i<p;i=i+2*j){
if(i+j<p){
merge(A_+split[i],A_+split[i+j],A_+split[i+j],A_+split[(i+2*j<=p?i+2*j:p)],B_+split[i],p,comp);
}else{
#pragma omp parallel for
for(int k=split[i];k<split[p];k++)
B_[k]=A_[k];
}
}
_ValType* tmp_swap=A_;
A_=B_;
B_=tmp_swap;
}
//The final result should be in A.
if(A_!=&A[0]){
#pragma omp parallel for
for(int i=0;i<N;i++)
A[i]=A_[i];
}
//Free memory.
delete[] split;
delete[] B;
}
template <class T>
void merge_sort(T A,T A_last){
typedef typename std::iterator_traits<T>::value_type _ValType;
merge_sort(A,A_last,std::less<_ValType>());
}
template <class T, class I>
T reduce(T* A, I cnt){
T sum=0;
#pragma omp parallel for reduction(+:sum)
for(I i = 0; i < cnt; i++)
sum+=A[i];
return sum;
}
template <class T, class I>
void scan(T* A, T* B,I cnt){
int p=omp_get_max_threads();
if(cnt<(I)100*p){
for(I i=1;i<cnt;i++)
B[i]=B[i-1]+A[i-1];
return;
}
I step_size=cnt/p;
#pragma omp parallel for
for(int i=0; i<p; i++){
int start=i*step_size;
int end=start+step_size;
if(i==p-1) end=cnt;
if(i!=0)B[start]=0;
for(I j=(I)start+1; j<(I)end; j++)
B[j]=B[j-1]+A[j-1];
}
T* sum=new T[p];
sum[0]=0;
for(int i=1;i<p;i++)
sum[i]=sum[i-1]+B[i*step_size-1]+A[i*step_size-1];
#pragma omp parallel for
for(int i=1; i<p; i++){
int start=i*step_size;
int end=start+step_size;
if(i==p-1) end=cnt;
T sum_=sum[i];
for(I j=(I)start; j<(I)end; j++)
B[j]+=sum_;
}
delete[] sum;
}
}
class MortonId{
public:
MortonId();
MortonId(MortonId m, unsigned char depth);
template <class T>
MortonId(T x_f,T y_f, T z_f, unsigned char depth=MAX_DEPTH);
template <class T>
MortonId(T* coord, unsigned char depth=MAX_DEPTH);
unsigned int GetDepth() const;
template <class T>
void GetCoord(T* coord);
MortonId NextId() const;
MortonId getAncestor(unsigned char ancestor_level) const;
/**
* \brief Returns the deepest first descendant.
*/
MortonId getDFD(unsigned char level=MAX_DEPTH) const;
void NbrList(std::vector<MortonId>& nbrs,unsigned char level, int periodic) const;
std::vector<MortonId> Children() const;
int operator<(const MortonId& m) const;
int operator>(const MortonId& m) const;
int operator==(const MortonId& m) const;
int operator!=(const MortonId& m) const;
int operator<=(const MortonId& m) const;
int operator>=(const MortonId& m) const;
int isAncestor(MortonId const & other) const;
private:
UINT_T x,y,z;
unsigned char depth;
};
inline MortonId::MortonId():x(0), y(0), z(0), depth(0){}
inline MortonId::MortonId(MortonId m, unsigned char depth_):x(m.x), y(m.y), z(m.z), depth(depth_){}
template <class T>
inline MortonId::MortonId(T x_f,T y_f, T z_f, unsigned char depth_): depth(depth_){
UINT_T max_int=((UINT_T)1)<<(MAX_DEPTH);
x=(UINT_T)floor(x_f*max_int);
y=(UINT_T)floor(y_f*max_int);
z=(UINT_T)floor(z_f*max_int);
}
template <class T>
inline MortonId::MortonId(T* coord, unsigned char depth_): depth(depth_){
UINT_T max_int=((UINT_T)1)<<(MAX_DEPTH);
x=(UINT_T)floor(coord[0]*max_int);
y=(UINT_T)floor(coord[1]*max_int);
z=(UINT_T)floor(coord[2]*max_int);
}
template <class T>
inline void MortonId::GetCoord(T* coord){
UINT_T max_int=((UINT_T)1)<<(MAX_DEPTH);
T s=1.0/((T)max_int);
coord[0]=x*s;
coord[1]=y*s;
coord[2]=z*s;
}
inline unsigned int MortonId::GetDepth() const{
return depth;
}
inline MortonId MortonId::NextId() const{
MortonId m=*this;
UINT_T mask=((UINT_T)1)<<(MAX_DEPTH-depth);
int i;
for(i=depth;i>=0;i--){
m.x=(m.x^mask);
if((m.x & mask))
break;
m.y=(m.y^mask);
if((m.y & mask))
break;
m.z=(m.z^mask);
if((m.z & mask))
break;
mask=(mask<<1);
}
m.depth=i;
return m;
}
inline MortonId MortonId::getAncestor(unsigned char ancestor_level) const{
MortonId m=*this;
m.depth=ancestor_level;
UINT_T mask=(((UINT_T)1)<<(MAX_DEPTH))-(((UINT_T)1)<<(MAX_DEPTH-ancestor_level));
m.x=(m.x & mask);
m.y=(m.y & mask);
m.z=(m.z & mask);
return m;
}
inline MortonId MortonId::getDFD(unsigned char level) const{
MortonId m=*this;
m.depth=level;
return m;
}
inline int MortonId::operator<(const MortonId& m) const{
if(x==m.x && y==m.y && z==m.z) return depth<m.depth;
UINT_T x_=(x^m.x);
UINT_T y_=(y^m.y);
UINT_T z_=(z^m.z);
if((z_>x_ || ((z_^x_)<x_ && (z_^x_)<z_)) && (z_>y_ || ((z_^y_)<y_ && (z_^y_)<z_)))
return z<m.z;
if(y_>x_ || ((y_^x_)<x_ && (y_^x_)<y_))
return y<m.y;
return x<m.x;
}
inline int MortonId::operator>(const MortonId& m) const{
if(x==m.x && y==m.y && z==m.z) return depth>m.depth;
UINT_T x_=(x^m.x);
UINT_T y_=(y^m.y);
UINT_T z_=(z^m.z);
if((z_>x_ || ((z_^x_)<x_ && (z_^x_)<z_)) && (z_>y_ || ((z_^y_)<y_ && (z_^y_)<z_)))
return z>m.z;
if((y_>x_ || ((y_^x_)<x_ && (y_^x_)<y_)))
return y>m.y;
return x>m.x;
}
inline int MortonId::operator==(const MortonId& m) const{
return (x==m.x && y==m.y && z==m.z && depth==m.depth);
}
inline int MortonId::operator!=(const MortonId& m) const{
return !(*this==m);
}
inline int MortonId::operator<=(const MortonId& m) const{
return !(*this>m);
}
inline int MortonId::operator>=(const MortonId& m) const{
return !(*this<m);
}
inline int MortonId::isAncestor(MortonId const & other) const {
return other.depth>depth && other.getAncestor(depth)==*this;
}
void MortonId::NbrList(std::vector<MortonId>& nbrs, unsigned char level, int periodic) const{
nbrs.clear();
static int dim=3;
static int nbr_cnt=(int)(pow(3.0,dim)+0.5);
static UINT_T maxCoord=(((UINT_T)1)<<(MAX_DEPTH));
UINT_T mask=maxCoord-(((UINT_T)1)<<(MAX_DEPTH-level));
UINT_T pX=x & mask;
UINT_T pY=y & mask;
UINT_T pZ=z & mask;
MortonId mid_tmp;
mask=(((UINT_T)1)<<(MAX_DEPTH-level));
for(int i=0; i<nbr_cnt; i++){
INT_T dX = ((i/1)%3-1)*mask;
INT_T dY = ((i/3)%3-1)*mask;
INT_T dZ = ((i/9)%3-1)*mask;
INT_T newX=(INT_T)pX+dX;
INT_T newY=(INT_T)pY+dY;
INT_T newZ=(INT_T)pZ+dZ;
if(!periodic){
if(newX>=0 && newX<(INT_T)maxCoord)
if(newY>=0 && newY<(INT_T)maxCoord)
if(newZ>=0 && newZ<(INT_T)maxCoord){
mid_tmp.x=newX; mid_tmp.y=newY; mid_tmp.z=newZ;
mid_tmp.depth=level;
nbrs.push_back(mid_tmp);
}
}else{
if(newX<0) newX+=maxCoord; if(newX>=(INT_T)maxCoord) newX-=maxCoord;
if(newY<0) newY+=maxCoord; if(newY>=(INT_T)maxCoord) newY-=maxCoord;
if(newZ<0) newZ+=maxCoord; if(newZ>=(INT_T)maxCoord) newZ-=maxCoord;
mid_tmp.x=newX; mid_tmp.y=newY; mid_tmp.z=newZ;
mid_tmp.depth=level;
nbrs.push_back(mid_tmp);
}
}
}
std::vector<MortonId> MortonId::Children() const{
static int dim=3;
static int c_cnt=(1UL<<dim);
static UINT_T maxCoord=(((UINT_T)1)<<(MAX_DEPTH));
std::vector<MortonId> child(c_cnt);
UINT_T mask=maxCoord-(((UINT_T)1)<<(MAX_DEPTH-depth));
UINT_T pX=x & mask;
UINT_T pY=y & mask;
UINT_T pZ=z & mask;
mask=(((UINT_T)1)<<(MAX_DEPTH-(depth+1)));
for(int i=0; i<c_cnt; i++){
child[i].x=pX+mask*((i/1)%2);
child[i].y=pY+mask*((i/2)%2);
child[i].z=pZ+mask*((i/4)%2);
child[i].depth=depth+1;
}
return child;
}
//list must be sorted.
void lineariseList(std::vector<MortonId> & list) {
if(!list.empty()) {// Remove duplicates and ancestors.
std::vector<MortonId> tmp;
for(unsigned int i = 0; i < (list.size()-1); i++) {
if( (!(list[i].isAncestor(list[i+1]))) && (list[i] != list[i+1]) ) {
tmp.push_back(list[i]);
}
}
tmp.push_back(list[list.size()-1]);
list.swap(tmp);
}
}
void balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &out,
unsigned int maxDepth, bool periodic) {
unsigned int dim=3;
int omp_p=omp_get_max_threads();
if(in.size()==1){
out=in;
return 0;
}
//Build level-by-level set of nodes.
std::vector<std::set<MortonId> > nodes((maxDepth+1)*omp_p);
#pragma omp parallel for
for(int p=0;p<omp_p;p++){
size_t a=( p *in.size())/omp_p;
size_t b=((p+1)*in.size())/omp_p;
for(size_t i=a;i<b;){
size_t d=in[i].GetDepth();
if(d==0){i++; continue;}
MortonId pnode=in[i].getAncestor(d-1);
nodes[d-1+(maxDepth+1)*p].insert(pnode);
while(i<b && d==in[i].GetDepth() && pnode==in[i].getAncestor(d-1)) i++;
}
//Add new nodes level-by-level.
std::vector<MortonId> nbrs;
unsigned int num_chld=1UL<<dim;
for(unsigned int l=maxDepth;l>=1;l--){
//Build set of parents of balancing nodes.
std::set<MortonId> nbrs_parent;
std::set<MortonId>::iterator start=nodes[l+(maxDepth+1)*p].begin();
std::set<MortonId>::iterator end =nodes[l+(maxDepth+1)*p].end();
for(std::set<MortonId>::iterator node=start; node != end;){
node->NbrList(nbrs, l, periodic);
int nbr_cnt=nbrs.size();
for(int i=0;i<nbr_cnt;i++)
nbrs_parent.insert(nbrs[i].getAncestor(l-1));
node++;
}
//Get the balancing nodes.
std::set<MortonId>& ancestor_nodes=nodes[l-1+(maxDepth+1)*p];
start=nbrs_parent.begin();
end =nbrs_parent.end();
ancestor_nodes.insert(start,end);
}
//Remove ancestors nodes. (optional)
for(unsigned int l=1;l<=maxDepth;l++){
std::set<MortonId>::iterator start=nodes[l +(maxDepth+1)*p].begin();
std::set<MortonId>::iterator end =nodes[l +(maxDepth+1)*p].end();
std::set<MortonId>& ancestor_nodes=nodes[l-1+(maxDepth+1)*p];
for(std::set<MortonId>::iterator node=start; node != end; node++){
MortonId parent=node->getAncestor(node->GetDepth()-1);
ancestor_nodes.erase(parent);
}
}
}
//Resize out.
std::vector<size_t> node_cnt(omp_p,0);
std::vector<size_t> node_dsp(omp_p,0);
#pragma omp parallel for
for(int i=0;i<omp_p;i++){
for(unsigned int j=0;j<=maxDepth;j++)
node_cnt[i]+=nodes[j+i*(maxDepth+1)].size();
}
omp_par::scan(&node_cnt[0],&node_dsp[0], omp_p);
out.resize(node_cnt[omp_p-1]+node_dsp[omp_p-1]);
//Copy leaf nodes to out.
#pragma omp parallel for
for(int p=0;p<omp_p;p++){
size_t node_iter=node_dsp[p];
for(unsigned int l=0;l<=maxDepth;l++){
std::set<MortonId>::iterator start=nodes[l +(maxDepth+1)*p].begin();
std::set<MortonId>::iterator end =nodes[l +(maxDepth+1)*p].end();
for(std::set<MortonId>::iterator node=start; node != end; node++)
out[node_iter++]=*node;
}
}
//Sort, Linearise, Redistribute.
omp_par::merge_sort(&out[0], &out[out.size()]);
lineariseList(out);
{ // Add children
MortonId nxt_mid(0,0,0,0);
std::vector<MortonId> out1;
std::vector<MortonId> children;
for(size_t i=0;i<out.size();i++){
while(nxt_mid.getDFD()<out[i]){
while(nxt_mid.isAncestor(out[i])){
nxt_mid=nxt_mid.getAncestor(nxt_mid.GetDepth()+1);
}
out1.push_back(nxt_mid);
nxt_mid=nxt_mid.NextId();
}
children=out[i].Children();
for(size_t j=0;j<8;j++){
out1.push_back(children[j]);
}
nxt_mid=out[i].NextId();
}
while(nxt_mid.GetDepth()>0){
out1.push_back(nxt_mid);
nxt_mid=nxt_mid.NextId();
}
out.swap(out1);
}
}
以上代码针对性能进行了优化,OpenMP使事情变得更加复杂,但基本思路是: