Question

我有一个二维矩阵，它沿对角线方向存储在一个扁平缓冲区中。例如，4x4矩阵的索引分布如下：

 0   2   5   9
 1   4   8  12
 3   7  11  14
 6  10  13  15

使用此表示法，在给定原始索引和X / Y偏移量的情况下，计算相邻元素索引的最有效方法是什么？例如：

// Return the index of a neighbor given an offset.
//   index    : position of the element in the diagonally-flattened buffer
//   x_offset : column displacement of the neighbor (negative moves left)
//   y_offset : row displacement of the neighbor (negative moves up)
int getNGonalNeighbor(const size_t index,
                      const int x_offset,
                      const int y_offset){
    //...
}

// expected results for the 4x4 array above:
getNGonalNeighbor(15,-1,-1); // should return 11
getNGonalNeighbor(15, 0,-1); // should return 14
getNGonalNeighbor(15,-1, 0); // should return 13
getNGonalNeighbor(11,-2,-1); // should return 1

我们假设溢出永远不会发生，并且没有环绕。

我有一个涉及大量三角形数（triangular number）和三角根计算的解决方案。它还包含很多分支，如果可能的话，我宁愿用代数运算替换它们（这将在GPU上运行，而发散的控制流在GPU上代价很高）。我的解决方案可以工作，但非常冗长。我觉得一定有一个更简单、计算量更小的方法。

如果有人能告诉我这种特定问题/表示法的名称，也许对我也有帮助。

如果有人感兴趣，我可以发布我的完整解决方案，但正如我所说，对于这样一个简单的任务而言，它非常冗长且相对复杂。简而言之，我的解决方案做了以下几步：

将原始索引转换为更大的三角矩阵，以避免处理2个三角形（例如13将变为17）

对于4x4矩阵，这将是：

0   2   5   9   14  20  27
1   4   8   13  19  26  
3   7   12  18  25  
6   11  17  24  
10  16  23
15  22  
21

使用偏移量的曼哈顿距离和索引的三角根，计算邻居所在对角线在此表示中的索引。
使用偏移量计算邻居在该对角线中的位置。
通过删除填充来转换回原始表示。

出于某种原因，这是我能提出的最简单的解决方案。

修改

有循环来累积偏移量：

我意识到，考虑到三角形数的性质，将矩阵分成两个三角形会更容易（我们把0到9称为“上三角”，把10到15称为“下三角”），然后用一个内部带判断的循环来累积偏移量：在上三角中每步加一，在下三角中每步减一（如果这样说得通的话）。但是对于我的解决方案，必须不惜一切代价避免循环，尤其是迭代次数不均衡的循环（同样，这对GPU非常不利）。

所以我正在寻找代数解决方案而不是算法。

构建查找表：

同样，由于GPU，最好避免构建查找表并在其中进行随机访问（非常昂贵）。代数解决方案更可取。

矩阵的属性：

矩阵的大小是已知的。
现在我只考虑方形矩阵，但对于矩形矩阵的解决方案也会很好。
正如我的例子中的函数名称所示，将解决方案扩展到N维卷（因此N-gonal flattening）也将是一个很大的优势。

Answer 1

表格查询

#include <stdio.h>

// SIZE: number of matrix elements held in the flat buffer
#define SIZE 16
#define SIDE  4  //side length of the square matrix, sqrt(SIZE)

// table[i]: coordinate of the element stored at flat index i, packed as x*10 + y
int table[SIZE];
int rtable[100];// indexed by the packed coordinate x*10 + y, so x and y must each stay in 0..9

/*
 * Fill both lookup tables by walking the matrix one anti-diagonal at a time.
 *   table[i]         receives the packed coordinate x*10 + y of flat index i
 *   rtable[x*10 + y] receives the flat index of the cell at (x, y)
 */
void setup(){
    int pos;
    int col = 0, row = 0;
    int diag = 0; // col + row of the diagonal currently being walked

    for(pos = 0; pos < SIZE; ++pos){
        const int packed = col*10 + row;

        table[pos] = packed;
        rtable[packed] = pos;

        // step right-up along the current diagonal
        ++col;
        --row;
        if(row >= 0 && col < SIDE)
            continue;

        // left the matrix: restart on the next diagonal, then slide right-up
        // until the row is back inside the matrix
        ++diag;
        col = 0;
        row = diag;
        while(row >= SIDE){
            ++col;
            --row;
        }
    }
}
int getNGonalNeighbor(int index, int offsetX, int offsetY){
    int x,y;
    x=table[index] / 10 + offsetX;
    y=table[index] % 10 + offsetY;
    if(x < 0 || x >= SIDE || y < 0 || y >= SIDE) return -1; //ERROR
    return rtable[ x*10+y ];
}

// Build the lookup tables, then print the neighbor index for each sample
// query, one result per line.
int main() {
    // each row is {index, offsetX, offsetY}
    static const int queries[][3] = {
        {15, -1, -1},
        {15,  0, -1},
        {15, -1,  0},
        {11, -2, -1},
        { 0, -1, -1},
    };
    const int count = sizeof queries / sizeof queries[0];
    int q;

    setup();
    for(q = 0; q < count; ++q){
        printf("%d\n", getNGonalNeighbor(queries[q][0], queries[q][1], queries[q][2]));
    }

    return 0;
}

不使用表格的版本。

#include <stdio.h>

// number of matrix elements; the walking loops below visit exactly this many cells
#define SIZE 16
// upper bound (exclusive) for both the x and the y coordinate
#define SIDE  4

/*
 * Convert a flat diagonal-order index into matrix coordinates.
 *   index   : position in the flattened buffer, expected in 0..SIZE-1
 *   offsetX : receives the x coordinate of that element
 *   offsetY : receives the y coordinate of that element
 * When index does not name an element, both outputs are set to -1 so that a
 * caller never reads a value this function failed to write.
 */
void num2xy(int index, int *offsetX, int *offsetY){
    int i, x, y, xy;//xy = x + y

    x=y=xy=0;
    for(i=0;i<SIZE;++i){
        if(i == index){
            *offsetX = x;
            *offsetY = y;
            return;
        }
        x = x + 1; y = y - 1;//right up
        if(y < 0 || x >= SIDE){
            // left the matrix: restart on the next diagonal, then slide
            // right-up until y is back inside the matrix
            ++xy;
            x = 0;
            y = xy;
            while(y>=SIDE){
                ++x;
                --y;
            }
        }
    }

    // index was outside 0..SIZE-1
    *offsetX = -1;
    *offsetY = -1;
}
/*
 * Convert matrix coordinates into the flat diagonal-order index by walking
 * the cells in storage order until the requested one is reached.
 * Returns -1 when (offsetX, offsetY) is not visited, i.e. it is not a cell of
 * the matrix.
 */
int xy2num(int offsetX, int offsetY){
    int pos;
    int col = 0, row = 0;
    int diag = 0; // col + row of the diagonal currently being walked

    for(pos = 0; pos < SIZE; ++pos){
        if(col == offsetX && row == offsetY)
            return pos;

        // step right-up along the current diagonal
        ++col;
        --row;
        if(row >= 0 && col < SIDE)
            continue;

        // left the matrix: restart on the next diagonal, then slide right-up
        // until the row is back inside the matrix
        ++diag;
        col = 0;
        row = diag;
        while(row >= SIDE){
            ++col;
            --row;
        }
    }
    return -1;
}
int getNGonalNeighbor(int index, int offsetX, int offsetY){
    int x,y;

    num2xy(index, &x, &y);

    return xy2num(x + offsetX, y + offsetY);
}

// Print the neighbor index for each sample query, one result per line.
int main() {
    // each row is {index, offsetX, offsetY}
    static const int queries[][3] = {
        {15, -1, -1},
        {15,  0, -1},
        {15, -1,  0},
        {11, -2, -1},
        { 0, -1, -1},
    };
    const int count = sizeof queries / sizeof queries[0];
    int q;

    for(q = 0; q < count; ++q){
        printf("%d\n", getNGonalNeighbor(queries[q][0], queries[q][1], queries[q][2]));
    }

    return 0;
}

Answer 2

实际上，解决这个问题所需的各个部分在我代码的其他地方已经有了。正如BLUEPIXY的解决方案所暗示的那样，我正在使用分散/收集（scatter/gather）操作，而布局转换我已经实现过了。

此解决方案基本上重建矩阵中给定元素的原始(x,y)索引，应用索引偏移并将结果转换回转换后的布局。它将正方形分成2个三角形，并根据它所属的三角形调整计算。

这几乎完全是代数转换：它不使用循环，也不使用表查找，内存占用少，分支少。代码可能会进一步优化。

以下是代码草案：

#include <stdio.h>
#include <math.h>

// side length of the square matrix
#define SIZE 4

// triangle number of X: 0 + 1 + ... + X
#define TRIG(X) (((X) * ((X) + 1)) >> 1)
// triangle root of X: the largest n such that TRIG(n) <= X
// (the definition must not end with a semicolon, otherwise the macro cannot
// be used as part of a larger expression)
#define TRIROOT(X) ((int)(sqrt(8*(X)+1)-1)>>1)

// Return the index of the element displaced by (x_offset, y_offset) from the
// element stored at `index` in the diagonally-flattened SIZE x SIZE matrix.
// The index is first turned back into (x, y), the offset is applied, and the
// result is converted to the flattened layout again. No range checking is done.
int getNGonalNeighbor(const size_t index,
                      const int x_offset,
                      const int y_offset){
    // number of elements in the upper triangle (diagonals 0 .. SIZE-1)
    const size_t upper_count = TRIG(SIZE);
    const int in_upper = index < upper_count;

    // a lower-triangle index is mirrored so that it can be treated like an
    // upper-triangle one
    const size_t mirrored = in_upper ? index : SIZE * SIZE - index - 1;

    // diagonal holding the mirrored index, the first index on that diagonal,
    // and the distance of the mirrored index from that first index
    const size_t diag = TRIROOT(mirrored);
    const size_t diag_start = TRIG(diag);
    const size_t along = mirrored - diag_start;

    // coordinates of the element itself
    unsigned int col, row;
    if(in_upper){
        col = along;
        row = diag - along;
    } else {
        // undo the mirroring
        col = SIZE - along - 1;
        row = SIZE - (diag_start + diag + 1 - mirrored);
    }

    // move to the neighbor; a negative offset relies on unsigned wrap-around
    col += x_offset;
    row += y_offset;

    // diagonal of the neighbor
    const size_t target_diag = col + row;

    // start from the index the neighbor would have in an unbounded triangle ...
    size_t result = TRIG(target_diag);
    if(target_diag >= SIZE){
        // ... diagonals past the main one start at a column greater than 0 ...
        result += col - (target_diag - SIZE + 1);
    } else {
        result += col;
    }
    if(target_diag > SIZE){
        // ... and the preceding diagonals were shorter than in the triangle
        result -= 2 * TRIG(target_diag - SIZE);
    }
    return result;
}

// Run the sample queries from the question and print each result on its own line.
int main(){
    // each row is {index, x_offset, y_offset}; the question expects
    // 11, 14, 13 and 1 respectively
    static const int samples[][3] = {
        {15, -1, -1},
        {15,  0, -1},
        {15, -1,  0},
        {11, -2, -1},
    };

    for(size_t k = 0; k < sizeof samples / sizeof samples[0]; ++k){
        printf("%d\n", getNGonalNeighbor(samples[k][0], samples[k][1], samples[k][2]));
    }
}

输出确实是：

11
14
13
1

如果您认为此解决方案看起来过于复杂且效率低下，请注意这里的目标平台是GPU：与内存访问相比，计算成本几乎为零，并且所有索引计算都是在大规模并行体系结构上同时进行的。

对角扁平矩阵的邻域索引计算

2 个答案: