我希望以快速有效的方式为C语言编写Morton Z-Order Encoding and Decoding的两个函数。
/* Desired API: pack three 32-bit indices into a single 64-bit Morton (Z-order) code. */
uint64_t morton_encode(uint32_t xindex, uint32_t yindex, uint32_t zindex);
/* Desired API: recover the three indices from a Morton code through the output pointers. */
void morton_decode(uint64_t morton_number, uint32_t *xindex, uint32_t *yindex, uint32_t *zindex);
我之前已经参考过这个问题:
how to compute a 3d morton number interleave the bits of 3 ints
我目前基于SO和开源代码的解决方案是
/*
 * Spread the low 21 bits of w so that input bit i ends up at output bit 3*i
 * (two zero bits are inserted between consecutive input bits).
 *
 * Bits of w above bit 20 are discarded. The result only has bits set at
 * positions 0, 3, 6, ..., 60, i.e. inside 0x1249249249249249.
 *
 * Every step copies the value shifted left, then masks away the stale
 * copies so that each bit group is split in two.
 */
uint64_t spread(uint64_t w) {
    w &= 0x00000000001fffff;                 /* keep the 21 input bits */
    w = (w | w << 32) & 0x001f00000000ffff;  /* bits 16..20 -> 48..52 */
    w = (w | w << 16) & 0x001f0000ff0000ff;  /* bits  8..15 -> 24..31 */
    /*
     * Input bit 20 sits at bit 52 here and has to move to bit 60, so the
     * top nibble of this mask must be 0x1. A mask beginning 0x010f... drops
     * bit 20 and lets a copy of input bit 16 leak into bit 60 instead.
     */
    w = (w | w << 8) & 0x100f00f00f00f00f;
    w = (w | w << 4) & 0x10c30c30c30c30c3;
    w = (w | w << 2) & 0x1249249249249249;
    return w;
}
/*
 * Build a 3D Morton code by OR-ing three spread() results.
 *
 * spread() leaves its output on every third bit starting at bit 0; shifting
 * the y result by 1 and the z result by 2 puts each coordinate in its own
 * lane of every 3-bit group (x: lane 0, y: lane 1, z: lane 2).
 */
uint64_t morton_encode(uint32_t x, uint32_t y, uint32_t z) {
    const uint64_t x_lane = spread((uint64_t)x);
    const uint64_t y_lane = spread((uint64_t)y) << 1;
    const uint64_t z_lane = spread((uint64_t)z) << 2;

    return x_lane | y_lane | z_lane;
}
///////////////// For Decoding //////////////////////
/*
 * Gather every third bit of w (bits 0, 3, 6, ..., 60) into the low 21 bits
 * of the result; all other bits of w are ignored.
 *
 * Each stage folds the value onto itself with a right shift that doubles
 * every time (2, 4, 8, 16, 32) and then keeps only the merged groups.
 */
uint32_t compact(uint64_t w) {
    /* Masks applied after the shifts by 2, 4, 8, 16 and 32, in that order. */
    static const uint64_t keep[5] = {
        0x30c30c30c30c30c3,
        0xf00f00f00f00f00f,
        0x00ff0000ff0000ff,
        0x00ff00000000ffff,
        0x00000000001fffff,
    };
    uint64_t bits = w & 0x1249249249249249;
    unsigned shift = 2;

    for (int stage = 0; stage < 5; ++stage) {
        bits = (bits ^ (bits >> shift)) & keep[stage];
        shift *= 2;
    }
    return (uint32_t)bits;
}
/*
 * Split a 3D Morton code into its three coordinates.
 *
 * morton_number: code whose bit lanes 0, 1 and 2 (position mod 3) are fed
 *                to compact() for x, y and z respectively.
 * xindex, yindex, zindex: output pointers; each is dereferenced
 *                unconditionally, so none may be NULL.
 *
 * Shifting right by 1 or 2 moves the y or z lane onto the bit positions
 * that compact() gathers.
 */
void morton_decode(uint64_t morton_number, uint32_t *xindex, uint32_t *yindex, uint32_t *zindex){
    *xindex = compact(morton_number);
    *yindex = compact(morton_number >> 1);
    *zindex = compact(morton_number >> 2);
}
我最近遇到了这个问题(尝试使用2D morton代码时):2d morton code encode decode 64bits
#include <immintrin.h>
#include <stdint.h>
// on GCC, compile with option -mbmi2, requires Haswell or better.
/*
 * Interleave the bits of x and y into a 64-bit 2D Morton code with the BMI2
 * PDEP instruction: x fills the even bit positions, y the odd ones.
 *
 * The 64-bit intrinsic and 64-bit masks are needed so that all 32 bits of
 * each coordinate are deposited; the 32-bit form only has room for the low
 * 16 bits of x and of y.
 */
uint64_t xy_to_morton (uint32_t x, uint32_t y)
{
    return _pdep_u64(x, 0x5555555555555555ULL) | _pdep_u64(y, 0xaaaaaaaaaaaaaaaaULL);
}
/*
 * Split a 64-bit 2D Morton code with the BMI2 PEXT instruction: the even
 * bits of m are gathered into *x, the odd bits into *y.
 *
 * x and y are dereferenced unconditionally, so neither may be NULL.
 *
 * Returns m unchanged. The function is declared to return uint64_t, so it
 * must return something; previously control fell off the end, which made
 * any use of the result undefined.
 */
uint64_t morton_to_xy (uint64_t m, uint32_t *x, uint32_t *y)
{
    /* Each mask has 32 bits set, so the extracted value fits in 32 bits. */
    *x = (uint32_t)_pext_u64(m, 0x5555555555555555ULL);
    *y = (uint32_t)_pext_u64(m, 0xaaaaaaaaaaaaaaaaULL);
    return m;
}
据我所知,这不是一个可移植的解决方案,但由于我(将要)运行代码的每个系统都有Haswell CPU(甚至包括HPC集群),所以这对我来说是可以接受的。我的问题:
编辑:对于Q1,我非常接近解决方案,但仍然无法弄清楚
0x55555555 -> 0000 0000 0101 0101 0101 0101 0101 0101 0101 0101
0xaaaaaaaa -> 0000 0000 1010 1010 1010 1010 1010 1010 1010 1010
很明显,这两个掩码是x位和y位交替排列的。所以对于3D,我需要得到像这样的掩码
0000 0000 01 001 001 001 001 001 001 001 001 001 001 (for x)
0000 0000 01 010 010 010 010 010 010 010 010 010 010 (for y)
0000 0000 01 100 100 100 100 100 100 100 100 100 100 (for z)
^
对于64位的morton代码,我对^标记之前的位感到困惑:x、y和z虽然都是32位整数,但每个只有低21位才是重要的(3×21=63位)。
答案 0 :(得分:0)
所以在摆弄了一下之后,我找到了一个我认为应该在这里作为答案分享的解决方案。
// on GCC, compile with option -mbmi2, requires Haswell or better.
#include <stdio.h>
#include <limits.h>
#include <immintrin.h>
#include <inttypes.h>
#include <sys/time.h>
/* Mask with every third bit set starting at bit 0 (bits 0, 3, ..., 60).
 * The method-1 encode/decode functions shift it left by 1 and by 2 to
 * obtain the masks for the other two coordinates. */
#define maask 0x1249249249249249
/*
 * Morton encoding, method 1 (BMI2 PDEP).
 *
 * Deposits the low 21 bits of x, y and z onto bit lanes 0, 1 and 2 of every
 * 3-bit group of the result (masks maask, maask << 1 and maask << 2).
 *
 * The 64-bit intrinsic is required: the 32-bit form truncates the mask to
 * its low 32 bits, so only about the low 11 bits of each coordinate would
 * be encoded and the result would not round-trip through Z_decode1.
 */
uint64_t Z_encode1 (uint32_t x, uint32_t y, uint32_t z)
{
    const uint64_t lane = maask;  /* unsigned copy keeps the shifts well defined */

    return _pdep_u64(x, lane) |
           _pdep_u64(y, lane << 1) |
           _pdep_u64(z, lane << 2);
}
/*
 * Morton decoding, method 1 (BMI2 PEXT).
 *
 * Extracts bit lanes 0, 1 and 2 of every 3-bit group of m into *x, *y and
 * *z. The pointers are dereferenced unconditionally, so none may be NULL.
 *
 * Returns m unchanged. The function is declared to return uint64_t, so it
 * must return something; previously control fell off the end, which made
 * any use of the result undefined.
 */
uint64_t Z_decode1 (uint64_t m, uint32_t *x, uint32_t *y, uint32_t *z)
{
    const uint64_t lane = maask;

    /* Each mask has 21 bits set, so every extracted value fits in 32 bits. */
    *x = (uint32_t)_pext_u64(m, lane);
    *y = (uint32_t)_pext_u64(m, lane << 1);
    *z = (uint32_t)_pext_u64(m, lane << 2);
    return m;
}
// method 2 functions
/*
 * Spread the low 21 bits of w so that input bit i ends up at output bit 3*i
 * (two zero bits are inserted between consecutive input bits).
 *
 * Bits of w above bit 20 are discarded. The result only has bits set at
 * positions 0, 3, 6, ..., 60, i.e. inside 0x1249249249249249.
 */
uint64_t spread(uint64_t w) {
    w &= 0x00000000001fffff;                 /* keep the 21 input bits */
    w = (w | w << 32) & 0x001f00000000ffff;  /* bits 16..20 -> 48..52 */
    w = (w | w << 16) & 0x001f0000ff0000ff;  /* bits  8..15 -> 24..31 */
    /*
     * Input bit 20 sits at bit 52 here and has to move to bit 60, so the
     * top nibble of this mask must be 0x1. A mask beginning 0x010f... drops
     * bit 20 and lets a copy of input bit 16 leak into bit 60 instead.
     */
    w = (w | w << 8) & 0x100f00f00f00f00f;
    w = (w | w << 4) & 0x10c30c30c30c30c3;
    w = (w | w << 2) & 0x1249249249249249;
    return w;
}
/*
 * Gather bits 0, 3, 6, ..., 60 of w into the low 21 bits of the result.
 * Bits of w outside those positions are masked off first and have no effect.
 */
uint32_t compact(uint64_t w) {
    /* stage_mask[i] is applied after folding with a right shift of 2 << i. */
    static const uint64_t stage_mask[] = {
        0x30c30c30c30c30c3,
        0xf00f00f00f00f00f,
        0x00ff0000ff0000ff,
        0x00ff00000000ffff,
        0x00000000001fffff,
    };
    uint64_t acc = w & 0x1249249249249249;
    unsigned i = 0;

    for (unsigned shift = 2; shift <= 32; shift <<= 1) {
        acc ^= acc >> shift;
        acc &= stage_mask[i++];
    }
    return (uint32_t)acc;
}
/*
 * Morton encoding, method 2 (portable shift-and-mask).
 *
 * ORs together spread(x), spread(y) << 1 and spread(z) << 2, so each
 * coordinate occupies its own lane of every 3-bit group of the result.
 */
uint64_t Z_encode2(uint32_t x, uint32_t y, uint32_t z) {
    uint64_t code = spread((uint64_t)x);

    code |= spread((uint64_t)y) << 1;
    code |= spread((uint64_t)z) << 2;
    return code;
}
/*
 * Morton decoding, method 2 (portable shift-and-mask).
 *
 * Writes compact(Z_code), compact(Z_code >> 1) and compact(Z_code >> 2) to
 * *xindex, *yindex and *zindex, in that order. The pointers are
 * dereferenced unconditionally, so none may be NULL.
 */
void Z_decode2(uint64_t Z_code, uint32_t *xindex, uint32_t *yindex, uint32_t *zindex){
    uint32_t *const out[3] = { xindex, yindex, zindex };

    for (unsigned lane = 0; lane < 3; ++lane) {
        *out[lane] = compact(Z_code >> lane);
    }
}
/* Wall-clock seconds between two gettimeofday() samples. */
static double elapsed_seconds(const struct timeval *start, const struct timeval *stop)
{
    return (double)(stop->tv_usec - start->tv_usec) / 1000000 +
           (double)(stop->tv_sec - start->tv_sec);
}

/*
 * Benchmark driver: times both encoders and both decoders over the same
 * number of operations (size^3) and prints the four wall-clock totals.
 *
 * Every result is folded into a checksum that is stored to a volatile
 * before the clock is stopped, so an optimizing compiler cannot delete the
 * loops as dead code.
 */
int main(void)
{
    const uint32_t size = 1024;
    /* Codes 0 .. size^3 - 1 are exactly the Morton codes of the size^3 cube
     * when size is a power of two, so decoding walks the same workload. */
    const uint64_t total = (uint64_t)size * size * size;
    struct timeval start, stop;
    double time_encode1 = 0.0, time_encode2 = 0.0;
    double time_decode1 = 0.0, time_decode2 = 0.0;
    uint32_t xindex = 0, yindex = 0, zindex = 0;
    uint64_t checksum;
    volatile uint64_t sink = 0;

    /* method 1 ENCODING benchmark */
    checksum = 0;
    gettimeofday(&start, NULL);
    for (uint32_t i = 0; i < size; i++) {
        for (uint32_t j = 0; j < size; j++) {
            for (uint32_t k = 0; k < size; k++) {
                checksum ^= Z_encode1(i, j, k);
            }
        }
    }
    sink = checksum;
    gettimeofday(&stop, NULL);
    time_encode1 = elapsed_seconds(&start, &stop);

    /* method 2 ENCODING benchmark */
    checksum = 0;
    gettimeofday(&start, NULL);
    for (uint32_t i = 0; i < size; i++) {
        for (uint32_t j = 0; j < size; j++) {
            for (uint32_t k = 0; k < size; k++) {
                checksum ^= Z_encode2(i, j, k);
            }
        }
    }
    sink = checksum;
    gettimeofday(&stop, NULL);
    time_encode2 = elapsed_seconds(&start, &stop);

    //////////////////////// DECODING ////////////////
    /* method 1 DECODING benchmark */
    checksum = 0;
    gettimeofday(&start, NULL);
    for (uint64_t code = 0; code < total; code++) {
        Z_decode1(code, &xindex, &yindex, &zindex);
        checksum ^= (uint64_t)xindex ^ yindex ^ zindex;
    }
    sink = checksum;
    gettimeofday(&stop, NULL);
    time_decode1 = elapsed_seconds(&start, &stop);

    /* method 2 DECODING benchmark */
    checksum = 0;
    gettimeofday(&start, NULL);
    for (uint64_t code = 0; code < total; code++) {
        Z_decode2(code, &xindex, &yindex, &zindex);
        checksum ^= (uint64_t)xindex ^ yindex ^ zindex;
    }
    sink = checksum;
    gettimeofday(&stop, NULL);
    time_decode2 = elapsed_seconds(&start, &stop);

    (void)sink;
    printf("Method1 -> Encoding: %f Decoding: %f\n", time_encode1, time_decode1);
    printf("Method2 -> Encoding: %f Decoding: %f\n", time_encode2, time_decode2);
    return 0;
}
以下是结果
size = 512 ( 512x512x512 = 134217728 numbers)
======================================================
Method 1 -> Encoding: 0.600302sec Decoding: 0.000003sec
Method 2 -> Encoding: 2.778170sec Decoding: 0.000011sec
size = 1024 ( 1024x1024x1024 = 1073741824 numbers)
======================================================
Method 1 -> Encoding: 4.623594sec Decoding: 0.000006sec
Method 2 -> Encoding: 22.339238sec Decoding: 0.000022sec
size = 2048 ( 2048*2048*2048 = 8589934592 numbers)
======================================================
Method 1 -> Encoding: 36.981743sec Decoding: 0.000011sec
Method 2 -> Encoding: 178.164773sec Decoding: 0.000045sec
结论:从上述数据看,编码耗时高于解码(但请注意:解码计时只对应 size 次调用,而编码计时对应 size³ 次调用,二者不能直接比较);使用BMI2指令集(pdep/pext)可以明显优化性能。
PS:由于需要Haswell或更新的CPU,该方案不可移植。