23#ifndef LSST_SPHGEOM_CURVE_H_ 
   24#define LSST_SPHGEOM_CURVE_H_ 
   79#if !defined(NO_SIMD) && defined(__x86_64__) 
   80    #include <x86intrin.h> 
   98inline uint8_t 
log2(uint64_t 
x) {
 
   99    alignas(64) 
static uint8_t 
const PERFECT_HASH_TABLE[64] = {
 
  100         0,  1,  2,  7,  3, 13,  8, 19,  4, 25, 14, 28,  9, 34, 20, 40,
 
  101         5, 17, 26, 38, 15, 46, 29, 48, 10, 31, 35, 54, 21, 50, 41, 57,
 
  102        63,  6, 12, 18, 24, 27, 33, 39, 16, 37, 45, 47, 30, 53, 49, 56,
 
  103        62, 11, 23, 32, 36, 44, 52, 55, 61, 22, 43, 51, 60, 42, 59, 58
 
  105    uint64_t 
const DE_BRUIJN_SEQUENCE = UINT64_C(0x0218a392cd3d5dbf);
 
  122    return PERFECT_HASH_TABLE[(DE_BRUIJN_SEQUENCE * 
x) >> 58];
 
  127    alignas(32) 
static uint8_t 
const PERFECT_HASH_TABLE[32] = {
 
  128        0,  9,  1, 10, 13, 21,  2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
 
  129        8, 12, 20, 28, 15, 17, 24,  7, 19, 27, 23,  6, 26,  5, 4, 31
 
  131    uint32_t 
const DE_BRUIJN_SEQUENCE = UINT32_C(0x07c4acdd);
 
  137    return PERFECT_HASH_TABLE[(DE_BRUIJN_SEQUENCE * 
x) >> 27];
 
  147#if defined(NO_SIMD) || !defined(__x86_64__) 
  153        b = (
b | (
b << 16)) & UINT64_C(0x0000ffff0000ffff);
 
  154        a = (
a | (
a << 16)) & UINT64_C(0x0000ffff0000ffff);
 
  155        b = (
b | (
b << 8)) & UINT64_C(0x00ff00ff00ff00ff);
 
  156        a = (
a | (
a << 8)) & UINT64_C(0x00ff00ff00ff00ff);
 
  157        b = (
b | (
b << 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
 
  158        a = (
a | (
a << 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
 
  159        b = (
b | (
b << 2)) & UINT64_C(0x3333333333333333);
 
  160        a = (
a | (
a << 2)) & UINT64_C(0x3333333333333333);
 
  161        b = (
b | (
b << 1)) & UINT64_C(0x5555555555555555);
 
  162        a = (
a | (
a << 1)) & UINT64_C(0x5555555555555555);
 
  167        xy = _mm_and_si128(_mm_or_si128(xy, _mm_slli_epi64(xy, 16)),
 
  168                           _mm_set1_epi32(0x0000ffff));
 
  169        xy = _mm_and_si128(_mm_or_si128(xy, _mm_slli_epi64(xy, 8)),
 
  170                           _mm_set1_epi32(0x00ff00ff));
 
  171        xy = _mm_and_si128(_mm_or_si128(xy, _mm_slli_epi64(xy, 4)),
 
  172                           _mm_set1_epi32(0x0f0f0f0f));
 
  173        xy = _mm_and_si128(_mm_or_si128(xy, _mm_slli_epi64(xy, 2)),
 
  174                           _mm_set1_epi32(0x33333333));
 
  175        xy = _mm_and_si128(_mm_or_si128(xy, _mm_slli_epi64(xy, 1)),
 
  176                           _mm_set1_epi32(0x55555555));
 
  177        __m128i 
y = _mm_unpackhi_epi64(xy, _mm_setzero_si128());
 
  178        __m128i r = _mm_or_si128(xy, _mm_slli_epi64(
y, 1));
 
  179        return static_cast<uint64_t
>(_mm_cvtsi128_si64(r));
 
  183        __m128i xy = _mm_set_epi64x(
static_cast<int64_t
>(
y),
 
  184                                    static_cast<int64_t
>(
x));
 
  194#if defined(NO_SIMD) || !defined(__x86_64__) 
  196        uint64_t 
x = 
z & UINT64_C(0x5555555555555555);
 
  197        uint64_t 
y = (
z >> 1) & UINT64_C(0x5555555555555555);
 
  198        x = (
x | (
x >> 1)) & UINT64_C(0x3333333333333333);
 
  199        y = (
y | (
y >> 1)) & UINT64_C(0x3333333333333333);
 
  200        x = (
x | (
x >> 2)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
 
  201        y = (
y | (
y >> 2)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
 
  202        x = (
x | (
x >> 4)) & UINT64_C(0x00ff00ff00ff00ff);
 
  203        y = (
y | (
y >> 4)) & UINT64_C(0x00ff00ff00ff00ff);
 
  204        x = (
x | (
x >> 8)) & UINT64_C(0x0000ffff0000ffff);
 
  205        y = (
y | (
y >> 8)) & UINT64_C(0x0000ffff0000ffff);
 
  207                               static_cast<uint32_t
>(
y | (
y >> 16)));
 
  210    inline __m128i mortonIndexInverseSimd(uint64_t 
z) {
 
  211        __m128i xy = _mm_set_epi64x(
static_cast<int64_t
>(
z >> 1),
 
  212                                    static_cast<int64_t
>(
z));
 
  213        xy = _mm_and_si128(xy, _mm_set1_epi32(0x55555555));
 
  214        xy = _mm_and_si128(_mm_or_si128(xy, _mm_srli_epi64(xy, 1)),
 
  215                           _mm_set1_epi32(0x33333333));
 
  216        xy = _mm_and_si128(_mm_or_si128(xy, _mm_srli_epi64(xy, 2)),
 
  217                           _mm_set1_epi32(0x0f0f0f0f));
 
  218        xy = _mm_and_si128(_mm_or_si128(xy, _mm_srli_epi64(xy, 4)),
 
  219                           _mm_set1_epi32(0x00ff00ff));
 
  220        xy = _mm_and_si128(_mm_or_si128(xy, _mm_srli_epi64(xy, 8)),
 
  221                           _mm_set1_epi32(0x0000ffff));
 
  222        xy = _mm_or_si128(xy, _mm_srli_epi64(xy, 16));
 
  227        __m128i xy = mortonIndexInverseSimd(
z);
 
  228        uint64_t r = _mm_cvtsi128_si64(_mm_shuffle_epi32(xy, 8));
 
  230                               static_cast<uint32_t
>(r >> 32));
 
  237    alignas(64) 
static uint8_t 
const HILBERT_LUT_3[256] = {
 
  238        0x40, 0xc3, 0x01, 0x02, 0x04, 0x45, 0x87, 0x46,
 
  239        0x8e, 0x8d, 0x4f, 0xcc, 0x08, 0x49, 0x8b, 0x4a,
 
  240        0xfa, 0x3b, 0xf9, 0xb8, 0x7c, 0xff, 0x3d, 0x3e,
 
  241        0xf6, 0x37, 0xf5, 0xb4, 0xb2, 0xb1, 0x73, 0xf0,
 
  242        0x10, 0x51, 0x93, 0x52, 0xde, 0x1f, 0xdd, 0x9c,
 
  243        0x54, 0xd7, 0x15, 0x16, 0x58, 0xdb, 0x19, 0x1a,
 
  244        0x20, 0x61, 0xa3, 0x62, 0xee, 0x2f, 0xed, 0xac,
 
  245        0x64, 0xe7, 0x25, 0x26, 0x68, 0xeb, 0x29, 0x2a,
 
  246        0x00, 0x41, 0x83, 0x42, 0xce, 0x0f, 0xcd, 0x8c,
 
  247        0x44, 0xc7, 0x05, 0x06, 0x48, 0xcb, 0x09, 0x0a,
 
  248        0x50, 0xd3, 0x11, 0x12, 0x14, 0x55, 0x97, 0x56,
 
  249        0x9e, 0x9d, 0x5f, 0xdc, 0x18, 0x59, 0x9b, 0x5a,
 
  250        0xba, 0xb9, 0x7b, 0xf8, 0xb6, 0xb5, 0x77, 0xf4,
 
  251        0x3c, 0x7d, 0xbf, 0x7e, 0xf2, 0x33, 0xf1, 0xb0,
 
  252        0x60, 0xe3, 0x21, 0x22, 0x24, 0x65, 0xa7, 0x66,
 
  253        0xae, 0xad, 0x6f, 0xec, 0x28, 0x69, 0xab, 0x6a,
 
  254        0xaa, 0xa9, 0x6b, 0xe8, 0xa6, 0xa5, 0x67, 0xe4,
 
  255        0x2c, 0x6d, 0xaf, 0x6e, 0xe2, 0x23, 0xe1, 0xa0,
 
  256        0x9a, 0x99, 0x5b, 0xd8, 0x96, 0x95, 0x57, 0xd4,
 
  257        0x1c, 0x5d, 0x9f, 0x5e, 0xd2, 0x13, 0xd1, 0x90,
 
  258        0x70, 0xf3, 0x31, 0x32, 0x34, 0x75, 0xb7, 0x76,
 
  259        0xbe, 0xbd, 0x7f, 0xfc, 0x38, 0x79, 0xbb, 0x7a,
 
  260        0xca, 0x0b, 0xc9, 0x88, 0x4c, 0xcf, 0x0d, 0x0e,
 
  261        0xc6, 0x07, 0xc5, 0x84, 0x82, 0x81, 0x43, 0xc0,
 
  262        0xea, 0x2b, 0xe9, 0xa8, 0x6c, 0xef, 0x2d, 0x2e,
 
  263        0xe6, 0x27, 0xe5, 0xa4, 0xa2, 0xa1, 0x63, 0xe0,
 
  264        0x30, 0x71, 0xb3, 0x72, 0xfe, 0x3f, 0xfd, 0xbc,
 
  265        0x74, 0xf7, 0x35, 0x36, 0x78, 0xfb, 0x39, 0x3a,
 
  266        0xda, 0x1b, 0xd9, 0x98, 0x5c, 0xdf, 0x1d, 0x1e,
 
  267        0xd6, 0x17, 0xd5, 0x94, 0x92, 0x91, 0x53, 0xd0,
 
  268        0x8a, 0x89, 0x4b, 0xc8, 0x86, 0x85, 0x47, 0xc4,
 
  269        0x0c, 0x4d, 0x8f, 0x4e, 0xc2, 0x03, 0xc1, 0x80
 
  273    for (
m = 2 * 
m; 
m >= 6;) {
 
  275        uint8_t j = HILBERT_LUT_3[i | ((
z >> 
m) & 0x3f)];
 
  276        h = (h << 6) | (j & 0x3f);
 
  282        uint8_t j = HILBERT_LUT_3[i | ((
z << r) & 0x3f)];
 
  283        h = (h << 
m) | ((j & 0x3f) >> r);
 
  291    alignas(64) 
static uint8_t 
const HILBERT_INVERSE_LUT_3[256] = {
 
  292        0x40, 0x02, 0x03, 0xc1, 0x04, 0x45, 0x47, 0x86,
 
  293        0x0c, 0x4d, 0x4f, 0x8e, 0xcb, 0x89, 0x88, 0x4a,
 
  294        0x20, 0x61, 0x63, 0xa2, 0x68, 0x2a, 0x2b, 0xe9,
 
  295        0x6c, 0x2e, 0x2f, 0xed, 0xa7, 0xe6, 0xe4, 0x25,
 
  296        0x30, 0x71, 0x73, 0xb2, 0x78, 0x3a, 0x3b, 0xf9,
 
  297        0x7c, 0x3e, 0x3f, 0xfd, 0xb7, 0xf6, 0xf4, 0x35,
 
  298        0xdf, 0x9d, 0x9c, 0x5e, 0x9b, 0xda, 0xd8, 0x19,
 
  299        0x93, 0xd2, 0xd0, 0x11, 0x54, 0x16, 0x17, 0xd5,
 
  300        0x00, 0x41, 0x43, 0x82, 0x48, 0x0a, 0x0b, 0xc9,
 
  301        0x4c, 0x0e, 0x0f, 0xcd, 0x87, 0xc6, 0xc4, 0x05,
 
  302        0x50, 0x12, 0x13, 0xd1, 0x14, 0x55, 0x57, 0x96,
 
  303        0x1c, 0x5d, 0x5f, 0x9e, 0xdb, 0x99, 0x98, 0x5a,
 
  304        0x70, 0x32, 0x33, 0xf1, 0x34, 0x75, 0x77, 0xb6,
 
  305        0x3c, 0x7d, 0x7f, 0xbe, 0xfb, 0xb9, 0xb8, 0x7a,
 
  306        0xaf, 0xee, 0xec, 0x2d, 0xe7, 0xa5, 0xa4, 0x66,
 
  307        0xe3, 0xa1, 0xa0, 0x62, 0x28, 0x69, 0x6b, 0xaa,
 
  308        0xff, 0xbd, 0xbc, 0x7e, 0xbb, 0xfa, 0xf8, 0x39,
 
  309        0xb3, 0xf2, 0xf0, 0x31, 0x74, 0x36, 0x37, 0xf5,
 
  310        0x9f, 0xde, 0xdc, 0x1d, 0xd7, 0x95, 0x94, 0x56,
 
  311        0xd3, 0x91, 0x90, 0x52, 0x18, 0x59, 0x5b, 0x9a,
 
  312        0x8f, 0xce, 0xcc, 0x0d, 0xc7, 0x85, 0x84, 0x46,
 
  313        0xc3, 0x81, 0x80, 0x42, 0x08, 0x49, 0x4b, 0x8a,
 
  314        0x60, 0x22, 0x23, 0xe1, 0x24, 0x65, 0x67, 0xa6,
 
  315        0x2c, 0x6d, 0x6f, 0xae, 0xeb, 0xa9, 0xa8, 0x6a,
 
  316        0xbf, 0xfe, 0xfc, 0x3d, 0xf7, 0xb5, 0xb4, 0x76,
 
  317        0xf3, 0xb1, 0xb0, 0x72, 0x38, 0x79, 0x7b, 0xba,
 
  318        0xef, 0xad, 0xac, 0x6e, 0xab, 0xea, 0xe8, 0x29,
 
  319        0xa3, 0xe2, 0xe0, 0x21, 0x64, 0x26, 0x27, 0xe5,
 
  320        0xcf, 0x8d, 0x8c, 0x4e, 0x8b, 0xca, 0xc8, 0x09,
 
  321        0x83, 0xc2, 0xc0, 0x01, 0x44, 0x06, 0x07, 0xc5,
 
  322        0x10, 0x51, 0x53, 0x92, 0x58, 0x1a, 0x1b, 0xd9,
 
  323        0x5c, 0x1e, 0x1f, 0xdd, 0x97, 0xd6, 0xd4, 0x15
 
  327    for (
m = 2 * 
m; 
m >= 6;) {
 
  329        uint8_t j = HILBERT_INVERSE_LUT_3[i | ((h >> 
m) & 0x3f)];
 
  330        z = (
z << 6) | (j & 0x3f);
 
  336        uint8_t j = HILBERT_INVERSE_LUT_3[i | ((h << r) & 0x3f)];
 
  337        z = (
z << 
m) | ((j & 0x3f) >> r);
 
  353#if !defined(NO_SIMD) && defined(__x86_64__) 
  365#if !defined(NO_SIMD) && defined(__x86_64__) 
  366    inline __m128i hilbertIndexInverseSimd(uint64_t h, 
int m) {
 
std::tuple< uint32_t, uint32_t > mortonIndexInverse(uint64_t z)
mortonIndexInverse separates the even and odd bits of z.
std::tuple< uint32_t, uint32_t > hilbertIndexInverse(uint64_t h, int m)
hilbertIndexInverse returns the point (x, y) with Hilbert index h, where x and y are m bit integers.
uint64_t hilbertIndex(uint32_t x, uint32_t y, int m)
hilbertIndex returns the index of (x, y) in a 2-D Hilbert curve.
uint64_t mortonIndex(uint32_t x, uint32_t y)
mortonIndex interleaves the bits of x and y.
uint64_t hilbertToMorton(uint64_t h, int m)
hilbertToMorton converts the 2m-bit Hilbert index h to the corresponding Morton index.
uint64_t mortonToHilbert(uint64_t z, int m)
mortonToHilbert converts the 2m-bit Morton index z to the corresponding Hilbert index.