#ifndef LSST_SPHGEOM_CURVE_H_
#define LSST_SPHGEOM_CURVE_H_

#include <cstdint>
#include <tuple>

#if !defined(NO_SIMD) && defined(__x86_64__)
    #include <x86intrin.h>
#endif
inline uint8_t log2(uint64_t x) {
    alignas(64) static uint8_t const PERFECT_HASH_TABLE[64] = {
         0,  1,  2,  7,  3, 13,  8, 19,  4, 25, 14, 28,  9, 34, 20, 40,
         5, 17, 26, 38, 15, 46, 29, 48, 10, 31, 35, 54, 21, 50, 41, 57,
        63,  6, 12, 18, 24, 27, 33, 39, 16, 37, 45, 47, 30, 53, 49, 56,
        62, 11, 23, 32, 36, 44, 52, 55, 61, 22, 43, 51, 60, 42, 59, 58
    };
    uint64_t const DE_BRUIJN_SEQUENCE = UINT64_C(0x0218a392cd3d5dbf);
    // Round x down to the nearest power of 2, i.e. keep only the most
    // significant 1 bit.
    x |= (x >> 1);
    x |= (x >> 2);
    x |= (x >> 4);
    x |= (x >> 8);
    x |= (x >> 16);
    x |= (x >> 32);
    x = x - (x >> 1);
    // Multiplying the de Bruijn sequence by a power of 2 is a left shift,
    // so the top 6 bits of the product form a perfect hash of log2(x).
    return PERFECT_HASH_TABLE[(DE_BRUIJN_SEQUENCE * x) >> 58];
}
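// Worked example (illustrative only, not part of the original header):
// for x = 2^k the product DE_BRUIJN_SEQUENCE * x is just a left shift,
// so the top 6 bits select a unique table slot holding k:
//
//     log2(UINT64_C(1));        // hash index 0,  PERFECT_HASH_TABLE[0]  == 0
//     log2(UINT64_C(1) << 63);  // hash index 32, PERFECT_HASH_TABLE[32] == 63
//     log2(UINT64_C(100));      // rounds down to 64 = 2^6, returns 6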
 
inline uint8_t log2(uint32_t x) {
    alignas(32) static uint8_t const PERFECT_HASH_TABLE[32] = {
        0,  9,  1, 10, 13, 21,  2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
        8, 12, 20, 28, 15, 17, 24,  7, 19, 27, 23,  6, 26,  5, 4, 31
    };
    uint32_t const DE_BRUIJN_SEQUENCE = UINT32_C(0x07c4acdd);
    // Set all the bits below the most significant 1 bit of x.
    x |= (x >> 1);
    x |= (x >> 2);
    x |= (x >> 4);
    x |= (x >> 8);
    x |= (x >> 16);
    return PERFECT_HASH_TABLE[(DE_BRUIJN_SEQUENCE * x) >> 27];
}
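// Unlike the 64-bit overload, this variant hashes the smeared value
// 2^(k+1) - 1 rather than 2^k; a hand-checked example (not part of the
// original header):
//
//     log2(UINT32_C(16));  // smears to 0x1f, hash index 30, PERFECT_HASH_TABLE[30] == 4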
 
#if defined(NO_SIMD) || !defined(__x86_64__)
    inline uint64_t mortonIndex(uint32_t x, uint32_t y) {
        // Interleave the bits of x and y: spread each coordinate over the
        // even bit positions of a 64-bit word, then merge y shifted up by 1.
        uint64_t b = y;
        uint64_t a = x;
        b = (b | (b << 16)) & UINT64_C(0x0000ffff0000ffff);
        a = (a | (a << 16)) & UINT64_C(0x0000ffff0000ffff);
        b = (b | (b << 8)) & UINT64_C(0x00ff00ff00ff00ff);
        a = (a | (a << 8)) & UINT64_C(0x00ff00ff00ff00ff);
        b = (b | (b << 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
        a = (a | (a << 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
        b = (b | (b << 2)) & UINT64_C(0x3333333333333333);
        a = (a | (a << 2)) & UINT64_C(0x3333333333333333);
        b = (b | (b << 1)) & UINT64_C(0x5555555555555555);
        a = (a | (a << 1)) & UINT64_C(0x5555555555555555);
        return a | (b << 1);
    }
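    // Hand-checked examples (illustrative, not part of the original
    // header); x occupies the even bits, y the odd bits:
    //
    //     mortonIndex(3, 1) == 7;   // x -> 0b101, y -> 0b010
    //     mortonIndex(0, 3) == 10;  // y alone -> 0b1010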
 
#else
    inline uint64_t mortonIndex(__m128i xy) {
        // Spread both 32-bit coordinates at once, one per 64-bit lane.
        xy = _mm_and_si128(_mm_or_si128(xy, _mm_slli_epi64(xy, 16)),
                           _mm_set1_epi32(0x0000ffff));
        xy = _mm_and_si128(_mm_or_si128(xy, _mm_slli_epi64(xy, 8)),
                           _mm_set1_epi32(0x00ff00ff));
        xy = _mm_and_si128(_mm_or_si128(xy, _mm_slli_epi64(xy, 4)),
                           _mm_set1_epi32(0x0f0f0f0f));
        xy = _mm_and_si128(_mm_or_si128(xy, _mm_slli_epi64(xy, 2)),
                           _mm_set1_epi32(0x33333333));
        xy = _mm_and_si128(_mm_or_si128(xy, _mm_slli_epi64(xy, 1)),
                           _mm_set1_epi32(0x55555555));
        __m128i y = _mm_unpackhi_epi64(xy, _mm_setzero_si128());
        __m128i r = _mm_or_si128(xy, _mm_slli_epi64(y, 1));
        return static_cast<uint64_t>(_mm_cvtsi128_si64(r));
    }

    inline uint64_t mortonIndex(uint32_t x, uint32_t y) {
        __m128i xy = _mm_set_epi64x(static_cast<int64_t>(y),
                                    static_cast<int64_t>(x));
        return mortonIndex(xy);
    }
#endif
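// Usage sketch (hypothetical caller, not from the original header): both
// overloads produce identical indexes; the SIMD one just packs x and y
// into the two 64-bit lanes of one register:
//
//     uint64_t z1 = mortonIndex(UINT32_C(7), UINT32_C(0)); // 0b010101 = 21
//     uint64_t z2 = mortonIndex(_mm_set_epi64x(0, 7));     // also 21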
 
#if defined(NO_SIMD) || !defined(__x86_64__)
    inline std::tuple<uint32_t, uint32_t> mortonIndexInverse(uint64_t z) {
        // Extract the even bits of z into x and the odd bits into y, then
        // compact each back down to 32 bits.
        uint64_t x = z & UINT64_C(0x5555555555555555);
        uint64_t y = (z >> 1) & UINT64_C(0x5555555555555555);
        x = (x | (x >> 1)) & UINT64_C(0x3333333333333333);
        y = (y | (y >> 1)) & UINT64_C(0x3333333333333333);
        x = (x | (x >> 2)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
        y = (y | (y >> 2)) & UINT64_C(0x0f0f0f0f0f0f0f0f);
        x = (x | (x >> 4)) & UINT64_C(0x00ff00ff00ff00ff);
        y = (y | (y >> 4)) & UINT64_C(0x00ff00ff00ff00ff);
        x = (x | (x >> 8)) & UINT64_C(0x0000ffff0000ffff);
        y = (y | (y >> 8)) & UINT64_C(0x0000ffff0000ffff);
        return std::make_tuple(static_cast<uint32_t>(x | (x >> 16)),
                               static_cast<uint32_t>(y | (y >> 16)));
    }
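    // Round-trip sketch (illustrative, not part of the original header):
    //
    //     uint32_t x, y;
    //     std::tie(x, y) = mortonIndexInverse(mortonIndex(123u, 456u));
    //     // now x == 123 and y == 456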
 
#else
    inline __m128i mortonIndexInverseSimd(uint64_t z) {
        // Compact the even bits (x, low lane) and odd bits (y, high lane)
        // of z in parallel.
        __m128i xy = _mm_set_epi64x(static_cast<int64_t>(z >> 1),
                                    static_cast<int64_t>(z));
        xy = _mm_and_si128(xy, _mm_set1_epi32(0x55555555));
        xy = _mm_and_si128(_mm_or_si128(xy, _mm_srli_epi64(xy, 1)),
                           _mm_set1_epi32(0x33333333));
        xy = _mm_and_si128(_mm_or_si128(xy, _mm_srli_epi64(xy, 2)),
                           _mm_set1_epi32(0x0f0f0f0f));
        xy = _mm_and_si128(_mm_or_si128(xy, _mm_srli_epi64(xy, 4)),
                           _mm_set1_epi32(0x00ff00ff));
        xy = _mm_and_si128(_mm_or_si128(xy, _mm_srli_epi64(xy, 8)),
                           _mm_set1_epi32(0x0000ffff));
        xy = _mm_or_si128(xy, _mm_srli_epi64(xy, 16));
        return xy;
    }

    inline std::tuple<uint32_t, uint32_t> mortonIndexInverse(uint64_t z) {
        __m128i xy = mortonIndexInverseSimd(z);
        // Shuffle constant 8 = (0,0,2,0): gather the low 32 bits of each
        // lane, so r holds x in its low half and y in its high half.
        uint64_t r = _mm_cvtsi128_si64(_mm_shuffle_epi32(xy, 8));
        return std::make_tuple(static_cast<uint32_t>(r),
                               static_cast<uint32_t>(r >> 32));
    }
#endif
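// The SSE2 branch is bit-identical to the scalar one; a hypothetical
// check (not part of the original header):
//
//     assert(mortonIndexInverse(UINT64_C(0x3)) ==
//            std::make_tuple(UINT32_C(1), UINT32_C(1)));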
 
inline uint64_t hilbertIndex(uint64_t z, int m) {
    // Each entry maps 2 bits of transform state (high bits of the index)
    // plus 6 bits of Morton index (3 quadtree levels) to the next state
    // (top 2 bits) and 6 Hilbert index bits (bottom 6 bits).
    alignas(64) static uint8_t const HILBERT_LUT_3[256] = {
        0x40, 0xc3, 0x01, 0x02, 0x04, 0x45, 0x87, 0x46,
        0x8e, 0x8d, 0x4f, 0xcc, 0x08, 0x49, 0x8b, 0x4a,
        0xfa, 0x3b, 0xf9, 0xb8, 0x7c, 0xff, 0x3d, 0x3e,
        0xf6, 0x37, 0xf5, 0xb4, 0xb2, 0xb1, 0x73, 0xf0,
        0x10, 0x51, 0x93, 0x52, 0xde, 0x1f, 0xdd, 0x9c,
        0x54, 0xd7, 0x15, 0x16, 0x58, 0xdb, 0x19, 0x1a,
        0x20, 0x61, 0xa3, 0x62, 0xee, 0x2f, 0xed, 0xac,
        0x64, 0xe7, 0x25, 0x26, 0x68, 0xeb, 0x29, 0x2a,
        0x00, 0x41, 0x83, 0x42, 0xce, 0x0f, 0xcd, 0x8c,
        0x44, 0xc7, 0x05, 0x06, 0x48, 0xcb, 0x09, 0x0a,
        0x50, 0xd3, 0x11, 0x12, 0x14, 0x55, 0x97, 0x56,
        0x9e, 0x9d, 0x5f, 0xdc, 0x18, 0x59, 0x9b, 0x5a,
        0xba, 0xb9, 0x7b, 0xf8, 0xb6, 0xb5, 0x77, 0xf4,
        0x3c, 0x7d, 0xbf, 0x7e, 0xf2, 0x33, 0xf1, 0xb0,
        0x60, 0xe3, 0x21, 0x22, 0x24, 0x65, 0xa7, 0x66,
        0xae, 0xad, 0x6f, 0xec, 0x28, 0x69, 0xab, 0x6a,
        0xaa, 0xa9, 0x6b, 0xe8, 0xa6, 0xa5, 0x67, 0xe4,
        0x2c, 0x6d, 0xaf, 0x6e, 0xe2, 0x23, 0xe1, 0xa0,
        0x9a, 0x99, 0x5b, 0xd8, 0x96, 0x95, 0x57, 0xd4,
        0x1c, 0x5d, 0x9f, 0x5e, 0xd2, 0x13, 0xd1, 0x90,
        0x70, 0xf3, 0x31, 0x32, 0x34, 0x75, 0xb7, 0x76,
        0xbe, 0xbd, 0x7f, 0xfc, 0x38, 0x79, 0xbb, 0x7a,
        0xca, 0x0b, 0xc9, 0x88, 0x4c, 0xcf, 0x0d, 0x0e,
        0xc6, 0x07, 0xc5, 0x84, 0x82, 0x81, 0x43, 0xc0,
        0xea, 0x2b, 0xe9, 0xa8, 0x6c, 0xef, 0x2d, 0x2e,
        0xe6, 0x27, 0xe5, 0xa4, 0xa2, 0xa1, 0x63, 0xe0,
        0x30, 0x71, 0xb3, 0x72, 0xfe, 0x3f, 0xfd, 0xbc,
        0x74, 0xf7, 0x35, 0x36, 0x78, 0xfb, 0x39, 0x3a,
        0xda, 0x1b, 0xd9, 0x98, 0x5c, 0xdf, 0x1d, 0x1e,
        0xd6, 0x17, 0xd5, 0x94, 0x92, 0x91, 0x53, 0xd0,
        0x8a, 0x89, 0x4b, 0xc8, 0x86, 0x85, 0x47, 0xc4,
        0x0c, 0x4d, 0x8f, 0x4e, 0xc2, 0x03, 0xc1, 0x80
    };
    uint64_t h = 0;
    uint8_t i = 0;
    // Consume the 2m-bit Morton index z 6 bits (3 levels) at a time.
    for (m = 2 * m; m >= 6;) {
        m -= 6;
        uint8_t j = HILBERT_LUT_3[i | ((z >> m) & 0x3f)];
        h = (h << 6) | (j & 0x3f);
        i = j & 0xc0;
    }
    if (m != 0) {
        // Consume the remaining m (2 or 4) bits of z.
        int r = 6 - m;
        uint8_t j = HILBERT_LUT_3[i | ((z << r) & 0x3f)];
        h = (h << m) | ((j & 0x3f) >> r);
    }
    return h;
}
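// For m = 1 (a single 2x2 grid) the curve visits the quadrants in the
// order (0,0), (0,1), (1,1), (1,0); hand-checked against HILBERT_LUT_3
// (illustrative only, not part of the original header):
//
//     hilbertIndex(mortonIndex(0u, 0u), 1);  // == 0
//     hilbertIndex(mortonIndex(0u, 1u), 1);  // == 1
//     hilbertIndex(mortonIndex(1u, 1u), 1);  // == 2
//     hilbertIndex(mortonIndex(1u, 0u), 1);  // == 3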
 
inline std::tuple<uint32_t, uint32_t> hilbertIndexInverse(uint64_t h, int m) {
    // The inverse of HILBERT_LUT_3: maps 2 bits of transform state plus
    // 6 Hilbert index bits to the next state and 6 Morton index bits.
    alignas(64) static uint8_t const HILBERT_INVERSE_LUT_3[256] = {
        0x40, 0x02, 0x03, 0xc1, 0x04, 0x45, 0x47, 0x86,
        0x0c, 0x4d, 0x4f, 0x8e, 0xcb, 0x89, 0x88, 0x4a,
        0x20, 0x61, 0x63, 0xa2, 0x68, 0x2a, 0x2b, 0xe9,
        0x6c, 0x2e, 0x2f, 0xed, 0xa7, 0xe6, 0xe4, 0x25,
        0x30, 0x71, 0x73, 0xb2, 0x78, 0x3a, 0x3b, 0xf9,
        0x7c, 0x3e, 0x3f, 0xfd, 0xb7, 0xf6, 0xf4, 0x35,
        0xdf, 0x9d, 0x9c, 0x5e, 0x9b, 0xda, 0xd8, 0x19,
        0x93, 0xd2, 0xd0, 0x11, 0x54, 0x16, 0x17, 0xd5,
        0x00, 0x41, 0x43, 0x82, 0x48, 0x0a, 0x0b, 0xc9,
        0x4c, 0x0e, 0x0f, 0xcd, 0x87, 0xc6, 0xc4, 0x05,
        0x50, 0x12, 0x13, 0xd1, 0x14, 0x55, 0x57, 0x96,
        0x1c, 0x5d, 0x5f, 0x9e, 0xdb, 0x99, 0x98, 0x5a,
        0x70, 0x32, 0x33, 0xf1, 0x34, 0x75, 0x77, 0xb6,
        0x3c, 0x7d, 0x7f, 0xbe, 0xfb, 0xb9, 0xb8, 0x7a,
        0xaf, 0xee, 0xec, 0x2d, 0xe7, 0xa5, 0xa4, 0x66,
        0xe3, 0xa1, 0xa0, 0x62, 0x28, 0x69, 0x6b, 0xaa,
        0xff, 0xbd, 0xbc, 0x7e, 0xbb, 0xfa, 0xf8, 0x39,
        0xb3, 0xf2, 0xf0, 0x31, 0x74, 0x36, 0x37, 0xf5,
        0x9f, 0xde, 0xdc, 0x1d, 0xd7, 0x95, 0x94, 0x56,
        0xd3, 0x91, 0x90, 0x52, 0x18, 0x59, 0x5b, 0x9a,
        0x8f, 0xce, 0xcc, 0x0d, 0xc7, 0x85, 0x84, 0x46,
        0xc3, 0x81, 0x80, 0x42, 0x08, 0x49, 0x4b, 0x8a,
        0x60, 0x22, 0x23, 0xe1, 0x24, 0x65, 0x67, 0xa6,
        0x2c, 0x6d, 0x6f, 0xae, 0xeb, 0xa9, 0xa8, 0x6a,
        0xbf, 0xfe, 0xfc, 0x3d, 0xf7, 0xb5, 0xb4, 0x76,
        0xf3, 0xb1, 0xb0, 0x72, 0x38, 0x79, 0x7b, 0xba,
        0xef, 0xad, 0xac, 0x6e, 0xab, 0xea, 0xe8, 0x29,
        0xa3, 0xe2, 0xe0, 0x21, 0x64, 0x26, 0x27, 0xe5,
        0xcf, 0x8d, 0x8c, 0x4e, 0x8b, 0xca, 0xc8, 0x09,
        0x83, 0xc2, 0xc0, 0x01, 0x44, 0x06, 0x07, 0xc5,
        0x10, 0x51, 0x53, 0x92, 0x58, 0x1a, 0x1b, 0xd9,
        0x5c, 0x1e, 0x1f, 0xdd, 0x97, 0xd6, 0xd4, 0x15
    };
    uint64_t z = 0;
    uint8_t i = 0;
    // Consume the 2m-bit Hilbert index h 6 bits (3 levels) at a time.
    for (m = 2 * m; m >= 6;) {
        m -= 6;
        uint8_t j = HILBERT_INVERSE_LUT_3[i | ((h >> m) & 0x3f)];
        z = (z << 6) | (j & 0x3f);
        i = j & 0xc0;
    }
    if (m != 0) {
        // Consume the remaining m (2 or 4) bits of h.
        int r = 6 - m;
        uint8_t j = HILBERT_INVERSE_LUT_3[i | ((h << r) & 0x3f)];
        z = (z << m) | ((j & 0x3f) >> r);
    }
    return mortonIndexInverse(z);
}
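// Round-trip sketch (illustrative, not part of the original header):
//
//     uint64_t h = hilbertIndex(mortonIndex(5u, 9u), 4);  // 0 <= h < 256
//     uint32_t x, y;
//     std::tie(x, y) = hilbertIndexInverse(h, 4);
//     // now x == 5 and y == 9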
 
#if !defined(NO_SIMD) && defined(__x86_64__)
    inline __m128i hilbertIndexInverseSimd(uint64_t h, int m) {
        // A minimal sketch, assuming the scalar hilbertIndexInverse above
        // and the mortonIndexInverseSimd lane layout (x in the low lane,
        // y in the high lane); the original body is not shown here.
        uint32_t x, y;
        std::tie(x, y) = hilbertIndexInverse(h, m);
        return _mm_set_epi64x(static_cast<int64_t>(y),
                              static_cast<int64_t>(x));
    }
#endif

#endif // LSST_SPHGEOM_CURVE_H_