28 #if !defined(NO_SIMD) && defined(__x86_64__) 
   29     #include <x86intrin.h> 
   42     static constexpr uint8_t UNUSED = 255;
 
   50     static uint8_t 
const COMPONENT[8][4] = {
 
   54         {UNUSED, UNUSED, UNUSED, UNUSED},
 
   55         {UNUSED, UNUSED, UNUSED, UNUSED},
 
   60 #if defined(NO_SIMD) || !defined(__x86_64__) 
   64     int index = ((ax > az) << 2) +
 
   67     double w = _v[COMPONENT[index][2]];
 
   74     double u = _v[COMPONENT[index][0]] / maxabs;
 
   75     double v = _v[COMPONENT[index][1]] / maxabs;
 
   77     double d = u * u + v * v;
 
   79     _v[COMPONENT[index][0]] = u / 
norm;
 
   80     _v[COMPONENT[index][1]] = v / 
norm;
 
   81     _v[COMPONENT[index][2]] = 
w / 
norm;
 
   84     static __m128d 
const m0m0 = _mm_set_pd(-0.0, -0.0);
 
   85     __m128d ayaz = _mm_andnot_pd(m0m0, _mm_loadu_pd(_v + 1));
 
   86     __m128d axax = _mm_andnot_pd(m0m0, _mm_set1_pd(_v[0]));
 
   87     __m128d az = _mm_unpackhi_pd(ayaz, _mm_setzero_pd());
 
   88     int index = (_mm_movemask_pd(_mm_cmpgt_pd(axax, ayaz)) << 1) |
 
   89                  _mm_movemask_pd(_mm_cmplt_sd(az, ayaz));
 
   94     __m128d uv = _mm_set_pd(_v[COMPONENT[index][1]],
 
   95                             _v[COMPONENT[index][0]]);
 
   98     __m128d ww = _mm_set1_pd(_v[COMPONENT[index][2]]);
 
   99     __m128d maxabs = _mm_andnot_pd(m0m0, ww);
 
  100     if (_mm_ucomieq_sd(ww, _mm_setzero_pd())) {
 
  105     uv = _mm_div_pd(uv, maxabs);
 
  106     ww = _mm_or_pd(_mm_and_pd(m0m0, ww), _mm_set1_pd(1.0));
 
  107     __m128d 
norm = _mm_mul_pd(uv, uv);
 
  112             _mm_add_sd(
norm, _mm_unpackhi_pd(
norm, _mm_setzero_pd()))
 
  116     ww = _mm_div_sd(ww, 
norm);
 
  117     uv = _mm_div_pd(uv, _mm_shuffle_pd(
norm, 
norm, 0));
 
  118     _mm_store_sd(&_v[COMPONENT[index][0]], uv);
 
  119     _mm_storeh_pd(&_v[COMPONENT[index][1]], uv);
 
  120     _mm_store_sd(&_v[COMPONENT[index][2]], ww);
 
  121     return _mm_cvtsd_f64(_mm_mul_sd(
norm, maxabs));
 
  130     return v * c + k.
cross(v) * s + k * (k.
dot(v) * (1.0 - c));
 
  136                   v.
x(), v.
y(), v.
z());