28 #if !defined(NO_SIMD) && defined(__x86_64__)
29 #include <x86intrin.h>
// Sentinel for table slots that must never be read: 255 is not a valid
// 0..2 vector-component index, so any lookup hitting UNUSED is a logic error.
42 static constexpr uint8_t UNUSED = 255;
// Permutation table indexed by the 3-bit magnitude-comparison result built
// in the normalization code below.  Entries [0] and [1] select the two
// smaller-magnitude components (u, v) and entry [2] the largest-magnitude
// one (w); entry [3] is padding to keep rows 4 bytes wide.
// NOTE(review): only two rows are visible in this view; the remaining rows
// presumably hold permutations of {0,1,2} for the comparison outcomes that
// can occur -- confirm against the full file.  All-UNUSED rows correspond
// to impossible comparison results.
50 static uint8_t
const COMPONENT[8][4] = {
54 {UNUSED, UNUSED, UNUSED, UNUSED},
55 {UNUSED, UNUSED, UNUSED, UNUSED},
// Portable scalar fallback, compiled when SIMD is disabled or the target
// is not x86-64.  Overflow-safe normalization: scale by the largest-
// magnitude component before squaring so the intermediate sum of squares
// cannot overflow or underflow.
60 #if defined(NO_SIMD) || !defined(__x86_64__)
// 3-bit index from pairwise |component| comparisons; selects the COMPONENT
// row whose entry [2] names the dominant component.  NOTE(review): the
// rest of this expression (bits for the other comparisons) is on lines not
// visible in this view -- confirm.
64 int index = ((ax > az) << 2) +
// w: the largest-magnitude component.
67 double w = _v[COMPONENT[index][2]];
// u, v: the two minor components scaled into [-1, 1] by dividing by
// maxabs.  NOTE(review): maxabs is defined on a line not visible here;
// presumably fabs(w) -- confirm.
74 double u = _v[COMPONENT[index][0]] / maxabs;
75 double v = _v[COMPONENT[index][1]] / maxabs;
// d = u^2 + v^2 with |u|,|v| <= 1, so this sum cannot overflow.
77 double d = u * u + v * v;
// Write the normalized components back in place.  NOTE(review): `norm` is
// computed on lines missing from this view; by analogy with the SIMD
// branch below it is presumably sqrt(d + 1) with w reduced to +/-1 first
// -- confirm.
79 _v[COMPONENT[index][0]] = u /
norm;
80 _v[COMPONENT[index][1]] = v /
norm;
81 _v[COMPONENT[index][2]] =
w /
norm;
// SSE2 branch of the same overflow-safe normalization.
// m0m0 holds the sign bit in both lanes; andnot with it clears the sign
// bit, i.e. computes fabs() lane-wise.
84 static __m128d
const m0m0 = _mm_set_pd(-0.0, -0.0);
// ayaz = (|_v[1]|, |_v[2]|) via an unaligned load of the last two
// components; axax = |_v[0]| broadcast to both lanes.
85 __m128d ayaz = _mm_andnot_pd(m0m0, _mm_loadu_pd(_v + 1));
86 __m128d axax = _mm_andnot_pd(m0m0, _mm_set1_pd(_v[0]));
// az: |_v[2]| moved into the low lane (high lane zeroed).
87 __m128d az = _mm_unpackhi_pd(ayaz, _mm_setzero_pd());
// Pack the three magnitude comparisons into a 3-bit COMPONENT row index:
// bits 2:1 = (|x|>|z|, |x|>|y|) from the packed compare's movemask,
// bit 0   = (|z|<|y|) from the scalar compare's low-lane mask bit.
88 int index = (_mm_movemask_pd(_mm_cmpgt_pd(axax, ayaz)) << 1) |
89 _mm_movemask_pd(_mm_cmplt_sd(az, ayaz));
// Gather the two minor components into uv = (u in low lane, v in high).
94 __m128d uv = _mm_set_pd(_v[COMPONENT[index][1]],
95 _v[COMPONENT[index][0]]);
// ww: the dominant component broadcast; maxabs = |w| in both lanes.
98 __m128d ww = _mm_set1_pd(_v[COMPONENT[index][2]]);
99 __m128d maxabs = _mm_andnot_pd(m0m0, ww);
// Dominant component zero means the whole vector is zero.  NOTE(review):
// the body of this early-out is on lines not visible in this view.
100 if (_mm_ucomieq_sd(ww, _mm_setzero_pd())) {
// Scale minors into [-1,1]; reduce ww to copysign(1.0, w) by keeping only
// its sign bit and OR-ing in 1.0.
105 uv = _mm_div_pd(uv, maxabs);
106 ww = _mm_or_pd(_mm_and_pd(m0m0, ww), _mm_set1_pd(1.0));
// norm starts as (u*u, v*v); the add_sd below folds the high lane into
// the low lane to form d = u^2 + v^2.  NOTE(review): the sqrt(d + 1)
// step presumably sits on the lines missing between here and line 116
// -- confirm.
107 __m128d
norm = _mm_mul_pd(uv, uv);
112 _mm_add_sd(
norm, _mm_unpackhi_pd(
norm, _mm_setzero_pd()))
// Divide through: ww becomes +/-1/norm; shuffle(norm, norm, 0) broadcasts
// norm's low lane so both minors are divided by the same scalar.
116 ww = _mm_div_sd(ww,
norm);
117 uv = _mm_div_pd(uv, _mm_shuffle_pd(
norm,
norm, 0));
// Scatter the normalized components back through the same permutation.
118 _mm_store_sd(&_v[COMPONENT[index][0]], uv);
119 _mm_storeh_pd(&_v[COMPONENT[index][1]], uv);
120 _mm_store_sd(&_v[COMPONENT[index][2]], ww);
// Return norm * maxabs -- presumably the original (pre-normalization)
// length of the vector, undoing the scale-by-max.  TODO confirm against
// the function's documented contract.
121 return _mm_cvtsd_f64(_mm_mul_sd(
norm, maxabs));
// Rodrigues' rotation formula: v' = v*cos(t) + (k x v)*sin(t)
//                                   + k*(k . v)*(1 - cos(t)).
// NOTE(review): c and s are presumably cos/sin of the rotation angle and
// k a unit-length axis (the formula requires |k| = 1) -- confirm against
// the function header, which is not visible in this view.
130 return v * c + k.
cross(v) * s + k * (k.
dot(v) * (1.0 - c));
// NOTE(review): tail of a call expanding vector v into its scalar x/y/z
// components; the callee and opening of this expression are on lines not
// visible in this view.
136 v.
x(), v.
y(), v.
z());