simd: organise SIMD functions

* optimize dot product
2026-02-17 03:39:05 +00:00 · 2019-01-24 10:17:49 +03:00
parent be6aa9a89a
commit 31bb303c55
10 changed files with 259 additions and 126 deletions
--- a/include/cglm/simd/intrin.h
+++ b/include/cglm/simd/intrin.h
@@ -27,94 +27,39 @@
 #if defined( __SSE__ ) || defined( __SSE2__ )
 #  include <xmmintrin.h>
 #  include <emmintrin.h>
-
-/* OPTIONAL: You may save some instructions but latency (not sure) */
-#ifdef CGLM_USE_INT_DOMAIN
-#  define glmm_shuff1(xmm, z, y, x, w)                                        \
-     _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm),                \
-                                        _MM_SHUFFLE(z, y, x, w)))
-#else
-#  define glmm_shuff1(xmm, z, y, x, w)                                        \
-     _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w))
-#endif
-
-#define glmm_shuff1x(xmm, x) glmm_shuff1(xmm, x, x, x, x)
-#define glmm_shuff2(a, b, z0, y0, x0, w0, z1, y1, x1, w1)                     \
-     glmm_shuff1(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)),           \
-                 z1, y1, x1, w1)
-
-static inline
-__m128
-glmm_dot(__m128 a, __m128 b) {
-  __m128 x0;
-  x0 = _mm_mul_ps(a, b);
-  x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 3, 2));
-  return _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 0, 1));
-}
-
-static inline
-__m128
-glmm_norm(__m128 a) {
-  return _mm_sqrt_ps(glmm_dot(a, a));
-}
-
-static inline
-__m128
-glmm_load3(float v[3]) {
-  __m128i xy;
-  __m128  z;
-
-  xy = _mm_loadl_epi64((const __m128i *)v);
-  z  = _mm_load_ss(&v[2]);
-
-  return _mm_movelh_ps(_mm_castsi128_ps(xy), z);
-}
-
-static inline
-void
-glmm_store3(__m128 vx, float v[3]) {
-  _mm_storel_pi((__m64 *)&v[0], vx);
-  _mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2));
-}
-
-#ifdef CGLM_ALL_UNALIGNED
-#  define glmm_load(p)      _mm_loadu_ps(p)
-#  define glmm_store(p, a)  _mm_storeu_ps(p, a)
-#else
-#  define glmm_load(p)      _mm_load_ps(p)
-#  define glmm_store(p, a)  _mm_store_ps(p, a)
-#endif
-
-#endif
-
-/* x86, x64 */
-#if defined( __SSE__ ) || defined( __SSE2__ )
 #  define CGLM_SSE_FP 1
-#endif
-
-#ifdef __AVX__
-#  define CGLM_AVX_FP 1
-
-#ifdef CGLM_ALL_UNALIGNED
-#  define glmm_load256(p)      _mm256_loadu_ps(p)
-#  define glmm_store256(p, a)  _mm256_storeu_ps(p, a)
-#else
-#  define glmm_load256(p)      _mm256_load_ps(p)
-#  define glmm_store256(p, a)  _mm256_store_ps(p, a)
-#endif
-
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
 #endif

 #if defined(__SSE3__)
 #  include <x86intrin.h>
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
 #endif

 #if defined(__SSE4_1__)
 #  include <smmintrin.h>
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
 #endif

 #if defined(__SSE4_2__)
 #  include <nmmintrin.h>
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
+#endif
+
+#ifdef __AVX__
+#  include <immintrin.h>
+#  define CGLM_AVX_FP 1
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
 #endif

 /* ARM Neon */
@@ -122,9 +67,24 @@ glmm_store3(__m128 vx, float v[3]) {
 #  include <arm_neon.h>
 #  if defined(__ARM_NEON_FP)
 #    define CGLM_NEON_FP 1
+#    ifndef CGLM_SIMD_ARM
+#      define CGLM_SIMD_ARM
+#    endif
 #  endif
-#else
-#  undef  CGLM_NEON_FP
+#endif
+
+#if defined(CGLM_SIMD_x86) || defined(CGLM_NEON_FP)
+#  ifndef CGLM_SIMD
+#    define CGLM_SIMD
+#  endif
+#endif
+
+#if defined(CGLM_SIMD_x86)
+#  include "x86.h"
+#endif
+
+#if defined(CGLM_SIMD_ARM)
+#  include "arm.h"
 #endif

 #endif /* cglm_intrin_h */