simd: organise SIMD-functions

* optimize dot product
This commit is contained in:
Recep Aslantas
2019-01-24 10:17:49 +03:00
parent be6aa9a89a
commit 31bb303c55
10 changed files with 259 additions and 126 deletions

View File

@@ -27,94 +27,39 @@
#if defined( __SSE__ ) || defined( __SSE2__ )
# include <xmmintrin.h>
# include <emmintrin.h>
/* OPTIONAL: You may save some instructions but latency (not sure) */
#ifdef CGLM_USE_INT_DOMAIN
# define glmm_shuff1(xmm, z, y, x, w) \
_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm), \
_MM_SHUFFLE(z, y, x, w)))
#else
# define glmm_shuff1(xmm, z, y, x, w) \
_mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w))
#endif
#define glmm_shuff1x(xmm, x) glmm_shuff1(xmm, x, x, x, x)
#define glmm_shuff2(a, b, z0, y0, x0, w0, z1, y1, x1, w1) \
glmm_shuff1(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)), \
z1, y1, x1, w1)
static inline
__m128
glmm_dot(__m128 a, __m128 b) {
__m128 x0;
x0 = _mm_mul_ps(a, b);
x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 3, 2));
return _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 0, 1));
}
static inline
__m128
glmm_norm(__m128 a) {
return _mm_sqrt_ps(glmm_dot(a, a));
}
static inline
__m128
glmm_load3(float v[3]) {
__m128i xy;
__m128 z;
xy = _mm_loadl_epi64((const __m128i *)v);
z = _mm_load_ss(&v[2]);
return _mm_movelh_ps(_mm_castsi128_ps(xy), z);
}
static inline
void
glmm_store3(__m128 vx, float v[3]) {
_mm_storel_pi((__m64 *)&v[0], vx);
_mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2));
}
#ifdef CGLM_ALL_UNALIGNED
# define glmm_load(p) _mm_loadu_ps(p)
# define glmm_store(p, a) _mm_storeu_ps(p, a)
#else
# define glmm_load(p) _mm_load_ps(p)
# define glmm_store(p, a) _mm_store_ps(p, a)
#endif
#endif
/* x86, x64 */
#if defined( __SSE__ ) || defined( __SSE2__ )
# define CGLM_SSE_FP 1
#endif
#ifdef __AVX__
# define CGLM_AVX_FP 1
#ifdef CGLM_ALL_UNALIGNED
# define glmm_load256(p) _mm256_loadu_ps(p)
# define glmm_store256(p, a) _mm256_storeu_ps(p, a)
#else
# define glmm_load256(p) _mm256_load_ps(p)
# define glmm_store256(p, a) _mm256_store_ps(p, a)
#endif
# ifndef CGLM_SIMD_x86
# define CGLM_SIMD_x86
# endif
#endif
#if defined(__SSE3__)
# include <x86intrin.h>
# ifndef CGLM_SIMD_x86
# define CGLM_SIMD_x86
# endif
#endif
#if defined(__SSE4_1__)
# include <smmintrin.h>
# ifndef CGLM_SIMD_x86
# define CGLM_SIMD_x86
# endif
#endif
#if defined(__SSE4_2__)
# include <nmmintrin.h>
# ifndef CGLM_SIMD_x86
# define CGLM_SIMD_x86
# endif
#endif
#ifdef __AVX__
# include <immintrin.h>
# define CGLM_AVX_FP 1
# ifndef CGLM_SIMD_x86
# define CGLM_SIMD_x86
# endif
#endif
/* ARM Neon */
@@ -122,9 +67,24 @@ glmm_store3(__m128 vx, float v[3]) {
# include <arm_neon.h>
# if defined(__ARM_NEON_FP)
# define CGLM_NEON_FP 1
# ifndef CGLM_SIMD_ARM
# define CGLM_SIMD_ARM
# endif
# endif
#else
# undef CGLM_NEON_FP
#endif
#if defined(CGLM_SIMD_x86) || defined(CGLM_NEON_FP)
# ifndef CGLM_SIMD
# define CGLM_SIMD
# endif
#endif
#if defined(CGLM_SIMD_x86)
# include "x86.h"
#endif
#if defined(CGLM_SIMD_ARM)
# include "arm.h"
#endif
#endif /* cglm_intrin_h */