From dab86796a41c25dddeab825e4aab372d703d37ce Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 9 Jan 2024 21:35:39 +0300 Subject: [PATCH 1/2] simd: min / max helpers --- include/cglm/simd/arm.h | 8 +++----- include/cglm/simd/wasm.h | 14 ++++++-------- include/cglm/simd/x86.h | 3 +++ 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h index e8507d7..1578390 100644 --- a/include/cglm/simd/arm.h +++ b/include/cglm/simd/arm.h @@ -56,11 +56,9 @@ glmm_float32x4_init(float x, float y, float z, float w) { #define glmm_float32x4_SIGNMASK_NPNP glmm_float32x4_init(-0.f, 0.f, -0.f, 0.f) #define glmm_float32x4_SIGNMASK_NPPN glmm_float32x4_init(-0.f, 0.f, 0.f, -0.f) -static inline -float32x4_t -glmm_abs(float32x4_t v) { - return vabsq_f32(v); -} +static inline float32x4_t glmm_abs(float32x4_t v) { return vabsq_f32(v); } +static inline float32x4_t glmm_min(float32x4_t a, float32x4_t b) { return vminq_f32(a, b); } +static inline float32x4_t glmm_max(float32x4_t a, float32x4_t b) { return vmaxq_f32(a, b); } static inline float32x4_t diff --git a/include/cglm/simd/wasm.h b/include/cglm/simd/wasm.h index faaf0c0..d9a18cd 100644 --- a/include/cglm/simd/wasm.h +++ b/include/cglm/simd/wasm.h @@ -34,13 +34,11 @@ #define glmm_float32x4_SIGNMASK_PNPN GLMM__SIGNMASKf(0, GLMM_NEGZEROf, 0, GLMM_NEGZEROf) #define glmm_float32x4_SIGNMASK_NPNP GLMM__SIGNMASKf(GLMM_NEGZEROf, 0, GLMM_NEGZEROf, 0) #define glmm_float32x4_SIGNMASK_NPPN GLMM__SIGNMASKf(GLMM_NEGZEROf, 0, 0, GLMM_NEGZEROf) -#define glmm_float32x4_SIGNMASK_NEG wasm_i32x4_const_splat(GLMM_NEGZEROf) +#define glmm_float32x4_SIGNMASK_NEG wasm_i32x4_const_splat(GLMM_NEGZEROf) -static inline -glmm_128 -glmm_abs(glmm_128 x) { - return wasm_f32x4_abs(x); -} +static inline glmm_128 glmm_abs(glmm_128 x) { return wasm_f32x4_abs(x); } +static inline glmm_128 glmm_min(glmm_128 a, glmm_128 b) { return wasm_f32x4_pmin(b, a); } +static inline glmm_128 glmm_max(glmm_128 a, glmm_128 b) { return wasm_f32x4_pmax(b, a); } static inline glmm_128 @@ -74,7 +72,7 @@ glmm_128 glmm_vhmin(glmm_128 v) { glmm_128 x0, x1, x2; x0 = glmm_shuff1(v, 2, 3, 2, 3); /* [2, 3, 2, 3] */ - x1 = wasm_f32x4_pmin(x0, v); /* [0|2, 1|3, 2|2, 3|3] */ + x1 = wasm_f32x4_pmin(x0, v); /* [0|2, 1|3, 2|2, 3|3] */ x2 = glmm_splat(x1, 1); /* [1|3, 1|3, 1|3, 1|3] */ return wasm_f32x4_pmin(x1, x2); } @@ -90,7 +88,7 @@ glmm_128 glmm_vhmax(glmm_128 v) { glmm_128 x0, x1, x2; x0 = glmm_shuff1(v, 2, 3, 2, 3); /* [2, 3, 2, 3] */ - x1 = wasm_f32x4_pmax(x0, v); /* [0|2, 1|3, 2|2, 3|3] */ + x1 = wasm_f32x4_pmax(x0, v); /* [0|2, 1|3, 2|2, 3|3] */ x2 = glmm_splat(x1, 1); /* [1|3, 1|3, 1|3, 1|3] */ /* _mm_max_ss */ return wasm_i32x4_shuffle(x1, wasm_f32x4_pmax(x1, x2), 4, 1, 2, 3); diff --git a/include/cglm/simd/x86.h b/include/cglm/simd/x86.h index 696410d..8fd5a72 100644 --- a/include/cglm/simd/x86.h +++ b/include/cglm/simd/x86.h @@ -74,6 +74,9 @@ glmm_abs(__m128 x) { return _mm_andnot_ps(glmm_float32x4_SIGNMASK_NEG, x); } +static inline __m128 glmm_min(__m128 a, __m128 b) { return _mm_min_ps(a, b); } +static inline __m128 glmm_max(__m128 a, __m128 b) { return _mm_max_ps(a, b); } + static inline __m128 glmm_vhadd(__m128 v) { From 6d8dd42ac26e361ccbfaf494eb5cb17a035fb52f Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Thu, 11 Jan 2024 00:14:28 +0300 Subject: [PATCH 2/2] simd: use new glmm_min/max in vec4 where possible --- include/cglm/vec4.h | 79 +++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 50 deletions(-) diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h index fef1e7d..6588c9e 100644 --- a/include/cglm/vec4.h +++ b/include/cglm/vec4.h @@ -653,17 +653,14 @@ CGLM_INLINE void glm_vec4_maxadd(vec4 a, vec4 b, vec4 dest) { #if defined(__wasm__) && defined(__wasm_simd128__) - glmm_store(dest, wasm_f32x4_add( - glmm_load(dest), - wasm_f32x4_pmax(glmm_load(a), glmm_load(b)))); + glmm_store(dest, wasm_f32x4_add(glmm_load(dest), + glmm_max(glmm_load(a), glmm_load(b)))); #elif defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_add_ps(glmm_load(dest), - _mm_max_ps(glmm_load(a), - glmm_load(b)))); + glmm_max(glmm_load(a), glmm_load(b)))); #elif defined(CGLM_NEON_FP) - vst1q_f32(dest, vaddq_f32(vld1q_f32(dest), - vmaxq_f32(vld1q_f32(a), - vld1q_f32(b)))); + glmm_store(dest, vaddq_f32(glmm_load(dest), + glmm_max(glmm_load(a), glmm_load(b)))); #else dest[0] += glm_max(a[0], b[0]); dest[1] += glm_max(a[1], b[1]); @@ -685,17 +682,14 @@ CGLM_INLINE void glm_vec4_minadd(vec4 a, vec4 b, vec4 dest) { #if defined(__wasm__) && defined(__wasm_simd128__) - glmm_store(dest, wasm_f32x4_add( - glmm_load(dest), - wasm_f32x4_pmin(glmm_load(a), glmm_load(b)))); + glmm_store(dest, wasm_f32x4_add(glmm_load(dest), + glmm_min(glmm_load(a), glmm_load(b)))); #elif defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_add_ps(glmm_load(dest), - _mm_min_ps(glmm_load(a), - glmm_load(b)))); + glmm_min(glmm_load(a), glmm_load(b)))); #elif defined(CGLM_NEON_FP) - vst1q_f32(dest, vaddq_f32(vld1q_f32(dest), - vminq_f32(vld1q_f32(a), - vld1q_f32(b)))); + glmm_store(dest, vaddq_f32(glmm_load(dest), + glmm_min(glmm_load(a), glmm_load(b)))); #else dest[0] += glm_min(a[0], b[0]); dest[1] += glm_min(a[1], b[1]); @@ -825,17 +819,14 @@ CGLM_INLINE void glm_vec4_maxsub(vec4 a, vec4 b, vec4 dest) { #if defined(__wasm__) && defined(__wasm_simd128__) - glmm_store(dest, wasm_f32x4_sub( - glmm_load(dest), - wasm_f32x4_pmax(glmm_load(a), glmm_load(b)))); + glmm_store(dest, wasm_f32x4_sub(glmm_load(dest), + glmm_max(glmm_load(a), glmm_load(b)))); #elif defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_sub_ps(glmm_load(dest), - _mm_max_ps(glmm_load(a), - glmm_load(b)))); + glmm_max(glmm_load(a), glmm_load(b)))); #elif defined(CGLM_NEON_FP) - vst1q_f32(dest, vsubq_f32(vld1q_f32(dest), - vmaxq_f32(vld1q_f32(a), - vld1q_f32(b)))); + glmm_store(dest, vsubq_f32(glmm_load(dest), + glmm_max(glmm_load(a), glmm_load(b)))); #else dest[0] -= glm_max(a[0], b[0]); dest[1] -= glm_max(a[1], b[1]); @@ -857,17 +848,14 @@ CGLM_INLINE void glm_vec4_minsub(vec4 a, vec4 b, vec4 dest) { #if defined(__wasm__) && defined(__wasm_simd128__) - glmm_store(dest, wasm_f32x4_sub( - glmm_load(dest), - wasm_f32x4_pmin(glmm_load(a), glmm_load(b)))); + glmm_store(dest, wasm_f32x4_sub(glmm_load(dest), + glmm_min(glmm_load(a), glmm_load(b)))); #elif defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest, _mm_sub_ps(glmm_load(dest), - _mm_min_ps(glmm_load(a), - glmm_load(b)))); + glmm_min(glmm_load(a), glmm_load(b)))); #elif defined(CGLM_NEON_FP) - vst1q_f32(dest, vsubq_f32(vld1q_f32(dest), - vminq_f32(vld1q_f32(a), - vld1q_f32(b)))); + glmm_store(dest, vsubq_f32(vld1q_f32(dest), + glmm_min(glmm_load(a), glmm_load(b)))); #else dest[0] -= glm_min(a[0], b[0]); dest[1] -= glm_min(a[1], b[1]); @@ -1031,12 +1019,8 @@ glm_vec4_distance2(vec4 a, vec4 b) { CGLM_INLINE void glm_vec4_maxv(vec4 a, vec4 b, vec4 dest) { -#if defined(__wasm__) && defined(__wasm_simd128__) - glmm_store(dest, wasm_f32x4_pmax(glmm_load(a), glmm_load(b))); -#elif defined( __SSE__ ) || defined( __SSE2__ ) - glmm_store(dest, _mm_max_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_NEON_FP) - vst1q_f32(dest, vmaxq_f32(vld1q_f32(a), vld1q_f32(b))); +#if defined(CGLM_SIMD) + glmm_store(dest, glmm_max(glmm_load(a), glmm_load(b))); #else dest[0] = glm_max(a[0], b[0]); dest[1] = glm_max(a[1], b[1]); @@ -1055,12 +1039,8 @@ glm_vec4_maxv(vec4 a, vec4 b, vec4 dest) { CGLM_INLINE void glm_vec4_minv(vec4 a, vec4 b, vec4 dest) { -#if defined(__wasm__) && defined(__wasm_simd128__) - glmm_store(dest, wasm_f32x4_pmin(glmm_load(a), glmm_load(b))); -#elif defined( __SSE__ ) || defined( __SSE2__ ) - glmm_store(dest, _mm_min_ps(glmm_load(a), glmm_load(b))); -#elif defined(CGLM_NEON_FP) - vst1q_f32(dest, vminq_f32(vld1q_f32(a), vld1q_f32(b))); +#if defined(CGLM_SIMD) + glmm_store(dest, glmm_min(glmm_load(a), glmm_load(b))); #else dest[0] = glm_min(a[0], b[0]); dest[1] = glm_min(a[1], b[1]); @@ -1080,14 +1060,13 @@ CGLM_INLINE void glm_vec4_clamp(vec4 v, float minVal, float maxVal) { #if defined(__wasm__) && defined(__wasm_simd128__) - glmm_store(v, wasm_f32x4_pmin( - wasm_f32x4_pmax(glmm_load(v), wasm_f32x4_splat(minVal)), - wasm_f32x4_splat(maxVal))); + glmm_store(v, glmm_min(glmm_max(glmm_load(v), wasm_f32x4_splat(minVal)), + wasm_f32x4_splat(maxVal))); #elif defined( __SSE__ ) || defined( __SSE2__ ) - glmm_store(v, _mm_min_ps(_mm_max_ps(glmm_load(v), _mm_set1_ps(minVal)), - _mm_set1_ps(maxVal))); + glmm_store(v, glmm_min(glmm_max(glmm_load(v), _mm_set1_ps(minVal)), + _mm_set1_ps(maxVal))); #elif defined(CGLM_NEON_FP) - vst1q_f32(v, vminq_f32(vmaxq_f32(vld1q_f32(v), vdupq_n_f32(minVal)), + glmm_store(v, glmm_min(glmm_max(vld1q_f32(v), vdupq_n_f32(minVal)), vdupq_n_f32(maxVal))); #else v[0] = glm_clamp(v[0], minVal, maxVal);