From 252bf925fc4ecd9cd939992f3651df15fc84e0c3 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 8 May 2018 15:25:23 +0300 Subject: [PATCH 01/11] simd, sse2: make alignment optional for load operations --- include/cglm/affine.h | 34 ++++++++--------- include/cglm/mat4.h | 8 ++-- include/cglm/quat.h | 2 +- include/cglm/simd/intrin.h | 12 +++++- include/cglm/simd/sse2/affine.h | 38 +++++++++---------- include/cglm/simd/sse2/mat4.h | 66 ++++++++++++++++----------------- include/cglm/simd/sse2/quat.h | 4 +- include/cglm/vec4-ext.h | 6 +-- include/cglm/vec4.h | 58 ++++++++++++++--------------- 9 files changed, 118 insertions(+), 110 deletions(-) diff --git a/include/cglm/affine.h b/include/cglm/affine.h index 8124930..b200f30 100644 --- a/include/cglm/affine.h +++ b/include/cglm/affine.h @@ -59,18 +59,18 @@ glm_translate_to(mat4 m, vec3 v, mat4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) _mm_store_ps(dest[3], - _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_load_ps(t[0]), + _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_load(t[0]), _mm_set1_ps(v[0])), - _mm_mul_ps(_mm_load_ps(t[1]), + _mm_mul_ps(glmm_load(t[1]), _mm_set1_ps(v[1]))), - _mm_add_ps(_mm_mul_ps(_mm_load_ps(t[2]), + _mm_add_ps(_mm_mul_ps(glmm_load(t[2]), _mm_set1_ps(v[2])), - _mm_load_ps(t[3])))) + glmm_load(t[3])))) ; - _mm_store_ps(dest[0], _mm_load_ps(m[0])); - _mm_store_ps(dest[1], _mm_load_ps(m[1])); - _mm_store_ps(dest[2], _mm_load_ps(m[2])); + _mm_store_ps(dest[0], glmm_load(m[0])); + _mm_store_ps(dest[1], glmm_load(m[1])); + _mm_store_ps(dest[2], glmm_load(m[2])); #else vec4 v1, v2, v3; @@ -98,13 +98,13 @@ void glm_translate(mat4 m, vec3 v) { #if defined( __SSE__ ) || defined( __SSE2__ ) _mm_store_ps(m[3], - _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_load_ps(m[0]), + _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_load(m[0]), _mm_set1_ps(v[0])), - _mm_mul_ps(_mm_load_ps(m[1]), + _mm_mul_ps(glmm_load(m[1]), _mm_set1_ps(v[1]))), - _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[2]), + _mm_add_ps(_mm_mul_ps(glmm_load(m[2]), _mm_set1_ps(v[2])), - _mm_load_ps(m[3])))) + glmm_load(m[3])))) ; #else vec4 v1, v2, v3; @@ -130,9 +130,9 @@ void glm_translate_x(mat4 m, float x) { #if defined( __SSE__ ) || defined( __SSE2__ ) _mm_store_ps(m[3], - _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[0]), + _mm_add_ps(_mm_mul_ps(glmm_load(m[0]), _mm_set1_ps(x)), - _mm_load_ps(m[3]))) + glmm_load(m[3]))) ; #else vec4 v1; @@ -152,9 +152,9 @@ void glm_translate_y(mat4 m, float y) { #if defined( __SSE__ ) || defined( __SSE2__ ) _mm_store_ps(m[3], - _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[1]), + _mm_add_ps(_mm_mul_ps(glmm_load(m[1]), _mm_set1_ps(y)), - _mm_load_ps(m[3]))) + glmm_load(m[3]))) ; #else vec4 v1; @@ -174,9 +174,9 @@ void glm_translate_z(mat4 m, float z) { #if defined( __SSE__ ) || defined( __SSE2__ ) _mm_store_ps(m[3], - _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[2]), + _mm_add_ps(_mm_mul_ps(glmm_load(m[2]), _mm_set1_ps(z)), - _mm_load_ps(m[3]))) + glmm_load(m[3]))) ; #else vec4 v1; diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h index d1a72cb..8cafb7c 100644 --- a/include/cglm/mat4.h +++ b/include/cglm/mat4.h @@ -113,10 +113,10 @@ glm_mat4_copy(mat4 mat, mat4 dest) { _mm256_store_ps(dest[0], _mm256_load_ps(mat[0])); _mm256_store_ps(dest[2], _mm256_load_ps(mat[2])); #elif defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest[0], _mm_load_ps(mat[0])); - _mm_store_ps(dest[1], _mm_load_ps(mat[1])); - _mm_store_ps(dest[2], _mm_load_ps(mat[2])); - _mm_store_ps(dest[3], _mm_load_ps(mat[3])); + _mm_store_ps(dest[0], glmm_load(mat[0])); + _mm_store_ps(dest[1], glmm_load(mat[1])); + 
_mm_store_ps(dest[2], glmm_load(mat[2])); + _mm_store_ps(dest[3], glmm_load(mat[3])); #else glm_mat4_ucopy(mat, dest); #endif diff --git a/include/cglm/quat.h b/include/cglm/quat.h index f6dc86e..6bff527 100644 --- a/include/cglm/quat.h +++ b/include/cglm/quat.h @@ -198,7 +198,7 @@ glm_quat_normalize_to(versor q, versor dest) { __m128 xdot, x0; float dot; - x0 = _mm_load_ps(q); + x0 = glmm_load(q); xdot = glm_simd_dot(x0, x0); dot = _mm_cvtss_f32(xdot); diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h index cf6753f..bf1db60 100644 --- a/include/cglm/simd/intrin.h +++ b/include/cglm/simd/intrin.h @@ -35,7 +35,7 @@ _mm_shuffle1_ps(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)), \ z1, y1, x1, w1) -CGLM_INLINE +static inline __m128 glm_simd_dot(__m128 a, __m128 b) { __m128 x0; @@ -44,7 +44,7 @@ glm_simd_dot(__m128 a, __m128 b) { return _mm_add_ps(x0, _mm_shuffle1_ps(x0, 0, 1, 0, 1)); } -CGLM_INLINE +static inline __m128 glm_simd_norm(__m128 a) { return _mm_sqrt_ps(glm_simd_dot(a, a)); @@ -69,6 +69,14 @@ glm_simd_store_v3(__m128 vx, vec3 v) { _mm_store_ss(&v[2], _mm_shuffle1_ps(vx, 2, 2, 2, 2)); } +#ifdef CGLM_ALL_UNALIGNED +#define glmm_load(p) _mm_loadu_ps(p) +#define glmm_store(p, a) _mm_storeu_ps(p, a) +#else +#define glmm_load(p) _mm_load_ps(p) +#define glmm_store(p, a) _mm_store_ps(p, a) +#endif + #endif /* x86, x64 */ diff --git a/include/cglm/simd/sse2/affine.h b/include/cglm/simd/sse2/affine.h index b746d0f..df8c166 100644 --- a/include/cglm/simd/sse2/affine.h +++ b/include/cglm/simd/sse2/affine.h @@ -18,30 +18,30 @@ glm_mul_sse2(mat4 m1, mat4 m2, mat4 dest) { /* D = R * L (Column-Major) */ __m128 l0, l1, l2, l3, r; - l0 = _mm_load_ps(m1[0]); - l1 = _mm_load_ps(m1[1]); - l2 = _mm_load_ps(m1[2]); - l3 = _mm_load_ps(m1[3]); + l0 = glmm_load(m1[0]); + l1 = glmm_load(m1[1]); + l2 = glmm_load(m1[2]); + l3 = glmm_load(m1[3]); - r = _mm_load_ps(m2[0]); + r = glmm_load(m2[0]); _mm_store_ps(dest[0], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); - r = _mm_load_ps(m2[1]); + r = glmm_load(m2[1]); _mm_store_ps(dest[1], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); - r = _mm_load_ps(m2[2]); + r = glmm_load(m2[2]); _mm_store_ps(dest[2], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); - r = _mm_load_ps(m2[3]); + r = glmm_load(m2[3]); _mm_store_ps(dest[3], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), @@ -55,24 +55,24 @@ glm_mul_rot_sse2(mat4 m1, mat4 m2, mat4 dest) { /* D = R * L (Column-Major) */ __m128 l0, l1, l2, l3, r; - l0 = _mm_load_ps(m1[0]); - l1 = _mm_load_ps(m1[1]); - l2 = _mm_load_ps(m1[2]); - l3 = _mm_load_ps(m1[3]); + l0 = glmm_load(m1[0]); + l1 = glmm_load(m1[1]); + l2 = glmm_load(m1[2]); + l3 = glmm_load(m1[3]); - r = _mm_load_ps(m2[0]); + r = glmm_load(m2[0]); _mm_store_ps(dest[0], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); - r = _mm_load_ps(m2[1]); + r = glmm_load(m2[1]); _mm_store_ps(dest[1], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); - r = _mm_load_ps(m2[2]); + r = glmm_load(m2[2]); _mm_store_ps(dest[2], 
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), @@ -86,10 +86,10 @@ void glm_inv_tr_sse2(mat4 mat) { __m128 r0, r1, r2, r3, x0, x1; - r0 = _mm_load_ps(mat[0]); - r1 = _mm_load_ps(mat[1]); - r2 = _mm_load_ps(mat[2]); - r3 = _mm_load_ps(mat[3]); + r0 = glmm_load(mat[0]); + r1 = glmm_load(mat[1]); + r2 = glmm_load(mat[2]); + r3 = glmm_load(mat[3]); x1 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f); _MM_TRANSPOSE4_PS(r0, r1, r2, x1); diff --git a/include/cglm/simd/sse2/mat4.h b/include/cglm/simd/sse2/mat4.h index 77874a8..404b496 100644 --- a/include/cglm/simd/sse2/mat4.h +++ b/include/cglm/simd/sse2/mat4.h @@ -20,10 +20,10 @@ glm_mat4_scale_sse2(mat4 m, float s){ __m128 x0; x0 = _mm_set1_ps(s); - _mm_store_ps(m[0], _mm_mul_ps(_mm_load_ps(m[0]), x0)); - _mm_store_ps(m[1], _mm_mul_ps(_mm_load_ps(m[1]), x0)); - _mm_store_ps(m[2], _mm_mul_ps(_mm_load_ps(m[2]), x0)); - _mm_store_ps(m[3], _mm_mul_ps(_mm_load_ps(m[3]), x0)); + _mm_store_ps(m[0], _mm_mul_ps(glmm_load(m[0]), x0)); + _mm_store_ps(m[1], _mm_mul_ps(glmm_load(m[1]), x0)); + _mm_store_ps(m[2], _mm_mul_ps(glmm_load(m[2]), x0)); + _mm_store_ps(m[3], _mm_mul_ps(glmm_load(m[3]), x0)); } CGLM_INLINE @@ -31,10 +31,10 @@ void glm_mat4_transp_sse2(mat4 m, mat4 dest){ __m128 r0, r1, r2, r3; - r0 = _mm_load_ps(m[0]); - r1 = _mm_load_ps(m[1]); - r2 = _mm_load_ps(m[2]); - r3 = _mm_load_ps(m[3]); + r0 = glmm_load(m[0]); + r1 = glmm_load(m[1]); + r2 = glmm_load(m[2]); + r3 = glmm_load(m[3]); _MM_TRANSPOSE4_PS(r0, r1, r2, r3); @@ -51,31 +51,31 @@ glm_mat4_mul_sse2(mat4 m1, mat4 m2, mat4 dest) { __m128 l0, l1, l2, l3, r; - l0 = _mm_load_ps(m1[0]); - l1 = _mm_load_ps(m1[1]); - l2 = _mm_load_ps(m1[2]); - l3 = _mm_load_ps(m1[3]); + l0 = glmm_load(m1[0]); + l1 = glmm_load(m1[1]); + l2 = glmm_load(m1[2]); + l3 = glmm_load(m1[3]); - r = _mm_load_ps(m2[0]); + r = glmm_load(m2[0]); _mm_store_ps(dest[0], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); - r = _mm_load_ps(m2[1]); + r = glmm_load(m2[1]); _mm_store_ps(dest[1], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); - r = _mm_load_ps(m2[2]); + r = glmm_load(m2[2]); _mm_store_ps(dest[2], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); - r = _mm_load_ps(m2[3]); + r = glmm_load(m2[3]); _mm_store_ps(dest[3], _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), @@ -88,15 +88,15 @@ void glm_mat4_mulv_sse2(mat4 m, vec4 v, vec4 dest) { __m128 x0, x1, x2; - x0 = _mm_load_ps(v); - x1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[0]), + x0 = glmm_load(v); + x1 = _mm_add_ps(_mm_mul_ps(glmm_load(m[0]), _mm_shuffle1_ps1(x0, 0)), - _mm_mul_ps(_mm_load_ps(m[1]), + _mm_mul_ps(glmm_load(m[1]), _mm_shuffle1_ps1(x0, 1))); - x2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[2]), + x2 = _mm_add_ps(_mm_mul_ps(glmm_load(m[2]), _mm_shuffle1_ps1(x0, 2)), - _mm_mul_ps(_mm_load_ps(m[3]), + _mm_mul_ps(glmm_load(m[3]), _mm_shuffle1_ps1(x0, 3))); _mm_store_ps(dest, _mm_add_ps(x1, x2)); @@ -108,10 +108,10 @@ glm_mat4_det_sse2(mat4 mat) { __m128 r0, r1, r2, r3, x0, x1, x2; /* 127 <- 0, [square] det(A) = det(At) */ - r0 = 
_mm_load_ps(mat[0]); /* d c b a */ - r1 = _mm_load_ps(mat[1]); /* h g f e */ - r2 = _mm_load_ps(mat[2]); /* l k j i */ - r3 = _mm_load_ps(mat[3]); /* p o n m */ + r0 = glmm_load(mat[0]); /* d c b a */ + r1 = glmm_load(mat[1]); /* h g f e */ + r2 = glmm_load(mat[2]); /* l k j i */ + r3 = glmm_load(mat[3]); /* p o n m */ /* t[1] = j * p - n * l; @@ -166,10 +166,10 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) { x0, x1, x2, x3, x4, x5, x6, x7; /* 127 <- 0 */ - r0 = _mm_load_ps(mat[0]); /* d c b a */ - r1 = _mm_load_ps(mat[1]); /* h g f e */ - r2 = _mm_load_ps(mat[2]); /* l k j i */ - r3 = _mm_load_ps(mat[3]); /* p o n m */ + r0 = glmm_load(mat[0]); /* d c b a */ + r1 = glmm_load(mat[1]); /* h g f e */ + r2 = glmm_load(mat[2]); /* l k j i */ + r3 = glmm_load(mat[3]); /* p o n m */ x0 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2)); /* p o l k */ x1 = _mm_shuffle1_ps(x0, 1, 3, 3, 3); /* l p p p */ @@ -290,10 +290,10 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) { x0, x1, x2, x3, x4, x5, x6, x7; /* 127 <- 0 */ - r0 = _mm_load_ps(mat[0]); /* d c b a */ - r1 = _mm_load_ps(mat[1]); /* h g f e */ - r2 = _mm_load_ps(mat[2]); /* l k j i */ - r3 = _mm_load_ps(mat[3]); /* p o n m */ + r0 = glmm_load(mat[0]); /* d c b a */ + r1 = glmm_load(mat[1]); /* h g f e */ + r2 = glmm_load(mat[2]); /* l k j i */ + r3 = glmm_load(mat[3]); /* p o n m */ x0 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2)); /* p o l k */ x1 = _mm_shuffle1_ps(x0, 1, 3, 3, 3); /* l p p p */ diff --git a/include/cglm/simd/sse2/quat.h b/include/cglm/simd/sse2/quat.h index 5dbf759..4970eff 100644 --- a/include/cglm/simd/sse2/quat.h +++ b/include/cglm/simd/sse2/quat.h @@ -24,8 +24,8 @@ glm_quat_mul_sse2(versor p, versor q, versor dest) { __m128 xp, xq, x0, r; - xp = _mm_load_ps(p); /* 3 2 1 0 */ - xq = _mm_load_ps(q); + xp = glmm_load(p); /* 3 2 1 0 */ + xq = glmm_load(q); r = _mm_mul_ps(_mm_shuffle1_ps1(xp, 3), xq); diff --git a/include/cglm/vec4-ext.h b/include/cglm/vec4-ext.h index 7a6cb3d..1055ebe 100644 --- a/include/cglm/vec4-ext.h +++ b/include/cglm/vec4-ext.h @@ -42,7 +42,7 @@ CGLM_INLINE void glm_vec4_mulv(vec4 a, vec4 b, vec4 d) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(d, _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b))); + _mm_store_ps(d, _mm_mul_ps(glmm_load(a), glmm_load(b))); #else d[0] = a[0] * b[0]; d[1] = a[1] * b[1]; @@ -223,7 +223,7 @@ glm_vec4_sign(vec4 v, vec4 dest) { #if defined( __SSE2__ ) || defined( __SSE2__ ) __m128 x0, x1, x2, x3, x4; - x0 = _mm_load_ps(v); + x0 = glmm_load(v); x1 = _mm_set_ps(0.0f, 0.0f, 1.0f, -1.0f); x2 = _mm_shuffle1_ps1(x1, 2); @@ -249,7 +249,7 @@ CGLM_INLINE void glm_vec4_sqrt(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_sqrt_ps(_mm_load_ps(v))); + _mm_store_ps(dest, _mm_sqrt_ps(glmm_load(v))); #else dest[0] = sqrtf(v[0]); dest[1] = sqrtf(v[1]); diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h index 2e9ca45..b2a9b97 100644 --- a/include/cglm/vec4.h +++ b/include/cglm/vec4.h @@ -111,7 +111,7 @@ CGLM_INLINE void glm_vec4_copy(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_load_ps(v)); + _mm_store_ps(dest, glmm_load(v)); #else dest[0] = v[0]; dest[1] = v[1]; @@ -169,7 +169,7 @@ float glm_vec4_dot(vec4 a, vec4 b) { #if defined( __SSE__ ) || defined( __SSE2__ ) __m128 x0; - x0 = _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b)); + x0 = _mm_mul_ps(glmm_load(a), glmm_load(b)); x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 3, 2)); return _mm_cvtss_f32(_mm_add_ss(x0, _mm_shuffle1_ps(x0, 0, 1, 0, 
1))); #else @@ -193,7 +193,7 @@ float glm_vec4_norm2(vec4 v) { #if defined( __SSE__ ) || defined( __SSE2__ ) __m128 x0; - x0 = _mm_load_ps(v); + x0 = glmm_load(v); x0 = _mm_mul_ps(x0, x0); x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 3, 2)); return _mm_cvtss_f32(_mm_add_ss(x0, _mm_shuffle1_ps(x0, 0, 1, 0, 1))); @@ -214,7 +214,7 @@ float glm_vec4_norm(vec4 vec) { #if defined( __SSE__ ) || defined( __SSE2__ ) __m128 x0; - x0 = _mm_load_ps(vec); + x0 = glmm_load(vec); return _mm_cvtss_f32(_mm_sqrt_ss(glm_simd_dot(x0, x0))); #else return sqrtf(glm_vec4_norm2(vec)); @@ -232,7 +232,7 @@ CGLM_INLINE void glm_vec4_add(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(_mm_load_ps(a), _mm_load_ps(b))); + _mm_store_ps(dest, _mm_add_ps(glmm_load(a), glmm_load(b))); #else dest[0] = a[0] + b[0]; dest[1] = a[1] + b[1]; @@ -252,7 +252,7 @@ CGLM_INLINE void glm_vec4_adds(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(_mm_load_ps(v), _mm_set1_ps(s))); + _mm_store_ps(dest, _mm_add_ps(glmm_load(v), _mm_set1_ps(s))); #else dest[0] = v[0] + s; dest[1] = v[1] + s; @@ -272,7 +272,7 @@ CGLM_INLINE void glm_vec4_sub(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_sub_ps(_mm_load_ps(a), _mm_load_ps(b))); + _mm_store_ps(dest, _mm_sub_ps(glmm_load(a), glmm_load(b))); #else dest[0] = a[0] - b[0]; dest[1] = a[1] - b[1]; @@ -292,7 +292,7 @@ CGLM_INLINE void glm_vec4_subs(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_sub_ps(_mm_load_ps(v), _mm_set1_ps(s))); + _mm_store_ps(dest, _mm_sub_ps(glmm_load(v), _mm_set1_ps(s))); #else dest[0] = v[0] - s; dest[1] = v[1] - s; @@ -312,7 +312,7 @@ CGLM_INLINE void glm_vec4_mul(vec4 a, vec4 b, vec4 d) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(d, _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b))); + _mm_store_ps(d, _mm_mul_ps(glmm_load(a), glmm_load(b))); #else d[0] = a[0] * b[0]; d[1] = a[1] * b[1]; @@ -332,7 +332,7 @@ CGLM_INLINE void glm_vec4_scale(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_mul_ps(_mm_load_ps(v), _mm_set1_ps(s))); + _mm_store_ps(dest, _mm_mul_ps(glmm_load(v), _mm_set1_ps(s))); #else dest[0] = v[0] * s; dest[1] = v[1] * s; @@ -373,7 +373,7 @@ CGLM_INLINE void glm_vec4_div(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_div_ps(_mm_load_ps(a), _mm_load_ps(b))); + _mm_store_ps(dest, _mm_div_ps(glmm_load(a), glmm_load(b))); #else dest[0] = a[0] / b[0]; dest[1] = a[1] / b[1]; @@ -393,7 +393,7 @@ CGLM_INLINE void glm_vec4_divs(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_div_ps(_mm_load_ps(v), _mm_set1_ps(s))); + _mm_store_ps(dest, _mm_div_ps(glmm_load(v), _mm_set1_ps(s))); #else glm_vec4_scale(v, 1.0f / s, dest); #endif @@ -413,9 +413,9 @@ CGLM_INLINE void glm_vec4_addadd(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(_mm_load_ps(dest), - _mm_add_ps(_mm_load_ps(a), - _mm_load_ps(b)))); + _mm_store_ps(dest, _mm_add_ps(glmm_load(dest), + _mm_add_ps(glmm_load(a), + glmm_load(b)))); #else dest[0] += a[0] + b[0]; dest[1] += a[1] + b[1]; @@ -437,9 +437,9 @@ CGLM_INLINE void glm_vec4_subadd(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(_mm_load_ps(dest), - 
_mm_sub_ps(_mm_load_ps(a), - _mm_load_ps(b)))); + _mm_store_ps(dest, _mm_add_ps(glmm_load(dest), + _mm_sub_ps(glmm_load(a), + glmm_load(b)))); #else dest[0] += a[0] - b[0]; dest[1] += a[1] - b[1]; @@ -461,9 +461,9 @@ CGLM_INLINE void glm_vec4_muladd(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(_mm_load_ps(dest), - _mm_mul_ps(_mm_load_ps(a), - _mm_load_ps(b)))); + _mm_store_ps(dest, _mm_add_ps(glmm_load(dest), + _mm_mul_ps(glmm_load(a), + glmm_load(b)))); #else dest[0] += a[0] * b[0]; dest[1] += a[1] * b[1]; @@ -485,8 +485,8 @@ CGLM_INLINE void glm_vec4_muladds(vec4 a, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(_mm_load_ps(dest), - _mm_mul_ps(_mm_load_ps(a), + _mm_store_ps(dest, _mm_add_ps(glmm_load(dest), + _mm_mul_ps(glmm_load(a), _mm_set1_ps(s)))); #else dest[0] += a[0] * s; @@ -505,7 +505,7 @@ CGLM_INLINE void glm_vec4_flipsign(vec4 v) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(v, _mm_xor_ps(_mm_load_ps(v), _mm_set1_ps(-0.0f))); + _mm_store_ps(v, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f))); #else v[0] = -v[0]; v[1] = -v[1]; @@ -524,7 +524,7 @@ CGLM_INLINE void glm_vec4_flipsign_to(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_xor_ps(_mm_load_ps(v), + _mm_store_ps(dest, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f))); #else dest[0] = -v[0]; @@ -571,7 +571,7 @@ glm_vec4_normalize_to(vec4 vec, vec4 dest) { __m128 xdot, x0; float dot; - x0 = _mm_load_ps(vec); + x0 = glmm_load(vec); xdot = glm_simd_dot(x0, x0); dot = _mm_cvtss_f32(xdot); @@ -633,7 +633,7 @@ CGLM_INLINE void glm_vec4_maxv(vec4 v1, vec4 v2, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_max_ps(_mm_load_ps(v1), _mm_load_ps(v2))); + _mm_store_ps(dest, _mm_max_ps(glmm_load(v1), glmm_load(v2))); #else dest[0] = glm_max(v1[0], v2[0]); dest[1] = glm_max(v1[1], v2[1]); @@ -653,7 +653,7 @@ CGLM_INLINE void glm_vec4_minv(vec4 v1, vec4 v2, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_min_ps(_mm_load_ps(v1), _mm_load_ps(v2))); + _mm_store_ps(dest, _mm_min_ps(glmm_load(v1), glmm_load(v2))); #else dest[0] = glm_min(v1[0], v2[0]); dest[1] = glm_min(v1[1], v2[1]); @@ -673,7 +673,7 @@ CGLM_INLINE void glm_vec4_clamp(vec4 v, float minVal, float maxVal) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(v, _mm_min_ps(_mm_max_ps(_mm_load_ps(v), _mm_set1_ps(minVal)), + _mm_store_ps(v, _mm_min_ps(_mm_max_ps(glmm_load(v), _mm_set1_ps(minVal)), _mm_set1_ps(maxVal))); #else v[0] = glm_clamp(v[0], minVal, maxVal); From 568001d26a5b047a2e1ac438497e8a778048fb8f Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 8 May 2018 15:31:09 +0300 Subject: [PATCH 02/11] simd, sse2: make alignment optional for store operations --- include/cglm/affine.h | 62 +++++++++++++-------------- include/cglm/mat4.h | 8 ++-- include/cglm/quat.h | 2 +- include/cglm/simd/sse2/affine.h | 68 +++++++++++++++--------------- include/cglm/simd/sse2/mat4.h | 74 ++++++++++++++++----------------- include/cglm/simd/sse2/quat.h | 2 +- include/cglm/vec4-ext.h | 8 ++-- include/cglm/vec4.h | 63 ++++++++++++++-------------- 8 files changed, 143 insertions(+), 144 deletions(-) diff --git a/include/cglm/affine.h b/include/cglm/affine.h index b200f30..dd7dbd1 100644 --- a/include/cglm/affine.h +++ b/include/cglm/affine.h @@ -58,19 +58,19 @@ glm_translate_to(mat4 m, vec3 v, mat4 dest) { mat4 t = GLM_MAT4_IDENTITY_INIT; #if 
defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest[3], - _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_load(t[0]), - _mm_set1_ps(v[0])), - _mm_mul_ps(glmm_load(t[1]), - _mm_set1_ps(v[1]))), - _mm_add_ps(_mm_mul_ps(glmm_load(t[2]), - _mm_set1_ps(v[2])), - glmm_load(t[3])))) + glmm_store(dest[3], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_load(t[0]), + _mm_set1_ps(v[0])), + _mm_mul_ps(glmm_load(t[1]), + _mm_set1_ps(v[1]))), + _mm_add_ps(_mm_mul_ps(glmm_load(t[2]), + _mm_set1_ps(v[2])), + glmm_load(t[3])))) ; - _mm_store_ps(dest[0], glmm_load(m[0])); - _mm_store_ps(dest[1], glmm_load(m[1])); - _mm_store_ps(dest[2], glmm_load(m[2])); + glmm_store(dest[0], glmm_load(m[0])); + glmm_store(dest[1], glmm_load(m[1])); + glmm_store(dest[2], glmm_load(m[2])); #else vec4 v1, v2, v3; @@ -97,14 +97,14 @@ CGLM_INLINE void glm_translate(mat4 m, vec3 v) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(m[3], - _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_load(m[0]), - _mm_set1_ps(v[0])), - _mm_mul_ps(glmm_load(m[1]), - _mm_set1_ps(v[1]))), - _mm_add_ps(_mm_mul_ps(glmm_load(m[2]), - _mm_set1_ps(v[2])), - glmm_load(m[3])))) + glmm_store(m[3], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_load(m[0]), + _mm_set1_ps(v[0])), + _mm_mul_ps(glmm_load(m[1]), + _mm_set1_ps(v[1]))), + _mm_add_ps(_mm_mul_ps(glmm_load(m[2]), + _mm_set1_ps(v[2])), + glmm_load(m[3])))) ; #else vec4 v1, v2, v3; @@ -129,10 +129,10 @@ CGLM_INLINE void glm_translate_x(mat4 m, float x) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(m[3], - _mm_add_ps(_mm_mul_ps(glmm_load(m[0]), - _mm_set1_ps(x)), - glmm_load(m[3]))) + glmm_store(m[3], + _mm_add_ps(_mm_mul_ps(glmm_load(m[0]), + _mm_set1_ps(x)), + glmm_load(m[3]))) ; #else vec4 v1; @@ -151,10 +151,10 @@ CGLM_INLINE void glm_translate_y(mat4 m, float y) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(m[3], - _mm_add_ps(_mm_mul_ps(glmm_load(m[1]), - _mm_set1_ps(y)), - glmm_load(m[3]))) + glmm_store(m[3], + _mm_add_ps(_mm_mul_ps(glmm_load(m[1]), + _mm_set1_ps(y)), + glmm_load(m[3]))) ; #else vec4 v1; @@ -173,10 +173,10 @@ CGLM_INLINE void glm_translate_z(mat4 m, float z) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(m[3], - _mm_add_ps(_mm_mul_ps(glmm_load(m[2]), - _mm_set1_ps(z)), - glmm_load(m[3]))) + glmm_store(m[3], + _mm_add_ps(_mm_mul_ps(glmm_load(m[2]), + _mm_set1_ps(z)), + glmm_load(m[3]))) ; #else vec4 v1; diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h index 8cafb7c..88563cb 100644 --- a/include/cglm/mat4.h +++ b/include/cglm/mat4.h @@ -113,10 +113,10 @@ glm_mat4_copy(mat4 mat, mat4 dest) { _mm256_store_ps(dest[0], _mm256_load_ps(mat[0])); _mm256_store_ps(dest[2], _mm256_load_ps(mat[2])); #elif defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest[0], glmm_load(mat[0])); - _mm_store_ps(dest[1], glmm_load(mat[1])); - _mm_store_ps(dest[2], glmm_load(mat[2])); - _mm_store_ps(dest[3], glmm_load(mat[3])); + glmm_store(dest[0], glmm_load(mat[0])); + glmm_store(dest[1], glmm_load(mat[1])); + glmm_store(dest[2], glmm_load(mat[2])); + glmm_store(dest[3], glmm_load(mat[3])); #else glm_mat4_ucopy(mat, dest); #endif diff --git a/include/cglm/quat.h b/include/cglm/quat.h index 6bff527..0ba2cf6 100644 --- a/include/cglm/quat.h +++ b/include/cglm/quat.h @@ -207,7 +207,7 @@ glm_quat_normalize_to(versor q, versor dest) { return; } - _mm_store_ps(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot))); + glmm_store(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot))); #else float dot; diff --git a/include/cglm/simd/sse2/affine.h b/include/cglm/simd/sse2/affine.h index 
df8c166..c0c9c04 100644 --- a/include/cglm/simd/sse2/affine.h +++ b/include/cglm/simd/sse2/affine.h @@ -24,29 +24,29 @@ glm_mul_sse2(mat4 m1, mat4 m2, mat4 dest) { l3 = glmm_load(m1[3]); r = glmm_load(m2[0]); - _mm_store_ps(dest[0], - _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), - _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), - _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); + glmm_store(dest[0], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), + _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), + _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); r = glmm_load(m2[1]); - _mm_store_ps(dest[1], - _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), - _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), - _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); + glmm_store(dest[1], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), + _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), + _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); r = glmm_load(m2[2]); - _mm_store_ps(dest[2], - _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), - _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), - _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); + glmm_store(dest[2], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), + _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), + _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); r = glmm_load(m2[3]); - _mm_store_ps(dest[3], - _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), - _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), - _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), - _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); + glmm_store(dest[3], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), + _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), + _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), + _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); } CGLM_INLINE @@ -61,24 +61,24 @@ glm_mul_rot_sse2(mat4 m1, mat4 m2, mat4 dest) { l3 = glmm_load(m1[3]); r = glmm_load(m2[0]); - _mm_store_ps(dest[0], - _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), - _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), - _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); + glmm_store(dest[0], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), + _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), + _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); r = glmm_load(m2[1]); - _mm_store_ps(dest[1], - _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), - _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), - _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); + glmm_store(dest[1], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), + _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), + _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); r = glmm_load(m2[2]); - _mm_store_ps(dest[2], - _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), - _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), - _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); + glmm_store(dest[2], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), + _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), + _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); - _mm_store_ps(dest[3], l3); + glmm_store(dest[3], l3); } CGLM_INLINE @@ -101,10 +101,10 @@ glm_inv_tr_sse2(mat4 mat) { x0 = _mm_add_ps(x0, x1); - _mm_store_ps(mat[0], r0); - _mm_store_ps(mat[1], r1); - _mm_store_ps(mat[2], r2); - _mm_store_ps(mat[3], x0); + glmm_store(mat[0], r0); + glmm_store(mat[1], r1); + glmm_store(mat[2], r2); + glmm_store(mat[3], x0); } #endif diff --git a/include/cglm/simd/sse2/mat4.h b/include/cglm/simd/sse2/mat4.h index 404b496..1f82c08 100644 --- a/include/cglm/simd/sse2/mat4.h +++ b/include/cglm/simd/sse2/mat4.h @@ -20,10 +20,10 
@@ glm_mat4_scale_sse2(mat4 m, float s){ __m128 x0; x0 = _mm_set1_ps(s); - _mm_store_ps(m[0], _mm_mul_ps(glmm_load(m[0]), x0)); - _mm_store_ps(m[1], _mm_mul_ps(glmm_load(m[1]), x0)); - _mm_store_ps(m[2], _mm_mul_ps(glmm_load(m[2]), x0)); - _mm_store_ps(m[3], _mm_mul_ps(glmm_load(m[3]), x0)); + glmm_store(m[0], _mm_mul_ps(glmm_load(m[0]), x0)); + glmm_store(m[1], _mm_mul_ps(glmm_load(m[1]), x0)); + glmm_store(m[2], _mm_mul_ps(glmm_load(m[2]), x0)); + glmm_store(m[3], _mm_mul_ps(glmm_load(m[3]), x0)); } CGLM_INLINE @@ -38,10 +38,10 @@ glm_mat4_transp_sse2(mat4 m, mat4 dest){ _MM_TRANSPOSE4_PS(r0, r1, r2, r3); - _mm_store_ps(dest[0], r0); - _mm_store_ps(dest[1], r1); - _mm_store_ps(dest[2], r2); - _mm_store_ps(dest[3], r3); + glmm_store(dest[0], r0); + glmm_store(dest[1], r1); + glmm_store(dest[2], r2); + glmm_store(dest[3], r3); } CGLM_INLINE @@ -57,30 +57,30 @@ glm_mat4_mul_sse2(mat4 m1, mat4 m2, mat4 dest) { l3 = glmm_load(m1[3]); r = glmm_load(m2[0]); - _mm_store_ps(dest[0], - _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), - _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), - _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), - _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); + glmm_store(dest[0], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), + _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), + _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), + _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); r = glmm_load(m2[1]); - _mm_store_ps(dest[1], - _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), - _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), - _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), - _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); + glmm_store(dest[1], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), + _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), + _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), + _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); r = glmm_load(m2[2]); - _mm_store_ps(dest[2], - _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), - _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), - _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), - _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); + glmm_store(dest[2], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), + _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), + _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), + _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); r = glmm_load(m2[3]); - _mm_store_ps(dest[3], - _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), - _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), - _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), - _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); + glmm_store(dest[3], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), + _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), + _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), + _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); } CGLM_INLINE @@ -99,7 +99,7 @@ glm_mat4_mulv_sse2(mat4 m, vec4 v, vec4 dest) { _mm_mul_ps(glmm_load(m[3]), _mm_shuffle1_ps1(x0, 3))); - _mm_store_ps(dest, _mm_add_ps(x1, x2)); + glmm_store(dest, _mm_add_ps(x1, x2)); } CGLM_INLINE @@ -275,10 +275,10 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) { x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 0, 1)); x0 = _mm_rcp_ps(x0); - _mm_store_ps(dest[0], _mm_mul_ps(v0, x0)); - _mm_store_ps(dest[1], _mm_mul_ps(v1, x0)); - _mm_store_ps(dest[2], _mm_mul_ps(v2, x0)); - _mm_store_ps(dest[3], _mm_mul_ps(v3, x0)); + glmm_store(dest[0], _mm_mul_ps(v0, x0)); + glmm_store(dest[1], _mm_mul_ps(v1, x0)); + glmm_store(dest[2], _mm_mul_ps(v2, x0)); + 
glmm_store(dest[3], _mm_mul_ps(v3, x0)); } CGLM_INLINE @@ -399,10 +399,10 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) { x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 0, 1)); x0 = _mm_div_ps(_mm_set1_ps(1.0f), x0); - _mm_store_ps(dest[0], _mm_mul_ps(v0, x0)); - _mm_store_ps(dest[1], _mm_mul_ps(v1, x0)); - _mm_store_ps(dest[2], _mm_mul_ps(v2, x0)); - _mm_store_ps(dest[3], _mm_mul_ps(v3, x0)); + glmm_store(dest[0], _mm_mul_ps(v0, x0)); + glmm_store(dest[1], _mm_mul_ps(v1, x0)); + glmm_store(dest[2], _mm_mul_ps(v2, x0)); + glmm_store(dest[3], _mm_mul_ps(v3, x0)); } #endif diff --git a/include/cglm/simd/sse2/quat.h b/include/cglm/simd/sse2/quat.h index 4970eff..a8b517c 100644 --- a/include/cglm/simd/sse2/quat.h +++ b/include/cglm/simd/sse2/quat.h @@ -38,7 +38,7 @@ glm_quat_mul_sse2(versor p, versor q, versor dest) { x0 = _mm_xor_ps(_mm_shuffle1_ps1(xp, 2), _mm_set_ps(-0.f, 0.f, 0.f, -0.f)); r = _mm_add_ps(r, _mm_mul_ps(x0, _mm_shuffle1_ps(xq, 2, 3, 0, 1))); - _mm_store_ps(dest, r); + glmm_store(dest, r); } diff --git a/include/cglm/vec4-ext.h b/include/cglm/vec4-ext.h index 1055ebe..94150da 100644 --- a/include/cglm/vec4-ext.h +++ b/include/cglm/vec4-ext.h @@ -42,7 +42,7 @@ CGLM_INLINE void glm_vec4_mulv(vec4 a, vec4 b, vec4 d) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(d, _mm_mul_ps(glmm_load(a), glmm_load(b))); + glmm_store(d, _mm_mul_ps(glmm_load(a), glmm_load(b))); #else d[0] = a[0] * b[0]; d[1] = a[1] * b[1]; @@ -61,7 +61,7 @@ CGLM_INLINE void glm_vec4_broadcast(float val, vec4 d) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(d, _mm_set1_ps(val)); + glmm_store(d, _mm_set1_ps(val)); #else d[0] = d[1] = d[2] = d[3] = val; #endif @@ -230,7 +230,7 @@ glm_vec4_sign(vec4 v, vec4 dest) { x3 = _mm_and_ps(_mm_cmpgt_ps(x0, x2), _mm_shuffle1_ps1(x1, 1)); x4 = _mm_and_ps(_mm_cmplt_ps(x0, x2), _mm_shuffle1_ps1(x1, 0)); - _mm_store_ps(dest, _mm_or_ps(x3, x4)); + glmm_store(dest, _mm_or_ps(x3, x4)); #else dest[0] = glm_signf(v[0]); dest[1] = glm_signf(v[1]); @@ -249,7 +249,7 @@ CGLM_INLINE void glm_vec4_sqrt(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_sqrt_ps(glmm_load(v))); + glmm_store(dest, _mm_sqrt_ps(glmm_load(v))); #else dest[0] = sqrtf(v[0]); dest[1] = sqrtf(v[1]); diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h index b2a9b97..912932f 100644 --- a/include/cglm/vec4.h +++ b/include/cglm/vec4.h @@ -111,7 +111,7 @@ CGLM_INLINE void glm_vec4_copy(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, glmm_load(v)); + glmm_store(dest, glmm_load(v)); #else dest[0] = v[0]; dest[1] = v[1]; @@ -129,7 +129,7 @@ CGLM_INLINE void glm_vec4_zero(vec4 v) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(v, _mm_setzero_ps()); + glmm_store(v, _mm_setzero_ps()); #else v[0] = 0.0f; v[1] = 0.0f; @@ -147,7 +147,7 @@ CGLM_INLINE void glm_vec4_one(vec4 v) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(v, _mm_set1_ps(1.0f)); + glmm_store(v, _mm_set1_ps(1.0f)); #else v[0] = 1.0f; v[1] = 1.0f; @@ -232,7 +232,7 @@ CGLM_INLINE void glm_vec4_add(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(glmm_load(a), glmm_load(b))); + glmm_store(dest, _mm_add_ps(glmm_load(a), glmm_load(b))); #else dest[0] = a[0] + b[0]; dest[1] = a[1] + b[1]; @@ -252,7 +252,7 @@ CGLM_INLINE void glm_vec4_adds(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(glmm_load(v), _mm_set1_ps(s))); 
+ glmm_store(dest, _mm_add_ps(glmm_load(v), _mm_set1_ps(s))); #else dest[0] = v[0] + s; dest[1] = v[1] + s; @@ -272,7 +272,7 @@ CGLM_INLINE void glm_vec4_sub(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_sub_ps(glmm_load(a), glmm_load(b))); + glmm_store(dest, _mm_sub_ps(glmm_load(a), glmm_load(b))); #else dest[0] = a[0] - b[0]; dest[1] = a[1] - b[1]; @@ -292,7 +292,7 @@ CGLM_INLINE void glm_vec4_subs(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_sub_ps(glmm_load(v), _mm_set1_ps(s))); + glmm_store(dest, _mm_sub_ps(glmm_load(v), _mm_set1_ps(s))); #else dest[0] = v[0] - s; dest[1] = v[1] - s; @@ -312,7 +312,7 @@ CGLM_INLINE void glm_vec4_mul(vec4 a, vec4 b, vec4 d) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(d, _mm_mul_ps(glmm_load(a), glmm_load(b))); + glmm_store(d, _mm_mul_ps(glmm_load(a), glmm_load(b))); #else d[0] = a[0] * b[0]; d[1] = a[1] * b[1]; @@ -332,7 +332,7 @@ CGLM_INLINE void glm_vec4_scale(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_mul_ps(glmm_load(v), _mm_set1_ps(s))); + glmm_store(dest, _mm_mul_ps(glmm_load(v), _mm_set1_ps(s))); #else dest[0] = v[0] * s; dest[1] = v[1] * s; @@ -373,7 +373,7 @@ CGLM_INLINE void glm_vec4_div(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_div_ps(glmm_load(a), glmm_load(b))); + glmm_store(dest, _mm_div_ps(glmm_load(a), glmm_load(b))); #else dest[0] = a[0] / b[0]; dest[1] = a[1] / b[1]; @@ -393,7 +393,7 @@ CGLM_INLINE void glm_vec4_divs(vec4 v, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_div_ps(glmm_load(v), _mm_set1_ps(s))); + glmm_store(dest, _mm_div_ps(glmm_load(v), _mm_set1_ps(s))); #else glm_vec4_scale(v, 1.0f / s, dest); #endif @@ -413,9 +413,9 @@ CGLM_INLINE void glm_vec4_addadd(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(glmm_load(dest), - _mm_add_ps(glmm_load(a), - glmm_load(b)))); + glmm_store(dest, _mm_add_ps(glmm_load(dest), + _mm_add_ps(glmm_load(a), + glmm_load(b)))); #else dest[0] += a[0] + b[0]; dest[1] += a[1] + b[1]; @@ -437,9 +437,9 @@ CGLM_INLINE void glm_vec4_subadd(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(glmm_load(dest), - _mm_sub_ps(glmm_load(a), - glmm_load(b)))); + glmm_store(dest, _mm_add_ps(glmm_load(dest), + _mm_sub_ps(glmm_load(a), + glmm_load(b)))); #else dest[0] += a[0] - b[0]; dest[1] += a[1] - b[1]; @@ -461,9 +461,9 @@ CGLM_INLINE void glm_vec4_muladd(vec4 a, vec4 b, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(glmm_load(dest), - _mm_mul_ps(glmm_load(a), - glmm_load(b)))); + glmm_store(dest, _mm_add_ps(glmm_load(dest), + _mm_mul_ps(glmm_load(a), + glmm_load(b)))); #else dest[0] += a[0] * b[0]; dest[1] += a[1] * b[1]; @@ -485,9 +485,9 @@ CGLM_INLINE void glm_vec4_muladds(vec4 a, float s, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_add_ps(glmm_load(dest), - _mm_mul_ps(glmm_load(a), - _mm_set1_ps(s)))); + glmm_store(dest, _mm_add_ps(glmm_load(dest), + _mm_mul_ps(glmm_load(a), + _mm_set1_ps(s)))); #else dest[0] += a[0] * s; dest[1] += a[1] * s; @@ -505,7 +505,7 @@ CGLM_INLINE void glm_vec4_flipsign(vec4 v) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(v, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f))); + 
glmm_store(v, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f))); #else v[0] = -v[0]; v[1] = -v[1]; @@ -524,8 +524,7 @@ CGLM_INLINE void glm_vec4_flipsign_to(vec4 v, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_xor_ps(glmm_load(v), - _mm_set1_ps(-0.0f))); + glmm_store(dest, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f))); #else dest[0] = -v[0]; dest[1] = -v[1]; @@ -576,11 +575,11 @@ glm_vec4_normalize_to(vec4 vec, vec4 dest) { dot = _mm_cvtss_f32(xdot); if (dot == 0.0f) { - _mm_store_ps(dest, _mm_setzero_ps()); + glmm_store(dest, _mm_setzero_ps()); return; } - _mm_store_ps(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot))); + glmm_store(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot))); #else float norm; @@ -633,7 +632,7 @@ CGLM_INLINE void glm_vec4_maxv(vec4 v1, vec4 v2, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_max_ps(glmm_load(v1), glmm_load(v2))); + glmm_store(dest, _mm_max_ps(glmm_load(v1), glmm_load(v2))); #else dest[0] = glm_max(v1[0], v2[0]); dest[1] = glm_max(v1[1], v2[1]); @@ -653,7 +652,7 @@ CGLM_INLINE void glm_vec4_minv(vec4 v1, vec4 v2, vec4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(dest, _mm_min_ps(glmm_load(v1), glmm_load(v2))); + glmm_store(dest, _mm_min_ps(glmm_load(v1), glmm_load(v2))); #else dest[0] = glm_min(v1[0], v2[0]); dest[1] = glm_min(v1[1], v2[1]); @@ -673,8 +672,8 @@ CGLM_INLINE void glm_vec4_clamp(vec4 v, float minVal, float maxVal) { #if defined( __SSE__ ) || defined( __SSE2__ ) - _mm_store_ps(v, _mm_min_ps(_mm_max_ps(glmm_load(v), _mm_set1_ps(minVal)), - _mm_set1_ps(maxVal))); + glmm_store(v, _mm_min_ps(_mm_max_ps(glmm_load(v), _mm_set1_ps(minVal)), + _mm_set1_ps(maxVal))); #else v[0] = glm_clamp(v[0], minVal, maxVal); v[1] = glm_clamp(v[1], minVal, maxVal); From 56f0bb0928e127ed993ec6e46aef9ad7d8cfd819 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 8 May 2018 15:35:17 +0300 Subject: [PATCH 03/11] simd, avx: make alignment optional for load/store operations --- include/cglm/mat4.h | 4 ++-- include/cglm/simd/avx/affine.h | 26 +++++++++++++------------- include/cglm/simd/avx/mat4.h | 28 ++++++++++++++-------------- include/cglm/simd/intrin.h | 17 +++++++++++++---- 4 files changed, 42 insertions(+), 33 deletions(-) diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h index 88563cb..f0b6736 100644 --- a/include/cglm/mat4.h +++ b/include/cglm/mat4.h @@ -110,8 +110,8 @@ CGLM_INLINE void glm_mat4_copy(mat4 mat, mat4 dest) { #ifdef __AVX__ - _mm256_store_ps(dest[0], _mm256_load_ps(mat[0])); - _mm256_store_ps(dest[2], _mm256_load_ps(mat[2])); + glmm_store256(dest[0], glmm_load256(mat[0])); + glmm_store256(dest[2], glmm_load256(mat[2])); #elif defined( __SSE__ ) || defined( __SSE2__ ) glmm_store(dest[0], glmm_load(mat[0])); glmm_store(dest[1], glmm_load(mat[1])); diff --git a/include/cglm/simd/avx/affine.h b/include/cglm/simd/avx/affine.h index 1b0dcea..5c7f71c 100644 --- a/include/cglm/simd/avx/affine.h +++ b/include/cglm/simd/avx/affine.h @@ -21,11 +21,11 @@ glm_mul_avx(mat4 m1, mat4 m2, mat4 dest) { __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; - y0 = _mm256_load_ps(m2[0]); /* h g f e d c b a */ - y1 = _mm256_load_ps(m2[2]); /* p o n m l k j i */ + y0 = glmm_load256(m2[0]); /* h g f e d c b a */ + y1 = glmm_load256(m2[2]); /* p o n m l k j i */ - y2 = _mm256_load_ps(m1[0]); /* h g f e d c b a */ - y3 = _mm256_load_ps(m1[2]); /* p o n m l k j i */ + y2 = glmm_load256(m1[0]); /* h g f e d c b a */ + y3 = glmm_load256(m1[2]); /* p o n m l k j i */ y4 = 
_mm256_permute2f128_ps(y2, y2, 0b00000011); /* d c b a h g f e */ y5 = _mm256_permute2f128_ps(y3, y3, 0b00000000); /* l k j i l k j i */ @@ -37,10 +37,10 @@ glm_mul_avx(mat4 m1, mat4 m2, mat4 dest) { y6 = _mm256_permutevar_ps(y0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1)); - _mm256_store_ps(dest[0], - _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), - _mm256_mul_ps(y4, y8)), - _mm256_mul_ps(y5, y7))); + glmm_store256(dest[0], + _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), + _mm256_mul_ps(y4, y8)), + _mm256_mul_ps(y5, y7))); /* n n n n i i i i */ @@ -52,11 +52,11 @@ glm_mul_avx(mat4 m1, mat4 m2, mat4 dest) { y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1)); y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3)); - _mm256_store_ps(dest[2], - _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), - _mm256_mul_ps(y3, y7)), - _mm256_add_ps(_mm256_mul_ps(y4, y8), - _mm256_mul_ps(y5, y9)))); + glmm_store256(dest[2], + _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), + _mm256_mul_ps(y3, y7)), + _mm256_add_ps(_mm256_mul_ps(y4, y8), + _mm256_mul_ps(y5, y9)))); } #endif diff --git a/include/cglm/simd/avx/mat4.h b/include/cglm/simd/avx/mat4.h index e2ef9da..b5859a7 100644 --- a/include/cglm/simd/avx/mat4.h +++ b/include/cglm/simd/avx/mat4.h @@ -21,11 +21,11 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) { __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; - y0 = _mm256_load_ps(m2[0]); /* h g f e d c b a */ - y1 = _mm256_load_ps(m2[2]); /* p o n m l k j i */ + y0 = glmm_load256(m2[0]); /* h g f e d c b a */ + y1 = glmm_load256(m2[2]); /* p o n m l k j i */ - y2 = _mm256_load_ps(m1[0]); /* h g f e d c b a */ - y3 = _mm256_load_ps(m1[2]); /* p o n m l k j i */ + y2 = glmm_load256(m1[0]); /* h g f e d c b a */ + y3 = glmm_load256(m1[2]); /* p o n m l k j i */ y4 = _mm256_permute2f128_ps(y2, y2, 0b00000011); /* d c b a h g f e */ y5 = _mm256_permute2f128_ps(y3, y3, 0b00000011); /* l k j i p o n m */ @@ -39,11 +39,11 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) { y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1)); y9 = _mm256_permutevar_ps(y0, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3)); - _mm256_store_ps(dest[0], - _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), - _mm256_mul_ps(y3, y7)), - _mm256_add_ps(_mm256_mul_ps(y4, y8), - _mm256_mul_ps(y5, y9)))); + glmm_store256(dest[0], + _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), + _mm256_mul_ps(y3, y7)), + _mm256_add_ps(_mm256_mul_ps(y4, y8), + _mm256_mul_ps(y5, y9)))); /* n n n n i i i i */ /* p p p p k k k k */ @@ -54,11 +54,11 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) { y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1)); y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3)); - _mm256_store_ps(dest[2], - _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), - _mm256_mul_ps(y3, y7)), - _mm256_add_ps(_mm256_mul_ps(y4, y8), - _mm256_mul_ps(y5, y9)))); + glmm_store256(dest[2], + _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), + _mm256_mul_ps(y3, y7)), + _mm256_add_ps(_mm256_mul_ps(y4, y8), + _mm256_mul_ps(y5, y9)))); } #endif diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h index bf1db60..8fd1526 100644 --- a/include/cglm/simd/intrin.h +++ b/include/cglm/simd/intrin.h @@ -70,11 +70,11 @@ glm_simd_store_v3(__m128 vx, vec3 v) { } #ifdef CGLM_ALL_UNALIGNED -#define glmm_load(p) _mm_loadu_ps(p) -#define glmm_store(p, a) _mm_storeu_ps(p, a) +# define 
glmm_load(p) _mm_loadu_ps(p) +# define glmm_store(p, a) _mm_storeu_ps(p, a) #else -#define glmm_load(p) _mm_load_ps(p) -#define glmm_store(p, a) _mm_store_ps(p, a) +# define glmm_load(p) _mm_load_ps(p) +# define glmm_store(p, a) _mm_store_ps(p, a) #endif #endif @@ -86,6 +86,15 @@ glm_simd_store_v3(__m128 vx, vec3 v) { #ifdef __AVX__ # define CGLM_AVX_FP 1 + +#ifdef CGLM_ALL_UNALIGNED +# define glmm_load256(p) _mm256_loadu_ps(p) +# define glmm_store256(p, a) _mm256_storeu_ps(p, a) +#else +# define glmm_load256(p) _mm256_load_ps(p) +# define glmm_store256(p, a) _mm256_store_ps(p, a) +#endif + #endif /* ARM Neon */ From 5dbbd0826d1cd18f6722d193447d2b720aea9bdd Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 8 May 2018 15:55:36 +0300 Subject: [PATCH 04/11] simd: replace glm_simd_ with glmm_ * now glmm_ is used as global simd namescape --- include/cglm/quat.h | 2 +- include/cglm/simd/intrin.h | 10 +++++----- include/cglm/vec4.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/cglm/quat.h b/include/cglm/quat.h index 0ba2cf6..eac853b 100644 --- a/include/cglm/quat.h +++ b/include/cglm/quat.h @@ -199,7 +199,7 @@ glm_quat_normalize_to(versor q, versor dest) { float dot; x0 = glmm_load(q); - xdot = glm_simd_dot(x0, x0); + xdot = glmm_dot(x0, x0); dot = _mm_cvtss_f32(xdot); if (dot <= 0.0f) { diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h index 8fd1526..c9ab352 100644 --- a/include/cglm/simd/intrin.h +++ b/include/cglm/simd/intrin.h @@ -37,7 +37,7 @@ static inline __m128 -glm_simd_dot(__m128 a, __m128 b) { +glmm_dot(__m128 a, __m128 b) { __m128 x0; x0 = _mm_mul_ps(a, b); x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 3, 2)); @@ -46,13 +46,13 @@ glm_simd_dot(__m128 a, __m128 b) { static inline __m128 -glm_simd_norm(__m128 a) { - return _mm_sqrt_ps(glm_simd_dot(a, a)); +glmm_norm(__m128 a) { + return _mm_sqrt_ps(glmm_dot(a, a)); } static inline __m128 -glm_simd_load_v3(vec3 v) { +glmm_load3(vec3 v) { __m128i xy; __m128 z; @@ -64,7 +64,7 @@ glm_simd_load_v3(vec3 v) { static inline void -glm_simd_store_v3(__m128 vx, vec3 v) { +glmm_store3(__m128 vx, vec3 v) { _mm_storel_pi((__m64 *)&v[0], vx); _mm_store_ss(&v[2], _mm_shuffle1_ps(vx, 2, 2, 2, 2)); } diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h index 912932f..b98190b 100644 --- a/include/cglm/vec4.h +++ b/include/cglm/vec4.h @@ -215,7 +215,7 @@ glm_vec4_norm(vec4 vec) { #if defined( __SSE__ ) || defined( __SSE2__ ) __m128 x0; x0 = glmm_load(vec); - return _mm_cvtss_f32(_mm_sqrt_ss(glm_simd_dot(x0, x0))); + return _mm_cvtss_f32(_mm_sqrt_ss(glmm_dot(x0, x0))); #else return sqrtf(glm_vec4_norm2(vec)); #endif @@ -571,7 +571,7 @@ glm_vec4_normalize_to(vec4 vec, vec4 dest) { float dot; x0 = glmm_load(vec); - xdot = glm_simd_dot(x0, x0); + xdot = glmm_dot(x0, x0); dot = _mm_cvtss_f32(xdot); if (dot == 0.0f) { From 835cec2ccb0aaf872c5b10c24feecbd5ae099023 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 8 May 2018 16:26:33 +0300 Subject: [PATCH 05/11] drop alignment requirement if CGLM_ALL_UNALIGNED defined * bring alignment back for visual studio 2017 --- include/cglm/simd/intrin.h | 4 ++-- include/cglm/types.h | 26 +++++++++++++++++++------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h index c9ab352..9fef5a1 100644 --- a/include/cglm/simd/intrin.h +++ b/include/cglm/simd/intrin.h @@ -52,7 +52,7 @@ glmm_norm(__m128 a) { static inline __m128 -glmm_load3(vec3 v) { +glmm_load3(float v[3]) { __m128i xy; __m128 z; @@ 
-64,7 +64,7 @@ glmm_load3(vec3 v) { static inline void -glmm_store3(__m128 vx, vec3 v) { +glmm_store3(__m128 vx, float v[3]) { _mm_storel_pi((__m64 *)&v[0], vx); _mm_store_ss(&v[2], _mm_shuffle1_ps(vx, 2, 2, 2, 2)); } diff --git a/include/cglm/types.h b/include/cglm/types.h index 5d39a55..ad4eb02 100644 --- a/include/cglm/types.h +++ b/include/cglm/types.h @@ -9,20 +9,32 @@ #define cglm_types_h #if defined(_MSC_VER) -# define CGLM_ALIGN(X) /* __declspec(align(X)) */ +#if _MSC_VER < 1914 /* Visual Studio 2017 version 15.7 */ +# define CGLM_ALL_UNALIGNED +/* do not use alignment for older visual studio versions */ +# define CGLM_ALIGN(X) /* __declspec(align(X)) */ +#else +# define CGLM_ALIGN(X) __declspec(align(X)) +#endif #else # define CGLM_ALIGN(X) __attribute((aligned(X))) #endif -typedef float vec2[2]; -typedef CGLM_ALIGN(8) float vec3[3]; -typedef int ivec3[3]; -typedef CGLM_ALIGN(16) float vec4[4]; +#ifndef CGLM_ALL_UNALIGNED +# define CGLM_ALIGN_IF(X) CGLM_ALIGN(X) +#else +# define CGLM_ALIGN_IF(X) /* no alignment */ +#endif + +typedef float vec2[2]; +typedef CGLM_ALIGN_IF(8) float vec3[3]; +typedef int ivec3[3]; +typedef CGLM_ALIGN_IF(16) float vec4[4]; typedef vec3 mat3[3]; -typedef CGLM_ALIGN(16) vec4 mat4[4]; +typedef CGLM_ALIGN_IF(16) vec4 mat4[4]; -typedef vec4 versor; +typedef vec4 versor; #define CGLM_PI (float)M_PI #define CGLM_PI_2 (float)M_PI_2 From b27735780057fabc5375d2da3b78104deed972d7 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 8 May 2018 18:28:31 +0300 Subject: [PATCH 06/11] update gitignore --- .gitignore | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.gitignore b/.gitignore index 37f68c8..d500b97 100644 --- a/.gitignore +++ b/.gitignore @@ -61,3 +61,11 @@ docs/build/* win/cglm_test_* * copy.* *.o +*.obj +*codeanalysis.*.xml +*codeanalysis.xml +*.lib +*.tlog +win/x64 +win/x85 +win/Debug From 0e49e951618544ec802f27314c7652264f8c3751 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 8 May 2018 18:29:02 +0300 Subject: [PATCH 07/11] win: update visual studio version for align requirement --- include/cglm/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cglm/types.h b/include/cglm/types.h index ad4eb02..4a7d019 100644 --- a/include/cglm/types.h +++ b/include/cglm/types.h @@ -9,7 +9,7 @@ #define cglm_types_h #if defined(_MSC_VER) -#if _MSC_VER < 1914 /* Visual Studio 2017 version 15.7 */ +#if _MSC_VER < 1913 /* Visual Studio 2017 version 15.6 */ # define CGLM_ALL_UNALIGNED /* do not use alignment for older visual studio versions */ # define CGLM_ALIGN(X) /* __declspec(align(X)) */ From f774925e8a3182c3699f21b70948ba5c8b6d4ce5 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Wed, 9 May 2018 15:30:54 +0300 Subject: [PATCH 08/11] win, simd: make sure that CGLM_ALL_UNALIGNED is defined for older visual studios --- include/cglm/simd/intrin.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h index 9fef5a1..3d5682d 100644 --- a/include/cglm/simd/intrin.h +++ b/include/cglm/simd/intrin.h @@ -18,6 +18,10 @@ # define __SSE__ # endif # endif +/* do not use alignment for older visual studio versions */ +# if _MSC_VER < 1913 /* Visual Studio 2017 version 15.6 */ +# define CGLM_ALL_UNALIGNED +# endif #endif #if defined( __SSE__ ) || defined( __SSE2__ ) From 94b286f1f92f42cfea8fea88f8ad6a8de793584b Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Wed, 9 May 2018 16:35:15 +0300 Subject: [PATCH 09/11] docs: add new alignment option to docs --- 
docs/source/getting_started.rst | 7 +++++++ docs/source/index.rst | 1 + docs/source/opt.rst | 36 +++++++++++++++++++++++++++++++++ docs/source/troubleshooting.rst | 3 +++ include/cglm/types.h | 6 +++--- 5 files changed, 50 insertions(+), 3 deletions(-) create mode 100644 docs/source/opt.rst diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index a152371..05c4440 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -27,6 +27,13 @@ Alignment is Required: **vec4** and **mat4** requires 16 byte alignment because vec4 and mat4 operations are vectorized by SIMD instructions (SSE/AVX). +**UPDATE:** + By starting v0.4.5 cglm provides an option to disable alignment requirement, it is enabled as default + + | Check :doc:`opt` page for more details + + Also alignment is disabled for older msvc verisons as default. Now alignment only is required in Visual Studio 2017 version 15.6+ if CGLM_ALL_UNALIGNED macro is not defined. + Allocations: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ *cglm* doesn't alloc any memory on heap. So it doesn't provide any allocator. diff --git a/docs/source/index.rst b/docs/source/index.rst index 32e7b48..cfdf220 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -40,6 +40,7 @@ Also currently only **float** type is supported for most operations. getting_started opengl api + opt troubleshooting Indices and tables diff --git a/docs/source/opt.rst b/docs/source/opt.rst new file mode 100644 index 0000000..c614e42 --- /dev/null +++ b/docs/source/opt.rst @@ -0,0 +1,36 @@ +.. default-domain:: C + +Options +=============================================================================== + +A few options are provided via macros. + +Alignment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default, cglm requires types to be aligned. Alignment requirements: + +vec3: 8 byte +vec4: 16 byte +mat4: 16 byte +versor: 16 byte + +Starting with **v0.4.5**, cglm provides an option to disable the alignment requirement. +To enable this option, define the **CGLM_ALL_UNALIGNED** macro before all cglm headers. +You can define it in Xcode, Visual Studio (or other IDEs), or you may prefer +to define it in your build system. If you use pre-compiled versions then you +have to compile cglm with the **CGLM_ALL_UNALIGNED** macro. + +**VERY VERY IMPORTANT:** If you use cglm in multiple projects and + those projects depend on each other, then + + | *ALWAYS* or *NEVER USE* the **CGLM_ALL_UNALIGNED** macro in linked projects + + if you do not know what you are doing. Because a cglm header included + via 'project A' may force types to be aligned and another cglm header + included via 'project B' may not require alignment. In this case + cglm functions will read from and write to **INVALID MEMORY LOCATIONs**. + + ALWAYS USE THE SAME CONFIGURATION / OPTION for **cglm** if you have multiple projects. + + For instance, if you set CGLM_ALL_UNALIGNED in one project then set it in the other projects too diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst index d599c7d..7c416b0 100644 --- a/docs/source/troubleshooting.rst +++ b/docs/source/troubleshooting.rst @@ -43,6 +43,9 @@ you may do it yourself. **This MSVC issue is still in TODOs.** +**UPDATE:** By starting v0.4.5 cglm provides an option to disable alignment requirement. +Also alignment is disabled for older msvc verisons as default. 
Now alignment only is required in Visual Studio 2017 version 15.6+ if CGLM_ALL_UNALIGNED macro is defined. + Crashes, Invalid Memory Access: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/include/cglm/types.h b/include/cglm/types.h index 4a7d019..99226df 100644 --- a/include/cglm/types.h +++ b/include/cglm/types.h @@ -9,10 +9,10 @@ #define cglm_types_h #if defined(_MSC_VER) -#if _MSC_VER < 1913 /* Visual Studio 2017 version 15.6 */ -# define CGLM_ALL_UNALIGNED /* do not use alignment for older visual studio versions */ -# define CGLM_ALIGN(X) /* __declspec(align(X)) */ +#if _MSC_VER < 1913 /* Visual Studio 2017 version 15.6 */ +# define CGLM_ALL_UNALIGNED +# define CGLM_ALIGN(X) /* no alignment */ #else # define CGLM_ALIGN(X) __declspec(align(X)) #endif From c6d07bb6eb036742c25bdad9b8ccb6d74871cb64 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Thu, 10 May 2018 12:18:54 +0300 Subject: [PATCH 10/11] surround PI with parentheses + code style + update docs --- docs/source/getting_started.rst | 4 ++-- docs/source/troubleshooting.rst | 2 +- include/cglm/simd/sse2/affine.h | 2 +- include/cglm/types.h | 10 +++++----- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 05c4440..2f8511c 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -28,11 +28,11 @@ Alignment is Required: vectorized by SIMD instructions (SSE/AVX). **UPDATE:** - By starting v0.4.5 cglm provides an option to disable alignment requirement, it is enabled as default + By starting v0.4.5 cglm provides an option to disable alignment requirement, it is enabled as default | Check :doc:`opt` page for more details - Also alignment is disabled for older msvc verisons as default. Now alignment only is required in Visual Studio 2017 version 15.6+ if CGLM_ALL_UNALIGNED macro is not defined. + Also alignment is disabled for older msvc verisons as default. Now alignment is only required in Visual Studio 2017 version 15.6+ if CGLM_ALL_UNALIGNED macro is not defined. Allocations: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst index 7c416b0..c897dc2 100644 --- a/docs/source/troubleshooting.rst +++ b/docs/source/troubleshooting.rst @@ -44,7 +44,7 @@ you may do it yourself. **This MSVC issue is still in TODOs.** **UPDATE:** By starting v0.4.5 cglm provides an option to disable alignment requirement. -Also alignment is disabled for older msvc verisons as default. Now alignment only is required in Visual Studio 2017 version 15.6+ if CGLM_ALL_UNALIGNED macro is defined. +Also alignment is disabled for older msvc verisons as default. Now alignment is only required in Visual Studio 2017 version 15.6+ if CGLM_ALL_UNALIGNED macro is not defined. 
Crashes, Invalid Memory Access: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/include/cglm/simd/sse2/affine.h b/include/cglm/simd/sse2/affine.h index c0c9c04..8a644d3 100644 --- a/include/cglm/simd/sse2/affine.h +++ b/include/cglm/simd/sse2/affine.h @@ -90,7 +90,7 @@ glm_inv_tr_sse2(mat4 mat) { r1 = glmm_load(mat[1]); r2 = glmm_load(mat[2]); r3 = glmm_load(mat[3]); - x1 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f); + x1 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f); _MM_TRANSPOSE4_PS(r0, r1, r2, x1); diff --git a/include/cglm/types.h b/include/cglm/types.h index 99226df..d470e7b 100644 --- a/include/cglm/types.h +++ b/include/cglm/types.h @@ -27,17 +27,17 @@ #endif typedef float vec2[2]; -typedef CGLM_ALIGN_IF(8) float vec3[3]; +typedef CGLM_ALIGN_IF(8) float vec3[3]; typedef int ivec3[3]; typedef CGLM_ALIGN_IF(16) float vec4[4]; -typedef vec3 mat3[3]; +typedef vec3 mat3[3]; typedef CGLM_ALIGN_IF(16) vec4 mat4[4]; typedef vec4 versor; -#define CGLM_PI (float)M_PI -#define CGLM_PI_2 (float)M_PI_2 -#define CGLM_PI_4 (float)M_PI_4 +#define CGLM_PI ((float)M_PI) +#define CGLM_PI_2 ((float)M_PI_2) +#define CGLM_PI_4 ((float)M_PI_4) #endif /* cglm_types_h */ From 464bd917d094d79b886248596f75fdae9a545865 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Thu, 10 May 2018 12:21:33 +0300 Subject: [PATCH 11/11] update readme --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 15f434c..ff1b4df 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,8 @@ Complete documentation: http://cglm.readthedocs.io - **[bugfix]** euler angles was implemented in reverse order (extrinsic) it was fixed, now they are intrinsic. Make sure that you have the latest version - **[major change]** by starting v0.4.0, quaternions are stored as [x, y, z, w], it was [w, x, y, z] in v0.3.5 and earlier versions +- **[api rename]** by starting v0.4.5, **glm_simd** functions are renamed to **glmm_** +- **[new option]** by starting v0.4.5, you can disable alignment requirement, check options in docs. #### Note for C++ developers: If you don't aware about original GLM library yet, you may also want to look at:
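For reference, a minimal, hypothetical consumer-side sketch of the option introduced in this series; it assumes cglm v0.4.5+ headers on the include path and an SSE2-capable target, and it uses only cglm calls that these patches touch:

```c
/* build sketch, e.g.: cc -msse2 -Icglm/include example.c */
#define CGLM_ALL_UNALIGNED      /* must be visible before any cglm header */
#include <cglm/cglm.h>
#include <stdio.h>

int main(void) {
  /* with CGLM_ALL_UNALIGNED, CGLM_ALIGN_IF() expands to nothing, so these
     vec4/mat4 objects carry no 16-byte alignment attribute; the SSE2 paths
     then go through glmm_load/glmm_store, i.e. _mm_loadu_ps/_mm_storeu_ps */
  vec4 a = {1.0f, 2.0f, 3.0f, 4.0f};
  vec4 b = {4.0f, 3.0f, 2.0f, 1.0f};
  vec4 sum;
  mat4 m = GLM_MAT4_IDENTITY_INIT;

  glm_vec4_add(a, b, sum);                      /* sum = a + b            */
  glm_translate(m, (vec3){1.0f, 2.0f, 3.0f});   /* writes column m[3]     */

  printf("sum:  %.1f %.1f %.1f %.1f\n", sum[0], sum[1], sum[2], sum[3]);
  printf("m[3]: %.1f %.1f %.1f\n", m[3][0], m[3][1], m[3][2]);
  return 0;
}
```

As the opt.rst text added in PATCH 09 stresses, the macro has to be configured identically across every project that links the same cglm-using code; mixing aligned and unaligned translation units would make these loads and stores touch invalid memory.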