From 30b4ea80a9f3a87ccd2b8cbdbf172acd13e1ea10 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Fri, 29 Mar 2024 20:59:54 +0300 Subject: [PATCH 01/21] optimize mat3 scalar inv --- include/cglm/mat3.h | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/include/cglm/mat3.h b/include/cglm/mat3.h index 7db616d..9d5f0d4 100644 --- a/include/cglm/mat3.h +++ b/include/cglm/mat3.h @@ -334,7 +334,7 @@ glm_mat3_det(mat3 mat) { d = mat[1][0], e = mat[1][1], f = mat[1][2], g = mat[2][0], h = mat[2][1], i = mat[2][2]; - return a * (e * i - h * f) - d * (b * i - c * h) + g * (b * f - c * e); + return a * (e * i - h * f) - d * (b * i - h * c) + g * (b * f - e * c); } /*! @@ -346,24 +346,22 @@ glm_mat3_det(mat3 mat) { CGLM_INLINE void glm_mat3_inv(mat3 mat, mat3 dest) { - float det; float a = mat[0][0], b = mat[0][1], c = mat[0][2], d = mat[1][0], e = mat[1][1], f = mat[1][2], - g = mat[2][0], h = mat[2][1], i = mat[2][2]; + g = mat[2][0], h = mat[2][1], i = mat[2][2], - dest[0][0] = e * i - f * h; - dest[0][1] = -(b * i - h * c); - dest[0][2] = b * f - e * c; - dest[1][0] = -(d * i - g * f); - dest[1][1] = a * i - c * g; - dest[1][2] = -(a * f - d * c); - dest[2][0] = d * h - g * e; - dest[2][1] = -(a * h - g * b); - dest[2][2] = a * e - b * d; + c1 = e * i - f * h, c2 = d * i - g * f, c3 = d * h - g * e, + idt = 1.0f / (a * c1 - b * c2 + c * c3), ndt = -idt; - det = 1.0f / (a * dest[0][0] + b * dest[1][0] + c * dest[2][0]); - - glm_mat3_scale(dest, det); + dest[0][0] = idt * c1; + dest[0][1] = ndt * (b * i - h * c); + dest[0][2] = idt * (b * f - e * c); + dest[1][0] = ndt * c2; + dest[1][1] = idt * (a * i - g * c); + dest[1][2] = ndt * (a * f - d * c); + dest[2][0] = idt * c3; + dest[2][1] = ndt * (a * h - g * b); + dest[2][2] = idt * (a * e - d * b); } /*! 
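A quick way to sanity-check the reworked mat3 inverse is a round-trip against the identity. A minimal sketch, assuming the cglm umbrella header; the test matrix, tolerance, and helper name are illustrative only, not part of the patch:

/* m * inv(m) should be ~identity for any invertible m. */
#include <cglm/cglm.h>
#include <assert.h>
#include <math.h>

static void
check_mat3_inv(void) {
  mat3 m = {{4.0f, 7.0f, 2.0f},
            {3.0f, 6.0f, 1.0f},
            {2.0f, 5.0f, 3.0f}};   /* det != 0, so invertible */
  mat3 inv, prod;
  int  i, j;

  glm_mat3_inv(m, inv);
  glm_mat3_mul(m, inv, prod);      /* expect ~identity */

  for (i = 0; i < 3; i++)
    for (j = 0; j < 3; j++)
      assert(fabsf(prod[i][j] - (i == j ? 1.0f : 0.0f)) < 1e-4f);
}

The same kind of harness applies unchanged to the mat4 rework in the next patch.
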
From 8366e51b471e5b344d542aee48a3ce0b253b85bb Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Fri, 29 Mar 2024 22:13:23 +0300 Subject: [PATCH 02/21] optimize mat4 scalar inv --- include/cglm/mat4.h | 53 +++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h index 742b0cb..e1a8214 100644 --- a/include/cglm/mat4.h +++ b/include/cglm/mat4.h @@ -650,46 +650,37 @@ glm_mat4_inv(mat4 mat, mat4 dest) { #elif defined(CGLM_NEON_FP) glm_mat4_inv_neon(mat, dest); #else - float t[6]; - float det; float a = mat[0][0], b = mat[0][1], c = mat[0][2], d = mat[0][3], e = mat[1][0], f = mat[1][1], g = mat[1][2], h = mat[1][3], i = mat[2][0], j = mat[2][1], k = mat[2][2], l = mat[2][3], - m = mat[3][0], n = mat[3][1], o = mat[3][2], p = mat[3][3]; + m = mat[3][0], n = mat[3][1], o = mat[3][2], p = mat[3][3], - t[0] = k * p - o * l; t[1] = j * p - n * l; t[2] = j * o - n * k; - t[3] = i * p - m * l; t[4] = i * o - m * k; t[5] = i * n - m * j; + c1 = k * p - l * o, c2 = c * h - d * g, c3 = i * p - l * m, + c4 = a * h - d * e, c5 = j * p - l * n, c6 = b * h - d * f, + c7 = i * n - j * m, c8 = a * f - b * e, c9 = j * o - k * n, + c10 = b * g - c * f, c11 = i * o - k * m, c12 = a * g - c * e, - dest[0][0] = f * t[0] - g * t[1] + h * t[2]; - dest[1][0] =-(e * t[0] - g * t[3] + h * t[4]); - dest[2][0] = e * t[1] - f * t[3] + h * t[5]; - dest[3][0] =-(e * t[2] - f * t[4] + g * t[5]); + idt = 1.0f/(c8*c1+c4*c9+c10*c3+c2*c7-c12*c5-c6*c11), ndt = -idt; - dest[0][1] =-(b * t[0] - c * t[1] + d * t[2]); - dest[1][1] = a * t[0] - c * t[3] + d * t[4]; - dest[2][1] =-(a * t[1] - b * t[3] + d * t[5]); - dest[3][1] = a * t[2] - b * t[4] + c * t[5]; + dest[0][0] = (f * c1 - g * c5 + h * c9) * idt; + dest[0][1] = (b * c1 - c * c5 + d * c9) * ndt; + dest[0][2] = (n * c2 - o * c6 + p * c10) * idt; + dest[0][3] = (j * c2 - k * c6 + l * c10) * ndt; - t[0] = g * p - o * h; t[1] = f * p - n * h; t[2] = f * o - n * g; - t[3] = e * p - m * h; t[4] = e * o - m * g; t[5] = e * n - m * f; + dest[1][0] = (e * c1 - g * c3 + h * c11) * ndt; + dest[1][1] = (a * c1 - c * c3 + d * c11) * idt; + dest[1][2] = (m * c2 - o * c4 + p * c12) * ndt; + dest[1][3] = (i * c2 - k * c4 + l * c12) * idt; - dest[0][2] = b * t[0] - c * t[1] + d * t[2]; - dest[1][2] =-(a * t[0] - c * t[3] + d * t[4]); - dest[2][2] = a * t[1] - b * t[3] + d * t[5]; - dest[3][2] =-(a * t[2] - b * t[4] + c * t[5]); + dest[2][0] = (e * c5 - f * c3 + h * c7) * idt; + dest[2][1] = (a * c5 - b * c3 + d * c7) * ndt; + dest[2][2] = (m * c6 - n * c4 + p * c8) * idt; + dest[2][3] = (i * c6 - j * c4 + l * c8) * ndt; - t[0] = g * l - k * h; t[1] = f * l - j * h; t[2] = f * k - j * g; - t[3] = e * l - i * h; t[4] = e * k - i * g; t[5] = e * j - i * f; - - dest[0][3] =-(b * t[0] - c * t[1] + d * t[2]); - dest[1][3] = a * t[0] - c * t[3] + d * t[4]; - dest[2][3] =-(a * t[1] - b * t[3] + d * t[5]); - dest[3][3] = a * t[2] - b * t[4] + c * t[5]; - - det = 1.0f / (a * dest[0][0] + b * dest[1][0] - + c * dest[2][0] + d * dest[3][0]); - - glm_mat4_scale_p(dest, det); + dest[3][0] = (e * c9 - f * c11 + g * c7) * ndt; + dest[3][1] = (a * c9 - b * c11 + c * c7) * idt; + dest[3][2] = (m * c10 - n * c12 + o * c8) * ndt; + dest[3][3] = (i * c10 - j * c12 + k * c8) * idt; #endif } From f0e09776d772668287453d8738c8727936765da9 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 2 Apr 2024 02:36:16 +0300 Subject: [PATCH 03/21] arm, neon: optimize glmm_vhadd and add glmm_vdot --- include/cglm/simd/arm.h | 15 
+++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h index 1578390..eb999f1 100644 --- a/include/cglm/simd/arm.h +++ b/include/cglm/simd/arm.h @@ -63,8 +63,17 @@ static inline float32x4_t glmm_max(float32x4_t a, float32x4_t b) { return vmaxq_ static inline float32x4_t glmm_vhadd(float32x4_t v) { + float32x4_t p; + p = vpaddq_f32(v, v); /* [a+b, c+d, a+b, c+d] */ + return vpaddq_f32(p, p); /* [t, t, t, t] */; + + /* TODO: measure speed of this compare to above */ + /* return vdupq_n_f32(vaddvq_f32(v)); */ + + /* return vaddq_f32(vaddq_f32(glmm_splat_x(v), glmm_splat_y(v)), vaddq_f32(glmm_splat_z(v), glmm_splat_w(v))); + */ /* this seems slower: v = vaddq_f32(v, vrev64q_f32(v)); @@ -108,6 +117,12 @@ glmm_dot(float32x4_t a, float32x4_t b) { return glmm_hadd(vmulq_f32(a, b)); } +static inline +float32x4_t +glmm_vdot(float32x4_t a, float32x4_t b) { + return glmm_vhadd(vmulq_f32(a, b)); +} + static inline float glmm_norm(float32x4_t a) { From c528ca10950a11e943b4b54d5c679bb7bd8d5046 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 2 Apr 2024 13:21:49 +0300 Subject: [PATCH 04/21] neon: mat4_inv remastered --- include/cglm/simd/neon/mat4.h | 147 ++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h index d76a454..b3f07fe 100644 --- a/include/cglm/simd/neon/mat4.h +++ b/include/cglm/simd/neon/mat4.h @@ -313,5 +313,152 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) { glmm_store(dest[3], glmm_div(v3, x0)); } +CGLM_INLINE +void +glm_mat4_inv_neon_2(mat4 mat, mat4 dest) { + float32x4_t r0, r1, r2, r3, r5, r6, r7, r8, + v0, v1, v2, v3, + t0, t1, t2; + float32x4x2_t a1, a2, a3, a4, a5, a6; + float32x2_t l0, l1; + float32x4_t s1 = glmm_float32x4_SIGNMASK_PNPN, s2; + + s2 = vrev64q_f32(s1); + + /* 127 <- 0 */ + r0 = glmm_load(mat[0]); /* d c b a */ + r1 = glmm_load(mat[1]); /* h g f e */ + r2 = glmm_load(mat[2]); /* l k j i */ + r3 = glmm_load(mat[3]); /* p o n m */ + + a1 = vzipq_f32(r2, r0); /* d l c k, b j a i */ + a2 = vzipq_f32(r3, r1); /* h p g o, f n e m */ + a3 = vtrnq_f32(r3, r1); /* h p f n, g o e m */ + a4 = vtrnq_f32(r2, r0); /* d l b j, c k a i */ + + a5 = vzipq_f32(a3.val[0], a4.val[0]); /* c g k o, a e i m */ + a6 = vzipq_f32(a3.val[1], a4.val[1]); /* d h l p, b f j n */ + + r5 = vextq_f32(a5.val[0], a5.val[0], 2); /* i m a e */ + r6 = vextq_f32(a5.val[1], a5.val[1], 2); /* k o c g */ + + r7 = vextq_f32(a6.val[0], a6.val[0], 2); /* j n b f */ + r8 = vextq_f32(a6.val[1], a6.val[1], 2); /* l p d h */ + + l0 = vget_high_f32(a2.val[1]); /* h p */ + l1 = vget_high_f32(a1.val[1]); /* d l */ + + v0 = vcombine_f32(l0, l0); /* h p h p */ + v2 = vcombine_f32(l1, l1); /* d l d l */ + v1 = vextq_f32(a3.val[0], a2.val[1], 2); /* g o g o */ + v3 = vextq_f32(a4.val[0], a1.val[1], 2); /* c k c k */ + + t0 = vmulq_f32(a4.val[0], v0); + t2 = vmulq_f32(a1.val[0], v1); + t1 = vmulq_f32(a1.val[0], a3.val[1]); + + /* c3 = i * p - l * m c7 = i * n - j * m c11 = i * o - k * m + c4 = a * h - d * e c8 = a * f - b * e c12 = a * g - c * e + c1 = k * p - l * o c5 = j * p - l * n c9 = j * o - k * n + c2 = c * h - d * g c6 = b * h - d * f c10 = b * g - c * f */ + t0 = glmm_fnmadd(v2, a3.val[0], t0); + t1 = glmm_fnmadd(a4.val[1], a2.val[0], t1); + t2 = glmm_fnmadd(v3, a2.val[0], t2); + + /* det */ + + /* c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */ + v0 = vextq_f32(t2, t1, 2); /* c8 c7 c10 c9 */ + v0 = vrev64q_f32(v0); /* c7 c8 c9 c10 */ + v0 = glmm_vdot(t0, v0); + + /* c5 * 
c12 + c6 * c11 */ + l1 = vget_low_f32(t2); + l0 = vget_high_f32(t1); + l1 = vrev64_f32(l1); + + l0 = vmul_f32(l0, l1); + l0 = vpadd_f32(l0, l0); + v1 = vdupq_lane_f32(l0, 0); + +// v1 = vextq_f32(t1, t2, 2); /* c12 c11 c6 c5 */ +// v2 = vrev64q_f32(v1); /* c11 c12 c5 c6 */ +// v2 = vextq_f32(v2, v2, 2); /* c5 c6 c11 c12 */ +// v1 = vmulq_f32(v1, v2); +// v1 = vpaddq_f32(v1, v1); +// /* v2 = vrev64q_f32(v1); +// v1 = vaddq_f32(v1, v2); */ + + v0 = vsubq_f32(v0, v1); /* det */ + + /* inv div */ + v1 = vdupq_n_f32(1.0f); + v0 = glmm_div(v1, v0); /* inv div */ + + /* multiply t0, t1, t2 to reduce 1mul below: 2 eor + 34mul vs 3mul + 4eor */ + t0 = vmulq_f32(t0, v0); + t1 = vmulq_f32(t1, v0); + t2 = vmulq_f32(t2, v0); + + a1 = vzipq_f32(t0, t0); /* c2 c2 c1 c1, c4 c4 c3 c3 */ + a2 = vzipq_f32(t1, t1); /* c6 c6 c5 c5, c8 c8 c7 c7 */ + a3 = vzipq_f32(t2, t2); /* c10 c10 c9 c9, c12 c12 c11 c11 */ + +// v1 = glmm_xor(v0, s1); /* idt ndt idt ndt */ +// v2 = glmm_xor(v0, s2); /* ndt idt ndt idt */ + + /* result */ + + /* dest[0][0] = (f * c1 - g * c5 + h * c9) * idt; + dest[0][1] = (b * c1 - c * c5 + d * c9) * ndt; + dest[0][2] = (n * c2 - o * c6 + p * c10) * idt; + dest[0][3] = (j * c2 - k * c6 + l * c10) * ndt; + + dest[1][0] = (e * c1 - g * c3 + h * c11) * ndt; + dest[1][1] = (a * c1 - c * c3 + d * c11) * idt; + dest[1][2] = (m * c2 - o * c4 + p * c12) * ndt; + dest[1][3] = (i * c2 - k * c4 + l * c12) * idt; + + dest[2][0] = (e * c5 - f * c3 + h * c7) * idt; + dest[2][1] = (a * c5 - b * c3 + d * c7) * ndt; + dest[2][2] = (m * c6 - n * c4 + p * c8) * idt; + dest[2][3] = (i * c6 - j * c4 + l * c8) * ndt; + + dest[3][0] = (e * c9 - f * c11 + g * c7) * ndt; + dest[3][1] = (a * c9 - b * c11 + c * c7) * idt; + dest[3][2] = (m * c10 - n * c12 + o * c8) * ndt; + dest[3][3] = (i * c10 - j * c12 + k * c8) * idt; */ + + r0 = vmulq_f32(r7, a1.val[1]); + r1 = vmulq_f32(r5, a1.val[1]); + r2 = vmulq_f32(r5, a2.val[1]); + r3 = vmulq_f32(r5, a3.val[1]); + + r0 = glmm_fnmadd(r6, a2.val[1], r0); + r1 = glmm_fnmadd(r6, a1.val[0], r1); + r2 = glmm_fnmadd(r7, a1.val[0], r2); + r3 = glmm_fnmadd(r7, a3.val[0], r3); + + r0 = glmm_fmadd(r8, a3.val[1], r0); + r1 = glmm_fmadd(r8, a3.val[0], r1); + r2 = glmm_fmadd(r8, a2.val[0], r2); + r3 = glmm_fmadd(r6, a2.val[0], r3); + + r0 = glmm_xor(r0, s1); + r1 = glmm_xor(r1, s2); + r2 = glmm_xor(r2, s1); + r3 = glmm_xor(r3, s2); + +// r0 = vmulq_f32(r0, v1); +// r1 = vmulq_f32(r1, v2); +// r2 = vmulq_f32(r2, v1); +// r3 = vmulq_f32(r3, v2); + + glmm_store(dest[0], r0); + glmm_store(dest[1], r1); + glmm_store(dest[2], r2); + glmm_store(dest[3], r3); +} + #endif #endif /* cglm_mat4_neon_h */ From 5b772d0eb4134f4471d15bd6568e5a321a74a345 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Wed, 3 Apr 2024 00:03:55 +0300 Subject: [PATCH 05/21] neon: mat4_inv, reduce 1mul for two extra 2xor --- include/cglm/simd/neon/mat4.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h index b3f07fe..e6b4f8f 100644 --- a/include/cglm/simd/neon/mat4.h +++ b/include/cglm/simd/neon/mat4.h @@ -370,7 +370,9 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) { /* c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */ v0 = vextq_f32(t2, t1, 2); /* c8 c7 c10 c9 */ v0 = vrev64q_f32(v0); /* c7 c8 c9 c10 */ - v0 = glmm_vdot(t0, v0); + v0 = vmulq_f32(t0, v0); + v0 = vpaddq_f32(v0, v0); + v0 = vpaddq_f32(v0, v0); /* c5 * c12 + c6 * c11 */ l1 = vget_low_f32(t2); @@ -394,8 +396,11 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) { /* inv div */ v1 = 
vdupq_n_f32(1.0f); v0 = glmm_div(v1, v0); /* inv div */ + + // v1 = glmm_xor(v0, s1); /* idt ndt idt ndt */ + // v2 = glmm_xor(v0, s2); /* ndt idt ndt idt */ - /* multiply t0, t1, t2 to reduce 1mul below: 2 eor + 34mul vs 3mul + 4eor */ + /* [*] multiply t0, t1, t2 to reduce 1mul below: 2 eor + 34mul vs 3mul + 4eor */ t0 = vmulq_f32(t0, v0); t1 = vmulq_f32(t1, v0); t2 = vmulq_f32(t2, v0); @@ -404,9 +409,6 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) { a2 = vzipq_f32(t1, t1); /* c6 c6 c5 c5, c8 c8 c7 c7 */ a3 = vzipq_f32(t2, t2); /* c10 c10 c9 c9, c12 c12 c11 c11 */ -// v1 = glmm_xor(v0, s1); /* idt ndt idt ndt */ -// v2 = glmm_xor(v0, s2); /* ndt idt ndt idt */ - /* result */ /* dest[0][0] = (f * c1 - g * c5 + h * c9) * idt; @@ -444,6 +446,7 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) { r2 = glmm_fmadd(r8, a2.val[0], r2); r3 = glmm_fmadd(r6, a2.val[0], r3); + /* 4 xor may be fastart then 4mul, see aboe [**] */ r0 = glmm_xor(r0, s1); r1 = glmm_xor(r1, s2); r2 = glmm_xor(r2, s1); From 0ff0e8948fac0db69e728a3ca386a4178d3ceffc Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Wed, 3 Apr 2024 00:05:42 +0300 Subject: [PATCH 06/21] Update mat4.h --- include/cglm/simd/neon/mat4.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h index e6b4f8f..9712a71 100644 --- a/include/cglm/simd/neon/mat4.h +++ b/include/cglm/simd/neon/mat4.h @@ -172,6 +172,8 @@ glm_mat4_det_neon(mat4 mat) { return glmm_hadd(vmulq_f32(x2, r0)); } +/* old one */ +#if 0 CGLM_INLINE void glm_mat4_inv_neon(mat4 mat, mat4 dest) { @@ -312,10 +314,11 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) { glmm_store(dest[2], glmm_div(v2, x0)); glmm_store(dest[3], glmm_div(v3, x0)); } +#endif CGLM_INLINE void -glm_mat4_inv_neon_2(mat4 mat, mat4 dest) { +glm_mat4_inv_neon(mat4 mat, mat4 dest) { float32x4_t r0, r1, r2, r3, r5, r6, r7, r8, v0, v1, v2, v3, t0, t1, t2; From b3308af146fb5134c98e003fae7e5138bc6043fe Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Sat, 6 Apr 2024 14:09:52 +0300 Subject: [PATCH 07/21] arm: fix glmm_vhadd on ARM32 --- include/cglm/simd/arm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h index eb999f1..3e3bb22 100644 --- a/include/cglm/simd/arm.h +++ b/include/cglm/simd/arm.h @@ -63,10 +63,14 @@ static inline float32x4_t glmm_max(float32x4_t a, float32x4_t b) { return vmaxq_ static inline float32x4_t glmm_vhadd(float32x4_t v) { +#if CGLM_ARM64 float32x4_t p; p = vpaddq_f32(v, v); /* [a+b, c+d, a+b, c+d] */ return vpaddq_f32(p, p); /* [t, t, t, t] */; - +#else + return vaddq_f32(vaddq_f32(glmm_splat_x(v), glmm_splat_y(v)), + vaddq_f32(glmm_splat_z(v), glmm_splat_w(v))); +#endif /* TODO: measure speed of this compare to above */ /* return vdupq_n_f32(vaddvq_f32(v)); */ From f50a7a7d005d8d0966b6c1c8d30b73af5a3e5649 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Sat, 6 Apr 2024 14:10:13 +0300 Subject: [PATCH 08/21] arm, neon: improve glm_mat4_inv_neon --- include/cglm/simd/neon/mat4.h | 146 +++++++++++++++++----------------- 1 file changed, 72 insertions(+), 74 deletions(-) diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h index 9712a71..a15f1b2 100644 --- a/include/cglm/simd/neon/mat4.h +++ b/include/cglm/simd/neon/mat4.h @@ -319,13 +319,16 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) { CGLM_INLINE void glm_mat4_inv_neon(mat4 mat, mat4 dest) { - float32x4_t r0, r1, r2, r3, r5, r6, r7, r8, - v0, v1, v2, v3, + float32x4_t r0, r1, r2, r3, + v0, 
v1, v2, v3, v4, v5, t0, t1, t2; - float32x4x2_t a1, a2, a3, a4, a5, a6; - float32x2_t l0, l1; + float32x4x2_t a0, a1, a2, a3, a4; float32x4_t s1 = glmm_float32x4_SIGNMASK_PNPN, s2; +#if !CGLM_ARM64 + float32x2_t l0, l1; +#endif + s2 = vrev64q_f32(s1); /* 127 <- 0 */ @@ -334,83 +337,83 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) { r2 = glmm_load(mat[2]); /* l k j i */ r3 = glmm_load(mat[3]); /* p o n m */ - a1 = vzipq_f32(r2, r0); /* d l c k, b j a i */ - a2 = vzipq_f32(r3, r1); /* h p g o, f n e m */ - a3 = vtrnq_f32(r3, r1); /* h p f n, g o e m */ - a4 = vtrnq_f32(r2, r0); /* d l b j, c k a i */ + a1 = vzipq_f32(r0, r2); /* l d k c, j b i a */ + a2 = vzipq_f32(r1, r3); /* p h o g, n f m e */ + a3 = vzipq_f32(a2.val[0], a1.val[0]); /* j n b f, i m a e */ + a4 = vzipq_f32(a2.val[1], a1.val[1]); /* l p d h, k o c g */ - a5 = vzipq_f32(a3.val[0], a4.val[0]); /* c g k o, a e i m */ - a6 = vzipq_f32(a3.val[1], a4.val[1]); /* d h l p, b f j n */ + v0 = vextq_f32(a1.val[0], a1.val[1], 2); /* k c j b */ + v1 = vextq_f32(a2.val[0], a2.val[1], 2); /* o g n f */ + v2 = vextq_f32(a1.val[1], a2.val[0], 2); /* m e l d */ + v3 = vextq_f32(a2.val[1], a1.val[0], 2); /* i a p h */ + v4 = vextq_f32(v1, v2, 2); /* l d o g */ + v5 = vextq_f32(v0, v3, 2); /* p h k c */ - r5 = vextq_f32(a5.val[0], a5.val[0], 2); /* i m a e */ - r6 = vextq_f32(a5.val[1], a5.val[1], 2); /* k o c g */ - - r7 = vextq_f32(a6.val[0], a6.val[0], 2); /* j n b f */ - r8 = vextq_f32(a6.val[1], a6.val[1], 2); /* l p d h */ - - l0 = vget_high_f32(a2.val[1]); /* h p */ - l1 = vget_high_f32(a1.val[1]); /* d l */ - - v0 = vcombine_f32(l0, l0); /* h p h p */ - v2 = vcombine_f32(l1, l1); /* d l d l */ - v1 = vextq_f32(a3.val[0], a2.val[1], 2); /* g o g o */ - v3 = vextq_f32(a4.val[0], a1.val[1], 2); /* c k c k */ - - t0 = vmulq_f32(a4.val[0], v0); + /* c2 = c * h - g * d c12 = a * g - c * e c8 = a * f - b * e + c1 = k * p - o * l c11 = i * o - k * m c7 = i * n - j * m + c4 = h * a - d * e c6 = b * h - d * f c10 = b * g - c * f + c3 = p * i - l * m c5 = j * p - l * n c9 = j * o - k * n */ + t0 = vmulq_f32(v5, v3); + t1 = vmulq_f32(a1.val[0], a2.val[1]); t2 = vmulq_f32(a1.val[0], v1); - t1 = vmulq_f32(a1.val[0], a3.val[1]); - /* c3 = i * p - l * m c7 = i * n - j * m c11 = i * o - k * m - c4 = a * h - d * e c8 = a * f - b * e c12 = a * g - c * e - c1 = k * p - l * o c5 = j * p - l * n c9 = j * o - k * n - c2 = c * h - d * g c6 = b * h - d * f c10 = b * g - c * f */ - t0 = glmm_fnmadd(v2, a3.val[0], t0); - t1 = glmm_fnmadd(a4.val[1], a2.val[0], t1); - t2 = glmm_fnmadd(v3, a2.val[0], t2); + t0 = glmm_fnmadd(v4, v2, t0); + t1 = glmm_fnmadd(a1.val[1], a2.val[0], t1); + t2 = glmm_fnmadd(v0, a2.val[0], t2); + + t0 = vrev64q_f32(t0); + t1 = vrev64q_f32(t1); + t2 = vrev64q_f32(t2); /* det */ + v0 = vrev64q_f32(t2); + v1 = vextq_f32(t1, t1, 2); + v0 = vmulq_f32(t0, v0); + v1 = vrev64q_f32(v1); + v1 = vmulq_f32(v1, t1); /* c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */ - v0 = vextq_f32(t2, t1, 2); /* c8 c7 c10 c9 */ - v0 = vrev64q_f32(v0); /* c7 c8 c9 c10 */ - v0 = vmulq_f32(t0, v0); +#if CGLM_ARM64 v0 = vpaddq_f32(v0, v0); v0 = vpaddq_f32(v0, v0); +#else + l0 = vget_low_f32(v0); + l1 = vget_high_f32(v0); + + l0 = vpadd_f32(l0, l0); /* [a+b, a+b] */ + l1 = vpadd_f32(l1, l1); /* [c+d, c+d] */ + l0 = vadd_f32(l0, l1); /* [sum, sum] */ + + v0 = vcombine_f32(l0, l0); +#endif /* c5 * c12 + c6 * c11 */ - l1 = vget_low_f32(t2); - l0 = vget_high_f32(t1); - l1 = vrev64_f32(l1); +#if CGLM_ARM64 + v1 = vpaddq_f32(v1, v1); +#else + l0 = vget_low_f32(v1); + l1 = 
vget_high_f32(v1); - l0 = vmul_f32(l0, l1); - l0 = vpadd_f32(l0, l0); - v1 = vdupq_lane_f32(l0, 0); + l0 = vpadd_f32(l0, l0); /* [a+b, a+b] */ + l1 = vpadd_f32(l1, l1); /* [c+d, c+d] */ -// v1 = vextq_f32(t1, t2, 2); /* c12 c11 c6 c5 */ -// v2 = vrev64q_f32(v1); /* c11 c12 c5 c6 */ -// v2 = vextq_f32(v2, v2, 2); /* c5 c6 c11 c12 */ -// v1 = vmulq_f32(v1, v2); -// v1 = vpaddq_f32(v1, v1); -// /* v2 = vrev64q_f32(v1); -// v1 = vaddq_f32(v1, v2); */ + v1 = vcombine_f32(l0, l1); +#endif v0 = vsubq_f32(v0, v1); /* det */ /* inv div */ v1 = vdupq_n_f32(1.0f); v0 = glmm_div(v1, v0); /* inv div */ - - // v1 = glmm_xor(v0, s1); /* idt ndt idt ndt */ - // v2 = glmm_xor(v0, s2); /* ndt idt ndt idt */ - /* [*] multiply t0, t1, t2 to reduce 1mul below: 2 eor + 34mul vs 3mul + 4eor */ + /* multiply t0,t1,t2 by idt to reduce 1mul below: 2eor+4mul vs 3mul+4eor */ t0 = vmulq_f32(t0, v0); t1 = vmulq_f32(t1, v0); t2 = vmulq_f32(t2, v0); - a1 = vzipq_f32(t0, t0); /* c2 c2 c1 c1, c4 c4 c3 c3 */ - a2 = vzipq_f32(t1, t1); /* c6 c6 c5 c5, c8 c8 c7 c7 */ - a3 = vzipq_f32(t2, t2); /* c10 c10 c9 c9, c12 c12 c11 c11 */ + a0 = vzipq_f32(t0, t0); /* c4 c4 c3 c3, c2 c2 c1 c1 */ + a1 = vzipq_f32(t1, t1); /* c6 c6 c5 c5, c12 c12 c11 c11 */ + a2 = vzipq_f32(t2, t2); /* c10 c10 c9 c9, c8 c8 c7 c7 */ /* result */ @@ -434,32 +437,27 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) { dest[3][2] = (m * c10 - n * c12 + o * c8) * ndt; dest[3][3] = (i * c10 - j * c12 + k * c8) * idt; */ - r0 = vmulq_f32(r7, a1.val[1]); - r1 = vmulq_f32(r5, a1.val[1]); - r2 = vmulq_f32(r5, a2.val[1]); - r3 = vmulq_f32(r5, a3.val[1]); + r0 = vmulq_f32(a3.val[1], a0.val[0]); + r1 = vmulq_f32(a3.val[0], a0.val[0]); + r2 = vmulq_f32(a3.val[0], a1.val[1]); + r3 = vmulq_f32(a3.val[0], a2.val[1]); - r0 = glmm_fnmadd(r6, a2.val[1], r0); - r1 = glmm_fnmadd(r6, a1.val[0], r1); - r2 = glmm_fnmadd(r7, a1.val[0], r2); - r3 = glmm_fnmadd(r7, a3.val[0], r3); + r0 = glmm_fnmadd(a4.val[0], a1.val[1], r0); + r1 = glmm_fnmadd(a4.val[0], a0.val[1], r1); + r2 = glmm_fnmadd(a3.val[1], a0.val[1], r2); + r3 = glmm_fnmadd(a3.val[1], a1.val[0], r3); - r0 = glmm_fmadd(r8, a3.val[1], r0); - r1 = glmm_fmadd(r8, a3.val[0], r1); - r2 = glmm_fmadd(r8, a2.val[0], r2); - r3 = glmm_fmadd(r6, a2.val[0], r3); + r0 = glmm_fmadd(a4.val[1], a2.val[1], r0); + r1 = glmm_fmadd(a4.val[1], a1.val[0], r1); + r2 = glmm_fmadd(a4.val[1], a2.val[0], r2); + r3 = glmm_fmadd(a4.val[0], a2.val[0], r3); - /* 4 xor may be fastart then 4mul, see aboe [**] */ + /* 4xor may be fastart then 4mul, see above */ r0 = glmm_xor(r0, s1); r1 = glmm_xor(r1, s2); r2 = glmm_xor(r2, s1); r3 = glmm_xor(r3, s2); -// r0 = vmulq_f32(r0, v1); -// r1 = vmulq_f32(r1, v2); -// r2 = vmulq_f32(r2, v1); -// r3 = vmulq_f32(r3, v2); - glmm_store(dest[0], r0); glmm_store(dest[1], r1); glmm_store(dest[2], r2); From 87350f809bf4085bdbb9081f93db7bec9750dfb7 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Sat, 6 Apr 2024 14:11:46 +0300 Subject: [PATCH 09/21] msvc bug: dont align types due to "ARM32 = C2719: formal parameter with requested alignment of 16 won't be aligned." on ARM32/MSVC until a good solution. 
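
A hypothetical repro of the class of error the message refers to; aligned_vec4 and sum are illustrative names, not cglm code:

/* MSVC-only sketch: on 32-bit targets an over-aligned type passed by
   value cannot be guaranteed its alignment on the stack, so the call
   is rejected instead of silently misaligning the argument. */
typedef struct { __declspec(align(16)) float v[4]; } aligned_vec4;

/* error C2719: 'a': formal parameter with requested alignment of 16
   won't be aligned  (32-bit MSVC; the message above reports ARM32) */
static float sum(aligned_vec4 a) {
  return a.v[0] + a.v[1] + a.v[2] + a.v[3];
}

Defining CGLM_ALL_UNALIGNED and making CGLM_ALIGN(X) a no-op, as this patch does for _M_ARM, sidesteps the problem at the cost of unaligned loads/stores.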
--- include/cglm/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cglm/types.h b/include/cglm/types.h index 65391cd..ce460b7 100644 --- a/include/cglm/types.h +++ b/include/cglm/types.h @@ -14,7 +14,7 @@ #if defined(_MSC_VER) /* do not use alignment for older visual studio versions */ -# if _MSC_VER < 1913 /* Visual Studio 2017 version 15.6 */ +# if _MSC_VER < 1913 || _M_ARM /* Visual Studio 2017 version 15.6 */ # define CGLM_ALL_UNALIGNED # define CGLM_ALIGN(X) /* no alignment */ # else From bd941ed7fb73c23015a00d6a51c826a2dae47157 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Sat, 6 Apr 2024 14:23:36 +0300 Subject: [PATCH 10/21] arm, neon: fix neon support on GCC ARM --- include/cglm/simd/intrin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h index 11c46e5..9dade91 100644 --- a/include/cglm/simd/intrin.h +++ b/include/cglm/simd/intrin.h @@ -100,7 +100,7 @@ #else /* non-windows */ # if defined(__ARM_NEON) || defined(__ARM_NEON__) # include -# if defined(__ARM_NEON_FP) +# if defined(__ARM_NEON_FP) || defined(__ARM_FP) # define CGLM_NEON_FP 1 # endif # ifndef CGLM_SIMD_ARM From a7845ffc44cb91c24027f99d9a7b4901f84f5d71 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Sun, 7 Apr 2024 00:54:29 +0300 Subject: [PATCH 11/21] msvc, simd: fix simd headers for _M_ARM64EC --- include/cglm/simd/intrin.h | 11 +++++------ include/cglm/simd/neon/mat4.h | 2 +- include/cglm/types.h | 1 + 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h index 9dade91..9b3da2f 100644 --- a/include/cglm/simd/intrin.h +++ b/include/cglm/simd/intrin.h @@ -8,21 +8,20 @@ #ifndef cglm_intrin_h #define cglm_intrin_h -#if defined( _MSC_VER ) +#if defined(_MSC_VER) && !defined(_M_ARM64EC) # if (defined(_M_AMD64) || defined(_M_X64)) || _M_IX86_FP == 2 # ifndef __SSE__ # define __SSE__ # endif -# ifndef __SSE2__ -# define __SSE2__ -# endif + # elif _M_IX86_FP == 1 # ifndef __SSE__ # define __SSE__ # endif -#endif +# endif /* do not use alignment for older visual studio versions */ -# if _MSC_VER < 1913 /* Visual Studio 2017 version 15.6 */ +/* also ARM32 also causes similar error, disable it for now on ARM32 too */ +# if _MSC_VER < 1913 || _M_ARM /* Visual Studio 2017 version 15.6 */ # define CGLM_ALL_UNALIGNED # endif #endif diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h index a15f1b2..92442e3 100644 --- a/include/cglm/simd/neon/mat4.h +++ b/include/cglm/simd/neon/mat4.h @@ -326,7 +326,7 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) { float32x4_t s1 = glmm_float32x4_SIGNMASK_PNPN, s2; #if !CGLM_ARM64 - float32x2_t l0, l1; + float32x2_t l0, l1; #endif s2 = vrev64q_f32(s1); diff --git a/include/cglm/types.h b/include/cglm/types.h index ce460b7..26e6467 100644 --- a/include/cglm/types.h +++ b/include/cglm/types.h @@ -14,6 +14,7 @@ #if defined(_MSC_VER) /* do not use alignment for older visual studio versions */ +/* also ARM32 also causes similar error, disable it for now on ARM32 too */ # if _MSC_VER < 1913 || _M_ARM /* Visual Studio 2017 version 15.6 */ # define CGLM_ALL_UNALIGNED # define CGLM_ALIGN(X) /* no alignment */ From 4f00ce0e52bd00c10485a7813a1e4fb46748775a Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Sun, 7 Apr 2024 22:33:37 +0300 Subject: [PATCH 12/21] sse: reduce some instructions in mat4 inv --- include/cglm/simd/sse2/mat4.h | 137 ++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) diff --git 
a/include/cglm/simd/sse2/mat4.h b/include/cglm/simd/sse2/mat4.h index 05e7efe..fb6d2f2 100644 --- a/include/cglm/simd/sse2/mat4.h +++ b/include/cglm/simd/sse2/mat4.h @@ -295,6 +295,8 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) { glmm_store(dest[3], _mm_mul_ps(v3, x0)); } +/* old one */ +#if 0 CGLM_INLINE void glm_mat4_inv_sse2(mat4 mat, mat4 dest) { @@ -431,6 +433,141 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) { glmm_store(dest[2], _mm_mul_ps(v2, x0)); glmm_store(dest[3], _mm_mul_ps(v3, x0)); } +#endif +CGLM_INLINE +void +glm_mat4_inv_sse2(mat4 mat, mat4 dest) { + __m128 r0, r1, r2, r3, s1, s2, + v0, v1, v2, v3, v4, v5, + t0, t1, t2, + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13; + + /* s1 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); */ + s1 = glmm_float32x4_SIGNMASK_NPNP; + s2 = glmm_shuff1(s1, 2, 1, 2, 1); + + /* 127 <- 0 */ + r1 = glmm_load(mat[1]); /* h g f e */ + r0 = glmm_load(mat[0]); /* d c b a */ + r3 = glmm_load(mat[3]); /* p o n m */ + r2 = glmm_load(mat[2]); /* l k j i */ + + x4 = _mm_unpackhi_ps(r0, r2); /* l d k c */ + x5 = _mm_unpacklo_ps(r0, r2); /* j b i a */ + x6 = _mm_unpackhi_ps(r1, r3); /* p h o g */ + x7 = _mm_unpacklo_ps(r1, r3); /* n f m e */ + + x0 = _mm_unpackhi_ps(x7, x5); /* j n b f */ + x1 = _mm_unpacklo_ps(x7, x5); /* i m a e */ + x2 = _mm_unpackhi_ps(x6, x4); /* l p d h */ + x3 = _mm_unpacklo_ps(x6, x4); /* k o c g */ + + /* c2 = c * h - d * g c12 = a * g - c * e c8 = a * f - b * e + c1 = k * p - l * o c11 = i * o - k * m c7 = i * n - j * m + c4 = a * h - d * e c6 = b * h - d * f c10 = b * g - c * f + c3 = i * p - l * m c5 = j * p - l * n c9 = j * o - k * n */ + + x8 = _mm_shuffle_ps(x0, x3, _MM_SHUFFLE(3, 1, 3, 1)); /* k c j b */ + x9 = _mm_shuffle_ps(x0, x3, _MM_SHUFFLE(2, 0, 2, 0)); /* o g n f */ + + x10 = glmm_shuff1(x2, 2, 0, 2, 0); /* p h p h */ + x11 = glmm_shuff1(x2, 3, 1, 3, 1); /* l d l d */ + +#if 1 /* TODO measure both */ + x12 = _mm_shuffle_ps(x4, x5, _MM_SHUFFLE(1, 0, 1, 0)); /* i a k c */ + x13 = _mm_shuffle_ps(x6, x7, _MM_SHUFFLE(1, 0, 1, 0)); /* m e o g */ +#else + x12 = _mm_movelh_ps(x4, x5); /* i a k c */ + x13 = _mm_movelh_ps(x6, x7); /* m e o g */ +#endif + + t0 = _mm_mul_ps(x12, x10); + t1 = _mm_mul_ps(x5, x6); + t2 = _mm_mul_ps(x5, x9); + + t0 = glmm_fnmadd(x11, x13, t0); + t1 = glmm_fnmadd(x4, x7, t1); + t2 = glmm_fnmadd(x8, x7, t2); + + /* det */ + /* v0: c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */ + /* v1: c5 * c12 + c6 * c11 */ + + v5 = _mm_set1_ps(1.0f); + v0 = glmm_shuff1(t2, 2, 3, 0, 1); + v1 = glmm_shuff1(t1, 0, 1, 2, 3); + v0 = _mm_mul_ps(t0, v0); + v1 = _mm_mul_ps(t1, v1); + v2 = glmm_shuff1(v1, 1, 0, 0, 1); + v3 = glmm_shuff1(v0, 0, 1, 2, 3); + v1 = _mm_add_ps(v1, v2); + v0 = _mm_add_ps(v0, v3); + v2 = glmm_shuff1(v0, 1, 0, 0, 1); + v0 = _mm_add_ps(v0, v2); + + v0 = _mm_sub_ps(v0, v1); /* det */ + v0 = _mm_div_ps(v5, v0); /* idt */ + + /* multiply t0,t1,t2 by idt to reduce 1mul below: 2eor+4mul vs 3mul+4eor */ + t0 = _mm_mul_ps(t0, v0); + t1 = _mm_mul_ps(t1, v0); + t2 = _mm_mul_ps(t2, v0); + + v0 = glmm_shuff1(t0, 0, 0, 1, 1); /* c2 c2 c1 c1 */ + v1 = glmm_shuff1(t0, 2, 2, 3, 3); /* c4 c4 c3 c3 */ + v2 = glmm_shuff1(t1, 0, 0, 1, 1); /* c12 c12 c11 c11 */ + v3 = glmm_shuff1(t1, 2, 2, 3, 3); /* c6 c6 c5 c5 */ + v4 = glmm_shuff1(t2, 0, 0, 1, 1); /* c8 c8 c7 c7 */ + v5 = glmm_shuff1(t2, 2, 2, 3, 3); /* c10 c10 c9 c9 */ + + /* result */ + + /* dest[0][0] = (f * c1 - g * c5 + h * c9) * idt; + dest[0][1] = (b * c1 - c * c5 + d * c9) * ndt; + dest[0][2] = (n * c2 - o * c6 + p * c10) * idt; + dest[0][3] = (j * c2 - k * c6 + l * 
c10) * ndt; + + dest[1][0] = (e * c1 - g * c3 + h * c11) * ndt; + dest[1][1] = (a * c1 - c * c3 + d * c11) * idt; + dest[1][2] = (m * c2 - o * c4 + p * c12) * ndt; + dest[1][3] = (i * c2 - k * c4 + l * c12) * idt; + + dest[2][0] = (e * c5 - f * c3 + h * c7) * idt; + dest[2][1] = (a * c5 - b * c3 + d * c7) * ndt; + dest[2][2] = (m * c6 - n * c4 + p * c8) * idt; + dest[2][3] = (i * c6 - j * c4 + l * c8) * ndt; + + dest[3][0] = (e * c9 - f * c11 + g * c7) * ndt; + dest[3][1] = (a * c9 - b * c11 + c * c7) * idt; + dest[3][2] = (m * c10 - n * c12 + o * c8) * ndt; + dest[3][3] = (i * c10 - j * c12 + k * c8) * idt; */ + + r0 = _mm_mul_ps(x0, v0); + r1 = _mm_mul_ps(x1, v0); + r2 = _mm_mul_ps(x1, v3); + r3 = _mm_mul_ps(x1, v5); + + r0 = glmm_fnmadd(x3, v3, r0); + r1 = glmm_fnmadd(x3, v1, r1); + r2 = glmm_fnmadd(x0, v1, r2); + r3 = glmm_fnmadd(x0, v2, r3); + + r0 = glmm_fmadd(x2, v5, r0); + r1 = glmm_fmadd(x2, v2, r1); + r2 = glmm_fmadd(x2, v4, r2); + r3 = glmm_fmadd(x3, v4, r3); + + /* 4xor may be fastart then 4mul, see above */ + r0 = _mm_xor_ps(r0, s1); + r1 = _mm_xor_ps(r1, s2); + r2 = _mm_xor_ps(r2, s1); + r3 = _mm_xor_ps(r3, s2); + + glmm_store(dest[0], r0); + glmm_store(dest[1], r1); + glmm_store(dest[2], r2); + glmm_store(dest[3], r3); +} #endif #endif /* cglm_mat_sse_h */ From 62c0448e25ee03dccf3dfa2c0fc61f157410678e Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Sun, 7 Apr 2024 22:48:11 +0300 Subject: [PATCH 13/21] simd, msvc: ensure required definitions are exist on msvc --- include/cglm/simd/intrin.h | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h index 9b3da2f..c477f34 100644 --- a/include/cglm/simd/intrin.h +++ b/include/cglm/simd/intrin.h @@ -13,7 +13,9 @@ # ifndef __SSE__ # define __SSE__ # endif - +# ifndef __SSE2__ +# define __SSE2__ +# endif # elif _M_IX86_FP == 1 # ifndef __SSE__ # define __SSE__ @@ -26,6 +28,29 @@ # endif #endif +#ifdef __AVX__ +# include +# define CGLM_AVX_FP 1 +# ifndef __SSE2__ +# define __SSE2__ +# endif +# ifndef __SSE3__ +# define __SSE3__ +# endif +# ifndef __SSE4__ +# define __SSE4__ +# endif +# ifndef __SSE4_1__ +# define __SSE4_1__ +# endif +# ifndef __SSE4_2__ +# define __SSE4_2__ +# endif +# ifndef CGLM_SIMD_x86 +# define CGLM_SIMD_x86 +# endif +#endif + #if defined(__SSE__) # include # define CGLM_SSE_FP 1 @@ -63,14 +88,6 @@ # endif #endif -#ifdef __AVX__ -# include -# define CGLM_AVX_FP 1 -# ifndef CGLM_SIMD_x86 -# define CGLM_SIMD_x86 -# endif -#endif - /* ARM Neon */ #if defined(_WIN32) && defined(_MSC_VER) /* TODO: non-ARM stuff already inported, will this be better option */ From 68bdec4510e0ba10960ab558c99952fab4d959b8 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Wed, 10 Apr 2024 22:52:53 +0300 Subject: [PATCH 14/21] simd: use glmm_set1() to optimize broadcasting single float --- include/cglm/mat2.h | 2 +- include/cglm/simd/avx/mat4.h | 4 ++-- include/cglm/simd/sse2/mat4.h | 6 +++--- include/cglm/simd/x86.h | 11 +++++++++-- include/cglm/vec4-ext.h | 4 ++-- include/cglm/vec4.h | 14 +++++++------- 6 files changed, 24 insertions(+), 17 deletions(-) diff --git a/include/cglm/mat2.h b/include/cglm/mat2.h index 7fba348..1da0cd4 100644 --- a/include/cglm/mat2.h +++ b/include/cglm/mat2.h @@ -235,7 +235,7 @@ glm_mat2_scale(mat2 m, float s) { glmm_store(m[0], wasm_f32x4_mul(wasm_v128_load(m[0]), wasm_f32x4_splat(s))); #elif defined( __SSE__ ) || defined( __SSE2__ ) - glmm_store(m[0], _mm_mul_ps(_mm_loadu_ps(m[0]), _mm_set1_ps(s))); 
+ glmm_store(m[0], _mm_mul_ps(_mm_loadu_ps(m[0]), glmm_set1(s))); #elif defined(CGLM_NEON_FP) vst1q_f32(m[0], vmulq_f32(vld1q_f32(m[0]), vdupq_n_f32(s))); #else diff --git a/include/cglm/simd/avx/mat4.h b/include/cglm/simd/avx/mat4.h index e8c36c8..f5812de 100644 --- a/include/cglm/simd/avx/mat4.h +++ b/include/cglm/simd/avx/mat4.h @@ -18,8 +18,8 @@ CGLM_INLINE void glm_mat4_scale_avx(mat4 m, float s) { __m256 y0; - y0 = _mm256_set1_ps(s); - + y0 = _mm256_broadcast_ss(&s); + glmm_store256(m[0], _mm256_mul_ps(y0, glmm_load256(m[0]))); glmm_store256(m[2], _mm256_mul_ps(y0, glmm_load256(m[2]))); } diff --git a/include/cglm/simd/sse2/mat4.h b/include/cglm/simd/sse2/mat4.h index fb6d2f2..5df7254 100644 --- a/include/cglm/simd/sse2/mat4.h +++ b/include/cglm/simd/sse2/mat4.h @@ -18,7 +18,7 @@ CGLM_INLINE void glm_mat4_scale_sse2(mat4 m, float s) { __m128 x0; - x0 = _mm_set1_ps(s); + x0 = glmm_set1(s); glmm_store(m[0], _mm_mul_ps(glmm_load(m[0]), x0)); glmm_store(m[1], _mm_mul_ps(glmm_load(m[1]), x0)); @@ -426,7 +426,7 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) { x1 = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 0, 0)); x0 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0)); - x0 = _mm_div_ps(_mm_set1_ps(1.0f), glmm_vhadd(_mm_mul_ps(x0, r0))); + x0 = _mm_div_ps(glmm_set1(1.0f), glmm_vhadd(_mm_mul_ps(x0, r0))); glmm_store(dest[0], _mm_mul_ps(v0, x0)); glmm_store(dest[1], _mm_mul_ps(v1, x0)); @@ -494,7 +494,7 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) { /* v0: c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */ /* v1: c5 * c12 + c6 * c11 */ - v5 = _mm_set1_ps(1.0f); + v5 = glmm_set1(1.0f); v0 = glmm_shuff1(t2, 2, 3, 0, 1); v1 = glmm_shuff1(t1, 0, 1, 2, 3); v0 = _mm_mul_ps(t0, v0); diff --git a/include/cglm/simd/x86.h b/include/cglm/simd/x86.h index 81081dc..657d9ba 100644 --- a/include/cglm/simd/x86.h +++ b/include/cglm/simd/x86.h @@ -18,9 +18,16 @@ # define glmm_store(p, a) _mm_store_ps(p, a) #endif -#define glmm_set1(x) _mm_set1_ps(x) #define glmm_128 __m128 +#ifdef __AVX__ +# define glmm_set1(x) _mm_broadcast_ss(&x) +# define glmm_set1_ptr(x) _mm_broadcast_ss(x) +#else +# define glmm_set1(x) _mm_set1_ps(x) +# define glmm_set1_ptr(x) _mm_set1_ps(*x) +#endif + #if defined(CGLM_USE_INT_DOMAIN) && defined(__SSE2__) # define glmm_shuff1(xmm, z, y, x, w) \ _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm), \ @@ -86,7 +93,7 @@ #if defined(__SSE2__) # define glmm_float32x4_SIGNMASK_NEG _mm_castsi128_ps(_mm_set1_epi32(GLMM_NEGZEROf)) /* _mm_set1_ps(-0.0f) */ #else -# define glmm_float32x4_SIGNMASK_NEG _mm_set1_ps(GLMM_NEGZEROf) +# define glmm_float32x4_SIGNMASK_NEG glmm_set1(GLMM_NEGZEROf) #endif #define glmm_float32x8_SIGNMASK_NEG _mm256_castsi256_ps(_mm256_set1_epi32(GLMM_NEGZEROf)) diff --git a/include/cglm/vec4-ext.h b/include/cglm/vec4-ext.h index cc09ee1..b3850a0 100644 --- a/include/cglm/vec4-ext.h +++ b/include/cglm/vec4-ext.h @@ -48,7 +48,7 @@ glm_vec4_broadcast(float val, vec4 d) { #if defined(__wasm__) && defined(__wasm_simd128__) glmm_store(d, wasm_f32x4_splat(val)); #elif defined( __SSE__ ) || defined( __SSE2__ ) - glmm_store(d, _mm_set1_ps(val)); + glmm_store(d, glmm_set1(val)); #else d[0] = d[1] = d[2] = d[3] = val; #endif @@ -66,7 +66,7 @@ glm_vec4_fill(vec4 v, float val) { #if defined(__wasm__) && defined(__wasm_simd128__) glmm_store(v, wasm_f32x4_splat(val)); #elif defined( __SSE__ ) || defined( __SSE2__ ) - glmm_store(v, _mm_set1_ps(val)); + glmm_store(v, glmm_set1(val)); #else v[0] = v[1] = v[2] = v[3] = val; #endif diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h index e24675f..a85419b 
100644 --- a/include/cglm/vec4.h +++ b/include/cglm/vec4.h @@ -215,7 +215,7 @@ glm_vec4_one(vec4 v) { #if defined(__wasm__) && defined(__wasm_simd128__) glmm_store(v, wasm_f32x4_const_splat(1.0f)); #elif defined( __SSE__ ) || defined( __SSE2__ ) - glmm_store(v, _mm_set1_ps(1.0f)); + glmm_store(v, glmm_set1(1.0f)); #elif defined(CGLM_NEON_FP) vst1q_f32(v, vdupq_n_f32(1.0f)); #else @@ -367,7 +367,7 @@ glm_vec4_adds(vec4 v, float s, vec4 dest) { #if defined(__wasm__) && defined(__wasm_simd128__) glmm_store(dest, wasm_f32x4_add(glmm_load(v), wasm_f32x4_splat(s))); #elif defined( __SSE__ ) || defined( __SSE2__ ) - glmm_store(dest, _mm_add_ps(glmm_load(v), _mm_set1_ps(s))); + glmm_store(dest, _mm_add_ps(glmm_load(v), glmm_set1(s))); #elif defined(CGLM_NEON_FP) vst1q_f32(dest, vaddq_f32(vld1q_f32(v), vdupq_n_f32(s))); #else @@ -415,7 +415,7 @@ glm_vec4_subs(vec4 v, float s, vec4 dest) { #if defined(__wasm__) && defined(__wasm_simd128__) glmm_store(dest, wasm_f32x4_sub(glmm_load(v), wasm_f32x4_splat(s))); #elif defined( __SSE__ ) || defined( __SSE2__ ) - glmm_store(dest, _mm_sub_ps(glmm_load(v), _mm_set1_ps(s))); + glmm_store(dest, _mm_sub_ps(glmm_load(v), glmm_set1(s))); #elif defined(CGLM_NEON_FP) vst1q_f32(dest, vsubq_f32(vld1q_f32(v), vdupq_n_f32(s))); #else @@ -463,7 +463,7 @@ glm_vec4_scale(vec4 v, float s, vec4 dest) { #if defined(__wasm__) && defined(__wasm_simd128__) glmm_store(dest, wasm_f32x4_mul(glmm_load(v), wasm_f32x4_splat(s))); #elif defined( __SSE__ ) || defined( __SSE2__ ) - glmm_store(dest, _mm_mul_ps(glmm_load(v), _mm_set1_ps(s))); + glmm_store(dest, _mm_mul_ps(glmm_load(v), glmm_set1(s))); #elif defined(CGLM_NEON_FP) vst1q_f32(dest, vmulq_f32(vld1q_f32(v), vdupq_n_f32(s))); #else @@ -528,7 +528,7 @@ glm_vec4_divs(vec4 v, float s, vec4 dest) { #if defined(__wasm__) && defined(__wasm_simd128__) glmm_store(dest, wasm_f32x4_div(glmm_load(v), wasm_f32x4_splat(s))); #elif defined( __SSE__ ) || defined( __SSE2__ ) - glmm_store(dest, _mm_div_ps(glmm_load(v), _mm_set1_ps(s))); + glmm_store(dest, _mm_div_ps(glmm_load(v), glmm_set1(s))); #else glm_vec4_scale(v, 1.0f / s, dest); #endif @@ -1065,8 +1065,8 @@ glm_vec4_clamp(vec4 v, float minVal, float maxVal) { glmm_store(v, glmm_min(glmm_max(glmm_load(v), wasm_f32x4_splat(minVal)), wasm_f32x4_splat(maxVal))); #elif defined( __SSE__ ) || defined( __SSE2__ ) - glmm_store(v, glmm_min(glmm_max(glmm_load(v), _mm_set1_ps(minVal)), - _mm_set1_ps(maxVal))); + glmm_store(v, glmm_min(glmm_max(glmm_load(v), glmm_set1(minVal)), + glmm_set1(maxVal))); #elif defined(CGLM_NEON_FP) glmm_store(v, glmm_min(glmm_max(vld1q_f32(v), vdupq_n_f32(minVal)), vdupq_n_f32(maxVal))); From de66f0a67fff218a7d031f4861b9db3e7b4c3f75 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Wed, 10 Apr 2024 23:49:18 +0300 Subject: [PATCH 15/21] glmm, avx: optimize splat macros --- include/cglm/simd/x86.h | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/include/cglm/simd/x86.h b/include/cglm/simd/x86.h index 657d9ba..b80f335 100644 --- a/include/cglm/simd/x86.h +++ b/include/cglm/simd/x86.h @@ -20,14 +20,6 @@ #define glmm_128 __m128 -#ifdef __AVX__ -# define glmm_set1(x) _mm_broadcast_ss(&x) -# define glmm_set1_ptr(x) _mm_broadcast_ss(x) -#else -# define glmm_set1(x) _mm_set1_ps(x) -# define glmm_set1_ptr(x) _mm_set1_ps(*x) -#endif - #if defined(CGLM_USE_INT_DOMAIN) && defined(__SSE2__) # define glmm_shuff1(xmm, z, y, x, w) \ _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm), \ @@ -39,10 +31,23 @@ #define glmm_splat(x, lane) 
glmm_shuff1(x, lane, lane, lane, lane) -#define glmm_splat_x(x) glmm_splat(x, 0) -#define glmm_splat_y(x) glmm_splat(x, 1) -#define glmm_splat_z(x) glmm_splat(x, 2) -#define glmm_splat_w(x) glmm_splat(x, 3) +#ifdef __AVX__ +# define glmm_set1(x) _mm_broadcast_ss(&x) +# define glmm_set1_ptr(x) _mm_broadcast_ss(x) + +# define glmm_splat_x(x) _mm_broadcastss_ps(x) +# define glmm_splat_y(x) _mm_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1)) +# define glmm_splat_z(x) _mm_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2)) +# define glmm_splat_w(x) _mm_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3)) +#else +# define glmm_set1(x) _mm_set1_ps(x) +# define glmm_set1_ptr(x) _mm_set1_ps(*x) + +# define glmm_splat_x(x) glmm_splat(x, 0) +# define glmm_splat_y(x) glmm_splat(x, 1) +# define glmm_splat_z(x) glmm_splat(x, 2) +# define glmm_splat_w(x) glmm_splat(x, 3) +#endif /* glmm_shuff1x() is DEPRECATED!, use glmm_splat() */ #define glmm_shuff1x(xmm, x) glmm_shuff1(xmm, x, x, x, x) From 480e1de0486b2ba921e452e12b15f5082986c542 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Thu, 11 Apr 2024 21:57:16 +0300 Subject: [PATCH 16/21] sse: make use of int domain as default behavior if possible ( compiler may ignore it ) also use AVX's `_mm_permute_ps`for shuffling single vector --- docs/source/opt.rst | 2 +- include/cglm/simd/x86.h | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/source/opt.rst b/docs/source/opt.rst index e3c4cd2..d41549e 100644 --- a/docs/source/opt.rst +++ b/docs/source/opt.rst @@ -76,7 +76,7 @@ SSE and SSE2 Shuffle Option ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ **_mm_shuffle_ps** generates **shufps** instruction even if registers are same. You can force it to generate **pshufd** instruction by defining -**CGLM_USE_INT_DOMAIN** macro. As default it is not defined. +**CGLM_NO_INT_DOMAIN** macro. As default it is not defined. 
SSE3 and SSE4 Dot Product Options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/include/cglm/simd/x86.h b/include/cglm/simd/x86.h index b80f335..fda4b1c 100644 --- a/include/cglm/simd/x86.h +++ b/include/cglm/simd/x86.h @@ -20,13 +20,18 @@ #define glmm_128 __m128 -#if defined(CGLM_USE_INT_DOMAIN) && defined(__SSE2__) +#ifdef __AVX__ # define glmm_shuff1(xmm, z, y, x, w) \ - _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm), \ - _MM_SHUFFLE(z, y, x, w))) + _mm_permute_ps((xmm), _MM_SHUFFLE(z, y, x, w)) #else -# define glmm_shuff1(xmm, z, y, x, w) \ +# if !defined(CGLM_NO_INT_DOMAIN) && defined(__SSE2__) +# define glmm_shuff1(xmm, z, y, x, w) \ + _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm), \ + _MM_SHUFFLE(z, y, x, w))) +# else +# define glmm_shuff1(xmm, z, y, x, w) \ _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w)) +# endif #endif #define glmm_splat(x, lane) glmm_shuff1(x, lane, lane, lane, lane) From 14c567d9d9e50e4750389c4b755ffa045974c210 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Thu, 11 Apr 2024 21:57:46 +0300 Subject: [PATCH 17/21] sse: drop unused macros: glmm_shuff1x, glmm_shuff2 --- include/cglm/simd/x86.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/include/cglm/simd/x86.h b/include/cglm/simd/x86.h index fda4b1c..ea8577e 100644 --- a/include/cglm/simd/x86.h +++ b/include/cglm/simd/x86.h @@ -54,13 +54,6 @@ # define glmm_splat_w(x) glmm_splat(x, 3) #endif -/* glmm_shuff1x() is DEPRECATED!, use glmm_splat() */ -#define glmm_shuff1x(xmm, x) glmm_shuff1(xmm, x, x, x, x) - -#define glmm_shuff2(a, b, z0, y0, x0, w0, z1, y1, x1, w1) \ - glmm_shuff1(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)), \ - z1, y1, x1, w1) - #ifdef __AVX__ # ifdef CGLM_ALL_UNALIGNED # define glmm_load256(p) _mm256_loadu_ps(p) From 45c1beff516a5305e1a7d6c5138b3e3b3af22f7c Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Fri, 12 Apr 2024 21:53:20 +0300 Subject: [PATCH 18/21] simd: fix glmm_set1, glmm_splat --- include/cglm/simd/neon/mat4.h | 2 +- include/cglm/simd/sse2/mat4.h | 12 ++++++------ include/cglm/simd/x86.h | 31 ++++++++++++++++++------------- include/cglm/vec4.h | 2 +- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h index 92442e3..6cf9811 100644 --- a/include/cglm/simd/neon/mat4.h +++ b/include/cglm/simd/neon/mat4.h @@ -299,7 +299,7 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) { vget_low_f32(vzipq_f32(v2, v3).val[0])); /* - x0 = glmm_div(glmm_set1(1.0f), glmm_vhadd(vmulq_f32(x0, r0))); + x0 = glmm_div(glmm_set1_rval(1.0f), glmm_vhadd(vmulq_f32(x0, r0))); glmm_store(dest[0], vmulq_f32(v0, x0)); glmm_store(dest[1], vmulq_f32(v1, x0)); diff --git a/include/cglm/simd/sse2/mat4.h b/include/cglm/simd/sse2/mat4.h index 5df7254..2127e72 100644 --- a/include/cglm/simd/sse2/mat4.h +++ b/include/cglm/simd/sse2/mat4.h @@ -471,15 +471,15 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) { x8 = _mm_shuffle_ps(x0, x3, _MM_SHUFFLE(3, 1, 3, 1)); /* k c j b */ x9 = _mm_shuffle_ps(x0, x3, _MM_SHUFFLE(2, 0, 2, 0)); /* o g n f */ - x10 = glmm_shuff1(x2, 2, 0, 2, 0); /* p h p h */ - x11 = glmm_shuff1(x2, 3, 1, 3, 1); /* l d l d */ + x10 = glmm_shuff1(x2, 2, 0, 2, 0); /* p h p h */ + x11 = glmm_shuff1(x2, 3, 1, 3, 1); /* l d l d */ -#if 1 /* TODO measure both */ +#if 0 /* TODO measure both */ x12 = _mm_shuffle_ps(x4, x5, _MM_SHUFFLE(1, 0, 1, 0)); /* i a k c */ x13 = _mm_shuffle_ps(x6, x7, _MM_SHUFFLE(1, 0, 1, 0)); /* m e o g */ #else - x12 = _mm_movelh_ps(x4, x5); /* 
i a k c */ - x13 = _mm_movelh_ps(x6, x7); /* m e o g */ + x12 = _mm_movelh_ps(x4, x5); /* i a k c */ + x13 = _mm_movelh_ps(x6, x7); /* m e o g */ #endif t0 = _mm_mul_ps(x12, x10); @@ -494,7 +494,7 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) { /* v0: c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */ /* v1: c5 * c12 + c6 * c11 */ - v5 = glmm_set1(1.0f); + v5 = glmm_set1_rval(1.0f); v0 = glmm_shuff1(t2, 2, 3, 0, 1); v1 = glmm_shuff1(t1, 0, 1, 2, 3); v0 = _mm_mul_ps(t0, v0); diff --git a/include/cglm/simd/x86.h b/include/cglm/simd/x86.h index ea8577e..2410d0f 100644 --- a/include/cglm/simd/x86.h +++ b/include/cglm/simd/x86.h @@ -37,21 +37,26 @@ #define glmm_splat(x, lane) glmm_shuff1(x, lane, lane, lane, lane) #ifdef __AVX__ -# define glmm_set1(x) _mm_broadcast_ss(&x) -# define glmm_set1_ptr(x) _mm_broadcast_ss(x) - -# define glmm_splat_x(x) _mm_broadcastss_ps(x) -# define glmm_splat_y(x) _mm_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1)) -# define glmm_splat_z(x) _mm_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2)) -# define glmm_splat_w(x) _mm_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3)) +# define glmm_set1(x) _mm_broadcast_ss(&x) +# define glmm_set1_ptr(x) _mm_broadcast_ss(x) +# define glmm_set1_rval(x) _mm_set1_ps(x) +# ifdef __AVX2__ +# define glmm_splat_x(x) _mm_broadcastss_ps(x) +# else +# define glmm_splat_x(x) _mm_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0)) +# endif +# define glmm_splat_y(x) _mm_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1)) +# define glmm_splat_z(x) _mm_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2)) +# define glmm_splat_w(x) _mm_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3)) #else -# define glmm_set1(x) _mm_set1_ps(x) -# define glmm_set1_ptr(x) _mm_set1_ps(*x) +# define glmm_set1(x) _mm_set1_ps(x) +# define glmm_set1_ptr(x) _mm_set1_ps(*x) +# define glmm_set1_rval(x) _mm_set1_ps(x) -# define glmm_splat_x(x) glmm_splat(x, 0) -# define glmm_splat_y(x) glmm_splat(x, 1) -# define glmm_splat_z(x) glmm_splat(x, 2) -# define glmm_splat_w(x) glmm_splat(x, 3) +# define glmm_splat_x(x) glmm_splat(x, 0) +# define glmm_splat_y(x) glmm_splat(x, 1) +# define glmm_splat_z(x) glmm_splat(x, 2) +# define glmm_splat_w(x) glmm_splat(x, 3) #endif #ifdef __AVX__ diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h index a85419b..2d18d8c 100644 --- a/include/cglm/vec4.h +++ b/include/cglm/vec4.h @@ -215,7 +215,7 @@ glm_vec4_one(vec4 v) { #if defined(__wasm__) && defined(__wasm_simd128__) glmm_store(v, wasm_f32x4_const_splat(1.0f)); #elif defined( __SSE__ ) || defined( __SSE2__ ) - glmm_store(v, glmm_set1(1.0f)); + glmm_store(v, glmm_set1_rval(1.0f)); #elif defined(CGLM_NEON_FP) vst1q_f32(v, vdupq_n_f32(1.0f)); #else From d75467f93f09c178754a064b62bdd8f960bf2084 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Sat, 13 Apr 2024 00:12:14 +0300 Subject: [PATCH 19/21] avx: implement transpose with AVX --- include/cglm/mat4.h | 4 ++++ include/cglm/simd/avx/mat4.h | 27 +++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h index e1a8214..40a2b24 100644 --- a/include/cglm/mat4.h +++ b/include/cglm/mat4.h @@ -520,6 +520,8 @@ void glm_mat4_transpose_to(mat4 m, mat4 dest) { #if defined(__wasm__) && defined(__wasm_simd128__) glm_mat4_transp_wasm(m, dest); +#elif defined(__AVX__) + glm_mat4_transp_avx(m, dest); #elif defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_transp_sse2(m, dest); #elif defined(CGLM_NEON_FP) @@ -546,6 +548,8 @@ void glm_mat4_transpose(mat4 m) { #if defined(__wasm__) && defined(__wasm_simd128__) glm_mat4_transp_wasm(m, m); +#elif defined(__AVX__) + 
glm_mat4_transp_avx(m, m); #elif defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_transp_sse2(m, m); #elif defined(CGLM_NEON_FP) diff --git a/include/cglm/simd/avx/mat4.h b/include/cglm/simd/avx/mat4.h index f5812de..a8f576a 100644 --- a/include/cglm/simd/avx/mat4.h +++ b/include/cglm/simd/avx/mat4.h @@ -12,8 +12,6 @@ #include "../../common.h" #include "../intrin.h" -#include - CGLM_INLINE void glm_mat4_scale_avx(mat4 m, float s) { @@ -24,6 +22,31 @@ glm_mat4_scale_avx(mat4 m, float s) { glmm_store256(m[2], _mm256_mul_ps(y0, glmm_load256(m[2]))); } +/* TODO: this must be tested and compared to SSE version, may be slower!!! */ +CGLM_INLINE +void +glm_mat4_transp_avx(mat4 m, mat4 dest) { + __m256 y0, y1, y2, y3; + + y0 = glmm_load256(m[0]); /* h g f e d c b a */ + y1 = glmm_load256(m[2]); /* p o n m l k j i */ + + y2 = _mm256_unpacklo_ps(y0, y1); /* n f m e j b i a */ + y3 = _mm256_unpackhi_ps(y0, y1); /* p h o g l d k c */ + + y0 = _mm256_permute2f128_ps(y2, y3, 0x20); /* l d k c j b i a */ + y1 = _mm256_permute2f128_ps(y2, y3, 0x31); /* p h o g n f m e */ + + y2 = _mm256_unpacklo_ps(y0, y1); /* o k g c m i e a */ + y3 = _mm256_unpackhi_ps(y0, y1); /* p l h d n j f b */ + + y0 = _mm256_permute2f128_ps(y2, y3, 0x20); /* n j f b m i e a */ + y1 = _mm256_permute2f128_ps(y2, y3, 0x31); /* p l h d o k g c */ + + glmm_store256(dest[0], y0); + glmm_store256(dest[2], y1); +} + CGLM_INLINE void glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) { From 44cd0ae4fd6fa731506bf7c4af0816c2f0c3097e Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Sat, 13 Apr 2024 00:33:57 +0300 Subject: [PATCH 20/21] avx: optimize avx mat4 scale and mat4 mul --- include/cglm/simd/avx/mat4.h | 64 ++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/include/cglm/simd/avx/mat4.h b/include/cglm/simd/avx/mat4.h index a8f576a..33771c2 100644 --- a/include/cglm/simd/avx/mat4.h +++ b/include/cglm/simd/avx/mat4.h @@ -15,11 +15,18 @@ CGLM_INLINE void glm_mat4_scale_avx(mat4 m, float s) { - __m256 y0; - y0 = _mm256_broadcast_ss(&s); + __m256 y0, y1, y2, y3, y4; - glmm_store256(m[0], _mm256_mul_ps(y0, glmm_load256(m[0]))); - glmm_store256(m[2], _mm256_mul_ps(y0, glmm_load256(m[2]))); + y0 = glmm_load256(m[0]); /* h g f e d c b a */ + y1 = glmm_load256(m[2]); /* p o n m l k j i */ + + y2 = _mm256_broadcast_ss(&s); + + y3 = _mm256_mul_ps(y0, y2); + y4 = _mm256_mul_ps(y1, y2); + + glmm_store256(m[0], y3); + glmm_store256(m[2], y4); } /* TODO: this must be tested and compared to SSE version, may be slower!!! 
*/ @@ -52,7 +59,8 @@ void glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) { /* D = R * L (Column-Major) */ - __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; + __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13; + __m256i yi0, yi1, yi2, yi3; y0 = glmm_load256(m2[0]); /* h g f e d c b a */ y1 = glmm_load256(m2[2]); /* p o n m l k j i */ @@ -64,35 +72,43 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) { y4 = _mm256_permute2f128_ps(y2, y2, 0x03); /* d c b a h g f e */ y5 = _mm256_permute2f128_ps(y3, y3, 0x03); /* l k j i p o n m */ + yi0 = _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0); + yi1 = _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2); + yi2 = _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1); + yi3 = _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3); + /* f f f f a a a a */ /* h h h h c c c c */ /* e e e e b b b b */ /* g g g g d d d d */ - y6 = _mm256_permutevar_ps(y0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); - y7 = _mm256_permutevar_ps(y0, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); - y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1)); - y9 = _mm256_permutevar_ps(y0, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3)); - - glmm_store256(dest[0], - _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), - _mm256_mul_ps(y3, y7)), - _mm256_add_ps(_mm256_mul_ps(y4, y8), - _mm256_mul_ps(y5, y9)))); + y6 = _mm256_permutevar_ps(y0, yi0); + y7 = _mm256_permutevar_ps(y0, yi1); + y8 = _mm256_permutevar_ps(y0, yi2); + y9 = _mm256_permutevar_ps(y0, yi3); /* n n n n i i i i */ /* p p p p k k k k */ /* m m m m j j j j */ /* o o o o l l l l */ - y6 = _mm256_permutevar_ps(y1, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); - y7 = _mm256_permutevar_ps(y1, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); - y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1)); - y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3)); + y10 = _mm256_permutevar_ps(y1, yi0); + y11 = _mm256_permutevar_ps(y1, yi1); + y12 = _mm256_permutevar_ps(y1, yi2); + y13 = _mm256_permutevar_ps(y1, yi3); - glmm_store256(dest[2], - _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), - _mm256_mul_ps(y3, y7)), - _mm256_add_ps(_mm256_mul_ps(y4, y8), - _mm256_mul_ps(y5, y9)))); + y0 = _mm256_mul_ps(y2, y6); + y1 = _mm256_mul_ps(y2, y10); + + y0 = glmm256_fmadd(y3, y7, y0); + y1 = glmm256_fmadd(y3, y11, y1); + + y0 = glmm256_fmadd(y4, y8, y0); + y1 = glmm256_fmadd(y4, y12, y1); + + y0 = glmm256_fmadd(y5, y9, y0); + y1 = glmm256_fmadd(y5, y13, y1); + + glmm_store256(dest[0], y0); + glmm_store256(dest[2], y1); } #endif From 4d0a0a7025943916aadcbe6d2def2b9f351ce7e2 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Sun, 9 Feb 2025 15:30:49 +0300 Subject: [PATCH 21/21] Update wasm.h --- include/cglm/simd/wasm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/cglm/simd/wasm.h b/include/cglm/simd/wasm.h index d9a18cd..69f8301 100644 --- a/include/cglm/simd/wasm.h +++ b/include/cglm/simd/wasm.h @@ -14,8 +14,9 @@ #define glmm_load(p) wasm_v128_load(p) #define glmm_store(p, a) wasm_v128_store(p, (a)) -#define glmm_set1(x) wasm_f32x4_splat(x) -#define glmm_128 v128_t +#define glmm_set1(x) wasm_f32x4_splat(x) +#define glmm_set1_rval(x) wasm_f32x4_splat(x) +#define glmm_128 v128_t #define glmm_shuff1(xmm, z, y, x, w) wasm_i32x4_shuffle(xmm, xmm, w, x, y, z)
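
A closing smoke-test sketch for the series, assuming the cglm umbrella header; the matrix values, tolerances, and function name are illustrative. Whichever backend a build enables (SSE2, NEON, AVX, WASM, or the scalar fallback), the reworked inverse, transpose, and multiply should still satisfy the usual identities:

#include <cglm/cglm.h>
#include <assert.h>
#include <math.h>

static void
smoke_test_mat4(void) {
  mat4 m = {{2.0f, 0.0f, 0.0f, 0.0f},
            {1.0f, 3.0f, 0.0f, 0.0f},
            {0.0f, 4.0f, 5.0f, 0.0f},
            {7.0f, 8.0f, 9.0f, 1.0f}};  /* triangular, det = 30 */
  mat4 inv, prod, t;
  int  i, j;

  glm_mat4_inv(m, inv);            /* scalar / SSE2 / NEON path */
  glm_mat4_mul(inv, m, prod);      /* AVX mul path when enabled  */

  glm_mat4_transpose_to(m, t);     /* AVX transpose when enabled */
  glm_mat4_transpose(t);           /* back to the original       */

  for (i = 0; i < 4; i++) {
    for (j = 0; j < 4; j++) {
      assert(fabsf(prod[i][j] - (i == j ? 1.0f : 0.0f)) < 1e-4f);
      assert(fabsf(t[i][j] - m[i][j]) < 1e-6f);
    }
  }
}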