optimize affine matrix operations with fma

This commit is contained in:
Recep Aslantas
2021-04-18 13:51:22 +03:00
parent 7df5aa2e26
commit 0d0d22f96c

View File

@@ -22,31 +22,32 @@ glm_mul_sse2(mat4 m1, mat4 m2, mat4 dest) {
l1 = glmm_load(m1[1]); l1 = glmm_load(m1[1]);
l2 = glmm_load(m1[2]); l2 = glmm_load(m1[2]);
l3 = glmm_load(m1[3]); l3 = glmm_load(m1[3]);
r = glmm_load(m2[0]); r = glmm_load(m2[0]);
glmm_store(dest[0], glmm_store(dest[0],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0), glmm_fmadd(glmm_shuff1x(r, 0), l0,
_mm_mul_ps(glmm_shuff1x(r, 1), l1)), glmm_fmadd(glmm_shuff1x(r, 1), l1,
_mm_mul_ps(glmm_shuff1x(r, 2), l2))); _mm_mul_ps(glmm_shuff1x(r, 2), l2))));
r = glmm_load(m2[1]); r = glmm_load(m2[1]);
glmm_store(dest[1], glmm_store(dest[1],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0), glmm_fmadd(glmm_shuff1x(r, 0), l0,
_mm_mul_ps(glmm_shuff1x(r, 1), l1)), glmm_fmadd(glmm_shuff1x(r, 1), l1,
_mm_mul_ps(glmm_shuff1x(r, 2), l2))); _mm_mul_ps(glmm_shuff1x(r, 2), l2))));
r = glmm_load(m2[2]); r = glmm_load(m2[2]);
glmm_store(dest[2], glmm_store(dest[2],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0), glmm_fmadd(glmm_shuff1x(r, 0), l0,
_mm_mul_ps(glmm_shuff1x(r, 1), l1)), glmm_fmadd(glmm_shuff1x(r, 1), l1,
_mm_mul_ps(glmm_shuff1x(r, 2), l2))); _mm_mul_ps(glmm_shuff1x(r, 2), l2))));
r = glmm_load(m2[3]); r = glmm_load(m2[3]);
glmm_store(dest[3], glmm_store(dest[3],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0), glmm_fmadd(glmm_shuff1x(r, 0), l0,
_mm_mul_ps(glmm_shuff1x(r, 1), l1)), glmm_fmadd(glmm_shuff1x(r, 1), l1,
_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 2), l2), glmm_fmadd(glmm_shuff1x(r, 2), l2,
_mm_mul_ps(glmm_shuff1x(r, 3), l3)))); _mm_mul_ps(glmm_shuff1x(r, 3),
l3)))));
} }
CGLM_INLINE CGLM_INLINE
@@ -62,21 +63,22 @@ glm_mul_rot_sse2(mat4 m1, mat4 m2, mat4 dest) {
r = glmm_load(m2[0]); r = glmm_load(m2[0]);
glmm_store(dest[0], glmm_store(dest[0],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0), glmm_fmadd(glmm_shuff1x(r, 0), l0,
_mm_mul_ps(glmm_shuff1x(r, 1), l1)), glmm_fmadd(glmm_shuff1x(r, 1), l1,
_mm_mul_ps(glmm_shuff1x(r, 2), l2))); _mm_mul_ps(glmm_shuff1x(r, 2), l2))));
r = glmm_load(m2[1]); r = glmm_load(m2[1]);
glmm_store(dest[1], glmm_store(dest[1],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0), glmm_fmadd(glmm_shuff1x(r, 0), l0,
_mm_mul_ps(glmm_shuff1x(r, 1), l1)), glmm_fmadd(glmm_shuff1x(r, 1), l1,
_mm_mul_ps(glmm_shuff1x(r, 2), l2))); _mm_mul_ps(glmm_shuff1x(r, 2), l2))));
r = glmm_load(m2[2]); r = glmm_load(m2[2]);
glmm_store(dest[2], glmm_store(dest[2],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0), glmm_fmadd(glmm_shuff1x(r, 0), l0,
_mm_mul_ps(glmm_shuff1x(r, 1), l1)), glmm_fmadd(glmm_shuff1x(r, 1), l1,
_mm_mul_ps(glmm_shuff1x(r, 2), l2))); _mm_mul_ps(glmm_shuff1x(r, 2), l2))));
glmm_store(dest[3], l3); glmm_store(dest[3], l3);
} }
@@ -94,9 +96,9 @@ glm_inv_tr_sse2(mat4 mat) {
_MM_TRANSPOSE4_PS(r0, r1, r2, x1); _MM_TRANSPOSE4_PS(r0, r1, r2, x1);
x0 = _mm_add_ps(_mm_mul_ps(r0, glmm_shuff1(r3, 0, 0, 0, 0)), x0 = glmm_fmadd(r0, glmm_shuff1(r3, 0, 0, 0, 0),
_mm_mul_ps(r1, glmm_shuff1(r3, 1, 1, 1, 1))); glmm_fmadd(r1, glmm_shuff1(r3, 1, 1, 1, 1),
x0 = _mm_add_ps(x0, _mm_mul_ps(r2, glmm_shuff1(r3, 2, 2, 2, 2))); _mm_mul_ps(r2, glmm_shuff1(r3, 2, 2, 2, 2))));
x0 = _mm_xor_ps(x0, _mm_set1_ps(-0.f)); x0 = _mm_xor_ps(x0, _mm_set1_ps(-0.f));
x0 = _mm_add_ps(x0, x1); x0 = _mm_add_ps(x0, x1);