From 5b5dbcc15c6ef7541b2944fe6f9ebded28f78b62 Mon Sep 17 00:00:00 2001
From: Recep Aslantas
Date: Mon, 10 Oct 2016 00:08:36 +0300
Subject: [PATCH] optimize affine transform matrices

* reduce some multiplications/additions for transformations matrices
---
 include/cglm-affine-mat-sse2.h | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/cglm-affine-mat.h      | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/cglm-affine.h          |  1 +
 3 files changed, 132 insertions(+)
 create mode 100644 include/cglm-affine-mat-sse2.h
 create mode 100644 include/cglm-affine-mat.h

diff --git a/include/cglm-affine-mat-sse2.h b/include/cglm-affine-mat-sse2.h
new file mode 100644
index 0000000..e3f5667
--- /dev/null
+++ b/include/cglm-affine-mat-sse2.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c), Recep Aslantas.
+ *
+ * MIT License (MIT), http://opensource.org/licenses/MIT
+ * Full license can be found in the LICENSE file
+ */
+
+#ifndef cglm_affine_mat_sse2_h
+#define cglm_affine_mat_sse2_h
+#if defined( __SSE__ ) || defined( __SSE2__ )
+
+#include "cglm-intrin.h"
+#include "cglm.h"
+
+/*
+ * SSE multiply of two affine transform matrices: dest = m1 * m2.
+ *
+ * For the first three columns of m2 only components 0..2 are used, i.e. the
+ * fourth component of those columns is treated as zero and the m1[3] term
+ * is skipped; the last column gets the full four-term product.
+ *
+ * Aligned loads/stores (_mm_load_ps/_mm_store_ps) are used, so every column
+ * of m1, m2 and dest must be 16-byte aligned. m1 is read completely before
+ * dest is written and each m2 column is loaded before its dest column.
+ */
+CGLM_INLINE
+void
+glm_affine_mul_sse2(mat4 m1, mat4 m2, mat4 dest) {
+  /* D = R * L (Column-Major) */
+  __m128 l0, l1, l2, l3, r;
+
+  l0 = _mm_load_ps(m1[0]);
+  l1 = _mm_load_ps(m1[1]);
+  l2 = _mm_load_ps(m1[2]);
+  l3 = _mm_load_ps(m1[3]);
+
+  r = _mm_load_ps(m2[0]);
+  _mm_store_ps(dest[0],
+               _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
+                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
+                          _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
+  r = _mm_load_ps(m2[1]);
+  _mm_store_ps(dest[1],
+               _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
+                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
+                          _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
+  r = _mm_load_ps(m2[2]);
+  _mm_store_ps(dest[2],
+               _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
+                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
+                          _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
+
+  r = _mm_load_ps(m2[3]);
+  _mm_store_ps(dest[3],
+               _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
+                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
+                          _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
+                                     _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
+}
+
+#endif
+#endif /* cglm_affine_mat_sse2_h */
diff --git a/include/cglm-affine-mat.h b/include/cglm-affine-mat.h
new file mode 100644
index 0000000..15c21fc
--- /dev/null
+++ b/include/cglm-affine-mat.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c), Recep Aslantas.
+ *
+ * MIT License (MIT), http://opensource.org/licenses/MIT
+ * Full license can be found in the LICENSE file
+ */
+
+#ifndef cglm_affine_mat_h
+#define cglm_affine_mat_h
+
+#include "cglm.h"
+#include "cglm-mat.h"
+#include "cglm-affine-mat-sse2.h"
+#include <assert.h>
+
+/*
+ * Multiply two affine transform matrices: dest = m1 * m2.
+ *
+ * Cheaper than a general 4x4 multiply: m2[0][3], m2[1][3] and m2[2][3] are
+ * never read (treated as zero), so the first three result columns need
+ * three terms each instead of four.
+ *
+ * All inputs are read before dest is written in the scalar path; with SSE
+ * the work is forwarded to glm_affine_mul_sse2.
+ */
+CGLM_INLINE
+void
+glm_affine_mul(mat4 m1, mat4 m2, mat4 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glm_affine_mul_sse2(m1, m2, dest);
+#else
+  float a00, a01, a02, a03, b00, b01, b02,
+        a10, a11, a12, a13, b10, b11, b12,
+        a20, a21, a22, a23, b20, b21, b22,
+        a30, a31, a32, a33, b30, b31, b32, b33;
+
+  a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3],
+  a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2], a13 = m1[1][3],
+  a20 = m1[2][0], a21 = m1[2][1], a22 = m1[2][2], a23 = m1[2][3],
+  a30 = m1[3][0], a31 = m1[3][1], a32 = m1[3][2], a33 = m1[3][3];
+
+  b00 = m2[0][0], b01 = m2[0][1], b02 = m2[0][2],
+  b10 = m2[1][0], b11 = m2[1][1], b12 = m2[1][2],
+  b20 = m2[2][0], b21 = m2[2][1], b22 = m2[2][2],
+  b30 = m2[3][0], b31 = m2[3][1], b32 = m2[3][2], b33 = m2[3][3];
+
+  dest[0][0] = a00 * b00 + a10 * b01 + a20 * b02;
+  dest[0][1] = a01 * b00 + a11 * b01 + a21 * b02;
+  dest[0][2] = a02 * b00 + a12 * b01 + a22 * b02;
+  dest[0][3] = a03 * b00 + a13 * b01 + a23 * b02;
+
+  dest[1][0] = a00 * b10 + a10 * b11 + a20 * b12;
+  dest[1][1] = a01 * b10 + a11 * b11 + a21 * b12;
+  dest[1][2] = a02 * b10 + a12 * b11 + a22 * b12;
+  dest[1][3] = a03 * b10 + a13 * b11 + a23 * b12;
+
+  dest[2][0] = a00 * b20 + a10 * b21 + a20 * b22;
+  dest[2][1] = a01 * b20 + a11 * b21 + a21 * b22;
+  dest[2][2] = a02 * b20 + a12 * b21 + a22 * b22;
+  dest[2][3] = a03 * b20 + a13 * b21 + a23 * b22;
+
+  dest[3][0] = a00 * b30 + a10 * b31 + a20 * b32 + a30 * b33;
+  dest[3][1] = a01 * b30 + a11 * b31 + a21 * b32 + a31 * b33;
+  dest[3][2] = a02 * b30 + a12 * b31 + a22 * b32 + a32 * b33;
+  dest[3][3] = a03 * b30 + a13 * b31 + a23 * b32 + a33 * b33;
+#endif
+}
+
+#endif /* cglm_affine_mat_h */
diff --git a/include/cglm-affine.h b/include/cglm-affine.h
index 366493f..66ffbd9 100644
--- a/include/cglm-affine.h
+++ b/include/cglm-affine.h
@@ -10,6 +10,7 @@
 
 #include "cglm.h"
 #include "cglm-vec.h"
+#include "cglm-affine-mat.h"
 #include <math.h>
 
 CGLM_INLINE