From b2b50076d29e3cae124bf7e2aba84c0cb13c4c37 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 20 Sep 2016 23:27:00 +0300 Subject: [PATCH] re-optimize mat4 determinant * note: for square matrices det(a) = det(transpose(a)), we can go with row-major calc here --- include/cglm-mat-simd.h | 70 ++++++++++++++++++++++++++--------------- include/cglm-mat.h | 17 +++++----- 2 files changed, 53 insertions(+), 34 deletions(-) diff --git a/include/cglm-mat-simd.h b/include/cglm-mat-simd.h index 6876ee7..6d11456 100644 --- a/include/cglm-mat-simd.h +++ b/include/cglm-mat-simd.h @@ -82,38 +82,56 @@ glm_mat4_mul_sse2(mat4 m1, mat4 m2, mat4 dest) { CGLM_INLINE float glm_mat4_det_sse2(mat4 mat) { - __m128 v0, dt, t0, t1, t2, t3, t4, r0, r1, r2, r3; + __m128 r0, r1, r2, r3, x0, x1, x2; + /* 127 <- 0, [square] det(A) = det(At) */ + r0 = _mm_load_ps(mat[0]); /* d c b a */ + r1 = _mm_load_ps(mat[1]); /* h g f e */ + r2 = _mm_load_ps(mat[2]); /* l k j i */ + r3 = _mm_load_ps(mat[3]); /* p o n m */ - r0 = _mm_load_ps(mat[0]); - r1 = _mm_load_ps(mat[1]); - r2 = _mm_load_ps(mat[2]); - r3 = _mm_load_ps(mat[3]); + /* + t[1] = j * p - n * l; + t[2] = j * o - n * k; + t[3] = i * p - m * l; + t[4] = i * o - m * k; + */ + x0 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle1_ps(r2, 0, 0, 1, 1), + _mm_shuffle1_ps(r3, 2, 3, 2, 3)), + _mm_mul_ps(_mm_shuffle1_ps(r3, 0, 0, 1, 1), + _mm_shuffle1_ps(r2, 2, 3, 2, 3))); + /* + t[0] = k * p - o * l; + t[0] = k * p - o * l; + t[5] = i * n - m * j; + t[5] = i * n - m * j; + */ + x1 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle1_ps(r2, 0, 0, 2, 2), + _mm_shuffle1_ps(r3, 1, 1, 3, 3)), + _mm_mul_ps(_mm_shuffle1_ps(r3, 0, 0, 2, 2), + _mm_shuffle1_ps(r2, 1, 1, 3, 3))); - t3 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle1_ps(r2, 2, 1, 1, 0), - _mm_shuffle1_ps(r3, 3, 3, 2, 3)), - _mm_mul_ps(_mm_shuffle1_ps(r3, 2, 1, 1, 0), - _mm_shuffle1_ps(r2, 3, 3, 2, 3))); + /* + a * (f * t[0] - g * t[1] + h * t[2]) + - b * (e * t[0] - g * t[3] + h * t[4]) + + c * (e * t[1] - f * t[3] + h * t[5]) + - d * (e * t[2] - f * t[4] + g * t[5]) + */ + x2 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle1_ps(r1, 0, 0, 0, 1), + _mm_shuffle_ps(x1, x0, _MM_SHUFFLE(1, 0, 0, 0))), + _mm_mul_ps(_mm_shuffle1_ps(r1, 1, 1, 2, 2), + _mm_shuffle1_ps(x0, 3, 2, 2, 0))); - t4 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle1_ps(r2, 0, 0, 0, 0), - _mm_shuffle1_ps(r3, 2, 1, 2, 1)), - _mm_mul_ps(_mm_shuffle1_ps(r3, 0, 0, 0, 0), - _mm_shuffle1_ps(r2, 2, 1, 2, 1))); + x2 = _mm_add_ps(x2, + _mm_mul_ps(_mm_shuffle1_ps(r1, 2, 3, 3, 3), + _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 2, 3, 1)))); + x2 = _mm_xor_ps(x2, _mm_set_ps(-0.f, 0.f, -0.f, 0.f)); - t0 = _mm_shuffle1_ps(t3, 3, 3, 2, 1); - t1 = _mm_shuffle2_ps(t3, t4, 1, 1, 2, 0, 1, 0, 0, 3); - t2 = _mm_shuffle2_ps(t3, t4, 0, 1, 1, 1, 0, 2, 3, 3); + x0 = _mm_mul_ps(r0, x2); + x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 0, 1, 2, 3)); + x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 3, 3, 1)); - v0 = _mm_mul_ps(_mm_shuffle1_ps(r1, 1, 0, 0, 0), t0); - v0 = _mm_sub_ps(v0, _mm_mul_ps(_mm_shuffle1_ps(r1, 2, 2, 1, 1), t1)); - v0 = _mm_add_ps(v0, _mm_mul_ps(_mm_shuffle1_ps(r1, 3, 3, 3, 2), t2)); - v0 = _mm_xor_ps(v0, _mm_set_ps(0.f, -0.f, 0.f, -0.f)); - - dt = _mm_mul_ps(_mm_shuffle1_ps(r0, 0, 1, 2, 3), v0); - dt = _mm_add_ps(dt, _mm_shuffle1_ps(dt, 0, 1, 2, 3)); - dt = _mm_add_ps(dt, _mm_shuffle1_ps(dt, 1, 3, 3, 1)); - - return _mm_cvtss_f32(dt); + return _mm_cvtss_f32(x0); } CGLM_INLINE diff --git a/include/cglm-mat.h b/include/cglm-mat.h index fcec7c0..26fb4ec 100644 --- a/include/cglm-mat.h +++ b/include/cglm-mat.h @@ -153,16 +153,17 @@ glm_mat4_det(mat4 mat) { #if defined( __SSE__ ) || defined( __SSE2__ ) return glm_mat4_det_sse2(mat); #else + /* [square] det(A) = det(At) */ float t[6]; float a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p; - a = mat[0][0], b = mat[1][0], c = mat[2][0], d = mat[3][0], - e = mat[0][1], f = mat[1][1], g = mat[2][1], h = mat[3][1], - i = mat[0][2], j = mat[1][2], k = mat[2][2], l = mat[3][2], - m = mat[0][3], n = mat[1][3], o = mat[2][3], p = mat[3][3]; + a = mat[0][0], b = mat[0][1], c = mat[0][2], d = mat[0][3], + e = mat[1][0], f = mat[1][1], g = mat[1][2], h = mat[1][3], + i = mat[2][0], j = mat[2][1], k = mat[2][2], l = mat[2][3], + m = mat[3][0], n = mat[3][1], o = mat[3][2], p = mat[3][3]; t[0] = k * p - o * l; t[1] = j * p - n * l; @@ -171,10 +172,10 @@ glm_mat4_det(mat4 mat) { t[4] = i * o - m * k; t[5] = i * n - m * j; - return a * (f * t[0] - g * t[1] + h * t[2]) - - b * (e * t[0] - g * t[3] + h * t[4]) - + c * (e * t[1] - f * t[3] + h * t[5]) - - d * (e * t[2] - f * t[4] + g * t[5]); + return a * (f * t[0] - g * t[1] + h * t[2]) + - b * (e * t[0] - g * t[3] + h * t[4]) + + c * (e * t[1] - f * t[3] + h * t[5]) + - d * (e * t[2] - f * t[4] + g * t[5]); #endif }