Compare commits

...

35 Commits

Author SHA1 Message Date
Recep Aslantas
607182982d Merge branch 'master' into simd-3 2021-05-07 14:08:56 +03:00
Recep Aslantas
ad17f38934 Merge pull request #195 from legends2k/master
Add nlerp for quaternions
2021-05-07 12:58:05 +03:00
Sundaram Ramaswamy
8302f78484 Update documentation with nlerp 2021-05-07 14:26:31 +05:30
Sundaram Ramaswamy
9665be3138 Add struct API wrapper 2021-05-07 13:49:44 +05:30
Recep Aslantas
5c22ca3abb arm, neon: use negate instruction instead of xor in glm_inv_tr_neon() 2021-05-07 01:52:12 +03:00
Recep Aslantas
7f9585ca72 arm, neon: improve hadd performance 2021-05-07 01:46:24 +03:00
Recep Aslantas
d0ab3aaa2e arm, neon: util macros 2021-05-07 01:46:03 +03:00
Sundaram Ramaswamy
83dbdcc4a9 Add nlerp tests 2021-05-07 01:21:28 +05:30
Sundaram Ramaswamy
8ce45b4303 Add call and struct interfaces 2021-05-07 00:35:20 +05:30
Sundaram Ramaswamy
f19ff5d064 Use scale for both cases 2021-05-06 23:47:03 +05:30
Sundaram Ramaswamy
d6b93f052e Add nlerp for quaternions
Normalized linear interpolation for quaternions is a cheaper
alternative to slerp. This PR adds the nlerp operation for quaternions.
2021-05-06 23:38:26 +05:30
Recep Aslantas
28705be5a3 simd, sse: reduce some computation at glm_mul_rot_sse2() 2021-05-01 23:16:03 +03:00
Recep Aslantas
e1b142bce7 add todo to quat.h 2021-05-01 23:03:41 +03:00
Recep Aslantas
0f96eaad20 sse2: optimize glm_mat3_mul_sse2() with sse2
* reduce memory access for dest[2][2]
* the speed is increased ;)
2021-05-01 22:55:19 +03:00
Recep Aslantas
d5d3178ae0 Merge pull request #192 from Winter091/fixing-typo
fix typo: vec3 -> vec4
2021-05-01 21:24:31 +03:00
winter091
13269f4af8 fix typo: vec3 -> vec4 2021-05-01 16:48:31 +03:00
Recep Aslantas
faf6186c29 sse: optimize glm_mat2_mul_sse2 with sse 2021-05-01 03:44:04 +03:00
Recep Aslantas
2be6ac949b sse: optimize glm_quat_mul with sse 2021-05-01 03:18:26 +03:00
Recep Aslantas
5b7bc522ac sse: optimize affine with sse
* re-order instructions for ILP
2021-05-01 02:58:14 +03:00
Recep Aslantas
376cf31ee7 arm, neon: optimize affine with neon 2021-05-01 02:46:14 +03:00
Recep Aslantas
d28b381dd6 arm, neon: optimize mat4 mul with neon 2021-05-01 02:45:15 +03:00
Recep Aslantas
3673622cc3 simd, sse: optimize mat4 mul-v with sse
* re-order instructions for ILP
2021-05-01 02:17:34 +03:00
Recep Aslantas
a90f706e12 simd, sse: optimize mat4 mul with sse
* re-order instructions for ILP
2021-05-01 02:17:08 +03:00
Recep Aslantas
c065d71a2f simd, sse: optimize mat4 inv with sse
* reduce a few shuffles
* re-order instructions for ILP
2021-04-30 21:12:17 +03:00
Recep Aslantas
1b3b91fe0b Merge pull request #191 from quadroli/master
very minor correction in readme
2021-04-30 20:35:15 +03:00
quadroli
17560a0687 very minor correction in readme 2021-04-30 18:03:50 +03:00
Recep Aslantas
ba634d6c83 simd: optimize glm_mat4_zero() with simd 2021-04-30 04:04:27 +03:00
Recep Aslantas
f35badd436 now working on v0.8.3 2021-04-30 01:00:42 +03:00
Recep Aslantas
04eaf9c535 arm, neon: neon/fma support for glm_quat_mul() 2021-04-29 01:12:00 +03:00
Recep Aslantas
bd6641bd0a build: add missing files to build files 2021-04-28 22:45:03 +03:00
Recep Aslantas
4e4bff418d arm, neon: neon/fma support for glm_mat2_mul() 2021-04-28 22:06:46 +03:00
Recep Aslantas
55ebbdbe40 arm, neon: neon/fma support for glm_inv_tr() 2021-04-28 14:46:14 +03:00
Recep Aslantas
e4c35e32fc Merge pull request #190 from ylecuyer/patch-3
Minor typo in doc
2021-04-27 23:52:40 +03:00
Yoann Lecuyer
ec467fef1f Minor typo in doc
I stumbled upon it while reading the doc
2021-04-27 22:09:13 +02:00
Recep Aslantas
1e8865233b Merge pull request #189 from recp/simd-2
ARM Neon Update
2021-04-25 15:20:24 +03:00
33 changed files with 693 additions and 244 deletions

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.8.2) cmake_minimum_required(VERSION 3.8.2)
project(cglm VERSION 0.8.2 LANGUAGES C) project(cglm VERSION 0.8.3 LANGUAGES C)
set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED YES) set(CMAKE_C_STANDARD_REQUIRED YES)

View File

@@ -109,7 +109,10 @@ cglm_simd_avx_HEADERS = include/cglm/simd/avx/mat4.h \
include/cglm/simd/avx/affine.h include/cglm/simd/avx/affine.h
cglm_simd_neondir=$(includedir)/cglm/simd/neon cglm_simd_neondir=$(includedir)/cglm/simd/neon
cglm_simd_neon_HEADERS = include/cglm/simd/neon/mat4.h cglm_simd_neon_HEADERS = include/cglm/simd/neon/mat4.h \
include/cglm/simd/neon/mat2.h \
include/cglm/simd/neon/affine.h \
include/cglm/simd/neon/quat.h
cglm_structdir=$(includedir)/cglm/struct cglm_structdir=$(includedir)/cglm/struct
cglm_struct_HEADERS = include/cglm/struct/mat4.h \ cglm_struct_HEADERS = include/cglm/struct/mat4.h \

View File

@@ -95,9 +95,9 @@ Currently *cglm* uses default clip space configuration (-1, 1) for camera functi
<hr /> <hr />
You have two option to call a function/operation: inline or library call (link) You have two options to call a function/operation: inline or library call (link)
Almost all functions are marked inline (always_inline) so compiler will probably inline. Almost all functions are marked inline (always_inline) so compiler will probably inline.
To call pre-compiled version, just use `glmc_` (c stands for 'call') instead of `glm_`. To call pre-compiled versions, just use `glmc_` (c stands for 'call') instead of `glm_`.
```C ```C
#include <cglm/cglm.h> /* for inline */ #include <cglm/cglm.h> /* for inline */

View File

@@ -2,7 +2,7 @@ Pod::Spec.new do |s|
# Description # Description
s.name = "cglm" s.name = "cglm"
s.version = "0.8.1" s.version = "0.8.2"
s.summary = "📽 Highly Optimized Graphics Math (glm) for C" s.summary = "📽 Highly Optimized Graphics Math (glm) for C"
s.description = <<-DESC s.description = <<-DESC
cglm is math library for graphics programming for C. See the documentation or README for all features. cglm is math library for graphics programming for C. See the documentation or README for all features.

View File

@@ -7,7 +7,7 @@
#***************************************************************************** #*****************************************************************************
AC_PREREQ([2.69]) AC_PREREQ([2.69])
AC_INIT([cglm], [0.8.2], [info@recp.me]) AC_INIT([cglm], [0.8.3], [info@recp.me])
AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects serial-tests]) AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects serial-tests])
# Don't use the default cflags (-O2 -g), we set ours manually in Makefile.am. # Don't use the default cflags (-O2 -g), we set ours manually in Makefile.am.

View File

@@ -62,9 +62,9 @@ author = u'Recep Aslantas'
# built documents. # built documents.
# #
# The short X.Y version. # The short X.Y version.
version = u'0.8.2' version = u'0.8.3'
# The full version, including alpha/beta/rc tags. # The full version, including alpha/beta/rc tags.
release = u'0.8.2' release = u'0.8.3'
# The language for content autogenerated by Sphinx. Refer to documentation # The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages. # for a list of supported languages.

View File

@@ -2,7 +2,7 @@ How to send vector or matrix to OpenGL like API
================================================== ==================================================
*cglm*'s vector and matrix types are arrays. So you can send them directly to a *cglm*'s vector and matrix types are arrays. So you can send them directly to a
function which accecpts pointer. But you may got warnings for matrix because it is function which accepts pointer. But you may got warnings for matrix because it is
two dimensional array. two dimensional array.
Passing / Uniforming Matrix to OpenGL: Passing / Uniforming Matrix to OpenGL:
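A minimal sketch of the kind of call that section describes (standard OpenGL API; `mvp_loc`/`light_loc` are placeholder uniform locations and a GL loader header is assumed):

```C
#include <cglm/cglm.h>
/* plus your GL loader header, e.g. <glad/glad.h> */

void upload_demo(GLint mvp_loc, GLint light_loc) {
  mat4 mvp   = GLM_MAT4_IDENTITY_INIT;
  vec3 light = {1.0f, 2.0f, 3.0f};

  /* mat4 is float[4][4]; passing mvp[0] (or casting to float*) avoids
     the two-dimensional-array pointer warning mentioned above */
  glUniformMatrix4fv(mvp_loc, 1, GL_FALSE, mvp[0]);
  glUniform3fv(light_loc, 1, light);   /* vec3 decays to float* directly */
}
```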

View File

@@ -52,6 +52,7 @@ Functions:
#. :c:func:`glm_quat_mat3` #. :c:func:`glm_quat_mat3`
#. :c:func:`glm_quat_mat3t` #. :c:func:`glm_quat_mat3t`
#. :c:func:`glm_quat_lerp` #. :c:func:`glm_quat_lerp`
#. :c:func:`glm_quat_nlerp`
#. :c:func:`glm_quat_slerp` #. :c:func:`glm_quat_slerp`
#. :c:func:`glm_quat_look` #. :c:func:`glm_quat_look`
#. :c:func:`glm_quat_for` #. :c:func:`glm_quat_for`
@@ -304,6 +305,25 @@ Functions documentation
| *[in]* **t** interpolant (amount) clamped between 0 and 1 | *[in]* **t** interpolant (amount) clamped between 0 and 1
| *[out]* **dest** result quaternion | *[out]* **dest** result quaternion
.. c:function:: void glm_quat_nlerp(versor q, versor r, float t, versor dest)
| interpolates between two quaternions
| taking the shortest rotation path using
| normalized linear interpolation (NLERP)
| This is a cheaper alternative to slerp; most games use nlerp
| for animations as it visually makes little difference.
References:
* `Understanding Slerp, Then Not Using it <http://number-none.com/product/Understanding%20Slerp,%20Then%20Not%20Using%20It>`_
* `Lerp, Slerp and Nlerp <https://keithmaggio.wordpress.com/2011/02/15/math-magician-lerp-slerp-and-nlerp/>`_
Parameters:
| *[in]* **from** from
| *[in]* **to** to
| *[in]* **t** interpolant (amount) clamped between 0 and 1
| *[out]* **dest** result quaternion
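A minimal usage sketch of the new function (axis/angle values are illustrative only; `glmc_quat_nlerp` is the pre-compiled variant added in this change set):

```C
#include <cglm/cglm.h>
#include <cglm/call.h>   /* for the glmc_ (pre-compiled) variants */

void nlerp_demo(void) {
  versor a, b, out;
  vec3 axis = {0.0f, 1.0f, 0.0f};

  glm_quatv(a, glm_rad(10.0f), axis);   /* 10 degrees about Y */
  glm_quatv(b, glm_rad(80.0f), axis);   /* 80 degrees about Y */

  /* blend 35% of the way from a to b; the result comes back normalized */
  glm_quat_nlerp(a, b, 0.35f, out);

  /* library-call version */
  glmc_quat_nlerp(a, b, 0.35f, out);
}
```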
.. c:function:: void glm_quat_slerp(versor q, versor r, float t, versor dest) .. c:function:: void glm_quat_slerp(versor q, versor r, float t, versor dest)
| interpolates between two quaternions | interpolates between two quaternions

View File

@@ -158,6 +158,8 @@ void
glm_inv_tr(mat4 mat) { glm_inv_tr(mat4 mat) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
glm_inv_tr_sse2(mat); glm_inv_tr_sse2(mat);
#elif defined(CGLM_NEON_FP)
glm_inv_tr_neon(mat);
#else #else
CGLM_ALIGN_MAT mat3 r; CGLM_ALIGN_MAT mat3 r;
CGLM_ALIGN(8) vec3 t; CGLM_ALIGN(8) vec3 t;

View File

@@ -116,11 +116,15 @@ glmc_quat_mat3t(versor q, mat3 dest);
CGLM_EXPORT CGLM_EXPORT
void void
glmc_quat_lerp(versor from, versor to, float t, versor dest); glmc_quat_lerp(versor from, versor to, float t, versor dest);
CGLM_EXPORT CGLM_EXPORT
void void
glmc_quat_lerpc(versor from, versor to, float t, versor dest); glmc_quat_lerpc(versor from, versor to, float t, versor dest);
CGLM_EXPORT
void
glmc_quat_nlerp(versor q, versor r, float t, versor dest);
CGLM_EXPORT CGLM_EXPORT
void void
glmc_quat_slerp(versor q, versor r, float t, versor dest); glmc_quat_slerp(versor q, versor r, float t, versor dest);

View File

@@ -99,7 +99,7 @@ glmc_vec4_scale(vec4 v, float s, vec4 dest);
CGLM_EXPORT CGLM_EXPORT
void void
glmc_vec4_scale_as(vec3 v, float s, vec3 dest); glmc_vec4_scale_as(vec4 v, float s, vec4 dest);
CGLM_EXPORT CGLM_EXPORT
void void

View File

@@ -40,6 +40,10 @@
# include "simd/sse2/mat2.h" # include "simd/sse2/mat2.h"
#endif #endif
#ifdef CGLM_NEON_FP
# include "simd/neon/mat2.h"
#endif
#define GLM_MAT2_IDENTITY_INIT {{1.0f, 0.0f}, {0.0f, 1.0f}} #define GLM_MAT2_IDENTITY_INIT {{1.0f, 0.0f}, {0.0f, 1.0f}}
#define GLM_MAT2_ZERO_INIT {{0.0f, 0.0f}, {0.0f, 0.0f}} #define GLM_MAT2_ZERO_INIT {{0.0f, 0.0f}, {0.0f, 0.0f}}
@@ -130,6 +134,8 @@ void
glm_mat2_mul(mat2 m1, mat2 m2, mat2 dest) { glm_mat2_mul(mat2 m1, mat2 m2, mat2 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
glm_mat2_mul_sse2(m1, m2, dest); glm_mat2_mul_sse2(m1, m2, dest);
#elif defined(CGLM_NEON_FP)
glm_mat2_mul_neon(m1, m2, dest);
#else #else
float a00 = m1[0][0], a01 = m1[0][1], float a00 = m1[0][0], a01 = m1[0][1],
a10 = m1[1][0], a11 = m1[1][1], a10 = m1[1][0], a11 = m1[1][1],

View File

@@ -187,8 +187,29 @@ glm_mat4_identity_array(mat4 * __restrict mat, size_t count) {
CGLM_INLINE CGLM_INLINE
void void
glm_mat4_zero(mat4 mat) { glm_mat4_zero(mat4 mat) {
#ifdef __AVX__
__m256 y0;
y0 = _mm256_setzero_ps();
glmm_store256(mat[0], y0);
glmm_store256(mat[2], y0);
#elif defined( __SSE__ ) || defined( __SSE2__ )
glmm_128 x0;
x0 = _mm_setzero_ps();
glmm_store(mat[0], x0);
glmm_store(mat[1], x0);
glmm_store(mat[2], x0);
glmm_store(mat[3], x0);
#elif defined(CGLM_NEON_FP)
glmm_128 x0;
x0 = vdupq_n_f32(0.0f);
vst1q_f32(mat[0], x0);
vst1q_f32(mat[1], x0);
vst1q_f32(mat[2], x0);
vst1q_f32(mat[3], x0);
#else
CGLM_ALIGN_MAT mat4 t = GLM_MAT4_ZERO_INIT; CGLM_ALIGN_MAT mat4 t = GLM_MAT4_ZERO_INIT;
glm_mat4_copy(t, mat); glm_mat4_copy(t, mat);
#endif
} }
/*! /*!

View File

@@ -38,6 +38,7 @@
CGLM_INLINE void glm_quat_lerp(versor from, versor to, float t, versor dest); CGLM_INLINE void glm_quat_lerp(versor from, versor to, float t, versor dest);
CGLM_INLINE void glm_quat_lerpc(versor from, versor to, float t, versor dest); CGLM_INLINE void glm_quat_lerpc(versor from, versor to, float t, versor dest);
CGLM_INLINE void glm_quat_slerp(versor q, versor r, float t, versor dest); CGLM_INLINE void glm_quat_slerp(versor q, versor r, float t, versor dest);
CGLM_INLINE void glm_quat_nlerp(versor q, versor r, float t, versor dest);
CGLM_INLINE void glm_quat_look(vec3 eye, versor ori, mat4 dest); CGLM_INLINE void glm_quat_look(vec3 eye, versor ori, mat4 dest);
CGLM_INLINE void glm_quat_for(vec3 dir, vec3 fwd, vec3 up, versor dest); CGLM_INLINE void glm_quat_for(vec3 dir, vec3 fwd, vec3 up, versor dest);
CGLM_INLINE void glm_quat_forp(vec3 from, CGLM_INLINE void glm_quat_forp(vec3 from,
@@ -63,6 +64,10 @@
# include "simd/sse2/quat.h" # include "simd/sse2/quat.h"
#endif #endif
#ifdef CGLM_NEON_FP
# include "simd/neon/quat.h"
#endif
CGLM_INLINE CGLM_INLINE
void void
glm_mat4_mulv(mat4 m, vec4 v, vec4 dest); glm_mat4_mulv(mat4 m, vec4 v, vec4 dest);
@@ -412,6 +417,8 @@ glm_quat_mul(versor p, versor q, versor dest) {
*/ */
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
glm_quat_mul_sse2(p, q, dest); glm_quat_mul_sse2(p, q, dest);
#elif defined(CGLM_NEON_FP)
glm_quat_mul_neon(p, q, dest);
#else #else
dest[0] = p[3] * q[0] + p[0] * q[3] + p[1] * q[2] - p[2] * q[1]; dest[0] = p[3] * q[0] + p[0] * q[3] + p[1] * q[2] - p[2] * q[1];
dest[1] = p[3] * q[1] - p[0] * q[2] + p[1] * q[3] + p[2] * q[0]; dest[1] = p[3] * q[1] - p[0] * q[2] + p[1] * q[3] + p[2] * q[0];
@@ -622,6 +629,26 @@ glm_quat_lerpc(versor from, versor to, float t, versor dest) {
glm_vec4_lerpc(from, to, t, dest); glm_vec4_lerpc(from, to, t, dest);
} }
/*!
* @brief interpolates between two quaternions
* taking the shortest rotation path using
* normalized linear interpolation (NLERP)
*
* @param[in] from from
* @param[in] to to
* @param[in] t interpolant (amount)
* @param[out] dest result quaternion
*/
CGLM_INLINE
void
glm_quat_nlerp(versor from, versor to, float t, versor dest) {
float dot = glm_vec4_dot(from, to);
versor target;
glm_vec4_scale(to, (dot >= 0) ? 1 : -1, target);
glm_quat_lerp(from, target, t, dest);
glm_quat_normalize(dest);
}
/*! /*!
* @brief interpolates between two quaternions * @brief interpolates between two quaternions
* using spherical linear interpolation (SLERP) * using spherical linear interpolation (SLERP)

View File

@@ -29,6 +29,15 @@
vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a), \ vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a), \
vreinterpretq_s32_f32(b))) vreinterpretq_s32_f32(b)))
#define glmm_swplane(v) vextq_f32(v, v, 2)
#define glmm_low(x) vget_low_f32(x)
#define glmm_high(x) vget_high_f32(x)
#define glmm_combine_ll(x, y) vcombine_f32(vget_low_f32(x), vget_low_f32(y))
#define glmm_combine_hl(x, y) vcombine_f32(vget_high_f32(x), vget_low_f32(y))
#define glmm_combine_lh(x, y) vcombine_f32(vget_low_f32(x), vget_high_f32(y))
#define glmm_combine_hh(x, y) vcombine_f32(vget_high_f32(x), vget_high_f32(y))
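For reference, a lane-layout sketch of the new NEON helpers (written as comments; low lanes listed first):

```C
/* a = {a0 a1 a2 a3}, b = {b0 b1 b2 b3}
   glmm_combine_ll(a, b) -> {a0 a1 b0 b1}
   glmm_combine_hl(a, b) -> {a2 a3 b0 b1}
   glmm_combine_lh(a, b) -> {a0 a1 b2 b3}
   glmm_combine_hh(a, b) -> {a2 a3 b2 b3}
   glmm_swplane(v)       -> {v2 v3 v0 v1}   (swap the two 64-bit halves) */
```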
static inline static inline
float32x4_t float32x4_t
glmm_abs(float32x4_t v) { glmm_abs(float32x4_t v) {
@@ -38,8 +47,13 @@ glmm_abs(float32x4_t v) {
static inline static inline
float32x4_t float32x4_t
glmm_vhadd(float32x4_t v) { glmm_vhadd(float32x4_t v) {
v = vaddq_f32(v, vrev64q_f32(v)); return vaddq_f32(vaddq_f32(glmm_splat_x(v), glmm_splat_y(v)),
return vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v))); vaddq_f32(glmm_splat_z(v), glmm_splat_w(v)));
/*
this seems slower:
v = vaddq_f32(v, vrev64q_f32(v));
return vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v)));
*/
} }
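glmm_vhadd now builds the horizontal sum from four splats; the old rev64/combine sequence is kept in a comment as the slower variant. Either way the result is the same — a scalar model of what the helper returns (a sketch; `vhadd_ref` is a hypothetical reference, not the NEON path):

```C
#include <cglm/cglm.h>

/* every lane of the result holds the horizontal sum v[0]+v[1]+v[2]+v[3] */
static inline void vhadd_ref(vec4 v, vec4 dest) {
  float s = v[0] + v[1] + v[2] + v[3];
  dest[0] = dest[1] = dest[2] = dest[3] = s;
}
```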
static inline static inline

View File

@@ -17,29 +17,32 @@ void
glm_mul_neon(mat4 m1, mat4 m2, mat4 dest) { glm_mul_neon(mat4 m1, mat4 m2, mat4 dest) {
/* D = R * L (Column-Major) */ /* D = R * L (Column-Major) */
glmm_128 l0, l1, l2, l3, r0, r1, r2, r3, v0, v1, v2, v3; glmm_128 l, r0, r1, r2, r3, v0, v1, v2, v3;
l0 = glmm_load(m1[0]); r0 = glmm_load(m2[0]); l = glmm_load(m1[0]);
l1 = glmm_load(m1[1]); r1 = glmm_load(m2[1]); r0 = glmm_load(m2[0]);
l2 = glmm_load(m1[2]); r2 = glmm_load(m2[2]); r1 = glmm_load(m2[1]);
l3 = glmm_load(m1[3]); r3 = glmm_load(m2[3]); r2 = glmm_load(m2[2]);
r3 = glmm_load(m2[3]);
v0 = vmulq_f32(glmm_splat_x(r0), l0); v0 = vmulq_f32(glmm_splat_x(r0), l);
v1 = vmulq_f32(glmm_splat_x(r1), l0); v1 = vmulq_f32(glmm_splat_x(r1), l);
v2 = vmulq_f32(glmm_splat_x(r2), l0); v2 = vmulq_f32(glmm_splat_x(r2), l);
v3 = vmulq_f32(glmm_splat_x(r3), l0); v3 = vmulq_f32(glmm_splat_x(r3), l);
v0 = glmm_fmadd(glmm_splat_y(r0), l1, v0); l = glmm_load(m1[1]);
v1 = glmm_fmadd(glmm_splat_y(r1), l1, v1); v0 = glmm_fmadd(glmm_splat_y(r0), l, v0);
v2 = glmm_fmadd(glmm_splat_y(r2), l1, v2); v1 = glmm_fmadd(glmm_splat_y(r1), l, v1);
v3 = glmm_fmadd(glmm_splat_y(r3), l1, v3); v2 = glmm_fmadd(glmm_splat_y(r2), l, v2);
v3 = glmm_fmadd(glmm_splat_y(r3), l, v3);
v0 = glmm_fmadd(glmm_splat_z(r0), l2, v0); l = glmm_load(m1[2]);
v1 = glmm_fmadd(glmm_splat_z(r1), l2, v1); v0 = glmm_fmadd(glmm_splat_z(r0), l, v0);
v2 = glmm_fmadd(glmm_splat_z(r2), l2, v2); v1 = glmm_fmadd(glmm_splat_z(r1), l, v1);
v3 = glmm_fmadd(glmm_splat_z(r3), l2, v3); v2 = glmm_fmadd(glmm_splat_z(r2), l, v2);
v3 = glmm_fmadd(glmm_splat_z(r3), l, v3);
v3 = glmm_fmadd(glmm_splat_w(r3), l3, v3); v3 = glmm_fmadd(glmm_splat_w(r3), glmm_load(m1[3]), v3);
glmm_store(dest[0], v0); glmm_store(dest[0], v0);
glmm_store(dest[1], v1); glmm_store(dest[1], v1);
@@ -52,23 +55,26 @@ void
glm_mul_rot_neon(mat4 m1, mat4 m2, mat4 dest) { glm_mul_rot_neon(mat4 m1, mat4 m2, mat4 dest) {
/* D = R * L (Column-Major) */ /* D = R * L (Column-Major) */
glmm_128 l0, l1, l2, r0, r1, r2, v0, v1, v2; glmm_128 l, r0, r1, r2, v0, v1, v2;
l0 = glmm_load(m1[0]); r0 = glmm_load(m2[0]); l = glmm_load(m1[0]);
l1 = glmm_load(m1[1]); r1 = glmm_load(m2[1]); r0 = glmm_load(m2[0]);
l2 = glmm_load(m1[2]); r2 = glmm_load(m2[2]); r1 = glmm_load(m2[1]);
r2 = glmm_load(m2[2]);
v0 = vmulq_f32(glmm_splat_x(r0), l0); v0 = vmulq_f32(glmm_splat_x(r0), l);
v1 = vmulq_f32(glmm_splat_x(r1), l0); v1 = vmulq_f32(glmm_splat_x(r1), l);
v2 = vmulq_f32(glmm_splat_x(r2), l0); v2 = vmulq_f32(glmm_splat_x(r2), l);
v0 = glmm_fmadd(glmm_splat_y(r0), l1, v0); l = glmm_load(m1[1]);
v1 = glmm_fmadd(glmm_splat_y(r1), l1, v1); v0 = glmm_fmadd(glmm_splat_y(r0), l, v0);
v2 = glmm_fmadd(glmm_splat_y(r2), l1, v2); v1 = glmm_fmadd(glmm_splat_y(r1), l, v1);
v2 = glmm_fmadd(glmm_splat_y(r2), l, v2);
v0 = glmm_fmadd(glmm_splat_z(r0), l2, v0);
v1 = glmm_fmadd(glmm_splat_z(r1), l2, v1); l = glmm_load(m1[2]);
v2 = glmm_fmadd(glmm_splat_z(r2), l2, v2); v0 = glmm_fmadd(glmm_splat_z(r0), l, v0);
v1 = glmm_fmadd(glmm_splat_z(r1), l, v1);
v2 = glmm_fmadd(glmm_splat_z(r2), l, v2);
glmm_store(dest[0], v0); glmm_store(dest[0], v0);
glmm_store(dest[1], v1); glmm_store(dest[1], v1);
@@ -76,5 +82,41 @@ glm_mul_rot_neon(mat4 m1, mat4 m2, mat4 dest) {
glmm_store(dest[3], glmm_load(m1[3])); glmm_store(dest[3], glmm_load(m1[3]));
} }
CGLM_INLINE
void
glm_inv_tr_neon(mat4 mat) {
float32x4x4_t vmat;
glmm_128 r0, r1, r2, r3, x0;
vmat = vld4q_f32(mat[0]);
r0 = vmat.val[0];
r1 = vmat.val[1];
r2 = vmat.val[2];
r3 = vmat.val[3];
x0 = glmm_fmadd(r0, glmm_splat_w(r0),
glmm_fmadd(r1, glmm_splat_w(r1),
vmulq_f32(r2, glmm_splat_w(r2))));
x0 = vnegq_f32(x0);
glmm_store(mat[0], r0);
glmm_store(mat[1], r1);
glmm_store(mat[2], r2);
glmm_store(mat[3], x0);
mat[0][3] = 0.0f;
mat[1][3] = 0.0f;
mat[2][3] = 0.0f;
mat[3][3] = 1.0f;
/* TODO: ?
zo = vget_high_f32(r3);
vst1_lane_f32(&mat[0][3], zo, 0);
vst1_lane_f32(&mat[1][3], zo, 0);
vst1_lane_f32(&mat[2][3], zo, 0);
vst1_lane_f32(&mat[3][3], zo, 1);
*/
}
#endif #endif
#endif /* cglm_affine_neon_h */ #endif /* cglm_affine_neon_h */
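glm_inv_tr_neon above is the NEON path of glm_inv_tr, which inverts a translate*rotate matrix in place. A minimal usage sketch (assumes the matrix has no scale/shear, i.e. the 3x3 part is orthonormal; `inv_tr_demo` is a hypothetical helper):

```C
#include <cglm/cglm.h>

void inv_tr_demo(void) {
  mat4 m, inv;
  vec3 t    = {1.0f, 2.0f, 3.0f};
  vec3 axis = {0.0f, 0.0f, 1.0f};

  glm_mat4_identity(m);
  glm_translate(m, t);
  glm_rotate(m, glm_rad(45.0f), axis);

  glm_mat4_copy(m, inv);
  glm_inv_tr(inv);               /* cheaper than glm_mat4_inv for this case */
  glm_mat4_mul(m, inv, inv);     /* inv is now ~identity */
}
```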

View File

@@ -0,0 +1,44 @@
/*
* Copyright (c), Recep Aslantas.
*
* MIT License (MIT), http://opensource.org/licenses/MIT
* Full license can be found in the LICENSE file
*/
#ifndef cglm_mat2_neon_h
#define cglm_mat2_neon_h
#if defined(__ARM_NEON_FP)
#include "../../common.h"
#include "../intrin.h"
CGLM_INLINE
void
glm_mat2_mul_neon(mat2 m1, mat2 m2, mat2 dest) {
float32x4x2_t a1;
glmm_128 x0, x1, x2;
float32x2_t dc, ba;
x1 = glmm_load(m1[0]); /* d c b a */
x2 = glmm_load(m2[0]); /* h g f e */
dc = vget_high_f32(x1);
ba = vget_low_f32(x1);
/* g g e e, h h f f */
a1 = vtrnq_f32(x2, x2);
/*
dest[0][0] = a * e + c * f;
dest[0][1] = b * e + d * f;
dest[1][0] = a * g + c * h;
dest[1][1] = b * g + d * h;
*/
x0 = glmm_fmadd(vcombine_f32(ba, ba), a1.val[0],
vmulq_f32(vcombine_f32(dc, dc), a1.val[1]));
glmm_store(dest[0], x0);
}
#endif
#endif /* cglm_mat2_neon_h */

View File

@@ -43,32 +43,36 @@ void
glm_mat4_mul_neon(mat4 m1, mat4 m2, mat4 dest) { glm_mat4_mul_neon(mat4 m1, mat4 m2, mat4 dest) {
/* D = R * L (Column-Major) */ /* D = R * L (Column-Major) */
glmm_128 l0, l1, l2, l3, r0, r1, r2, r3, v0, v1, v2, v3; glmm_128 l, r0, r1, r2, r3, v0, v1, v2, v3;
l0 = glmm_load(m1[0]); r0 = glmm_load(m2[0]); l = glmm_load(m1[0]);
l1 = glmm_load(m1[1]); r1 = glmm_load(m2[1]); r0 = glmm_load(m2[0]);
l2 = glmm_load(m1[2]); r2 = glmm_load(m2[2]); r1 = glmm_load(m2[1]);
l3 = glmm_load(m1[3]); r3 = glmm_load(m2[3]); r2 = glmm_load(m2[2]);
r3 = glmm_load(m2[3]);
v0 = vmulq_f32(glmm_splat_x(r0), l0); v0 = vmulq_f32(glmm_splat_x(r0), l);
v1 = vmulq_f32(glmm_splat_x(r1), l0); v1 = vmulq_f32(glmm_splat_x(r1), l);
v2 = vmulq_f32(glmm_splat_x(r2), l0); v2 = vmulq_f32(glmm_splat_x(r2), l);
v3 = vmulq_f32(glmm_splat_x(r3), l0); v3 = vmulq_f32(glmm_splat_x(r3), l);
v0 = glmm_fmadd(glmm_splat_y(r0), l1, v0); l = glmm_load(m1[1]);
v1 = glmm_fmadd(glmm_splat_y(r1), l1, v1); v0 = glmm_fmadd(glmm_splat_y(r0), l, v0);
v2 = glmm_fmadd(glmm_splat_y(r2), l1, v2); v1 = glmm_fmadd(glmm_splat_y(r1), l, v1);
v3 = glmm_fmadd(glmm_splat_y(r3), l1, v3); v2 = glmm_fmadd(glmm_splat_y(r2), l, v2);
v3 = glmm_fmadd(glmm_splat_y(r3), l, v3);
v0 = glmm_fmadd(glmm_splat_z(r0), l2, v0); l = glmm_load(m1[2]);
v1 = glmm_fmadd(glmm_splat_z(r1), l2, v1); v0 = glmm_fmadd(glmm_splat_z(r0), l, v0);
v2 = glmm_fmadd(glmm_splat_z(r2), l2, v2); v1 = glmm_fmadd(glmm_splat_z(r1), l, v1);
v3 = glmm_fmadd(glmm_splat_z(r3), l2, v3); v2 = glmm_fmadd(glmm_splat_z(r2), l, v2);
v3 = glmm_fmadd(glmm_splat_z(r3), l, v3);
v0 = glmm_fmadd(glmm_splat_w(r0), l3, v0); l = glmm_load(m1[3]);
v1 = glmm_fmadd(glmm_splat_w(r1), l3, v1); v0 = glmm_fmadd(glmm_splat_w(r0), l, v0);
v2 = glmm_fmadd(glmm_splat_w(r2), l3, v2); v1 = glmm_fmadd(glmm_splat_w(r1), l, v1);
v3 = glmm_fmadd(glmm_splat_w(r3), l3, v3); v2 = glmm_fmadd(glmm_splat_w(r2), l, v2);
v3 = glmm_fmadd(glmm_splat_w(r3), l, v3);
glmm_store(dest[0], v0); glmm_store(dest[0], v0);
glmm_store(dest[1], v1); glmm_store(dest[1], v1);

View File

@@ -0,0 +1,56 @@
/*
* Copyright (c), Recep Aslantas.
*
* MIT License (MIT), http://opensource.org/licenses/MIT
* Full license can be found in the LICENSE file
*/
#ifndef cglm_quat_neon_h
#define cglm_quat_neon_h
#if defined(__ARM_NEON_FP)
#include "../../common.h"
#include "../intrin.h"
CGLM_INLINE
void
glm_quat_mul_neon(versor p, versor q, versor dest) {
/*
+ (a1 b2 + b1 a2 + c1 d2 - d1 c2)i
+ (a1 c2 - b1 d2 + c1 a2 + d1 b2)j
+ (a1 d2 + b1 c2 - c1 b2 + d1 a2)k
  a1 a2 - b1 b2 - c1 c2 - d1 d2
*/
glmm_128 xp, xq, xqr, r, x, y, z, s2, s3;
glmm_128 s1 = {-0.f, 0.f, 0.f, -0.f};
float32x2_t qh, ql;
xp = glmm_load(p); /* 3 2 1 0 */
xq = glmm_load(q);
r = vmulq_f32(glmm_splat_w(xp), xq);
x = glmm_splat_x(xp);
y = glmm_splat_y(xp);
z = glmm_splat_z(xp);
ql = vget_high_f32(s1);
s3 = vcombine_f32(ql, ql);
s2 = vzipq_f32(s3, s3).val[0];
xqr = vrev64q_f32(xq);
qh = vget_high_f32(xqr);
ql = vget_low_f32(xqr);
r = glmm_fmadd(glmm_xor(x, s3), vcombine_f32(qh, ql), r);
r = glmm_fmadd(glmm_xor(y, s2), vcombine_f32(vget_high_f32(xq),
vget_low_f32(xq)), r);
r = glmm_fmadd(glmm_xor(z, s1), vcombine_f32(ql, qh), r);
glmm_store(dest, r);
}
#endif
#endif /* cglm_quat_neon_h */
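The s1/s2/s3 masks implement the sign pattern of the Hamilton product above by XORing selected lanes with -0.0f rather than mixing adds and subtracts. A scalar model of that trick (a sketch; `flip_sign` is a hypothetical helper, not part of cglm):

```C
#include <stdint.h>
#include <string.h>

/* XOR with -0.0f flips only the sign bit, so a per-lane mask such as
   {-0.f, 0.f, 0.f, -0.f} negates lanes 0 and 3 of a vector in one op. */
static inline float flip_sign(float a) {
  uint32_t bits;
  memcpy(&bits, &a, sizeof bits);
  bits ^= 0x80000000u;           /* same bit pattern as XOR with -0.0f */
  memcpy(&a, &bits, sizeof a);
  return a;                      /* flip_sign(1.5f) == -1.5f */
}
```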

View File

@@ -16,76 +16,76 @@ CGLM_INLINE
void void
glm_mul_sse2(mat4 m1, mat4 m2, mat4 dest) { glm_mul_sse2(mat4 m1, mat4 m2, mat4 dest) {
/* D = R * L (Column-Major) */ /* D = R * L (Column-Major) */
__m128 l0, l1, l2, l3, r; glmm_128 l, r0, r1, r2, r3, v0, v1, v2, v3;
l0 = glmm_load(m1[0]); l = glmm_load(m1[0]);
l1 = glmm_load(m1[1]); r0 = glmm_load(m2[0]);
l2 = glmm_load(m1[2]); r1 = glmm_load(m2[1]);
l3 = glmm_load(m1[3]); r2 = glmm_load(m2[2]);
r3 = glmm_load(m2[3]);
r = glmm_load(m2[0]);
glmm_store(dest[0],
glmm_fmadd(glmm_splat(r, 0), l0,
glmm_fmadd(glmm_splat(r, 1), l1,
_mm_mul_ps(glmm_splat(r, 2), l2))));
r = glmm_load(m2[1]);
glmm_store(dest[1],
glmm_fmadd(glmm_splat(r, 0), l0,
glmm_fmadd(glmm_splat(r, 1), l1,
_mm_mul_ps(glmm_splat(r, 2), l2))));
r = glmm_load(m2[2]); v0 = _mm_mul_ps(glmm_splat_x(r0), l);
glmm_store(dest[2], v1 = _mm_mul_ps(glmm_splat_x(r1), l);
glmm_fmadd(glmm_splat(r, 0), l0, v2 = _mm_mul_ps(glmm_splat_x(r2), l);
glmm_fmadd(glmm_splat(r, 1), l1, v3 = _mm_mul_ps(glmm_splat_x(r3), l);
_mm_mul_ps(glmm_splat(r, 2), l2))));
r = glmm_load(m2[3]); l = glmm_load(m1[1]);
glmm_store(dest[3], v0 = glmm_fmadd(glmm_splat_y(r0), l, v0);
glmm_fmadd(glmm_splat(r, 0), l0, v1 = glmm_fmadd(glmm_splat_y(r1), l, v1);
glmm_fmadd(glmm_splat(r, 1), l1, v2 = glmm_fmadd(glmm_splat_y(r2), l, v2);
glmm_fmadd(glmm_splat(r, 2), l2, v3 = glmm_fmadd(glmm_splat_y(r3), l, v3);
_mm_mul_ps(glmm_splat(r, 3), l3)))));
l = glmm_load(m1[2]);
v0 = glmm_fmadd(glmm_splat_z(r0), l, v0);
v1 = glmm_fmadd(glmm_splat_z(r1), l, v1);
v2 = glmm_fmadd(glmm_splat_z(r2), l, v2);
v3 = glmm_fmadd(glmm_splat_z(r3), l, v3);
l = glmm_load(m1[3]);
v3 = glmm_fmadd(glmm_splat_w(r3), l, v3);
glmm_store(dest[0], v0);
glmm_store(dest[1], v1);
glmm_store(dest[2], v2);
glmm_store(dest[3], v3);
} }
CGLM_INLINE CGLM_INLINE
void void
glm_mul_rot_sse2(mat4 m1, mat4 m2, mat4 dest) { glm_mul_rot_sse2(mat4 m1, mat4 m2, mat4 dest) {
/* D = R * L (Column-Major) */ /* D = R * L (Column-Major) */
__m128 l0, l1, l2, l3, r;
l0 = glmm_load(m1[0]); glmm_128 l, r0, r1, r2, v0, v1, v2;
l1 = glmm_load(m1[1]);
l2 = glmm_load(m1[2]);
l3 = glmm_load(m1[3]);
r = glmm_load(m2[0]); l = glmm_load(m1[0]);
glmm_store(dest[0], r0 = glmm_load(m2[0]);
glmm_fmadd(glmm_splat(r, 0), l0, r1 = glmm_load(m2[1]);
glmm_fmadd(glmm_splat(r, 1), l1, r2 = glmm_load(m2[2]);
_mm_mul_ps(glmm_splat(r, 2), l2))));
r = glmm_load(m2[1]);
glmm_store(dest[1],
glmm_fmadd(glmm_splat(r, 0), l0,
glmm_fmadd(glmm_splat(r, 1), l1,
_mm_mul_ps(glmm_splat(r, 2), l2))));
r = glmm_load(m2[2]);
glmm_store(dest[2],
glmm_fmadd(glmm_splat(r, 0), l0,
glmm_fmadd(glmm_splat(r, 1), l1,
_mm_mul_ps(glmm_splat(r, 2), l2))));
glmm_store(dest[3], l3); v0 = _mm_mul_ps(glmm_splat_x(r0), l);
v1 = _mm_mul_ps(glmm_splat_x(r1), l);
v2 = _mm_mul_ps(glmm_splat_x(r2), l);
l = glmm_load(m1[1]);
v0 = glmm_fmadd(glmm_splat_y(r0), l, v0);
v1 = glmm_fmadd(glmm_splat_y(r1), l, v1);
v2 = glmm_fmadd(glmm_splat_y(r2), l, v2);
l = glmm_load(m1[2]);
v0 = glmm_fmadd(glmm_splat_z(r0), l, v0);
v1 = glmm_fmadd(glmm_splat_z(r1), l, v1);
v2 = glmm_fmadd(glmm_splat_z(r2), l, v2);
glmm_store(dest[0], v0);
glmm_store(dest[1], v1);
glmm_store(dest[2], v2);
glmm_store(dest[3], glmm_load(m1[3]));
} }
CGLM_INLINE CGLM_INLINE
void void
glm_inv_tr_sse2(mat4 mat) { glm_inv_tr_sse2(mat4 mat) {
__m128 r0, r1, r2, r3, x0, x1; __m128 r0, r1, r2, r3, x0, x1, x2, x3, x4, x5;
r0 = glmm_load(mat[0]); r0 = glmm_load(mat[0]);
r1 = glmm_load(mat[1]); r1 = glmm_load(mat[1]);
@@ -95,10 +95,13 @@ glm_inv_tr_sse2(mat4 mat) {
_MM_TRANSPOSE4_PS(r0, r1, r2, x1); _MM_TRANSPOSE4_PS(r0, r1, r2, x1);
x0 = glmm_fmadd(r0, glmm_shuff1(r3, 0, 0, 0, 0), x2 = glmm_shuff1(r3, 0, 0, 0, 0);
glmm_fmadd(r1, glmm_shuff1(r3, 1, 1, 1, 1), x3 = glmm_shuff1(r3, 1, 1, 1, 1);
_mm_mul_ps(r2, glmm_shuff1(r3, 2, 2, 2, 2)))); x4 = glmm_shuff1(r3, 2, 2, 2, 2);
x0 = _mm_xor_ps(x0, _mm_set1_ps(-0.f)); x5 = _mm_set1_ps(-0.f);
x0 = glmm_fmadd(r0, x2, glmm_fmadd(r1, x3, _mm_mul_ps(r2, x4)));
x0 = _mm_xor_ps(x0, x5);
x0 = _mm_add_ps(x0, x1); x0 = _mm_add_ps(x0, x1);

View File

@@ -15,20 +15,23 @@
CGLM_INLINE CGLM_INLINE
void void
glm_mat2_mul_sse2(mat2 m1, mat2 m2, mat2 dest) { glm_mat2_mul_sse2(mat2 m1, mat2 m2, mat2 dest) {
__m128 x0, x1, x2; __m128 x0, x1, x2, x3, x4;
x1 = glmm_load(m1[0]); /* d c b a */ x1 = glmm_load(m1[0]); /* d c b a */
x2 = glmm_load(m2[0]); /* h g f e */ x2 = glmm_load(m2[0]); /* h g f e */
x3 = glmm_shuff1(x2, 2, 2, 0, 0);
x4 = glmm_shuff1(x2, 3, 3, 1, 1);
x0 = _mm_movelh_ps(x1, x1);
x2 = _mm_movehl_ps(x1, x1);
/* /*
dest[0][0] = a * e + c * f; dest[0][0] = a * e + c * f;
dest[0][1] = b * e + d * f; dest[0][1] = b * e + d * f;
dest[1][0] = a * g + c * h; dest[1][0] = a * g + c * h;
dest[1][1] = b * g + d * h; dest[1][1] = b * g + d * h;
*/ */
x0 = glmm_fmadd(_mm_movelh_ps(x1, x1), glmm_shuff1(x2, 2, 2, 0, 0), x0 = glmm_fmadd(x0, x3, _mm_mul_ps(x2, x4));
_mm_mul_ps(_mm_movehl_ps(x1, x1),
glmm_shuff1(x2, 3, 3, 1, 1)));
glmm_store(dest[0], x0); glmm_store(dest[0], x0);
} }

View File

@@ -15,37 +15,61 @@
CGLM_INLINE CGLM_INLINE
void void
glm_mat3_mul_sse2(mat3 m1, mat3 m2, mat3 dest) { glm_mat3_mul_sse2(mat3 m1, mat3 m2, mat3 dest) {
__m128 l0, l1, l2; __m128 l0, l1, l2, r0, r1, r2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9;
__m128 r0, r1, r2;
__m128 x0, x1, x2;
l0 = _mm_loadu_ps(m1[0]); l0 = _mm_loadu_ps(m1[0]);
l1 = _mm_loadu_ps(&m1[1][1]); l1 = _mm_loadu_ps(&m1[1][1]);
l2 = _mm_set1_ps(m1[2][2]);
r0 = _mm_loadu_ps(m2[0]); r0 = _mm_loadu_ps(m2[0]);
r1 = _mm_loadu_ps(&m2[1][1]); r1 = _mm_loadu_ps(&m2[1][1]);
r2 = _mm_set1_ps(m2[2][2]);
x1 = glmm_shuff2(l0, l1, 1, 0, 3, 3, 0, 3, 2, 0); x8 = glmm_shuff1(l0, 0, 2, 1, 0); /* a00 a02 a01 a00 */
x2 = glmm_shuff2(l1, l2, 0, 0, 3, 2, 0, 2, 1, 0); x1 = glmm_shuff1(r0, 3, 0, 0, 0); /* b10 b00 b00 b00 */
x2 = _mm_shuffle_ps(l0, l1, _MM_SHUFFLE(1, 0, 3, 3)); /* a12 a11 a10 a10 */
x3 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(2, 0, 3, 1)); /* b20 b11 b10 b01 */
x0 = _mm_mul_ps(x8, x1);
x0 = glmm_fmadd(glmm_shuff1(l0, 0, 2, 1, 0), glmm_shuff1(r0, 3, 0, 0, 0), x6 = glmm_shuff1(l0, 1, 0, 2, 1); /* a01 a00 a02 a01 */
glmm_fmadd(x1, glmm_shuff2(r0, r1, 0, 0, 1, 1, 2, 0, 0, 0), x7 = glmm_shuff1(x3, 3, 3, 1, 1); /* b20 b20 b10 b10 */
_mm_mul_ps(x2, glmm_shuff2(r0, r1, 1, 1, 2, 2, 2, 0, 0, 0)))); l2 = _mm_load_ss(&m1[2][2]);
r2 = _mm_load_ss(&m2[2][2]);
x1 = _mm_mul_ps(x6, x7);
l2 = glmm_shuff1(l2, 0, 0, 1, 0); /* a22 a22 0.f a22 */
r2 = glmm_shuff1(r2, 0, 0, 1, 0); /* b22 b22 0.f b22 */
_mm_storeu_ps(dest[0], x0); x4 = glmm_shuff1(x2, 0, 3, 2, 0); /* a10 a12 a11 a10 */
x5 = glmm_shuff1(x2, 2, 0, 3, 2); /* a11 a10 a12 a11 */
x6 = glmm_shuff1(x3, 2, 0, 0, 0); /* b11 b01 b01 b01 */
x2 = glmm_shuff1(r1, 3, 3, 0, 0); /* b21 b21 b11 b11 */
x0 = glmm_fmadd(glmm_shuff1(l0, 1, 0, 2, 1), _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(2, 2, 3, 3)), x8 = _mm_unpackhi_ps(x8, x4); /* a10 a00 a12 a02 */
glmm_fmadd(glmm_shuff1(x1, 1, 0, 2, 1), glmm_shuff1(r1, 3, 3, 0, 0), x9 = _mm_unpackhi_ps(x7, x2); /* b21 b20 b21 b20 */
_mm_mul_ps(glmm_shuff1(x2, 1, 0, 2, 1),
_mm_shuffle_ps(r1, r2, _MM_SHUFFLE(0, 0, 1, 1))))); x0 = glmm_fmadd(x4, x6, x0);
x1 = glmm_fmadd(x5, x2, x1);
x2 = _mm_movehl_ps(l2, l1); /* a22 a22 a21 a20 */
x3 = glmm_shuff1(x2, 0, 2, 1, 0); /* a20 a22 a21 a20 */
x2 = glmm_shuff1(x2, 1, 0, 2, 1); /* a21 a20 a22 a21 */
x4 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 1, 2, 2)); /* b12 b12 b02 b02 */
_mm_storeu_ps(&dest[1][1], x0); x5 = glmm_shuff1(x4, 3, 0, 0, 0); /* b12 b02 b02 b02 */
x4 = _mm_movehl_ps(r2, x4); /* b22 b22 b12 b12 */
x0 = glmm_fmadd(x3, x5, x0);
x1 = glmm_fmadd(x2, x4, x1);
dest[2][2] = m1[0][2] * m2[2][0] /*
+ m1[1][2] * m2[2][1] Dot Product : dest[2][2] = a02 * b20 +
+ m1[2][2] * m2[2][2]; a12 * b21 +
a22 * b22 +
0 * 00 */
x2 = _mm_movelh_ps(x8, l2); /* 0.f a22 a12 a02 */
x3 = _mm_movelh_ps(x9, r2); /* 0.f b22 b21 b20 */
x2 = glmm_vdots(x2, x3);
_mm_storeu_ps(&dest[0][0], x0);
_mm_storeu_ps(&dest[1][1], x1);
_mm_store_ss (&dest[2][2], x2);
} }
#endif #endif

View File

@@ -49,42 +49,64 @@ void
glm_mat4_mul_sse2(mat4 m1, mat4 m2, mat4 dest) { glm_mat4_mul_sse2(mat4 m1, mat4 m2, mat4 dest) {
/* D = R * L (Column-Major) */ /* D = R * L (Column-Major) */
__m128 l0, l1, l2, l3, r; glmm_128 l, r0, r1, r2, r3, v0, v1, v2, v3;
l0 = glmm_load(m1[0]); l = glmm_load(m1[0]);
l1 = glmm_load(m1[1]); r0 = glmm_load(m2[0]);
l2 = glmm_load(m1[2]); r1 = glmm_load(m2[1]);
l3 = glmm_load(m1[3]); r2 = glmm_load(m2[2]);
r3 = glmm_load(m2[3]);
#define XX(C) \
\
r = glmm_load(m2[C]); \
glmm_store(dest[C], \
glmm_fmadd(glmm_splat(r, 0), l0, \
glmm_fmadd(glmm_splat(r, 1), l1, \
glmm_fmadd(glmm_splat(r, 2), l2, \
_mm_mul_ps(glmm_splat(r, 3), l3)))));
XX(0); v0 = _mm_mul_ps(glmm_splat_x(r0), l);
XX(1); v1 = _mm_mul_ps(glmm_splat_x(r1), l);
XX(2); v2 = _mm_mul_ps(glmm_splat_x(r2), l);
XX(3); v3 = _mm_mul_ps(glmm_splat_x(r3), l);
#undef XX l = glmm_load(m1[1]);
v0 = glmm_fmadd(glmm_splat_y(r0), l, v0);
v1 = glmm_fmadd(glmm_splat_y(r1), l, v1);
v2 = glmm_fmadd(glmm_splat_y(r2), l, v2);
v3 = glmm_fmadd(glmm_splat_y(r3), l, v3);
l = glmm_load(m1[2]);
v0 = glmm_fmadd(glmm_splat_z(r0), l, v0);
v1 = glmm_fmadd(glmm_splat_z(r1), l, v1);
v2 = glmm_fmadd(glmm_splat_z(r2), l, v2);
v3 = glmm_fmadd(glmm_splat_z(r3), l, v3);
l = glmm_load(m1[3]);
v0 = glmm_fmadd(glmm_splat_w(r0), l, v0);
v1 = glmm_fmadd(glmm_splat_w(r1), l, v1);
v2 = glmm_fmadd(glmm_splat_w(r2), l, v2);
v3 = glmm_fmadd(glmm_splat_w(r3), l, v3);
glmm_store(dest[0], v0);
glmm_store(dest[1], v1);
glmm_store(dest[2], v2);
glmm_store(dest[3], v3);
} }
CGLM_INLINE CGLM_INLINE
void void
glm_mat4_mulv_sse2(mat4 m, vec4 v, vec4 dest) { glm_mat4_mulv_sse2(mat4 m, vec4 v, vec4 dest) {
__m128 x0, x1; __m128 x0, x1, m0, m1, m2, m3, v0, v1, v2, v3;
m0 = glmm_load(m[0]);
m1 = glmm_load(m[1]);
m2 = glmm_load(m[2]);
m3 = glmm_load(m[3]);
x0 = glmm_load(v); x0 = glmm_load(v);
x1 = glmm_fmadd(glmm_load(m[0]), glmm_splat(x0, 0), v0 = glmm_splat_x(x0);
glmm_fmadd(glmm_load(m[1]), glmm_splat(x0, 1), v1 = glmm_splat_y(x0);
glmm_fmadd(glmm_load(m[2]), glmm_splat(x0, 2), v2 = glmm_splat_z(x0);
_mm_mul_ps(glmm_load(m[3]), v3 = glmm_splat_w(x0);
glmm_splat(x0, 3)))));
x1 = _mm_mul_ps(m3, v3);
x1 = glmm_fmadd(m2, v2, x1);
x1 = glmm_fmadd(m1, v1, x1);
x1 = glmm_fmadd(m0, v0, x1);
glmm_store(dest, x1); glmm_store(dest, x1);
} }
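The rewritten glm_mat4_mulv_sse2 splats each component of v and accumulates scaled columns. For reference, the column-major scalar form it implements (a sketch; `mat4_mulv_ref` is a hypothetical helper and dest must not alias v):

```C
#include <cglm/cglm.h>

/* dest = m * v with column-major storage: m[c][r] is column c, row r,
   so the product is a weighted sum of the matrix columns. */
static inline void mat4_mulv_ref(mat4 m, vec4 v, vec4 dest) {
  for (int r = 0; r < 4; r++)
    dest[r] = m[0][r]*v[0] + m[1][r]*v[1] + m[2][r]*v[2] + m[3][r]*v[3];
}
```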
@@ -143,97 +165,121 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
v0, v1, v2, v3, v0, v1, v2, v3,
t0, t1, t2, t3, t4, t5, t0, t1, t2, t3, t4, t5,
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9; x0, x1, x2, x3, x4, x5, x6, x7, x8, x9;
x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f);
x9 = glmm_shuff1(x8, 2, 1, 2, 1); x9 = glmm_shuff1(x8, 2, 1, 2, 1);
/* 127 <- 0 */ /* 127 <- 0 */
r0 = glmm_load(mat[0]); /* d c b a */ r0 = glmm_load(mat[0]); /* d c b a */
r1 = glmm_load(mat[1]); /* h g f e */ r1 = glmm_load(mat[1]); /* h g f e */
r2 = glmm_load(mat[2]); /* l k j i */ r2 = glmm_load(mat[2]); /* l k j i */
r3 = glmm_load(mat[3]); /* p o n m */ r3 = glmm_load(mat[3]); /* p o n m */
x0 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2)); /* p o l k */ x0 = _mm_movehl_ps(r3, r2); /* p o l k */
x1 = glmm_shuff1(x0, 1, 3, 3, 3); /* l p p p */ x3 = _mm_movelh_ps(r2, r3); /* n m j i */
x1 = glmm_shuff1(x0, 1, 3, 3 ,3); /* l p p p */
x2 = glmm_shuff1(x0, 0, 2, 2, 2); /* k o o o */ x2 = glmm_shuff1(x0, 0, 2, 2, 2); /* k o o o */
x0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(3, 3, 3, 3)); /* h h l l */ x4 = glmm_shuff1(x3, 1, 3, 3, 3); /* j n n n */
x3 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(2, 2, 2, 2)); /* g g k k */ x7 = glmm_shuff1(x3, 0, 2, 2, 2); /* i m m m */
x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */
x5 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(1, 1, 1, 1)); /* f f j j */
x3 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(2, 2, 2, 2)); /* g g k k */
x0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(3, 3, 3, 3)); /* h h l l */
t0 = _mm_mul_ps(x3, x1);
t1 = _mm_mul_ps(x5, x1);
t2 = _mm_mul_ps(x5, x2);
t3 = _mm_mul_ps(x6, x1);
t4 = _mm_mul_ps(x6, x2);
t5 = _mm_mul_ps(x6, x4);
/* t1[0] = k * p - o * l; /* t1[0] = k * p - o * l;
t1[0] = k * p - o * l; t1[0] = k * p - o * l;
t2[0] = g * p - o * h; t2[0] = g * p - o * h;
t3[0] = g * l - k * h; */ t3[0] = g * l - k * h; */
t0 = glmm_fnmadd(x2, x0, _mm_mul_ps(x3, x1)); t0 = glmm_fnmadd(x2, x0, t0);
x4 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(2, 1, 2, 1)); /* o n k j */
x4 = glmm_shuff1(x4, 0, 2, 2, 2); /* j n n n */
x5 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(1, 1, 1, 1)); /* f f j j */
/* t1[1] = j * p - n * l; /* t1[1] = j * p - n * l;
t1[1] = j * p - n * l; t1[1] = j * p - n * l;
t2[1] = f * p - n * h; t2[1] = f * p - n * h;
t3[1] = f * l - j * h; */ t3[1] = f * l - j * h; */
t1 = glmm_fnmadd(x4, x0, _mm_mul_ps(x5, x1)); t1 = glmm_fnmadd(x4, x0, t1);
/* t1[2] = j * o - n * k /* t1[2] = j * o - n * k
t1[2] = j * o - n * k; t1[2] = j * o - n * k;
t2[2] = f * o - n * g; t2[2] = f * o - n * g;
t3[2] = f * k - j * g; */ t3[2] = f * k - j * g; */
t2 = glmm_fnmadd(x4, x3, _mm_mul_ps(x5, x2)); t2 = glmm_fnmadd(x4, x3, t2);
x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */
x7 = glmm_shuff2(r3, r2, 0, 0, 0, 0, 2, 0, 0, 0); /* i m m m */
/* t1[3] = i * p - m * l; /* t1[3] = i * p - m * l;
t1[3] = i * p - m * l; t1[3] = i * p - m * l;
t2[3] = e * p - m * h; t2[3] = e * p - m * h;
t3[3] = e * l - i * h; */ t3[3] = e * l - i * h; */
t3 = glmm_fnmadd(x7, x0, _mm_mul_ps(x6, x1)); t3 = glmm_fnmadd(x7, x0, t3);
/* t1[4] = i * o - m * k; /* t1[4] = i * o - m * k;
t1[4] = i * o - m * k; t1[4] = i * o - m * k;
t2[4] = e * o - m * g; t2[4] = e * o - m * g;
t3[4] = e * k - i * g; */ t3[4] = e * k - i * g; */
t4 = glmm_fnmadd(x7, x3, _mm_mul_ps(x6, x2)); t4 = glmm_fnmadd(x7, x3, t4);
/* t1[5] = i * n - m * j; /* t1[5] = i * n - m * j;
t1[5] = i * n - m * j; t1[5] = i * n - m * j;
t2[5] = e * n - m * f; t2[5] = e * n - m * f;
t3[5] = e * j - i * f; */ t3[5] = e * j - i * f; */
t5 = glmm_fnmadd(x7, x5, _mm_mul_ps(x6, x4)); t5 = glmm_fnmadd(x7, x5, t5);
x0 = glmm_shuff2(r1, r0, 0, 0, 0, 0, 2, 2, 2, 0); /* a a a e */ x4 = _mm_movelh_ps(r0, r1); /* f e b a */
x1 = glmm_shuff2(r1, r0, 1, 1, 1, 1, 2, 2, 2, 0); /* b b b f */ x5 = _mm_movehl_ps(r1, r0); /* h g d c */
x2 = glmm_shuff2(r1, r0, 2, 2, 2, 2, 2, 2, 2, 0); /* c c c g */
x3 = glmm_shuff2(r1, r0, 3, 3, 3, 3, 2, 2, 2, 0); /* d d d h */ x0 = glmm_shuff1(x4, 0, 0, 0, 2); /* a a a e */
x1 = glmm_shuff1(x4, 1, 1, 1, 3); /* b b b f */
x2 = glmm_shuff1(x5, 0, 0, 0, 2); /* c c c g */
x3 = glmm_shuff1(x5, 1, 1, 1, 3); /* d d d h */
v2 = _mm_mul_ps(x0, t1);
v1 = _mm_mul_ps(x0, t0);
v3 = _mm_mul_ps(x0, t2);
v0 = _mm_mul_ps(x1, t0);
v2 = glmm_fnmadd(x1, t3, v2);
v3 = glmm_fnmadd(x1, t4, v3);
v0 = glmm_fnmadd(x2, t1, v0);
v1 = glmm_fnmadd(x2, t3, v1);
v3 = glmm_fmadd(x2, t5, v3);
v0 = glmm_fmadd(x3, t2, v0);
v2 = glmm_fmadd(x3, t5, v2);
v1 = glmm_fmadd(x3, t4, v1);
/* /*
dest[0][0] = f * t1[0] - g * t1[1] + h * t1[2]; dest[0][0] = f * t1[0] - g * t1[1] + h * t1[2];
dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]); dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]);
dest[0][2] = b * t2[0] - c * t2[1] + d * t2[2]; dest[0][2] = b * t2[0] - c * t2[1] + d * t2[2];
dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */ dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */
v0 = _mm_xor_ps(glmm_fmadd(x3, t2, glmm_fnmadd(x2, t1, _mm_mul_ps(x1, t0))), x8); v0 = _mm_xor_ps(v0, x8);
/* /*
dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5]; dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5];
dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]); dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]);
dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5]; dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5];
dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/ dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/
v2 = _mm_xor_ps(glmm_fmadd(x3, t5, glmm_fnmadd(x1, t3, _mm_mul_ps(x0, t1))), x8); v2 = _mm_xor_ps(v2, x8);
/* /*
dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]); dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]);
dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4]; dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4];
dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]); dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]);
dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */ dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */
v1 = _mm_xor_ps(glmm_fmadd(x3, t4, glmm_fnmadd(x2, t3, _mm_mul_ps(x0, t0))), x9); v1 = _mm_xor_ps(v1, x9);
/* /*
dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]); dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]);
dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5]; dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5];
dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]); dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]);
dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */ dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */
v3 = _mm_xor_ps(glmm_fmadd(x2, t5, glmm_fnmadd(x1, t4, _mm_mul_ps(x0, t2))), x9); v3 = _mm_xor_ps(v3, x9);
/* determinant */ /* determinant */
x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0)); x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0));
@@ -255,97 +301,121 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
v0, v1, v2, v3, v0, v1, v2, v3,
t0, t1, t2, t3, t4, t5, t0, t1, t2, t3, t4, t5,
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9; x0, x1, x2, x3, x4, x5, x6, x7, x8, x9;
x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f);
x9 = glmm_shuff1(x8, 2, 1, 2, 1); x9 = glmm_shuff1(x8, 2, 1, 2, 1);
/* 127 <- 0 */ /* 127 <- 0 */
r0 = glmm_load(mat[0]); /* d c b a */ r0 = glmm_load(mat[0]); /* d c b a */
r1 = glmm_load(mat[1]); /* h g f e */ r1 = glmm_load(mat[1]); /* h g f e */
r2 = glmm_load(mat[2]); /* l k j i */ r2 = glmm_load(mat[2]); /* l k j i */
r3 = glmm_load(mat[3]); /* p o n m */ r3 = glmm_load(mat[3]); /* p o n m */
x0 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2)); /* p o l k */ x0 = _mm_movehl_ps(r3, r2); /* p o l k */
x1 = glmm_shuff1(x0, 1, 3, 3, 3); /* l p p p */ x3 = _mm_movelh_ps(r2, r3); /* n m j i */
x1 = glmm_shuff1(x0, 1, 3, 3 ,3); /* l p p p */
x2 = glmm_shuff1(x0, 0, 2, 2, 2); /* k o o o */ x2 = glmm_shuff1(x0, 0, 2, 2, 2); /* k o o o */
x0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(3, 3, 3, 3)); /* h h l l */ x4 = glmm_shuff1(x3, 1, 3, 3, 3); /* j n n n */
x3 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(2, 2, 2, 2)); /* g g k k */ x7 = glmm_shuff1(x3, 0, 2, 2, 2); /* i m m m */
x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */
x5 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(1, 1, 1, 1)); /* f f j j */
x3 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(2, 2, 2, 2)); /* g g k k */
x0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(3, 3, 3, 3)); /* h h l l */
t0 = _mm_mul_ps(x3, x1);
t1 = _mm_mul_ps(x5, x1);
t2 = _mm_mul_ps(x5, x2);
t3 = _mm_mul_ps(x6, x1);
t4 = _mm_mul_ps(x6, x2);
t5 = _mm_mul_ps(x6, x4);
/* t1[0] = k * p - o * l; /* t1[0] = k * p - o * l;
t1[0] = k * p - o * l; t1[0] = k * p - o * l;
t2[0] = g * p - o * h; t2[0] = g * p - o * h;
t3[0] = g * l - k * h; */ t3[0] = g * l - k * h; */
t0 = glmm_fnmadd(x2, x0, _mm_mul_ps(x3, x1)); t0 = glmm_fnmadd(x2, x0, t0);
x4 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(2, 1, 2, 1)); /* o n k j */
x4 = glmm_shuff1(x4, 0, 2, 2, 2); /* j n n n */
x5 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(1, 1, 1, 1)); /* f f j j */
/* t1[1] = j * p - n * l; /* t1[1] = j * p - n * l;
t1[1] = j * p - n * l; t1[1] = j * p - n * l;
t2[1] = f * p - n * h; t2[1] = f * p - n * h;
t3[1] = f * l - j * h; */ t3[1] = f * l - j * h; */
t1 = glmm_fnmadd(x4, x0, _mm_mul_ps(x5, x1)); t1 = glmm_fnmadd(x4, x0, t1);
/* t1[2] = j * o - n * k /* t1[2] = j * o - n * k
t1[2] = j * o - n * k; t1[2] = j * o - n * k;
t2[2] = f * o - n * g; t2[2] = f * o - n * g;
t3[2] = f * k - j * g; */ t3[2] = f * k - j * g; */
t2 = glmm_fnmadd(x4, x3, _mm_mul_ps(x5, x2)); t2 = glmm_fnmadd(x4, x3, t2);
x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */
x7 = glmm_shuff2(r3, r2, 0, 0, 0, 0, 2, 0, 0, 0); /* i m m m */
/* t1[3] = i * p - m * l; /* t1[3] = i * p - m * l;
t1[3] = i * p - m * l; t1[3] = i * p - m * l;
t2[3] = e * p - m * h; t2[3] = e * p - m * h;
t3[3] = e * l - i * h; */ t3[3] = e * l - i * h; */
t3 = glmm_fnmadd(x7, x0, _mm_mul_ps(x6, x1)); t3 = glmm_fnmadd(x7, x0, t3);
/* t1[4] = i * o - m * k; /* t1[4] = i * o - m * k;
t1[4] = i * o - m * k; t1[4] = i * o - m * k;
t2[4] = e * o - m * g; t2[4] = e * o - m * g;
t3[4] = e * k - i * g; */ t3[4] = e * k - i * g; */
t4 = glmm_fnmadd(x7, x3, _mm_mul_ps(x6, x2)); t4 = glmm_fnmadd(x7, x3, t4);
/* t1[5] = i * n - m * j; /* t1[5] = i * n - m * j;
t1[5] = i * n - m * j; t1[5] = i * n - m * j;
t2[5] = e * n - m * f; t2[5] = e * n - m * f;
t3[5] = e * j - i * f; */ t3[5] = e * j - i * f; */
t5 = glmm_fnmadd(x7, x5, _mm_mul_ps(x6, x4)); t5 = glmm_fnmadd(x7, x5, t5);
x0 = glmm_shuff2(r1, r0, 0, 0, 0, 0, 2, 2, 2, 0); /* a a a e */ x4 = _mm_movelh_ps(r0, r1); /* f e b a */
x1 = glmm_shuff2(r1, r0, 1, 1, 1, 1, 2, 2, 2, 0); /* b b b f */ x5 = _mm_movehl_ps(r1, r0); /* h g d c */
x2 = glmm_shuff2(r1, r0, 2, 2, 2, 2, 2, 2, 2, 0); /* c c c g */
x3 = glmm_shuff2(r1, r0, 3, 3, 3, 3, 2, 2, 2, 0); /* d d d h */ x0 = glmm_shuff1(x4, 0, 0, 0, 2); /* a a a e */
x1 = glmm_shuff1(x4, 1, 1, 1, 3); /* b b b f */
x2 = glmm_shuff1(x5, 0, 0, 0, 2); /* c c c g */
x3 = glmm_shuff1(x5, 1, 1, 1, 3); /* d d d h */
v2 = _mm_mul_ps(x0, t1);
v1 = _mm_mul_ps(x0, t0);
v3 = _mm_mul_ps(x0, t2);
v0 = _mm_mul_ps(x1, t0);
v2 = glmm_fnmadd(x1, t3, v2);
v3 = glmm_fnmadd(x1, t4, v3);
v0 = glmm_fnmadd(x2, t1, v0);
v1 = glmm_fnmadd(x2, t3, v1);
v3 = glmm_fmadd(x2, t5, v3);
v0 = glmm_fmadd(x3, t2, v0);
v2 = glmm_fmadd(x3, t5, v2);
v1 = glmm_fmadd(x3, t4, v1);
/* /*
dest[0][0] = f * t1[0] - g * t1[1] + h * t1[2]; dest[0][0] = f * t1[0] - g * t1[1] + h * t1[2];
dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]); dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]);
dest[0][2] = b * t2[0] - c * t2[1] + d * t2[2]; dest[0][2] = b * t2[0] - c * t2[1] + d * t2[2];
dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */ dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */
v0 = _mm_xor_ps(glmm_fmadd(x3, t2, glmm_fnmadd(x2, t1, _mm_mul_ps(x1, t0))), x8); v0 = _mm_xor_ps(v0, x8);
/* /*
dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5]; dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5];
dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]); dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]);
dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5]; dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5];
dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/ dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/
v2 = _mm_xor_ps(glmm_fmadd(x3, t5, glmm_fnmadd(x1, t3, _mm_mul_ps(x0, t1))), x8); v2 = _mm_xor_ps(v2, x8);
/* /*
dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]); dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]);
dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4]; dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4];
dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]); dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]);
dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */ dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */
v1 = _mm_xor_ps(glmm_fmadd(x3, t4, glmm_fnmadd(x2, t3, _mm_mul_ps(x0, t0))), x9); v1 = _mm_xor_ps(v1, x9);
/* /*
dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]); dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]);
dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5]; dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5];
dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]); dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]);
dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */ dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */
v3 = _mm_xor_ps(glmm_fmadd(x2, t5, glmm_fnmadd(x1, t4, _mm_mul_ps(x0, t2))), x9); v3 = _mm_xor_ps(v3, x9);
/* determinant */ /* determinant */
x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0)); x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0));

View File

@@ -22,25 +22,33 @@ glm_quat_mul_sse2(versor p, versor q, versor dest) {
a1 a2 - b1 b2 - c1 c2 - d1 d2 a1 a2 - b1 b2 - c1 c2 - d1 d2
*/ */
__m128 xp, xq, x0, r; __m128 xp, xq, x1, x2, x3, r, x, y, z;
xp = glmm_load(p); /* 3 2 1 0 */ xp = glmm_load(p); /* 3 2 1 0 */
xq = glmm_load(q); xq = glmm_load(q);
x1 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); /* TODO: _mm_set1_ss() + shuff ? */
r = _mm_mul_ps(glmm_splat_w(xp), xq);
x2 = _mm_unpackhi_ps(x1, x1);
x3 = glmm_shuff1(x1, 3, 2, 0, 1);
x = glmm_splat_x(xp);
y = glmm_splat_y(xp);
z = glmm_splat_z(xp);
r = _mm_mul_ps(glmm_splat(xp, 3), xq); x = _mm_xor_ps(x, x1);
y = _mm_xor_ps(y, x2);
x0 = _mm_xor_ps(glmm_splat(xp, 0), _mm_set_ps(-0.f, 0.f, -0.f, 0.f)); z = _mm_xor_ps(z, x3);
r = _mm_add_ps(r, _mm_mul_ps(x0, glmm_shuff1(xq, 0, 1, 2, 3)));
x1 = glmm_shuff1(xq, 0, 1, 2, 3);
x0 = _mm_xor_ps(glmm_splat(xp, 1), _mm_set_ps(-0.f, -0.f, 0.f, 0.f)); x2 = glmm_shuff1(xq, 1, 0, 3, 2);
r = _mm_add_ps(r, _mm_mul_ps(x0, glmm_shuff1(xq, 1, 0, 3, 2))); x3 = glmm_shuff1(xq, 2, 3, 0, 1);
x0 = _mm_xor_ps(glmm_splat(xp, 2), _mm_set_ps(-0.f, 0.f, 0.f, -0.f)); r = glmm_fmadd(x, x1, r);
r = _mm_add_ps(r, _mm_mul_ps(x0, glmm_shuff1(xq, 2, 3, 0, 1))); r = glmm_fmadd(y, x2, r);
r = glmm_fmadd(z, x3, r);
glmm_store(dest, r); glmm_store(dest, r);
} }
#endif #endif
#endif /* cglm_quat_simd_h */ #endif /* cglm_quat_simd_h */

View File

@@ -34,6 +34,7 @@
CGLM_INLINE mat3s glms_quat_mat3t(versors q) CGLM_INLINE mat3s glms_quat_mat3t(versors q)
CGLM_INLINE versors glms_quat_lerp(versors from, versors to, float t) CGLM_INLINE versors glms_quat_lerp(versors from, versors to, float t)
CGLM_INLINE versors glms_quat_lerpc(versors from, versors to, float t) CGLM_INLINE versors glms_quat_lerpc(versors from, versors to, float t)
CGLM_INLINE versors glms_quat_nlerp(versors from, versors to, float t)
CGLM_INLINE versors glms_quat_slerp(versors from, versors to, float t) CGLM_INLINE versors glms_quat_slerp(versors from, versors to, float t)
CGLM_INLINE mat4s. glms_quat_look(vec3s eye, versors ori) CGLM_INLINE mat4s. glms_quat_look(vec3s eye, versors ori)
CGLM_INLINE versors glms_quat_for(vec3s dir, vec3s fwd, vec3s up) CGLM_INLINE versors glms_quat_for(vec3s dir, vec3s fwd, vec3s up)
@@ -401,6 +402,24 @@ glms_quat_lerpc(versors from, versors to, float t) {
return dest; return dest;
} }
/*!
* @brief interpolates between two quaternions
* taking the shortest rotation path using
* normalized linear interpolation (NLERP)
*
* @param[in] from from
* @param[in] to to
* @param[in] t interpolant (amount)
* @param[out] dest result quaternion
*/
CGLM_INLINE
versors
glms_quat_nlerp(versors from, versors to, float t) {
versors dest;
glm_quat_nlerp(from.raw, to.raw, t, dest.raw);
return dest;
}
/*! /*!
* @brief interpolates between two quaternions * @brief interpolates between two quaternions
* using spherical linear interpolation (SLERP) * using spherical linear interpolation (SLERP)

View File

@@ -10,6 +10,6 @@
#define CGLM_VERSION_MAJOR 0 #define CGLM_VERSION_MAJOR 0
#define CGLM_VERSION_MINOR 8 #define CGLM_VERSION_MINOR 8
#define CGLM_VERSION_PATCH 2 #define CGLM_VERSION_PATCH 3
#endif /* cglm_version_h */ #endif /* cglm_version_h */

View File

@@ -1,5 +1,5 @@
project('cglm', 'c', project('cglm', 'c',
version : '0.8.2', version : '0.8.3',
license : 'mit', license : 'mit',
default_options : [ default_options : [
'c_std=c11', 'c_std=c11',

View File

@@ -170,6 +170,12 @@ glmc_quat_lerpc(versor from, versor to, float t, versor dest) {
glm_quat_lerpc(from, to, t, dest); glm_quat_lerpc(from, to, t, dest);
} }
CGLM_EXPORT
void
glmc_quat_nlerp(versor from, versor to, float t, versor dest) {
glm_quat_nlerp(from, to, t, dest);
}
CGLM_EXPORT CGLM_EXPORT
void void
glmc_quat_slerp(versor from, versor to, float t, versor dest) { glmc_quat_slerp(versor from, versor to, float t, versor dest) {

View File

@@ -7,6 +7,25 @@
#include "test_common.h" #include "test_common.h"
#ifndef glm_affine_mat_test_guard
#define glm_affine_mat_test_guard
CGLM_INLINE
void
glm_inv_tr_raw(mat4 mat) {
CGLM_ALIGN_MAT mat3 r;
CGLM_ALIGN(8) vec3 t;
/* rotate */
glm_mat4_pick3t(mat, r);
glm_mat4_ins3(r, mat);
/* translate */
glm_mat3_mulv(r, mat[3], t);
glm_vec3_negate(t);
glm_vec3_copy(t, mat[3]);
}
#endif
TEST_IMPL(GLM_PREFIX, mul) { TEST_IMPL(GLM_PREFIX, mul) {
mat4 m1 = GLM_MAT4_IDENTITY_INIT; mat4 m1 = GLM_MAT4_IDENTITY_INIT;
mat4 m2 = GLM_MAT4_IDENTITY_INIT; mat4 m2 = GLM_MAT4_IDENTITY_INIT;
@@ -81,6 +100,12 @@ TEST_IMPL(GLM_PREFIX, inv_tr) {
GLM(mat4_inv)(m1, m2); GLM(mat4_inv)(m1, m2);
GLM(inv_tr)(m2); GLM(inv_tr)(m2);
ASSERTIFY(test_assert_mat4_eq(m1, m2)) ASSERTIFY(test_assert_mat4_eq(m1, m2))
/* test with raw */
glm_mat4_copy(m1, m2);
glm_inv_tr_raw(m2);
GLM(inv_tr)(m1);
ASSERTIFY(test_assert_mat4_eq(m1, m2))
} }
TEST_SUCCESS TEST_SUCCESS

View File

@@ -708,6 +708,38 @@ TEST_IMPL(GLM_PREFIX, quat_lerpc) {
TEST_SUCCESS TEST_SUCCESS
} }
TEST_IMPL(GLM_PREFIX, quat_nlerp) {
versor q1, q2, q3, q4;
vec3 v1 = {10.0f, 0.0f, 0.0f}, v2;
glm_quatv(q1, glm_rad(30.0f), v1);
glm_quatv(q2, glm_rad(90.0f), v1);
GLM(quat_nlerp)(q1, q2, 1.0f, q3);
glm_quat_normalize(q2);
ASSERTIFY(test_assert_quat_eq(q2, q3));
glm_quatv(q1, glm_rad(30.001f), v1);
glm_quatv(q2, glm_rad(30.002f), v1);
GLM(quat_nlerp)(q1, q2, 0.7f, q3);
glm_quat_lerp(q1, q2, 0.7f, q4);
ASSERTIFY(test_assert_quat_eq(q3, q4));
glm_quatv(q1, glm_rad(30.0f), v1);
glm_quatv(q2, glm_rad(90.0f), v1);
GLM(quat_nlerp)(q1, q2, 0.5f, q3);
glm_quat_axis(q3, v2);
glm_vec3_normalize(v1);
glm_vec3_normalize(v2);
ASSERT(glm_quat_angle(q3) > glm_rad(30.0f));
ASSERT(glm_quat_angle(q3) < glm_rad(90.0f));
ASSERTIFY(test_assert_vec3_eq(v1, v2))
TEST_SUCCESS
}
TEST_IMPL(GLM_PREFIX, quat_slerp) { TEST_IMPL(GLM_PREFIX, quat_slerp) {
versor q1, q2, q3, q4; versor q1, q2, q3, q4;
vec3 v1 = {10.0f, 0.0f, 0.0f}, v2; vec3 v1 = {10.0f, 0.0f, 0.0f}, v2;

View File

@@ -284,6 +284,7 @@ TEST_DECLARE(glm_quat_mat3)
TEST_DECLARE(glm_quat_mat3t) TEST_DECLARE(glm_quat_mat3t)
TEST_DECLARE(glm_quat_lerp) TEST_DECLARE(glm_quat_lerp)
TEST_DECLARE(glm_quat_lerpc) TEST_DECLARE(glm_quat_lerpc)
TEST_DECLARE(glm_quat_nlerp)
TEST_DECLARE(glm_quat_slerp) TEST_DECLARE(glm_quat_slerp)
TEST_DECLARE(glm_quat_look) TEST_DECLARE(glm_quat_look)
TEST_DECLARE(glm_quat_for) TEST_DECLARE(glm_quat_for)
@@ -320,6 +321,7 @@ TEST_DECLARE(glmc_quat_mat3)
TEST_DECLARE(glmc_quat_mat3t) TEST_DECLARE(glmc_quat_mat3t)
TEST_DECLARE(glmc_quat_lerp) TEST_DECLARE(glmc_quat_lerp)
TEST_DECLARE(glmc_quat_lerpc) TEST_DECLARE(glmc_quat_lerpc)
TEST_DECLARE(glmc_quat_nlerp)
TEST_DECLARE(glmc_quat_slerp) TEST_DECLARE(glmc_quat_slerp)
TEST_DECLARE(glmc_quat_look) TEST_DECLARE(glmc_quat_look)
TEST_DECLARE(glmc_quat_for) TEST_DECLARE(glmc_quat_for)
@@ -1006,6 +1008,7 @@ TEST_LIST {
TEST_ENTRY(glm_quat_mat3t) TEST_ENTRY(glm_quat_mat3t)
TEST_ENTRY(glm_quat_lerp) TEST_ENTRY(glm_quat_lerp)
TEST_ENTRY(glm_quat_lerpc) TEST_ENTRY(glm_quat_lerpc)
TEST_ENTRY(glm_quat_nlerp)
TEST_ENTRY(glm_quat_slerp) TEST_ENTRY(glm_quat_slerp)
TEST_ENTRY(glm_quat_look) TEST_ENTRY(glm_quat_look)
TEST_ENTRY(glm_quat_for) TEST_ENTRY(glm_quat_for)
@@ -1042,6 +1045,7 @@ TEST_LIST {
TEST_ENTRY(glmc_quat_mat3t) TEST_ENTRY(glmc_quat_mat3t)
TEST_ENTRY(glmc_quat_lerp) TEST_ENTRY(glmc_quat_lerp)
TEST_ENTRY(glmc_quat_lerpc) TEST_ENTRY(glmc_quat_lerpc)
TEST_ENTRY(glmc_quat_nlerp)
TEST_ENTRY(glmc_quat_slerp) TEST_ENTRY(glmc_quat_slerp)
TEST_ENTRY(glmc_quat_look) TEST_ENTRY(glmc_quat_look)
TEST_ENTRY(glmc_quat_for) TEST_ENTRY(glmc_quat_for)

View File

@@ -90,7 +90,10 @@
<ClInclude Include="..\include\cglm\simd\avx\affine.h" /> <ClInclude Include="..\include\cglm\simd\avx\affine.h" />
<ClInclude Include="..\include\cglm\simd\avx\mat4.h" /> <ClInclude Include="..\include\cglm\simd\avx\mat4.h" />
<ClInclude Include="..\include\cglm\simd\intrin.h" /> <ClInclude Include="..\include\cglm\simd\intrin.h" />
<ClInclude Include="..\include\cglm\simd\neon\affine.h" />
<ClInclude Include="..\include\cglm\simd\neon\mat2.h" />
<ClInclude Include="..\include\cglm\simd\neon\mat4.h" /> <ClInclude Include="..\include\cglm\simd\neon\mat4.h" />
<ClInclude Include="..\include\cglm\simd\neon\quat.h" />
<ClInclude Include="..\include\cglm\simd\sse2\affine.h" /> <ClInclude Include="..\include\cglm\simd\sse2\affine.h" />
<ClInclude Include="..\include\cglm\simd\sse2\mat2.h" /> <ClInclude Include="..\include\cglm\simd\sse2\mat2.h" />
<ClInclude Include="..\include\cglm\simd\sse2\mat3.h" /> <ClInclude Include="..\include\cglm\simd\sse2\mat3.h" />

View File

@@ -370,5 +370,14 @@
<ClInclude Include="..\include\cglm\struct\affine2d.h"> <ClInclude Include="..\include\cglm\struct\affine2d.h">
<Filter>include\cglm\struct</Filter> <Filter>include\cglm\struct</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="..\include\cglm\simd\neon\affine.h">
<Filter>include\cglm\simd\neon</Filter>
</ClInclude>
<ClInclude Include="..\include\cglm\simd\neon\mat2.h">
<Filter>include\cglm\simd\neon</Filter>
</ClInclude>
<ClInclude Include="..\include\cglm\simd\neon\quat.h">
<Filter>include\cglm\simd\neon</Filter>
</ClInclude>
</ItemGroup> </ItemGroup>
</Project> </Project>