arm, neon: neon/fma support for glm_quat_mul()

build: add missing files to build files
arm, neon: neon/fma support for glm_mat2_mul()
2026-02-17 03:39:05 +00:00 · 2021-04-29 01:12:00 +03:00 · 2021-04-28 22:45:03 +03:00 · 2021-04-28 22:06:46 +03:00 · 2021-04-28 14:46:14 +03:00 · 2021-04-27 23:52:40 +03:00
12 changed files with 192 additions and 3 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -109,7 +109,10 @@ cglm_simd_avx_HEADERS = include/cglm/simd/avx/mat4.h \
                        include/cglm/simd/avx/affine.h

 cglm_simd_neondir=$(includedir)/cglm/simd/neon
-cglm_simd_neon_HEADERS = include/cglm/simd/neon/mat4.h
+cglm_simd_neon_HEADERS = include/cglm/simd/neon/mat4.h \
+                         include/cglm/simd/neon/mat2.h \
+                         include/cglm/simd/neon/affine.h \
+                         include/cglm/simd/neon/quat.h

 cglm_structdir=$(includedir)/cglm/struct
 cglm_struct_HEADERS = include/cglm/struct/mat4.h \
--- a/docs/source/opengl.rst
+++ b/docs/source/opengl.rst
@@ -2,7 +2,7 @@ How to send vector or matrix to OpenGL like API
 ==================================================

 *cglm*'s vector and matrix types are arrays. So you can send them directly to a
-function which accecpts pointer. But you may got warnings for matrix because it is
+function which accepts a pointer. But you may get warnings for a matrix because it is a
 two dimensional array.

 Passing / Uniforming Matrix to OpenGL:
--- a/include/cglm/affine-mat.h
+++ b/include/cglm/affine-mat.h
@@ -158,6 +158,8 @@ void
 glm_inv_tr(mat4 mat) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
  glm_inv_tr_sse2(mat);
+#elif defined(CGLM_NEON_FP)
+  glm_inv_tr_neon(mat);
 #else
  CGLM_ALIGN_MAT mat3 r;
  CGLM_ALIGN(8)  vec3 t;
--- a/include/cglm/mat2.h
+++ b/include/cglm/mat2.h
@@ -40,6 +40,10 @@
 #  include "simd/sse2/mat2.h"
 #endif

+#ifdef CGLM_NEON_FP
+#  include "simd/neon/mat2.h"
+#endif
+
 #define GLM_MAT2_IDENTITY_INIT  {{1.0f, 0.0f}, {0.0f, 1.0f}}
 #define GLM_MAT2_ZERO_INIT      {{0.0f, 0.0f}, {0.0f, 0.0f}}

@@ -130,6 +134,8 @@ void
 glm_mat2_mul(mat2 m1, mat2 m2, mat2 dest) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
  glm_mat2_mul_sse2(m1, m2, dest);
+#elif defined(CGLM_NEON_FP)
+  glm_mat2_mul_neon(m1, m2, dest);
 #else
  float a00 = m1[0][0], a01 = m1[0][1],
        a10 = m1[1][0], a11 = m1[1][1],
--- a/include/cglm/quat.h
+++ b/include/cglm/quat.h
@@ -63,6 +63,10 @@
 #  include "simd/sse2/quat.h"
 #endif

+#ifdef CGLM_NEON_FP
+#  include "simd/neon/quat.h"
+#endif
+
 CGLM_INLINE
 void
 glm_mat4_mulv(mat4 m, vec4 v, vec4 dest);
@@ -412,6 +416,8 @@ glm_quat_mul(versor p, versor q, versor dest) {
   */
 #if defined( __SSE__ ) || defined( __SSE2__ )
  glm_quat_mul_sse2(p, q, dest);
+#elif defined(CGLM_NEON_FP)
+  glm_quat_mul_neon(p, q, dest);
 #else
  dest[0] = p[3] * q[0] + p[0] * q[3] + p[1] * q[2] - p[2] * q[1];
  dest[1] = p[3] * q[1] - p[0] * q[2] + p[1] * q[3] + p[2] * q[0];
--- a/include/cglm/simd/neon/affine.h
+++ b/include/cglm/simd/neon/affine.h
@@ -76,5 +76,41 @@ glm_mul_rot_neon(mat4 m1, mat4 m2, mat4 dest) {
  glmm_store(dest[3], glmm_load(m1[3]));
 }

+CGLM_INLINE
+void
+glm_inv_tr_neon(mat4 mat) {
+  float32x4x4_t vmat;
+  glmm_128      r0, r1, r2, r3, x0;
+
+  vmat = vld4q_f32(mat[0]); /* de-interleaving load: val[i] = { mat[0][i], mat[1][i], mat[2][i], mat[3][i] } */
+  r0   = vmat.val[0];
+  r1   = vmat.val[1];
+  r2   = vmat.val[2];
+  r3   = vmat.val[3]; /* only referenced by the disabled TODO variant at the end */
+
+  x0 = glmm_fmadd(r0, glmm_splat_w(r0),
+                  glmm_fmadd(r1, glmm_splat_w(r1),
+                             vmulq_f32(r2, glmm_splat_w(r2)))); /* r0 * r0[3] + r1 * r1[3] + r2 * r2[3], assuming glmm_fmadd(a, b, c) = a * b + c */
+  x0 = glmm_xor(x0, glmm_set1(-0.f)); /* xor with -0.f toggles the sign bit: negates every lane */
+
+  glmm_store(mat[0], r0);
+  glmm_store(mat[1], r1);
+  glmm_store(mat[2], r2);
+  glmm_store(mat[3], x0);
+  
+  mat[0][3] = 0.0f; /* element [3] of each vector stored above is replaced by the fixed values 0, 0, 0, 1 */
+  mat[1][3] = 0.0f;
+  mat[2][3] = 0.0f;
+  mat[3][3] = 1.0f;
+
+  /* TODO: ? disabled variant that writes the same four elements as the scalar stores above from the lanes of r3 (zo is not declared in this function)
+  zo   = vget_high_f32(r3);
+  vst1_lane_f32(&mat[0][3], zo, 0);
+  vst1_lane_f32(&mat[1][3], zo, 0);
+  vst1_lane_f32(&mat[2][3], zo, 0);
+  vst1_lane_f32(&mat[3][3], zo, 1);
+  */
+}
+
 #endif
 #endif /* cglm_affine_neon_h */
--- a/include/cglm/simd/neon/mat2.h
+++ b/include/cglm/simd/neon/mat2.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c), Recep Aslantas.
+ *
+ * MIT License (MIT), http://opensource.org/licenses/MIT
+ * Full license can be found in the LICENSE file
+ */
+
+#ifndef cglm_mat2_neon_h
+#define cglm_mat2_neon_h
+#if defined(__ARM_NEON_FP)
+
+#include "../../common.h"
+#include "../intrin.h"
+
+CGLM_INLINE
+void
+glm_mat2_mul_neon(mat2 m1, mat2 m2, mat2 dest) {
+  float32x4x2_t a1;
+  glmm_128 x0,  x1, x2;
+  float32x2_t   dc, ba;
+
+  x1 = glmm_load(m1[0]); /* d c b a  (highest lane first; a = m1[0][0]) */
+  x2 = glmm_load(m2[0]); /* h g f e  (highest lane first; e = m2[0][0]) */
+  
+  dc  = vget_high_f32(x1); /* lanes 2 and 3 of m1 */
+  ba  = vget_low_f32(x1);  /* lanes 0 and 1 of m1 */
+
+  /* a1.val[0] = g g e e, a1.val[1] = h h f f  (highest lane first) */
+  a1 = vtrnq_f32(x2, x2);
+  
+  /* the four result lanes, lane 0 first, written to dest with a single store:
+   dest[0][0] = a * e + c * f;
+   dest[0][1] = b * e + d * f;
+   dest[1][0] = a * g + c * h;
+   dest[1][1] = b * g + d * h;
+   (assumes glmm_fmadd(x, y, z) computes x * y + z) */
+  x0 = glmm_fmadd(vcombine_f32(ba, ba), a1.val[0],
+                  vmulq_f32(vcombine_f32(dc, dc), a1.val[1])); /* lane 0 first: (a b a b) * (e e g g) + (c d c d) * (f f h h) */
+
+  glmm_store(dest[0], x0);
+}
+
+#endif
+#endif /* cglm_mat2_neon_h */
--- a/include/cglm/simd/neon/quat.h
+++ b/include/cglm/simd/neon/quat.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c), Recep Aslantas.
+ *
+ * MIT License (MIT), http://opensource.org/licenses/MIT
+ * Full license can be found in the LICENSE file
+ */
+
+#ifndef cglm_quat_neon_h
+#define cglm_quat_neon_h
+#if defined(__ARM_NEON_FP)
+
+#include "../../common.h"
+#include "../intrin.h"
+
+CGLM_INLINE
+void
+glm_quat_mul_neon(versor p, versor q, versor dest) {
+  /* dest = p * q (quaternion product); a = element [3], b = [0], c = [1], d = [2], suffix 1 = p, 2 = q; result lanes 0..3:
+   + (a1 b2 + b1 a2 + c1 d2 - d1 c2)i
+   + (a1 c2 - b1 d2 + c1 a2 + d1 b2)j
+   + (a1 d2 + b1 c2 - c1 b2 + d1 a2)k
+     a1 a2 - b1 b2 - c1 c2 - d1 d2
+   (assumes glmm_fmadd(x, y, z) computes x * y + z; s1, s2, s3 are sign-bit masks that supply the minus signs) */
+
+  glmm_128 xp, xq, xqr, r, x, y, z, s2, s3;
+  glmm_128 s1 = {-0.f, 0.f, 0.f, -0.f}; /* sign bit set in lanes 0 and 3 */
+  float32x2_t   qh, ql;
+  
+  xp  = glmm_load(p); /* 3 2 1 0  (highest lane first) */
+  xq  = glmm_load(q);
+
+  r   = vmulq_f32(glmm_splat_w(xp), xq); /* p[3] * q: first term of every lane */
+  x   = glmm_splat_x(xp);
+  y   = glmm_splat_y(xp);
+  z   = glmm_splat_z(xp);
+
+  ql  = vget_high_f32(s1);          /* (0, -0); ql is reused for q further down */
+  s3  = vcombine_f32(ql, ql);       /* sign bit set in lanes 1 and 3 */
+  s2  = vzipq_f32(s3, s3).val[0];   /* sign bit set in lanes 2 and 3 */
+
+  xqr = vrev64q_f32(xq);            /* lane 0 first: q[1] q[0] q[3] q[2] */
+  qh  = vget_high_f32(xqr);         /* q[3] q[2] */
+  ql  = vget_low_f32(xqr);          /* q[1] q[0] */
+
+  r = glmm_fmadd(glmm_xor(x, s3), vcombine_f32(qh, ql), r); /* += (p0, -p0, p0, -p0) * (q3, q2, q1, q0) */
+  
+  r = glmm_fmadd(glmm_xor(y, s2), vcombine_f32(vget_high_f32(xq),
+                                               vget_low_f32(xq)), r); /* += (p1, p1, -p1, -p1) * (q2, q3, q0, q1) */
+  
+  r = glmm_fmadd(glmm_xor(z, s1), vcombine_f32(ql, qh), r); /* += (-p2, p2, p2, -p2) * (q1, q0, q3, q2) */
+
+  glmm_store(dest, r);
+}
+
+#endif
+#endif /* cglm_quat_neon_h */
--- a/include/cglm/simd/sse2/quat.h
+++ b/include/cglm/simd/sse2/quat.h
@@ -41,6 +41,5 @@ glm_quat_mul_sse2(versor p, versor q, versor dest) {
  glmm_store(dest, r);
 }

-
 #endif
 #endif /* cglm_quat_simd_h */
--- a/test/src/test_affine_mat.h
+++ b/test/src/test_affine_mat.h
@@ -7,6 +7,25 @@

 #include "test_common.h"

+#ifndef glm_affine_mat_test_guard
+#define glm_affine_mat_test_guard
+CGLM_INLINE
+void
+glm_inv_tr_raw(mat4 mat) { /* non-SIMD reference that the inv_tr test compares GLM(inv_tr) against; modifies mat in place */
+  CGLM_ALIGN_MAT mat3 r;
+  CGLM_ALIGN(8)  vec3 t;
+
+  /* rotation part: r is filled by glm_mat4_pick3t(mat, r), then inserted back into mat */
+  glm_mat4_pick3t(mat, r);
+  glm_mat4_ins3(r, mat);
+
+  /* translation part: t = glm_mat3_mulv(r, mat[3]), negated, then copied into mat[3] */
+  glm_mat3_mulv(r, mat[3], t);
+  glm_vec3_negate(t);
+  glm_vec3_copy(t, mat[3]);
+}
+#endif
+
 TEST_IMPL(GLM_PREFIX, mul) {
  mat4 m1 = GLM_MAT4_IDENTITY_INIT;
  mat4 m2 = GLM_MAT4_IDENTITY_INIT;
@@ -81,6 +100,12 @@ TEST_IMPL(GLM_PREFIX, inv_tr) {
    GLM(mat4_inv)(m1, m2);
    GLM(inv_tr)(m2);
    ASSERTIFY(test_assert_mat4_eq(m1, m2))
+    
+    /* test with raw */
+    glm_mat4_copy(m1, m2);
+    glm_inv_tr_raw(m2);
+    GLM(inv_tr)(m1);
+    ASSERTIFY(test_assert_mat4_eq(m1, m2))
  }

  TEST_SUCCESS
--- a/win/cglm.vcxproj
+++ b/win/cglm.vcxproj
@@ -90,7 +90,10 @@
    <ClInclude Include="..\include\cglm\simd\avx\affine.h" />
    <ClInclude Include="..\include\cglm\simd\avx\mat4.h" />
    <ClInclude Include="..\include\cglm\simd\intrin.h" />
+    <ClInclude Include="..\include\cglm\simd\neon\affine.h" />
+    <ClInclude Include="..\include\cglm\simd\neon\mat2.h" />
    <ClInclude Include="..\include\cglm\simd\neon\mat4.h" />
+    <ClInclude Include="..\include\cglm\simd\neon\quat.h" />
    <ClInclude Include="..\include\cglm\simd\sse2\affine.h" />
    <ClInclude Include="..\include\cglm\simd\sse2\mat2.h" />
    <ClInclude Include="..\include\cglm\simd\sse2\mat3.h" />
--- a/win/cglm.vcxproj.filters
+++ b/win/cglm.vcxproj.filters
@@ -370,5 +370,14 @@
    <ClInclude Include="..\include\cglm\struct\affine2d.h">
      <Filter>include\cglm\struct</Filter>
    </ClInclude>
+    <ClInclude Include="..\include\cglm\simd\neon\affine.h">
+      <Filter>include\cglm\simd\neon</Filter>
+    </ClInclude>
+    <ClInclude Include="..\include\cglm\simd\neon\mat2.h">
+      <Filter>include\cglm\simd\neon</Filter>
+    </ClInclude>
+    <ClInclude Include="..\include\cglm\simd\neon\quat.h">
+      <Filter>include\cglm\simd\neon</Filter>
+    </ClInclude>
  </ItemGroup>
 </Project>
Author	SHA1	Message	Date
Recep Aslantas	04eaf9c535	arm, neon: neon/fma support for glm_quat_mul()	2021-04-29 01:12:00 +03:00
Recep Aslantas	bd6641bd0a	build: add missing files to build files	2021-04-28 22:45:03 +03:00
Recep Aslantas	4e4bff418d	arm, neon: neon/fma support for glm_mat2_mul()	2021-04-28 22:06:46 +03:00
Recep Aslantas	55ebbdbe40	arm, neon: neon/fma support for glm_inv_tr()	2021-04-28 14:46:14 +03:00
Recep Aslantas	e4c35e32fc	Merge pull request #190 from ylecuyer/patch-3 Minor typo in doc	2021-04-27 23:52:40 +03:00
Yoann Lecuyer	ec467fef1f	Minor typo in doc I stumbled upon while reading the doc	2021-04-27 22:09:13 +02:00
Recep Aslantas	1e8865233b	Merge pull request #189 from recp/simd-2 ARM Neon Update	2021-04-25 15:20:24 +03:00