Compare commits

..

4 Commits

Author SHA1 Message Date
Recep Aslantas
6626d2b74f Update affine.h 2020-11-22 01:25:34 +03:00
Recep Aslantas
5bda762df6 use epsilon to compare results in glm_uniscaled() 2020-11-22 01:14:18 +03:00
Recep Aslantas
f9824a8dc6 test: use custom epsilon to compare 2020-11-22 00:59:48 +03:00
Recep Aslantas
ccd3058adc remove CGLM_USE_DEFAULT_EPSILON
* to override float epsilon we just need to define GLM_FLT_EPSILON
* CGLM_USE_DEFAULT_EPSILON was redundant; it also forced overriding the system default epsilon, which may not be a good idea because not all systems may support smaller epsilon values
2020-11-22 00:38:57 +03:00
24 changed files with 237 additions and 363 deletions

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.8.2) cmake_minimum_required(VERSION 3.8.2)
project(cglm VERSION 0.8.1 LANGUAGES C) project(cglm VERSION 0.8.0 LANGUAGES C)
set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED YES) set(CMAKE_C_STANDARD_REQUIRED YES)
@@ -18,7 +18,7 @@ else(CGLM_STATIC)
endif() endif()
if(CGLM_USE_C99) if(CGLM_USE_C99)
set(CMAKE_C_STANDARD 99) set(C_STANDARD 99)
endif() endif()
if(MSVC) if(MSVC)
@@ -93,11 +93,6 @@ target_include_directories(${PROJECT_NAME}
${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/src
) )
# Header-only usage: an INTERFACE library that compiles nothing itself and
# only hands the public include directory to targets that link against it.
add_library(${PROJECT_NAME}_headers INTERFACE)
target_include_directories(
  ${PROJECT_NAME}_headers
  INTERFACE
    ${CMAKE_CURRENT_SOURCE_DIR}/include
)
# Test Configuration # Test Configuration
if(CGLM_USE_TEST) if(CGLM_USE_TEST)
include(CTest) include(CTest)
@@ -122,7 +117,6 @@ export(TARGETS ${PROJECT_NAME}
) )
install(EXPORT ${PROJECT_NAME} install(EXPORT ${PROJECT_NAME}
FILE "${PROJECT_NAME}Config.cmake"
NAMESPACE ${PROJECT_NAME}:: NAMESPACE ${PROJECT_NAME}::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}) DESTINATION ${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}/cmake)

View File

@@ -168,24 +168,6 @@ option(CGLM_USE_C99 "" OFF) # C11
option(CGLM_USE_TEST "Enable Tests" OFF) # for make check - make test option(CGLM_USE_TEST "Enable Tests" OFF) # for make check - make test
``` ```
#### Use as header-only library with your CMake project
This requires no building or installation of cglm.
* Example:
``` cmake
cmake_minimum_required(VERSION 3.8.2)
project(<Your Project Name>)
add_executable(${PROJECT_NAME} src/main.c)
target_link_libraries(${PROJECT_NAME} PRIVATE
cglm_headers)
add_subdirectory(external/cglm/ EXCLUDE_FROM_ALL)
```
#### Use with your CMake project #### Use with your CMake project
* Example: * Example:
```cmake ```cmake

View File

@@ -2,7 +2,7 @@ Pod::Spec.new do |s|
# Description # Description
s.name = "cglm" s.name = "cglm"
s.version = "0.8.0" s.version = "0.7.9"
s.summary = "📽 Highly Optimized Graphics Math (glm) for C" s.summary = "📽 Highly Optimized Graphics Math (glm) for C"
s.description = <<-DESC s.description = <<-DESC
cglm is math library for graphics programming for C. See the documentation or README for all features. cglm is math library for graphics programming for C. See the documentation or README for all features.

View File

@@ -7,7 +7,7 @@
#***************************************************************************** #*****************************************************************************
AC_PREREQ([2.69]) AC_PREREQ([2.69])
AC_INIT([cglm], [0.8.1], [info@recp.me]) AC_INIT([cglm], [0.8.0], [info@recp.me])
AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects serial-tests]) AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects serial-tests])
# Don't use the default cflags (-O2 -g), we set ours manually in Makefile.am. # Don't use the default cflags (-O2 -g), we set ours manually in Makefile.am.

View File

@@ -32,22 +32,6 @@ If you don't want to install **cglm** to your system's folder you can get static
option(CGLM_USE_C99 "" OFF) # C11 option(CGLM_USE_C99 "" OFF) # C11
option(CGLM_USE_TEST "Enable Tests" OFF) # for make check - make test option(CGLM_USE_TEST "Enable Tests" OFF) # for make check - make test
**Use as header-only library with your CMake project example**
This requires no building or installation of cglm.
.. code-block:: CMake
:linenos:
cmake_minimum_required(VERSION 3.8.2)
project(<Your Project Name>)
add_executable(${PROJECT_NAME} src/main.c)
target_link_libraries(${PROJECT_NAME} PRIVATE
cglm_headers)
add_subdirectory(external/cglm/ EXCLUDE_FROM_ALL)
**Use with your CMake project example** **Use with your CMake project example**
.. code-block:: CMake .. code-block:: CMake

View File

@@ -62,9 +62,9 @@ author = u'Recep Aslantas'
# built documents. # built documents.
# #
# The short X.Y version. # The short X.Y version.
version = u'0.8.1' version = u'0.8.0'
# The full version, including alpha/beta/rc tags. # The full version, including alpha/beta/rc tags.
release = u'0.8.1' release = u'0.8.0'
# The language for content autogenerated by Sphinx. Refer to documentation # The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages. # for a list of supported languages.

View File

@@ -425,7 +425,7 @@ bool
glm_uniscaled(mat4 m) { glm_uniscaled(mat4 m) {
CGLM_ALIGN(8) vec3 s; CGLM_ALIGN(8) vec3 s;
glm_decompose_scalev(m, s); glm_decompose_scalev(m, s);
return glm_vec3_eq_all(s); return glm_vec3_eq_eps(s, s[0]);
} }
/*! /*!
@@ -455,7 +455,7 @@ glm_decompose_rs(mat4 m, mat4 r, vec3 s) {
glm_vec4_scale(r[1], 1.0f/s[1], r[1]); glm_vec4_scale(r[1], 1.0f/s[1], r[1]);
glm_vec4_scale(r[2], 1.0f/s[2], r[2]); glm_vec4_scale(r[2], 1.0f/s[2], r[2]);
/* Note from Apple Open Source (assume that the matrix is orthonormal): /* Note from Apple Open Source (asume that the matrix is orthonormal):
check for a coordinate system flip. If the determinant check for a coordinate system flip. If the determinant
is -1, then negate the matrix and the scaling factors. */ is -1, then negate the matrix and the scaling factors. */
glm_vec3_cross(m[0], m[1], v); glm_vec3_cross(m[0], m[1], v);

View File

@@ -228,8 +228,6 @@ glm_aabb_aabb(vec3 box[2], vec3 other[2]) {
* https://github.com/erich666/GraphicsGems/blob/master/gems/BoxSphere.c * https://github.com/erich666/GraphicsGems/blob/master/gems/BoxSphere.c
* Solid Box - Solid Sphere test. * Solid Box - Solid Sphere test.
* *
* Sphere Representation in cglm: [center.x, center.y, center.z, radii]
*
* @param[in] box solid bounding box * @param[in] box solid bounding box
* @param[in] s solid sphere * @param[in] s solid sphere
*/ */
@@ -239,13 +237,13 @@ glm_aabb_sphere(vec3 box[2], vec4 s) {
float dmin; float dmin;
int a, b, c; int a, b, c;
a = (s[0] < box[0][0]) + (s[0] > box[1][0]); a = s[0] >= box[0][0];
b = (s[1] < box[0][1]) + (s[1] > box[1][1]); b = s[1] >= box[0][1];
c = (s[2] < box[0][2]) + (s[2] > box[1][2]); c = s[2] >= box[0][2];
dmin = glm_pow2((s[0] - box[!(a - 1)][0]) * (a != 0)) dmin = glm_pow2(s[0] - box[a][0])
+ glm_pow2((s[1] - box[!(b - 1)][1]) * (b != 0)) + glm_pow2(s[1] - box[b][1])
+ glm_pow2((s[2] - box[!(c - 1)][2]) * (c != 0)); + glm_pow2(s[2] - box[c][2]);
return dmin <= glm_pow2(s[3]); return dmin <= glm_pow2(s[3]);
} }

View File

@@ -42,12 +42,18 @@
#include "types.h" #include "types.h"
#include "simd/intrin.h" #include "simd/intrin.h"
#ifndef CGLM_USE_DEFAULT_EPSILON /** CGLM_USE_DEFAULT_EPSILON is removed, to override float epsilon,
# ifndef GLM_FLT_EPSILON * just define GLM_FLT_EPSILON with epsilon value like below
# define GLM_FLT_EPSILON 1e-6 *
* #define GLM_FLT_EPSILON 1e-6f
*/
#ifndef GLM_FLT_EPSILON
# ifndef FLT_EPSILON
# define GLM_FLT_EPSILON 1e-6f
# else
# define GLM_FLT_EPSILON FLT_EPSILON
# endif # endif
#else
# define GLM_FLT_EPSILON FLT_EPSILON
#endif #endif
#endif /* cglm_common_h */ #endif /* cglm_common_h */

View File

@@ -539,9 +539,7 @@ glm_mat4_scale_p(mat4 m, float s) {
CGLM_INLINE CGLM_INLINE
void void
glm_mat4_scale(mat4 m, float s) { glm_mat4_scale(mat4 m, float s) {
#ifdef __AVX__ #if defined( __SSE__ ) || defined( __SSE2__ )
glm_mat4_scale_avx(m, s);
#elif defined( __SSE__ ) || defined( __SSE2__ )
glm_mat4_scale_sse2(m, s); glm_mat4_scale_sse2(m, s);
#elif defined(CGLM_NEON_FP) #elif defined(CGLM_NEON_FP)
glm_mat4_scale_neon(m, s); glm_mat4_scale_neon(m, s);

View File

@@ -79,41 +79,5 @@ glmm_norm_inf(float32x4_t a) {
return glmm_hmax(glmm_abs(a)); return glmm_hmax(glmm_abs(a));
} }
/*!
 * @brief per-lane multiply-add: a * b + c
 *
 * AArch64 builds use vfmaq_f32; other ARM builds use vmlaq_f32. Both take the
 * accumulator as their first argument, hence the (c, a, b) order.
 *
 * @param[in] a first factor
 * @param[in] b second factor
 * @param[in] c addend
 * @return a * b + c
 */
static inline
float32x4_t
glmm_fmadd(float32x4_t a, float32x4_t b, float32x4_t c) {
  float32x4_t acc;

#if defined(__aarch64__)
  acc = vfmaq_f32(c, a, b);
#else
  acc = vmlaq_f32(c, a, b);
#endif

  return acc;
}
/*!
 * @brief per-lane negated multiply-add: -(a * b) + c
 *
 * AArch64 builds use vfmsq_f32; other ARM builds use vmlsq_f32. Both subtract
 * the product of their last two arguments from the first one.
 *
 * @param[in] a first factor
 * @param[in] b second factor
 * @param[in] c value the product is subtracted from
 * @return c - a * b
 */
static inline
float32x4_t
glmm_fnmadd(float32x4_t a, float32x4_t b, float32x4_t c) {
  float32x4_t acc;

#if defined(__aarch64__)
  acc = vfmsq_f32(c, a, b);
#else
  acc = vmlsq_f32(c, a, b);
#endif

  return acc;
}
/*!
 * @brief per-lane multiply-subtract: a * b - c
 *
 * NEON has no intrinsic with this exact operand arrangement: vfmsq_f32 and
 * vmlsq_f32 compute "accumulator - a * b", which is glmm_fnmadd, not fmsub.
 * The result is therefore formed as a multiply-add onto the negated c,
 * i.e. (-c) + a * b, matching the x86 glmm_fmsub semantics.
 *
 * @param[in] a first factor
 * @param[in] b second factor
 * @param[in] c value subtracted from the product
 * @return a * b - c
 */
static inline
float32x4_t
glmm_fmsub(float32x4_t a, float32x4_t b, float32x4_t c) {
#if defined(__aarch64__)
  return vfmaq_f32(vnegq_f32(c), a, b);
#else
  return vmlaq_f32(vnegq_f32(c), a, b);
#endif
}
/*!
 * @brief per-lane negated multiply-subtract: -(a * b) - c
 *
 * Computed as 0 - glmm_fmadd(a, b, c), i.e. the multiply-add result
 * subtracted from a zero vector.
 *
 * @param[in] a first factor
 * @param[in] b second factor
 * @param[in] c addend that is negated together with the product
 * @return -(a * b + c)
 */
static inline
float32x4_t
glmm_fnmsub(float32x4_t a, float32x4_t b, float32x4_t c) {
  float32x4_t sum  = glmm_fmadd(a, b, c);
  float32x4_t zero = vdupq_n_f32(0.0f);

  return vsubq_f32(zero, sum);
}
#endif #endif
#endif /* cglm_simd_arm_h */ #endif /* cglm_simd_arm_h */

View File

@@ -14,16 +14,6 @@
#include <immintrin.h> #include <immintrin.h>
/*!
 * @brief multiply a matrix by a scalar in place using 256-bit AVX registers
 *
 * The matrix is processed as two 256-bit chunks, one starting at m[0] and one
 * starting at m[2] (presumably two vec4 columns each; the mat4 layout is
 * declared elsewhere, confirm there).
 *
 * @param[in, out] m matrix to scale
 * @param[in]      s scalar factor
 */
CGLM_INLINE
void
glm_mat4_scale_avx(mat4 m, float s) {
  __m256 factor, chunk;

  factor = _mm256_set1_ps(s);

  /* chunk starting at m[0] */
  chunk = glmm_load256(m[0]);
  glmm_store256(m[0], _mm256_mul_ps(factor, chunk));

  /* chunk starting at m[2] */
  chunk = glmm_load256(m[2]);
  glmm_store256(m[2], _mm256_mul_ps(factor, chunk));
}
CGLM_INLINE CGLM_INLINE
void void
glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) { glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {

View File

@@ -25,29 +25,28 @@ glm_mul_sse2(mat4 m1, mat4 m2, mat4 dest) {
r = glmm_load(m2[0]); r = glmm_load(m2[0]);
glmm_store(dest[0], glmm_store(dest[0],
glmm_fmadd(glmm_shuff1x(r, 0), l0, _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
glmm_fmadd(glmm_shuff1x(r, 1), l1, _mm_mul_ps(glmm_shuff1x(r, 1), l1)),
_mm_mul_ps(glmm_shuff1x(r, 2), l2)))); _mm_mul_ps(glmm_shuff1x(r, 2), l2)));
r = glmm_load(m2[1]); r = glmm_load(m2[1]);
glmm_store(dest[1], glmm_store(dest[1],
glmm_fmadd(glmm_shuff1x(r, 0), l0, _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
glmm_fmadd(glmm_shuff1x(r, 1), l1, _mm_mul_ps(glmm_shuff1x(r, 1), l1)),
_mm_mul_ps(glmm_shuff1x(r, 2), l2)))); _mm_mul_ps(glmm_shuff1x(r, 2), l2)));
r = glmm_load(m2[2]); r = glmm_load(m2[2]);
glmm_store(dest[2], glmm_store(dest[2],
glmm_fmadd(glmm_shuff1x(r, 0), l0, _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
glmm_fmadd(glmm_shuff1x(r, 1), l1, _mm_mul_ps(glmm_shuff1x(r, 1), l1)),
_mm_mul_ps(glmm_shuff1x(r, 2), l2)))); _mm_mul_ps(glmm_shuff1x(r, 2), l2)));
r = glmm_load(m2[3]); r = glmm_load(m2[3]);
glmm_store(dest[3], glmm_store(dest[3],
glmm_fmadd(glmm_shuff1x(r, 0), l0, _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
glmm_fmadd(glmm_shuff1x(r, 1), l1, _mm_mul_ps(glmm_shuff1x(r, 1), l1)),
glmm_fmadd(glmm_shuff1x(r, 2), l2, _mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 2), l2),
_mm_mul_ps(glmm_shuff1x(r, 3), _mm_mul_ps(glmm_shuff1x(r, 3), l3))));
l3)))));
} }
CGLM_INLINE CGLM_INLINE
@@ -63,22 +62,21 @@ glm_mul_rot_sse2(mat4 m1, mat4 m2, mat4 dest) {
r = glmm_load(m2[0]); r = glmm_load(m2[0]);
glmm_store(dest[0], glmm_store(dest[0],
glmm_fmadd(glmm_shuff1x(r, 0), l0, _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
glmm_fmadd(glmm_shuff1x(r, 1), l1, _mm_mul_ps(glmm_shuff1x(r, 1), l1)),
_mm_mul_ps(glmm_shuff1x(r, 2), l2)))); _mm_mul_ps(glmm_shuff1x(r, 2), l2)));
r = glmm_load(m2[1]); r = glmm_load(m2[1]);
glmm_store(dest[1], glmm_store(dest[1],
glmm_fmadd(glmm_shuff1x(r, 0), l0, _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
glmm_fmadd(glmm_shuff1x(r, 1), l1, _mm_mul_ps(glmm_shuff1x(r, 1), l1)),
_mm_mul_ps(glmm_shuff1x(r, 2), l2)))); _mm_mul_ps(glmm_shuff1x(r, 2), l2)));
r = glmm_load(m2[2]); r = glmm_load(m2[2]);
glmm_store(dest[2], glmm_store(dest[2],
glmm_fmadd(glmm_shuff1x(r, 0), l0, _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
glmm_fmadd(glmm_shuff1x(r, 1), l1, _mm_mul_ps(glmm_shuff1x(r, 1), l1)),
_mm_mul_ps(glmm_shuff1x(r, 2), l2)))); _mm_mul_ps(glmm_shuff1x(r, 2), l2)));
glmm_store(dest[3], l3); glmm_store(dest[3], l3);
} }
@@ -96,9 +94,9 @@ glm_inv_tr_sse2(mat4 mat) {
_MM_TRANSPOSE4_PS(r0, r1, r2, x1); _MM_TRANSPOSE4_PS(r0, r1, r2, x1);
x0 = glmm_fmadd(r0, glmm_shuff1(r3, 0, 0, 0, 0), x0 = _mm_add_ps(_mm_mul_ps(r0, glmm_shuff1(r3, 0, 0, 0, 0)),
glmm_fmadd(r1, glmm_shuff1(r3, 1, 1, 1, 1), _mm_mul_ps(r1, glmm_shuff1(r3, 1, 1, 1, 1)));
_mm_mul_ps(r2, glmm_shuff1(r3, 2, 2, 2, 2)))); x0 = _mm_add_ps(x0, _mm_mul_ps(r2, glmm_shuff1(r3, 2, 2, 2, 2)));
x0 = _mm_xor_ps(x0, _mm_set1_ps(-0.f)); x0 = _mm_xor_ps(x0, _mm_set1_ps(-0.f));
x0 = _mm_add_ps(x0, x1); x0 = _mm_add_ps(x0, x1);

View File

@@ -26,11 +26,11 @@ glm_mat2_mul_sse2(mat2 m1, mat2 m2, mat2 dest) {
dest[1][0] = a * g + c * h; dest[1][0] = a * g + c * h;
dest[1][1] = b * g + d * h; dest[1][1] = b * g + d * h;
*/ */
x0 = glmm_fmadd(_mm_movelh_ps(x1, x1), glmm_shuff1(x2, 2, 2, 0, 0), x0 = _mm_mul_ps(_mm_movelh_ps(x1, x1), glmm_shuff1(x2, 2, 2, 0, 0));
_mm_mul_ps(_mm_movehl_ps(x1, x1), x1 = _mm_mul_ps(_mm_movehl_ps(x1, x1), glmm_shuff1(x2, 3, 3, 1, 1));
glmm_shuff1(x2, 3, 3, 1, 1))); x1 = _mm_add_ps(x0, x1);
glmm_store(dest[0], x0); glmm_store(dest[0], x1);
} }
CGLM_INLINE CGLM_INLINE

View File

@@ -30,16 +30,23 @@ glm_mat3_mul_sse2(mat3 m1, mat3 m2, mat3 dest) {
x1 = glmm_shuff2(l0, l1, 1, 0, 3, 3, 0, 3, 2, 0); x1 = glmm_shuff2(l0, l1, 1, 0, 3, 3, 0, 3, 2, 0);
x2 = glmm_shuff2(l1, l2, 0, 0, 3, 2, 0, 2, 1, 0); x2 = glmm_shuff2(l1, l2, 0, 0, 3, 2, 0, 2, 1, 0);
x0 = glmm_fmadd(glmm_shuff1(l0, 0, 2, 1, 0), glmm_shuff1(r0, 3, 0, 0, 0), x0 = _mm_add_ps(_mm_mul_ps(glmm_shuff1(l0, 0, 2, 1, 0),
glmm_fmadd(x1, glmm_shuff2(r0, r1, 0, 0, 1, 1, 2, 0, 0, 0), glmm_shuff1(r0, 3, 0, 0, 0)),
_mm_mul_ps(x2, glmm_shuff2(r0, r1, 1, 1, 2, 2, 2, 0, 0, 0)))); _mm_mul_ps(x1, glmm_shuff2(r0, r1, 0, 0, 1, 1, 2, 0, 0, 0)));
x0 = _mm_add_ps(x0,
_mm_mul_ps(x2, glmm_shuff2(r0, r1, 1, 1, 2, 2, 2, 0, 0, 0)));
_mm_storeu_ps(dest[0], x0); _mm_storeu_ps(dest[0], x0);
x0 = glmm_fmadd(glmm_shuff1(l0, 1, 0, 2, 1), _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(2, 2, 3, 3)), x0 = _mm_add_ps(_mm_mul_ps(glmm_shuff1(l0, 1, 0, 2, 1),
glmm_fmadd(glmm_shuff1(x1, 1, 0, 2, 1), glmm_shuff1(r1, 3, 3, 0, 0), _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(2, 2, 3, 3))),
_mm_mul_ps(glmm_shuff1(x2, 1, 0, 2, 1), _mm_mul_ps(glmm_shuff1(x1, 1, 0, 2, 1),
_mm_shuffle_ps(r1, r2, _MM_SHUFFLE(0, 0, 1, 1))))); glmm_shuff1(r1, 3, 3, 0, 0)));
x0 = _mm_add_ps(x0,
_mm_mul_ps(glmm_shuff1(x2, 1, 0, 2, 1),
_mm_shuffle_ps(r1, r2, _MM_SHUFFLE(0, 0, 1, 1))));
_mm_storeu_ps(&dest[1][1], x0); _mm_storeu_ps(&dest[1][1], x0);

View File

@@ -56,37 +56,46 @@ glm_mat4_mul_sse2(mat4 m1, mat4 m2, mat4 dest) {
l2 = glmm_load(m1[2]); l2 = glmm_load(m1[2]);
l3 = glmm_load(m1[3]); l3 = glmm_load(m1[3]);
#define XX(C) \ r = glmm_load(m2[0]);
\ glmm_store(dest[0],
r = glmm_load(m2[C]); \ _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
glmm_store(dest[C], \ _mm_mul_ps(glmm_shuff1x(r, 1), l1)),
glmm_fmadd(glmm_shuff1x(r, 0), l0, \ _mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 2), l2),
glmm_fmadd(glmm_shuff1x(r, 1), l1, \ _mm_mul_ps(glmm_shuff1x(r, 3), l3))));
glmm_fmadd(glmm_shuff1x(r, 2), l2, \ r = glmm_load(m2[1]);
_mm_mul_ps(glmm_shuff1x(r, 3), \ glmm_store(dest[1],
l3))))); _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
_mm_mul_ps(glmm_shuff1x(r, 1), l1)),
_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 2), l2),
_mm_mul_ps(glmm_shuff1x(r, 3), l3))));
r = glmm_load(m2[2]);
glmm_store(dest[2],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
_mm_mul_ps(glmm_shuff1x(r, 1), l1)),
_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 2), l2),
_mm_mul_ps(glmm_shuff1x(r, 3), l3))));
XX(0); r = glmm_load(m2[3]);
XX(1); glmm_store(dest[3],
XX(2); _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
XX(3); _mm_mul_ps(glmm_shuff1x(r, 1), l1)),
_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 2), l2),
#undef XX _mm_mul_ps(glmm_shuff1x(r, 3), l3))));
} }
CGLM_INLINE CGLM_INLINE
void void
glm_mat4_mulv_sse2(mat4 m, vec4 v, vec4 dest) { glm_mat4_mulv_sse2(mat4 m, vec4 v, vec4 dest) {
__m128 x0, x1; __m128 x0, x1, x2;
x0 = glmm_load(v); x0 = glmm_load(v);
x1 = glmm_fmadd(glmm_load(m[0]), glmm_shuff1x(x0, 0), x1 = _mm_add_ps(_mm_mul_ps(glmm_load(m[0]), glmm_shuff1x(x0, 0)),
glmm_fmadd(glmm_load(m[1]), glmm_shuff1x(x0, 1), _mm_mul_ps(glmm_load(m[1]), glmm_shuff1x(x0, 1)));
glmm_fmadd(glmm_load(m[2]), glmm_shuff1x(x0, 2),
_mm_mul_ps(glmm_load(m[3]),
glmm_shuff1x(x0, 3)))));
glmm_store(dest, x1); x2 = _mm_add_ps(_mm_mul_ps(glmm_load(m[2]), glmm_shuff1x(x0, 2)),
_mm_mul_ps(glmm_load(m[3]), glmm_shuff1x(x0, 3)));
glmm_store(dest, _mm_add_ps(x1, x2));
} }
CGLM_INLINE CGLM_INLINE
@@ -106,18 +115,20 @@ glm_mat4_det_sse2(mat4 mat) {
t[3] = i * p - m * l; t[3] = i * p - m * l;
t[4] = i * o - m * k; t[4] = i * o - m * k;
*/ */
x0 = glmm_fnmadd(glmm_shuff1(r3, 0, 0, 1, 1), glmm_shuff1(r2, 2, 3, 2, 3), x0 = _mm_sub_ps(_mm_mul_ps(glmm_shuff1(r2, 0, 0, 1, 1),
_mm_mul_ps(glmm_shuff1(r2, 0, 0, 1, 1), glmm_shuff1(r3, 2, 3, 2, 3)),
glmm_shuff1(r3, 2, 3, 2, 3))); _mm_mul_ps(glmm_shuff1(r3, 0, 0, 1, 1),
glmm_shuff1(r2, 2, 3, 2, 3)));
/* /*
t[0] = k * p - o * l; t[0] = k * p - o * l;
t[0] = k * p - o * l; t[0] = k * p - o * l;
t[5] = i * n - m * j; t[5] = i * n - m * j;
t[5] = i * n - m * j; t[5] = i * n - m * j;
*/ */
x1 = glmm_fnmadd(glmm_shuff1(r3, 0, 0, 2, 2), glmm_shuff1(r2, 1, 1, 3, 3), x1 = _mm_sub_ps(_mm_mul_ps(glmm_shuff1(r2, 0, 0, 2, 2),
_mm_mul_ps(glmm_shuff1(r2, 0, 0, 2, 2), glmm_shuff1(r3, 1, 1, 3, 3)),
glmm_shuff1(r3, 1, 1, 3, 3))); _mm_mul_ps(glmm_shuff1(r3, 0, 0, 2, 2),
glmm_shuff1(r2, 1, 1, 3, 3)));
/* /*
a * (f * t[0] - g * t[1] + h * t[2]) a * (f * t[0] - g * t[1] + h * t[2])
@@ -125,16 +136,21 @@ glm_mat4_det_sse2(mat4 mat) {
+ c * (e * t[1] - f * t[3] + h * t[5]) + c * (e * t[1] - f * t[3] + h * t[5])
- d * (e * t[2] - f * t[4] + g * t[5]) - d * (e * t[2] - f * t[4] + g * t[5])
*/ */
x2 = glmm_fnmadd(glmm_shuff1(r1, 1, 1, 2, 2), glmm_shuff1(x0, 3, 2, 2, 0), x2 = _mm_sub_ps(_mm_mul_ps(glmm_shuff1(r1, 0, 0, 0, 1),
_mm_mul_ps(glmm_shuff1(r1, 0, 0, 0, 1), _mm_shuffle_ps(x1, x0, _MM_SHUFFLE(1, 0, 0, 0))),
_mm_shuffle_ps(x1, x0, _MM_SHUFFLE(1, 0, 0, 0)))); _mm_mul_ps(glmm_shuff1(r1, 1, 1, 2, 2),
x2 = glmm_fmadd(glmm_shuff1(r1, 2, 3, 3, 3), glmm_shuff1(x0, 3, 2, 2, 0)));
_mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 2, 3, 1)),
x2);
x2 = _mm_add_ps(x2,
_mm_mul_ps(glmm_shuff1(r1, 2, 3, 3, 3),
_mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 2, 3, 1))));
x2 = _mm_xor_ps(x2, _mm_set_ps(-0.f, 0.f, -0.f, 0.f)); x2 = _mm_xor_ps(x2, _mm_set_ps(-0.f, 0.f, -0.f, 0.f));
return glmm_hadd(_mm_mul_ps(x2, r0)); x0 = _mm_mul_ps(r0, x2);
x0 = _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 2, 3));
x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 3, 3, 1));
return _mm_cvtss_f32(x0);
} }
CGLM_INLINE CGLM_INLINE
@@ -143,10 +159,7 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
__m128 r0, r1, r2, r3, __m128 r0, r1, r2, r3,
v0, v1, v2, v3, v0, v1, v2, v3,
t0, t1, t2, t3, t4, t5, t0, t1, t2, t3, t4, t5,
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9; x0, x1, x2, x3, x4, x5, x6, x7;
x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f);
x9 = glmm_shuff1(x8, 2, 1, 2, 1);
/* 127 <- 0 */ /* 127 <- 0 */
r0 = glmm_load(mat[0]); /* d c b a */ r0 = glmm_load(mat[0]); /* d c b a */
@@ -164,7 +177,7 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
t1[0] = k * p - o * l; t1[0] = k * p - o * l;
t2[0] = g * p - o * h; t2[0] = g * p - o * h;
t3[0] = g * l - k * h; */ t3[0] = g * l - k * h; */
t0 = glmm_fnmadd(x2, x0, _mm_mul_ps(x3, x1)); t0 = _mm_sub_ps(_mm_mul_ps(x3, x1), _mm_mul_ps(x2, x0));
x4 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(2, 1, 2, 1)); /* o n k j */ x4 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(2, 1, 2, 1)); /* o n k j */
x4 = glmm_shuff1(x4, 0, 2, 2, 2); /* j n n n */ x4 = glmm_shuff1(x4, 0, 2, 2, 2); /* j n n n */
@@ -174,13 +187,13 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
t1[1] = j * p - n * l; t1[1] = j * p - n * l;
t2[1] = f * p - n * h; t2[1] = f * p - n * h;
t3[1] = f * l - j * h; */ t3[1] = f * l - j * h; */
t1 = glmm_fnmadd(x4, x0, _mm_mul_ps(x5, x1)); t1 = _mm_sub_ps(_mm_mul_ps(x5, x1), _mm_mul_ps(x4, x0));
/* t1[2] = j * o - n * k /* t1[2] = j * o - n * k
t1[2] = j * o - n * k; t1[2] = j * o - n * k;
t2[2] = f * o - n * g; t2[2] = f * o - n * g;
t3[2] = f * k - j * g; */ t3[2] = f * k - j * g; */
t2 = glmm_fnmadd(x4, x3, _mm_mul_ps(x5, x2)); t2 = _mm_sub_ps(_mm_mul_ps(x5, x2), _mm_mul_ps(x4, x3));
x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */ x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */
x7 = glmm_shuff2(r3, r2, 0, 0, 0, 0, 2, 0, 0, 0); /* i m m m */ x7 = glmm_shuff2(r3, r2, 0, 0, 0, 0, 2, 0, 0, 0); /* i m m m */
@@ -189,19 +202,19 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
t1[3] = i * p - m * l; t1[3] = i * p - m * l;
t2[3] = e * p - m * h; t2[3] = e * p - m * h;
t3[3] = e * l - i * h; */ t3[3] = e * l - i * h; */
t3 = glmm_fnmadd(x7, x0, _mm_mul_ps(x6, x1)); t3 = _mm_sub_ps(_mm_mul_ps(x6, x1), _mm_mul_ps(x7, x0));
/* t1[4] = i * o - m * k; /* t1[4] = i * o - m * k;
t1[4] = i * o - m * k; t1[4] = i * o - m * k;
t2[4] = e * o - m * g; t2[4] = e * o - m * g;
t3[4] = e * k - i * g; */ t3[4] = e * k - i * g; */
t4 = glmm_fnmadd(x7, x3, _mm_mul_ps(x6, x2)); t4 = _mm_sub_ps(_mm_mul_ps(x6, x2), _mm_mul_ps(x7, x3));
/* t1[5] = i * n - m * j; /* t1[5] = i * n - m * j;
t1[5] = i * n - m * j; t1[5] = i * n - m * j;
t2[5] = e * n - m * f; t2[5] = e * n - m * f;
t3[5] = e * j - i * f; */ t3[5] = e * j - i * f; */
t5 = glmm_fnmadd(x7, x5, _mm_mul_ps(x6, x4)); t5 = _mm_sub_ps(_mm_mul_ps(x6, x4), _mm_mul_ps(x7, x5));
x0 = glmm_shuff2(r1, r0, 0, 0, 0, 0, 2, 2, 2, 0); /* a a a e */ x0 = glmm_shuff2(r1, r0, 0, 0, 0, 0, 2, 2, 2, 0); /* a a a e */
x1 = glmm_shuff2(r1, r0, 1, 1, 1, 1, 2, 2, 2, 0); /* b b b f */ x1 = glmm_shuff2(r1, r0, 1, 1, 1, 1, 2, 2, 2, 0); /* b b b f */
@@ -213,35 +226,50 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]); dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]);
dest[0][2] = b * t2[0] - c * t2[1] + d * t2[2]; dest[0][2] = b * t2[0] - c * t2[1] + d * t2[2];
dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */ dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */
v0 = _mm_xor_ps(glmm_fmadd(x3, t2, glmm_fnmadd(x2, t1, _mm_mul_ps(x1, t0))), x8); v0 = _mm_add_ps(_mm_mul_ps(x3, t2),
_mm_sub_ps(_mm_mul_ps(x1, t0),
/* _mm_mul_ps(x2, t1)));
dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5]; v0 = _mm_xor_ps(v0, _mm_set_ps(-0.f, 0.f, -0.f, 0.f));
dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]);
dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5];
dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/
v2 = _mm_xor_ps(glmm_fmadd(x3, t5, glmm_fnmadd(x1, t3, _mm_mul_ps(x0, t1))), x8);
/* /*
dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]); dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]);
dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4]; dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4];
dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]); dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]);
dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */ dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */
v1 = _mm_xor_ps(glmm_fmadd(x3, t4, glmm_fnmadd(x2, t3, _mm_mul_ps(x0, t0))), x9); v1 = _mm_add_ps(_mm_mul_ps(x3, t4),
_mm_sub_ps(_mm_mul_ps(x0, t0),
_mm_mul_ps(x2, t3)));
v1 = _mm_xor_ps(v1, _mm_set_ps(0.f, -0.f, 0.f, -0.f));
/*
dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5];
dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]);
dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5];
dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/
v2 = _mm_add_ps(_mm_mul_ps(x3, t5),
_mm_sub_ps(_mm_mul_ps(x0, t1),
_mm_mul_ps(x1, t3)));
v2 = _mm_xor_ps(v2, _mm_set_ps(-0.f, 0.f, -0.f, 0.f));
/* /*
dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]); dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]);
dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5]; dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5];
dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]); dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]);
dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */ dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */
v3 = _mm_xor_ps(glmm_fmadd(x2, t5, glmm_fnmadd(x1, t4, _mm_mul_ps(x0, t2))), x9); v3 = _mm_add_ps(_mm_mul_ps(x2, t5),
_mm_sub_ps(_mm_mul_ps(x0, t2),
_mm_mul_ps(x1, t4)));
v3 = _mm_xor_ps(v3, _mm_set_ps(0.f, -0.f, 0.f, -0.f));
/* determinant */ /* determinant */
x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0)); x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0));
x1 = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 0, 0)); x1 = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 0, 0));
x0 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0)); x0 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0));
x0 = _mm_rcp_ps(glmm_vhadd(_mm_mul_ps(x0, r0))); x0 = _mm_mul_ps(x0, r0);
x0 = _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 2, 3));
x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 0, 1));
x0 = _mm_rcp_ps(x0);
glmm_store(dest[0], _mm_mul_ps(v0, x0)); glmm_store(dest[0], _mm_mul_ps(v0, x0));
glmm_store(dest[1], _mm_mul_ps(v1, x0)); glmm_store(dest[1], _mm_mul_ps(v1, x0));
@@ -255,10 +283,7 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
__m128 r0, r1, r2, r3, __m128 r0, r1, r2, r3,
v0, v1, v2, v3, v0, v1, v2, v3,
t0, t1, t2, t3, t4, t5, t0, t1, t2, t3, t4, t5,
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9; x0, x1, x2, x3, x4, x5, x6, x7;
x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f);
x9 = glmm_shuff1(x8, 2, 1, 2, 1);
/* 127 <- 0 */ /* 127 <- 0 */
r0 = glmm_load(mat[0]); /* d c b a */ r0 = glmm_load(mat[0]); /* d c b a */
@@ -276,7 +301,7 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
t1[0] = k * p - o * l; t1[0] = k * p - o * l;
t2[0] = g * p - o * h; t2[0] = g * p - o * h;
t3[0] = g * l - k * h; */ t3[0] = g * l - k * h; */
t0 = glmm_fnmadd(x2, x0, _mm_mul_ps(x3, x1)); t0 = _mm_sub_ps(_mm_mul_ps(x3, x1), _mm_mul_ps(x2, x0));
x4 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(2, 1, 2, 1)); /* o n k j */ x4 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(2, 1, 2, 1)); /* o n k j */
x4 = glmm_shuff1(x4, 0, 2, 2, 2); /* j n n n */ x4 = glmm_shuff1(x4, 0, 2, 2, 2); /* j n n n */
@@ -286,13 +311,13 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
t1[1] = j * p - n * l; t1[1] = j * p - n * l;
t2[1] = f * p - n * h; t2[1] = f * p - n * h;
t3[1] = f * l - j * h; */ t3[1] = f * l - j * h; */
t1 = glmm_fnmadd(x4, x0, _mm_mul_ps(x5, x1)); t1 = _mm_sub_ps(_mm_mul_ps(x5, x1), _mm_mul_ps(x4, x0));
/* t1[2] = j * o - n * k /* t1[2] = j * o - n * k
t1[2] = j * o - n * k; t1[2] = j * o - n * k;
t2[2] = f * o - n * g; t2[2] = f * o - n * g;
t3[2] = f * k - j * g; */ t3[2] = f * k - j * g; */
t2 = glmm_fnmadd(x4, x3, _mm_mul_ps(x5, x2)); t2 = _mm_sub_ps(_mm_mul_ps(x5, x2), _mm_mul_ps(x4, x3));
x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */ x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */
x7 = glmm_shuff2(r3, r2, 0, 0, 0, 0, 2, 0, 0, 0); /* i m m m */ x7 = glmm_shuff2(r3, r2, 0, 0, 0, 0, 2, 0, 0, 0); /* i m m m */
@@ -301,19 +326,19 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
t1[3] = i * p - m * l; t1[3] = i * p - m * l;
t2[3] = e * p - m * h; t2[3] = e * p - m * h;
t3[3] = e * l - i * h; */ t3[3] = e * l - i * h; */
t3 = glmm_fnmadd(x7, x0, _mm_mul_ps(x6, x1)); t3 = _mm_sub_ps(_mm_mul_ps(x6, x1), _mm_mul_ps(x7, x0));
/* t1[4] = i * o - m * k; /* t1[4] = i * o - m * k;
t1[4] = i * o - m * k; t1[4] = i * o - m * k;
t2[4] = e * o - m * g; t2[4] = e * o - m * g;
t3[4] = e * k - i * g; */ t3[4] = e * k - i * g; */
t4 = glmm_fnmadd(x7, x3, _mm_mul_ps(x6, x2)); t4 = _mm_sub_ps(_mm_mul_ps(x6, x2), _mm_mul_ps(x7, x3));
/* t1[5] = i * n - m * j; /* t1[5] = i * n - m * j;
t1[5] = i * n - m * j; t1[5] = i * n - m * j;
t2[5] = e * n - m * f; t2[5] = e * n - m * f;
t3[5] = e * j - i * f; */ t3[5] = e * j - i * f; */
t5 = glmm_fnmadd(x7, x5, _mm_mul_ps(x6, x4)); t5 = _mm_sub_ps(_mm_mul_ps(x6, x4), _mm_mul_ps(x7, x5));
x0 = glmm_shuff2(r1, r0, 0, 0, 0, 0, 2, 2, 2, 0); /* a a a e */ x0 = glmm_shuff2(r1, r0, 0, 0, 0, 0, 2, 2, 2, 0); /* a a a e */
x1 = glmm_shuff2(r1, r0, 1, 1, 1, 1, 2, 2, 2, 0); /* b b b f */ x1 = glmm_shuff2(r1, r0, 1, 1, 1, 1, 2, 2, 2, 0); /* b b b f */
@@ -325,35 +350,50 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]); dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]);
dest[0][2] = b * t2[0] - c * t2[1] + d * t2[2]; dest[0][2] = b * t2[0] - c * t2[1] + d * t2[2];
dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */ dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */
v0 = _mm_xor_ps(glmm_fmadd(x3, t2, glmm_fnmadd(x2, t1, _mm_mul_ps(x1, t0))), x8); v0 = _mm_add_ps(_mm_mul_ps(x3, t2),
_mm_sub_ps(_mm_mul_ps(x1, t0),
/* _mm_mul_ps(x2, t1)));
dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5]; v0 = _mm_xor_ps(v0, _mm_set_ps(-0.f, 0.f, -0.f, 0.f));
dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]);
dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5];
dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/
v2 = _mm_xor_ps(glmm_fmadd(x3, t5, glmm_fnmadd(x1, t3, _mm_mul_ps(x0, t1))), x8);
/* /*
dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]); dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]);
dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4]; dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4];
dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]); dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]);
dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */ dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */
v1 = _mm_xor_ps(glmm_fmadd(x3, t4, glmm_fnmadd(x2, t3, _mm_mul_ps(x0, t0))), x9); v1 = _mm_add_ps(_mm_mul_ps(x3, t4),
_mm_sub_ps(_mm_mul_ps(x0, t0),
_mm_mul_ps(x2, t3)));
v1 = _mm_xor_ps(v1, _mm_set_ps(0.f, -0.f, 0.f, -0.f));
/*
dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5];
dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]);
dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5];
dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/
v2 = _mm_add_ps(_mm_mul_ps(x3, t5),
_mm_sub_ps(_mm_mul_ps(x0, t1),
_mm_mul_ps(x1, t3)));
v2 = _mm_xor_ps(v2, _mm_set_ps(-0.f, 0.f, -0.f, 0.f));
/* /*
dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]); dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]);
dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5]; dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5];
dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]); dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]);
dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */ dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */
v3 = _mm_xor_ps(glmm_fmadd(x2, t5, glmm_fnmadd(x1, t4, _mm_mul_ps(x0, t2))), x9); v3 = _mm_add_ps(_mm_mul_ps(x2, t5),
_mm_sub_ps(_mm_mul_ps(x0, t2),
_mm_mul_ps(x1, t4)));
v3 = _mm_xor_ps(v3, _mm_set_ps(0.f, -0.f, 0.f, -0.f));
/* determinant */ /* determinant */
x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0)); x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0));
x1 = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 0, 0)); x1 = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 0, 0));
x0 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0)); x0 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0));
x0 = _mm_div_ps(_mm_set1_ps(1.0f), glmm_vhadd(_mm_mul_ps(x0, r0))); x0 = _mm_mul_ps(x0, r0);
x0 = _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 2, 3));
x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 0, 1));
x0 = _mm_div_ps(_mm_set1_ps(1.0f), x0);
glmm_store(dest[0], _mm_mul_ps(v0, x0)); glmm_store(dest[0], _mm_mul_ps(v0, x0));
glmm_store(dest[1], _mm_mul_ps(v1, x0)); glmm_store(dest[1], _mm_mul_ps(v1, x0));

View File

@@ -48,15 +48,6 @@ glmm_abs(__m128 x) {
return _mm_andnot_ps(_mm_set1_ps(-0.0f), x); return _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
} }
static inline
__m128
glmm_vhadd(__m128 v) {
__m128 x0;
x0 = _mm_add_ps(v, glmm_shuff1(v, 0, 1, 2, 3));
x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 0, 1));
return x0;
}
static inline static inline
__m128 __m128
glmm_vhadds(__m128 v) { glmm_vhadds(__m128 v) {
@@ -197,93 +188,5 @@ glmm_store3(float v[3], __m128 vx) {
_mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2)); _mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2));
} }
/* Enable the FMA code paths on MSVC: it defines __AVX2__ under /arch:AVX2 but
   does not predefine __FMA__. The check is limited to MSVC because other
   compilers (GCC, Clang) define __FMA__ themselves when FMA code generation
   is enabled; forcing it there would select FMA intrinsics in builds that
   target AVX2 without FMA. */
#if defined(_MSC_VER) && !defined(__FMA__) && defined(__AVX2__)
#  define __FMA__ 1
#endif
/*!
 * @brief per-lane multiply-add: a * b + c
 *
 * Uses _mm_fmadd_ps when __FMA__ is defined; otherwise falls back to a
 * separate multiply followed by an add.
 *
 * @param[in] a first factor
 * @param[in] b second factor
 * @param[in] c addend
 * @return a * b + c
 */
static inline
__m128
glmm_fmadd(__m128 a, __m128 b, __m128 c) {
#ifndef __FMA__
  __m128 prod = _mm_mul_ps(a, b);
  return _mm_add_ps(c, prod);
#else
  return _mm_fmadd_ps(a, b, c);
#endif
}
/*
 * Negated multiply-add on four packed floats: c - a * b per lane.
 * Uses the fused intrinsic when __FMA__ is defined, otherwise a separate
 * multiply followed by a subtract.
 */
static inline
__m128
glmm_fnmadd(__m128 a, __m128 b, __m128 c) {
#ifndef __FMA__
  __m128 prod = _mm_mul_ps(a, b);
  return _mm_sub_ps(c, prod);
#else
  return _mm_fnmadd_ps(a, b, c);
#endif
}
/*
 * Multiply-subtract on four packed floats: a * b - c per lane.
 * Uses the fused intrinsic when __FMA__ is defined, otherwise a separate
 * multiply followed by a subtract.
 */
static inline
__m128
glmm_fmsub(__m128 a, __m128 b, __m128 c) {
#ifndef __FMA__
  __m128 prod = _mm_mul_ps(a, b);
  return _mm_sub_ps(prod, c);
#else
  return _mm_fmsub_ps(a, b, c);
#endif
}
/*
 * Negated multiply-subtract on four packed floats: -(a * b + c) per lane.
 * Uses the fused intrinsic when __FMA__ is defined; otherwise the sum is
 * computed and its sign bit flipped by XOR-ing with -0.0f.
 */
static inline
__m128
glmm_fnmsub(__m128 a, __m128 b, __m128 c) {
#ifndef __FMA__
  __m128 sum      = _mm_add_ps(_mm_mul_ps(a, b), c);
  __m128 signmask = _mm_set1_ps(-0.0f);
  return _mm_xor_ps(sum, signmask);
#else
  return _mm_fnmsub_ps(a, b, c);
#endif
}
#if defined(__AVX__)
static inline
__m256
glmm256_fmadd(__m256 a, __m256 b, __m256 c) {
#ifdef __FMA__
return _mm256_fmadd_ps(a, b, c);
#else
return _mm256_add_ps(c, _mm256_mul_ps(a, b));
#endif
}
static inline
__m256
glmm256_fnmadd(__m256 a, __m256 b, __m256 c) {
#ifdef __FMA__
return _mm256_fnmadd_ps(a, b, c);
#else
return _mm256_sub_ps(c, _mm256_mul_ps(a, b));
#endif
}
static inline
__m256
glmm256_fmsub(__m256 a, __m256 b, __m256 c) {
#ifdef __FMA__
return _mm256_fmsub_ps(a, b, c);
#else
return _mm256_sub_ps(_mm256_mul_ps(a, b), c);
#endif
}
static inline
__m256
glmm256_fnmsub(__m256 a, __m256 b, __m256 c) {
#ifdef __FMA__
return _mm256_fmsub_ps(a, b, c);
#else
return _mm256_xor_ps(_mm256_sub_ps(_mm256_mul_ps(a, b), c),
_mm256_set1_ps(-0.0f));
#endif
}
#endif
#endif #endif
#endif /* cglm_simd_x86_h */ #endif /* cglm_simd_x86_h */

View File

@@ -237,9 +237,9 @@ glm_vec3_abs(vec3 v, vec3 dest) {
CGLM_INLINE CGLM_INLINE
void void
glm_vec3_fract(vec3 v, vec3 dest) { glm_vec3_fract(vec3 v, vec3 dest) {
dest[0] = fminf(v[0] - floorf(v[0]), 0.999999940395355224609375f); dest[0] = fminf(v[0] - floorf(v[0]), 0x1.fffffep-1f);
dest[1] = fminf(v[1] - floorf(v[1]), 0.999999940395355224609375f); dest[1] = fminf(v[1] - floorf(v[1]), 0x1.fffffep-1f);
dest[2] = fminf(v[2] - floorf(v[2]), 0.999999940395355224609375f); dest[2] = fminf(v[2] - floorf(v[2]), 0x1.fffffep-1f);
} }
/*! /*!

View File

@@ -268,10 +268,10 @@ glm_vec4_abs(vec4 v, vec4 dest) {
CGLM_INLINE CGLM_INLINE
void void
glm_vec4_fract(vec4 v, vec4 dest) { glm_vec4_fract(vec4 v, vec4 dest) {
dest[0] = fminf(v[0] - floorf(v[0]), 0.999999940395355224609375f); dest[0] = fminf(v[0] - floorf(v[0]), 0x1.fffffep-1f);
dest[1] = fminf(v[1] - floorf(v[1]), 0.999999940395355224609375f); dest[1] = fminf(v[1] - floorf(v[1]), 0x1.fffffep-1f);
dest[2] = fminf(v[2] - floorf(v[2]), 0.999999940395355224609375f); dest[2] = fminf(v[2] - floorf(v[2]), 0x1.fffffep-1f);
dest[3] = fminf(v[3] - floorf(v[3]), 0.999999940395355224609375f); dest[3] = fminf(v[3] - floorf(v[3]), 0x1.fffffep-1f);
} }
/*! /*!

View File

@@ -568,8 +568,14 @@ glm_vec4_subadd(vec4 a, vec4 b, vec4 dest) {
CGLM_INLINE CGLM_INLINE
void void
glm_vec4_muladd(vec4 a, vec4 b, vec4 dest) { glm_vec4_muladd(vec4 a, vec4 b, vec4 dest) {
#if defined(CGLM_SIMD) #if defined( __SSE__ ) || defined( __SSE2__ )
glmm_store(dest, glmm_fmadd(glmm_load(a), glmm_load(b), glmm_load(dest))); glmm_store(dest, _mm_add_ps(glmm_load(dest),
_mm_mul_ps(glmm_load(a),
glmm_load(b))));
#elif defined(CGLM_NEON_FP)
vst1q_f32(dest, vaddq_f32(vld1q_f32(dest),
vmulq_f32(vld1q_f32(a),
vld1q_f32(b))));
#else #else
dest[0] += a[0] * b[0]; dest[0] += a[0] * b[0];
dest[1] += a[1] * b[1]; dest[1] += a[1] * b[1];
@@ -591,9 +597,13 @@ CGLM_INLINE
void void
glm_vec4_muladds(vec4 a, float s, vec4 dest) { glm_vec4_muladds(vec4 a, float s, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
glmm_store(dest, glmm_fmadd(glmm_load(a), _mm_set1_ps(s), glmm_load(dest))); glmm_store(dest, _mm_add_ps(glmm_load(dest),
_mm_mul_ps(glmm_load(a),
_mm_set1_ps(s))));
#elif defined(CGLM_NEON_FP) #elif defined(CGLM_NEON_FP)
glmm_store(dest, glmm_fmadd(glmm_load(a), vdupq_n_f32(s), glmm_load(dest))); vst1q_f32(dest, vaddq_f32(vld1q_f32(dest),
vmulq_f32(vld1q_f32(a),
vdupq_n_f32(s))));
#else #else
dest[0] += a[0] * s; dest[0] += a[0] * s;
dest[1] += a[1] * s; dest[1] += a[1] * s;

View File

@@ -10,6 +10,6 @@
#define CGLM_VERSION_MAJOR 0 #define CGLM_VERSION_MAJOR 0
#define CGLM_VERSION_MINOR 8 #define CGLM_VERSION_MINOR 8
#define CGLM_VERSION_PATCH 1 #define CGLM_VERSION_PATCH 0
#endif /* cglm_version_h */ #endif /* cglm_version_h */

View File

@@ -1,5 +1,5 @@
project('cglm', 'c', project('cglm', 'c',
version : '0.8.1', version : '0.8.0',
license : 'mit', license : 'mit',
default_options : [ default_options : [
'c_std=c11', 'c_std=c11',

View File

@@ -35,9 +35,9 @@ TEST_IMPL(camera_decomp) {
farVal = 100.0f; farVal = 100.0f;
glm_perspective(fovy, aspect, nearVal, farVal, proj); glm_perspective(fovy, aspect, nearVal, farVal, proj);
ASSERT(fabsf(aspect - glm_persp_aspect(proj)) < GLM_FLT_EPSILON) ASSERT(fabsf(aspect - glm_persp_aspect(proj)) < 1e-5f)
ASSERT(fabsf(fovy - glm_persp_fovy(proj)) < GLM_FLT_EPSILON) ASSERT(fabsf(fovy - glm_persp_fovy(proj)) < 1e-5f)
ASSERT(fabsf(49.984f - glm_deg(glm_persp_fovy(proj))) < GLM_FLT_EPSILON) ASSERT(fabsf(49.984f - glm_deg(glm_persp_fovy(proj))) < 1e-5f)
glm_persp_sizes(proj, fovy, sizes); glm_persp_sizes(proj, fovy, sizes);

View File

@@ -106,7 +106,7 @@ test_rand_quat(versor q);
CGLM_INLINE CGLM_INLINE
bool bool
test_eq(float a, float b) { test_eq(float a, float b) {
return fabsf(a - b) <= GLM_FLT_EPSILON * 10; return fabsf(a - b) <= 1e-5f;
} }
CGLM_INLINE CGLM_INLINE