Compare commits

...

49 Commits

Author SHA1 Message Date
Recep Aslantas
155eb109a8 arm, neon: neon/fma support for glm_mul_rot() 2021-04-25 03:49:35 +03:00
Recep Aslantas
2903813765 arm, neon: neon/fma support for glm_mul() 2021-04-25 03:45:00 +03:00
Recep Aslantas
0ab50f7208 arm, neon: update mat4_mul to use FMA 2021-04-25 03:41:39 +03:00
Recep Aslantas
701e015bfd avoid loading vec3 by glmm_load()
* use glmm_set1() for each for now
2021-04-25 02:36:06 +03:00
Recep Aslantas
1fb941a41b drop swizzling helpers fro now for simplicity 2021-04-25 02:35:55 +03:00
Recep Aslantas
92151c6328 arm, neon: use div instead of mul by 1 / det for mat4_inv 2021-04-24 18:02:47 +03:00
Recep Aslantas
afac887850 arm, neon: implement mat4 inv with neon 2021-04-24 17:54:01 +03:00
Recep Aslantas
a111693b6b arm, neon: implement mat4 determinant with neon 2021-04-24 15:45:36 +03:00
Recep Aslantas
ce9e5f5575 arm: update glmm swizzling func names 2021-04-24 01:38:04 +03:00
Recep Aslantas
d13842e7de arm: optimize vec4 div with NEON 2021-04-24 00:51:09 +03:00
Recep Aslantas
059e5010e6 arm: define CGLM_ARM64 for identify arm64 2021-04-24 00:44:07 +03:00
Recep Aslantas
65292a94a6 swizzling functions for NEON 2021-04-24 00:00:00 +03:00
Recep Aslantas
f303984aad use unified glmm api for vec4 2021-04-23 23:34:36 +03:00
Recep Aslantas
7d5c4da7cf optimize translate functions 2021-04-23 23:32:48 +03:00
Recep Aslantas
63988a515c glmm: new defines for splat 2021-04-23 23:32:21 +03:00
Recep Aslantas
9725b60d46 rename glmm_shuff1x() to glmm_splat()
* mark glmm_shuff1x() as DEPRECATED
2021-04-23 22:12:57 +03:00
Recep Aslantas
50c93f3d30 Merge pull request #188 from wdouglass/cmake-pkgconfig
configure and install cglm.pc with cmake
2021-04-23 17:12:04 +03:00
Recep Aslantas
f14ca0c3f2 now working on v0.8.2 2021-04-23 17:03:39 +03:00
Woodrow Douglass
0c165cba76 configure and install cglm.pc with cmake 2021-04-23 09:28:55 -04:00
Recep Aslantas
2de1133012 fix enabling FMA macro for MSVC 2021-04-19 06:04:22 +03:00
Recep Aslantas
12dc054e49 Merge pull request #186 from recp/fma
Optimizations with FMA
2021-04-19 04:32:01 +03:00
Recep Aslantas
ebba4eea8e win, msvc: enable FMA macro for MSVC 2021-04-19 04:14:14 +03:00
Recep Aslantas
aa2fa89e6c arm: fma msub and nmsub 2021-04-19 00:35:19 +03:00
Recep Aslantas
7b0eee497e arm: fix fmadd parameter order 2021-04-19 00:28:07 +03:00
Recep Aslantas
04008d9c3f arm: fix fma for glm_vec4_muladds 2021-04-19 00:21:04 +03:00
Recep Aslantas
11b1588105 glmm: missing FMA funcs for SSE and AVX 2021-04-19 00:20:47 +03:00
Recep Aslantas
7c81482248 avx: implement scale matrix using AVX 2021-04-19 00:11:43 +03:00
Recep Aslantas
f3f29bd383 vec4: optimize muladd and muladds with fma 2021-04-18 16:24:29 +03:00
Recep Aslantas
0d0d22f96c opitimize affine matrix operations with fma 2021-04-18 13:51:22 +03:00
Recep Aslantas
7df5aa2e26 opitimize mat2 operations with fma 2021-04-18 13:51:09 +03:00
Recep Aslantas
7cc4c37afb opitimize mat3 operations with fma 2021-04-18 13:51:03 +03:00
Recep Aslantas
abe29a788a opitimize mat4 operations with fma 2021-04-18 13:50:51 +03:00
Recep Aslantas
c5655bbd2e glmm: define fma functions 2021-04-18 13:49:50 +03:00
Recep Aslantas
47e0045015 glmm, x86: define hadd function 2021-04-18 13:49:36 +03:00
Recep Aslantas
8f09cc8583 Merge pull request #183 from legends2k/master
Add CMake interface library target
2021-03-26 20:36:56 +03:00
Sundaram Ramaswamy
d6a0ac320b Update docs on CMake header-only usage 2021-03-26 12:32:36 +05:30
Sundaram Ramaswamy
616d38c13a Remove redundant header listing 2021-03-26 12:23:56 +05:30
Sundaram Ramaswamy
9e12908556 Add CMake interface library target
Projects using cglm as a header-only library needn’t build files under
src/. Provide a target which allows them to skip compiling them by

add_subdirectory(external/cglm EXCLUDE_FROM_ALL)
target_link_libraries(MyExe PRIVATE cglm_headers)
2021-03-25 17:47:43 +05:30
Recep Aslantas
405cda6ee9 now working on v0.8.1 2021-03-02 23:36:33 +03:00
Recep Aslantas
1b0322e51c Merge pull request #180 from recp/aabb-sphere-fix
fix aabb-sphere test (#179)
2021-03-01 10:48:04 +03:00
Recep Aslantas
4a308dcd9e fix aabb-sphere test (#179) 2021-02-28 23:12:06 +03:00
Recep Aslantas
fec396950b Merge pull request #178 from gaurapanasenko/master
Fixed cmake config install path
2021-02-22 10:31:06 +03:00
Yehor Panasenko
981af0565e Fixed cmake config install path
Now you can use library with
```cmake
find_package(cglm REQUIRED)
target_link_libraries(${PROJECT_NAME} PRIVATE ${CGLM_LIBRARY})
```
2021-02-22 04:46:22 +02:00
Recep Aslantas
1d215ef5f3 Merge pull request #177 from Winter091/master
remove wrong c standard bug
2021-01-25 20:05:58 +03:00
winter091
2b7cfde64f remove wrong c standard bug 2021-01-25 15:50:18 +03:00
Recep Aslantas
c783c42101 Merge pull request #174 from timgates42/bugfix_typo_assume
docs: fix simple typo, asume -> assume
2020-12-12 09:18:09 +03:00
Tim Gates
bddcfedead docs: fix simple typo, asume -> assume
There is a small typo in include/cglm/affine.h.

Should read `assume` rather than `asume`.
2020-12-12 12:13:48 +11:00
Recep Aslantas
34e7438271 Merge pull request #172 from SanderMertens/master
Replace hex floating point literals
2020-12-04 13:00:07 +03:00
Sander Mertens
485ff6bc46 Replace hex floating point literals 2020-12-01 21:49:35 -08:00
26 changed files with 783 additions and 300 deletions

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.8.2) cmake_minimum_required(VERSION 3.8.2)
project(cglm VERSION 0.8.0 LANGUAGES C) project(cglm VERSION 0.8.2 LANGUAGES C)
set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED YES) set(CMAKE_C_STANDARD_REQUIRED YES)
@@ -18,7 +18,7 @@ else(CGLM_STATIC)
endif() endif()
if(CGLM_USE_C99) if(CGLM_USE_C99)
set(C_STANDARD 99) set(CMAKE_C_STANDARD 99)
endif() endif()
if(MSVC) if(MSVC)
@@ -93,6 +93,11 @@ target_include_directories(${PROJECT_NAME}
${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/src
) )
# Target for header-only usage
add_library(${PROJECT_NAME}_headers INTERFACE)
target_include_directories(${PROJECT_NAME}_headers INTERFACE
${CMAKE_CURRENT_SOURCE_DIR}/include)
# Test Configuration # Test Configuration
if(CGLM_USE_TEST) if(CGLM_USE_TEST)
include(CTest) include(CTest)
@@ -117,6 +122,16 @@ export(TARGETS ${PROJECT_NAME}
) )
install(EXPORT ${PROJECT_NAME} install(EXPORT ${PROJECT_NAME}
FILE "${PROJECT_NAME}Config.cmake"
NAMESPACE ${PROJECT_NAME}:: NAMESPACE ${PROJECT_NAME}::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}/cmake) DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME})
set(PACKAGE_NAME ${PROJECT_NAME})
set(prefix ${CMAKE_INSTALL_PREFIX})
set(exec_prefix ${CMAKE_INSTALL_PREFIX})
set(includedir "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
set(libdir "\${prefix}/${CMAKE_INSTALL_LIBDIR}")
configure_file(${CMAKE_CURRENT_LIST_DIR}/cglm.pc.in ${CMAKE_BINARY_DIR}/cglm.pc @ONLY)
install(FILES ${CMAKE_BINARY_DIR}/cglm.pc
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)

View File

@@ -74,3 +74,7 @@ Link to paper: http://webserver2.tecgraf.puc-rio.br/~mgattass/cg/trbRR/Fast%20Mi
14. ARM NEON: Matrix Vector Multiplication 14. ARM NEON: Matrix Vector Multiplication
https://stackoverflow.com/a/57793352/2676533 https://stackoverflow.com/a/57793352/2676533
16. ARM NEON Div
http://github.com/microsoft/DirectXMath

View File

@@ -168,6 +168,24 @@ option(CGLM_USE_C99 "" OFF) # C11
option(CGLM_USE_TEST "Enable Tests" OFF) # for make check - make test option(CGLM_USE_TEST "Enable Tests" OFF) # for make check - make test
``` ```
#### Use as header-only library with your CMake project
This requires no building or installation of cglm.
* Example:
``` cmake
cmake_minimum_required(VERSION 3.8.2)
project(<Your Project Name>)
add_executable(${PROJECT_NAME} src/main.c)
target_link_libraries(${LIBRARY_NAME} PRIVATE
cglm_headers)
add_subdirectory(external/cglm/ EXCLUDE_FROM_ALL)
```
#### Use with your CMake project #### Use with your CMake project
* Example: * Example:
```cmake ```cmake

View File

@@ -2,7 +2,7 @@ Pod::Spec.new do |s|
# Description # Description
s.name = "cglm" s.name = "cglm"
s.version = "0.7.9" s.version = "0.8.1"
s.summary = "📽 Highly Optimized Graphics Math (glm) for C" s.summary = "📽 Highly Optimized Graphics Math (glm) for C"
s.description = <<-DESC s.description = <<-DESC
cglm is math library for graphics programming for C. See the documentation or README for all features. cglm is math library for graphics programming for C. See the documentation or README for all features.

View File

@@ -7,7 +7,7 @@
#***************************************************************************** #*****************************************************************************
AC_PREREQ([2.69]) AC_PREREQ([2.69])
AC_INIT([cglm], [0.8.0], [info@recp.me]) AC_INIT([cglm], [0.8.2], [info@recp.me])
AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects serial-tests]) AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects serial-tests])
# Don't use the default cflags (-O2 -g), we set ours manually in Makefile.am. # Don't use the default cflags (-O2 -g), we set ours manually in Makefile.am.

View File

@@ -32,6 +32,22 @@ If you don't want to install **cglm** to your system's folder you can get static
option(CGLM_USE_C99 "" OFF) # C11 option(CGLM_USE_C99 "" OFF) # C11
option(CGLM_USE_TEST "Enable Tests" OFF) # for make check - make test option(CGLM_USE_TEST "Enable Tests" OFF) # for make check - make test
**Use as header-only library with your CMake project example**
This requires no building or installation of cglm.
.. code-block:: CMake
:linenos:
cmake_minimum_required(VERSION 3.8.2)
project(<Your Project Name>)
add_executable(${PROJECT_NAME} src/main.c)
target_link_libraries(${LIBRARY_NAME} PRIVATE
cglm_headers)
add_subdirectory(external/cglm/ EXCLUDE_FROM_ALL)
**Use with your CMake project example** **Use with your CMake project example**
.. code-block:: CMake .. code-block:: CMake

View File

@@ -62,9 +62,9 @@ author = u'Recep Aslantas'
# built documents. # built documents.
# #
# The short X.Y version. # The short X.Y version.
version = u'0.8.0' version = u'0.8.2'
# The full version, including alpha/beta/rc tags. # The full version, including alpha/beta/rc tags.
release = u'0.8.0' release = u'0.8.2'
# The language for content autogenerated by Sphinx. Refer to documentation # The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages. # for a list of supported languages.

View File

@@ -26,6 +26,10 @@
# include "simd/avx/affine.h" # include "simd/avx/affine.h"
#endif #endif
#ifdef CGLM_NEON_FP
# include "simd/neon/affine.h"
#endif
/*! /*!
* @brief this is similar to glm_mat4_mul but specialized to affine transform * @brief this is similar to glm_mat4_mul but specialized to affine transform
* *
@@ -49,6 +53,8 @@ glm_mul(mat4 m1, mat4 m2, mat4 dest) {
glm_mul_avx(m1, m2, dest); glm_mul_avx(m1, m2, dest);
#elif defined( __SSE__ ) || defined( __SSE2__ ) #elif defined( __SSE__ ) || defined( __SSE2__ )
glm_mul_sse2(m1, m2, dest); glm_mul_sse2(m1, m2, dest);
#elif defined(CGLM_NEON_FP)
glm_mul_neon(m1, m2, dest);
#else #else
float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3], float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3],
a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2], a13 = m1[1][3], a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2], a13 = m1[1][3],
@@ -103,6 +109,8 @@ void
glm_mul_rot(mat4 m1, mat4 m2, mat4 dest) { glm_mul_rot(mat4 m1, mat4 m2, mat4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
glm_mul_rot_sse2(m1, m2, dest); glm_mul_rot_sse2(m1, m2, dest);
#elif defined(CGLM_NEON_FP)
glm_mul_rot_neon(m1, m2, dest);
#else #else
float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3], float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3],
a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2], a13 = m1[1][3], a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2], a13 = m1[1][3],

View File

@@ -50,26 +50,22 @@
CGLM_INLINE CGLM_INLINE
void void
glm_translate(mat4 m, vec3 v) { glm_translate(mat4 m, vec3 v) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined(CGLM_SIMD)
glmm_128 m0, m1, m2, m3;
m0 = glmm_load(m[0]);
m1 = glmm_load(m[1]);
m2 = glmm_load(m[2]);
m3 = glmm_load(m[3]);
glmm_store(m[3], glmm_store(m[3],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_load(m[0]), glmm_fmadd(m0, glmm_set1(v[0]),
_mm_set1_ps(v[0])), glmm_fmadd(m1, glmm_set1(v[1]),
_mm_mul_ps(glmm_load(m[1]), glmm_fmadd(m2, glmm_set1(v[2]), m3))));
_mm_set1_ps(v[1]))),
_mm_add_ps(_mm_mul_ps(glmm_load(m[2]),
_mm_set1_ps(v[2])),
glmm_load(m[3]))))
;
#else #else
vec4 v1, v2, v3; glm_vec4_muladds(m[0], v[0], m[3]);
glm_vec4_muladds(m[1], v[1], m[3]);
glm_vec4_scale(m[0], v[0], v1); glm_vec4_muladds(m[2], v[2], m[3]);
glm_vec4_scale(m[1], v[1], v2);
glm_vec4_scale(m[2], v[2], v3);
glm_vec4_add(v1, m[3], m[3]);
glm_vec4_add(v2, m[3], m[3]);
glm_vec4_add(v3, m[3], m[3]);
#endif #endif
} }
@@ -99,12 +95,8 @@ glm_translate_to(mat4 m, vec3 v, mat4 dest) {
CGLM_INLINE CGLM_INLINE
void void
glm_translate_x(mat4 m, float x) { glm_translate_x(mat4 m, float x) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined(CGLM_SIMD)
glmm_store(m[3], glmm_store(m[3], glmm_fmadd(glmm_load(m[0]), glmm_set1(x), glmm_load(m[3])));
_mm_add_ps(_mm_mul_ps(glmm_load(m[0]),
_mm_set1_ps(x)),
glmm_load(m[3])))
;
#else #else
vec4 v1; vec4 v1;
glm_vec4_scale(m[0], x, v1); glm_vec4_scale(m[0], x, v1);
@@ -121,12 +113,8 @@ glm_translate_x(mat4 m, float x) {
CGLM_INLINE CGLM_INLINE
void void
glm_translate_y(mat4 m, float y) { glm_translate_y(mat4 m, float y) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined(CGLM_SIMD)
glmm_store(m[3], glmm_store(m[3], glmm_fmadd(glmm_load(m[1]), glmm_set1(y), glmm_load(m[3])));
_mm_add_ps(_mm_mul_ps(glmm_load(m[1]),
_mm_set1_ps(y)),
glmm_load(m[3])))
;
#else #else
vec4 v1; vec4 v1;
glm_vec4_scale(m[1], y, v1); glm_vec4_scale(m[1], y, v1);
@@ -143,12 +131,8 @@ glm_translate_y(mat4 m, float y) {
CGLM_INLINE CGLM_INLINE
void void
glm_translate_z(mat4 m, float z) { glm_translate_z(mat4 m, float z) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined(CGLM_SIMD)
glmm_store(m[3], glmm_store(m[3], glmm_fmadd(glmm_load(m[2]), glmm_set1(z), glmm_load(m[3])));
_mm_add_ps(_mm_mul_ps(glmm_load(m[2]),
_mm_set1_ps(z)),
glmm_load(m[3])))
;
#else #else
vec4 v1; vec4 v1;
glm_vec4_scale(m[2], z, v1); glm_vec4_scale(m[2], z, v1);
@@ -455,7 +439,7 @@ glm_decompose_rs(mat4 m, mat4 r, vec3 s) {
glm_vec4_scale(r[1], 1.0f/s[1], r[1]); glm_vec4_scale(r[1], 1.0f/s[1], r[1]);
glm_vec4_scale(r[2], 1.0f/s[2], r[2]); glm_vec4_scale(r[2], 1.0f/s[2], r[2]);
/* Note from Apple Open Source (asume that the matrix is orthonormal): /* Note from Apple Open Source (assume that the matrix is orthonormal):
check for a coordinate system flip. If the determinant check for a coordinate system flip. If the determinant
is -1, then negate the matrix and the scaling factors. */ is -1, then negate the matrix and the scaling factors. */
glm_vec3_cross(m[0], m[1], v); glm_vec3_cross(m[0], m[1], v);

View File

@@ -228,6 +228,8 @@ glm_aabb_aabb(vec3 box[2], vec3 other[2]) {
* https://github.com/erich666/GraphicsGems/blob/master/gems/BoxSphere.c * https://github.com/erich666/GraphicsGems/blob/master/gems/BoxSphere.c
* Solid Box - Solid Sphere test. * Solid Box - Solid Sphere test.
* *
* Sphere Representation in cglm: [center.x, center.y, center.z, radii]
*
* @param[in] box solid bounding box * @param[in] box solid bounding box
* @param[in] s solid sphere * @param[in] s solid sphere
*/ */
@@ -237,13 +239,13 @@ glm_aabb_sphere(vec3 box[2], vec4 s) {
float dmin; float dmin;
int a, b, c; int a, b, c;
a = s[0] >= box[0][0]; a = (s[0] < box[0][0]) + (s[0] > box[1][0]);
b = s[1] >= box[0][1]; b = (s[1] < box[0][1]) + (s[1] > box[1][1]);
c = s[2] >= box[0][2]; c = (s[2] < box[0][2]) + (s[2] > box[1][2]);
dmin = glm_pow2(s[0] - box[a][0]) dmin = glm_pow2((s[0] - box[!(a - 1)][0]) * (a != 0))
+ glm_pow2(s[1] - box[b][1]) + glm_pow2((s[1] - box[!(b - 1)][1]) * (b != 0))
+ glm_pow2(s[2] - box[c][2]); + glm_pow2((s[2] - box[!(c - 1)][2]) * (c != 0));
return dmin <= glm_pow2(s[3]); return dmin <= glm_pow2(s[3]);
} }

View File

@@ -539,7 +539,9 @@ glm_mat4_scale_p(mat4 m, float s) {
CGLM_INLINE CGLM_INLINE
void void
glm_mat4_scale(mat4 m, float s) { glm_mat4_scale(mat4 m, float s) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #ifdef __AVX__
glm_mat4_scale_avx(m, s);
#elif defined( __SSE__ ) || defined( __SSE2__ )
glm_mat4_scale_sse2(m, s); glm_mat4_scale_sse2(m, s);
#elif defined(CGLM_NEON_FP) #elif defined(CGLM_NEON_FP)
glm_mat4_scale_neon(m, s); glm_mat4_scale_neon(m, s);
@@ -560,6 +562,8 @@ float
glm_mat4_det(mat4 mat) { glm_mat4_det(mat4 mat) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
return glm_mat4_det_sse2(mat); return glm_mat4_det_sse2(mat);
#elif defined(CGLM_NEON_FP)
return glm_mat4_det_neon(mat);
#else #else
/* [square] det(A) = det(At) */ /* [square] det(A) = det(At) */
float t[6]; float t[6];
@@ -593,6 +597,8 @@ void
glm_mat4_inv(mat4 mat, mat4 dest) { glm_mat4_inv(mat4 mat, mat4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
glm_mat4_inv_sse2(mat, dest); glm_mat4_inv_sse2(mat, dest);
#elif defined(CGLM_NEON_FP)
glm_mat4_inv_neon(mat, dest);
#else #else
float t[6]; float t[6];
float det; float det;

View File

@@ -10,19 +10,42 @@
#include "intrin.h" #include "intrin.h"
#ifdef CGLM_SIMD_ARM #ifdef CGLM_SIMD_ARM
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__)
# define CGLM_ARM64 1
#endif
#define glmm_load(p) vld1q_f32(p) #define glmm_load(p) vld1q_f32(p)
#define glmm_store(p, a) vst1q_f32(p, a) #define glmm_store(p, a) vst1q_f32(p, a)
#define glmm_set1(x) vdupq_n_f32(x)
#define glmm_128 float32x4_t
#define glmm_splat_x(x) vdupq_lane_f32(vget_low_f32(x), 0)
#define glmm_splat_y(x) vdupq_lane_f32(vget_low_f32(x), 1)
#define glmm_splat_z(x) vdupq_lane_f32(vget_high_f32(x), 0)
#define glmm_splat_w(x) vdupq_lane_f32(vget_high_f32(x), 1)
#define glmm_xor(a, b) \
vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a), \
vreinterpretq_s32_f32(b)))
static inline static inline
float32x4_t float32x4_t
glmm_abs(float32x4_t v) { glmm_abs(float32x4_t v) {
return vabsq_f32(v); return vabsq_f32(v);
} }
static inline
float32x4_t
glmm_vhadd(float32x4_t v) {
v = vaddq_f32(v, vrev64q_f32(v));
return vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v)));
}
static inline static inline
float float
glmm_hadd(float32x4_t v) { glmm_hadd(float32x4_t v) {
#if defined(__aarch64__) #if CGLM_ARM64
return vaddvq_f32(v); return vaddvq_f32(v);
#else #else
v = vaddq_f32(v, vrev64q_f32(v)); v = vaddq_f32(v, vrev64q_f32(v));
@@ -79,5 +102,58 @@ glmm_norm_inf(float32x4_t a) {
return glmm_hmax(glmm_abs(a)); return glmm_hmax(glmm_abs(a));
} }
static inline
float32x4_t
glmm_div(float32x4_t a, float32x4_t b) {
#if CGLM_ARM64
return vdivq_f32(a, b);
#else
/* 2 iterations of Newton-Raphson refinement of reciprocal */
float32x4_t r0, r1;
r0 = vrecpeq_f32(b);
r1 = vrecpsq_f32(r0, b);
r0 = vmulq_f32(r1, r0);
r1 = vrecpsq_f32(r0, b);
r0 = vmulq_f32(r1, r0);
return vmulq_f32(a, r0);
#endif
}
static inline
float32x4_t
glmm_fmadd(float32x4_t a, float32x4_t b, float32x4_t c) {
#if CGLM_ARM64
return vfmaq_f32(c, a, b); /* why vfmaq_f32 is slower than vmlaq_f32 ??? */
#else
return vmlaq_f32(c, a, b);
#endif
}
static inline
float32x4_t
glmm_fnmadd(float32x4_t a, float32x4_t b, float32x4_t c) {
#if CGLM_ARM64
return vfmsq_f32(c, a, b);
#else
return vmlsq_f32(c, a, b);
#endif
}
static inline
float32x4_t
glmm_fmsub(float32x4_t a, float32x4_t b, float32x4_t c) {
#if CGLM_ARM64
return vfmsq_f32(c, a, b);
#else
return vmlsq_f32(c, a, b);
#endif
}
static inline
float32x4_t
glmm_fnmsub(float32x4_t a, float32x4_t b, float32x4_t c) {
return vsubq_f32(vdupq_n_f32(0.0f), glmm_fmadd(a, b, c));
}
#endif #endif
#endif /* cglm_simd_arm_h */ #endif /* cglm_simd_arm_h */

View File

@@ -14,6 +14,16 @@
#include <immintrin.h> #include <immintrin.h>
CGLM_INLINE
void
glm_mat4_scale_avx(mat4 m, float s) {
__m256 y0;
y0 = _mm256_set1_ps(s);
glmm_store256(m[0], _mm256_mul_ps(y0, glmm_load256(m[0])));
glmm_store256(m[2], _mm256_mul_ps(y0, glmm_load256(m[2])));
}
CGLM_INLINE CGLM_INLINE
void void
glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) { glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {

View File

@@ -0,0 +1,80 @@
/*
* Copyright (c), Recep Aslantas.
*
* MIT License (MIT), http://opensource.org/licenses/MIT
* Full license can be found in the LICENSE file
*/
#ifndef cglm_affine_neon_h
#define cglm_affine_neon_h
#if defined(__ARM_NEON_FP)
#include "../../common.h"
#include "../intrin.h"
CGLM_INLINE
void
glm_mul_neon(mat4 m1, mat4 m2, mat4 dest) {
/* D = R * L (Column-Major) */
glmm_128 l0, l1, l2, l3, r0, r1, r2, r3, v0, v1, v2, v3;
l0 = glmm_load(m1[0]); r0 = glmm_load(m2[0]);
l1 = glmm_load(m1[1]); r1 = glmm_load(m2[1]);
l2 = glmm_load(m1[2]); r2 = glmm_load(m2[2]);
l3 = glmm_load(m1[3]); r3 = glmm_load(m2[3]);
v0 = vmulq_f32(glmm_splat_x(r0), l0);
v1 = vmulq_f32(glmm_splat_x(r1), l0);
v2 = vmulq_f32(glmm_splat_x(r2), l0);
v3 = vmulq_f32(glmm_splat_x(r3), l0);
v0 = glmm_fmadd(glmm_splat_y(r0), l1, v0);
v1 = glmm_fmadd(glmm_splat_y(r1), l1, v1);
v2 = glmm_fmadd(glmm_splat_y(r2), l1, v2);
v3 = glmm_fmadd(glmm_splat_y(r3), l1, v3);
v0 = glmm_fmadd(glmm_splat_z(r0), l2, v0);
v1 = glmm_fmadd(glmm_splat_z(r1), l2, v1);
v2 = glmm_fmadd(glmm_splat_z(r2), l2, v2);
v3 = glmm_fmadd(glmm_splat_z(r3), l2, v3);
v3 = glmm_fmadd(glmm_splat_w(r3), l3, v3);
glmm_store(dest[0], v0);
glmm_store(dest[1], v1);
glmm_store(dest[2], v2);
glmm_store(dest[3], v3);
}
CGLM_INLINE
void
glm_mul_rot_neon(mat4 m1, mat4 m2, mat4 dest) {
/* D = R * L (Column-Major) */
glmm_128 l0, l1, l2, r0, r1, r2, v0, v1, v2;
l0 = glmm_load(m1[0]); r0 = glmm_load(m2[0]);
l1 = glmm_load(m1[1]); r1 = glmm_load(m2[1]);
l2 = glmm_load(m1[2]); r2 = glmm_load(m2[2]);
v0 = vmulq_f32(glmm_splat_x(r0), l0);
v1 = vmulq_f32(glmm_splat_x(r1), l0);
v2 = vmulq_f32(glmm_splat_x(r2), l0);
v0 = glmm_fmadd(glmm_splat_y(r0), l1, v0);
v1 = glmm_fmadd(glmm_splat_y(r1), l1, v1);
v2 = glmm_fmadd(glmm_splat_y(r2), l1, v2);
v0 = glmm_fmadd(glmm_splat_z(r0), l2, v0);
v1 = glmm_fmadd(glmm_splat_z(r1), l2, v1);
v2 = glmm_fmadd(glmm_splat_z(r2), l2, v2);
glmm_store(dest[0], v0);
glmm_store(dest[1], v1);
glmm_store(dest[2], v2);
glmm_store(dest[3], glmm_load(m1[3]));
}
#endif
#endif /* cglm_affine_neon_h */

View File

@@ -42,41 +42,38 @@ CGLM_INLINE
void void
glm_mat4_mul_neon(mat4 m1, mat4 m2, mat4 dest) { glm_mat4_mul_neon(mat4 m1, mat4 m2, mat4 dest) {
/* D = R * L (Column-Major) */ /* D = R * L (Column-Major) */
float32x4_t l0, l1, l2, l3, r, d0, d1, d2, d3;
l0 = vld1q_f32(m2[0]); glmm_128 l0, l1, l2, l3, r0, r1, r2, r3, v0, v1, v2, v3;
l1 = vld1q_f32(m2[1]);
l2 = vld1q_f32(m2[2]);
l3 = vld1q_f32(m2[3]);
r = vld1q_f32(m1[0]); l0 = glmm_load(m1[0]); r0 = glmm_load(m2[0]);
d0 = vmulq_lane_f32(r, vget_low_f32(l0), 0); l1 = glmm_load(m1[1]); r1 = glmm_load(m2[1]);
d1 = vmulq_lane_f32(r, vget_low_f32(l1), 0); l2 = glmm_load(m1[2]); r2 = glmm_load(m2[2]);
d2 = vmulq_lane_f32(r, vget_low_f32(l2), 0); l3 = glmm_load(m1[3]); r3 = glmm_load(m2[3]);
d3 = vmulq_lane_f32(r, vget_low_f32(l3), 0);
r = vld1q_f32(m1[1]); v0 = vmulq_f32(glmm_splat_x(r0), l0);
d0 = vmlaq_lane_f32(d0, r, vget_low_f32(l0), 1); v1 = vmulq_f32(glmm_splat_x(r1), l0);
d1 = vmlaq_lane_f32(d1, r, vget_low_f32(l1), 1); v2 = vmulq_f32(glmm_splat_x(r2), l0);
d2 = vmlaq_lane_f32(d2, r, vget_low_f32(l2), 1); v3 = vmulq_f32(glmm_splat_x(r3), l0);
d3 = vmlaq_lane_f32(d3, r, vget_low_f32(l3), 1);
r = vld1q_f32(m1[2]); v0 = glmm_fmadd(glmm_splat_y(r0), l1, v0);
d0 = vmlaq_lane_f32(d0, r, vget_high_f32(l0), 0); v1 = glmm_fmadd(glmm_splat_y(r1), l1, v1);
d1 = vmlaq_lane_f32(d1, r, vget_high_f32(l1), 0); v2 = glmm_fmadd(glmm_splat_y(r2), l1, v2);
d2 = vmlaq_lane_f32(d2, r, vget_high_f32(l2), 0); v3 = glmm_fmadd(glmm_splat_y(r3), l1, v3);
d3 = vmlaq_lane_f32(d3, r, vget_high_f32(l3), 0);
r = vld1q_f32(m1[3]); v0 = glmm_fmadd(glmm_splat_z(r0), l2, v0);
d0 = vmlaq_lane_f32(d0, r, vget_high_f32(l0), 1); v1 = glmm_fmadd(glmm_splat_z(r1), l2, v1);
d1 = vmlaq_lane_f32(d1, r, vget_high_f32(l1), 1); v2 = glmm_fmadd(glmm_splat_z(r2), l2, v2);
d2 = vmlaq_lane_f32(d2, r, vget_high_f32(l2), 1); v3 = glmm_fmadd(glmm_splat_z(r3), l2, v3);
d3 = vmlaq_lane_f32(d3, r, vget_high_f32(l3), 1);
vst1q_f32(dest[0], d0); v0 = glmm_fmadd(glmm_splat_w(r0), l3, v0);
vst1q_f32(dest[1], d1); v1 = glmm_fmadd(glmm_splat_w(r1), l3, v1);
vst1q_f32(dest[2], d2); v2 = glmm_fmadd(glmm_splat_w(r2), l3, v2);
vst1q_f32(dest[3], d3); v3 = glmm_fmadd(glmm_splat_w(r3), l3, v3);
glmm_store(dest[0], v0);
glmm_store(dest[1], v1);
glmm_store(dest[2], v2);
glmm_store(dest[3], v3);
} }
CGLM_INLINE CGLM_INLINE
@@ -101,5 +98,216 @@ glm_mat4_mulv_neon(mat4 m, vec4 v, vec4 dest) {
vst1q_f32(dest, l0); vst1q_f32(dest, l0);
} }
CGLM_INLINE
float
glm_mat4_det_neon(mat4 mat) {
float32x4_t r0, r1, r2, r3, x0, x1, x2;
float32x2_t ij, op, mn, kl, nn, mm, jj, ii, gh, ef, t12, t34;
float32x4x2_t a1;
float32x4_t x3 = { 0.f, -0.f, 0.f, -0.f };
/* 127 <- 0, [square] det(A) = det(At) */
r0 = glmm_load(mat[0]); /* d c b a */
r1 = vrev64q_f32(glmm_load(mat[1])); /* g h e f */
r2 = vrev64q_f32(glmm_load(mat[2])); /* l k i j */
r3 = vrev64q_f32(glmm_load(mat[3])); /* o p m n */
gh = vget_high_f32(r1);
ef = vget_low_f32(r1);
kl = vget_high_f32(r2);
ij = vget_low_f32(r2);
op = vget_high_f32(r3);
mn = vget_low_f32(r3);
mm = vdup_lane_f32(mn, 1);
nn = vdup_lane_f32(mn, 0);
ii = vdup_lane_f32(ij, 1);
jj = vdup_lane_f32(ij, 0);
/*
t[1] = j * p - n * l;
t[2] = j * o - n * k;
t[3] = i * p - m * l;
t[4] = i * o - m * k;
*/
x0 = glmm_fnmadd(vcombine_f32(kl, kl), vcombine_f32(nn, mm),
vmulq_f32(vcombine_f32(op, op), vcombine_f32(jj, ii)));
t12 = vget_low_f32(x0);
t34 = vget_high_f32(x0);
/* 1 3 1 3 2 4 2 4 */
a1 = vuzpq_f32(x0, x0);
/*
t[0] = k * p - o * l;
t[0] = k * p - o * l;
t[5] = i * n - m * j;
t[5] = i * n - m * j;
*/
x1 = glmm_fnmadd(vcombine_f32(vdup_lane_f32(kl, 0), jj),
vcombine_f32(vdup_lane_f32(op, 1), mm),
vmulq_f32(vcombine_f32(vdup_lane_f32(op, 0), nn),
vcombine_f32(vdup_lane_f32(kl, 1), ii)));
/*
a * (f * t[0] - g * t[1] + h * t[2])
- b * (e * t[0] - g * t[3] + h * t[4])
+ c * (e * t[1] - f * t[3] + h * t[5])
- d * (e * t[2] - f * t[4] + g * t[5])
*/
x2 = glmm_fnmadd(vcombine_f32(vdup_lane_f32(gh, 1), vdup_lane_f32(ef, 0)),
vcombine_f32(vget_low_f32(a1.val[0]), t34),
vmulq_f32(vcombine_f32(ef, vdup_lane_f32(ef, 1)),
vcombine_f32(vget_low_f32(x1), t12)));
x2 = glmm_fmadd(vcombine_f32(vdup_lane_f32(gh, 0), gh),
vcombine_f32(vget_low_f32(a1.val[1]), vget_high_f32(x1)), x2);
x2 = glmm_xor(x2, x3);
return glmm_hadd(vmulq_f32(x2, r0));
}
CGLM_INLINE
void
glm_mat4_inv_neon(mat4 mat, mat4 dest) {
float32x4_t r0, r1, r2, r3,
v0, v1, v2, v3,
t0, t1, t2, t3, t4, t5,
x0, x1, x2, x3, x4, x5, x6, x7, x8;
float32x4x2_t a1;
float32x2_t lp, ko, hg, jn, im, fe, ae, bf, cg, dh;
float32x4_t x9 = { -0.f, 0.f, -0.f, 0.f };
x8 = vrev64q_f32(x9);
/* 127 <- 0 */
r0 = glmm_load(mat[0]); /* d c b a */
r1 = glmm_load(mat[1]); /* h g f e */
r2 = glmm_load(mat[2]); /* l k j i */
r3 = glmm_load(mat[3]); /* p o n m */
/* l p k o, j n i m */
a1 = vzipq_f32(r3, r2);
jn = vget_high_f32(a1.val[0]);
im = vget_low_f32(a1.val[0]);
lp = vget_high_f32(a1.val[1]);
ko = vget_low_f32(a1.val[1]);
hg = vget_high_f32(r1);
x1 = vcombine_f32(vdup_lane_f32(lp, 0), lp); /* l p p p */
x2 = vcombine_f32(vdup_lane_f32(ko, 0), ko); /* k o o o */
x0 = vcombine_f32(vdup_lane_f32(lp, 1), vdup_lane_f32(hg, 1)); /* h h l l */
x3 = vcombine_f32(vdup_lane_f32(ko, 1), vdup_lane_f32(hg, 0)); /* g g k k */
/* t1[0] = k * p - o * l;
t1[0] = k * p - o * l;
t2[0] = g * p - o * h;
t3[0] = g * l - k * h; */
t0 = glmm_fnmadd(x2, x0, vmulq_f32(x3, x1));
fe = vget_low_f32(r1);
x4 = vcombine_f32(vdup_lane_f32(jn, 0), jn); /* j n n n */
x5 = vcombine_f32(vdup_lane_f32(jn, 1), vdup_lane_f32(fe, 1)); /* f f j j */
/* t1[1] = j * p - n * l;
t1[1] = j * p - n * l;
t2[1] = f * p - n * h;
t3[1] = f * l - j * h; */
t1 = glmm_fnmadd(x4, x0, vmulq_f32(x5, x1));
/* t1[2] = j * o - n * k
t1[2] = j * o - n * k;
t2[2] = f * o - n * g;
t3[2] = f * k - j * g; */
t2 = glmm_fnmadd(x4, x3, vmulq_f32(x5, x2));
x6 = vcombine_f32(vdup_lane_f32(im, 1), vdup_lane_f32(fe, 0)); /* e e i i */
x7 = vcombine_f32(vdup_lane_f32(im, 0), im); /* i m m m */
/* t1[3] = i * p - m * l;
t1[3] = i * p - m * l;
t2[3] = e * p - m * h;
t3[3] = e * l - i * h; */
t3 = glmm_fnmadd(x7, x0, vmulq_f32(x6, x1));
/* t1[4] = i * o - m * k;
t1[4] = i * o - m * k;
t2[4] = e * o - m * g;
t3[4] = e * k - i * g; */
t4 = glmm_fnmadd(x7, x3, vmulq_f32(x6, x2));
/* t1[5] = i * n - m * j;
t1[5] = i * n - m * j;
t2[5] = e * n - m * f;
t3[5] = e * j - i * f; */
t5 = glmm_fnmadd(x7, x5, vmulq_f32(x6, x4));
/* h d f b, g c e a */
a1 = vtrnq_f32(r0, r1);
x4 = vrev64q_f32(a1.val[0]); /* c g a e */
x5 = vrev64q_f32(a1.val[1]); /* d h b f */
ae = vget_low_f32(x4);
cg = vget_high_f32(x4);
bf = vget_low_f32(x5);
dh = vget_high_f32(x5);
x0 = vcombine_f32(ae, vdup_lane_f32(ae, 1)); /* a a a e */
x1 = vcombine_f32(bf, vdup_lane_f32(bf, 1)); /* b b b f */
x2 = vcombine_f32(cg, vdup_lane_f32(cg, 1)); /* c c c g */
x3 = vcombine_f32(dh, vdup_lane_f32(dh, 1)); /* d d d h */
/*
dest[0][0] = f * t1[0] - g * t1[1] + h * t1[2];
dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]);
dest[0][2] = b * t2[0] - c * t2[1] + d * t2[2];
dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */
v0 = glmm_xor(glmm_fmadd(x3, t2, glmm_fnmadd(x2, t1, vmulq_f32(x1, t0))), x8);
/*
dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5];
dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]);
dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5];
dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/
v2 = glmm_xor(glmm_fmadd(x3, t5, glmm_fnmadd(x1, t3, vmulq_f32(x0, t1))), x8);
/*
dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]);
dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4];
dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]);
dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */
v1 = glmm_xor(glmm_fmadd(x3, t4, glmm_fnmadd(x2, t3, vmulq_f32(x0, t0))), x9);
/*
dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]);
dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5];
dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]);
dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */
v3 = glmm_xor(glmm_fmadd(x2, t5, glmm_fnmadd(x1, t4, vmulq_f32(x0, t2))), x9);
/* determinant */
x0 = vcombine_f32(vget_low_f32(vzipq_f32(v0, v1).val[0]),
vget_low_f32(vzipq_f32(v2, v3).val[0]));
/*
x0 = glmm_div(glmm_set1(1.0f), glmm_vhadd(vmulq_f32(x0, r0)));
glmm_store(dest[0], vmulq_f32(v0, x0));
glmm_store(dest[1], vmulq_f32(v1, x0));
glmm_store(dest[2], vmulq_f32(v2, x0));
glmm_store(dest[3], vmulq_f32(v3, x0));
*/
x0 = glmm_vhadd(vmulq_f32(x0, r0));
glmm_store(dest[0], glmm_div(v0, x0));
glmm_store(dest[1], glmm_div(v1, x0));
glmm_store(dest[2], glmm_div(v2, x0));
glmm_store(dest[3], glmm_div(v3, x0));
}
#endif #endif
#endif /* cglm_mat4_neon_h */ #endif /* cglm_mat4_neon_h */

View File

@@ -22,31 +22,31 @@ glm_mul_sse2(mat4 m1, mat4 m2, mat4 dest) {
l1 = glmm_load(m1[1]); l1 = glmm_load(m1[1]);
l2 = glmm_load(m1[2]); l2 = glmm_load(m1[2]);
l3 = glmm_load(m1[3]); l3 = glmm_load(m1[3]);
r = glmm_load(m2[0]); r = glmm_load(m2[0]);
glmm_store(dest[0], glmm_store(dest[0],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0), glmm_fmadd(glmm_splat(r, 0), l0,
_mm_mul_ps(glmm_shuff1x(r, 1), l1)), glmm_fmadd(glmm_splat(r, 1), l1,
_mm_mul_ps(glmm_shuff1x(r, 2), l2))); _mm_mul_ps(glmm_splat(r, 2), l2))));
r = glmm_load(m2[1]); r = glmm_load(m2[1]);
glmm_store(dest[1], glmm_store(dest[1],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0), glmm_fmadd(glmm_splat(r, 0), l0,
_mm_mul_ps(glmm_shuff1x(r, 1), l1)), glmm_fmadd(glmm_splat(r, 1), l1,
_mm_mul_ps(glmm_shuff1x(r, 2), l2))); _mm_mul_ps(glmm_splat(r, 2), l2))));
r = glmm_load(m2[2]); r = glmm_load(m2[2]);
glmm_store(dest[2], glmm_store(dest[2],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0), glmm_fmadd(glmm_splat(r, 0), l0,
_mm_mul_ps(glmm_shuff1x(r, 1), l1)), glmm_fmadd(glmm_splat(r, 1), l1,
_mm_mul_ps(glmm_shuff1x(r, 2), l2))); _mm_mul_ps(glmm_splat(r, 2), l2))));
r = glmm_load(m2[3]); r = glmm_load(m2[3]);
glmm_store(dest[3], glmm_store(dest[3],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0), glmm_fmadd(glmm_splat(r, 0), l0,
_mm_mul_ps(glmm_shuff1x(r, 1), l1)), glmm_fmadd(glmm_splat(r, 1), l1,
_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 2), l2), glmm_fmadd(glmm_splat(r, 2), l2,
_mm_mul_ps(glmm_shuff1x(r, 3), l3)))); _mm_mul_ps(glmm_splat(r, 3), l3)))));
} }
CGLM_INLINE CGLM_INLINE
@@ -62,21 +62,22 @@ glm_mul_rot_sse2(mat4 m1, mat4 m2, mat4 dest) {
r = glmm_load(m2[0]); r = glmm_load(m2[0]);
glmm_store(dest[0], glmm_store(dest[0],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0), glmm_fmadd(glmm_splat(r, 0), l0,
_mm_mul_ps(glmm_shuff1x(r, 1), l1)), glmm_fmadd(glmm_splat(r, 1), l1,
_mm_mul_ps(glmm_shuff1x(r, 2), l2))); _mm_mul_ps(glmm_splat(r, 2), l2))));
r = glmm_load(m2[1]); r = glmm_load(m2[1]);
glmm_store(dest[1], glmm_store(dest[1],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0), glmm_fmadd(glmm_splat(r, 0), l0,
_mm_mul_ps(glmm_shuff1x(r, 1), l1)), glmm_fmadd(glmm_splat(r, 1), l1,
_mm_mul_ps(glmm_shuff1x(r, 2), l2))); _mm_mul_ps(glmm_splat(r, 2), l2))));
r = glmm_load(m2[2]); r = glmm_load(m2[2]);
glmm_store(dest[2], glmm_store(dest[2],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0), glmm_fmadd(glmm_splat(r, 0), l0,
_mm_mul_ps(glmm_shuff1x(r, 1), l1)), glmm_fmadd(glmm_splat(r, 1), l1,
_mm_mul_ps(glmm_shuff1x(r, 2), l2))); _mm_mul_ps(glmm_splat(r, 2), l2))));
glmm_store(dest[3], l3); glmm_store(dest[3], l3);
} }
@@ -94,9 +95,9 @@ glm_inv_tr_sse2(mat4 mat) {
_MM_TRANSPOSE4_PS(r0, r1, r2, x1); _MM_TRANSPOSE4_PS(r0, r1, r2, x1);
x0 = _mm_add_ps(_mm_mul_ps(r0, glmm_shuff1(r3, 0, 0, 0, 0)), x0 = glmm_fmadd(r0, glmm_shuff1(r3, 0, 0, 0, 0),
_mm_mul_ps(r1, glmm_shuff1(r3, 1, 1, 1, 1))); glmm_fmadd(r1, glmm_shuff1(r3, 1, 1, 1, 1),
x0 = _mm_add_ps(x0, _mm_mul_ps(r2, glmm_shuff1(r3, 2, 2, 2, 2))); _mm_mul_ps(r2, glmm_shuff1(r3, 2, 2, 2, 2))));
x0 = _mm_xor_ps(x0, _mm_set1_ps(-0.f)); x0 = _mm_xor_ps(x0, _mm_set1_ps(-0.f));
x0 = _mm_add_ps(x0, x1); x0 = _mm_add_ps(x0, x1);

View File

@@ -26,11 +26,11 @@ glm_mat2_mul_sse2(mat2 m1, mat2 m2, mat2 dest) {
dest[1][0] = a * g + c * h; dest[1][0] = a * g + c * h;
dest[1][1] = b * g + d * h; dest[1][1] = b * g + d * h;
*/ */
x0 = _mm_mul_ps(_mm_movelh_ps(x1, x1), glmm_shuff1(x2, 2, 2, 0, 0)); x0 = glmm_fmadd(_mm_movelh_ps(x1, x1), glmm_shuff1(x2, 2, 2, 0, 0),
x1 = _mm_mul_ps(_mm_movehl_ps(x1, x1), glmm_shuff1(x2, 3, 3, 1, 1)); _mm_mul_ps(_mm_movehl_ps(x1, x1),
x1 = _mm_add_ps(x0, x1); glmm_shuff1(x2, 3, 3, 1, 1)));
glmm_store(dest[0], x1); glmm_store(dest[0], x0);
} }
CGLM_INLINE CGLM_INLINE

View File

@@ -30,24 +30,17 @@ glm_mat3_mul_sse2(mat3 m1, mat3 m2, mat3 dest) {
x1 = glmm_shuff2(l0, l1, 1, 0, 3, 3, 0, 3, 2, 0); x1 = glmm_shuff2(l0, l1, 1, 0, 3, 3, 0, 3, 2, 0);
x2 = glmm_shuff2(l1, l2, 0, 0, 3, 2, 0, 2, 1, 0); x2 = glmm_shuff2(l1, l2, 0, 0, 3, 2, 0, 2, 1, 0);
x0 = _mm_add_ps(_mm_mul_ps(glmm_shuff1(l0, 0, 2, 1, 0), x0 = glmm_fmadd(glmm_shuff1(l0, 0, 2, 1, 0), glmm_shuff1(r0, 3, 0, 0, 0),
glmm_shuff1(r0, 3, 0, 0, 0)), glmm_fmadd(x1, glmm_shuff2(r0, r1, 0, 0, 1, 1, 2, 0, 0, 0),
_mm_mul_ps(x1, glmm_shuff2(r0, r1, 0, 0, 1, 1, 2, 0, 0, 0))); _mm_mul_ps(x2, glmm_shuff2(r0, r1, 1, 1, 2, 2, 2, 0, 0, 0))));
x0 = _mm_add_ps(x0,
_mm_mul_ps(x2, glmm_shuff2(r0, r1, 1, 1, 2, 2, 2, 0, 0, 0)));
_mm_storeu_ps(dest[0], x0); _mm_storeu_ps(dest[0], x0);
x0 = _mm_add_ps(_mm_mul_ps(glmm_shuff1(l0, 1, 0, 2, 1), x0 = glmm_fmadd(glmm_shuff1(l0, 1, 0, 2, 1), _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(2, 2, 3, 3)),
_mm_shuffle_ps(r0, r1, _MM_SHUFFLE(2, 2, 3, 3))), glmm_fmadd(glmm_shuff1(x1, 1, 0, 2, 1), glmm_shuff1(r1, 3, 3, 0, 0),
_mm_mul_ps(glmm_shuff1(x1, 1, 0, 2, 1), _mm_mul_ps(glmm_shuff1(x2, 1, 0, 2, 1),
glmm_shuff1(r1, 3, 3, 0, 0))); _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(0, 0, 1, 1)))));
x0 = _mm_add_ps(x0,
_mm_mul_ps(glmm_shuff1(x2, 1, 0, 2, 1),
_mm_shuffle_ps(r1, r2, _MM_SHUFFLE(0, 0, 1, 1))));
_mm_storeu_ps(&dest[1][1], x0); _mm_storeu_ps(&dest[1][1], x0);
dest[2][2] = m1[0][2] * m2[2][0] dest[2][2] = m1[0][2] * m2[2][0]

View File

@@ -55,47 +55,37 @@ glm_mat4_mul_sse2(mat4 m1, mat4 m2, mat4 dest) {
l1 = glmm_load(m1[1]); l1 = glmm_load(m1[1]);
l2 = glmm_load(m1[2]); l2 = glmm_load(m1[2]);
l3 = glmm_load(m1[3]); l3 = glmm_load(m1[3]);
#define XX(C) \
\
r = glmm_load(m2[C]); \
glmm_store(dest[C], \
glmm_fmadd(glmm_splat(r, 0), l0, \
glmm_fmadd(glmm_splat(r, 1), l1, \
glmm_fmadd(glmm_splat(r, 2), l2, \
_mm_mul_ps(glmm_splat(r, 3), l3)))));
r = glmm_load(m2[0]); XX(0);
glmm_store(dest[0], XX(1);
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0), XX(2);
_mm_mul_ps(glmm_shuff1x(r, 1), l1)), XX(3);
_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 2), l2),
_mm_mul_ps(glmm_shuff1x(r, 3), l3))));
r = glmm_load(m2[1]);
glmm_store(dest[1],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
_mm_mul_ps(glmm_shuff1x(r, 1), l1)),
_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 2), l2),
_mm_mul_ps(glmm_shuff1x(r, 3), l3))));
r = glmm_load(m2[2]);
glmm_store(dest[2],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
_mm_mul_ps(glmm_shuff1x(r, 1), l1)),
_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 2), l2),
_mm_mul_ps(glmm_shuff1x(r, 3), l3))));
r = glmm_load(m2[3]); #undef XX
glmm_store(dest[3],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 0), l0),
_mm_mul_ps(glmm_shuff1x(r, 1), l1)),
_mm_add_ps(_mm_mul_ps(glmm_shuff1x(r, 2), l2),
_mm_mul_ps(glmm_shuff1x(r, 3), l3))));
} }
CGLM_INLINE CGLM_INLINE
void void
glm_mat4_mulv_sse2(mat4 m, vec4 v, vec4 dest) { glm_mat4_mulv_sse2(mat4 m, vec4 v, vec4 dest) {
__m128 x0, x1, x2; __m128 x0, x1;
x0 = glmm_load(v); x0 = glmm_load(v);
x1 = _mm_add_ps(_mm_mul_ps(glmm_load(m[0]), glmm_shuff1x(x0, 0)), x1 = glmm_fmadd(glmm_load(m[0]), glmm_splat(x0, 0),
_mm_mul_ps(glmm_load(m[1]), glmm_shuff1x(x0, 1))); glmm_fmadd(glmm_load(m[1]), glmm_splat(x0, 1),
glmm_fmadd(glmm_load(m[2]), glmm_splat(x0, 2),
x2 = _mm_add_ps(_mm_mul_ps(glmm_load(m[2]), glmm_shuff1x(x0, 2)), _mm_mul_ps(glmm_load(m[3]),
_mm_mul_ps(glmm_load(m[3]), glmm_shuff1x(x0, 3))); glmm_splat(x0, 3)))));
glmm_store(dest, _mm_add_ps(x1, x2)); glmm_store(dest, x1);
} }
CGLM_INLINE CGLM_INLINE
@@ -115,20 +105,18 @@ glm_mat4_det_sse2(mat4 mat) {
t[3] = i * p - m * l; t[3] = i * p - m * l;
t[4] = i * o - m * k; t[4] = i * o - m * k;
*/ */
x0 = _mm_sub_ps(_mm_mul_ps(glmm_shuff1(r2, 0, 0, 1, 1), x0 = glmm_fnmadd(glmm_shuff1(r3, 0, 0, 1, 1), glmm_shuff1(r2, 2, 3, 2, 3),
glmm_shuff1(r3, 2, 3, 2, 3)), _mm_mul_ps(glmm_shuff1(r2, 0, 0, 1, 1),
_mm_mul_ps(glmm_shuff1(r3, 0, 0, 1, 1), glmm_shuff1(r3, 2, 3, 2, 3)));
glmm_shuff1(r2, 2, 3, 2, 3)));
/* /*
t[0] = k * p - o * l; t[0] = k * p - o * l;
t[0] = k * p - o * l; t[0] = k * p - o * l;
t[5] = i * n - m * j; t[5] = i * n - m * j;
t[5] = i * n - m * j; t[5] = i * n - m * j;
*/ */
x1 = _mm_sub_ps(_mm_mul_ps(glmm_shuff1(r2, 0, 0, 2, 2), x1 = glmm_fnmadd(glmm_shuff1(r3, 0, 0, 2, 2), glmm_shuff1(r2, 1, 1, 3, 3),
glmm_shuff1(r3, 1, 1, 3, 3)), _mm_mul_ps(glmm_shuff1(r2, 0, 0, 2, 2),
_mm_mul_ps(glmm_shuff1(r3, 0, 0, 2, 2), glmm_shuff1(r3, 1, 1, 3, 3)));
glmm_shuff1(r2, 1, 1, 3, 3)));
/* /*
a * (f * t[0] - g * t[1] + h * t[2]) a * (f * t[0] - g * t[1] + h * t[2])
@@ -136,21 +124,16 @@ glm_mat4_det_sse2(mat4 mat) {
+ c * (e * t[1] - f * t[3] + h * t[5]) + c * (e * t[1] - f * t[3] + h * t[5])
- d * (e * t[2] - f * t[4] + g * t[5]) - d * (e * t[2] - f * t[4] + g * t[5])
*/ */
x2 = _mm_sub_ps(_mm_mul_ps(glmm_shuff1(r1, 0, 0, 0, 1), x2 = glmm_fnmadd(glmm_shuff1(r1, 1, 1, 2, 2), glmm_shuff1(x0, 3, 2, 2, 0),
_mm_shuffle_ps(x1, x0, _MM_SHUFFLE(1, 0, 0, 0))), _mm_mul_ps(glmm_shuff1(r1, 0, 0, 0, 1),
_mm_mul_ps(glmm_shuff1(r1, 1, 1, 2, 2), _mm_shuffle_ps(x1, x0, _MM_SHUFFLE(1, 0, 0, 0))));
glmm_shuff1(x0, 3, 2, 2, 0))); x2 = glmm_fmadd(glmm_shuff1(r1, 2, 3, 3, 3),
_mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 2, 3, 1)),
x2 = _mm_add_ps(x2, x2);
_mm_mul_ps(glmm_shuff1(r1, 2, 3, 3, 3),
_mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 2, 3, 1))));
x2 = _mm_xor_ps(x2, _mm_set_ps(-0.f, 0.f, -0.f, 0.f)); x2 = _mm_xor_ps(x2, _mm_set_ps(-0.f, 0.f, -0.f, 0.f));
x0 = _mm_mul_ps(r0, x2); return glmm_hadd(_mm_mul_ps(x2, r0));
x0 = _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 2, 3));
x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 3, 3, 1));
return _mm_cvtss_f32(x0);
} }
CGLM_INLINE CGLM_INLINE
@@ -159,8 +142,11 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
__m128 r0, r1, r2, r3, __m128 r0, r1, r2, r3,
v0, v1, v2, v3, v0, v1, v2, v3,
t0, t1, t2, t3, t4, t5, t0, t1, t2, t3, t4, t5,
x0, x1, x2, x3, x4, x5, x6, x7; x0, x1, x2, x3, x4, x5, x6, x7, x8, x9;
x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f);
x9 = glmm_shuff1(x8, 2, 1, 2, 1);
/* 127 <- 0 */ /* 127 <- 0 */
r0 = glmm_load(mat[0]); /* d c b a */ r0 = glmm_load(mat[0]); /* d c b a */
r1 = glmm_load(mat[1]); /* h g f e */ r1 = glmm_load(mat[1]); /* h g f e */
@@ -177,8 +163,8 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
t1[0] = k * p - o * l; t1[0] = k * p - o * l;
t2[0] = g * p - o * h; t2[0] = g * p - o * h;
t3[0] = g * l - k * h; */ t3[0] = g * l - k * h; */
t0 = _mm_sub_ps(_mm_mul_ps(x3, x1), _mm_mul_ps(x2, x0)); t0 = glmm_fnmadd(x2, x0, _mm_mul_ps(x3, x1));
x4 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(2, 1, 2, 1)); /* o n k j */ x4 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(2, 1, 2, 1)); /* o n k j */
x4 = glmm_shuff1(x4, 0, 2, 2, 2); /* j n n n */ x4 = glmm_shuff1(x4, 0, 2, 2, 2); /* j n n n */
x5 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(1, 1, 1, 1)); /* f f j j */ x5 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(1, 1, 1, 1)); /* f f j j */
@@ -187,14 +173,14 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
t1[1] = j * p - n * l; t1[1] = j * p - n * l;
t2[1] = f * p - n * h; t2[1] = f * p - n * h;
t3[1] = f * l - j * h; */ t3[1] = f * l - j * h; */
t1 = _mm_sub_ps(_mm_mul_ps(x5, x1), _mm_mul_ps(x4, x0)); t1 = glmm_fnmadd(x4, x0, _mm_mul_ps(x5, x1));
/* t1[2] = j * o - n * k /* t1[2] = j * o - n * k
t1[2] = j * o - n * k; t1[2] = j * o - n * k;
t2[2] = f * o - n * g; t2[2] = f * o - n * g;
t3[2] = f * k - j * g; */ t3[2] = f * k - j * g; */
t2 = _mm_sub_ps(_mm_mul_ps(x5, x2), _mm_mul_ps(x4, x3)); t2 = glmm_fnmadd(x4, x3, _mm_mul_ps(x5, x2));
x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */ x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */
x7 = glmm_shuff2(r3, r2, 0, 0, 0, 0, 2, 0, 0, 0); /* i m m m */ x7 = glmm_shuff2(r3, r2, 0, 0, 0, 0, 2, 0, 0, 0); /* i m m m */
@@ -202,20 +188,20 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
t1[3] = i * p - m * l; t1[3] = i * p - m * l;
t2[3] = e * p - m * h; t2[3] = e * p - m * h;
t3[3] = e * l - i * h; */ t3[3] = e * l - i * h; */
t3 = _mm_sub_ps(_mm_mul_ps(x6, x1), _mm_mul_ps(x7, x0)); t3 = glmm_fnmadd(x7, x0, _mm_mul_ps(x6, x1));
/* t1[4] = i * o - m * k; /* t1[4] = i * o - m * k;
t1[4] = i * o - m * k; t1[4] = i * o - m * k;
t2[4] = e * o - m * g; t2[4] = e * o - m * g;
t3[4] = e * k - i * g; */ t3[4] = e * k - i * g; */
t4 = _mm_sub_ps(_mm_mul_ps(x6, x2), _mm_mul_ps(x7, x3)); t4 = glmm_fnmadd(x7, x3, _mm_mul_ps(x6, x2));
/* t1[5] = i * n - m * j; /* t1[5] = i * n - m * j;
t1[5] = i * n - m * j; t1[5] = i * n - m * j;
t2[5] = e * n - m * f; t2[5] = e * n - m * f;
t3[5] = e * j - i * f; */ t3[5] = e * j - i * f; */
t5 = _mm_sub_ps(_mm_mul_ps(x6, x4), _mm_mul_ps(x7, x5)); t5 = glmm_fnmadd(x7, x5, _mm_mul_ps(x6, x4));
x0 = glmm_shuff2(r1, r0, 0, 0, 0, 0, 2, 2, 2, 0); /* a a a e */ x0 = glmm_shuff2(r1, r0, 0, 0, 0, 0, 2, 2, 2, 0); /* a a a e */
x1 = glmm_shuff2(r1, r0, 1, 1, 1, 1, 2, 2, 2, 0); /* b b b f */ x1 = glmm_shuff2(r1, r0, 1, 1, 1, 1, 2, 2, 2, 0); /* b b b f */
x2 = glmm_shuff2(r1, r0, 2, 2, 2, 2, 2, 2, 2, 0); /* c c c g */ x2 = glmm_shuff2(r1, r0, 2, 2, 2, 2, 2, 2, 2, 0); /* c c c g */
@@ -226,50 +212,35 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]); dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]);
dest[0][2] = b * t2[0] - c * t2[1] + d * t2[2]; dest[0][2] = b * t2[0] - c * t2[1] + d * t2[2];
dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */ dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */
v0 = _mm_add_ps(_mm_mul_ps(x3, t2), v0 = _mm_xor_ps(glmm_fmadd(x3, t2, glmm_fnmadd(x2, t1, _mm_mul_ps(x1, t0))), x8);
_mm_sub_ps(_mm_mul_ps(x1, t0),
_mm_mul_ps(x2, t1))); /*
v0 = _mm_xor_ps(v0, _mm_set_ps(-0.f, 0.f, -0.f, 0.f)); dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5];
dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]);
dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5];
dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/
v2 = _mm_xor_ps(glmm_fmadd(x3, t5, glmm_fnmadd(x1, t3, _mm_mul_ps(x0, t1))), x8);
/* /*
dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]); dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]);
dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4]; dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4];
dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]); dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]);
dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */ dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */
v1 = _mm_add_ps(_mm_mul_ps(x3, t4), v1 = _mm_xor_ps(glmm_fmadd(x3, t4, glmm_fnmadd(x2, t3, _mm_mul_ps(x0, t0))), x9);
_mm_sub_ps(_mm_mul_ps(x0, t0),
_mm_mul_ps(x2, t3)));
v1 = _mm_xor_ps(v1, _mm_set_ps(0.f, -0.f, 0.f, -0.f));
/*
dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5];
dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]);
dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5];
dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/
v2 = _mm_add_ps(_mm_mul_ps(x3, t5),
_mm_sub_ps(_mm_mul_ps(x0, t1),
_mm_mul_ps(x1, t3)));
v2 = _mm_xor_ps(v2, _mm_set_ps(-0.f, 0.f, -0.f, 0.f));
/* /*
dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]); dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]);
dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5]; dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5];
dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]); dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]);
dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */ dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */
v3 = _mm_add_ps(_mm_mul_ps(x2, t5), v3 = _mm_xor_ps(glmm_fmadd(x2, t5, glmm_fnmadd(x1, t4, _mm_mul_ps(x0, t2))), x9);
_mm_sub_ps(_mm_mul_ps(x0, t2),
_mm_mul_ps(x1, t4)));
v3 = _mm_xor_ps(v3, _mm_set_ps(0.f, -0.f, 0.f, -0.f));
/* determinant */ /* determinant */
x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0)); x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0));
x1 = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 0, 0)); x1 = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 0, 0));
x0 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0)); x0 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0));
x0 = _mm_mul_ps(x0, r0); x0 = _mm_rcp_ps(glmm_vhadd(_mm_mul_ps(x0, r0)));
x0 = _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 2, 3));
x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 0, 1));
x0 = _mm_rcp_ps(x0);
glmm_store(dest[0], _mm_mul_ps(v0, x0)); glmm_store(dest[0], _mm_mul_ps(v0, x0));
glmm_store(dest[1], _mm_mul_ps(v1, x0)); glmm_store(dest[1], _mm_mul_ps(v1, x0));
@@ -283,8 +254,11 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
__m128 r0, r1, r2, r3, __m128 r0, r1, r2, r3,
v0, v1, v2, v3, v0, v1, v2, v3,
t0, t1, t2, t3, t4, t5, t0, t1, t2, t3, t4, t5,
x0, x1, x2, x3, x4, x5, x6, x7; x0, x1, x2, x3, x4, x5, x6, x7, x8, x9;
x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f);
x9 = glmm_shuff1(x8, 2, 1, 2, 1);
/* 127 <- 0 */ /* 127 <- 0 */
r0 = glmm_load(mat[0]); /* d c b a */ r0 = glmm_load(mat[0]); /* d c b a */
r1 = glmm_load(mat[1]); /* h g f e */ r1 = glmm_load(mat[1]); /* h g f e */
@@ -301,8 +275,8 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
t1[0] = k * p - o * l; t1[0] = k * p - o * l;
t2[0] = g * p - o * h; t2[0] = g * p - o * h;
t3[0] = g * l - k * h; */ t3[0] = g * l - k * h; */
t0 = _mm_sub_ps(_mm_mul_ps(x3, x1), _mm_mul_ps(x2, x0)); t0 = glmm_fnmadd(x2, x0, _mm_mul_ps(x3, x1));
x4 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(2, 1, 2, 1)); /* o n k j */ x4 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(2, 1, 2, 1)); /* o n k j */
x4 = glmm_shuff1(x4, 0, 2, 2, 2); /* j n n n */ x4 = glmm_shuff1(x4, 0, 2, 2, 2); /* j n n n */
x5 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(1, 1, 1, 1)); /* f f j j */ x5 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(1, 1, 1, 1)); /* f f j j */
@@ -311,14 +285,14 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
t1[1] = j * p - n * l; t1[1] = j * p - n * l;
t2[1] = f * p - n * h; t2[1] = f * p - n * h;
t3[1] = f * l - j * h; */ t3[1] = f * l - j * h; */
t1 = _mm_sub_ps(_mm_mul_ps(x5, x1), _mm_mul_ps(x4, x0)); t1 = glmm_fnmadd(x4, x0, _mm_mul_ps(x5, x1));
/* t1[2] = j * o - n * k /* t1[2] = j * o - n * k
t1[2] = j * o - n * k; t1[2] = j * o - n * k;
t2[2] = f * o - n * g; t2[2] = f * o - n * g;
t3[2] = f * k - j * g; */ t3[2] = f * k - j * g; */
t2 = _mm_sub_ps(_mm_mul_ps(x5, x2), _mm_mul_ps(x4, x3)); t2 = glmm_fnmadd(x4, x3, _mm_mul_ps(x5, x2));
x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */ x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */
x7 = glmm_shuff2(r3, r2, 0, 0, 0, 0, 2, 0, 0, 0); /* i m m m */ x7 = glmm_shuff2(r3, r2, 0, 0, 0, 0, 2, 0, 0, 0); /* i m m m */
@@ -326,20 +300,20 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
t1[3] = i * p - m * l; t1[3] = i * p - m * l;
t2[3] = e * p - m * h; t2[3] = e * p - m * h;
t3[3] = e * l - i * h; */ t3[3] = e * l - i * h; */
t3 = _mm_sub_ps(_mm_mul_ps(x6, x1), _mm_mul_ps(x7, x0)); t3 = glmm_fnmadd(x7, x0, _mm_mul_ps(x6, x1));
/* t1[4] = i * o - m * k; /* t1[4] = i * o - m * k;
t1[4] = i * o - m * k; t1[4] = i * o - m * k;
t2[4] = e * o - m * g; t2[4] = e * o - m * g;
t3[4] = e * k - i * g; */ t3[4] = e * k - i * g; */
t4 = _mm_sub_ps(_mm_mul_ps(x6, x2), _mm_mul_ps(x7, x3)); t4 = glmm_fnmadd(x7, x3, _mm_mul_ps(x6, x2));
/* t1[5] = i * n - m * j; /* t1[5] = i * n - m * j;
t1[5] = i * n - m * j; t1[5] = i * n - m * j;
t2[5] = e * n - m * f; t2[5] = e * n - m * f;
t3[5] = e * j - i * f; */ t3[5] = e * j - i * f; */
t5 = _mm_sub_ps(_mm_mul_ps(x6, x4), _mm_mul_ps(x7, x5)); t5 = glmm_fnmadd(x7, x5, _mm_mul_ps(x6, x4));
x0 = glmm_shuff2(r1, r0, 0, 0, 0, 0, 2, 2, 2, 0); /* a a a e */ x0 = glmm_shuff2(r1, r0, 0, 0, 0, 0, 2, 2, 2, 0); /* a a a e */
x1 = glmm_shuff2(r1, r0, 1, 1, 1, 1, 2, 2, 2, 0); /* b b b f */ x1 = glmm_shuff2(r1, r0, 1, 1, 1, 1, 2, 2, 2, 0); /* b b b f */
x2 = glmm_shuff2(r1, r0, 2, 2, 2, 2, 2, 2, 2, 0); /* c c c g */ x2 = glmm_shuff2(r1, r0, 2, 2, 2, 2, 2, 2, 2, 0); /* c c c g */
@@ -350,50 +324,35 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]); dest[0][1] =-(b * t1[0] - c * t1[1] + d * t1[2]);
dest[0][2] = b * t2[0] - c * t2[1] + d * t2[2]; dest[0][2] = b * t2[0] - c * t2[1] + d * t2[2];
dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */ dest[0][3] =-(b * t3[0] - c * t3[1] + d * t3[2]); */
v0 = _mm_add_ps(_mm_mul_ps(x3, t2), v0 = _mm_xor_ps(glmm_fmadd(x3, t2, glmm_fnmadd(x2, t1, _mm_mul_ps(x1, t0))), x8);
_mm_sub_ps(_mm_mul_ps(x1, t0),
_mm_mul_ps(x2, t1))); /*
v0 = _mm_xor_ps(v0, _mm_set_ps(-0.f, 0.f, -0.f, 0.f)); dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5];
dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]);
dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5];
dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/
v2 = _mm_xor_ps(glmm_fmadd(x3, t5, glmm_fnmadd(x1, t3, _mm_mul_ps(x0, t1))), x8);
/* /*
dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]); dest[1][0] =-(e * t1[0] - g * t1[3] + h * t1[4]);
dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4]; dest[1][1] = a * t1[0] - c * t1[3] + d * t1[4];
dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]); dest[1][2] =-(a * t2[0] - c * t2[3] + d * t2[4]);
dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */ dest[1][3] = a * t3[0] - c * t3[3] + d * t3[4]; */
v1 = _mm_add_ps(_mm_mul_ps(x3, t4), v1 = _mm_xor_ps(glmm_fmadd(x3, t4, glmm_fnmadd(x2, t3, _mm_mul_ps(x0, t0))), x9);
_mm_sub_ps(_mm_mul_ps(x0, t0),
_mm_mul_ps(x2, t3)));
v1 = _mm_xor_ps(v1, _mm_set_ps(0.f, -0.f, 0.f, -0.f));
/*
dest[2][0] = e * t1[1] - f * t1[3] + h * t1[5];
dest[2][1] =-(a * t1[1] - b * t1[3] + d * t1[5]);
dest[2][2] = a * t2[1] - b * t2[3] + d * t2[5];
dest[2][3] =-(a * t3[1] - b * t3[3] + d * t3[5]);*/
v2 = _mm_add_ps(_mm_mul_ps(x3, t5),
_mm_sub_ps(_mm_mul_ps(x0, t1),
_mm_mul_ps(x1, t3)));
v2 = _mm_xor_ps(v2, _mm_set_ps(-0.f, 0.f, -0.f, 0.f));
/* /*
dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]); dest[3][0] =-(e * t1[2] - f * t1[4] + g * t1[5]);
dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5]; dest[3][1] = a * t1[2] - b * t1[4] + c * t1[5];
dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]); dest[3][2] =-(a * t2[2] - b * t2[4] + c * t2[5]);
dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */ dest[3][3] = a * t3[2] - b * t3[4] + c * t3[5]; */
v3 = _mm_add_ps(_mm_mul_ps(x2, t5), v3 = _mm_xor_ps(glmm_fmadd(x2, t5, glmm_fnmadd(x1, t4, _mm_mul_ps(x0, t2))), x9);
_mm_sub_ps(_mm_mul_ps(x0, t2),
_mm_mul_ps(x1, t4)));
v3 = _mm_xor_ps(v3, _mm_set_ps(0.f, -0.f, 0.f, -0.f));
/* determinant */ /* determinant */
x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0)); x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0));
x1 = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 0, 0)); x1 = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 0, 0));
x0 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0)); x0 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0));
x0 = _mm_mul_ps(x0, r0); x0 = _mm_div_ps(_mm_set1_ps(1.0f), glmm_vhadd(_mm_mul_ps(x0, r0)));
x0 = _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 2, 3));
x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 0, 1));
x0 = _mm_div_ps(_mm_set1_ps(1.0f), x0);
glmm_store(dest[0], _mm_mul_ps(v0, x0)); glmm_store(dest[0], _mm_mul_ps(v0, x0));
glmm_store(dest[1], _mm_mul_ps(v1, x0)); glmm_store(dest[1], _mm_mul_ps(v1, x0));

View File

@@ -27,15 +27,15 @@ glm_quat_mul_sse2(versor p, versor q, versor dest) {
xp = glmm_load(p); /* 3 2 1 0 */ xp = glmm_load(p); /* 3 2 1 0 */
xq = glmm_load(q); xq = glmm_load(q);
r = _mm_mul_ps(glmm_shuff1x(xp, 3), xq); r = _mm_mul_ps(glmm_splat(xp, 3), xq);
x0 = _mm_xor_ps(glmm_shuff1x(xp, 0), _mm_set_ps(-0.f, 0.f, -0.f, 0.f)); x0 = _mm_xor_ps(glmm_splat(xp, 0), _mm_set_ps(-0.f, 0.f, -0.f, 0.f));
r = _mm_add_ps(r, _mm_mul_ps(x0, glmm_shuff1(xq, 0, 1, 2, 3))); r = _mm_add_ps(r, _mm_mul_ps(x0, glmm_shuff1(xq, 0, 1, 2, 3)));
x0 = _mm_xor_ps(glmm_shuff1x(xp, 1), _mm_set_ps(-0.f, -0.f, 0.f, 0.f)); x0 = _mm_xor_ps(glmm_splat(xp, 1), _mm_set_ps(-0.f, -0.f, 0.f, 0.f));
r = _mm_add_ps(r, _mm_mul_ps(x0, glmm_shuff1(xq, 1, 0, 3, 2))); r = _mm_add_ps(r, _mm_mul_ps(x0, glmm_shuff1(xq, 1, 0, 3, 2)));
x0 = _mm_xor_ps(glmm_shuff1x(xp, 2), _mm_set_ps(-0.f, 0.f, 0.f, -0.f)); x0 = _mm_xor_ps(glmm_splat(xp, 2), _mm_set_ps(-0.f, 0.f, 0.f, -0.f));
r = _mm_add_ps(r, _mm_mul_ps(x0, glmm_shuff1(xq, 2, 3, 0, 1))); r = _mm_add_ps(r, _mm_mul_ps(x0, glmm_shuff1(xq, 2, 3, 0, 1)));
glmm_store(dest, r); glmm_store(dest, r);

View File

@@ -18,6 +18,9 @@
# define glmm_store(p, a) _mm_store_ps(p, a) # define glmm_store(p, a) _mm_store_ps(p, a)
#endif #endif
#define glmm_set1(x) _mm_set1_ps(x)
#define glmm_128 __m128
#ifdef CGLM_USE_INT_DOMAIN #ifdef CGLM_USE_INT_DOMAIN
# define glmm_shuff1(xmm, z, y, x, w) \ # define glmm_shuff1(xmm, z, y, x, w) \
_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm), \ _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm), \
@@ -27,7 +30,16 @@
_mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w)) _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w))
#endif #endif
#define glmm_splat(x, lane) glmm_shuff1(x, lane, lane, lane, lane)
#define glmm_splat_x(x) glmm_splat(x, 0)
#define glmm_splat_y(x) glmm_splat(x, 1)
#define glmm_splat_z(x) glmm_splat(x, 2)
#define glmm_splat_w(x) glmm_splat(x, 3)
/* glmm_shuff1x() is DEPRECATED!, use glmm_splat() */
#define glmm_shuff1x(xmm, x) glmm_shuff1(xmm, x, x, x, x) #define glmm_shuff1x(xmm, x) glmm_shuff1(xmm, x, x, x, x)
#define glmm_shuff2(a, b, z0, y0, x0, w0, z1, y1, x1, w1) \ #define glmm_shuff2(a, b, z0, y0, x0, w0, z1, y1, x1, w1) \
glmm_shuff1(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)), \ glmm_shuff1(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)), \
z1, y1, x1, w1) z1, y1, x1, w1)
@@ -48,6 +60,15 @@ glmm_abs(__m128 x) {
return _mm_andnot_ps(_mm_set1_ps(-0.0f), x); return _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
} }
static inline
__m128
glmm_vhadd(__m128 v) {
__m128 x0;
x0 = _mm_add_ps(v, glmm_shuff1(v, 0, 1, 2, 3));
x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 0, 1));
return x0;
}
static inline static inline
__m128 __m128
glmm_vhadds(__m128 v) { glmm_vhadds(__m128 v) {
@@ -80,7 +101,7 @@ glmm_vhmin(__m128 v) {
__m128 x0, x1, x2; __m128 x0, x1, x2;
x0 = _mm_movehl_ps(v, v); /* [2, 3, 2, 3] */ x0 = _mm_movehl_ps(v, v); /* [2, 3, 2, 3] */
x1 = _mm_min_ps(x0, v); /* [0|2, 1|3, 2|2, 3|3] */ x1 = _mm_min_ps(x0, v); /* [0|2, 1|3, 2|2, 3|3] */
x2 = glmm_shuff1x(x1, 1); /* [1|3, 1|3, 1|3, 1|3] */ x2 = glmm_splat(x1, 1); /* [1|3, 1|3, 1|3, 1|3] */
return _mm_min_ss(x1, x2); return _mm_min_ss(x1, x2);
} }
@@ -96,7 +117,7 @@ glmm_vhmax(__m128 v) {
__m128 x0, x1, x2; __m128 x0, x1, x2;
x0 = _mm_movehl_ps(v, v); /* [2, 3, 2, 3] */ x0 = _mm_movehl_ps(v, v); /* [2, 3, 2, 3] */
x1 = _mm_max_ps(x0, v); /* [0|2, 1|3, 2|2, 3|3] */ x1 = _mm_max_ps(x0, v); /* [0|2, 1|3, 2|2, 3|3] */
x2 = glmm_shuff1x(x1, 1); /* [1|3, 1|3, 1|3, 1|3] */ x2 = glmm_splat(x1, 1); /* [1|3, 1|3, 1|3, 1|3] */
return _mm_max_ss(x1, x2); return _mm_max_ss(x1, x2);
} }
@@ -188,5 +209,99 @@ glmm_store3(float v[3], __m128 vx) {
_mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2)); _mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2));
} }
static inline
__m128
glmm_div(__m128 a, __m128 b) {
return _mm_div_ps(a, b);
}
/* enable FMA macro for MSVC? */
#if defined(_MSC_VER) && !defined(__FMA__) && defined(__AVX2__)
# define __FMA__ 1
#endif
static inline
__m128
glmm_fmadd(__m128 a, __m128 b, __m128 c) {
#ifdef __FMA__
return _mm_fmadd_ps(a, b, c);
#else
return _mm_add_ps(c, _mm_mul_ps(a, b));
#endif
}
static inline
__m128
glmm_fnmadd(__m128 a, __m128 b, __m128 c) {
#ifdef __FMA__
return _mm_fnmadd_ps(a, b, c);
#else
return _mm_sub_ps(c, _mm_mul_ps(a, b));
#endif
}
static inline
__m128
glmm_fmsub(__m128 a, __m128 b, __m128 c) {
#ifdef __FMA__
return _mm_fmsub_ps(a, b, c);
#else
return _mm_sub_ps(_mm_mul_ps(a, b), c);
#endif
}
static inline
__m128
glmm_fnmsub(__m128 a, __m128 b, __m128 c) {
#ifdef __FMA__
return _mm_fnmsub_ps(a, b, c);
#else
return _mm_xor_ps(_mm_add_ps(_mm_mul_ps(a, b), c), _mm_set1_ps(-0.0f));
#endif
}
#if defined(__AVX__)
static inline
__m256
glmm256_fmadd(__m256 a, __m256 b, __m256 c) {
#ifdef __FMA__
return _mm256_fmadd_ps(a, b, c);
#else
return _mm256_add_ps(c, _mm256_mul_ps(a, b));
#endif
}
static inline
__m256
glmm256_fnmadd(__m256 a, __m256 b, __m256 c) {
#ifdef __FMA__
return _mm256_fnmadd_ps(a, b, c);
#else
return _mm256_sub_ps(c, _mm256_mul_ps(a, b));
#endif
}
static inline
__m256
glmm256_fmsub(__m256 a, __m256 b, __m256 c) {
#ifdef __FMA__
return _mm256_fmsub_ps(a, b, c);
#else
return _mm256_sub_ps(_mm256_mul_ps(a, b), c);
#endif
}
static inline
__m256
glmm256_fnmsub(__m256 a, __m256 b, __m256 c) {
#ifdef __FMA__
return _mm256_fmsub_ps(a, b, c);
#else
return _mm256_xor_ps(_mm256_sub_ps(_mm256_mul_ps(a, b), c),
_mm256_set1_ps(-0.0f));
#endif
}
#endif
#endif #endif
#endif /* cglm_simd_x86_h */ #endif /* cglm_simd_x86_h */

View File

@@ -237,9 +237,9 @@ glm_vec3_abs(vec3 v, vec3 dest) {
CGLM_INLINE CGLM_INLINE
void void
glm_vec3_fract(vec3 v, vec3 dest) { glm_vec3_fract(vec3 v, vec3 dest) {
dest[0] = fminf(v[0] - floorf(v[0]), 0x1.fffffep-1f); dest[0] = fminf(v[0] - floorf(v[0]), 0.999999940395355224609375f);
dest[1] = fminf(v[1] - floorf(v[1]), 0x1.fffffep-1f); dest[1] = fminf(v[1] - floorf(v[1]), 0.999999940395355224609375f);
dest[2] = fminf(v[2] - floorf(v[2]), 0x1.fffffep-1f); dest[2] = fminf(v[2] - floorf(v[2]), 0.999999940395355224609375f);
} }
/*! /*!

View File

@@ -224,10 +224,10 @@ glm_vec4_sign(vec4 v, vec4 dest) {
x0 = glmm_load(v); x0 = glmm_load(v);
x1 = _mm_set_ps(0.0f, 0.0f, 1.0f, -1.0f); x1 = _mm_set_ps(0.0f, 0.0f, 1.0f, -1.0f);
x2 = glmm_shuff1x(x1, 2); x2 = glmm_splat(x1, 2);
x3 = _mm_and_ps(_mm_cmpgt_ps(x0, x2), glmm_shuff1x(x1, 1)); x3 = _mm_and_ps(_mm_cmpgt_ps(x0, x2), glmm_splat(x1, 1));
x4 = _mm_and_ps(_mm_cmplt_ps(x0, x2), glmm_shuff1x(x1, 0)); x4 = _mm_and_ps(_mm_cmplt_ps(x0, x2), glmm_splat(x1, 0));
glmm_store(dest, _mm_or_ps(x3, x4)); glmm_store(dest, _mm_or_ps(x3, x4));
#else #else
@@ -268,10 +268,10 @@ glm_vec4_abs(vec4 v, vec4 dest) {
CGLM_INLINE CGLM_INLINE
void void
glm_vec4_fract(vec4 v, vec4 dest) { glm_vec4_fract(vec4 v, vec4 dest) {
dest[0] = fminf(v[0] - floorf(v[0]), 0x1.fffffep-1f); dest[0] = fminf(v[0] - floorf(v[0]), 0.999999940395355224609375f);
dest[1] = fminf(v[1] - floorf(v[1]), 0x1.fffffep-1f); dest[1] = fminf(v[1] - floorf(v[1]), 0.999999940395355224609375f);
dest[2] = fminf(v[2] - floorf(v[2]), 0x1.fffffep-1f); dest[2] = fminf(v[2] - floorf(v[2]), 0.999999940395355224609375f);
dest[3] = fminf(v[3] - floorf(v[3]), 0x1.fffffep-1f); dest[3] = fminf(v[3] - floorf(v[3]), 0.999999940395355224609375f);
} }
/*! /*!

View File

@@ -473,8 +473,8 @@ glm_vec4_scale_as(vec4 v, float s, vec4 dest) {
CGLM_INLINE CGLM_INLINE
void void
glm_vec4_div(vec4 a, vec4 b, vec4 dest) { glm_vec4_div(vec4 a, vec4 b, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined(CGLM_SIMD)
glmm_store(dest, _mm_div_ps(glmm_load(a), glmm_load(b))); glmm_store(dest, glmm_div(glmm_load(a), glmm_load(b)));
#else #else
dest[0] = a[0] / b[0]; dest[0] = a[0] / b[0];
dest[1] = a[1] / b[1]; dest[1] = a[1] / b[1];
@@ -568,14 +568,8 @@ glm_vec4_subadd(vec4 a, vec4 b, vec4 dest) {
CGLM_INLINE CGLM_INLINE
void void
glm_vec4_muladd(vec4 a, vec4 b, vec4 dest) { glm_vec4_muladd(vec4 a, vec4 b, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined(CGLM_SIMD)
glmm_store(dest, _mm_add_ps(glmm_load(dest), glmm_store(dest, glmm_fmadd(glmm_load(a), glmm_load(b), glmm_load(dest)));
_mm_mul_ps(glmm_load(a),
glmm_load(b))));
#elif defined(CGLM_NEON_FP)
vst1q_f32(dest, vaddq_f32(vld1q_f32(dest),
vmulq_f32(vld1q_f32(a),
vld1q_f32(b))));
#else #else
dest[0] += a[0] * b[0]; dest[0] += a[0] * b[0];
dest[1] += a[1] * b[1]; dest[1] += a[1] * b[1];
@@ -596,14 +590,8 @@ glm_vec4_muladd(vec4 a, vec4 b, vec4 dest) {
CGLM_INLINE CGLM_INLINE
void void
glm_vec4_muladds(vec4 a, float s, vec4 dest) { glm_vec4_muladds(vec4 a, float s, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined(CGLM_SIMD)
glmm_store(dest, _mm_add_ps(glmm_load(dest), glmm_store(dest, glmm_fmadd(glmm_load(a), glmm_set1(s), glmm_load(dest)));
_mm_mul_ps(glmm_load(a),
_mm_set1_ps(s))));
#elif defined(CGLM_NEON_FP)
vst1q_f32(dest, vaddq_f32(vld1q_f32(dest),
vmulq_f32(vld1q_f32(a),
vdupq_n_f32(s))));
#else #else
dest[0] += a[0] * s; dest[0] += a[0] * s;
dest[1] += a[1] * s; dest[1] += a[1] * s;

View File

@@ -10,6 +10,6 @@
#define CGLM_VERSION_MAJOR 0 #define CGLM_VERSION_MAJOR 0
#define CGLM_VERSION_MINOR 8 #define CGLM_VERSION_MINOR 8
#define CGLM_VERSION_PATCH 0 #define CGLM_VERSION_PATCH 2
#endif /* cglm_version_h */ #endif /* cglm_version_h */

View File

@@ -1,5 +1,5 @@
project('cglm', 'c', project('cglm', 'c',
version : '0.8.0', version : '0.8.2',
license : 'mit', license : 'mit',
default_options : [ default_options : [
'c_std=c11', 'c_std=c11',