Merge pull request #53 from recp/simd

simd: Make alignment OPTIONAL
Recep Aslantas
2018-05-10 13:57:31 +03:00
committed by GitHub
18 changed files with 340 additions and 251 deletions

.gitignore

@@ -61,3 +61,11 @@ docs/build/*
win/cglm_test_*
* copy.*
*.o
*.obj
*codeanalysis.*.xml
*codeanalysis.xml
*.lib
*.tlog
win/x64
win/x85
win/Debug


@@ -22,6 +22,8 @@ Complete documentation: http://cglm.readthedocs.io
- **[bugfix]** euler angles were implemented in reverse (extrinsic) order; this has been fixed and they are now intrinsic. Make sure that
  you have the latest version
- **[major change]** starting with v0.4.0, quaternions are stored as [x, y, z, w]; it was [w, x, y, z] in v0.3.5 and earlier versions
- **[api rename]** starting with v0.4.5, **glm_simd** functions are renamed to **glmm_**
- **[new option]** starting with v0.4.5, you can disable the alignment requirement; check the options in the docs

#### Note for C++ developers:
If you aren't aware of the original GLM library yet, you may also want to look at:


@@ -27,6 +27,13 @@ Alignment is Required:
**vec4** and **mat4** require 16 byte alignment because vec4 and mat4 operations are
vectorized by SIMD instructions (SSE/AVX).

**UPDATE:**
Starting with v0.4.5, cglm provides an option to disable the alignment requirement;
alignment is still required by default. Check the :doc:`opt` page for more details.
Alignment is also disabled by default for older MSVC versions. Alignment is now only
required in Visual Studio 2017 version 15.6+ when the CGLM_ALL_UNALIGNED macro is not defined.

Allocations:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*cglm* doesn't allocate any memory on the heap, so it doesn't provide an allocator.
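For illustration only (this snippet is not part of the diff): since cglm never allocates,
heap storage for aligned types is the caller's responsibility. A minimal sketch, assuming
a C11 libc with ``aligned_alloc``:

    #include <stdlib.h>
    #include <cglm/cglm.h>

    /* heap-allocate a mat4 honoring the 16-byte requirement (C11 aligned_alloc) */
    mat4 *alloc_mat4(void) {
      mat4 *m = aligned_alloc(16, sizeof(mat4)); /* sizeof(mat4) == 64, a multiple of 16 */
      if (m)
        glm_mat4_identity(*m);                   /* start from the identity matrix */
      return m;                                  /* caller releases it with free() */
    }

With CGLM_ALL_UNALIGNED defined, plain ``malloc`` would also be acceptable.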


@@ -40,6 +40,7 @@ Also currently only **float** type is supported for most operations.
   getting_started
   opengl
   api
   opt
   troubleshooting

Indices and tables

docs/source/opt.rst (new file)

@@ -0,0 +1,36 @@
.. default-domain:: C

Options
===============================================================================

A few options are provided via macros.

Alignment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

By default, cglm requires types to be aligned. Alignment requirements:

vec3:   8 byte
vec4:   16 byte
mat4:   16 byte
versor: 16 byte

Starting with **v0.4.5**, cglm provides an option to disable the alignment requirement.
To enable this option, define the **CGLM_ALL_UNALIGNED** macro before all cglm headers.
You can define it in Xcode, Visual Studio (or other IDEs), or you may prefer to define it
in your build system. If you use pre-compiled versions, then you have to compile cglm
with the **CGLM_ALL_UNALIGNED** macro.

**VERY VERY IMPORTANT:** If you use cglm in multiple projects and those projects depend
on each other, then *ALWAYS* or *NEVER* use the **CGLM_ALL_UNALIGNED** macro across the
linked projects, unless you know exactly what you are doing. A cglm header included via
'project A' may force types to be aligned while another cglm header included via
'project B' may not require alignment. In that case cglm functions will read from and
write to **INVALID MEMORY LOCATIONs**.

ALWAYS USE THE SAME CONFIGURATION / OPTIONS for **cglm** if you have multiple projects.
For instance, if you set CGLM_ALL_UNALIGNED in one project, then set it in the other projects too.
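For illustration only (not part of this diff), a minimal sketch of enabling the option in
source, assuming cglm is included via its main ``cglm/cglm.h`` header:

    /* must come before ANY cglm header (or pass -DCGLM_ALL_UNALIGNED to the compiler) */
    #define CGLM_ALL_UNALIGNED
    #include <cglm/cglm.h>

    /* the API is unchanged; only type alignment and the load/store paths differ */
    void add4(vec4 a, vec4 b, vec4 out) {
      glm_vec4_add(a, b, out);
    }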


@@ -43,6 +43,9 @@ you may do it yourself.
**This MSVC issue is still in TODOs.**

**UPDATE:** Starting with v0.4.5, cglm provides an option to disable the alignment requirement.
Alignment is also disabled by default for older MSVC versions. Alignment is now only required
in Visual Studio 2017 version 15.6+ if the CGLM_ALL_UNALIGNED macro is not defined.
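For illustration only (not part of this change), a quick way to check which alignment your
build actually applies, assuming a C11 compiler:

    #include <stdio.h>
    #include <cglm/cglm.h>

    int main(void) {
      /* prints 16 when the alignment requirement is active; under CGLM_ALL_UNALIGNED
         it falls back to the natural float alignment (typically 4) */
      printf("vec4 alignment: %zu\n", _Alignof(vec4));
      printf("mat4 alignment: %zu\n", _Alignof(mat4));
      return 0;
    }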
Crashes, Invalid Memory Access:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


@@ -58,19 +58,19 @@ glm_translate_to(mat4 m, vec3 v, mat4 dest) {
mat4 t = GLM_MAT4_IDENTITY_INIT; mat4 t = GLM_MAT4_IDENTITY_INIT;
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest[3], glmm_store(dest[3],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_load_ps(t[0]), _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_load(t[0]),
_mm_set1_ps(v[0])), _mm_set1_ps(v[0])),
_mm_mul_ps(_mm_load_ps(t[1]), _mm_mul_ps(glmm_load(t[1]),
_mm_set1_ps(v[1]))), _mm_set1_ps(v[1]))),
_mm_add_ps(_mm_mul_ps(_mm_load_ps(t[2]), _mm_add_ps(_mm_mul_ps(glmm_load(t[2]),
_mm_set1_ps(v[2])), _mm_set1_ps(v[2])),
_mm_load_ps(t[3])))) glmm_load(t[3]))))
; ;
_mm_store_ps(dest[0], _mm_load_ps(m[0])); glmm_store(dest[0], glmm_load(m[0]));
_mm_store_ps(dest[1], _mm_load_ps(m[1])); glmm_store(dest[1], glmm_load(m[1]));
_mm_store_ps(dest[2], _mm_load_ps(m[2])); glmm_store(dest[2], glmm_load(m[2]));
#else #else
vec4 v1, v2, v3; vec4 v1, v2, v3;
@@ -97,14 +97,14 @@ CGLM_INLINE
void void
glm_translate(mat4 m, vec3 v) { glm_translate(mat4 m, vec3 v) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(m[3], glmm_store(m[3],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_load_ps(m[0]), _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_load(m[0]),
_mm_set1_ps(v[0])), _mm_set1_ps(v[0])),
_mm_mul_ps(_mm_load_ps(m[1]), _mm_mul_ps(glmm_load(m[1]),
_mm_set1_ps(v[1]))), _mm_set1_ps(v[1]))),
_mm_add_ps(_mm_mul_ps(_mm_load_ps(m[2]), _mm_add_ps(_mm_mul_ps(glmm_load(m[2]),
_mm_set1_ps(v[2])), _mm_set1_ps(v[2])),
_mm_load_ps(m[3])))) glmm_load(m[3]))))
; ;
#else #else
vec4 v1, v2, v3; vec4 v1, v2, v3;
@@ -129,10 +129,10 @@ CGLM_INLINE
void void
glm_translate_x(mat4 m, float x) { glm_translate_x(mat4 m, float x) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(m[3], glmm_store(m[3],
_mm_add_ps(_mm_mul_ps(_mm_load_ps(m[0]), _mm_add_ps(_mm_mul_ps(glmm_load(m[0]),
_mm_set1_ps(x)), _mm_set1_ps(x)),
_mm_load_ps(m[3]))) glmm_load(m[3])))
; ;
#else #else
vec4 v1; vec4 v1;
@@ -151,10 +151,10 @@ CGLM_INLINE
void void
glm_translate_y(mat4 m, float y) { glm_translate_y(mat4 m, float y) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(m[3], glmm_store(m[3],
_mm_add_ps(_mm_mul_ps(_mm_load_ps(m[1]), _mm_add_ps(_mm_mul_ps(glmm_load(m[1]),
_mm_set1_ps(y)), _mm_set1_ps(y)),
_mm_load_ps(m[3]))) glmm_load(m[3])))
; ;
#else #else
vec4 v1; vec4 v1;
@@ -173,10 +173,10 @@ CGLM_INLINE
void void
glm_translate_z(mat4 m, float z) { glm_translate_z(mat4 m, float z) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(m[3], glmm_store(m[3],
_mm_add_ps(_mm_mul_ps(_mm_load_ps(m[2]), _mm_add_ps(_mm_mul_ps(glmm_load(m[2]),
_mm_set1_ps(z)), _mm_set1_ps(z)),
_mm_load_ps(m[3]))) glmm_load(m[3])))
; ;
#else #else
vec4 v1; vec4 v1;


@@ -110,13 +110,13 @@ CGLM_INLINE
void void
glm_mat4_copy(mat4 mat, mat4 dest) { glm_mat4_copy(mat4 mat, mat4 dest) {
#ifdef __AVX__ #ifdef __AVX__
_mm256_store_ps(dest[0], _mm256_load_ps(mat[0])); glmm_store256(dest[0], glmm_load256(mat[0]));
_mm256_store_ps(dest[2], _mm256_load_ps(mat[2])); glmm_store256(dest[2], glmm_load256(mat[2]));
#elif defined( __SSE__ ) || defined( __SSE2__ ) #elif defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest[0], _mm_load_ps(mat[0])); glmm_store(dest[0], glmm_load(mat[0]));
_mm_store_ps(dest[1], _mm_load_ps(mat[1])); glmm_store(dest[1], glmm_load(mat[1]));
_mm_store_ps(dest[2], _mm_load_ps(mat[2])); glmm_store(dest[2], glmm_load(mat[2]));
_mm_store_ps(dest[3], _mm_load_ps(mat[3])); glmm_store(dest[3], glmm_load(mat[3]));
#else #else
glm_mat4_ucopy(mat, dest); glm_mat4_ucopy(mat, dest);
#endif #endif


@@ -198,8 +198,8 @@ glm_quat_normalize_to(versor q, versor dest) {
__m128 xdot, x0; __m128 xdot, x0;
float dot; float dot;
x0 = _mm_load_ps(q); x0 = glmm_load(q);
xdot = glm_simd_dot(x0, x0); xdot = glmm_dot(x0, x0);
dot = _mm_cvtss_f32(xdot); dot = _mm_cvtss_f32(xdot);
if (dot <= 0.0f) { if (dot <= 0.0f) {
@@ -207,7 +207,7 @@ glm_quat_normalize_to(versor q, versor dest) {
return; return;
} }
_mm_store_ps(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot))); glmm_store(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot)));
#else #else
float dot; float dot;


@@ -21,11 +21,11 @@ glm_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
__m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
y0 = _mm256_load_ps(m2[0]); /* h g f e d c b a */ y0 = glmm_load256(m2[0]); /* h g f e d c b a */
y1 = _mm256_load_ps(m2[2]); /* p o n m l k j i */ y1 = glmm_load256(m2[2]); /* p o n m l k j i */
y2 = _mm256_load_ps(m1[0]); /* h g f e d c b a */ y2 = glmm_load256(m1[0]); /* h g f e d c b a */
y3 = _mm256_load_ps(m1[2]); /* p o n m l k j i */ y3 = glmm_load256(m1[2]); /* p o n m l k j i */
y4 = _mm256_permute2f128_ps(y2, y2, 0b00000011); /* d c b a h g f e */ y4 = _mm256_permute2f128_ps(y2, y2, 0b00000011); /* d c b a h g f e */
y5 = _mm256_permute2f128_ps(y3, y3, 0b00000000); /* l k j i l k j i */ y5 = _mm256_permute2f128_ps(y3, y3, 0b00000000); /* l k j i l k j i */
@@ -37,10 +37,10 @@ glm_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
y6 = _mm256_permutevar_ps(y0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); y6 = _mm256_permutevar_ps(y0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1)); y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
_mm256_store_ps(dest[0], glmm_store256(dest[0],
_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
_mm256_mul_ps(y4, y8)), _mm256_mul_ps(y4, y8)),
_mm256_mul_ps(y5, y7))); _mm256_mul_ps(y5, y7)));
/* n n n n i i i i */ /* n n n n i i i i */
@@ -52,11 +52,11 @@ glm_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1)); y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3)); y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
_mm256_store_ps(dest[2], glmm_store256(dest[2],
_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
_mm256_mul_ps(y3, y7)), _mm256_mul_ps(y3, y7)),
_mm256_add_ps(_mm256_mul_ps(y4, y8), _mm256_add_ps(_mm256_mul_ps(y4, y8),
_mm256_mul_ps(y5, y9)))); _mm256_mul_ps(y5, y9))));
} }
#endif #endif


@@ -21,11 +21,11 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
__m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
y0 = _mm256_load_ps(m2[0]); /* h g f e d c b a */ y0 = glmm_load256(m2[0]); /* h g f e d c b a */
y1 = _mm256_load_ps(m2[2]); /* p o n m l k j i */ y1 = glmm_load256(m2[2]); /* p o n m l k j i */
y2 = _mm256_load_ps(m1[0]); /* h g f e d c b a */ y2 = glmm_load256(m1[0]); /* h g f e d c b a */
y3 = _mm256_load_ps(m1[2]); /* p o n m l k j i */ y3 = glmm_load256(m1[2]); /* p o n m l k j i */
y4 = _mm256_permute2f128_ps(y2, y2, 0b00000011); /* d c b a h g f e */ y4 = _mm256_permute2f128_ps(y2, y2, 0b00000011); /* d c b a h g f e */
y5 = _mm256_permute2f128_ps(y3, y3, 0b00000011); /* l k j i p o n m */ y5 = _mm256_permute2f128_ps(y3, y3, 0b00000011); /* l k j i p o n m */
@@ -39,11 +39,11 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1)); y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
y9 = _mm256_permutevar_ps(y0, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3)); y9 = _mm256_permutevar_ps(y0, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
_mm256_store_ps(dest[0], glmm_store256(dest[0],
_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
_mm256_mul_ps(y3, y7)), _mm256_mul_ps(y3, y7)),
_mm256_add_ps(_mm256_mul_ps(y4, y8), _mm256_add_ps(_mm256_mul_ps(y4, y8),
_mm256_mul_ps(y5, y9)))); _mm256_mul_ps(y5, y9))));
/* n n n n i i i i */ /* n n n n i i i i */
/* p p p p k k k k */ /* p p p p k k k k */
@@ -54,11 +54,11 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1)); y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3)); y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
_mm256_store_ps(dest[2], glmm_store256(dest[2],
_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6), _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
_mm256_mul_ps(y3, y7)), _mm256_mul_ps(y3, y7)),
_mm256_add_ps(_mm256_mul_ps(y4, y8), _mm256_add_ps(_mm256_mul_ps(y4, y8),
_mm256_mul_ps(y5, y9)))); _mm256_mul_ps(y5, y9))));
} }
#endif #endif


@@ -18,6 +18,10 @@
# define __SSE__ # define __SSE__
# endif # endif
# endif # endif
/* do not use alignment for older visual studio versions */
# if _MSC_VER < 1913 /* Visual Studio 2017 version 15.6 */
# define CGLM_ALL_UNALIGNED
# endif
#endif #endif
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
@@ -35,24 +39,24 @@
_mm_shuffle1_ps(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)), \ _mm_shuffle1_ps(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)), \
z1, y1, x1, w1) z1, y1, x1, w1)
CGLM_INLINE static inline
__m128 __m128
glm_simd_dot(__m128 a, __m128 b) { glmm_dot(__m128 a, __m128 b) {
__m128 x0; __m128 x0;
x0 = _mm_mul_ps(a, b); x0 = _mm_mul_ps(a, b);
x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 3, 2)); x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 3, 2));
return _mm_add_ps(x0, _mm_shuffle1_ps(x0, 0, 1, 0, 1)); return _mm_add_ps(x0, _mm_shuffle1_ps(x0, 0, 1, 0, 1));
} }
CGLM_INLINE static inline
__m128 __m128
glm_simd_norm(__m128 a) { glmm_norm(__m128 a) {
return _mm_sqrt_ps(glm_simd_dot(a, a)); return _mm_sqrt_ps(glmm_dot(a, a));
} }
static inline static inline
__m128 __m128
glm_simd_load_v3(vec3 v) { glmm_load3(float v[3]) {
__m128i xy; __m128i xy;
__m128 z; __m128 z;
@@ -64,11 +68,19 @@ glm_simd_load_v3(vec3 v) {
static inline static inline
void void
glm_simd_store_v3(__m128 vx, vec3 v) { glmm_store3(__m128 vx, float v[3]) {
_mm_storel_pi((__m64 *)&v[0], vx); _mm_storel_pi((__m64 *)&v[0], vx);
_mm_store_ss(&v[2], _mm_shuffle1_ps(vx, 2, 2, 2, 2)); _mm_store_ss(&v[2], _mm_shuffle1_ps(vx, 2, 2, 2, 2));
} }
#ifdef CGLM_ALL_UNALIGNED
# define glmm_load(p) _mm_loadu_ps(p)
# define glmm_store(p, a) _mm_storeu_ps(p, a)
#else
# define glmm_load(p) _mm_load_ps(p)
# define glmm_store(p, a) _mm_store_ps(p, a)
#endif
#endif #endif
/* x86, x64 */ /* x86, x64 */
@@ -78,6 +90,15 @@ glm_simd_store_v3(__m128 vx, vec3 v) {
#ifdef __AVX__ #ifdef __AVX__
# define CGLM_AVX_FP 1 # define CGLM_AVX_FP 1
#ifdef CGLM_ALL_UNALIGNED
# define glmm_load256(p) _mm256_loadu_ps(p)
# define glmm_store256(p, a) _mm256_storeu_ps(p, a)
#else
# define glmm_load256(p) _mm256_load_ps(p)
# define glmm_store256(p, a) _mm256_store_ps(p, a)
#endif
#endif #endif
/* ARM Neon */ /* ARM Neon */
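Not part of the diff: a minimal usage sketch of the renamed glmm_ helpers defined above
(glm_simd_dot is now glmm_dot, glm_simd_load_v3 is now glmm_load3, and so on), assuming
SSE is enabled. When CGLM_ALL_UNALIGNED is defined, glmm_load/glmm_store expand to the
unaligned intrinsics, so calling code stays the same either way.

    #include <cglm/cglm.h>

    /* dot product of two vec4s via the glmm_ wrappers */
    float dot4(vec4 a, vec4 b) {
    #if defined( __SSE__ ) || defined( __SSE2__ )
      /* glmm_dot broadcasts the dot product; extract lane 0 as a scalar */
      return _mm_cvtss_f32(glmm_dot(glmm_load(a), glmm_load(b)));
    #else
      return glm_vec4_dot(a, b);
    #endif
    }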


@@ -18,35 +18,35 @@ glm_mul_sse2(mat4 m1, mat4 m2, mat4 dest) {
/* D = R * L (Column-Major) */ /* D = R * L (Column-Major) */
__m128 l0, l1, l2, l3, r; __m128 l0, l1, l2, l3, r;
l0 = _mm_load_ps(m1[0]); l0 = glmm_load(m1[0]);
l1 = _mm_load_ps(m1[1]); l1 = glmm_load(m1[1]);
l2 = _mm_load_ps(m1[2]); l2 = glmm_load(m1[2]);
l3 = _mm_load_ps(m1[3]); l3 = glmm_load(m1[3]);
r = _mm_load_ps(m2[0]); r = glmm_load(m2[0]);
_mm_store_ps(dest[0], glmm_store(dest[0],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
_mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
r = _mm_load_ps(m2[1]); r = glmm_load(m2[1]);
_mm_store_ps(dest[1], glmm_store(dest[1],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
_mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
r = _mm_load_ps(m2[2]); r = glmm_load(m2[2]);
_mm_store_ps(dest[2], glmm_store(dest[2],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
_mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
r = _mm_load_ps(m2[3]); r = glmm_load(m2[3]);
_mm_store_ps(dest[3], glmm_store(dest[3],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
_mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
_mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
} }
CGLM_INLINE CGLM_INLINE
@@ -55,30 +55,30 @@ glm_mul_rot_sse2(mat4 m1, mat4 m2, mat4 dest) {
/* D = R * L (Column-Major) */ /* D = R * L (Column-Major) */
__m128 l0, l1, l2, l3, r; __m128 l0, l1, l2, l3, r;
l0 = _mm_load_ps(m1[0]); l0 = glmm_load(m1[0]);
l1 = _mm_load_ps(m1[1]); l1 = glmm_load(m1[1]);
l2 = _mm_load_ps(m1[2]); l2 = glmm_load(m1[2]);
l3 = _mm_load_ps(m1[3]); l3 = glmm_load(m1[3]);
r = _mm_load_ps(m2[0]); r = glmm_load(m2[0]);
_mm_store_ps(dest[0], glmm_store(dest[0],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
_mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
r = _mm_load_ps(m2[1]); r = glmm_load(m2[1]);
_mm_store_ps(dest[1], glmm_store(dest[1],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
_mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
r = _mm_load_ps(m2[2]); r = glmm_load(m2[2]);
_mm_store_ps(dest[2], glmm_store(dest[2],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
_mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2))); _mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2)));
_mm_store_ps(dest[3], l3); glmm_store(dest[3], l3);
} }
CGLM_INLINE CGLM_INLINE
@@ -86,11 +86,11 @@ void
glm_inv_tr_sse2(mat4 mat) { glm_inv_tr_sse2(mat4 mat) {
__m128 r0, r1, r2, r3, x0, x1; __m128 r0, r1, r2, r3, x0, x1;
r0 = _mm_load_ps(mat[0]); r0 = glmm_load(mat[0]);
r1 = _mm_load_ps(mat[1]); r1 = glmm_load(mat[1]);
r2 = _mm_load_ps(mat[2]); r2 = glmm_load(mat[2]);
r3 = _mm_load_ps(mat[3]); r3 = glmm_load(mat[3]);
x1 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f); x1 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
_MM_TRANSPOSE4_PS(r0, r1, r2, x1); _MM_TRANSPOSE4_PS(r0, r1, r2, x1);
@@ -101,10 +101,10 @@ glm_inv_tr_sse2(mat4 mat) {
x0 = _mm_add_ps(x0, x1); x0 = _mm_add_ps(x0, x1);
_mm_store_ps(mat[0], r0); glmm_store(mat[0], r0);
_mm_store_ps(mat[1], r1); glmm_store(mat[1], r1);
_mm_store_ps(mat[2], r2); glmm_store(mat[2], r2);
_mm_store_ps(mat[3], x0); glmm_store(mat[3], x0);
} }
#endif #endif


@@ -20,10 +20,10 @@ glm_mat4_scale_sse2(mat4 m, float s){
__m128 x0; __m128 x0;
x0 = _mm_set1_ps(s); x0 = _mm_set1_ps(s);
_mm_store_ps(m[0], _mm_mul_ps(_mm_load_ps(m[0]), x0)); glmm_store(m[0], _mm_mul_ps(glmm_load(m[0]), x0));
_mm_store_ps(m[1], _mm_mul_ps(_mm_load_ps(m[1]), x0)); glmm_store(m[1], _mm_mul_ps(glmm_load(m[1]), x0));
_mm_store_ps(m[2], _mm_mul_ps(_mm_load_ps(m[2]), x0)); glmm_store(m[2], _mm_mul_ps(glmm_load(m[2]), x0));
_mm_store_ps(m[3], _mm_mul_ps(_mm_load_ps(m[3]), x0)); glmm_store(m[3], _mm_mul_ps(glmm_load(m[3]), x0));
} }
CGLM_INLINE CGLM_INLINE
@@ -31,17 +31,17 @@ void
glm_mat4_transp_sse2(mat4 m, mat4 dest){ glm_mat4_transp_sse2(mat4 m, mat4 dest){
__m128 r0, r1, r2, r3; __m128 r0, r1, r2, r3;
r0 = _mm_load_ps(m[0]); r0 = glmm_load(m[0]);
r1 = _mm_load_ps(m[1]); r1 = glmm_load(m[1]);
r2 = _mm_load_ps(m[2]); r2 = glmm_load(m[2]);
r3 = _mm_load_ps(m[3]); r3 = glmm_load(m[3]);
_MM_TRANSPOSE4_PS(r0, r1, r2, r3); _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
_mm_store_ps(dest[0], r0); glmm_store(dest[0], r0);
_mm_store_ps(dest[1], r1); glmm_store(dest[1], r1);
_mm_store_ps(dest[2], r2); glmm_store(dest[2], r2);
_mm_store_ps(dest[3], r3); glmm_store(dest[3], r3);
} }
CGLM_INLINE CGLM_INLINE
@@ -51,36 +51,36 @@ glm_mat4_mul_sse2(mat4 m1, mat4 m2, mat4 dest) {
__m128 l0, l1, l2, l3, r; __m128 l0, l1, l2, l3, r;
l0 = _mm_load_ps(m1[0]); l0 = glmm_load(m1[0]);
l1 = _mm_load_ps(m1[1]); l1 = glmm_load(m1[1]);
l2 = _mm_load_ps(m1[2]); l2 = glmm_load(m1[2]);
l3 = _mm_load_ps(m1[3]); l3 = glmm_load(m1[3]);
r = _mm_load_ps(m2[0]); r = glmm_load(m2[0]);
_mm_store_ps(dest[0], glmm_store(dest[0],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
_mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
_mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
r = _mm_load_ps(m2[1]); r = glmm_load(m2[1]);
_mm_store_ps(dest[1], glmm_store(dest[1],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
_mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
_mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
r = _mm_load_ps(m2[2]); r = glmm_load(m2[2]);
_mm_store_ps(dest[2], glmm_store(dest[2],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
_mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
_mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
r = _mm_load_ps(m2[3]); r = glmm_load(m2[3]);
_mm_store_ps(dest[3], glmm_store(dest[3],
_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0), _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 0), l0),
_mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)), _mm_mul_ps(_mm_shuffle1_ps1(r, 1), l1)),
_mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2), _mm_add_ps(_mm_mul_ps(_mm_shuffle1_ps1(r, 2), l2),
_mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3)))); _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
} }
CGLM_INLINE CGLM_INLINE
@@ -88,18 +88,18 @@ void
glm_mat4_mulv_sse2(mat4 m, vec4 v, vec4 dest) { glm_mat4_mulv_sse2(mat4 m, vec4 v, vec4 dest) {
__m128 x0, x1, x2; __m128 x0, x1, x2;
x0 = _mm_load_ps(v); x0 = glmm_load(v);
x1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[0]), x1 = _mm_add_ps(_mm_mul_ps(glmm_load(m[0]),
_mm_shuffle1_ps1(x0, 0)), _mm_shuffle1_ps1(x0, 0)),
_mm_mul_ps(_mm_load_ps(m[1]), _mm_mul_ps(glmm_load(m[1]),
_mm_shuffle1_ps1(x0, 1))); _mm_shuffle1_ps1(x0, 1)));
x2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[2]), x2 = _mm_add_ps(_mm_mul_ps(glmm_load(m[2]),
_mm_shuffle1_ps1(x0, 2)), _mm_shuffle1_ps1(x0, 2)),
_mm_mul_ps(_mm_load_ps(m[3]), _mm_mul_ps(glmm_load(m[3]),
_mm_shuffle1_ps1(x0, 3))); _mm_shuffle1_ps1(x0, 3)));
_mm_store_ps(dest, _mm_add_ps(x1, x2)); glmm_store(dest, _mm_add_ps(x1, x2));
} }
CGLM_INLINE CGLM_INLINE
@@ -108,10 +108,10 @@ glm_mat4_det_sse2(mat4 mat) {
__m128 r0, r1, r2, r3, x0, x1, x2; __m128 r0, r1, r2, r3, x0, x1, x2;
/* 127 <- 0, [square] det(A) = det(At) */ /* 127 <- 0, [square] det(A) = det(At) */
r0 = _mm_load_ps(mat[0]); /* d c b a */ r0 = glmm_load(mat[0]); /* d c b a */
r1 = _mm_load_ps(mat[1]); /* h g f e */ r1 = glmm_load(mat[1]); /* h g f e */
r2 = _mm_load_ps(mat[2]); /* l k j i */ r2 = glmm_load(mat[2]); /* l k j i */
r3 = _mm_load_ps(mat[3]); /* p o n m */ r3 = glmm_load(mat[3]); /* p o n m */
/* /*
t[1] = j * p - n * l; t[1] = j * p - n * l;
@@ -166,10 +166,10 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
x0, x1, x2, x3, x4, x5, x6, x7; x0, x1, x2, x3, x4, x5, x6, x7;
/* 127 <- 0 */ /* 127 <- 0 */
r0 = _mm_load_ps(mat[0]); /* d c b a */ r0 = glmm_load(mat[0]); /* d c b a */
r1 = _mm_load_ps(mat[1]); /* h g f e */ r1 = glmm_load(mat[1]); /* h g f e */
r2 = _mm_load_ps(mat[2]); /* l k j i */ r2 = glmm_load(mat[2]); /* l k j i */
r3 = _mm_load_ps(mat[3]); /* p o n m */ r3 = glmm_load(mat[3]); /* p o n m */
x0 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2)); /* p o l k */ x0 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2)); /* p o l k */
x1 = _mm_shuffle1_ps(x0, 1, 3, 3, 3); /* l p p p */ x1 = _mm_shuffle1_ps(x0, 1, 3, 3, 3); /* l p p p */
@@ -275,10 +275,10 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 0, 1)); x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 0, 1));
x0 = _mm_rcp_ps(x0); x0 = _mm_rcp_ps(x0);
_mm_store_ps(dest[0], _mm_mul_ps(v0, x0)); glmm_store(dest[0], _mm_mul_ps(v0, x0));
_mm_store_ps(dest[1], _mm_mul_ps(v1, x0)); glmm_store(dest[1], _mm_mul_ps(v1, x0));
_mm_store_ps(dest[2], _mm_mul_ps(v2, x0)); glmm_store(dest[2], _mm_mul_ps(v2, x0));
_mm_store_ps(dest[3], _mm_mul_ps(v3, x0)); glmm_store(dest[3], _mm_mul_ps(v3, x0));
} }
CGLM_INLINE CGLM_INLINE
@@ -290,10 +290,10 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
x0, x1, x2, x3, x4, x5, x6, x7; x0, x1, x2, x3, x4, x5, x6, x7;
/* 127 <- 0 */ /* 127 <- 0 */
r0 = _mm_load_ps(mat[0]); /* d c b a */ r0 = glmm_load(mat[0]); /* d c b a */
r1 = _mm_load_ps(mat[1]); /* h g f e */ r1 = glmm_load(mat[1]); /* h g f e */
r2 = _mm_load_ps(mat[2]); /* l k j i */ r2 = glmm_load(mat[2]); /* l k j i */
r3 = _mm_load_ps(mat[3]); /* p o n m */ r3 = glmm_load(mat[3]); /* p o n m */
x0 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2)); /* p o l k */ x0 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2)); /* p o l k */
x1 = _mm_shuffle1_ps(x0, 1, 3, 3, 3); /* l p p p */ x1 = _mm_shuffle1_ps(x0, 1, 3, 3, 3); /* l p p p */
@@ -399,10 +399,10 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 0, 1)); x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 0, 1));
x0 = _mm_div_ps(_mm_set1_ps(1.0f), x0); x0 = _mm_div_ps(_mm_set1_ps(1.0f), x0);
_mm_store_ps(dest[0], _mm_mul_ps(v0, x0)); glmm_store(dest[0], _mm_mul_ps(v0, x0));
_mm_store_ps(dest[1], _mm_mul_ps(v1, x0)); glmm_store(dest[1], _mm_mul_ps(v1, x0));
_mm_store_ps(dest[2], _mm_mul_ps(v2, x0)); glmm_store(dest[2], _mm_mul_ps(v2, x0));
_mm_store_ps(dest[3], _mm_mul_ps(v3, x0)); glmm_store(dest[3], _mm_mul_ps(v3, x0));
} }
#endif #endif


@@ -24,8 +24,8 @@ glm_quat_mul_sse2(versor p, versor q, versor dest) {
__m128 xp, xq, x0, r; __m128 xp, xq, x0, r;
xp = _mm_load_ps(p); /* 3 2 1 0 */ xp = glmm_load(p); /* 3 2 1 0 */
xq = _mm_load_ps(q); xq = glmm_load(q);
r = _mm_mul_ps(_mm_shuffle1_ps1(xp, 3), xq); r = _mm_mul_ps(_mm_shuffle1_ps1(xp, 3), xq);
@@ -38,7 +38,7 @@ glm_quat_mul_sse2(versor p, versor q, versor dest) {
x0 = _mm_xor_ps(_mm_shuffle1_ps1(xp, 2), _mm_set_ps(-0.f, 0.f, 0.f, -0.f)); x0 = _mm_xor_ps(_mm_shuffle1_ps1(xp, 2), _mm_set_ps(-0.f, 0.f, 0.f, -0.f));
r = _mm_add_ps(r, _mm_mul_ps(x0, _mm_shuffle1_ps(xq, 2, 3, 0, 1))); r = _mm_add_ps(r, _mm_mul_ps(x0, _mm_shuffle1_ps(xq, 2, 3, 0, 1)));
_mm_store_ps(dest, r); glmm_store(dest, r);
} }


@@ -9,23 +9,35 @@
 #define cglm_types_h

 #if defined(_MSC_VER)
-# define CGLM_ALIGN(X) /* __declspec(align(X)) */
+/* do not use alignment for older visual studio versions */
+#if _MSC_VER < 1913 /* Visual Studio 2017 version 15.6 */
+# define CGLM_ALL_UNALIGNED
+# define CGLM_ALIGN(X) /* no alignment */
+#else
+# define CGLM_ALIGN(X) __declspec(align(X))
+#endif
 #else
 # define CGLM_ALIGN(X) __attribute((aligned(X)))
 #endif

+#ifndef CGLM_ALL_UNALIGNED
+# define CGLM_ALIGN_IF(X) CGLM_ALIGN(X)
+#else
+# define CGLM_ALIGN_IF(X) /* no alignment */
+#endif
+
 typedef float vec2[2];
-typedef CGLM_ALIGN(8) float vec3[3];
+typedef CGLM_ALIGN_IF(8) float vec3[3];
 typedef int ivec3[3];
-typedef CGLM_ALIGN(16) float vec4[4];
+typedef CGLM_ALIGN_IF(16) float vec4[4];

 typedef vec3 mat3[3];
-typedef CGLM_ALIGN(16) vec4 mat4[4];
+typedef CGLM_ALIGN_IF(16) vec4 mat4[4];

 typedef vec4 versor;

-#define CGLM_PI (float)M_PI
-#define CGLM_PI_2 (float)M_PI_2
-#define CGLM_PI_4 (float)M_PI_4
+#define CGLM_PI ((float)M_PI)
+#define CGLM_PI_2 ((float)M_PI_2)
+#define CGLM_PI_4 ((float)M_PI_4)

 #endif /* cglm_types_h */
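Not part of the diff: a short sketch of what CGLM_ALIGN_IF means for user code. Embedded
cglm members inherit the configured alignment, so a struct laid out by a default build must
not be shared with code compiled with CGLM_ALL_UNALIGNED (the Transform struct below is a
hypothetical example).

    #include <cglm/cglm.h>

    /* member alignment follows the chosen cglm configuration */
    typedef struct Transform {
      vec3 position; /* CGLM_ALIGN_IF(8):  8-byte aligned by default  */
      vec4 rotation; /* CGLM_ALIGN_IF(16): 16-byte aligned by default */
      mat4 world;    /* CGLM_ALIGN_IF(16): 16-byte aligned by default */
    } Transform;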


@@ -42,7 +42,7 @@ CGLM_INLINE
void void
glm_vec4_mulv(vec4 a, vec4 b, vec4 d) { glm_vec4_mulv(vec4 a, vec4 b, vec4 d) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(d, _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b))); glmm_store(d, _mm_mul_ps(glmm_load(a), glmm_load(b)));
#else #else
d[0] = a[0] * b[0]; d[0] = a[0] * b[0];
d[1] = a[1] * b[1]; d[1] = a[1] * b[1];
@@ -61,7 +61,7 @@ CGLM_INLINE
void void
glm_vec4_broadcast(float val, vec4 d) { glm_vec4_broadcast(float val, vec4 d) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(d, _mm_set1_ps(val)); glmm_store(d, _mm_set1_ps(val));
#else #else
d[0] = d[1] = d[2] = d[3] = val; d[0] = d[1] = d[2] = d[3] = val;
#endif #endif
@@ -223,14 +223,14 @@ glm_vec4_sign(vec4 v, vec4 dest) {
#if defined( __SSE2__ ) || defined( __SSE2__ ) #if defined( __SSE2__ ) || defined( __SSE2__ )
__m128 x0, x1, x2, x3, x4; __m128 x0, x1, x2, x3, x4;
x0 = _mm_load_ps(v); x0 = glmm_load(v);
x1 = _mm_set_ps(0.0f, 0.0f, 1.0f, -1.0f); x1 = _mm_set_ps(0.0f, 0.0f, 1.0f, -1.0f);
x2 = _mm_shuffle1_ps1(x1, 2); x2 = _mm_shuffle1_ps1(x1, 2);
x3 = _mm_and_ps(_mm_cmpgt_ps(x0, x2), _mm_shuffle1_ps1(x1, 1)); x3 = _mm_and_ps(_mm_cmpgt_ps(x0, x2), _mm_shuffle1_ps1(x1, 1));
x4 = _mm_and_ps(_mm_cmplt_ps(x0, x2), _mm_shuffle1_ps1(x1, 0)); x4 = _mm_and_ps(_mm_cmplt_ps(x0, x2), _mm_shuffle1_ps1(x1, 0));
_mm_store_ps(dest, _mm_or_ps(x3, x4)); glmm_store(dest, _mm_or_ps(x3, x4));
#else #else
dest[0] = glm_signf(v[0]); dest[0] = glm_signf(v[0]);
dest[1] = glm_signf(v[1]); dest[1] = glm_signf(v[1]);
@@ -249,7 +249,7 @@ CGLM_INLINE
void void
glm_vec4_sqrt(vec4 v, vec4 dest) { glm_vec4_sqrt(vec4 v, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest, _mm_sqrt_ps(_mm_load_ps(v))); glmm_store(dest, _mm_sqrt_ps(glmm_load(v)));
#else #else
dest[0] = sqrtf(v[0]); dest[0] = sqrtf(v[0]);
dest[1] = sqrtf(v[1]); dest[1] = sqrtf(v[1]);


@@ -111,7 +111,7 @@ CGLM_INLINE
void void
glm_vec4_copy(vec4 v, vec4 dest) { glm_vec4_copy(vec4 v, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest, _mm_load_ps(v)); glmm_store(dest, glmm_load(v));
#else #else
dest[0] = v[0]; dest[0] = v[0];
dest[1] = v[1]; dest[1] = v[1];
@@ -129,7 +129,7 @@ CGLM_INLINE
void void
glm_vec4_zero(vec4 v) { glm_vec4_zero(vec4 v) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(v, _mm_setzero_ps()); glmm_store(v, _mm_setzero_ps());
#else #else
v[0] = 0.0f; v[0] = 0.0f;
v[1] = 0.0f; v[1] = 0.0f;
@@ -147,7 +147,7 @@ CGLM_INLINE
void void
glm_vec4_one(vec4 v) { glm_vec4_one(vec4 v) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(v, _mm_set1_ps(1.0f)); glmm_store(v, _mm_set1_ps(1.0f));
#else #else
v[0] = 1.0f; v[0] = 1.0f;
v[1] = 1.0f; v[1] = 1.0f;
@@ -169,7 +169,7 @@ float
glm_vec4_dot(vec4 a, vec4 b) { glm_vec4_dot(vec4 a, vec4 b) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
__m128 x0; __m128 x0;
x0 = _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b)); x0 = _mm_mul_ps(glmm_load(a), glmm_load(b));
x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 3, 2)); x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 3, 2));
return _mm_cvtss_f32(_mm_add_ss(x0, _mm_shuffle1_ps(x0, 0, 1, 0, 1))); return _mm_cvtss_f32(_mm_add_ss(x0, _mm_shuffle1_ps(x0, 0, 1, 0, 1)));
#else #else
@@ -193,7 +193,7 @@ float
glm_vec4_norm2(vec4 v) { glm_vec4_norm2(vec4 v) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
__m128 x0; __m128 x0;
x0 = _mm_load_ps(v); x0 = glmm_load(v);
x0 = _mm_mul_ps(x0, x0); x0 = _mm_mul_ps(x0, x0);
x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 3, 2)); x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 3, 2));
return _mm_cvtss_f32(_mm_add_ss(x0, _mm_shuffle1_ps(x0, 0, 1, 0, 1))); return _mm_cvtss_f32(_mm_add_ss(x0, _mm_shuffle1_ps(x0, 0, 1, 0, 1)));
@@ -214,8 +214,8 @@ float
glm_vec4_norm(vec4 vec) { glm_vec4_norm(vec4 vec) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
__m128 x0; __m128 x0;
x0 = _mm_load_ps(vec); x0 = glmm_load(vec);
return _mm_cvtss_f32(_mm_sqrt_ss(glm_simd_dot(x0, x0))); return _mm_cvtss_f32(_mm_sqrt_ss(glmm_dot(x0, x0)));
#else #else
return sqrtf(glm_vec4_norm2(vec)); return sqrtf(glm_vec4_norm2(vec));
#endif #endif
@@ -232,7 +232,7 @@ CGLM_INLINE
void void
glm_vec4_add(vec4 a, vec4 b, vec4 dest) { glm_vec4_add(vec4 a, vec4 b, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest, _mm_add_ps(_mm_load_ps(a), _mm_load_ps(b))); glmm_store(dest, _mm_add_ps(glmm_load(a), glmm_load(b)));
#else #else
dest[0] = a[0] + b[0]; dest[0] = a[0] + b[0];
dest[1] = a[1] + b[1]; dest[1] = a[1] + b[1];
@@ -252,7 +252,7 @@ CGLM_INLINE
void void
glm_vec4_adds(vec4 v, float s, vec4 dest) { glm_vec4_adds(vec4 v, float s, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest, _mm_add_ps(_mm_load_ps(v), _mm_set1_ps(s))); glmm_store(dest, _mm_add_ps(glmm_load(v), _mm_set1_ps(s)));
#else #else
dest[0] = v[0] + s; dest[0] = v[0] + s;
dest[1] = v[1] + s; dest[1] = v[1] + s;
@@ -272,7 +272,7 @@ CGLM_INLINE
void void
glm_vec4_sub(vec4 a, vec4 b, vec4 dest) { glm_vec4_sub(vec4 a, vec4 b, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest, _mm_sub_ps(_mm_load_ps(a), _mm_load_ps(b))); glmm_store(dest, _mm_sub_ps(glmm_load(a), glmm_load(b)));
#else #else
dest[0] = a[0] - b[0]; dest[0] = a[0] - b[0];
dest[1] = a[1] - b[1]; dest[1] = a[1] - b[1];
@@ -292,7 +292,7 @@ CGLM_INLINE
void void
glm_vec4_subs(vec4 v, float s, vec4 dest) { glm_vec4_subs(vec4 v, float s, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest, _mm_sub_ps(_mm_load_ps(v), _mm_set1_ps(s))); glmm_store(dest, _mm_sub_ps(glmm_load(v), _mm_set1_ps(s)));
#else #else
dest[0] = v[0] - s; dest[0] = v[0] - s;
dest[1] = v[1] - s; dest[1] = v[1] - s;
@@ -312,7 +312,7 @@ CGLM_INLINE
void void
glm_vec4_mul(vec4 a, vec4 b, vec4 d) { glm_vec4_mul(vec4 a, vec4 b, vec4 d) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(d, _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b))); glmm_store(d, _mm_mul_ps(glmm_load(a), glmm_load(b)));
#else #else
d[0] = a[0] * b[0]; d[0] = a[0] * b[0];
d[1] = a[1] * b[1]; d[1] = a[1] * b[1];
@@ -332,7 +332,7 @@ CGLM_INLINE
void void
glm_vec4_scale(vec4 v, float s, vec4 dest) { glm_vec4_scale(vec4 v, float s, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest, _mm_mul_ps(_mm_load_ps(v), _mm_set1_ps(s))); glmm_store(dest, _mm_mul_ps(glmm_load(v), _mm_set1_ps(s)));
#else #else
dest[0] = v[0] * s; dest[0] = v[0] * s;
dest[1] = v[1] * s; dest[1] = v[1] * s;
@@ -373,7 +373,7 @@ CGLM_INLINE
void void
glm_vec4_div(vec4 a, vec4 b, vec4 dest) { glm_vec4_div(vec4 a, vec4 b, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest, _mm_div_ps(_mm_load_ps(a), _mm_load_ps(b))); glmm_store(dest, _mm_div_ps(glmm_load(a), glmm_load(b)));
#else #else
dest[0] = a[0] / b[0]; dest[0] = a[0] / b[0];
dest[1] = a[1] / b[1]; dest[1] = a[1] / b[1];
@@ -393,7 +393,7 @@ CGLM_INLINE
void void
glm_vec4_divs(vec4 v, float s, vec4 dest) { glm_vec4_divs(vec4 v, float s, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest, _mm_div_ps(_mm_load_ps(v), _mm_set1_ps(s))); glmm_store(dest, _mm_div_ps(glmm_load(v), _mm_set1_ps(s)));
#else #else
glm_vec4_scale(v, 1.0f / s, dest); glm_vec4_scale(v, 1.0f / s, dest);
#endif #endif
@@ -413,9 +413,9 @@ CGLM_INLINE
void void
glm_vec4_addadd(vec4 a, vec4 b, vec4 dest) { glm_vec4_addadd(vec4 a, vec4 b, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest, _mm_add_ps(_mm_load_ps(dest), glmm_store(dest, _mm_add_ps(glmm_load(dest),
_mm_add_ps(_mm_load_ps(a), _mm_add_ps(glmm_load(a),
_mm_load_ps(b)))); glmm_load(b))));
#else #else
dest[0] += a[0] + b[0]; dest[0] += a[0] + b[0];
dest[1] += a[1] + b[1]; dest[1] += a[1] + b[1];
@@ -437,9 +437,9 @@ CGLM_INLINE
void void
glm_vec4_subadd(vec4 a, vec4 b, vec4 dest) { glm_vec4_subadd(vec4 a, vec4 b, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest, _mm_add_ps(_mm_load_ps(dest), glmm_store(dest, _mm_add_ps(glmm_load(dest),
_mm_sub_ps(_mm_load_ps(a), _mm_sub_ps(glmm_load(a),
_mm_load_ps(b)))); glmm_load(b))));
#else #else
dest[0] += a[0] - b[0]; dest[0] += a[0] - b[0];
dest[1] += a[1] - b[1]; dest[1] += a[1] - b[1];
@@ -461,9 +461,9 @@ CGLM_INLINE
void void
glm_vec4_muladd(vec4 a, vec4 b, vec4 dest) { glm_vec4_muladd(vec4 a, vec4 b, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest, _mm_add_ps(_mm_load_ps(dest), glmm_store(dest, _mm_add_ps(glmm_load(dest),
_mm_mul_ps(_mm_load_ps(a), _mm_mul_ps(glmm_load(a),
_mm_load_ps(b)))); glmm_load(b))));
#else #else
dest[0] += a[0] * b[0]; dest[0] += a[0] * b[0];
dest[1] += a[1] * b[1]; dest[1] += a[1] * b[1];
@@ -485,9 +485,9 @@ CGLM_INLINE
void void
glm_vec4_muladds(vec4 a, float s, vec4 dest) { glm_vec4_muladds(vec4 a, float s, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest, _mm_add_ps(_mm_load_ps(dest), glmm_store(dest, _mm_add_ps(glmm_load(dest),
_mm_mul_ps(_mm_load_ps(a), _mm_mul_ps(glmm_load(a),
_mm_set1_ps(s)))); _mm_set1_ps(s))));
#else #else
dest[0] += a[0] * s; dest[0] += a[0] * s;
dest[1] += a[1] * s; dest[1] += a[1] * s;
@@ -505,7 +505,7 @@ CGLM_INLINE
void void
glm_vec4_flipsign(vec4 v) { glm_vec4_flipsign(vec4 v) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(v, _mm_xor_ps(_mm_load_ps(v), _mm_set1_ps(-0.0f))); glmm_store(v, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f)));
#else #else
v[0] = -v[0]; v[0] = -v[0];
v[1] = -v[1]; v[1] = -v[1];
@@ -524,8 +524,7 @@ CGLM_INLINE
void void
glm_vec4_flipsign_to(vec4 v, vec4 dest) { glm_vec4_flipsign_to(vec4 v, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest, _mm_xor_ps(_mm_load_ps(v), glmm_store(dest, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f)));
_mm_set1_ps(-0.0f)));
#else #else
dest[0] = -v[0]; dest[0] = -v[0];
dest[1] = -v[1]; dest[1] = -v[1];
@@ -571,16 +570,16 @@ glm_vec4_normalize_to(vec4 vec, vec4 dest) {
__m128 xdot, x0; __m128 xdot, x0;
float dot; float dot;
x0 = _mm_load_ps(vec); x0 = glmm_load(vec);
xdot = glm_simd_dot(x0, x0); xdot = glmm_dot(x0, x0);
dot = _mm_cvtss_f32(xdot); dot = _mm_cvtss_f32(xdot);
if (dot == 0.0f) { if (dot == 0.0f) {
_mm_store_ps(dest, _mm_setzero_ps()); glmm_store(dest, _mm_setzero_ps());
return; return;
} }
_mm_store_ps(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot))); glmm_store(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot)));
#else #else
float norm; float norm;
@@ -633,7 +632,7 @@ CGLM_INLINE
void void
glm_vec4_maxv(vec4 v1, vec4 v2, vec4 dest) { glm_vec4_maxv(vec4 v1, vec4 v2, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest, _mm_max_ps(_mm_load_ps(v1), _mm_load_ps(v2))); glmm_store(dest, _mm_max_ps(glmm_load(v1), glmm_load(v2)));
#else #else
dest[0] = glm_max(v1[0], v2[0]); dest[0] = glm_max(v1[0], v2[0]);
dest[1] = glm_max(v1[1], v2[1]); dest[1] = glm_max(v1[1], v2[1]);
@@ -653,7 +652,7 @@ CGLM_INLINE
void void
glm_vec4_minv(vec4 v1, vec4 v2, vec4 dest) { glm_vec4_minv(vec4 v1, vec4 v2, vec4 dest) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(dest, _mm_min_ps(_mm_load_ps(v1), _mm_load_ps(v2))); glmm_store(dest, _mm_min_ps(glmm_load(v1), glmm_load(v2)));
#else #else
dest[0] = glm_min(v1[0], v2[0]); dest[0] = glm_min(v1[0], v2[0]);
dest[1] = glm_min(v1[1], v2[1]); dest[1] = glm_min(v1[1], v2[1]);
@@ -673,8 +672,8 @@ CGLM_INLINE
void void
glm_vec4_clamp(vec4 v, float minVal, float maxVal) { glm_vec4_clamp(vec4 v, float minVal, float maxVal) {
#if defined( __SSE__ ) || defined( __SSE2__ ) #if defined( __SSE__ ) || defined( __SSE2__ )
_mm_store_ps(v, _mm_min_ps(_mm_max_ps(_mm_load_ps(v), _mm_set1_ps(minVal)), glmm_store(v, _mm_min_ps(_mm_max_ps(glmm_load(v), _mm_set1_ps(minVal)),
_mm_set1_ps(maxVal))); _mm_set1_ps(maxVal)));
#else #else
v[0] = glm_clamp(v[0], minVal, maxVal); v[0] = glm_clamp(v[0], minVal, maxVal);
v[1] = glm_clamp(v[1], minVal, maxVal); v[1] = glm_clamp(v[1], minVal, maxVal);