diff --git a/include/cglm/simd/wasm.h b/include/cglm/simd/wasm.h
index ba621dd..25bf1b9 100644
--- a/include/cglm/simd/wasm.h
+++ b/include/cglm/simd/wasm.h
@@ -22,12 +22,6 @@
 
 #define _mm_cvtss_f32(v) wasm_f32x4_extract_lane(v, 0)
 
-static inline glmm_128 __attribute__((__always_inline__, __nodebug__))
-_mm_movehl_ps(glmm_128 __a, glmm_128 __b)
-{
-  return wasm_i32x4_shuffle(__a, __b, 6, 7, 2, 3);
-}
-
 static inline glmm_128
 glmm_abs(glmm_128 x) {
   return wasm_f32x4_abs(x);
@@ -49,7 +43,8 @@ glmm_vhadds(glmm_128 v) {
   glmm_128 shuf, sums;
   shuf = glmm_shuff1(v, 2, 3, 0, 1);
   sums = wasm_f32x4_add(v, shuf);
-  shuf = _mm_movehl_ps(shuf, sums);
+  // shuf = _mm_movehl_ps(shuf, sums);
+  shuf = wasm_i32x4_shuffle(shuf, sums, 6, 7, 2, 3);
   sums = wasm_i32x4_shuffle(sums, wasm_f32x4_add(sums, shuf), 4, 1, 2, 3);
   return sums;
 }
diff --git a/include/cglm/simd/wasm/affine.h b/include/cglm/simd/wasm/affine.h
index 9471592..518cf64 100644
--- a/include/cglm/simd/wasm/affine.h
+++ b/include/cglm/simd/wasm/affine.h
@@ -100,17 +100,20 @@ glm_inv_tr_wasm(mat4 mat) {
   x5 = wasm_i32x4_shuffle(r2, x1, 2, 6, 3, 7);
   // r0 = _mm_movelh_ps(x2, x4);
   r0 = wasm_i32x4_shuffle(x2, x4, 0, 1, 4, 5);
-  r1 = _mm_movehl_ps(x4, x2);
+  // r1 = _mm_movehl_ps(x4, x2);
+  r1 = wasm_i32x4_shuffle(x4, x2, 6, 7, 2, 3);
   // r2 = _mm_movelh_ps(x3, x5);
   r2 = wasm_i32x4_shuffle(x3, x5, 0, 1, 4, 5);
-  x1 = _mm_movehl_ps(x5, x3);
+  // x1 = _mm_movehl_ps(x5, x3);
+  x1 = wasm_i32x4_shuffle(x5, x3, 6, 7, 2, 3);
 
   x2 = glmm_shuff1(r3, 0, 0, 0, 0);
   x3 = glmm_shuff1(r3, 1, 1, 1, 1);
   x4 = glmm_shuff1(r3, 2, 2, 2, 2);
   x5 = wasm_f32x4_const_splat(-0.f);
 
-  x0 = glmm_fmadd(r0, x2, glmm_fmadd(r1, x3, wasm_f32x4_mul(r2, x4)));
+  x0 = glmm_fmadd(r0, x2,
+                  glmm_fmadd(r1, x3, wasm_f32x4_mul(r2, x4)));
 
   x0 = wasm_v128_xor(x0, x5);
   x0 = wasm_f32x4_add(x0, x1);
diff --git a/include/cglm/simd/wasm/mat2.h b/include/cglm/simd/wasm/mat2.h
index 6c3f5fb..9caefd1 100644
--- a/include/cglm/simd/wasm/mat2.h
+++ b/include/cglm/simd/wasm/mat2.h
@@ -24,7 +24,8 @@ glm_mat2_mul_wasm(mat2 m1, mat2 m2, mat2 dest) {
   x4 = glmm_shuff1(x2, 3, 3, 1, 1);
   // x0 = _mm_movelh_ps(x1, x1);
   x0 = wasm_i32x4_shuffle(x1, x1, 0, 1, 4, 5);
-  x2 = _mm_movehl_ps(x1, x1);
+  // x2 = _mm_movehl_ps(x1, x1);
+  x2 = wasm_i32x4_shuffle(x1, x1, 6, 7, 2, 3);
 
   /*
   dest[0][0] = a * e + c * f;
diff --git a/include/cglm/simd/wasm/mat3.h b/include/cglm/simd/wasm/mat3.h
index 62d179c..835f5a3 100644
--- a/include/cglm/simd/wasm/mat3.h
+++ b/include/cglm/simd/wasm/mat3.h
@@ -50,13 +50,15 @@ glm_mat3_mul_wasm(mat3 m1, mat3 m2, mat3 dest) {
   x0 = glmm_fmadd(x4, x6, x0);
   x1 = glmm_fmadd(x5, x2, x1);
 
-  x2 = _mm_movehl_ps(l2, l1);                  /* a22 a22 a21 a20 */
+  // x2 = _mm_movehl_ps(l2, l1);
+  x2 = wasm_i32x4_shuffle(l2, l1, 6, 7, 2, 3); /* a22 a22 a21 a20 */
   x3 = glmm_shuff1(x2, 0, 2, 1, 0);            /* a20 a22 a21 a20 */
   x2 = glmm_shuff1(x2, 1, 0, 2, 1);            /* a21 a20 a22 a21 */
 
-  x4 = wasm_i32x4_shuffle(r0, r1, 2, 2, 5, 5);  /* b12 b12 b02 b02 */
+  x4 = wasm_i32x4_shuffle(r0, r1, 2, 2, 5, 5); /* b12 b12 b02 b02 */
   x5 = glmm_shuff1(x4, 3, 0, 0, 0);            /* b12 b02 b02 b02 */
-  x4 = _mm_movehl_ps(r2, x4);                  /* b22 b22 b12 b12 */
+  // x4 = _mm_movehl_ps(r2, x4);
+  x4 = wasm_i32x4_shuffle(r2, x4, 6, 7, 2, 3); /* b22 b22 b12 b12 */
 
   x0 = glmm_fmadd(x3, x5, x0);
   x1 = glmm_fmadd(x2, x4, x1);
diff --git a/include/cglm/simd/wasm/mat4.h b/include/cglm/simd/wasm/mat4.h
index 4b3cde5..f86a4ff 100644
--- a/include/cglm/simd/wasm/mat4.h
+++ b/include/cglm/simd/wasm/mat4.h
@@ -43,10 +43,12 @@ glm_mat4_transp_wasm(mat4 m, mat4 dest) {
   tmp3 = wasm_i32x4_shuffle(r2, r3, 2, 6, 3, 7);
   // r0 = _mm_movelh_ps(tmp0, tmp2);
   r0 = wasm_i32x4_shuffle(tmp0, tmp2, 0, 1, 4, 5);
-  r1 = _mm_movehl_ps(tmp2, tmp0);
+  // r1 = _mm_movehl_ps(tmp2, tmp0);
+  r1 = wasm_i32x4_shuffle(tmp2, tmp0, 6, 7, 2, 3);
   // r2 = _mm_movelh_ps(tmp1, tmp3);
   r2 = wasm_i32x4_shuffle(tmp1, tmp3, 0, 1, 4, 5);
-  r3 = _mm_movehl_ps(tmp3, tmp1);
+  // r3 = _mm_movehl_ps(tmp3, tmp1);
+  r3 = wasm_i32x4_shuffle(tmp3, tmp1, 6, 7, 2, 3);
 
   glmm_store(dest[0], r0);
   glmm_store(dest[1], r1);
@@ -184,8 +186,8 @@ glm_mat4_inv_fast_wasm(mat4 mat, mat4 dest) {
   r1 = glmm_load(mat[1]); /* h g f e */
   r2 = glmm_load(mat[2]); /* l k j i */
   r3 = glmm_load(mat[3]); /* p o n m */
-
-  x0 = _mm_movehl_ps(r3, r2);                  /* p o l k */
+  // x0 = _mm_movehl_ps(r3, r2);
+  x0 = wasm_i32x4_shuffle(r3, r2, 6, 7, 2, 3); /* p o l k */
   // x3 = _mm_movelh_ps(r2, r3);
   x3 = wasm_i32x4_shuffle(r2, r3, 0, 1, 4, 5); /* n m j i */
   x1 = glmm_shuff1(x0, 1, 3, 3 ,3);            /* l p p p */
@@ -193,10 +195,10 @@ glm_mat4_inv_fast_wasm(mat4 mat, mat4 dest) {
   x4 = glmm_shuff1(x3, 1, 3, 3, 3);            /* j n n n */
   x7 = glmm_shuff1(x3, 0, 2, 2, 2);            /* i m m m */
 
-  x6 = wasm_i32x4_shuffle(r2, r1, 0, 0, 4, 4);  /* e e i i */
-  x5 = wasm_i32x4_shuffle(r2, r1, 1, 1, 5, 5);  /* f f j j */
-  x3 = wasm_i32x4_shuffle(r2, r1, 2, 2, 6, 6);  /* g g k k */
-  x0 = wasm_i32x4_shuffle(r2, r1, 3, 3, 7, 7);  /* h h l l */
+  x6 = wasm_i32x4_shuffle(r2, r1, 0, 0, 4, 4); /* e e i i */
+  x5 = wasm_i32x4_shuffle(r2, r1, 1, 1, 5, 5); /* f f j j */
+  x3 = wasm_i32x4_shuffle(r2, r1, 2, 2, 6, 6); /* g g k k */
+  x0 = wasm_i32x4_shuffle(r2, r1, 3, 3, 7, 7); /* h h l l */
 
   t0 = wasm_f32x4_mul(x3, x1);
   t1 = wasm_f32x4_mul(x5, x1);
@@ -241,13 +243,14 @@ glm_mat4_inv_fast_wasm(mat4 mat, mat4 dest) {
   t3[5] = e * j - i * f; */
   t5 = glmm_fnmadd(x7, x5, t5);
   // x4 = _mm_movelh_ps(r0, r1);
-  x4 = wasm_i32x4_shuffle(r0, r1, 0, 1, 4, 5);  /* f e b a */
-  x5 = _mm_movehl_ps(r1, r0);                  /* h g d c */
+  x4 = wasm_i32x4_shuffle(r0, r1, 0, 1, 4, 5); /* f e b a */
+  // x5 = _mm_movehl_ps(r1, r0);
+  x5 = wasm_i32x4_shuffle(r1, r0, 6, 7, 2, 3); /* h g d c */
 
-  x0 = glmm_shuff1(x4, 0, 0, 0, 2);  /* a a a e */
-  x1 = glmm_shuff1(x4, 1, 1, 1, 3);  /* b b b f */
-  x2 = glmm_shuff1(x5, 0, 0, 0, 2);  /* c c c g */
-  x3 = glmm_shuff1(x5, 1, 1, 1, 3);  /* d d d h */
+  x0 = glmm_shuff1(x4, 0, 0, 0, 2);            /* a a a e */
+  x1 = glmm_shuff1(x4, 1, 1, 1, 3);            /* b b b f */
+  x2 = glmm_shuff1(x5, 0, 0, 0, 2);            /* c c c g */
+  x3 = glmm_shuff1(x5, 1, 1, 1, 3);            /* d d d h */
 
   v2 = wasm_f32x4_mul(x0, t1);
   v1 = wasm_f32x4_mul(x0, t0);
@@ -323,8 +326,8 @@ glm_mat4_inv_wasm(mat4 mat, mat4 dest) {
   r1 = glmm_load(mat[1]); /* h g f e */
   r2 = glmm_load(mat[2]); /* l k j i */
   r3 = glmm_load(mat[3]); /* p o n m */
-
-  x0 = _mm_movehl_ps(r3, r2);                  /* p o l k */
+  // x0 = _mm_movehl_ps(r3, r2);
+  x0 = wasm_i32x4_shuffle(r3, r2, 6, 7, 2, 3); /* p o l k */
   // x3 = _mm_movelh_ps(r2, r3);
   x3 = wasm_i32x4_shuffle(r2, r3, 0, 1, 4, 5); /* n m j i */
   x1 = glmm_shuff1(x0, 1, 3, 3 ,3);            /* l p p p */
@@ -332,10 +335,10 @@ glm_mat4_inv_wasm(mat4 mat, mat4 dest) {
   x4 = glmm_shuff1(x3, 1, 3, 3, 3);            /* j n n n */
   x7 = glmm_shuff1(x3, 0, 2, 2, 2);            /* i m m m */
 
-  x6 = wasm_i32x4_shuffle(r2, r1, 0, 0, 4, 4);  /* e e i i */
-  x5 = wasm_i32x4_shuffle(r2, r1, 1, 1, 5, 5);  /* f f j j */
-  x3 = wasm_i32x4_shuffle(r2, r1, 2, 2, 6, 6);  /* g g k k */
-  x0 = wasm_i32x4_shuffle(r2, r1, 3, 3, 7, 7);  /* h h l l */
+  x6 = wasm_i32x4_shuffle(r2, r1, 0, 0, 4, 4); /* e e i i */
+  x5 = wasm_i32x4_shuffle(r2, r1, 1, 1, 5, 5); /* f f j j */
+  x3 = wasm_i32x4_shuffle(r2, r1, 2, 2, 6, 6); /* g g k k */
+  x0 = wasm_i32x4_shuffle(r2, r1, 3, 3, 7, 7); /* h h l l */
 
   t0 = wasm_f32x4_mul(x3, x1);
   t1 = wasm_f32x4_mul(x5, x1);
@@ -380,13 +383,14 @@ glm_mat4_inv_wasm(mat4 mat, mat4 dest) {
   t3[5] = e * j - i * f; */
   t5 = glmm_fnmadd(x7, x5, t5);
   // x4 = _mm_movelh_ps(r0, r1);
-  x4 = wasm_i32x4_shuffle(r0, r1, 0, 1, 4, 5);  /* f e b a */
-  x5 = _mm_movehl_ps(r1, r0);                  /* h g d c */
+  x4 = wasm_i32x4_shuffle(r0, r1, 0, 1, 4, 5); /* f e b a */
+  // x5 = _mm_movehl_ps(r1, r0);
+  x5 = wasm_i32x4_shuffle(r1, r0, 6, 7, 2, 3); /* h g d c */
 
-  x0 = glmm_shuff1(x4, 0, 0, 0, 2);  /* a a a e */
-  x1 = glmm_shuff1(x4, 1, 1, 1, 3);  /* b b b f */
-  x2 = glmm_shuff1(x5, 0, 0, 0, 2);  /* c c c g */
-  x3 = glmm_shuff1(x5, 1, 1, 1, 3);  /* d d d h */
+  x0 = glmm_shuff1(x4, 0, 0, 0, 2);            /* a a a e */
+  x1 = glmm_shuff1(x4, 1, 1, 1, 3);            /* b b b f */
+  x2 = glmm_shuff1(x5, 0, 0, 0, 2);            /* c c c g */
+  x3 = glmm_shuff1(x5, 1, 1, 1, 3);            /* d d d h */
 
   v2 = wasm_f32x4_mul(x0, t1);
   v1 = wasm_f32x4_mul(x0, t0);
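
Note on the shuffle indices used throughout this patch: in
wasm_i32x4_shuffle(a, b, i0, i1, i2, i3), indices 0-3 select lanes of a and
indices 4-7 select lanes of b. SSE's _mm_movehl_ps(a, b) yields
{ b[2], b[3], a[2], a[3] }, so (6, 7, 2, 3) reproduces it exactly, and
(0, 1, 4, 5) likewise reproduces _mm_movelh_ps(a, b) =
{ a[0], a[1], b[0], b[1] }. The check below is a minimal sketch, not part of
the patch; the movehl_check.c name and build line are illustrative, and it
assumes a clang toolchain with wasm32 SIMD support (-msimd128) plus a wasm
runtime to run the result.

/* movehl_check.c - verify the lane mapping used in this patch.
 * Build (example): clang --target=wasm32-wasi -msimd128 movehl_check.c
 */
#include <assert.h>
#include <wasm_simd128.h>

int main(void) {
  v128_t a = wasm_f32x4_make(0.0f, 1.0f, 2.0f, 3.0f);
  v128_t b = wasm_f32x4_make(4.0f, 5.0f, 6.0f, 7.0f);

  /* SSE _mm_movehl_ps(a, b) == { b[2], b[3], a[2], a[3] } */
  v128_t hl = wasm_i32x4_shuffle(a, b, 6, 7, 2, 3);
  assert(wasm_f32x4_extract_lane(hl, 0) == 6.0f); /* b[2] */
  assert(wasm_f32x4_extract_lane(hl, 1) == 7.0f); /* b[3] */
  assert(wasm_f32x4_extract_lane(hl, 2) == 2.0f); /* a[2] */
  assert(wasm_f32x4_extract_lane(hl, 3) == 3.0f); /* a[3] */

  /* SSE _mm_movelh_ps(a, b) == { a[0], a[1], b[0], b[1] } */
  v128_t lh = wasm_i32x4_shuffle(a, b, 0, 1, 4, 5);
  assert(wasm_f32x4_extract_lane(lh, 0) == 0.0f); /* a[0] */
  assert(wasm_f32x4_extract_lane(lh, 1) == 1.0f); /* a[1] */
  assert(wasm_f32x4_extract_lane(lh, 2) == 4.0f); /* b[0] */
  assert(wasm_f32x4_extract_lane(lh, 3) == 5.0f); /* b[1] */
  return 0;
}

Inlining the shuffle at each call site, rather than keeping the
_mm_movehl_ps wrapper that wasm.h removes above, also drops the reserved
__-prefixed parameter names and the compiler-specific always_inline
attribute from the header.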