diff --git a/include/cglm/quat.h b/include/cglm/quat.h
index cec3a90..eeee5a9 100644
--- a/include/cglm/quat.h
+++ b/include/cglm/quat.h
@@ -262,7 +262,8 @@ glm_quat_normalize_to(versor q, versor dest) {
 
   x0   = glmm_load(q);
   xdot = glmm_vdot(x0, x0);
-  dot  = _mm_cvtss_f32(xdot);
+  // dot  = _mm_cvtss_f32(xdot);
+  dot  = wasm_f32x4_extract_lane(xdot, 0);
 
   if (dot <= 0.0f) {
     glm_quat_identity(dest);
diff --git a/include/cglm/simd/wasm.h b/include/cglm/simd/wasm.h
index 25bf1b9..78f4605 100644
--- a/include/cglm/simd/wasm.h
+++ b/include/cglm/simd/wasm.h
@@ -20,8 +20,6 @@
 #define glmm_splat_z(x) glmm_splat(x, 2)
 #define glmm_splat_w(x) glmm_splat(x, 3)
 
-#define _mm_cvtss_f32(v) wasm_f32x4_extract_lane(v, 0)
-
 static inline
 glmm_128
 glmm_abs(glmm_128 x) {
@@ -52,7 +50,7 @@ glmm_vhadds(glmm_128 v) {
 static inline
 float
 glmm_hadd(glmm_128 v) {
-  return _mm_cvtss_f32(glmm_vhadds(v));
+  return wasm_f32x4_extract_lane(glmm_vhadds(v), 0);
 }
 
 static inline
@@ -68,7 +66,7 @@ glmm_vhmin(glmm_128 v) {
 static inline
 float
 glmm_hmin(glmm_128 v) {
-  return _mm_cvtss_f32(glmm_vhmin(v));
+  return wasm_f32x4_extract_lane(glmm_vhmin(v), 0);
 }
 
 static inline
@@ -106,7 +104,7 @@ glmm_vdot(glmm_128 a, glmm_128 b) {
 static inline
 float
 glmm_dot(glmm_128 a, glmm_128 b) {
-  return _mm_cvtss_f32(glmm_vdots(a, b));
+  return wasm_f32x4_extract_lane(glmm_vdots(a, b), 0);
 }
 
 static inline
@@ -114,25 +112,26 @@ float
 glmm_norm(glmm_128 a) {
   glmm_128 x0;
   x0 = glmm_vhadds(wasm_f32x4_mul(a, a));
-  return _mm_cvtss_f32(wasm_i32x4_shuffle(x0, wasm_f32x4_sqrt(x0),4, 1, 2, 3));
+  return wasm_f32x4_extract_lane(
+           wasm_i32x4_shuffle(x0, wasm_f32x4_sqrt(x0),4, 1, 2, 3), 0);
 }
 
 static inline
 float
 glmm_norm2(glmm_128 a) {
-  return _mm_cvtss_f32(glmm_vhadds(wasm_f32x4_mul(a, a)));
+  return wasm_f32x4_extract_lane(glmm_vhadds(wasm_f32x4_mul(a, a)), 0);
 }
 
 static inline
 float
 glmm_norm_one(glmm_128 a) {
-  return _mm_cvtss_f32(glmm_vhadds(glmm_abs(a)));
+  return wasm_f32x4_extract_lane(glmm_vhadds(glmm_abs(a)), 0);
 }
 
 static inline
 float
 glmm_norm_inf(glmm_128 a) {
-  return _mm_cvtss_f32(glmm_vhmax(glmm_abs(a)));
+  return wasm_f32x4_extract_lane(glmm_vhmax(glmm_abs(a)), 0);
 }
 
 static inline
diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h
index 9e552ba..73a5662 100644
--- a/include/cglm/vec4.h
+++ b/include/cglm/vec4.h
@@ -542,9 +542,9 @@ glm_vec4_addadd(vec4 a, vec4 b, vec4 dest) {
                               _mm_add_ps(glmm_load(a),
                                          glmm_load(b))));
 #elif defined(__wasm__) && defined(__wasm_simd128__)
-  glmm_store(dest, wasm_f32x4_add(glmm_load(dest),
-                                  wasm_f32x4_add(glmm_load(a),
-                                                 glmm_load(b))));
+  glmm_store(dest, wasm_f32x4_add(
+                     glmm_load(dest),
+                     wasm_f32x4_add(glmm_load(a), glmm_load(b))));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vaddq_f32(vld1q_f32(dest),
                             vaddq_f32(vld1q_f32(a),
@@ -574,9 +574,9 @@ glm_vec4_subadd(vec4 a, vec4 b, vec4 dest) {
                               _mm_sub_ps(glmm_load(a),
                                          glmm_load(b))));
 #elif defined(__wasm__) && defined(__wasm_simd128__)
-  glmm_store(dest, wasm_f32x4_add(glmm_load(dest),
-                                  wasm_f32x4_sub(glmm_load(a),
-                                                 glmm_load(b))));
+  glmm_store(dest, wasm_f32x4_add(
+                     glmm_load(dest),
+                     wasm_f32x4_sub(glmm_load(a), glmm_load(b))));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vaddq_f32(vld1q_f32(dest),
                             vsubq_f32(vld1q_f32(a),
@@ -650,9 +650,9 @@ glm_vec4_maxadd(vec4 a, vec4 b, vec4 dest) {
                               _mm_max_ps(glmm_load(a),
                                          glmm_load(b))));
 #elif defined(__wasm__) && defined(__wasm_simd128__)
-  glmm_store(dest, wasm_f32x4_add(glmm_load(dest),
-                                  wasm_f32x4_max(glmm_load(a),
-                                                 glmm_load(b))));
+  glmm_store(dest, wasm_f32x4_add(
+                     glmm_load(dest),
+                     wasm_f32x4_max(glmm_load(a), glmm_load(b))));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vaddq_f32(vld1q_f32(dest),
                             vmaxq_f32(vld1q_f32(a),
@@ -682,9 +682,9 @@ glm_vec4_minadd(vec4 a, vec4 b, vec4 dest) {
                               _mm_min_ps(glmm_load(a),
                                          glmm_load(b))));
 #elif defined(__wasm__) && defined(__wasm_simd128__)
-  glmm_store(dest, wasm_f32x4_add(glmm_load(dest),
-                                  wasm_f32x4_min(glmm_load(a),
-                                                 glmm_load(b))));
+  glmm_store(dest, wasm_f32x4_add(
+                     glmm_load(dest),
+                     wasm_f32x4_min(glmm_load(a), glmm_load(b))));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vaddq_f32(vld1q_f32(dest),
                             vminq_f32(vld1q_f32(a),
@@ -709,7 +709,8 @@ glm_vec4_negate_to(vec4 v, vec4 dest) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
   glmm_store(dest, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f)));
 #elif defined(__wasm__) && defined(__wasm_simd128__)
-  glmm_store(dest, wasm_v128_xor(glmm_load(v), wasm_f32x4_const_splat(-0.0f)));
+  glmm_store(dest, wasm_v128_xor(glmm_load(v),
+                                 wasm_f32x4_const_splat(-0.0f)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vnegq_f32(vld1q_f32(v)));
 #else
@@ -760,7 +761,8 @@ glm_vec4_normalize_to(vec4 v, vec4 dest) {
 
   x0   = glmm_load(v);
   xdot = glmm_vdot(x0, x0);
-  dot  = _mm_cvtss_f32(xdot);
+  // dot  = _mm_cvtss_f32(xdot);
+  dot  = wasm_f32x4_extract_lane(xdot, 0);
 
   if (dot == 0.0f) {
     glmm_store(dest, wasm_f32x4_const_splat(0.f));
@@ -903,8 +905,9 @@ glm_vec4_clamp(vec4 v, float minVal, float maxVal) {
   glmm_store(v, _mm_min_ps(_mm_max_ps(glmm_load(v), _mm_set1_ps(minVal)),
                            _mm_set1_ps(maxVal)));
 #elif defined(__wasm__) && defined(__wasm_simd128__)
-  glmm_store(v, wasm_f32x4_min(wasm_f32x4_max(glmm_load(v), wasm_f32x4_splat(minVal)),
-                               wasm_f32x4_splat(maxVal)));
+  glmm_store(v, wasm_f32x4_min(
+                  wasm_f32x4_max(glmm_load(v), wasm_f32x4_splat(minVal)),
+                  wasm_f32x4_splat(maxVal)));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(v, vminq_f32(vmaxq_f32(vld1q_f32(v), vdupq_n_f32(minVal)),
                          vdupq_n_f32(maxVal)));
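
Note for reviewers: the patch drops the local `_mm_cvtss_f32` shim and calls `wasm_f32x4_extract_lane(v, 0)` directly. Below is a minimal standalone sketch of that lane extraction; it is not part of the change, and the test program plus the `-msimd128` build flag are assumptions for illustration only.

/* Minimal sketch: extract lane 0 of a v128 as a scalar float, which is what
 * the removed _mm_cvtss_f32 macro expanded to.  Build with an Emscripten or
 * clang wasm target and -msimd128 so <wasm_simd128.h> is available. */
#include <stdio.h>
#include <wasm_simd128.h>

int main(void) {
  /* hypothetical test vector, lanes (x, y, z, w) */
  v128_t v = wasm_f32x4_make(3.0f, 1.0f, 4.0f, 1.5f);

  /* lane index must be a compile-time constant, here 0 */
  float x = wasm_f32x4_extract_lane(v, 0);

  printf("%f\n", x); /* prints 3.000000 */
  return 0;
}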