build: add missing call/aabb2d.h

now working on v0.9.6
Merge pull request #413 from recp/optimize-inv
2026-02-17 03:39:05 +00:00 · 2025-02-13 22:35:16 +03:00 · 2025-02-13 12:25:10 +03:00 · 2025-02-12 23:08:07 +03:00 · 2025-02-09 15:30:49 +03:00 · 2025-02-09 15:13:28 +03:00
120 changed files with 4119 additions and 1060 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -8,25 +8,80 @@ on:
 jobs:
  build_autotools:
-    name: Autotools / ${{ matrix.os }}
+    name: Autotools / ${{ matrix.os }} / ${{ matrix.simd }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
-        os: [macos-12, macos-14, ubuntu-22.04]
+        include:
          # x86/x64 builds
          - { os: macos-13, simd: none }
          - { os: macos-13, simd: sse }
          - { os: macos-13, simd: sse2 }
          - { os: macos-13, simd: sse3 }
          - { os: macos-13, simd: sse4 }
          - { os: macos-13, simd: avx }
          - { os: macos-13, simd: avx2 }
          - { os: macos-14, simd: none }
          - { os: macos-14, simd: sse }
          - { os: macos-14, simd: sse2 }
          - { os: macos-14, simd: sse3 }
          - { os: macos-14, simd: sse4 }
          - { os: macos-14, simd: avx }
          - { os: macos-14, simd: avx2 }
          - { os: ubuntu-22.04, simd: none }
          - { os: ubuntu-22.04, simd: sse }
          - { os: ubuntu-22.04, simd: sse2 }
          - { os: ubuntu-22.04, simd: sse3 }
          - { os: ubuntu-22.04, simd: sse4 }
          - { os: ubuntu-22.04, simd: avx }
          - { os: ubuntu-22.04, simd: avx2 }
          - { os: ubuntu-24.04, simd: none }
          - { os: ubuntu-24.04, simd: sse }
          - { os: ubuntu-24.04, simd: sse2 }
          - { os: ubuntu-24.04, simd: sse3 }
          - { os: ubuntu-24.04, simd: sse4 }
          - { os: ubuntu-24.04, simd: avx }
          - { os: ubuntu-24.04, simd: avx2 }
          # ARM64 builds
          - { os: ubuntu-latest-arm64, simd: neon }
    steps:
    - uses: actions/checkout@v4
-    - name: Install Autotools
+    - name: Install Autotools on macOS
      if: runner.os == 'macOS'
      run: brew upgrade && brew install autoconf automake libtool
    # Install the autotools chain on every Linux runner in the matrix. The
    # condition is keyed on the runner OS rather than on specific Ubuntu labels
    # so that the ARM64 matrix entry (whose label matches neither
    # 'ubuntu-22.04' nor 'ubuntu-24.04') is covered as well.
    - name: Install Autotools on Ubuntu
      if: runner.os == 'Linux'
      run: sudo apt-get update && sudo apt-get install -y autoconf automake libtool
    # Translate the matrix `simd` value into a compiler flag and publish it as
    # CFLAGS for the following steps. Every `run` step starts a fresh shell, so
    # a plain `export` is lost when this step ends; appending to $GITHUB_ENV is
    # what makes "$CFLAGS" visible to the configure step.
    - name: Set SIMD flags
      run: |
        simd_flags=""
        case "${{ matrix.simd }}" in
          sse)  simd_flags="-msse" ;;
          sse2) simd_flags="-msse2" ;;
          sse3) simd_flags="-msse3" ;;
          sse4) simd_flags="-msse4" ;;
          avx)  simd_flags="-mavx" ;;
          avx2) simd_flags="-mavx2" ;;
          neon)
            # -mfpu is a 32-bit ARM option; AArch64 compilers do not accept it
            # and already enable NEON (Advanced SIMD) by default.
            case "$(uname -m)" in
              aarch64|arm64) simd_flags="" ;;
              *)             simd_flags="-mfpu=neon" ;;
            esac
            ;;
        esac
        echo "CFLAGS=$simd_flags" >> "$GITHUB_ENV"
    - name: Generate Autotools
      run: ./autogen.sh
    - name: Configure Autotools
-      run: ./configure
+      run: ./configure CFLAGS="$CFLAGS"
    - name: Build
      run: make
@@ -55,13 +110,141 @@ jobs:
    - name: Build
      run: cmake --build build
  build_cmake_ubuntu:
    name: CMake / ${{ matrix.target.os }} / ${{ matrix.target.cc }} / ${{ matrix.target.arch }} / ${{ matrix.target.simd }}
    runs-on: ${{ matrix.target.arch == 'arm64' && 'ubuntu-latest-arm64' || matrix.target.os }}
    strategy:
      fail-fast: false
      matrix:
        target:
          # GCC 11 builds
          - { os: ubuntu-20.04, cc: gcc-11, arch: x64, simd: none }
          - { os: ubuntu-20.04, cc: gcc-11, arch: x64, simd: sse }
          - { os: ubuntu-20.04, cc: gcc-11, arch: x64, simd: sse2 }
          - { os: ubuntu-20.04, cc: gcc-11, arch: x64, simd: sse3 }
          - { os: ubuntu-20.04, cc: gcc-11, arch: x64, simd: sse4 }
          - { os: ubuntu-20.04, cc: gcc-11, arch: x64, simd: avx }
          - { os: ubuntu-20.04, cc: gcc-11, arch: x64, simd: avx2 }
          # GCC 12 builds
          - { os: ubuntu-22.04, cc: gcc-12, arch: x64, simd: none }
          - { os: ubuntu-22.04, cc: gcc-12, arch: x64, simd: sse }
          - { os: ubuntu-22.04, cc: gcc-12, arch: x64, simd: sse2 }
          - { os: ubuntu-22.04, cc: gcc-12, arch: x64, simd: sse3 }
          - { os: ubuntu-22.04, cc: gcc-12, arch: x64, simd: sse4 }
          - { os: ubuntu-22.04, cc: gcc-12, arch: x64, simd: avx }
          - { os: ubuntu-22.04, cc: gcc-12, arch: x64, simd: avx2 }
          # GCC 13 builds
          - { os: ubuntu-24.04, cc: gcc-13, arch: x64, simd: none }
          - { os: ubuntu-24.04, cc: gcc-13, arch: x64, simd: sse }
          - { os: ubuntu-24.04, cc: gcc-13, arch: x64, simd: sse2 }
          - { os: ubuntu-24.04, cc: gcc-13, arch: x64, simd: sse3 }
          - { os: ubuntu-24.04, cc: gcc-13, arch: x64, simd: sse4 }
          - { os: ubuntu-24.04, cc: gcc-13, arch: x64, simd: avx }
          - { os: ubuntu-24.04, cc: gcc-13, arch: x64, simd: avx2 }
          # Clang 12 builds
          - { os: ubuntu-20.04, cc: clang-12, arch: x64, simd: none }
          - { os: ubuntu-20.04, cc: clang-12, arch: x64, simd: sse }
          - { os: ubuntu-20.04, cc: clang-12, arch: x64, simd: sse2 }
          - { os: ubuntu-20.04, cc: clang-12, arch: x64, simd: sse3 }
          - { os: ubuntu-20.04, cc: clang-12, arch: x64, simd: sse4 }
          - { os: ubuntu-20.04, cc: clang-12, arch: x64, simd: avx }
          - { os: ubuntu-20.04, cc: clang-12, arch: x64, simd: avx2 }
          # Clang 15 builds
          - { os: ubuntu-22.04, cc: clang-15, arch: x64, simd: none }
          - { os: ubuntu-22.04, cc: clang-15, arch: x64, simd: sse }
          - { os: ubuntu-22.04, cc: clang-15, arch: x64, simd: sse2 }
          - { os: ubuntu-22.04, cc: clang-15, arch: x64, simd: sse3 }
          - { os: ubuntu-22.04, cc: clang-15, arch: x64, simd: sse4 }
          - { os: ubuntu-22.04, cc: clang-15, arch: x64, simd: avx }
          - { os: ubuntu-22.04, cc: clang-15, arch: x64, simd: avx2 }
          # ARM64 builds
          - { os: ubuntu-latest, cc: gcc-12, arch: arm64, simd: neon }
          - { os: ubuntu-latest, cc: gcc-13, arch: arm64, simd: neon }
          # ARMv7 builds
          - { os: ubuntu-latest-arm64, cc: gcc-12, arch: armv7, simd: neon }
          - { os: ubuntu-latest-arm64, cc: gcc-12, arch: armv7, simd: none }
    steps:
    - uses: actions/checkout@v4
    - name: Add Ubuntu Toolchain PPA
      if: matrix.target.os == 'ubuntu-20.04'
      run: |
        sudo apt-get update
        sudo apt-get install -y software-properties-common
        sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
        sudo apt-get update
    - name: Install Compiler and Ninja
      run: |
        sudo apt-get install -y ${{ matrix.target.cc }} ninja-build
    # Translate the `simd` field of the matrix target into an x86 compiler flag
    # and publish it as CFLAGS for the later steps. Each `run` step gets its own
    # shell, so the value is appended to $GITHUB_ENV instead of being exported.
    - name: Set SIMD flags
      run: |
        # This job nests all matrix fields under `target`, so the SIMD level is
        # matrix.target.simd; a bare matrix.simd expands to an empty string.
        simd_flags=""
        case "${{ matrix.target.simd }}" in
          sse)  simd_flags="-msse" ;;
          sse2) simd_flags="-msse2" ;;
          sse3) simd_flags="-msse3" ;;
          sse4) simd_flags="-msse4" ;;
          avx)  simd_flags="-mavx" ;;
          avx2) simd_flags="-mavx2" ;;
          # `neon` and `none` add nothing here: NEON flags depend on the target
          # architecture and are chosen in the "Configure CMake" step.
        esac
        echo "CFLAGS=$simd_flags" >> "$GITHUB_ENV"
    # Configure the Ninja build, layering architecture-specific flags on top of
    # the CFLAGS value taken from the environment.
    - name: Configure CMake
      run: |
        # Every matrix field of this job lives under `target`; a bare
        # matrix.simd does not exist and expands to an empty string, which
        # previously made the arm64 "neon" targets configure with +nosimd.
        arch="${{ matrix.target.arch }}"
        simd="${{ matrix.target.simd }}"
        if [ "$arch" == "armv7" ]; then
          # Build for ARMv7
          # NOTE(review): -m32 / -march=armv7-a presumably need a 32-bit ARM
          # (cross) toolchain on the arm64 runner - confirm one is installed.
          neon_flags=""
          if [ "$simd" == "neon" ]; then
            neon_flags="-mfpu=neon -mfloat-abi=hard"
          fi
          cmake -B build -GNinja -DCMAKE_BUILD_TYPE=Release \
            -DCMAKE_C_COMPILER=${{ matrix.target.cc }} \
            -DCMAKE_C_FLAGS="$CFLAGS -m32 -march=armv7-a ${neon_flags}" \
            -DCGLM_STATIC=ON -DCGLM_USE_TEST=ON
        elif [ "$arch" == "arm64" ]; then
          # Build for ARM64 (AArch64)
          neon_flags=""
          if [ "$simd" == "neon" ]; then
            neon_flags="+simd" # Enable SIMD/NEON features on ARM64
          else
            neon_flags="+nosimd" # Explicitly disable SIMD/NEON
          fi
          cmake -B build -GNinja -DCMAKE_BUILD_TYPE=Release \
            -DCMAKE_C_COMPILER=${{ matrix.target.cc }} \
            -DCMAKE_C_FLAGS="$CFLAGS -march=armv8-a${neon_flags}" \
            -DCGLM_STATIC=ON -DCGLM_USE_TEST=ON
        else
          # Normal build (x86/x64)
          cmake -B build -GNinja -DCMAKE_BUILD_TYPE=Release \
            -DCMAKE_C_COMPILER=${{ matrix.target.cc }} \
            -DCMAKE_C_FLAGS="$CFLAGS" \
            -DCGLM_STATIC=ON -DCGLM_USE_TEST=ON
        fi
    - name: Build
      run: cmake --build build
    - name: Test
      working-directory: build
      run: ./tests
  build_cmake_macos:
    name: CMake / ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
-        os: [macos-12, macos-14]
+        os: [macos-13, macos-14]
    steps:
    - uses: actions/checkout@v4
@@ -86,76 +269,270 @@ jobs:
      working-directory: build
      run: ./tests
-  build_cmake_ubuntu:
+  build_cmake:
-    name: CMake / ${{ matrix.target.os }} / ${{ matrix.target.cc }}
+    name: CMake / ${{ matrix.os }} / ${{ matrix.simd }}
-    runs-on: ${{ matrix.target.os }}
+    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
-        target:
+        include:
-          - { os: ubuntu-20.04, cc: gcc-11 }
+          # x86/x64 builds
-          - { os: ubuntu-22.04, cc: gcc-12 }
+          - { os: macos-13, simd: none }
-          - { os: ubuntu-22.04, cc: gcc-13 }
+          - { os: macos-13, simd: sse }
-          - { os: ubuntu-20.04, cc: clang-12 }
+          - { os: macos-13, simd: sse2 }
-          - { os: ubuntu-22.04, cc: clang-15 }
+          - { os: macos-13, simd: sse3 }
          - { os: macos-13, simd: sse4 }
          - { os: macos-13, simd: avx }
          - { os: macos-13, simd: avx2 }
          - { os: macos-14, simd: none }
          - { os: macos-14, simd: sse }
          - { os: macos-14, simd: sse2 }
          - { os: macos-14, simd: sse3 }
          - { os: macos-14, simd: sse4 }
          - { os: macos-14, simd: avx }
          - { os: macos-14, simd: avx2 }
          - { os: windows-2022, simd: none }
          - { os: windows-2022, simd: sse }
          - { os: windows-2022, simd: sse2 }
          - { os: windows-2022, simd: sse3 }
          - { os: windows-2022, simd: sse4 }
          - { os: windows-2022, simd: avx }
          - { os: windows-2022, simd: avx2 }
          # ARM64 builds
          - { os: macos-14-arm64, simd: neon }
    steps:
    - uses: actions/checkout@v4
-    - name: Install Compiler and Ninja
+    - name: Install Ninja on macOS
-      run: |
+      if: runner.os == 'macOS'
-        sudo apt-get update -y
+      run: brew upgrade && brew install ninja
        sudo apt-get install -y ${{ matrix.target.cc }} ninja-build
-    - name: Configure CMake
+    - name: Set SIMD flags (Windows)
      if: runner.os == 'Windows'
      shell: pwsh
      run: |
-        cmake \
+        $simd = "${{ matrix.simd }}"
-          -B build \
+        if ($simd -eq "none") {
-          -GNinja \
+          $env:CFLAGS = ""
-          -DCMAKE_C_COMPILER=${{ matrix.target.cc }} \
+        } elseif ($simd -eq "sse") {
-          -DCMAKE_BUILD_TYPE=Release \
+          $env:CFLAGS = "-arch:SSE"
-          -DCGLM_STATIC=ON \
+        } elseif ($simd -eq "sse2") {
-          -DCGLM_USE_TEST=ON
+          $env:CFLAGS = "-arch:SSE2"
        } elseif ($simd -eq "sse3") {
          $env:CFLAGS = "-arch:SSE3"
        } elseif ($simd -eq "sse4") {
          $env:CFLAGS = "-arch:SSE4"
        } elseif ($simd -eq "avx") {
          $env:CFLAGS = "-arch:AVX"
        } elseif ($simd -eq "avx2") {
          $env:CFLAGS = "-arch:AVX2"
        } elseif ($simd -eq "neon") {
          $env:CFLAGS = "-arch:NEON"
        }
    # Translate the matrix `simd` value into a compiler flag and publish it as
    # CFLAGS for the later steps on non-Windows runners. A plain `export` only
    # lives for this step's shell, so the value is appended to $GITHUB_ENV to
    # make "$CFLAGS" available to the "Configure CMake (Unix)" step.
    - name: Set SIMD flags (Unix)
      if: runner.os != 'Windows'
      shell: bash
      run: |
        simd_flags=""
        case "${{ matrix.simd }}" in
          sse)  simd_flags="-msse" ;;
          sse2) simd_flags="-msse2" ;;
          sse3) simd_flags="-msse3" ;;
          sse4) simd_flags="-msse4" ;;
          avx)  simd_flags="-mavx" ;;
          avx2) simd_flags="-mavx2" ;;
          neon)
            # -mfpu is a 32-bit ARM option; 64-bit ARM targets have NEON
            # (Advanced SIMD) enabled by default and need no extra flag.
            case "$(uname -m)" in
              aarch64|arm64) simd_flags="" ;;
              *)             simd_flags="-mfpu=neon" ;;
            esac
            ;;
        esac
        echo "CFLAGS=$simd_flags" >> "$GITHUB_ENV"
    - name: Configure CMake (Windows)
      if: runner.os == 'Windows'
      shell: pwsh
      run: cmake -B build -G "Visual Studio 17 2022" -A x64 -T host=x64 -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="$env:CFLAGS" -DCGLM_STATIC=ON -DCGLM_USE_TEST=ON
    - name: Configure CMake (Unix)
      if: runner.os != 'Windows'
      shell: bash
      run: cmake -B build -GNinja -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="$CFLAGS" -DCGLM_STATIC=ON -DCGLM_USE_TEST=ON
    - name: Build
      run: cmake --build build
-    - name: Test
+    - name: Test (Windows)
      if: runner.os == 'Windows'
      shell: pwsh
      working-directory: build
      run: .\Debug\tests.exe
    - name: Test (Unix)
      if: runner.os != 'Windows'
      shell: bash
      working-directory: build
      run: ./tests
-  build_cmake_windows:
+  build_meson:
-    name: CMake / ${{ matrix.platform.name }}
+    name: Meson / ${{ matrix.os }} / ${{ matrix.simd }}
-    runs-on: windows-2022
+    runs-on: ${{ contains(matrix.os, 'arm64') && 'ubuntu-latest-arm64' || matrix.os }}
    strategy:
      fail-fast: false
      matrix:
-        platform:
+        include:
-        - { name: Windows (x64),          flags: -A x64 }
+          # x86/x64 builds
-        - { name: Windows (x86),          flags: -A Win32 }
+          - { os: macos-14, simd: none }
-        - { name: Windows (clang-cl x64), flags: -T ClangCL -A x64 }
+          - { os: macos-14, simd: sse }
-        - { name: Windows (clang-cl x86), flags: -T ClangCL -A Win32 }
+          - { os: macos-14, simd: sse2 }
-        - { name: Windows (ARM),          flags: -A ARM, skip_tests: true, skip_build: true } # This fails to build.
+          - { os: macos-14, simd: sse3 }
-        - { name: Windows (ARM64),        flags: -A ARM64, skip_tests: true }
+          - { os: macos-14, simd: sse4 }
-        - { name: UWP (ARM64),            flags: -A ARM64, -DCMAKE_SYSTEM_NAME=WindowsStore -DCMAKE_SYSTEM_VERSION="10.0", skip_tests: true }
+          - { os: macos-14, simd: avx }
-        - { name: UWP (x64),              flags: -A x64 -DCMAKE_SYSTEM_NAME=WindowsStore -DCMAKE_SYSTEM_VERSION="10.0", skip_tests: true }
+          - { os: macos-14, simd: avx2 }
          - { os: ubuntu-22.04, simd: none }
          - { os: ubuntu-22.04, simd: sse }
          - { os: ubuntu-22.04, simd: sse2 }
          - { os: ubuntu-22.04, simd: sse3 }
          - { os: ubuntu-22.04, simd: sse4 }
          - { os: ubuntu-22.04, simd: avx }
          - { os: ubuntu-22.04, simd: avx2 }
          - { os: ubuntu-24.04, simd: none }
          - { os: ubuntu-24.04, simd: sse }
          - { os: ubuntu-24.04, simd: sse2 }
          - { os: ubuntu-24.04, simd: sse3 }
          - { os: ubuntu-24.04, simd: sse4 }
          - { os: ubuntu-24.04, simd: avx }
          - { os: ubuntu-24.04, simd: avx2 }
          - { os: windows-2022, simd: none }
          - { os: windows-2022, simd: sse }
          - { os: windows-2022, simd: sse2 }
          - { os: windows-2022, simd: sse3 }
          - { os: windows-2022, simd: sse4 }
          - { os: windows-2022, simd: avx }
          - { os: windows-2022, simd: avx2 }
          # ARM64 builds
          - { os: ubuntu-latest-arm64, simd: neon }
    steps:
    - uses: actions/checkout@v4
-    - name: Configure CMake
+    - uses: actions/setup-python@v5
-      run: cmake -B build `
+      with:
-          -DCGLM_STATIC=ON `
+        python-version: '3.12'
-          -DCGLM_USE_TEST=ON `
+        cache: 'pip'
          ${{ matrix.platform.flags }}
-    - name: Build
+    - name: Install meson
-      if: ${{ !matrix.platform.skip_build }}
+      run: python3 -m pip install meson ninja
      run: cmake --build build --config Release --parallel
-    - name: Test
+    - name: Set SIMD flags (Windows)
-      if: ${{ !matrix.platform.skip_tests }}
+      if: runner.os == 'Windows'
-      working-directory: build
+      shell: pwsh
-      run: .\Release\tests.exe
+      run: |
        $simd = "${{ matrix.simd }}"
        if ($simd -eq "none") {
          $env:CFLAGS = ""
        } elseif ($simd -eq "sse") {
          $env:CFLAGS = "-arch:SSE"
        } elseif ($simd -eq "sse2") {
          $env:CFLAGS = "-arch:SSE2"
        } elseif ($simd -eq "sse3") {
          $env:CFLAGS = "-arch:SSE3"
        } elseif ($simd -eq "sse4") {
          $env:CFLAGS = "-arch:SSE4"
        } elseif ($simd -eq "avx") {
          $env:CFLAGS = "-arch:AVX"
        } elseif ($simd -eq "avx2") {
          $env:CFLAGS = "-arch:AVX2"
        } elseif ($simd -eq "neon") {
          $env:CFLAGS = "-arch:NEON"
        }
    # Translate the matrix `simd` value into a compiler flag and publish it as
    # CFLAGS for the later steps on non-Windows runners. A plain `export` only
    # lives for this step's shell, so the value is appended to $GITHUB_ENV to
    # make "$CFLAGS" available to the "Build with meson (Unix)" step.
    - name: Set SIMD flags (Unix)
      if: runner.os != 'Windows'
      shell: bash
      run: |
        simd_flags=""
        case "${{ matrix.simd }}" in
          sse)  simd_flags="-msse" ;;
          sse2) simd_flags="-msse2" ;;
          sse3) simd_flags="-msse3" ;;
          sse4) simd_flags="-msse4" ;;
          avx)  simd_flags="-mavx" ;;
          avx2) simd_flags="-mavx2" ;;
          neon)
            # -mfpu is a 32-bit ARM option; 64-bit ARM targets have NEON
            # (Advanced SIMD) enabled by default and need no extra flag.
            case "$(uname -m)" in
              aarch64|arm64) simd_flags="" ;;
              *)             simd_flags="-mfpu=neon" ;;
            esac
            ;;
        esac
        echo "CFLAGS=$simd_flags" >> "$GITHUB_ENV"
    - name: Build with meson (Windows)
      if: runner.os == 'Windows'
      shell: pwsh
      run: |
        meson setup build -Dbuildtype=release --default-library=static -Dbuild_tests=true -Dc_args="$env:CFLAGS"
        meson test -C build
    - name: Build with meson (Unix)
      if: runner.os != 'Windows'
      shell: bash
      run: |
        meson setup build -Dbuildtype=release --default-library=static -Dbuild_tests=true -Dc_args="$CFLAGS"
        meson test -C build
  build_msbuild:
    name: MSBuild / Windows / ${{ matrix.simd }}
    runs-on: windows-2022
    strategy:
      fail-fast: false
      matrix:
        simd: [none, sse, sse2, sse3, sse4, avx, avx2, neon]
    steps:
    - uses: actions/checkout@v4
    - uses: microsoft/setup-msbuild@v2
    - name: Retarget solution
      run: |
        vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath
        $vsInstallPath = vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath
        & "$vsInstallPath\Common7\IDE\devenv.com" cglm.sln /Upgrade
    # Map the matrix `simd` value to an MSVC-style /arch option and publish it
    # as CFLAGS for the msbuild steps. The value is read from the matrix
    # directly (no SIMD environment variable is defined for this job) and is
    # appended to GITHUB_ENV because variables set with $Env: inside one step
    # are not carried over to the next step.
    - name: Set SIMD flags
      shell: pwsh
      run: |
        $simd = "${{ matrix.simd }}"
        $archFlags = @{
          'none' = ''
          'sse'  = '-arch:SSE'
          'sse2' = '-arch:SSE2'
          'sse3' = '-arch:SSE3'
          'sse4' = '-arch:SSE4'
          'avx'  = '-arch:AVX'
          'avx2' = '-arch:AVX2'
          'neon' = '-arch:NEON'
        }
        # Unknown SIMD names fall back to no extra option.
        $flags = ''
        if ($archFlags.ContainsKey($simd)) {
          $flags = $archFlags[$simd]
        }
        "CFLAGS=$flags" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
    - name: Build (x86)
      working-directory: win
      run: msbuild cglm.vcxproj /p:Configuration=Release /p:Platform=x86 /p:PlatformToolset=v143 /p:BuildInParallel=true /p:AdditionalOptions="$Env:CFLAGS"
    - name: Build (x64)
      working-directory: win
      run: msbuild cglm.vcxproj /p:Configuration=Release /p:Platform=x64 /p:PlatformToolset=v143 /p:BuildInParallel=true /p:AdditionalOptions="$Env:CFLAGS"
  build_documentation:
    name: Documentation
@@ -176,57 +553,13 @@ jobs:
      working-directory: docs
      run: sphinx-build -W --keep-going source build
  build_meson:
    name: Meson / ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [macos-14, ubuntu-22.04]
    steps:
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        python-version: '3.12'
        cache: 'pip'
    - name: Install meson
      run: python3 -m pip install meson ninja
    - name: Build
      run: meson setup build -Dbuildtype=release --default-library=static -Dbuild_tests=true
    - name: Test
      run: meson test -C build
  build_msbuild:
    name: MSBuild / Windows
    runs-on: windows-2022
    # This has no test yet.
    # It could also try building for ARM, ARM64, ARM64EC, but those fail currently.
    steps:
    - uses: actions/checkout@v4
    - uses: microsoft/setup-msbuild@v2
    - name: Build (x86)
      working-directory: win
      run: msbuild cglm.vcxproj /p:Configuration=Release /p:Platform=x86 /p:BuildInParallel=true
    - name: Build (x64)
      working-directory: win
      run: msbuild cglm.vcxproj /p:Configuration=Release /p:Platform=x64 /p:BuildInParallel=true
  build_swift:
    name: Swift ${{ matrix.swift }} / ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
-        os: [macos-12, macos-14, ubuntu-22.04]
+        os: [macos-13, macos-14, ubuntu-22.04]
    # This has no test yet.
    steps:
@@ -234,3 +567,79 @@ jobs:
      - name: Build
        run: swift build
  build_cmake_arm:
    name: CMake / ARM / ${{ matrix.os }} / ${{ matrix.arch }} / ${{ matrix.simd }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        include:
          # Linux ARM builds
          - os: ubuntu-latest-arm64
            arch: arm64
            simd: neon
          - os: ubuntu-latest-arm64
            arch: armv7
            simd: neon
          - os: ubuntu-latest-arm64
            arch: armv7
            simd: none
          # Windows ARM builds
          - os: windows-latest-arm64
            arch: arm64
            simd: neon
          - os: windows-latest-arm64
            arch: arm
            simd: neon
          - os: windows-latest-arm64
            arch: arm
            simd: none
    steps:
    - uses: actions/checkout@v4
    # Configure step for the Windows entries of the ARM matrix. Builds a flag
    # string from matrix.arch / matrix.simd and passes it as CMAKE_C_FLAGS to
    # the "Visual Studio 17 2022" generator, targeting platform ARM64 when arch
    # is 'arm64' and ARM otherwise.
    # NOTE(review): the flags use GCC/Clang spellings (-m32, -march=...,
    # -mfpu=...) while this generator normally drives MSVC - confirm they have
    # the intended effect.
    - name: Configure CMake (Windows)
      if: runner.os == 'Windows'
      shell: pwsh
      run: |
        $flags = ""
        if ("${{ matrix.arch }}" -eq "arm") {
          $flags = "-m32 -march=armv7-a"
          if ("${{ matrix.simd }}" -eq "neon") {
            $flags += " -mfpu=neon"
          }
        }
        elseif ("${{ matrix.simd }}" -eq "neon") {
          $flags = "-march=armv8-a+simd"
        }
        cmake -B build -G "Visual Studio 17 2022" -A ${{ matrix.arch == 'arm64' && 'ARM64' || 'ARM' }} `
          -DCMAKE_BUILD_TYPE=Release `
          -DCMAKE_C_FLAGS="$flags" `
          -DCGLM_STATIC=ON -DCGLM_USE_TEST=ON
    # Configure step for the non-Windows entries of the ARM matrix, using the
    # Ninja generator. armv7 gets "-m32 -march=armv7-a" (plus hard-float NEON
    # flags when simd is 'neon'); any other arch with simd 'neon' gets
    # "-march=armv8-a+simd"; otherwise no extra C flags are passed.
    # NOTE(review): -m32 / -march=armv7-a presumably require a 32-bit ARM
    # (cross) toolchain on the arm64 runner - confirm one is installed.
    - name: Configure CMake (Unix)
      if: runner.os != 'Windows'
      shell: bash
      run: |
        flags=""
        if [ "${{ matrix.arch }}" = "armv7" ]; then
          flags="-m32 -march=armv7-a"
          if [ "${{ matrix.simd }}" = "neon" ]; then
            flags="$flags -mfpu=neon -mfloat-abi=hard"
          fi
        elif [ "${{ matrix.simd }}" = "neon" ]; then
          flags="-march=armv8-a+simd"
        fi
        cmake -B build -GNinja -DCMAKE_BUILD_TYPE=Release \
          -DCMAKE_C_FLAGS="$flags" \
          -DCGLM_STATIC=ON -DCGLM_USE_TEST=ON
    - name: Build
      run: cmake --build build
    - name: Test
      working-directory: build
      run: ./tests
--- a/.github/workflows/cmake-wasm.yml
+++ b/.github/workflows/cmake-wasm.yml
@@ -31,10 +31,20 @@ jobs:
        wget --no-verbose https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-${{matrix.wasi_sdk_version}}/wasi-sdk-${{matrix.wasi_sdk_version}}.0-linux.tar.gz
        tar xf wasi-sdk-${{matrix.wasi_sdk_version}}.0-linux.tar.gz
    # Building a wasm library without needing to define a main():
    #   https://github.com/WebAssembly/wasi-sdk/issues/332
    - name: Modify CMakeLists.txt for WASI
      run: |
        echo 'if (CMAKE_SYSTEM_NAME STREQUAL "WASI")' >> CMakeLists.txt
        echo '  target_link_options(${PROJECT_NAME} PRIVATE -mexec-model=reactor)' >> CMakeLists.txt
        echo 'endif()' >> CMakeLists.txt
    - name: Configure CMake
      # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
      # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
-      run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{matrix.BUILD_TYPE}} -DCMAKE_C_FLAGS="${{matrix.C_FLAGS}}" -DCMAKE_TOOLCHAIN_FILE=${{github.workspace}}/wasi-sdk-${{matrix.wasi_sdk_version}}.0/share/cmake/wasi-sdk.cmake -DWASI_SDK_PREFIX=${{github.workspace}}/wasi-sdk-${{matrix.wasi_sdk_version}}.0 -DCGLM_USE_TEST=ON
+      # Below suppress <<'clock' is deprecated: WASI lacks process-associated clocks; ...>> warns:
      #   -D_WASI_EMULATED_PROCESS_CLOCKS" -DCMAKE_EXE_LINKER_FLAGS="-lwasi-emulated-process-clocks
      run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{matrix.BUILD_TYPE}} -DCMAKE_C_FLAGS="${{matrix.C_FLAGS}} -D_WASI_EMULATED_PROCESS_CLOCKS" -DCMAKE_EXE_LINKER_FLAGS="-lwasi-emulated-process-clocks" -DCMAKE_TOOLCHAIN_FILE=${{github.workspace}}/wasi-sdk-${{matrix.wasi_sdk_version}}.0/share/cmake/wasi-sdk.cmake -DWASI_SDK_PREFIX=${{github.workspace}}/wasi-sdk-${{matrix.wasi_sdk_version}}.0 -DCGLM_STATIC=ON -DCGLM_SHARED=OFF -DCGLM_USE_TEST=ON
    - name: Build
      # Build your program with the given configuration
--- a/.gitignore
+++ b/.gitignore
@@ -80,3 +80,4 @@ confdefs.h
 cmake-build-debug
 *.o.tmp
 xcode/*
 .vscode
--- a/.gitmodules
+++ b/.gitmodules
--- a/.vscode/c_cpp_properties.json
+++ b/.vscode/c_cpp_properties.json
@@ -1,20 +0,0 @@
 {
    "configurations": [
        {
            "name": "Mac",
            "includePath": [
                "${workspaceFolder}/**"
            ],
            "defines": [],
            "macFrameworkPath": [
                "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks"
            ],
            "compilerPath": "/usr/bin/clang",
            "cStandard": "c23",
            "cppStandard": "c++23",
            "intelliSenseMode": "macos-clang-arm64",
            "configurationProvider": "vector-of-bool.cmake-tools"
        }
    ],
    "version": 4
 }
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,31 +0,0 @@
 {
  "C_Cpp.default.configurationProvider": "vector-of-bool.cmake-tools",
  "restructuredtext.confPath": "${workspaceFolder}/docs/source",
  "workbench.colorTheme": "Default Light+ Experimental",
  "editor.fontSize": 14,
  "workbench.colorCustomizations": {
 	"[Default Light*]": {
  	    "editor.background": "#fefefe",
  	    "sideBar.background": "#fefefe",
  	    "sideBar.foreground": "#343436",
  	    "sideBarTitle.foreground": "#343436",
  	    "sideBar.border": "#e2e2e4",
  	    "statusBar.background": "#fefefe",
  	    "titleBar.activeBackground": "#fefefe",
  	    "tab.activeBackground": "#f4fff4aa",
  	    "tab.inactiveBackground": "#fefefe",
  	    "activityBar.background": "#fefefe",
  	    "editorGroupHeader.tabsBackground": "#fefefe"
 	},
 	"[Default Dark*]": {
 		"editor.background": "#1D1D25",
 		"sideBar.background": "#1D1D25",
 		"statusBar.background": "#1D1D25",
 		"titleBar.activeBackground": "#1D1D25",
 		"tab.activeBackground": "#2C2C3A",
 		"tab.inactiveBackground": "#1D1D25",
 		"activityBar.background": "#1D1D25",
 		"editorGroupHeader.tabsBackground": "#1D1D25"
    }
  },
 }
--- a/BUILDING.md
+++ b/BUILDING.md
@@ -0,0 +1,199 @@
 # Building the library
 cglm can be built using one of the following build systems:
 ## CMake (All platforms)
 ```bash
 $ mkdir build
 $ cd build
 $ cmake .. # [Optional] -DCGLM_SHARED=ON
 $ make
 $ sudo make install # [Optional]
 ```
 ### Options with defaults
 ```CMake
 option(CGLM_SHARED "Shared build" ON)
 option(CGLM_STATIC "Static build" OFF)
 option(CGLM_USE_C99 "" OFF) # C11 
 option(CGLM_USE_TEST "Enable Tests" OFF) # for make check - make test
 ```
 ### Including in a CMake project
 #### Header only
 This requires no building or installation of cglm.
 * Example:
 ``` cmake
 cmake_minimum_required(VERSION 3.8.2)
 project(<Your Project Name>)
 add_executable(${PROJECT_NAME} src/main.c)
 target_link_libraries(${PROJECT_NAME} PRIVATE
  cglm_headers)
 add_subdirectory(external/cglm/ EXCLUDE_FROM_ALL)
 ```
 #### Linked
 * Example:
 ```cmake
 cmake_minimum_required(VERSION 3.8.2)
 project(<Your Project Name>)
 add_executable(${PROJECT_NAME} src/main.c)
 target_link_libraries(${PROJECT_NAME} PRIVATE
  cglm)
 add_subdirectory(external/cglm/)
 # or you can use find_package to configure cglm
 ```
 ### Using CMake to build for WebAssembly
 Since math functions like `sinf` are used, cglm cannot target `wasm32-unknown-unknown`; use either [wasi-sdk](https://github.com/WebAssembly/wasi-sdk) or [emscripten](https://github.com/emscripten-core/emsdk) instead.
 Note that shared builds are not yet supported for WebAssembly.
 For [simd128](https://github.com/WebAssembly/simd) support, add `-msimd128` to `CMAKE_C_FLAGS`, in command line `-DCMAKE_C_FLAGS="-msimd128"`.
 The CMake option `CGLM_USE_TEST` still works for WebAssembly builds, but you will need a WASI runtime to run the tests; see our [CI config file](.github/workflows/cmake-wasm.yml) for a detailed example.
 #### WASI SDK
 ```bash
 $ cmake .. \
  -DCMAKE_TOOLCHAIN_FILE=/path/to/wasi-sdk-19.0/share/cmake/wasi-sdk.cmake \
  -DWASI_SDK_PREFIX=/path/to/wasi-sdk-19.0
 ```
 Where `/path/to/wasi-sdk-19.0/` is the path to extracted [wasi sdk](https://github.com/WebAssembly/wasi-sdk).
 In this case it would by default make a static build.
 #### Emscripten
 ```bash
 $ emcmake cmake .. \
  -DCMAKE_EXE_LINKER_FLAGS="-s STANDALONE_WASM" \
  -DCGLM_STATIC=ON
 ```
 The `emcmake` here is the cmake wrapper for Emscripten from installed [emsdk](https://github.com/emscripten-core/emsdk).
 ## Meson (All platforms)
 ```bash
 $ meson build # [Optional] --default-library=static
 $ cd build
 $ ninja
 $ sudo ninja install # [Optional]
 ```
 ### Options with Defaults:
 ```meson
 c_std=c11
 buildtype=release
 default_library=shared
 build_tests=true # to run tests: ninja test
 ```
 ### Including in a Meson project
 * Example:
 ```meson
 # Clone cglm or create a cglm.wrap under <source_root>/subprojects
 project('name', 'c')
 cglm_dep = dependency('cglm', fallback : ['cglm', 'cglm_dep'])
 executable('exe', 'src/main.c', dependencies : cglm_dep)
 ```
 ## Swift (Swift Package Manager)
 Currently only default build options are supported. Add **cglm** dependency to your project:
 ```swift
 ...
 Package( 
  ...
  dependencies: [
    ...
    .package(url: "https://github.com/recp/cglm", .branch("master")),
  ]
  ...
 )
 ```
 Now add **cglm** as a dependency to your target. Product choices are:
 - **cglm** for inlined version of the library which can be linked only statically
 - **cglmc** for a compiled version of the library with no linking limitation
 ```swift
 ...
 .target(
  ...
  dependencies: [
    ...
    .product(name: "cglm", package: "cglm"),
  ]
  ...
 )
 ...
 ```
 ## Unix (Autotools)
 ```bash
 $ sh autogen.sh
 $ ./configure
 $ make
 $ make check # [Optional]
 $ [sudo] make install # [Optional]
 ```
 This will also install pkg-config files so you can use
 `pkg-config --cflags cglm` and `pkg-config --libs cglm` to retrieve compiler
 and linker flags.
 The files will be installed into the given prefix (usually `/usr/local` by
 default on Linux), but your pkg-config may not be configured to actually check
 there. You can figure out where it's looking by running `pkg-config --variable
 pc_path pkg-config` and change the path the files are installed to via
 `./configure --with-pkgconfigdir=/your/path`. Alternatively, you can add the
 prefix path to your `PKG_CONFIG_PATH` environment variable.
 ## Windows (MSBuild)
 Windows related build file and project files are located in `win` folder,
 make sure you are inside `cglm/win` folder.
 Code Analysis is enabled, so it may take a while to build.
 ```Powershell
 $ cd win
 $ .\build.bat
 ```
 If `msbuild` does not work (for example, because multiple Visual Studio versions are installed), try building with `devenv` instead:
 ```Powershell
 $ devenv cglm.sln /Build Release
 ```
 ### Running Tests on Windows
 The test project is in the same Visual Studio solution file. Running that project is enough to run the tests.
 # Building the documentation
 First you need to install Sphinx: http://www.sphinx-doc.org/en/master/usage/installation.html
 then:
 ```bash
 $ cd docs
 $ sphinx-build source build
 ```
 This compiles the docs into the `build` folder; open `index.html` inside that folder to view them.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
-cmake_minimum_required(VERSION 3.8.2)
+cmake_minimum_required(VERSION 3.13)
 project(cglm
-  VERSION 0.9.4
+  VERSION 0.9.6
  HOMEPAGE_URL https://github.com/recp/cglm
  DESCRIPTION "OpenGL Mathematics (glm) for C"
  LANGUAGES C
@@ -93,8 +93,10 @@ add_library(${PROJECT_NAME}
  src/mat4x2.c
  src/mat4x3.c
  src/plane.c
  src/noise.c
  src/frustum.c
  src/box.c
  src/aabb2d.c
  src/project.c
  src/sphere.c
  src/ease.c
--- a/Makefile.am
+++ b/Makefile.am
@@ -67,6 +67,7 @@ cglm_HEADERS = include/cglm/version.h \
               include/cglm/util.h \
               include/cglm/quat.h \
               include/cglm/plane.h \
               include/cglm/noise.h \
               include/cglm/frustum.h \
               include/cglm/box.h \
               include/cglm/aabb2d.h \
@@ -120,6 +121,7 @@ cglm_call_HEADERS = include/cglm/call/mat4.h \
                    include/cglm/call/quat.h \
                    include/cglm/call/euler.h \
                    include/cglm/call/plane.h \
                    include/cglm/call/noise.h \
                    include/cglm/call/frustum.h \
                    include/cglm/call/box.h \
                    include/cglm/call/project.h \
@@ -129,7 +131,8 @@ cglm_call_HEADERS = include/cglm/call/mat4.h \
                    include/cglm/call/bezier.h \
                    include/cglm/call/ray.h \
                    include/cglm/call/affine.h \
-                    include/cglm/call/affine2d.h
+                    include/cglm/call/affine2d.h \
                    include/cglm/call/aabb2d.h
 cglm_call_clipspacedir=$(includedir)/cglm/call/clipspace
 cglm_call_clipspace_HEADERS = include/cglm/call/clipspace/persp_lh_no.h \
@@ -210,6 +213,7 @@ cglm_struct_HEADERS = include/cglm/struct/mat4.h \
                      include/cglm/struct/quat.h \
                      include/cglm/struct/euler.h \
                      include/cglm/struct/plane.h \
                      include/cglm/struct/noise.h \
                      include/cglm/struct/frustum.h \
                      include/cglm/struct/box.h \
                      include/cglm/struct/aabb2d.h \
@@ -261,6 +265,7 @@ libcglm_la_SOURCES=\
    src/mat4x2.c \
    src/mat4x3.c \
    src/plane.c \
    src/noise.c \
    src/frustum.c \
    src/box.c \
    src/project.c \
@@ -270,6 +275,7 @@ libcglm_la_SOURCES=\
    src/bezier.c \
    src/ray.c \
    src/affine2d.c \
    src/aabb2d.c \
    src/clipspace/ortho_lh_no.c \
    src/clipspace/ortho_lh_zo.c \
    src/clipspace/ortho_rh_no.c \
--- a/README.md
+++ b/README.md
@@ -39,73 +39,100 @@
 <br>
 <p align="center">
-Highly optimized 2D|3D math library, also known as <b>OpenGL Mathematics (glm) for `C`</b>. <b>cglm</b> provides lot of utils to help math operations to be fast and quick to write. It is community friendly, feel free to bring any issues, bugs you faced. 
+A highly optimized 2D|3D math library. Also known as OpenGL Mathematics (glm) for C. <b>cglm</b> provides fast and ergonomic math functions to ease graphics programming. It is community friendly – feel free to report any bugs and issues you face. <br>
 <i>If you're using C++, you might want to check out <a href="https://github.com/g-truc/glm">GLM</a></i>
 </p>
 - Allocation-free
 - Header-only
 - SIMD-optimized
 - API-agnostic
 ---
-#### 📚 Documentation
+### 📚 Documentation
-Almost all functions (inline versions) and parameters are documented inside the corresponding headers. <br />
+All functions and their parameters are documented above their declaration inside their corresponding headers. <br />
-Complete documentation: http://cglm.readthedocs.io
+Alternatively, you can read the complete documentation [here](http://cglm.readthedocs.io).
-#### 📌 Note for previous versions:
+### 🔨 Building
- _dup (duplicate) is changed to _copy. For instance `glm_vec_dup -> glm_vec3_copy`
+cglm can be used in its entirety as a header-only library simply by including `cglm/cglm.h`. If you wish to link against it instead, it can be built using one of the supported build systems. Detailed information about building on individual platforms and build systems along with the instructions for building the documentation can be found in [BUILDING.md](./BUILDING.md).
 - OpenGL related functions are dropped to make this lib platform/third-party independent
 - make sure you have latest version and feel free to report bugs, troubles
 - **[bugfix]** euler angles was implemented in reverse order (extrinsic) it was fixed, now they are intrinsic. Make sure that
 you have the latest version
 - **[major change]** by starting v0.4.0, quaternions are stored as [x, y, z, w], it was [w, x, y, z] in v0.3.5 and earlier versions
 - **[api rename]** by starting v0.4.5, **glm_simd** functions are renamed to **glmm_**  
 - **[new option]** by starting v0.4.5, you can disable alignment requirement, check options in docs.  
 - **[major change]** by starting v0.5.0, vec3 functions use **glm_vec3_** namespace, it was **glm_vec_** until v0.5.0
 - **[major change]** by starting v0.5.1, built-in alignment is removed from **vec3** and **mat3** types
 - **[major change]** by starting v0.7.3, inline print functions are disabled in release/production mode to eliminate print costs (see options in documentation). Print output also improved. You can disable colors if you need  (see documentation)
 - **[major change]** by starting v0.8.3, **cglm** supports alternative clipspace configurations e.g. Left Handed, Zero-to-One (_zo)... `CGLM_FORCE_DEPTH_ZERO_TO_ONE` and `CGLM_FORCE_LEFT_HANDED` is provided to control clipspace. You should be able to use **cglm** with Vulkan, DirectX and Metal now... see https://cglm.readthedocs.io/en/latest/opt.html#clipspace-option-s
-#### 📌 Note for C++ developers:
+### ✅ Usage
 If you are not aware of the original GLM library yet, you may also want to look at:
 https://github.com/g-truc/glm
-#### 📌 Note for new comers (Important):
+#### Header-only
 - `vec4` and `mat4` variables must be aligned. (There will be unaligned versions later)
 - **in** and **[in, out]** parameters must be initialized (please). But **[out]** parameters not, initializing out param is  also redundant
 - All functions are inline if you don't want to use pre-compiled versions with glmc_ prefix, you can ignore build process. Just include headers.
 - if your debugger takes you to cglm headers then make sure you are not trying to copy vec4 to vec3 or alig issues...
 - Welcome!
-#### 📌 Note for experienced developers:
+Include the `cglm/cglm.h` header and use functions with the `glm_` prefix.
- Since I'm testing this library in my projects, sometimes bugs occurs; finding that bug[s] and making improvements would be more easy with multiple developer/contributor and their projects or knowledge. Consider to make some tests if you suspect something is wrong and any feedbacks, contributions and bug reports are always welcome.
+```c
 #include "cglm/cglm.h"
-#### 📌 Allocations?
+// ...
 `cglm` doesn't alloc any memory on heap. So it doesn't provide any allocator. You should alloc memory for **out** parameters too if you pass pointer of memory location. Don't forget that **vec4** (also quat/**versor**) and **mat4** must be aligned (16-bytes), because *cglm* uses SIMD instructions to optimize most operations if available.
-#### 📌 Returning vector or matrix... ?
+vec2 vector;
 glm_vec2_zero(vector);
 ```
-**cglm** supports both *ARRAY API* and *STRUCT API*, so you can return structs if you utilize struct api (`glms_`).
+#### Struct API
-<hr/>
+Include `cglm/struct.h` and use `glms_`.
 ```c
 #include "cglm/struct.h"
-<table>
+// ...
  <tbody>
    <tr>
      <td>
        <div>Like some other graphics libraries (especially OpenGL) this library use Column-Major layout to keep matrices in the memory. </div>
        <div>&nbsp;</div>
        <div>In the future the library may support an option to use row-major layout, CURRENTLY if you need to row-major layout you will need to transpose it. </div>
      </td>
      <td>
        <img src="https://upload.wikimedia.org/wikipedia/commons/3/3f/Matrix_Columns.svg" width="300px" />
      </td>
    </tr>
  </tbody>
 </table>
-## 🚀 Features
+vec2s vector = glms_vec2_zero();
- **scalar** and **simd** (sse, avx, neon...) optimizations
+```
- option to use different clipspaces e.g. Left Handed, Zero-to-One... (currently right handed negative-one is default)
+
- array api and struct api, you can use arrays or structs.
+#### Linked
 Include `cglm/call.h` and use `glmc_`.
 ```c
 #include "cglm/call.h"
 // ...
 vec2 vector;
 glmc_vec2_zero(vector);
 ```
 ### ❗ Alignment
 While cglm by default aligns what's necessary, it is possible to disable this by defining `CGLM_ALL_UNALIGNED`. If you're targeting machines with any kind of SIMD support, make sure that all `vec4`, `mat4` and `mat2` arguments you pass to cglm functions are aligned to prevent unexpected crashes, alternatively use the unaligned versions if present. 
 ### Struct API
 The struct API works as follows (note the `s` suffix on types, `glms_` prefix on functions and `GLMS_` on constants):
 ```C
 #include <cglm/struct.h>
 mat4s mat = GLMS_MAT4_IDENTITY_INIT;
 mat4s inv = glms_mat4_inv(mat);
 ```
 Struct functions generally take parameters *by copy* and *return* the results rather than taking pointers and writing to out parameters. That means your variables can usually be `const`, if you're into that.
 The types used are actually unions that allow access to the same data in multiple ways. One of these involves anonymous structures available since C11. MSVC supports them in earlier versions out of the box and GCC/Clang as well if you enable `-fms-extensions`.
 To explicitly enable anonymous structures `#define CGLM_USE_ANONYMOUS_STRUCT 1`, or `0` to disable them.
 For backwards compatibility, you can also `#define CGLM_NO_ANONYMOUS_STRUCT` to disable them. If you don't specify explicitly, cglm will attempt a best guess based on your compiler and C version.
 ### 📌 Migration notes:
 - `_dup` (duplicate) functions were renamed to `_copy`. For instance: `glm_vec_dup` -> `glm_vec3_copy`.
 - OpenGL related functions were dropped to make cglm API independent.
 - **[bugfix]** Euler angles had been previously implemented in reverse order (extrinsic). This was fixed to be intrinsic.
 - **[major change]** Starting with **v0.4.0**, quaternions are stored as [x, y, z, w]. Previously it was [w, x, y, z].
 - **[api rename]** Starting with **v0.4.5**, `glm_simd_` functions are renamed to `glmm_`.
 - **[new option]** Starting with **v0.4.5**, alignment requirements can be disabled. Read more in the documentation.  
 - **[major change]** Starting with **v0.5.0**, vec3 functions occupy the **glm_vec3_** namespace. This used to be **glm_vec_** in earlier versions.
 - **[major change]** Starting with **v0.5.1**, `vec3` and `mat3` types are not aligned by default.
 - **[major change]** Starting with **v0.7.3**, inline print functions are disabled by default in release mode to eliminate printing costs (see the Options chapter of the docs). <br> Colored output can be disabled (see documentation).
 - **[major change]** Starting with **v0.8.3**, alternate clipspace configurations are supported. The `CGLM_FORCE_DEPTH_ZERO_TO_ONE` and `CGLM_FORCE_LEFT_HANDED` flags are provided to control clip depth and handedness. This makes it easier to incorporate cglm into projects using graphics APIs such as Vulkan or Metal. See https://cglm.readthedocs.io/en/latest/opt.html#clipspace-option-s
 ### 🚀 Features
 - scalar and simd (sse, avx, neon...) optimizations
 - general purpose matrix operations (mat4, mat3)
 - chain matrix multiplication (square only)
 - general purpose vector operations (cross, dot, rotate, proj, angle...)
@@ -117,49 +144,35 @@ https://github.com/g-truc/glm
 - quaternions
 - euler angles / yaw-pitch-roll to matrix
 - extract euler angles
 - inline or pre-compiled function call
 - frustum (extract view frustum planes, corners...)
 - bounding box (AABB in Frustum (culling), crop, merge...)
 - bounding sphere
 - project, unproject
 - easing functions
 - curves
- curve interpolation helpers (S*M*C, deCasteljau...)
+- curve interpolation helpers (SMC, deCasteljau...)
- helpers to convert cglm types to Apple's simd library to pass cglm types to Metal GL without packing them on both sides
+- conversion helpers from cglm types to Apple's simd library to pass cglm types to Metal GL without packing them on both sides
 - ray intersection helpers
- and others...
+---
-<hr />
+<table>
  <tbody>
    <tr>
      <td>
        <div>Like other graphics libraries (especially OpenGL), cglm uses column-major layout to keep matrices in memory. </div>
        <div>&nbsp;</div>
        <div>While we might support row-major matrices in the future, currently if you need your matrices to be in row-major layout you have to transpose them. </div>
      </td>
      <td>
        <img src="https://upload.wikimedia.org/wikipedia/commons/3/3f/Matrix_Columns.svg" width="300px" />
      </td>
    </tr>
  </tbody>
 </table>
-You have two options to call a function/operation: inline or library call (link)
+---
 Almost all functions are marked inline (always_inline) so compiler will probably inline.
 To call pre-compiled versions, just use `glmc_` (c stands for 'call') instead of `glm_`.
-```C
+cglm contains general purpose mat4 product and inverse functions but also provides optimized versions for affine transformations. If you want to multiply two affine transformation matrices you can use glm_mul instead of glm_mat4_mul and glm_inv_tr (ROT + TR) instead of glm_mat4_inv.
  #include <cglm/cglm.h>   /* for inline */
  #include <cglm/call.h>   /* for library call (this also includes cglm.h) */
  mat4 rot, trans, rt;
  /* ... */
  glm_mul(trans, rot, rt);  /* inline */
  glmc_mul(trans, rot, rt); /* call from library */
 ```
 Most of math functions are optimized manually with SSE2 if available, if not? Dont worry there are non-sse versions of all operations
 You can pass matrices and vectors as array to functions rather than get address.
 ```C
  mat4 m = {
    1, 0, 0, 0,
    0, 1, 0, 0,
    0, 0, 1, 0,
    0, 0, 0, 1
  };
  glm_translate(m, (vec3){1.0f, 0.0f, 0.0f});
 ```
 Library contains general purpose mat4 mul and inverse functions, and also contains some special forms (optimized) of these functions for affine transformations' matrices. If you want to multiply two affine transformation matrices you can use glm_mul instead of glm_mat4_mul and glm_inv_tr (ROT + TR) instead glm_mat4_inv
 ```C
 /* multiplication */
 mat4 modelMat;
@@ -169,311 +182,9 @@ glm_mul(T, R, modelMat);
 glm_inv_tr(modelMat);
 ```
 ### Struct API
 The struct API works as follows, note the `s` suffix on types, the `glms_` prefix on functions and the `GLMS_` prefix on constants:
 ```C
 #include <cglm/struct.h>
 mat4s mat = GLMS_MAT4_IDENTITY_INIT;
 mat4s inv = glms_mat4_inv(mat);
 ```
 Struct functions generally take their parameters as *values* and *return* their results, rather than taking pointers and writing to out parameters. That means your parameters can usually be `const`, if you're into that.
 The types used are actually unions that allow access to the same data multiple ways. One of those ways involves anonymous structures, available since C11. MSVC also supports it for earlier C versions out of the box and GCC/Clang do if you enable `-fms-extensions`. To explicitly enable these anonymous structures, `#define CGLM_USE_ANONYMOUS_STRUCT` to `1`, to disable them, to `0`. For backward compatibility, you can also `#define CGLM_NO_ANONYMOUS_STRUCT` (value is irrelevant) to disable them. If you don't specify explicitly, cglm will do a best guess based on your compiler and the C version you're using.
 ## 🔨 Build
 ### CMake (All platforms)
 ```bash
 $ mkdir build
 $ cd build
 $ cmake .. # [Optional] -DCGLM_SHARED=ON
 $ make
 $ sudo make install # [Optional]
 ```
 ##### Cmake options with Defaults:
 ```CMake
 option(CGLM_SHARED "Shared build" ON)
 option(CGLM_STATIC "Static build" OFF)
 option(CGLM_USE_C99 "" OFF) # C11 
 option(CGLM_USE_TEST "Enable Tests" OFF) # for make check - make test
 ```
 #### Use as header-only library with your CMake project
 This requires no building or installation of cglm.
 * Example:
 ``` cmake
 cmake_minimum_required(VERSION 3.8.2)
 project(<Your Project Name>)
 add_executable(${PROJECT_NAME} src/main.c)
 target_link_libraries(${LIBRARY_NAME} PRIVATE
  cglm_headers)
 add_subdirectory(external/cglm/ EXCLUDE_FROM_ALL)
 ```
 #### Use with your CMake project
 * Example:
 ```cmake
 cmake_minimum_required(VERSION 3.8.2)
 project(<Your Project Name>)
 add_executable(${PROJECT_NAME} src/main.c)
 target_link_libraries(${LIBRARY_NAME} PRIVATE
  cglm)
 add_subdirectory(external/cglm/)
 # or you can use find_package to configure cglm
 ```
 #### Use CMake to build for WebAssembly
 Since math functions like `sinf` is used, this can not be targeted at `wasm32-unknown-unknown`, one of [wasi-sdk](https://github.com/WebAssembly/wasi-sdk) or [emscripten](https://github.com/emscripten-core/emsdk) should be used.
 Should note that shared build is not yet supported for WebAssembly.
 For [simd128](https://github.com/WebAssembly/simd) support, add `-msimd128` to `CMAKE_C_FLAGS`, in command line `-DCMAKE_C_FLAGS="-msimd128"`.
 For tests, the cmake option `CGLM_USE_TEST` would still work, you'll need a wasi runtime for running tests, see our [ci config file](.github/workflows/cmake-wasm.yml) for a detailed example.
 ##### Use CMake and WASI SDK to build for WebAssembly
 ```bash
 $ cmake .. \
  -DCMAKE_TOOLCHAIN_FILE=/path/to/wasi-sdk-19.0/share/cmake/wasi-sdk.cmake \
  -DWASI_SDK_PREFIX=/path/to/wasi-sdk-19.0
 ```
 Where `/path/to/wasi-sdk-19.0/` is the path to extracted [wasi sdk](https://github.com/WebAssembly/wasi-sdk).
 In this case it would by default make a static build.
 ##### Use CMake and Emscripten SDK to build for WebAssembly
 ```bash
 $ emcmake cmake .. \
  -DCMAKE_EXE_LINKER_FLAGS="-s STANDALONE_WASM" \
  -DCGLM_STATIC=ON
 ```
 The `emcmake` here is the cmake wrapper for Emscripten from installed [emsdk](https://github.com/emscripten-core/emsdk).
 ### Meson (All platforms)
 ```bash
 $ meson build # [Optional] --default-library=static
 $ cd build
 $ ninja
 $ sudo ninja install # [Optional]
 ```
 ##### Meson options with Defaults:
 ```meson
 c_std=c11
 buildtype=release
 default_library=shared
 build_tests=true # to run tests: ninja test
 ```
 #### Use with your Meson project
 * Example:
 ```meson
 # Clone cglm or create a cglm.wrap under <source_root>/subprojects
 project('name', 'c')
 cglm_dep = dependency('cglm', fallback : 'cglm', 'cglm_dep')
 executable('exe', 'src/main.c', dependencies : cglm_dep)
 ```
 ### Swift (Swift Package Manager)
 Currently only default build options are supported. Add **cglm** dependency to your project:
 ```swift
 ...
 Package( 
  ...
  dependencies: [
    ...
    .package(url: "https://github.com/recp/cglm", .branch("master")),
  ]
  ...
 )
 ```
 Now add **cgml** as a dependency to your target. Product choices are:
 - **cglm** for inlined version of the library which can be linked only statically
 - **cglmc** for a compiled version of the library with no linking limitation
 ```swift
 ...
 .target(
  ...
  dependencies: [
    ...
    .product(name: "cglm", package: "cglm"),
  ]
  ...
 )
 ...
 ```
 ### Unix (Autotools)
 ```bash
 $ sh autogen.sh
 $ ./configure
 $ make
 $ make check # [Optional]
 $ [sudo] make install # [Optional]
 ```
 This will also install pkg-config files so you can use
 `pkg-config --cflags cglm` and `pkg-config --libs cglm` to retrieve compiler
 and linker flags.
 The files will be installed into the given prefix (usually `/usr/local` by
 default on Linux), but your pkg-config may not be configured to actually check
 there. You can figure out where it's looking by running `pkg-config --variable
 pc_path pkg-config` and change the path the files are installed to via
 `./configure --with-pkgconfigdir=/your/path`. Alternatively, you can add the
 prefix path to your `PKG_CONFIG_PATH` environment variable.
 ### Windows (MSBuild)
 Windows related build file and project files are located in `win` folder,
 make sure you are inside `cglm/win` folder.
 Code Analysis is enabled, so it may take awhile to build.
 ```Powershell
 $ cd win
 $ .\build.bat
 ```
 if `msbuild` won't work (because of multi version VS) then try to build with `devenv`:
 ```Powershell
 $ devenv cglm.sln /Build Release
 ```
 #### Running Tests on Windows
 You can see test project in same visual studio solution file. It is enough to run that project to run tests.
 ### Building Docs
 First you need install Sphinx: http://www.sphinx-doc.org/en/master/usage/installation.html
 then:
 ```bash
 $ cd docs
 $ sphinx-build source build
 ```
 it will compile docs into build folder, you can run index.html inside that function.
 ## How to use
 If you want to use the inline versions of functions, then include the main header
 ```C
 #include <cglm/cglm.h>
 ```
 the header will include all headers. Then call the func you want e.g. rotate vector by axis:
 ```C
 glm_vec3_rotate(v1, glm_rad(45), (vec3){1.0f, 0.0f, 0.0f});
 ```
 some functions are overloaded :) e.g you can normalize vector:
 ```C
 glm_vec3_normalize(vec);
 ```
 this will normalize vec and store normalized vector into `vec` but if you will store normalized vector into another vector do this:
 ```C
 glm_vec3_normalize_to(vec, result);
 ```
 like this function you may see `_to` postfix, this functions store results to another variables and save temp memory
 to call pre-compiled versions include header with `c` postfix, c means call. Pre-compiled versions are just wrappers.
 ```C
 #include <cglm/call.h>
 ```
 this header will include all headers with c postfix. You need to call functions with c posfix:
 ```C
 glmc_vec3_normalize(vec);
 ```
 Function usage and parameters are documented inside related headers. You may see same parameter passed twice in some examples like this:
 ```C
 glm_mat4_mul(m1, m2, m1);
 /* or */
 glm_mat4_mul(m1, m1, m1);
 ```
 the first two parameter are **[in]** and the last one is **[out]** parameter. After multiplying *m1* and *m2*, the result is stored in *m1*. This is why we send *m1* twice. You may store the result in a different matrix, this is just an example.
 ### Example: Computing MVP matrix
 #### Option 1
 ```C
 mat4 proj, view, model, mvp;
 /* init proj, view and model ... */
 glm_mat4_mul(proj, view, viewProj);
 glm_mat4_mul(viewProj, model, mvp);
 ```
 #### Option 2
 ```C
 mat4 proj, view, model, mvp;
 /* init proj, view and model ... */
 glm_mat4_mulN((mat4 *[]){&proj, &view, &model}, 3, mvp);
 ```
 ## How to send matrix to OpenGL
 mat4 is array of vec4 and vec4 is array of floats. `glUniformMatrix4fv` functions accecpts `float*` as `value` (last param), so you can cast mat4 to float* or you can pass first column of matrix as beginning of memory of matrix:
 Option 1: Send first column
 ```C
 glUniformMatrix4fv(location, 1, GL_FALSE, matrix[0]);
 /* array of matrices */
 glUniformMatrix4fv(location, 1, GL_FALSE, matrix[0][0]);
 ```
 Option 2: Cast matrix to pointer type (also valid for multiple dimensional arrays)
 ```C
 glUniformMatrix4fv(location, 1, GL_FALSE, (float *)matrix);
 ```
 You can pass matrices the same way to other APIs e.g. Vulkan, DX...
 ## Notes
 - This library does not support double type... yet
 - If headers are not working properly with your compiler, IDE please open an issue, because I'm using GCC and clang to test it maybe sometimes MSVC
 **TODO:**
 - [ ] Unit tests (In Progress)
 - [ ] Unit tests for comparing cglm with glm results
 - [x] Add version info
 - [ ] Unaligned operations (e.g. `glm_umat4_mul`)
 - [x] Extra documentation
 - [x] ARM Neon Arch
 ## Contributors
-This project exists thanks to all the people who contribute. [[Contribute](CONTRIBUTING.md)].
+This project exists thanks to all the people who contribute. [[Contribute](CONTRIBUTING.md)]
 <a href="https://github.com/recp/cglm/graphs/contributors"><img src="https://opencollective.com/cglm/contributors.svg?width=890&button=false" /></a>
@@ -498,6 +209,3 @@ Support this project by becoming a sponsor. Your logo will show up here with a l
 <a href="https://opencollective.com/cglm/sponsor/7/website" target="_blank"><img src="https://opencollective.com/cglm/sponsor/7/avatar.svg"></a>
 <a href="https://opencollective.com/cglm/sponsor/8/website" target="_blank"><img src="https://opencollective.com/cglm/sponsor/8/avatar.svg"></a>
 <a href="https://opencollective.com/cglm/sponsor/9/website" target="_blank"><img src="https://opencollective.com/cglm/sponsor/9/avatar.svg"></a>
 ## License
 MIT. check the LICENSE file
--- a/cglm.podspec
+++ b/cglm.podspec
@@ -2,7 +2,7 @@ Pod::Spec.new do |s|
  # Description
  s.name         = "cglm"
-  s.version      = "0.9.3"
+  s.version      = "0.9.5"
  s.summary      = "📽 Highly Optimized Graphics Math (glm) for C"
  s.description  = <<-DESC
 cglm is math library for graphics programming for C. See the documentation or README for all features.
--- a/configure.ac
+++ b/configure.ac
@@ -7,7 +7,7 @@
 #*****************************************************************************
 AC_PREREQ([2.69])
-AC_INIT([cglm], [0.9.4], [info@recp.me])
+AC_INIT([cglm], [0.9.6], [info@recp.me])
 AM_INIT_AUTOMAKE([-Wall foreign subdir-objects serial-tests])
 # Don't use the default cflags (-O2 -g), we set ours manually in Makefile.am.
--- a/docs/source/aabb2d.rst
+++ b/docs/source/aabb2d.rst
@@ -82,7 +82,7 @@ Functions documentation
    | crops a bounding box with another one.
-    this could be useful for gettng a bbox which fits with view frustum and
+    this could be useful for getting a bbox which fits with view frustum and
    object bounding boxes. In this case you crop view frustum box with objects
    box
@@ -95,7 +95,7 @@ Functions documentation
    | crops a bounding box with another one.
-    this could be useful for gettng a bbox which fits with view frustum and
+    this could be useful for getting a bbox which fits with view frustum and
    object bounding boxes. In this case you crop view frustum box with objects
    box
--- a/docs/source/affine-pre.rst
+++ b/docs/source/affine-pre.rst
@@ -117,7 +117,7 @@ Functions documentation
    If you need to rotate object around itself e.g. center of object or at
    some point [of object] then `glm_rotate_at()` would be better choice to do so.
-    Even if object's model transform is identiy, rotation may not be around
+    Even if object's model transform is identity, rotation may not be around
    center of object if object does not lay out at ORIGIN perfectly.
    Using `glm_rotate_at()` with center of bounding shape ( AABB, Sphere ... )
--- a/docs/source/api_inline_array.rst
+++ b/docs/source/api_inline_array.rst
@@ -66,6 +66,7 @@ Follow the :doc:`build` documentation for this
   ivec4
   color
   plane
   noise
   project
   util
   io
--- a/docs/source/api_struct.rst
+++ b/docs/source/api_struct.rst
@@ -9,7 +9,7 @@ By default struct api adds `s` suffix to every type name e.g. vec3s, mat4s, vers
 Also struct api `s` suffix to namespace e.g. `glms_vec3_add`, `glms_mat4_mul` etc.
 By starting v0.9.0, struct api namespace is configurable. We can omit **glms_** namespace or 
-even change it with custom name to move existing api integrations to **cglm** more easliy...
+even change it with custom name to move existing api integrations to **cglm** more easily...
 We can also add **s** to function names if we want e.g. `glms_vec3_add()` -> `vec3_add()` or `vec3s_add()`.
 By including **cglm/struct.h** header you will include all struct api. It will also include **cglm/cglm.h** too. 
--- a/docs/source/box.rst
+++ b/docs/source/box.rst
@@ -62,7 +62,7 @@ Functions documentation
    | crops a bounding box with another one.
-    this could be useful for gettng a bbox which fits with view frustum and
+    this could be useful for getting a bbox which fits with view frustum and
    object bounding boxes. In this case you crop view frustum box with objects
    box
@@ -75,7 +75,7 @@ Functions documentation
    | crops a bounding box with another one.
-    this could be useful for gettng a bbox which fits with view frustum and
+    this could be useful for getting a bbox which fits with view frustum and
    object bounding boxes. In this case you crop view frustum box with objects
    box
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -62,9 +62,9 @@ author = u'Recep Aslantas'
 # built documents.
 #
 # The short X.Y version.
-version = u'0.9.4'
+version = u'0.9.6'
 # The full version, including alpha/beta/rc tags.
-release = u'0.9.4'
+release = u'0.9.6'
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -91,6 +91,7 @@ todo_include_todos = False
 # a list of builtin themes.
 #
 html_theme = 'sphinx_rtd_theme'
 pygments_style = 'monokai'
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
@@ -111,8 +112,11 @@ html_theme_options = {
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-# html_static_path = ['_static']
+html_static_path = ['sphinx-static']
 # Add custom CSS and JS files
 html_css_files = ['theme_overrides.css']
 html_js_files = []
 # -- Options for HTMLHelp output ------------------------------------------
--- a/docs/source/mat3x4.rst
+++ b/docs/source/mat3x4.rst
@@ -23,6 +23,17 @@ Functions:
 #. :c:func:`glm_mat3x4_transpose`
 #. :c:func:`glm_mat3x4_scale`
 Represented
 ~~~~~~~~~~~
 .. csv-table:: mat3x4
   :header: "", "column 1", "column 2", "column 3"
   "row 1", "m00", "m10", "m20"
   "row 2", "m01", "m11", "m21"
   "row 3", "m02", "m12", "m22"
   "row 4", "m03", "m13", "m23"
 Functions documentation
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -51,28 +62,74 @@ Functions documentation
      | *[in]*  **src**  pointer to an array of floats
      | *[out]* **dest** destination matrix3x4
-.. c:function:: void glm_mat3x4_mul(mat3x4 m1, mat4x3 m2, mat3 dest)
+.. c:function:: void glm_mat3x4_mul(mat3x4 m1, mat4x3 m2, mat4 dest)
    multiply m1 and m2 to dest
    .. code-block:: c
-       glm_mat3x4_mul(mat3x4, mat4x3, mat3);
+       glm_mat3x4_mul(mat3x4, mat4x3, mat4);
    Parameters:
      | *[in]*  **m1**    left matrix (mat3x4)
      | *[in]*  **m2**    right matrix (mat4x3)
-      | *[out]* **dest**  destination matrix (mat3)
+      | *[out]* **dest**  destination matrix (mat4)
-.. c:function:: void glm_mat3x4_mulv(mat3x4 m, vec4 v, vec3 dest)
+    .. csv-table:: mat3x4
        :header: "", "column 1", "column 2", "column 3"
-    multiply mat3x4 with vec4 (column vector) and store in dest vector
+        "row 1", "a00", "a10", "a20"
        "row 2", "a01", "a11", "a21"
        "row 3", "a02", "a12", "a22"
        "row 4", "a03", "a13", "a23"
    .. csv-table:: mat4x3
        :header: "", "column 1", "column 2", "column 3", "column 4"
        "row 1", "b00", "b10", "b20", "b30"
        "row 2", "b01", "b11", "b21", "b31"
        "row 3", "b02", "b12", "b22", "b32"
    .. csv-table:: mat4x4
        :header: "", "column 1", "column 2", "column 3", "column 4"
        "row 1", "a00 * b00 + a10 * b01 + a20 * b02", "a00 * b10 + a10 * b11 + a20 * b12", "a00 * b20 + a10 * b21 + a20 * b22", "a00 * b30 + a10 * b31 + a20 * b32"
        "row 2", "a01 * b00 + a11 * b01 + a21 * b02", "a01 * b10 + a11 * b11 + a21 * b12", "a01 * b20 + a11 * b21 + a21 * b22", "a01 * b30 + a11 * b31 + a21 * b32"
        "row 3", "a02 * b00 + a12 * b01 + a22 * b02", "a02 * b10 + a12 * b11 + a22 * b12", "a02 * b20 + a12 * b21 + a22 * b22", "a02 * b30 + a12 * b31 + a22 * b32"
        "row 4", "a03 * b00 + a13 * b01 + a23 * b02", "a03 * b10 + a13 * b11 + a23 * b12", "a03 * b20 + a13 * b21 + a23 * b22", "a03 * b30 + a13 * b31 + a23 * b32"
 .. c:function:: void glm_mat3x4_mulv(mat3x4 m, vec3 v, vec4 dest)
    multiply mat3x4 with vec3 (column vector) and store in dest vector
    Parameters:
      | *[in]*  **m**     mat3x4 (left)
-      | *[in]*  **v**     vec4 (right, column vector)
+      | *[in]*  **v**     vec3 (right, column vector)
      | *[out]* **dest**  destination (result, column vector)
    .. csv-table:: mat3x4
        :header: "", "column 1", "column 2", "column 3"
        "row 1", "m00", "m10", "m20"
        "row 2", "m01", "m11", "m21"
        "row 3", "m02", "m12", "m22"
        "row 4", "m03", "m13", "m23"
    .. csv-table:: column vec3 (1x3)
        :header: "", "column 1"
        "row 1", "v0"
        "row 2", "v1"
        "row 3", "v2"
    .. csv-table:: column vec4 (1x4)
        :header: "", "column 1"
        "row 1", "m00 * v0 + m10 * v1 + m20 * v2"
        "row 2", "m01 * v0 + m11 * v1 + m21 * v2"
        "row 3", "m02 * v0 + m12 * v1 + m22 * v2"
        "row 4", "m03 * v0 + m13 * v1 + m23 * v2"
 .. c:function:: void glm_mat3x4_transpose(mat3x4 m, mat4x3 dest)
    transpose matrix and store in dest
--- a/docs/source/mat4x2.rst
+++ b/docs/source/mat4x2.rst
@@ -23,6 +23,15 @@ Functions:
 #. :c:func:`glm_mat4x2_transpose`
 #. :c:func:`glm_mat4x2_scale`
 Represented
 ~~~~~~~~~~~
 .. csv-table:: mat4x2
   :header: "", "column 1", "column 2", "column 3", "column 4"
   "row 1", "m00", "m10", "m20", "m30"
   "row 2", "m01", "m11", "m21", "m31"
 Functions documentation
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -51,28 +60,70 @@ Functions documentation
      | *[in]*  **src**  pointer to an array of floats
      | *[out]* **dest** destination matrix4x2
-.. c:function:: void glm_mat4x2_mul(mat4x2 m1, mat2x4 m2, mat4 dest)
+.. c:function:: void glm_mat4x2_mul(mat4x2 m1, mat2x4 m2, mat2 dest)
    multiply m1 and m2 to dest
    .. code-block:: c
-       glm_mat4x2_mul(mat4x2, mat2x4, mat4);
+       glm_mat4x2_mul(mat4x2, mat2x4, mat2);
    Parameters:
      | *[in]*  **m1**    left matrix (mat4x2)
      | *[in]*  **m2**    right matrix (mat2x4)
-      | *[out]* **dest**  destination matrix (mat4)
+      | *[out]* **dest**  destination matrix (mat2)
-.. c:function:: void glm_mat4x2_mulv(mat4x2 m, vec2 v, vec4 dest)
+    .. csv-table:: mat4x2
        :header: "", "column 1", "column 2", "column 3", "column 4"
-    multiply mat4x2 with vec2 (column vector) and store in dest vector
+        "row 1", "a00", "a10", "a20", "a30"
        "row 2", "a01", "a11", "a21", "a31"
    .. csv-table:: mat2x4
        :header: "", "column 1", "column 2"
        "row 1", "b00", "b10"
        "row 2", "b01", "b11"
        "row 3", "b02", "b12"
        "row 4", "b03", "b13"
    .. csv-table:: mat2x2
        :header: "", "column 1", "column 2"
        "row 1", "a00 * b00 + a10 * b01 + a20 * b02 + a30 * b03", "a00 * b10 + a10 * b11 + a20 * b12 + a30 * b13"
        "row 2", "a01 * b00 + a11 * b01 + a21 * b02 + a31 * b03", "a01 * b10 + a11 * b11 + a21 * b12 + a31 * b13"
 .. c:function:: void glm_mat4x2_mulv(mat4x2 m, vec4 v, vec2 dest)
    multiply mat4x2 with vec4 (column vector) and store in dest vector
    Parameters:
      | *[in]*  **m**     mat4x2 (left)
-      | *[in]*  **v**     vec2 (right, column vector)
+      | *[in]*  **v**     vec4 (right, column vector)
      | *[out]* **dest**  destination (result, column vector)
    .. csv-table:: mat4x2
        :header: "", "column 1", "column 2", "column 3", "column 4"
        "row 1", "m00", "m10", "m20", "m30"
        "row 2", "m01", "m11", "m21", "m31"
    .. csv-table:: column vec4 (1x4)
        :header: "", "column 1"
        "row 1", "v0"
        "row 2", "v1"
        "row 3", "v2"
        "row 4", "v3"
    .. csv-table:: column vec2 (1x2)
        :header: "", "column 1"
        "row 1", "m00 * v0 + m10 * v1 + m20 * v2 + m30 * v3"
        "row 2", "m01 * v0 + m11 * v1 + m21 * v2 + m31 * v3"
 .. c:function:: void glm_mat4x2_transpose(mat4x2 m, mat2x4 dest)
    transpose matrix and store in dest
--- a/docs/source/mat4x3.rst
+++ b/docs/source/mat4x3.rst
@@ -23,6 +23,16 @@ Functions:
 #. :c:func:`glm_mat4x3_transpose`
 #. :c:func:`glm_mat4x3_scale`
 Represented
 ~~~~~~~~~~~
 .. csv-table:: mat4x3
   :header: "", "column 1", "column 2", "column 3", "column 4"
   "row 1", "m00", "m10", "m20", "m30"
   "row 2", "m01", "m11", "m21", "m31"
   "row 3", "m02", "m12", "m22", "m32"
 Functions documentation
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -51,28 +61,72 @@ Functions documentation
      | *[in]*  **src**  pointer to an array of floats
      | *[out]* **dest** destination matrix4x3
-.. c:function:: void glm_mat4x3_mul(mat4x3 m1, mat3x4 m2, mat4 dest)
+.. c:function:: void glm_mat4x3_mul(mat4x3 m1, mat3x4 m2, mat3 dest)
    multiply m1 and m2 to dest
    .. code-block:: c
-       glm_mat4x3_mul(mat4x3, mat3x4, mat4);
+       glm_mat4x3_mul(mat4x3, mat3x4, mat3);
    Parameters:
      | *[in]*  **m1**    left matrix (mat4x3)
      | *[in]*  **m2**    right matrix (mat3x4)
-      | *[out]* **dest**  destination matrix (mat4)
+      | *[out]* **dest**  destination matrix (mat3)
-.. c:function:: void glm_mat4x3_mulv(mat4x3 m, vec3 v, vec4 dest)
+    .. csv-table:: mat4x3
        :header: "", "column 1", "column 2", "column 3", "column 4"
-    multiply mat4x3 with vec3 (column vector) and store in dest vector
+        "row 1", "a00", "a10", "a20", "a30"
        "row 2", "a01", "a11", "a21", "a31"
        "row 3", "a02", "a12", "a22", "a32"
    .. csv-table:: mat3x4
        :header: "", "column 1", "column 2", "column 3"
        "row 1", "b00", "b10", "b20"
        "row 2", "b01", "b11", "b21"
        "row 3", "b02", "b12", "b22"
        "row 4", "b03", "b13", "b23"
    .. csv-table:: mat3x3
        :header: "", "column 1", "column 2", "column 3"
        "row 1", "a00 * b00 + a10 * b01 + a20 * b02 + a30 * b03", "a00 * b10 + a10 * b11 + a20 * b12 + a30 * b13", "a00 * b20 + a10 * b21 + a20 * b22 + a30 * b23"
        "row 2", "a01 * b00 + a11 * b01 + a21 * b02 + a31 * b03", "a01 * b10 + a11 * b11 + a21 * b12 + a31 * b13", "a01 * b20 + a11 * b21 + a21 * b22 + a31 * b23"
        "row 3", "a02 * b00 + a12 * b01 + a22 * b02 + a32 * b03", "a02 * b10 + a12 * b11 + a22 * b12 + a32 * b13", "a02 * b20 + a12 * b21 + a22 * b22 + a32 * b23"
 .. c:function:: void glm_mat4x3_mulv(mat4x3 m, vec4 v, vec3 dest)
    multiply mat4x3 with vec4 (column vector) and store in dest column vector
    Parameters:
      | *[in]*  **m**     mat4x3 (left)
-      | *[in]*  **v**     vec3 (right, column vector)
+      | *[in]*  **v**     vec4 (right, column vector)
      | *[out]* **dest**  destination (result, column vector)
    .. csv-table:: mat4x3
        :header: "", "column 1", "column 2", "column 3", "column 4"
        "row 1", "m00", "m10", "m20", "m30"
        "row 2", "m01", "m11", "m21", "m31"
        "row 3", "m02", "m12", "m22", "m32"
    .. csv-table:: column vec4 (1x4)
        :header: "", "column 1"
        "row 1", "v0"
        "row 2", "v1"
        "row 3", "v2"
        "row 4", "v3"
    .. csv-table:: column vec3 (1x3)
        :header: "", "column 1"
        "row 1", "m00 * v0 + m10 * v1 + m20 * v2 + m30 * v3"
        "row 2", "m01 * v0 + m11 * v1 + m21 * v2 + m31 * v3"
        "row 3", "m02 * v0 + m12 * v1 + m22 * v2 + m32 * v3"
 .. c:function:: void glm_mat4x3_transpose(mat4x3 m, mat3x4 dest)
    transpose matrix and store in dest
--- a/docs/source/noise.rst
+++ b/docs/source/noise.rst
@@ -0,0 +1,60 @@
 .. default-domain:: C
 perlin
 ================================================================================
 Header: cglm/noise.h
 Classic Perlin noise implementation.
 Based on the work of Stefan Gustavson and Ashima Arts on "webgl-noise":
 https://github.com/stegu/webgl-noise
 Following Stefan Gustavson's paper "Simplex noise demystified":
 http://www.itn.liu.se/~stegu/simplexnoise/simplexnoise.pdf
 Implementation based on glm::perlin function:
 https://github.com/g-truc/glm/blob/master/glm/gtc/noise.inl
 Table of contents (click to go):
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Functions:
 1. :c:func:`glm_perlin_vec4`
 #. :c:func:`glm_perlin_vec3`
 #. :c:func:`glm_perlin_vec2`
 Functions documentation
 ~~~~~~~~~~~~~~~~~~~~~~~
 .. c:function:: float  glm_perlin_vec4(vec4 point)
    | Classic Perlin noise
    Parameters:
      | *[in]*  **point**  4D point
    Returns:
      | noise value
 .. c:function:: float  glm_perlin_vec3(vec3 point)
    | Classic Perlin noise
    Parameters:
      | *[in]*  **point**  3D point
    Returns:
      | noise value
 .. c:function:: float  glm_perlin_vec2(vec2 point)
    | Classic Perlin noise
    Parameters:
      | *[in]*  **point**  2D point
    Returns:
      | noise value
--- a/docs/source/opengl.rst
+++ b/docs/source/opengl.rst
@@ -34,7 +34,7 @@ array of matrices:
   /* ... */
   glUniformMatrix4fv(location, count, GL_FALSE, matrix[0][0]);
-1. Cast matrix to pointer
+2. Cast matrix to pointer
 --------------------------
 .. code-block:: c
--- a/docs/source/opt.rst
+++ b/docs/source/opt.rst
@@ -76,7 +76,7 @@ SSE and SSE2 Shuffle Option
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 **_mm_shuffle_ps** generates **shufps** instruction even if registers are same.
 You can force it to generate **pshufd** instruction by defining
-**CGLM_USE_INT_DOMAIN** macro. As default it is not defined.
+**CGLM_NO_INT_DOMAIN** macro. As default it is not defined.
 SSE3 and SSE4 Dot Product Options
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/quat.rst
+++ b/docs/source/quat.rst
@@ -11,9 +11,9 @@ Header: cglm/quat.h
 What you can do with quaternions with existing functions is (Some of them):
- You can rotate transform matrix using quaterion
+- You can rotate transform matrix using quaternion
- You can rotate vector using quaterion
+- You can rotate vector using quaternion
- You can create view matrix using quaterion
+- You can create view matrix using quaternion
 - You can create a lookrotation (from source point to dest)
 Table of contents (click to go):
@@ -55,6 +55,7 @@ Functions:
 #. :c:func:`glm_quat_lerp`
 #. :c:func:`glm_quat_nlerp`
 #. :c:func:`glm_quat_slerp`
 #. :c:func:`glm_quat_slerp_longest`
 #. :c:func:`glm_quat_look`
 #. :c:func:`glm_quat_for`
 #. :c:func:`glm_quat_forp`
@@ -351,6 +352,17 @@ Functions documentation
      | *[in]*  **t**     interpolant (amount) clamped between 0 and 1
      | *[out]* **dest**  result quaternion
 .. c:function:: void glm_quat_slerp_longest(versor q, versor r, float  t, versor dest)
    | interpolates between two quaternions
    | using spherical linear interpolation (SLERP) and always takes the longest path
    Parameters:
      | *[in]*  **q**     from quaternion
      | *[in]*  **r**     to quaternion
      | *[in]*  **t**     interpolant (amount) clamped between 0 and 1
      | *[out]* **dest**  result quaternion
 .. c:function:: void  glm_quat_look(vec3 eye, versor ori, mat4 dest)
    | creates view matrix using quaternion as camera orientation
--- a/docs/source/sphinx-static/theme_overrides.css
+++ b/docs/source/sphinx-static/theme_overrides.css
@@ -0,0 +1,12 @@
@media screen {
  /* content column
   *
   * RTD theme's default is 800px as max width for the content, but we have
   * tables with tons of columns, which need the full width of the view-port.
   *
   * Comment from yocto project theme_overrides.css
   */
  .wy-nav-content{ max-width: none; }
 }
--- a/docs/source/vec2-ext.rst
+++ b/docs/source/vec2-ext.rst
@@ -27,6 +27,8 @@ Functions:
 #. :c:func:`glm_vec2_isvalid`
 #. :c:func:`glm_vec2_sign`
 #. :c:func:`glm_vec2_abs`
 #. :c:func:`glm_vec2_fract`
 #. :c:func:`glm_vec2_floor`
 #. :c:func:`glm_vec2_sqrt`
 Functions documentation
@@ -134,6 +136,22 @@ Functions documentation
      | *[in]*   **v**     vector
      | *[out]*  **dest**  destination vector
 .. c:function:: void glm_vec2_fract(vec2 v, vec2 dest)
    get fractional part of each vector item
    Parameters:
      | *[in]*   **v**     vector
      | *[out]*  **dest**  destination vector
 .. c:function:: void glm_vec2_floor(vec2 v, vec2 dest)
    floor value of each vector item
    Parameters:
      | *[in]*   **v**     vector
      | *[out]*  **dest**  destination vector
 .. c:function:: void glm_vec2_sqrt(vec2 v, vec2 dest)
    square root of each vector item
--- a/docs/source/vec3-ext.rst
+++ b/docs/source/vec3-ext.rst
@@ -28,6 +28,8 @@ Functions:
 #. :c:func:`glm_vec3_isvalid`
 #. :c:func:`glm_vec3_sign`
 #. :c:func:`glm_vec3_abs`
 #. :c:func:`glm_vec3_fract`
 #. :c:func:`glm_vec3_floor`
 #. :c:func:`glm_vec3_sqrt`
 Functions documentation
@@ -151,6 +153,22 @@ Functions documentation
      | *[in]*   **v**     vector
      | *[out]*  **dest**  destination vector
 .. c:function:: void glm_vec3_fract(vec3 v, vec3 dest)
    fractional part of each vector item
    Parameters:
      | *[in]*   **v**     vector
      | *[out]*  **dest**  destination vector
 .. c:function:: void glm_vec3_floor(vec3 v, vec3 dest)
    floor of each vector item
    Parameters:
      | *[in]*   **v**     vector
      | *[out]*  **dest**  destination vector
 .. c:function:: void glm_vec3_sqrt(vec3 v, vec3 dest)
    square root of each vector item
--- a/docs/source/vec4-ext.rst
+++ b/docs/source/vec4-ext.rst
@@ -23,6 +23,15 @@ Functions:
 #. :c:func:`glm_vec4_eqv_eps`
 #. :c:func:`glm_vec4_max`
 #. :c:func:`glm_vec4_min`
 #. :c:func:`glm_vec4_isnan`
 #. :c:func:`glm_vec4_isinf`
 #. :c:func:`glm_vec4_isvalid`
 #. :c:func:`glm_vec4_sign`
 #. :c:func:`glm_vec4_abs`
 #. :c:func:`glm_vec4_fract`
 #. :c:func:`glm_vec4_floor`
 #. :c:func:`glm_vec4_sqrt`
 Functions documentation
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -129,6 +138,30 @@ Functions documentation
      | *[in]*   **v**     vector
      | *[out]*  **dest**  sign vector (only keeps signs as -1, 0, 1)
 .. c:function:: void glm_vec4_abs(vec4 v, vec4 dest)
    absolute value of each vector item
    Parameters:
      | *[in]*   **v**     vector
      | *[out]*  **dest**  destination vector (abs(v))
 .. c:function:: void glm_vec4_fract(vec4 v, vec4 dest)
    fractional part of each vector item
    Parameters:
      | *[in]*   **v**     vector
      | *[out]*  **dest**  destination vector (fract(v))
 .. c:function:: void glm_vec4_floor(vec4 v, vec4 dest)
    floor of each vector item
    Parameters:
      | *[in]*   **v**     vector
      | *[out]*  **dest**  destination vector (floor(v))
 .. c:function:: void glm_vec4_sqrt(vec4 v, vec4 dest)
    square root of each vector item
--- a/include/cglm/aabb2d.h
+++ b/include/cglm/aabb2d.h
@@ -95,7 +95,7 @@ glm_aabb2d_merge(vec2 aabb1[2], vec2 aabb2[2], vec2 dest[2]) {
 /*!
 * @brief crops a bounding aabb with another one.
 *
- * this could be useful for gettng a baabb which fits with view frustum and
+ * this could be useful for getting a baabb which fits with view frustum and
 * object bounding aabbes. In this case you crop view frustum aabb with objects
 * aabb
 *
@@ -116,7 +116,7 @@ glm_aabb2d_crop(vec2 aabb[2], vec2 cropAabb[2], vec2 dest[2]) {
 /*!
 * @brief crops a bounding aabb with another one.
 *
- * this could be useful for gettng a baabb which fits with view frustum and
+ * this could be useful for getting a baabb which fits with view frustum and
 * object bounding aabbes. In this case you crop view frustum aabb with objects
 * aabb
 *
@@ -268,4 +268,3 @@ glm_aabb2d_contains(vec2 aabb[2], vec2 other[2]) {
 }
 #endif /* cglm_aabb2d_h */
--- a/include/cglm/affine-pre.h
+++ b/include/cglm/affine-pre.h
@@ -215,7 +215,7 @@ glm_rotate_z(mat4 m, float angle, mat4 dest) {
 *   If you need to rotate object around itself e.g. center of object or at
 *   some point [of object] then `glm_rotate_at()` would be better choice to do so.
 *
- *   Even if object's model transform is identiy, rotation may not be around
+ *   Even if object's model transform is identity, rotation may not be around
 *   center of object if object does not lay out at ORIGIN perfectly.
 *
 *   Using `glm_rotate_at()` with center of bounding shape ( AABB, Sphere ... )
--- a/include/cglm/box.h
+++ b/include/cglm/box.h
@@ -75,7 +75,7 @@ glm_aabb_merge(vec3 box1[2], vec3 box2[2], vec3 dest[2]) {
 /*!
 * @brief crops a bounding box with another one.
 *
- * this could be useful for gettng a bbox which fits with view frustum and
+ * this could be useful for getting a bbox which fits with view frustum and
 * object bounding boxes. In this case you crop view frustum box with objects
 * box
 *
@@ -98,7 +98,7 @@ glm_aabb_crop(vec3 box[2], vec3 cropBox[2], vec3 dest[2]) {
 /*!
 * @brief crops a bounding box with another one.
 *
- * this could be useful for gettng a bbox which fits with view frustum and
+ * this could be useful for getting a bbox which fits with view frustum and
 * object bounding boxes. In this case you crop view frustum box with objects
 * box
 *
--- a/include/cglm/call.h
+++ b/include/cglm/call.h
@@ -32,6 +32,7 @@ extern "C" {
 #include "call/quat.h"
 #include "call/euler.h"
 #include "call/plane.h"
 #include "call/noise.h"
 #include "call/frustum.h"
 #include "call/aabb2d.h"
 #include "call/box.h"
--- a/include/cglm/call/aabb2d.h
+++ b/include/cglm/call/aabb2d.h
@@ -87,5 +87,3 @@ glmc_aabb2d_circle(vec2 aabb[2], vec3 s);
 }
 #endif
 #endif /* cglmc_aabb2d_h */
--- a/include/cglm/call/box.h
+++ b/include/cglm/call/box.h
@@ -76,4 +76,3 @@ glmc_aabb_sphere(vec3 box[2], vec4 s);
 }
 #endif
 #endif /* cglmc_box_h */
--- a/include/cglm/call/mat3x4.h
+++ b/include/cglm/call/mat3x4.h
@@ -27,11 +27,11 @@ glmc_mat3x4_make(const float * __restrict src, mat3x4 dest);
 CGLM_EXPORT
 void
-glmc_mat3x4_mul(mat3x4 m1, mat4x3 m2, mat3 dest);
+glmc_mat3x4_mul(mat3x4 m1, mat4x3 m2, mat4 dest);
 CGLM_EXPORT
 void
-glmc_mat3x4_mulv(mat3x4 m, vec4 v, vec3 dest);
+glmc_mat3x4_mulv(mat3x4 m, vec3 v, vec4 dest);
 CGLM_EXPORT
 void
--- a/include/cglm/call/mat4x2.h
+++ b/include/cglm/call/mat4x2.h
@@ -27,11 +27,11 @@ glmc_mat4x2_make(const float * __restrict src, mat4x2 dest);
 CGLM_EXPORT
 void
-glmc_mat4x2_mul(mat4x2 m1, mat2x4 m2, mat4 dest);
+glmc_mat4x2_mul(mat4x2 m1, mat2x4 m2, mat2 dest);
 CGLM_EXPORT
 void
-glmc_mat4x2_mulv(mat4x2 m, vec2 v, vec4 dest);
+glmc_mat4x2_mulv(mat4x2 m, vec4 v, vec2 dest);
 CGLM_EXPORT
 void
--- a/include/cglm/call/mat4x3.h
+++ b/include/cglm/call/mat4x3.h
@@ -27,11 +27,11 @@ glmc_mat4x3_make(const float * __restrict src, mat4x3 dest);
 CGLM_EXPORT
 void
-glmc_mat4x3_mul(mat4x3 m1, mat3x4 m2, mat4 dest);
+glmc_mat4x3_mul(mat4x3 m1, mat3x4 m2, mat3 dest);
 CGLM_EXPORT
 void
-glmc_mat4x3_mulv(mat4x3 m, vec3 v, vec4 dest);
+glmc_mat4x3_mulv(mat4x3 m, vec4 v, vec3 dest);
 CGLM_EXPORT
 void
--- a/include/cglm/call/noise.h
+++ b/include/cglm/call/noise.h
@@ -0,0 +1,31 @@
 /*
 * Copyright (c), Recep Aslantas.
 *
 * MIT License (MIT), http://opensource.org/licenses/MIT
 * Full license can be found in the LICENSE file
 */
 /* Include guard for the exported noise declarations in this header. */
 #ifndef cglmc_noise_h
 #define cglmc_noise_h
 /* Give the declarations C linkage when the header is included from C++. */
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include "../cglm.h"
 /* Exported Perlin noise entry point taking a vec4 point; returns a float.
  * NOTE(review): presumably the exported counterpart of glm_perlin_vec4 - confirm. */
 CGLM_EXPORT
 float
 glmc_perlin_vec4(vec4 point);
 /* Exported Perlin noise entry point taking a vec3 point; returns a float.
  * NOTE(review): presumably the exported counterpart of glm_perlin_vec3 - confirm. */
 CGLM_EXPORT
 float
 glmc_perlin_vec3(vec3 point);
 /* Exported Perlin noise entry point taking a vec2 point; returns a float.
  * NOTE(review): presumably the exported counterpart of glm_perlin_vec2 - confirm. */
 CGLM_EXPORT
 float
 glmc_perlin_vec2(vec2 point);
 #ifdef __cplusplus
 }
 #endif
 #endif /* cglmc_noise_h */
--- a/include/cglm/call/project.h
+++ b/include/cglm/call/project.h
@@ -37,5 +37,3 @@ glmc_pickmatrix(vec2 center, vec2 size, vec4 vp, mat4 dest);
 }
 #endif
 #endif /* cglmc_project_h */
--- a/include/cglm/call/quat.h
+++ b/include/cglm/call/quat.h
@@ -133,6 +133,10 @@ CGLM_EXPORT
 void
 glmc_quat_slerp(versor q, versor r, float t, versor dest);
 CGLM_EXPORT
 void
 glmc_quat_slerp_longest(versor q, versor r, float t, versor dest);
 CGLM_EXPORT
 void
 glmc_quat_look(vec3 eye, versor ori, mat4 dest);
--- a/include/cglm/call/vec2.h
+++ b/include/cglm/call/vec2.h
@@ -17,6 +17,18 @@ CGLM_EXPORT
 void
 glmc_vec2(float * __restrict v, vec2 dest);
 CGLM_EXPORT
 void
 glmc_vec2_fill(vec2 v, float val);
 CGLM_EXPORT
 bool
 glmc_vec2_eq(vec2 v, float val);
 CGLM_EXPORT
 bool
 glmc_vec2_eqv(vec2 a, vec2 b);
 CGLM_EXPORT
 void
 glmc_vec2_copy(vec2 a, vec2 dest);
@@ -177,10 +189,38 @@ CGLM_EXPORT
 void
 glmc_vec2_abs(vec2 v, vec2 dest);
 CGLM_EXPORT
 void
 glmc_vec2_fract(vec2 v, vec2 dest);
 CGLM_EXPORT
 void
 glmc_vec2_floor(vec2 v, vec2 dest);
 CGLM_EXPORT
 void
 glmc_vec2_mods(vec2 v, float s, vec2 dest);
 CGLM_EXPORT
 void
 glmc_vec2_swizzle(vec2 v, int mask, vec2 dest);
 CGLM_EXPORT
 void
 glmc_vec2_lerp(vec2 from, vec2 to, float t, vec2 dest);
 CGLM_EXPORT
 void
 glmc_vec2_step(vec2 edge, vec2 x, vec2 dest);
 CGLM_EXPORT
 void
 glmc_vec2_steps(float edge, vec2 x, vec2 dest);
 CGLM_EXPORT
 void
 glmc_vec2_stepr(vec2 edge, float x, vec2 dest);
 CGLM_EXPORT
 void
 glmc_vec2_complex_mul(vec2 a, vec2 b, vec2 dest);
--- a/include/cglm/call/vec3.h
+++ b/include/cglm/call/vec3.h
@@ -19,6 +19,7 @@ extern "C" {
 #define glmc_vec3_flipsign_to(v, dest) glmc_vec3_negate_to(v, dest)
 #define glmc_vec3_inv(v)               glmc_vec3_negate(v)
 #define glmc_vec3_inv_to(v, dest)      glmc_vec3_negate_to(v, dest)
 /* alias for the former glmc_vec3_step_uni name; forwards to glmc_vec3_steps.
  * No trailing ';' so the expansion stays a single expression and remains
  * safe in unbraced if/else (matches the glmc_vec4_step_uni alias). */
 #define glmc_vec3_step_uni(edge, x, dest) glmc_vec3_steps(edge, x, dest)
 CGLM_EXPORT
 void
@@ -232,10 +233,6 @@ glmc_vec3_mixc(vec3 from, vec3 to, float t, vec3 dest) {
  glmc_vec3_lerpc(from, to, t, dest);
 }
 CGLM_EXPORT
 void
 glmc_vec3_step_uni(float edge, vec3 x, vec3 dest);
 CGLM_EXPORT
 void
 glmc_vec3_step(vec3 edge, vec3 x, vec3 dest);
@@ -256,6 +253,10 @@ CGLM_EXPORT
 void
 glmc_vec3_smoothinterpc(vec3 from, vec3 to, float t, vec3 dest);
 CGLM_EXPORT
 void
 glmc_vec3_swizzle(vec3 v, int mask, vec3 dest);
 /* ext */
 CGLM_EXPORT
@@ -322,6 +323,22 @@ CGLM_EXPORT
 void
 glmc_vec3_fract(vec3 v, vec3 dest);
 CGLM_EXPORT
 void
 glmc_vec3_floor(vec3 v, vec3 dest);
 CGLM_EXPORT
 void
 glmc_vec3_mods(vec3 v, float s, vec3 dest);
 CGLM_EXPORT
 void
 glmc_vec3_steps(float edge, vec3 x, vec3 dest);
 CGLM_EXPORT
 void
 glmc_vec3_stepr(vec3 edge, float x, vec3 dest);
 CGLM_EXPORT
 float
 glmc_vec3_hadd(vec3 v);
--- a/include/cglm/call/vec4.h
+++ b/include/cglm/call/vec4.h
@@ -20,6 +20,7 @@ extern "C" {
 #define glmc_vec4_flipsign_to(v, dest)  glmc_vec4_negate_to(v, dest)
 #define glmc_vec4_inv(v)                glmc_vec4_negate(v)
 #define glmc_vec4_inv_to(v, dest)       glmc_vec4_negate_to(v, dest)
 #define glmc_vec4_step_uni(edge, x, dest) glmc_vec4_steps(edge, x, dest)
 CGLM_EXPORT
 void
@@ -205,10 +206,6 @@ glmc_vec4_mixc(vec4 from, vec4 to, float t, vec4 dest) {
  glmc_vec4_lerpc(from, to, t, dest);
 }
 CGLM_EXPORT
 void
 glmc_vec4_step_uni(float edge, vec4 x, vec4 dest);
 CGLM_EXPORT
 void
 glmc_vec4_step(vec4 edge, vec4 x, vec4 dest);
@@ -233,6 +230,10 @@ CGLM_EXPORT
 void
 glmc_vec4_cubic(float s, vec4 dest);
 CGLM_EXPORT
 void
 glmc_vec4_swizzle(vec4 v, int mask, vec4 dest);
 /* ext */
 CGLM_EXPORT
@@ -299,6 +300,22 @@ CGLM_EXPORT
 void
 glmc_vec4_fract(vec4 v, vec4 dest);
 CGLM_EXPORT
 void
 glmc_vec4_floor(vec4 v, vec4 dest);
 CGLM_EXPORT
 void
 glmc_vec4_mods(vec4 v, float s, vec4 dest);
 CGLM_EXPORT
 void
 glmc_vec4_steps(float edge, vec4 x, vec4 dest);
 CGLM_EXPORT
 void
 glmc_vec4_stepr(vec4 edge, float x, vec4 dest);
 CGLM_EXPORT
 float
 glmc_vec4_hadd(vec4 v);
@@ -323,4 +340,3 @@ glmc_vec4_refract(vec4 v, vec4 n, float eta, vec4 dest);
 }
 #endif
 #endif /* cglmc_vec4_h */
--- a/include/cglm/cglm.h
+++ b/include/cglm/cglm.h
@@ -30,6 +30,7 @@
 #include "quat.h"
 #include "euler.h"
 #include "plane.h"
 #include "noise.h"
 #include "aabb2d.h"
 #include "box.h"
 #include "color.h"
--- a/include/cglm/common.h
+++ b/include/cglm/common.h
@@ -51,6 +51,7 @@
 #define GLM_SHUFFLE4(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
 #define GLM_SHUFFLE3(z, y, x)    (((z) << 4) | ((y) << 2) | (x))
 #define GLM_SHUFFLE2(y, x)       (((y) << 2) | (x))
 #include "types.h"
 #include "simd/intrin.h"
--- a/include/cglm/io.h
+++ b/include/cglm/io.h
@@ -132,9 +132,9 @@ glm_mat4_print(mat4              matrix,
  for (i = 0; i < m; i++) {
    for (j = 0; j < n; j++) {
      if (matrix[i][j] < CGLM_PRINT_MAX_TO_SHORT)
-        cwi = sprintf(buff, "% .*f", CGLM_PRINT_PRECISION, (double)matrix[i][j]);
+        cwi = snprintf(buff, sizeof(buff), "% .*f", CGLM_PRINT_PRECISION, (double)matrix[i][j]);
      else
-        cwi = sprintf(buff, "% g", (double)matrix[i][j]);
+        cwi = snprintf(buff, sizeof(buff), "% g", (double)matrix[i][j]);
      cw[i] = GLM_MAX(cw[i], cwi);
    }
  }
@@ -175,9 +175,9 @@ glm_mat3_print(mat3              matrix,
  for (i = 0; i < m; i++) {
    for (j = 0; j < n; j++) {
      if (matrix[i][j] < CGLM_PRINT_MAX_TO_SHORT)
-        cwi = sprintf(buff, "% .*f", CGLM_PRINT_PRECISION, (double)matrix[i][j]);
+        cwi = snprintf(buff, sizeof(buff), "% .*f", CGLM_PRINT_PRECISION, (double)matrix[i][j]);
      else
-        cwi = sprintf(buff, "% g", (double)matrix[i][j]);
+        cwi = snprintf(buff, sizeof(buff), "% g", (double)matrix[i][j]);
      cw[i] = GLM_MAX(cw[i], cwi);
    }
  }
@@ -217,9 +217,9 @@ glm_mat2_print(mat2              matrix,
  for (i = 0; i < m; i++) {
    for (j = 0; j < n; j++) {
      if (matrix[i][j] < CGLM_PRINT_MAX_TO_SHORT)
-        cwi = sprintf(buff, "% .*f", CGLM_PRINT_PRECISION, (double)matrix[i][j]);
+        cwi = snprintf(buff, sizeof(buff), "% .*f", CGLM_PRINT_PRECISION, (double)matrix[i][j]);
      else
-        cwi = sprintf(buff, "% g", (double)matrix[i][j]);
+        cwi = snprintf(buff, sizeof(buff), "% g", (double)matrix[i][j]);
      cw[i] = GLM_MAX(cw[i], cwi);
    }
  }
--- a/include/cglm/mat2.h
+++ b/include/cglm/mat2.h
@@ -235,7 +235,7 @@ glm_mat2_scale(mat2 m, float s) {
  glmm_store(m[0], wasm_f32x4_mul(wasm_v128_load(m[0]),
                                  wasm_f32x4_splat(s)));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(m[0], _mm_mul_ps(_mm_loadu_ps(m[0]), _mm_set1_ps(s)));
+  glmm_store(m[0], _mm_mul_ps(_mm_loadu_ps(m[0]), glmm_set1(s)));
 #elif defined(CGLM_NEON_FP)
  vst1q_f32(m[0], vmulq_f32(vld1q_f32(m[0]), vdupq_n_f32(s)));
 #else
--- a/include/cglm/mat3.h
+++ b/include/cglm/mat3.h
@@ -334,7 +334,7 @@ glm_mat3_det(mat3 mat) {
        d = mat[1][0], e = mat[1][1], f = mat[1][2],
        g = mat[2][0], h = mat[2][1], i = mat[2][2];
-  return a * (e * i - h * f) - d * (b * i - c * h) + g * (b * f - c * e);
+  return a * (e * i - h * f) - d * (b * i - h * c) + g * (b * f - e * c);
 }
 /*!
@@ -346,24 +346,22 @@ glm_mat3_det(mat3 mat) {
 CGLM_INLINE
 void
 glm_mat3_inv(mat3 mat, mat3 dest) {
  float det;
  float a = mat[0][0], b = mat[0][1], c = mat[0][2],
        d = mat[1][0], e = mat[1][1], f = mat[1][2],
-        g = mat[2][0], h = mat[2][1], i = mat[2][2];
+        g = mat[2][0], h = mat[2][1], i = mat[2][2],
-  dest[0][0] =   e * i - f * h;
+        c1  = e * i - f * h, c2 = d * i - g * f, c3 = d * h - g * e,
-  dest[0][1] = -(b * i - h * c);
+        idt = 1.0f / (a * c1 - b * c2 + c * c3), ndt = -idt;
  dest[0][2] =   b * f - e * c;
  dest[1][0] = -(d * i - g * f);
  dest[1][1] =   a * i - c * g;
  dest[1][2] = -(a * f - d * c);
  dest[2][0] =   d * h - g * e;
  dest[2][1] = -(a * h - g * b);
  dest[2][2] =   a * e - b * d;
-  det = 1.0f / (a * dest[0][0] + b * dest[1][0] + c * dest[2][0]);
+  dest[0][0] = idt * c1;
-
+  dest[0][1] = ndt * (b * i - h * c);
-  glm_mat3_scale(dest, det);
+  dest[0][2] = idt * (b * f - e * c);
  dest[1][0] = ndt * c2;
  dest[1][1] = idt * (a * i - g * c);
  dest[1][2] = ndt * (a * f - d * c);
  dest[2][0] = idt * c3;
  dest[2][1] = ndt * (a * h - g * b);
  dest[2][2] = idt * (a * e - d * b);
 }
 /*!
--- a/include/cglm/mat3x4.h
+++ b/include/cglm/mat3x4.h
@@ -14,8 +14,8 @@
   CGLM_INLINE void glm_mat3x4_copy(mat3x4 mat, mat3x4 dest);
   CGLM_INLINE void glm_mat3x4_zero(mat3x4 mat);
   CGLM_INLINE void glm_mat3x4_make(const float * __restrict src, mat3x4 dest);
-   CGLM_INLINE void glm_mat3x4_mul(mat3x4 m1, mat4x3 m2, mat3 dest);
+   CGLM_INLINE void glm_mat3x4_mul(mat3x4 m1, mat4x3 m2, mat4 dest);
-   CGLM_INLINE void glm_mat3x4_mulv(mat3x4 m, vec4 v, vec3 dest);
+   CGLM_INLINE void glm_mat3x4_mulv(mat3x4 m, vec3 v, vec4 dest);
   CGLM_INLINE void glm_mat3x4_transpose(mat3x4 m, mat4x3 dest);
   CGLM_INLINE void glm_mat3x4_scale(mat3x4 m, float s);
 */
@@ -87,16 +87,16 @@ glm_mat3x4_make(const float * __restrict src, mat3x4 dest) {
 * @brief multiply m1 and m2 to dest
 *
 * @code
- * glm_mat3x4_mul(mat3x4, mat4x3, mat3);
+ * glm_mat3x4_mul(mat3x4, mat4x3, mat4);
 * @endcode
 *
 * @param[in]  m1   left matrix (mat3x4)
 * @param[in]  m2   right matrix (mat4x3)
- * @param[out] dest destination matrix (mat3)
+ * @param[out] dest destination matrix (mat4)
 */
 CGLM_INLINE
 void
-glm_mat3x4_mul(mat3x4 m1, mat4x3 m2, mat3 dest) {
+glm_mat3x4_mul(mat3x4 m1, mat4x3 m2, mat4 dest) {
  float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2], a03 = m1[0][3],
        a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2], a13 = m1[1][3],
        a20 = m1[2][0], a21 = m1[2][1], a22 = m1[2][2], a23 = m1[2][3],
@@ -106,21 +106,29 @@ glm_mat3x4_mul(mat3x4 m1, mat4x3 m2, mat3 dest) {
        b20 = m2[2][0], b21 = m2[2][1], b22 = m2[2][2],
        b30 = m2[3][0], b31 = m2[3][1], b32 = m2[3][2];
-  dest[0][0] = a00 * b00 + a01 * b10 + a02 * b20 + a03 * b30;
+  dest[0][0] = a00 * b00 + a10 * b01 + a20 * b02;
-  dest[0][1] = a00 * b01 + a01 * b11 + a02 * b21 + a03 * b31;
+  dest[0][1] = a01 * b00 + a11 * b01 + a21 * b02;
-  dest[0][2] = a00 * b02 + a01 * b12 + a02 * b22 + a03 * b32;
+  dest[0][2] = a02 * b00 + a12 * b01 + a22 * b02;
  dest[0][3] = a03 * b00 + a13 * b01 + a23 * b02;
-  dest[1][0] = a10 * b00 + a11 * b10 + a12 * b20 + a13 * b30;
+  dest[1][0] = a00 * b10 + a10 * b11 + a20 * b12;
-  dest[1][1] = a10 * b01 + a11 * b11 + a12 * b21 + a13 * b31;
+  dest[1][1] = a01 * b10 + a11 * b11 + a21 * b12;
-  dest[1][2] = a10 * b02 + a11 * b12 + a12 * b22 + a13 * b32;
+  dest[1][2] = a02 * b10 + a12 * b11 + a22 * b12;
  dest[1][3] = a03 * b10 + a13 * b11 + a23 * b12;
-  dest[2][0] = a20 * b00 + a21 * b10 + a22 * b20 + a23 * b30;
+  dest[2][0] = a00 * b20 + a10 * b21 + a20 * b22;
-  dest[2][1] = a20 * b01 + a21 * b11 + a22 * b21 + a23 * b31;
+  dest[2][1] = a01 * b20 + a11 * b21 + a21 * b22;
-  dest[2][2] = a20 * b02 + a21 * b12 + a22 * b22 + a23 * b32;
+  dest[2][2] = a02 * b20 + a12 * b21 + a22 * b22;
  dest[2][3] = a03 * b20 + a13 * b21 + a23 * b22;
  dest[3][0] = a00 * b30 + a10 * b31 + a20 * b32;
  dest[3][1] = a01 * b30 + a11 * b31 + a21 * b32;
  dest[3][2] = a02 * b30 + a12 * b31 + a22 * b32;
  dest[3][3] = a03 * b30 + a13 * b31 + a23 * b32;
 }
 /*!
- * @brief multiply matrix with column vector and store in dest vector
+ * @brief multiply matrix with column vector and store in dest column vector
 *
 * @param[in]  m    matrix (left)
 * @param[in]  v    vector (right, column vector)
@@ -128,12 +136,13 @@ glm_mat3x4_mul(mat3x4 m1, mat4x3 m2, mat3 dest) {
 */
 CGLM_INLINE
 void
-glm_mat3x4_mulv(mat3x4 m, vec4 v, vec3 dest) {
+glm_mat3x4_mulv(mat3x4 m, vec3 v, vec4 dest) {
-  float v0 = v[0], v1 = v[1], v2 = v[2], v3 = v[3];
+  float v0 = v[0], v1 = v[1], v2 = v[2];
-  dest[0] = m[0][0] * v0 + m[0][1] * v1 + m[0][2] * v2 + m[0][3] * v3;
+  dest[0] = m[0][0] * v0 + m[1][0] * v1 + m[2][0] * v2;
-  dest[1] = m[1][0] * v0 + m[1][1] * v1 + m[1][2] * v2 + m[1][3] * v3;
+  dest[1] = m[0][1] * v0 + m[1][1] * v1 + m[2][1] * v2;
-  dest[2] = m[2][0] * v0 + m[2][1] * v1 + m[2][2] * v2 + m[2][3] * v3;
+  dest[2] = m[0][2] * v0 + m[1][2] * v1 + m[2][2] * v2;
  dest[3] = m[0][3] * v0 + m[1][3] * v1 + m[2][3] * v2;
 }
 /*!
@@ -162,10 +171,9 @@ glm_mat3x4_transpose(mat3x4 m, mat4x3 dest) {
 CGLM_INLINE
 void
 glm_mat3x4_scale(mat3x4 m, float s) {
-  m[0][0] *= s;  m[1][0] *= s;   m[2][0] *= s;
+  m[0][0] *= s; m[0][1] *= s; m[0][2] *= s; m[0][3] *= s;
-  m[0][1] *= s;  m[1][1] *= s;   m[2][1] *= s;
+  m[1][0] *= s; m[1][1] *= s; m[1][2] *= s; m[1][3] *= s;
-  m[0][2] *= s;  m[1][2] *= s;   m[2][2] *= s;
+  m[2][0] *= s; m[2][1] *= s; m[2][2] *= s; m[2][3] *= s;
  m[0][3] *= s;  m[1][3] *= s;   m[2][3] *= s;
 }
 #endif
--- a/include/cglm/mat4.h
+++ b/include/cglm/mat4.h
@@ -520,6 +520,8 @@ void
 glm_mat4_transpose_to(mat4 m, mat4 dest) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
  glm_mat4_transp_wasm(m, dest);
 #elif defined(__AVX__)
  glm_mat4_transp_avx(m, dest);
 #elif defined( __SSE__ ) || defined( __SSE2__ )
  glm_mat4_transp_sse2(m, dest);
 #elif defined(CGLM_NEON_FP)
@@ -546,6 +548,8 @@ void
 glm_mat4_transpose(mat4 m) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
  glm_mat4_transp_wasm(m, m);
 #elif defined(__AVX__)
  glm_mat4_transp_avx(m, m);
 #elif defined( __SSE__ ) || defined( __SSE2__ )
  glm_mat4_transp_sse2(m, m);
 #elif defined(CGLM_NEON_FP)
@@ -645,51 +649,44 @@ glm_mat4_det(mat4 mat) {
 CGLM_INLINE
 void
 glm_mat4_inv(mat4 mat, mat4 dest) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
+#if defined(__wasm__) && defined(__wasm_simd128__)
  glm_mat4_inv_wasm(mat, dest);
 #elif defined( __SSE__ ) || defined( __SSE2__ )
  glm_mat4_inv_sse2(mat, dest);
 #elif defined(CGLM_NEON_FP)
  glm_mat4_inv_neon(mat, dest);
 #else
  float t[6];
  float det;
  float a = mat[0][0], b = mat[0][1], c = mat[0][2], d = mat[0][3],
        e = mat[1][0], f = mat[1][1], g = mat[1][2], h = mat[1][3],
        i = mat[2][0], j = mat[2][1], k = mat[2][2], l = mat[2][3],
-        m = mat[3][0], n = mat[3][1], o = mat[3][2], p = mat[3][3];
+        m = mat[3][0], n = mat[3][1], o = mat[3][2], p = mat[3][3],
-  t[0] = k * p - o * l; t[1] = j * p - n * l; t[2] = j * o - n * k;
+        c1  = k * p - l * o,  c2  = c * h - d * g,  c3  = i * p - l * m,
-  t[3] = i * p - m * l; t[4] = i * o - m * k; t[5] = i * n - m * j;
+        c4  = a * h - d * e,  c5  = j * p - l * n,  c6  = b * h - d * f, 
        c7  = i * n - j * m,  c8  = a * f - b * e,  c9  = j * o - k * n,
        c10 = b * g - c * f,  c11 = i * o - k * m,  c12 = a * g - c * e,
-  dest[0][0] =  f * t[0] - g * t[1] + h * t[2];
+        idt = 1.0f/(c8*c1+c4*c9+c10*c3+c2*c7-c12*c5-c6*c11), ndt = -idt;
  dest[1][0] =-(e * t[0] - g * t[3] + h * t[4]);
  dest[2][0] =  e * t[1] - f * t[3] + h * t[5];
  dest[3][0] =-(e * t[2] - f * t[4] + g * t[5]);
-  dest[0][1] =-(b * t[0] - c * t[1] + d * t[2]);
+  dest[0][0] = (f * c1  - g * c5  + h * c9)  * idt;
-  dest[1][1] =  a * t[0] - c * t[3] + d * t[4];
+  dest[0][1] = (b * c1  - c * c5  + d * c9)  * ndt;
-  dest[2][1] =-(a * t[1] - b * t[3] + d * t[5]);
+  dest[0][2] = (n * c2  - o * c6  + p * c10) * idt;
-  dest[3][1] =  a * t[2] - b * t[4] + c * t[5];
+  dest[0][3] = (j * c2  - k * c6  + l * c10) * ndt;
-  t[0] = g * p - o * h; t[1] = f * p - n * h; t[2] = f * o - n * g;
+  dest[1][0] = (e * c1  - g * c3  + h * c11) * ndt;
-  t[3] = e * p - m * h; t[4] = e * o - m * g; t[5] = e * n - m * f;
+  dest[1][1] = (a * c1  - c * c3  + d * c11) * idt;
  dest[1][2] = (m * c2  - o * c4  + p * c12) * ndt;
  dest[1][3] = (i * c2  - k * c4  + l * c12) * idt;
-  dest[0][2] =  b * t[0] - c * t[1] + d * t[2];
+  dest[2][0] = (e * c5  - f * c3  + h * c7)  * idt;
-  dest[1][2] =-(a * t[0] - c * t[3] + d * t[4]);
+  dest[2][1] = (a * c5  - b * c3  + d * c7)  * ndt;
-  dest[2][2] =  a * t[1] - b * t[3] + d * t[5];
+  dest[2][2] = (m * c6  - n * c4  + p * c8)  * idt;
-  dest[3][2] =-(a * t[2] - b * t[4] + c * t[5]);
+  dest[2][3] = (i * c6  - j * c4  + l * c8)  * ndt;
-  t[0] = g * l - k * h; t[1] = f * l - j * h; t[2] = f * k - j * g;
+  dest[3][0] = (e * c9  - f * c11 + g * c7)  * ndt;
-  t[3] = e * l - i * h; t[4] = e * k - i * g; t[5] = e * j - i * f;
+  dest[3][1] = (a * c9  - b * c11 + c * c7)  * idt;
-
+  dest[3][2] = (m * c10 - n * c12 + o * c8)  * ndt;
-  dest[0][3] =-(b * t[0] - c * t[1] + d * t[2]);
+  dest[3][3] = (i * c10 - j * c12 + k * c8)  * idt;
  dest[1][3] =  a * t[0] - c * t[3] + d * t[4];
  dest[2][3] =-(a * t[1] - b * t[3] + d * t[5]);
  dest[3][3] =  a * t[2] - b * t[4] + c * t[5];
  det = 1.0f / (a * dest[0][0] + b * dest[1][0]
              + c * dest[2][0] + d * dest[3][0]);
  glm_mat4_scale_p(dest, det);
 #endif
 }
--- a/include/cglm/mat4x2.h
+++ b/include/cglm/mat4x2.h
@@ -14,8 +14,8 @@
   CGLM_INLINE void glm_mat4x2_copy(mat4x2 mat, mat4x2 dest);
   CGLM_INLINE void glm_mat4x2_zero(mat4x2 mat);
   CGLM_INLINE void glm_mat4x2_make(const float * __restrict src, mat4x2 dest);
-   CGLM_INLINE void glm_mat4x2_mul(mat4x2 m1, mat2x4 m2, mat4 dest);
+   CGLM_INLINE void glm_mat4x2_mul(mat4x2 m1, mat2x4 m2, mat2 dest);
-   CGLM_INLINE void glm_mat4x2_mulv(mat4x2 m, vec2 v, vec4 dest);
+   CGLM_INLINE void glm_mat4x2_mulv(mat4x2 m, vec4 v, vec2 dest);
   CGLM_INLINE void glm_mat4x2_transpose(mat4x2 m, mat2x4 dest);
   CGLM_INLINE void glm_mat4x2_scale(mat4x2 m, float s);
 */
@@ -90,16 +90,16 @@ glm_mat4x2_make(const float * __restrict src, mat4x2 dest) {
 * @brief multiply m1 and m2 to dest
 *
 * @code
- * glm_mat4x2_mul(mat4x2, mat2x4, mat4);
+ * glm_mat4x2_mul(mat4x2, mat2x4, mat2);
 * @endcode
 *
 * @param[in]  m1   left matrix (mat4x2)
 * @param[in]  m2   right matrix (mat2x4)
- * @param[out] dest destination matrix (mat4)
+ * @param[out] dest destination matrix (mat2)
 */
 CGLM_INLINE
 void
-glm_mat4x2_mul(mat4x2 m1, mat2x4 m2, mat4 dest) {
+glm_mat4x2_mul(mat4x2 m1, mat2x4 m2, mat2 dest) {
  float a00 = m1[0][0], a01 = m1[0][1],
        a10 = m1[1][0], a11 = m1[1][1],
        a20 = m1[2][0], a21 = m1[2][1],
@@ -108,29 +108,15 @@ glm_mat4x2_mul(mat4x2 m1, mat2x4 m2, mat4 dest) {
        b00 = m2[0][0], b01 = m2[0][1], b02 = m2[0][2], b03 = m2[0][3],
        b10 = m2[1][0], b11 = m2[1][1], b12 = m2[1][2], b13 = m2[1][3];
-  dest[0][0] = a00 * b00 + a01 * b10;
+  dest[0][0] = a00 * b00 + a10 * b01 + a20 * b02 + a30 * b03;
-  dest[0][1] = a00 * b01 + a01 * b11;
+  dest[0][1] = a01 * b00 + a11 * b01 + a21 * b02 + a31 * b03;
  dest[0][2] = a00 * b02 + a01 * b12;
  dest[0][3] = a00 * b03 + a01 * b13;
-  dest[1][0] = a10 * b00 + a11 * b10;
+  dest[1][0] = a00 * b10 + a10 * b11 + a20 * b12 + a30 * b13;
-  dest[1][1] = a10 * b01 + a11 * b11;
+  dest[1][1] = a01 * b10 + a11 * b11 + a21 * b12 + a31 * b13;
  dest[1][2] = a10 * b02 + a11 * b12;
  dest[1][3] = a10 * b03 + a11 * b13;
  dest[2][0] = a20 * b00 + a21 * b10;
  dest[2][1] = a20 * b01 + a21 * b11;
  dest[2][2] = a20 * b02 + a21 * b12;
  dest[2][3] = a20 * b03 + a21 * b13;
  dest[3][0] = a30 * b00 + a31 * b10;
  dest[3][1] = a30 * b01 + a31 * b11;
  dest[3][2] = a30 * b02 + a31 * b12;
  dest[3][3] = a30 * b03 + a31 * b13;
 }
 /*!
- * @brief multiply matrix with column vector and store in dest vector
+ * @brief multiply matrix with column vector and store in dest column vector
 *
 * @param[in]  m    matrix (left)
 * @param[in]  v    vector (right, column vector)
@@ -138,13 +124,11 @@ glm_mat4x2_mul(mat4x2 m1, mat2x4 m2, mat4 dest) {
 */
 CGLM_INLINE
 void
-glm_mat4x2_mulv(mat4x2 m, vec2 v, vec4 dest) {
+glm_mat4x2_mulv(mat4x2 m, vec4 v, vec2 dest) {
-  float v0 = v[0], v1 = v[1];
+  float v0 = v[0], v1 = v[1], v2 = v[2], v3 = v[3];
-  dest[0] = m[0][0] * v0 + m[0][1] * v1;
+  dest[0] = m[0][0] * v0 + m[1][0] * v1 + m[2][0] * v2 + m[3][0] * v3;
-  dest[1] = m[1][0] * v0 + m[1][1] * v1;
+  dest[1] = m[0][1] * v0 + m[1][1] * v1 + m[2][1] * v2 + m[3][1] * v3;
  dest[2] = m[2][0] * v0 + m[2][1] * v1;
  dest[3] = m[3][0] * v0 + m[3][1] * v1;
 }
 /*!
--- a/include/cglm/mat4x3.h
+++ b/include/cglm/mat4x3.h
@@ -14,8 +14,8 @@
   CGLM_INLINE void glm_mat4x3_copy(mat4x3 mat, mat4x3 dest);
   CGLM_INLINE void glm_mat4x3_zero(mat4x3 mat);
   CGLM_INLINE void glm_mat4x3_make(const float * __restrict src, mat4x3 dest);
-   CGLM_INLINE void glm_mat4x3_mul(mat4x3 m1, mat3x4 m2, mat4 dest);
+   CGLM_INLINE void glm_mat4x3_mul(mat4x3 m1, mat3x4 m2, mat3 dest);
-   CGLM_INLINE void glm_mat4x3_mulv(mat4x3 m, vec3 v, vec4 dest);
+   CGLM_INLINE void glm_mat4x3_mulv(mat4x3 m, vec4 v, vec3 dest);
   CGLM_INLINE void glm_mat4x3_transpose(mat4x3 m, mat3x4 dest);
   CGLM_INLINE void glm_mat4x3_scale(mat4x3 m, float s);
 */
@@ -99,16 +99,16 @@ glm_mat4x3_make(const float * __restrict src, mat4x3 dest) {
 * @brief multiply m1 and m2 to dest
 *
 * @code
- * glm_mat4x3_mul(mat4x3, mat3x4, mat4);
+ * glm_mat4x3_mul(mat4x3, mat3x4, mat3);
 * @endcode
 *
 * @param[in]  m1   left matrix (mat4x3)
 * @param[in]  m2   right matrix (mat3x4)
- * @param[out] dest destination matrix (mat4)
+ * @param[out] dest destination matrix (mat3)
 */
 CGLM_INLINE
 void
-glm_mat4x3_mul(mat4x3 m1, mat3x4 m2, mat4 dest) {
+glm_mat4x3_mul(mat4x3 m1, mat3x4 m2, mat3 dest) {
  float a00 = m1[0][0], a01 = m1[0][1], a02 = m1[0][2],
        a10 = m1[1][0], a11 = m1[1][1], a12 = m1[1][2],
        a20 = m1[2][0], a21 = m1[2][1], a22 = m1[2][2],
@@ -118,29 +118,21 @@ glm_mat4x3_mul(mat4x3 m1, mat3x4 m2, mat4 dest) {
        b10 = m2[1][0], b11 = m2[1][1], b12 = m2[1][2], b13 = m2[1][3],
        b20 = m2[2][0], b21 = m2[2][1], b22 = m2[2][2], b23 = m2[2][3];
-  dest[0][0] = a00 * b00 + a01 * b10 + a02 * b20;
+  dest[0][0] = a00 * b00 + a10 * b01 + a20 * b02 + a30 * b03;
-  dest[0][1] = a00 * b01 + a01 * b11 + a02 * b21;
+  dest[0][1] = a01 * b00 + a11 * b01 + a21 * b02 + a31 * b03;
-  dest[0][2] = a00 * b02 + a01 * b12 + a02 * b22;
+  dest[0][2] = a02 * b00 + a12 * b01 + a22 * b02 + a32 * b03;
  dest[0][3] = a00 * b03 + a01 * b13 + a02 * b23;
-  dest[1][0] = a10 * b00 + a11 * b10 + a12 * b20;
+  dest[1][0] = a00 * b10 + a10 * b11 + a20 * b12 + a30 * b13;
-  dest[1][1] = a10 * b01 + a11 * b11 + a12 * b21;
+  dest[1][1] = a01 * b10 + a11 * b11 + a21 * b12 + a31 * b13;
-  dest[1][2] = a10 * b02 + a11 * b12 + a12 * b22;
+  dest[1][2] = a02 * b10 + a12 * b11 + a22 * b12 + a32 * b13;
  dest[1][3] = a10 * b03 + a11 * b13 + a12 * b23;
-  dest[2][0] = a20 * b00 + a21 * b10 + a22 * b20;
+  dest[2][0] = a00 * b20 + a10 * b21 + a20 * b22 + a30 * b23;
-  dest[2][1] = a20 * b01 + a21 * b11 + a22 * b21;
+  dest[2][1] = a01 * b20 + a11 * b21 + a21 * b22 + a31 * b23;
-  dest[2][2] = a20 * b02 + a21 * b12 + a22 * b22;
+  dest[2][2] = a02 * b20 + a12 * b21 + a22 * b22 + a32 * b23;
  dest[2][3] = a20 * b03 + a21 * b13 + a22 * b23;
  dest[3][0] = a30 * b00 + a31 * b10 + a32 * b20;
  dest[3][1] = a30 * b01 + a31 * b11 + a32 * b21;
  dest[3][2] = a30 * b02 + a31 * b12 + a32 * b22;
  dest[3][3] = a30 * b03 + a31 * b13 + a32 * b23;
 }
 /*!
- * @brief multiply matrix with column vector and store in dest vector
+ * @brief multiply matrix with column vector and store in dest column vector
 *
 * @param[in]  m    matrix (left)
 * @param[in]  v    vector (right, column vector)
@@ -148,13 +140,12 @@ glm_mat4x3_mul(mat4x3 m1, mat3x4 m2, mat4 dest) {
 */
 CGLM_INLINE
 void
-glm_mat4x3_mulv(mat4x3 m, vec3 v, vec4 dest) {
+glm_mat4x3_mulv(mat4x3 m, vec4 v, vec3 dest) {
-  float v0 = v[0], v1 = v[1], v2 = v[2];
+  float v0 = v[0], v1 = v[1], v2 = v[2], v3 = v[3];
-  dest[0] = m[0][0] * v0 + m[0][1] * v1 + m[0][2] * v2;
+  dest[0] = m[0][0] * v0 + m[1][0] * v1 + m[2][0] * v2 + m[3][0] * v3;
-  dest[1] = m[1][0] * v0 + m[1][1] * v1 + m[1][2] * v2;
+  dest[1] = m[0][1] * v0 + m[1][1] * v1 + m[2][1] * v2 + m[3][1] * v3;
-  dest[2] = m[2][0] * v0 + m[2][1] * v1 + m[2][2] * v2;
+  dest[2] = m[0][2] * v0 + m[1][2] * v1 + m[2][2] * v2 + m[3][2] * v3;
  dest[3] = m[3][0] * v0 + m[3][1] * v1 + m[3][2] * v2;
 }
 /*!
--- a/include/cglm/noise.h
+++ b/include/cglm/noise.h
@@ -0,0 +1,734 @@
 /*
 * Copyright (c), Recep Aslantas.
 *
 * MIT License (MIT), http://opensource.org/licenses/MIT
 * Full license can be found in the LICENSE file
 *
 * Based on the work of Stefan Gustavson and Ashima Arts on "webgl-noise":
 * https://github.com/stegu/webgl-noise
 * Following Stefan Gustavson's paper "Simplex noise demystified":
 * http://www.itn.liu.se/~stegu/simplexnoise/simplexnoise.pdf
 * 
 * Implementation based on glm::perlin function:
 * https://github.com/g-truc/glm/blob/master/glm/gtc/noise.inl
 */
 #ifndef cglm_noise_h
 #define cglm_noise_h
 #include "vec4.h"
 #include "vec4-ext.h"
 #include "vec3.h"
 #include "vec3-ext.h"
 #include "vec2.h"
 #include "vec2-ext.h"
 /* glm__noiseDetail_mod289(float x): x - floor(x / 289) * 289.
 * The argument is parenthesized so that compound expressions such as `a + b`
 * expand with the intended precedence. Note that x is evaluated twice.
 */
 #define glm__noiseDetail_mod289(x) ((x) - floorf((x) * (1.0f / 289.0f)) * 289.0f)
 /* glm__noiseDetail_permute(vec4 x, vec4 dest)
 *
 * Per component: dest[i] = mod289((x[i] * 34 + 1) * x[i]).
 * Every lane is read before it is written, so dest may be the same vector
 * as x. Both parameters are parenthesized before indexing so that pointer
 * expressions can be passed as arguments.
 */
 #define glm__noiseDetail_permute(x, dest) { \
  (dest)[0] = glm__noiseDetail_mod289(((x)[0] * 34.0f + 1.0f) * (x)[0]); \
  (dest)[1] = glm__noiseDetail_mod289(((x)[1] * 34.0f + 1.0f) * (x)[1]); \
  (dest)[2] = glm__noiseDetail_mod289(((x)[2] * 34.0f + 1.0f) * (x)[2]); \
  (dest)[3] = glm__noiseDetail_mod289(((x)[3] * 34.0f + 1.0f) * (x)[3]); \
 }
 /* glm__noiseDetail_fade_vec4(vec4 t, vec4 dest)
 *
 * Per-component quintic fade: dest = t^3 * (t * (t * 6 - 15) + 10).
 * dest is overwritten while t is still being read, so the two must be
 * distinct vectors.
 */
 #define glm__noiseDetail_fade_vec4(t, dest) {                 \
  vec4 t_cubed;                                                \
  /* t_cubed = t * t * t */                                    \
  glm_vec4_mul(t, t, t_cubed);                                 \
  glm_vec4_mul(t_cubed, t, t_cubed);                           \
  /* dest = t * (t * 6.0f - 15.0f) + 10.0f */                  \
  glm_vec4_scale(t, 6.0f, dest);                               \
  glm_vec4_subs(dest, 15.0f, dest);                            \
  glm_vec4_mul(t, dest, dest);                                 \
  glm_vec4_adds(dest, 10.0f, dest);                            \
  /* dest = t_cubed * dest */                                  \
  glm_vec4_mul(t_cubed, dest, dest);                           \
 }
 /* glm__noiseDetail_fade_vec3(vec3 t, vec3 dest)
 *
 * Per-component quintic fade: dest = t^3 * (t * (t * 6 - 15) + 10).
 * dest is overwritten while t is still being read, so the two must be
 * distinct vectors.
 */
 #define glm__noiseDetail_fade_vec3(t, dest) {                 \
  vec3 t_cubed;                                                \
  /* t_cubed = t * t * t */                                    \
  glm_vec3_mul(t, t, t_cubed);                                 \
  glm_vec3_mul(t_cubed, t, t_cubed);                           \
  /* dest = t * (t * 6.0f - 15.0f) + 10.0f */                  \
  glm_vec3_scale(t, 6.0f, dest);                               \
  glm_vec3_subs(dest, 15.0f, dest);                            \
  glm_vec3_mul(t, dest, dest);                                 \
  glm_vec3_adds(dest, 10.0f, dest);                            \
  /* dest = t_cubed * dest */                                  \
  glm_vec3_mul(t_cubed, dest, dest);                           \
 }
 /* glm__noiseDetail_fade_vec2(vec2 t, vec2 dest)
 *
 * Per-component quintic fade: dest = t^3 * (t * (t * 6 - 15) + 10).
 * dest is overwritten while t is still being read, so the two must be
 * distinct vectors.
 */
 #define glm__noiseDetail_fade_vec2(t, dest) {                 \
  vec2 t_cubed;                                                \
  /* t_cubed = t * t * t */                                    \
  glm_vec2_mul(t, t, t_cubed);                                 \
  glm_vec2_mul(t_cubed, t, t_cubed);                           \
  /* dest = t * (t * 6.0f - 15.0f) + 10.0f */                  \
  glm_vec2_scale(t, 6.0f, dest);                               \
  glm_vec2_subs(dest, 15.0f, dest);                            \
  glm_vec2_mul(t, dest, dest);                                 \
  glm_vec2_adds(dest, 10.0f, dest);                            \
  /* dest = t_cubed * dest */                                  \
  glm_vec2_mul(t_cubed, dest, dest);                           \
 }
 /* glm__noiseDetail_taylorInvSqrt(vec4 x, vec4 dest)
 *
 * Per component: dest = 1.79284291400159f - 0.85373472095314f * x.
 * The scaled copy of x is taken before dest is written, so x and dest may
 * be the same vector.
 */
 #define glm__noiseDetail_taylorInvSqrt(x, dest) {     \
  vec4 x_scaled;                                       \
  /* x_scaled = 0.85373472095314f * x */               \
  glm_vec4_scale(x, 0.85373472095314f, x_scaled);      \
  /* dest = 1.79284291400159f - x_scaled */            \
  glm_vec4_fill(dest, 1.79284291400159f);              \
  glm_vec4_sub(dest, x_scaled, dest);                  \
 }
 /* norm = taylorInvSqrt(vec4(
 *     dot(g00__, g00__),
 *     dot(g01__, g01__),
 *     dot(g10__, g10__),
 *     dot(g11__, g11__)
 * ));
 */
 /* glm__noiseDetail_gradNorm_vec4(vec4 g00__, vec4 g01__, vec4 g10__, vec4 g11__)
 *
 * Rescales the four gradients in place: each one is multiplied by
 * taylorInvSqrt(dot(g, g)), with the four factors evaluated together in a
 * single vec4.
 */
 #define glm__noiseDetail_gradNorm_vec4(g00__, g01__, g10__, g11__) {     \
  vec4 inv_len;                                                           \
  /* lane k holds the squared length of the k-th gradient */              \
  inv_len[0] = glm_vec4_dot(g00__, g00__);                                \
  inv_len[1] = glm_vec4_dot(g01__, g01__);                                \
  inv_len[2] = glm_vec4_dot(g10__, g10__);                                \
  inv_len[3] = glm_vec4_dot(g11__, g11__);                                \
  /* inv_len = taylorInvSqrt(inv_len) */                                  \
  glm__noiseDetail_taylorInvSqrt(inv_len, inv_len);                       \
  /* every gradient is scaled by its own lane */                          \
  glm_vec4_scale(g00__, inv_len[0], g00__);                               \
  glm_vec4_scale(g01__, inv_len[1], g01__);                               \
  glm_vec4_scale(g10__, inv_len[2], g10__);                               \
  glm_vec4_scale(g11__, inv_len[3], g11__);                               \
 }
 /* glm__noiseDetail_gradNorm_vec3(vec3 g00_, vec3 g01_, vec3 g10_, vec3 g11_)
 *
 * Rescales the four gradients in place: each one is multiplied by
 * taylorInvSqrt(dot(g, g)), with the four factors evaluated together in a
 * single vec4.
 */
 #define glm__noiseDetail_gradNorm_vec3(g00_, g01_, g10_, g11_) {         \
  vec4 inv_len;                                                           \
  /* lane k holds the squared length of the k-th gradient */              \
  inv_len[0] = glm_vec3_dot(g00_, g00_);                                  \
  inv_len[1] = glm_vec3_dot(g01_, g01_);                                  \
  inv_len[2] = glm_vec3_dot(g10_, g10_);                                  \
  inv_len[3] = glm_vec3_dot(g11_, g11_);                                  \
  /* inv_len = taylorInvSqrt(inv_len) */                                  \
  glm__noiseDetail_taylorInvSqrt(inv_len, inv_len);                       \
  /* every gradient is scaled by its own lane */                          \
  glm_vec3_scale(g00_, inv_len[0], g00_);                                 \
  glm_vec3_scale(g01_, inv_len[1], g01_);                                 \
  glm_vec3_scale(g10_, inv_len[2], g10_);                                 \
  glm_vec3_scale(g11_, inv_len[3], g11_);                                 \
 }
 /* glm__noiseDetail_gradNorm_vec2(vec2 g00, vec2 g01, vec2 g10, vec2 g11)
 *
 * Rescales the four gradients in place: each one is multiplied by
 * taylorInvSqrt(dot(g, g)), with the four factors evaluated together in a
 * single vec4.
 */
 #define glm__noiseDetail_gradNorm_vec2(g00, g01, g10, g11) {             \
  vec4 inv_len;                                                           \
  /* lane k holds the squared length of the k-th gradient */              \
  inv_len[0] = glm_vec2_dot(g00, g00);                                    \
  inv_len[1] = glm_vec2_dot(g01, g01);                                    \
  inv_len[2] = glm_vec2_dot(g10, g10);                                    \
  inv_len[3] = glm_vec2_dot(g11, g11);                                    \
  /* inv_len = taylorInvSqrt(inv_len) */                                  \
  glm__noiseDetail_taylorInvSqrt(inv_len, inv_len);                       \
  /* every gradient is scaled by its own lane */                          \
  glm_vec2_scale(g00, inv_len[0], g00);                                   \
  glm_vec2_scale(g01, inv_len[1], g01);                                   \
  glm_vec2_scale(g10, inv_len[2], g10);                                   \
  glm_vec2_scale(g11, inv_len[3], g11);                                   \
 }
 /* glm__noiseDetail_i2gxyzw(vec4 ixy, vec4 gx, vec4 gy, vec4 gz, vec4 gw)
 *
 * Fills gx, gy, gz and gw from ixy:
 *   gx = ixy / 7,  gy = floor(gx) / 7,  gz = floor(gy) / 6
 *   gx, gy, gz = fract(.) - 0.5
 *   gw = 0.75 - abs(gx) - abs(gy) - abs(gz)
 *   sw = step(gw, 0);  gx -= sw * (step(0, gx) - 0.5);  likewise for gy
 *
 * NOTE(review): gw is accumulated in the order gx, gz, gy rather than the
 * gx, gy, gz order written in the formula. Confirm this is intended before
 * changing it, since reordering float subtractions can change the rounding.
 */
 #define glm__noiseDetail_i2gxyzw(ixy, gx, gy, gz, gw) {      \
  /* gx = ixy / 7.0 */                                       \
  glm_vec4_divs(ixy, 7.0f, gx); /* gx = ixy / 7.0 */         \
                                                             \
  /* gy = floor(gx) / 7.0 */                                 \
  glm_vec4_floor(gx, gy); /* gy = floor(gx) */               \
  glm_vec4_divs(gy, 7.0f, gy); /* gy /= 7.0 */               \
                                                             \
  /* gz = floor(gy) / 6.0 */                                 \
  glm_vec4_floor(gy, gz); /* gz = floor(gy) */               \
  glm_vec4_divs(gz, 6.0f, gz); /* gz /= 6.0 */               \
                                                             \
  /* gx = fract(gx) - 0.5f */                                \
  glm_vec4_fract(gx, gx); /* gx = fract(gx) */               \
  glm_vec4_subs(gx, 0.5f, gx); /* gx -= 0.5f */              \
                                                             \
  /* gy = fract(gy) - 0.5f */                                \
  glm_vec4_fract(gy, gy); /* gy = fract(gy) */               \
  glm_vec4_subs(gy, 0.5f, gy); /* gy -= 0.5f */              \
                                                             \
  /* gz = fract(gz) - 0.5f */                                \
  glm_vec4_fract(gz, gz); /* gz = fract(gz) */               \
  glm_vec4_subs(gz, 0.5f, gz); /* gz -= 0.5f */              \
                                                             \
  /* abs(gx), abs(gy), abs(gz) */                            \
  vec4 gxa, gya, gza;                                        \
  glm_vec4_abs(gx, gxa); /* gxa = abs(gx) */                 \
  glm_vec4_abs(gy, gya); /* gya = abs(gy) */                 \
  glm_vec4_abs(gz, gza); /* gza = abs(gz) */                 \
                                                             \
  /* gw = 0.75 - abs(gx) - abs(gy) - abs(gz) */              \
  glm_vec4_fill(gw, 0.75f); /* gw = 0.75 */                  \
  glm_vec4_sub(gw, gxa, gw); /* gw -= gxa */                 \
  glm_vec4_sub(gw, gza, gw); /* gw -= gza */                 \
  glm_vec4_sub(gw, gya, gw); /* gw -= gya */                 \
                                                             \
  /* sw = step(gw, 0.0); */                                  \
  vec4 sw;                                                   \
  glm_vec4_stepr(gw, 0.0f, sw); /* sw = step(gw, 0.0) */     \
                                                             \
  /* gx -= sw * (step(vec4(0), gx) - T(0.5)); */             \
  vec4 temp = {0.0f}; /* temp = 0.0 */                       \
  glm_vec4_step(temp, gx, temp); /* temp = step(temp, gx) */ \
  glm_vec4_subs(temp, 0.5f, temp); /* temp -= 0.5 */         \
  glm_vec4_mul(sw, temp, temp); /* temp *= sw */             \
  glm_vec4_sub(gx, temp, gx); /* gx -= temp */               \
                                                             \
  /* gy -= sw * (step(vec4(0), gy) - T(0.5)); */             \
  glm_vec4_zero(temp); /* reset temp */                      \
  glm_vec4_step(temp, gy, temp); /* temp = step(temp, gy) */ \
  glm_vec4_subs(temp, 0.5f, temp); /* temp -= 0.5 */         \
  glm_vec4_mul(sw, temp, temp); /* temp *= sw */             \
  glm_vec4_sub(gy, temp, gy); /* gy -= temp */               \
 }
 /* NOTE: glm__noiseDetail_i2gxyz is not *quite* analogous to
 * glm__noiseDetail_i2gxyzw, in order to match the output of glm::perlin. I
 * think it might be a bug in the original implementation, but for now I'm
 * keeping it consistent. -MK
 * 
 * Follow up: The original implementation (glm v 1.0.1) does:
 * 
 *   vec<4, T, Q> gx0 = ixy0 * T(1.0 / 7.0);
 * 
 * as opposed to:
 * 
 *   vec<4, T, Q> gx0 = ixy0 / T(7);
 * 
 * This ends up mapping to different simd instructions, at least on AMD.
 * The delta is tiny but it gets amplified by the rest of the noise function.
 * Hence we too need to do `glm_vec4_scale` as opposed to `glm_vec4_divs`, to
 * match it. -MK
 */
 /* glm__noiseDetail_i2gxyz(vec4 ixy, vec4 gx, vec4 gy, vec4 gz)
 *
 * Fills gx, gy and gz from ixy. Scaling uses a multiply by (1 / 7) rather
 * than a divide by 7:
 *   gx = ixy * (1 / 7)
 *   gy = fract(floor(gx) * (1 / 7)) - 0.5
 *   gx = fract(gx)
 *   gz = 0.5 - abs(gx) - abs(gy)
 *   sz = step(gz, 0);  gx -= sz * (step(0, gx) - 0.5);  likewise for gy
 */
 #define glm__noiseDetail_i2gxyz(ixy, gx, gy, gz) {               \
  /* gx = ixy * (1 / 7.0) */                                     \
  glm_vec4_scale(ixy, 1.0f / 7.0f, gx); /* gx = ixy * (1/7.0) */\
                                                                 \
  /* gy = fract(floor(gx) * (1 / 7.0)) - 0.5; */                 \
  glm_vec4_floor(gx, gy); /* gy = floor(gx) */                   \
  glm_vec4_scale(gy, 1.0f / 7.0f, gy); /* gy *= 1 / 7.0 */       \
  glm_vec4_fract(gy, gy); /* gy = fract(gy) */                   \
  glm_vec4_subs(gy, 0.5f, gy); /* gy -= 0.5f */                  \
                                                                 \
  /* gx = fract(gx); */                                          \
  glm_vec4_fract(gx, gx); /* gx = fract(gx) */                   \
                                                                 \
  /* abs(gx), abs(gy) */                                         \
  vec4 gxa, gya;                                                 \
  glm_vec4_abs(gx, gxa); /* gxa = abs(gx) */                     \
  glm_vec4_abs(gy, gya); /* gya = abs(gy) */                     \
                                                                 \
  /* gz = vec4(0.5) - abs(gx0) - abs(gy0); */                    \
  glm_vec4_fill(gz, 0.5f); /* gz = 0.5 */                        \
  glm_vec4_sub(gz, gxa, gz); /* gz -= gxa */                     \
  glm_vec4_sub(gz, gya, gz); /* gz -= gya */                     \
                                                                 \
  /* sz = step(gz, 0.0); */                                      \
  vec4 sz;                                                       \
  glm_vec4_stepr(gz, 0.0f, sz); /* sz = step(gz, 0.0) */         \
                                                                 \
  /* gx0 -= sz0 * (step(0.0, gx0) - T(0.5)); */                  \
  vec4 temp = {0.0f}; /* temp = 0.0 */                           \
  glm_vec4_step(temp, gx, temp); /* temp = step(temp, gx) */     \
  glm_vec4_subs(temp, 0.5f, temp); /* temp -= 0.5 */             \
  glm_vec4_mul(sz, temp, temp); /* temp *= sz */                 \
  glm_vec4_sub(gx, temp, gx); /* gx -= temp */                   \
                                                                 \
  /* gy0 -= sz0 * (step(0.0, gy0) - T(0.5)); */                  \
  glm_vec4_zero(temp); /* reset temp */                          \
  glm_vec4_step(temp, gy, temp); /* temp = step(temp, gy) */     \
  glm_vec4_subs(temp, 0.5f, temp); /* temp -= 0.5 */             \
  glm_vec4_mul(sz, temp, temp); /* temp *= sz */                 \
  glm_vec4_sub(gy, temp, gy); /* gy -= temp */                   \
 }
 /* glm__noiseDetail_i2gxy(vec4 i, vec4 gx, vec4 gy)
 *
 * Fills gx and gy from i:
 *   gx = 2 * fract(i / 41) - 1
 *   gy = abs(gx) - 0.5
 *   gx = gx - floor(gx + 0.5)
 */
 #define glm__noiseDetail_i2gxy(i, gx, gy) {             \
  vec4 gx_floor;                                         \
  /* gx = 2.0 * fract(i / 41.0) - 1.0 */                 \
  glm_vec4_divs(i, 41.0f, gx);                           \
  glm_vec4_fract(gx, gx);                                \
  glm_vec4_scale(gx, 2.0f, gx);                          \
  glm_vec4_subs(gx, 1.0f, gx);                           \
  /* gy = abs(gx) - 0.5 */                               \
  glm_vec4_abs(gx, gy);                                  \
  glm_vec4_subs(gy, 0.5f, gy);                           \
  /* gx_floor = floor(gx + 0.5) */                       \
  glm_vec4_adds(gx, 0.5f, gx_floor);                     \
  glm_vec4_floor(gx_floor, gx_floor);                    \
  /* gx = gx - gx_floor */                               \
  glm_vec4_sub(gx, gx_floor, gx);                        \
 }
 /* ============================================================================
 * Classic perlin noise
 * ============================================================================
 */
 /*!
 * @brief Classic perlin noise
 *
 * Evaluates 4D noise at point, following glm::perlin: a gradient is derived
 * for each of the 16 corners of the lattice cell around point, dotted with
 * the offset from that corner, and the 16 results are blended with the fade
 * curve along w, then z, then y, then x.
 *
 * Naming: in gXYZW / nXYZW each digit selects the lower (0, Pi0/Pf0) or
 * upper (1, Pi1/Pf1) cell corner on the x, y, z and w axis respectively.
 *
 * @param[in]  point  4D vector
 * @returns           perlin noise value (the interpolated result times 2.2f)
 */
 CGLM_INLINE
 float
 glm_perlin_vec4(vec4 point) {
  /* Integer part of p for indexing */
  vec4 Pi0;
  glm_vec4_floor(point, Pi0); /* Pi0 = floor(point); */
  /* Integer part + 1 */
  vec4 Pi1;
  glm_vec4_adds(Pi0, 1.0f, Pi1); /* Pi1 = Pi0 + 1.0f; */
  glm_vec4_mods(Pi0, 289.0f, Pi0); /* Pi0 = mod(Pi0, 289.0f); */
  glm_vec4_mods(Pi1, 289.0f, Pi1); /* Pi1 = mod(Pi1, 289.0f); */
  /* Fractional part of p for interpolation */
  vec4 Pf0;
  glm_vec4_fract(point, Pf0);
  /* Fractional part - 1.0 */
  vec4 Pf1;
  glm_vec4_subs(Pf0, 1.0f, Pf1);
  /* lanes of ix/iy enumerate the four (x, y) corners: (0,0) (1,0) (0,1) (1,1) */
  vec4 ix = {Pi0[0], Pi1[0], Pi0[0], Pi1[0]};
  vec4 iy = {Pi0[1], Pi0[1], Pi1[1], Pi1[1]};
  vec4 iz0 = {Pi0[2], Pi0[2], Pi0[2], Pi0[2]}; /* iz0 = vec4(Pi0.z); */
  vec4 iz1 = {Pi1[2], Pi1[2], Pi1[2], Pi1[2]}; /* iz1 = vec4(Pi1.z); */
  vec4 iw0 = {Pi0[3], Pi0[3], Pi0[3], Pi0[3]}; /* iw0 = vec4(Pi0.w); */
  vec4 iw1 = {Pi1[3], Pi1[3], Pi1[3], Pi1[3]}; /* iw1 = vec4(Pi1.w); */
  /* ------------ */
  /* ixy = permute(permute(ix) + iy) */
  vec4 ixy;
  glm__noiseDetail_permute(ix, ixy); /* ixy = permute(ix) */
  glm_vec4_add(ixy, iy, ixy); /* ixy += iy; */
  glm__noiseDetail_permute(ixy, ixy); /* ixy = permute(ixy) */
  /* ixy0 = permute(ixy + iz0) */
  vec4 ixy0;
  glm_vec4_add(ixy, iz0, ixy0); /* ixy0 = ixy + iz0 */
  glm__noiseDetail_permute(ixy0, ixy0); /* ixy0 = permute(ixy0) */
  /* ixy1 = permute(ixy + iz1) */
  vec4 ixy1;
  glm_vec4_add(ixy, iz1, ixy1); /* ixy1 = ixy + iz1 */
  glm__noiseDetail_permute(ixy1, ixy1); /* ixy1 = permute(ixy1) */
  /* ixy00 = permute(ixy0 + iw0) */
  vec4 ixy00;
  glm_vec4_add(ixy0, iw0, ixy00); /* ixy00 = ixy0 + iw0 */
  glm__noiseDetail_permute(ixy00, ixy00); /* ixy00 = permute(ixy00) */
  /* ixy01 = permute(ixy0 + iw1) */
  vec4 ixy01;
  glm_vec4_add(ixy0, iw1, ixy01); /* ixy01 = ixy0 + iw1 */
  glm__noiseDetail_permute(ixy01, ixy01); /* ixy01 = permute(ixy01) */
  /* ixy10 = permute(ixy1 + iw0) */
  vec4 ixy10;
  glm_vec4_add(ixy1, iw0, ixy10); /* ixy10 = ixy1 + iw0 */
  glm__noiseDetail_permute(ixy10, ixy10); /* ixy10 = permute(ixy10) */
  /* ixy11 = permute(ixy1 + iw1) */
  vec4 ixy11;
  glm_vec4_add(ixy1, iw1, ixy11); /* ixy11 = ixy1 + iw1 */
  glm__noiseDetail_permute(ixy11, ixy11); /* ixy11 = permute(ixy11) */
  /* ------------ */
  /* gradient components for each (z, w) pair; lanes still index (x, y) */
  vec4 gx00, gy00, gz00, gw00;
  glm__noiseDetail_i2gxyzw(ixy00, gx00, gy00, gz00, gw00);
  vec4 gx01, gy01, gz01, gw01;
  glm__noiseDetail_i2gxyzw(ixy01, gx01, gy01, gz01, gw01);
  vec4 gx10, gy10, gz10, gw10;
  glm__noiseDetail_i2gxyzw(ixy10, gx10, gy10, gz10, gw10);
  vec4 gx11, gy11, gz11, gw11;
  glm__noiseDetail_i2gxyzw(ixy11, gx11, gy11, gz11, gw11);
  /* ------------ */
  /* regroup lane-wise components into one gradient vector per corner */
  vec4 g0000 = {gx00[0], gy00[0], gz00[0], gw00[0]}; /* g0000 = vec4(gx00.x, gy00.x, gz00.x, gw00.x); */
  vec4 g0100 = {gx00[2], gy00[2], gz00[2], gw00[2]}; /* g0100 = vec4(gx00.z, gy00.z, gz00.z, gw00.z); */
  vec4 g1000 = {gx00[1], gy00[1], gz00[1], gw00[1]}; /* g1000 = vec4(gx00.y, gy00.y, gz00.y, gw00.y); */
  vec4 g1100 = {gx00[3], gy00[3], gz00[3], gw00[3]}; /* g1100 = vec4(gx00.w, gy00.w, gz00.w, gw00.w); */
  vec4 g0001 = {gx01[0], gy01[0], gz01[0], gw01[0]}; /* g0001 = vec4(gx01.x, gy01.x, gz01.x, gw01.x); */
  vec4 g0101 = {gx01[2], gy01[2], gz01[2], gw01[2]}; /* g0101 = vec4(gx01.z, gy01.z, gz01.z, gw01.z); */
  vec4 g1001 = {gx01[1], gy01[1], gz01[1], gw01[1]}; /* g1001 = vec4(gx01.y, gy01.y, gz01.y, gw01.y); */
  vec4 g1101 = {gx01[3], gy01[3], gz01[3], gw01[3]}; /* g1101 = vec4(gx01.w, gy01.w, gz01.w, gw01.w); */
  vec4 g0010 = {gx10[0], gy10[0], gz10[0], gw10[0]}; /* g0010 = vec4(gx10.x, gy10.x, gz10.x, gw10.x); */
  vec4 g0110 = {gx10[2], gy10[2], gz10[2], gw10[2]}; /* g0110 = vec4(gx10.z, gy10.z, gz10.z, gw10.z); */
  vec4 g1010 = {gx10[1], gy10[1], gz10[1], gw10[1]}; /* g1010 = vec4(gx10.y, gy10.y, gz10.y, gw10.y); */
  vec4 g1110 = {gx10[3], gy10[3], gz10[3], gw10[3]}; /* g1110 = vec4(gx10.w, gy10.w, gz10.w, gw10.w); */
  vec4 g0011 = {gx11[0], gy11[0], gz11[0], gw11[0]}; /* g0011 = vec4(gx11.x, gy11.x, gz11.x, gw11.x); */
  vec4 g0111 = {gx11[2], gy11[2], gz11[2], gw11[2]}; /* g0111 = vec4(gx11.z, gy11.z, gz11.z, gw11.z); */
  vec4 g1011 = {gx11[1], gy11[1], gz11[1], gw11[1]}; /* g1011 = vec4(gx11.y, gy11.y, gz11.y, gw11.y); */
  vec4 g1111 = {gx11[3], gy11[3], gz11[3], gw11[3]}; /* g1111 = vec4(gx11.w, gy11.w, gz11.w, gw11.w); */
  glm__noiseDetail_gradNorm_vec4(g0000, g0100, g1000, g1100);
  glm__noiseDetail_gradNorm_vec4(g0001, g0101, g1001, g1101);
  glm__noiseDetail_gradNorm_vec4(g0010, g0110, g1010, g1110);
  glm__noiseDetail_gradNorm_vec4(g0011, g0111, g1011, g1111);
  /* ------------ */
  /* dot every corner gradient with the offset from that corner */
  float n0000 = glm_vec4_dot(g0000, Pf0); /* n0000 = dot(g0000, Pf0) */
  /* n1000 = dot(g1000, vec4(Pf1.x, Pf0.y, Pf0.z, Pf0.w)) */
  vec4 n1000d = {Pf1[0], Pf0[1], Pf0[2], Pf0[3]};
  float n1000 = glm_vec4_dot(g1000, n1000d);
  /* n0100 = dot(g0100, vec4(Pf0.x, Pf1.y, Pf0.z, Pf0.w)) */
  vec4 n0100d = {Pf0[0], Pf1[1], Pf0[2], Pf0[3]};
  float n0100 = glm_vec4_dot(g0100, n0100d);
  /* n1100 = dot(g1100, vec4(Pf1.x, Pf1.y, Pf0.z, Pf0.w)) */
  vec4 n1100d = {Pf1[0], Pf1[1], Pf0[2], Pf0[3]};
  float n1100 = glm_vec4_dot(g1100, n1100d);
  /* n0010 = dot(g0010, vec4(Pf0.x, Pf0.y, Pf1.z, Pf0.w)) */
  vec4 n0010d = {Pf0[0], Pf0[1], Pf1[2], Pf0[3]};
  float n0010 = glm_vec4_dot(g0010, n0010d);
  /* n1010 = dot(g1010, vec4(Pf1.x, Pf0.y, Pf1.z, Pf0.w)) */
  vec4 n1010d = {Pf1[0], Pf0[1], Pf1[2], Pf0[3]};
  float n1010 = glm_vec4_dot(g1010, n1010d);
  /* n0110 = dot(g0110, vec4(Pf0.x, Pf1.y, Pf1.z, Pf0.w)) */
  vec4 n0110d = {Pf0[0], Pf1[1], Pf1[2], Pf0[3]};
  float n0110 = glm_vec4_dot(g0110, n0110d);
  /* n1110 = dot(g1110, vec4(Pf1.x, Pf1.y, Pf1.z, Pf0.w)) */
  vec4 n1110d = {Pf1[0], Pf1[1], Pf1[2], Pf0[3]};
  float n1110 = glm_vec4_dot(g1110, n1110d);
  /* n0001 = dot(g0001, vec4(Pf0.x, Pf0.y, Pf0.z, Pf1.w)) */
  vec4 n0001d = {Pf0[0], Pf0[1], Pf0[2], Pf1[3]};
  float n0001 = glm_vec4_dot(g0001, n0001d);
  /* n1001 = dot(g1001, vec4(Pf1.x, Pf0.y, Pf0.z, Pf1.w)) */
  vec4 n1001d = {Pf1[0], Pf0[1], Pf0[2], Pf1[3]};
  float n1001 = glm_vec4_dot(g1001, n1001d);
  /* n0101 = dot(g0101, vec4(Pf0.x, Pf1.y, Pf0.z, Pf1.w)) */
  vec4 n0101d = {Pf0[0], Pf1[1], Pf0[2], Pf1[3]};
  float n0101 = glm_vec4_dot(g0101, n0101d);
  /* n1101 = dot(g1101, vec4(Pf1.x, Pf1.y, Pf0.z, Pf1.w)) */
  vec4 n1101d = {Pf1[0], Pf1[1], Pf0[2], Pf1[3]};
  float n1101 = glm_vec4_dot(g1101, n1101d);
  /* n0011 = dot(g0011, vec4(Pf0.x, Pf0.y, Pf1.z, Pf1.w)) */
  vec4 n0011d = {Pf0[0], Pf0[1], Pf1[2], Pf1[3]};
  float n0011 = glm_vec4_dot(g0011, n0011d);
  /* n1011 = dot(g1011, vec4(Pf1.x, Pf0.y, Pf1.z, Pf1.w)) */
  vec4 n1011d = {Pf1[0], Pf0[1], Pf1[2], Pf1[3]};
  float n1011 = glm_vec4_dot(g1011, n1011d);
  /* n0111 = dot(g0111, vec4(Pf0.x, Pf1.y, Pf1.z, Pf1.w)) */
  vec4 n0111d = {Pf0[0], Pf1[1], Pf1[2], Pf1[3]};
  float n0111 = glm_vec4_dot(g0111, n0111d);
  float n1111 = glm_vec4_dot(g1111, Pf1); /* n1111 = dot(g1111, Pf1) */
  /* ------------ */
  /* blend the 16 corner values: w first, then z, y and finally x */
  vec4 fade_xyzw;
  glm__noiseDetail_fade_vec4(Pf0, fade_xyzw); /* fade_xyzw = fade(Pf0) */
  /* n_0w = lerp(vec4(n0000, n1000, n0100, n1100), vec4(n0001, n1001, n0101, n1101), fade_xyzw.w) */
  vec4 n_0w1 = {n0000, n1000, n0100, n1100};
  vec4 n_0w2 = {n0001, n1001, n0101, n1101};
  vec4 n_0w;
  glm_vec4_lerp(n_0w1, n_0w2, fade_xyzw[3], n_0w);
  /* n_1w = lerp(vec4(n0010, n1010, n0110, n1110), vec4(n0011, n1011, n0111, n1111), fade_xyzw.w) */
  vec4 n_1w1 = {n0010, n1010, n0110, n1110};
  vec4 n_1w2 = {n0011, n1011, n0111, n1111};
  vec4 n_1w;
  glm_vec4_lerp(n_1w1, n_1w2, fade_xyzw[3], n_1w);
  /* n_zw = lerp(n_0w, n_1w, fade_xyzw.z) */
  vec4 n_zw;
  glm_vec4_lerp(n_0w, n_1w, fade_xyzw[2], n_zw);
  /* n_yzw = lerp(vec2(n_zw.x, n_zw.y), vec2(n_zw.z, n_zw.w), fade_xyzw.y) */
  vec2 n_yzw;
  vec2 n_yzw1 = {n_zw[0], n_zw[1]};
  vec2 n_yzw2 = {n_zw[2], n_zw[3]};
  glm_vec2_lerp(n_yzw1, n_yzw2, fade_xyzw[1], n_yzw);
  /* n_xyzw = lerp(n_yzw.x, n_yzw.y, fade_xyzw.x) */
  float n_xyzw = glm_lerp(n_yzw[0], n_yzw[1], fade_xyzw[0]);
  return n_xyzw * 2.2f;
 }
 /*!
 * @brief Classic perlin noise for a 3D point
 *
 * Splits the point into its floor (wrapped with mod 289) and fractional
 * parts, derives one gradient per cell corner through the
 * glm__noiseDetail_* helpers, dots every gradient with the offset from its
 * corner and blends the eight results with lerps weighted by fade(Pf0)
 * along z, then y, then x. The blended value is multiplied by 2.2f.
 *
 * @param[in]  point  3D vector
 * @returns           perlin noise value
 */
 CGLM_INLINE
 float
 glm_perlin_vec3(vec3 point) {
  /* Integer part of p for indexing */
  vec3 Pi0;
  glm_vec3_floor(point, Pi0); /* Pi0 = floor(point); */
  /* Integer part + 1 */
  vec3 Pi1;
  glm_vec3_adds(Pi0, 1.0f, Pi1); /* Pi1 = Pi0 + 1.0f; */
  /* Mod to avoid truncation effects in permutation */
  glm_vec3_mods(Pi0, 289.0f, Pi0); /* Pi0 = mod(Pi0, 289.0f); */
  glm_vec3_mods(Pi1, 289.0f, Pi1); /* Pi1 = mod(Pi1, 289.0f); */
  /* Fractional part of p for interpolation */
  vec3 Pf0;
  glm_vec3_fract(point, Pf0); /* Pf0 = fract(point); */
  /* Fractional part - 1.0 */
  vec3 Pf1;
  glm_vec3_subs(Pf0, 1.0f, Pf1); /* Pf1 = Pf0 - 1.0f; */
  /* one lane per corner of the xy face: (x0,y0) (x1,y0) (x0,y1) (x1,y1) */
  vec4 ix = {Pi0[0], Pi1[0], Pi0[0], Pi1[0]};
  vec4 iy = {Pi0[1], Pi0[1], Pi1[1], Pi1[1]};
  vec4 iz0 = {Pi0[2], Pi0[2], Pi0[2], Pi0[2]}; /* iz0 = vec4(Pi0.z); */
  vec4 iz1 = {Pi1[2], Pi1[2], Pi1[2], Pi1[2]}; /* iz1 = vec4(Pi1.z); */
  /* ------------ */
  /* ixy = permute(permute(ix) + iy) */
  vec4 ixy;
  glm__noiseDetail_permute(ix, ixy); /* ixy = permute(ix) */
  glm_vec4_add(ixy, iy, ixy); /* ixy += iy; */
  glm__noiseDetail_permute(ixy, ixy); /* ixy = permute(ixy) */
  /* ixy0 = permute(ixy + iz0) */
  vec4 ixy0;
  glm_vec4_add(ixy, iz0, ixy0); /* ixy0 = ixy + iz0 */
  glm__noiseDetail_permute(ixy0, ixy0); /* ixy0 = permute(ixy0) */
  /* ixy1 = permute(ixy + iz1) */
  vec4 ixy1;
  glm_vec4_add(ixy, iz1, ixy1); /* ixy1 = ixy + iz1 */
  glm__noiseDetail_permute(ixy1, ixy1); /* ixy1 = permute(ixy1) */
  /* ------------ */
  /* gradient components for the z0 face (ixy0) and the z1 face (ixy1) */
  vec4 gx0, gy0, gz0;
  glm__noiseDetail_i2gxyz(ixy0, gx0, gy0, gz0);
  vec4 gx1, gy1, gz1;
  glm__noiseDetail_i2gxyz(ixy1, gx1, gy1, gz1);
  /* ------------ */
  vec3 g000 = {gx0[0], gy0[0], gz0[0]}; /* g000 = vec3(gx0.x, gy0.x, gz0.x); */
  vec3 g100 = {gx0[1], gy0[1], gz0[1]}; /* g100 = vec3(gx0.y, gy0.y, gz0.y); */
  vec3 g010 = {gx0[2], gy0[2], gz0[2]}; /* g010 = vec3(gx0.z, gy0.z, gz0.z); */
  vec3 g110 = {gx0[3], gy0[3], gz0[3]}; /* g110 = vec3(gx0.w, gy0.w, gz0.w); */
  vec3 g001 = {gx1[0], gy1[0], gz1[0]}; /* g001 = vec3(gx1.x, gy1.x, gz1.x); */
  vec3 g101 = {gx1[1], gy1[1], gz1[1]}; /* g101 = vec3(gx1.y, gy1.y, gz1.y); */
  vec3 g011 = {gx1[2], gy1[2], gz1[2]}; /* g011 = vec3(gx1.z, gy1.z, gz1.z); */
  vec3 g111 = {gx1[3], gy1[3], gz1[3]}; /* g111 = vec3(gx1.w, gy1.w, gz1.w); */
  /* presumably rescales the four gradients in place; the macro is defined
     earlier in this header - note the argument order (g0_0, g0_1, g1_0, g1_1) */
  glm__noiseDetail_gradNorm_vec3(g000, g010, g100, g110);
  glm__noiseDetail_gradNorm_vec3(g001, g011, g101, g111);
  /* ------------ */
  float n000 = glm_vec3_dot(g000, Pf0); /* n000 = dot(g000, Pf0) */
  /* n100 = dot(g100, vec3(Pf1.x, Pf0.y, Pf0.z)) */
  vec3 n100d = {Pf1[0], Pf0[1], Pf0[2]};
  float n100 = glm_vec3_dot(g100, n100d);
  /* n010 = dot(g010, vec3(Pf0.x, Pf1.y, Pf0.z)) */
  vec3 n010d = {Pf0[0], Pf1[1], Pf0[2]};
  float n010 = glm_vec3_dot(g010, n010d);
  /* n110 = dot(g110, vec3(Pf1.x, Pf1.y, Pf0.z)) */
  vec3 n110d = {Pf1[0], Pf1[1], Pf0[2]};
  float n110 = glm_vec3_dot(g110, n110d);
  /* n001 = dot(g001, vec3(Pf0.x, Pf0.y, Pf1.z)) */
  vec3 n001d = {Pf0[0], Pf0[1], Pf1[2]};
  float n001 = glm_vec3_dot(g001, n001d);
  /* n101 = dot(g101, vec3(Pf1.x, Pf0.y, Pf1.z)) */
  vec3 n101d = {Pf1[0], Pf0[1], Pf1[2]};
  float n101 = glm_vec3_dot(g101, n101d);
  /* n011 = dot(g011, vec3(Pf0.x, Pf1.y, Pf1.z)) */
  vec3 n011d = {Pf0[0], Pf1[1], Pf1[2]};
  float n011 = glm_vec3_dot(g011, n011d);
  float n111 = glm_vec3_dot(g111, Pf1); /* n111 = dot(g111, Pf1) */
  /* ------------ */
  vec3 fade_xyz;
  glm__noiseDetail_fade_vec3(Pf0, fade_xyz); /* fade_xyz = fade(Pf0) */
  /* n_z = lerp(vec4(n000, n100, n010, n110), vec4(n001, n101, n011, n111), fade_xyz.z); */
  vec4 n_z;
  vec4 n_z1 = {n000, n100, n010, n110};
  vec4 n_z2 = {n001, n101, n011, n111};
  glm_vec4_lerp(n_z1, n_z2, fade_xyz[2], n_z);
  /* n_yz = lerp(vec2(n_z.x, n_z.y), vec2(n_z.z, n_z.w), fade_xyz.y); */
  vec2 n_yz;
  vec2 n_yz1 = {n_z[0], n_z[1]};
  vec2 n_yz2 = {n_z[2], n_z[3]};
  glm_vec2_lerp(n_yz1, n_yz2, fade_xyz[1], n_yz);
  /* n_xyz = lerp(n_yz.x, n_yz.y, fade_xyz.x); */
  float n_xyz = glm_lerp(n_yz[0], n_yz[1], fade_xyz[0]);
  return n_xyz * 2.2f;
 }
 /*!
 * @brief Classic perlin noise for a 2D point
 *
 * Packs (x, y) twice into one vec4 so that lanes x/y describe the floor
 * corner and lanes z/w the +1 corner, derives one gradient per cell corner
 * through the glm__noiseDetail_* helpers, dots every gradient with the
 * offset from its corner and blends the four results with lerps weighted by
 * fade(Pf.xy) along x, then y. The blended value is multiplied by 2.3f.
 *
 * @param[in]  point  2D vector
 * @returns           perlin noise value
 */
 CGLM_INLINE
 float
 glm_perlin_vec2(vec2 point) {
  /* Integer part of p for indexing */
  /* Pi = floor(vec4(point.x, point.y, point.x, point.y)) + vec4(0.0, 0.0, 1.0, 1.0); */
  vec4 Pi = {point[0], point[1], point[0], point[1]}; /* Pi = vec4(point.x, point.y, point.x, point.y) */
  glm_vec4_floor(Pi, Pi); /* Pi = floor(Pi) */
  Pi[2] += 1.0f; /* Pi.z += 1.0 */
  Pi[3] += 1.0f; /* Pi.w += 1.0 */
  /* Fractional part of p for interpolation */
  /* Pf = fract(vec4(point.x, point.y, point.x, point.y)) - vec4(0.0, 0.0, 1.0, 1.0); */
  vec4 Pf = {point[0], point[1], point[0], point[1]}; /* Pf = vec4(point.x, point.y, point.x, point.y) */
  glm_vec4_fract(Pf, Pf); /* Pf = fract(Pf) */
  Pf[2] -= 1.0f; /* Pf.z -= 1.0 */
  Pf[3] -= 1.0f; /* Pf.w -= 1.0 */
  /* Mod to avoid truncation effects in permutation */
  glm_vec4_mods(Pi, 289.0f, Pi); /* Pi = mod(Pi, 289.0f); */
  /* one lane per cell corner: (x0,y0) (x1,y0) (x0,y1) (x1,y1) */
  vec4 ix = {Pi[0], Pi[2], Pi[0], Pi[2]}; /* ix = vec4(Pi.x, Pi.z, Pi.x, Pi.z) */
  vec4 iy = {Pi[1], Pi[1], Pi[3], Pi[3]}; /* iy = vec4(Pi.y, Pi.y, Pi.w, Pi.w) */
  vec4 fx = {Pf[0], Pf[2], Pf[0], Pf[2]}; /* fx = vec4(Pf.x, Pf.z, Pf.x, Pf.z) */
  vec4 fy = {Pf[1], Pf[1], Pf[3], Pf[3]}; /* fy = vec4(Pf.y, Pf.y, Pf.w, Pf.w) */
  /* ------------ */
  /* i = permute(permute(ix) + iy); */
  vec4 i;
  glm__noiseDetail_permute(ix, i); /* i = permute(ix) */
  glm_vec4_add(i, iy, i); /* i += iy; */
  glm__noiseDetail_permute(i, i); /* i = permute(i) */
  /* ------------ */
  vec4 gx, gy;
  glm__noiseDetail_i2gxy(i, gx, gy);
  /* ------------ */
  vec2 g00 = {gx[0], gy[0]}; /* g00 = vec2(gx.x, gy.x) */
  vec2 g10 = {gx[1], gy[1]}; /* g10 = vec2(gx.y, gy.y) */
  vec2 g01 = {gx[2], gy[2]}; /* g01 = vec2(gx.z, gy.z) */
  vec2 g11 = {gx[3], gy[3]}; /* g11 = vec2(gx.w, gy.w) */
  /* presumably rescales the four gradients in place; the macro is defined
     earlier in this header - note the argument order (g00, g01, g10, g11) */
  glm__noiseDetail_gradNorm_vec2(g00, g01, g10, g11);
  /* ------------ */
  /* n00 = dot(g00, vec2(fx.x, fy.x)) */
  vec2 n00d = {fx[0], fy[0]}; /* n00d = vec2(fx.x, fy.x) */
  float n00 = glm_vec2_dot(g00, n00d); /* n00 = dot(g00, n00d) */
  /* n10 = dot(g10, vec2(fx.y, fy.y)) */
  vec2 n10d = {fx[1], fy[1]}; /* n10d = vec2(fx.y, fy.y) */
  float n10 = glm_vec2_dot(g10, n10d); /* n10 = dot(g10, n10d) */
  /* n01 = dot(g01, vec2(fx.z, fy.z)) */
  vec2 n01d = {fx[2], fy[2]}; /* n01d = vec2(fx.z, fy.z) */
  float n01 = glm_vec2_dot(g01, n01d); /* n01 = dot(g01, n01d) */
  /* n11 = dot(g11, vec2(fx.w, fy.w)) */
  vec2 n11d = {fx[3], fy[3]}; /* n11d = vec2(fx.w, fy.w) */
  float n11 = glm_vec2_dot(g11, n11d); /* n11 = dot(g11, n11d) */
  /* ------------ */
  /* fade_xy = fade(vec2(Pf.x, Pf.y)) */
  vec2 fade_xy;
  vec2 temp2 = {Pf[0], Pf[1]}; /* temp2 = vec2(Pf.x, Pf.y) */
  glm__noiseDetail_fade_vec2(temp2, fade_xy); /* fade_xy = fade(temp2) */
  /* n_x = lerp(vec2(n00, n01), vec2(n10, n11), fade_xy.x); */
  vec2 n_x;
  vec2 n_x1 = {n00, n01}; /* n_x1 = vec2(n00, n01) */
  vec2 n_x2 = {n10, n11}; /* n_x2 = vec2(n10, n11) */
  glm_vec2_lerp(n_x1, n_x2, fade_xy[0], n_x); /* n_x = lerp(n_x1, n_x2, fade_xy.x) */
  /* n_xy = lerp(n_x.x, n_x.y, fade_xy.y); */
  float n_xy = glm_lerp(n_x[0], n_x[1], fade_xy[1]);
  return n_xy * 2.3f;
 }
 /* Undefine all helper macros */
 #undef glm__noiseDetail_mod289
 #undef glm__noiseDetail_permute
 #undef glm__noiseDetail_fade_vec4
 #undef glm__noiseDetail_fade_vec3
 #undef glm__noiseDetail_fade_vec2
 #undef glm__noiseDetail_taylorInvSqrt
 #undef glm__noiseDetail_gradNorm_vec4
 #undef glm__noiseDetail_gradNorm_vec3
 #undef glm__noiseDetail_gradNorm_vec2
 #undef glm__noiseDetail_i2gxyzw
 #undef glm__noiseDetail_i2gxyz
 #undef glm__noiseDetail_i2gxy
 #endif /* cglm_noise_h */
--- a/include/cglm/quat.h
+++ b/include/cglm/quat.h
@@ -39,6 +39,7 @@
   CGLM_INLINE void glm_quat_lerp(versor from, versor to, float t, versor dest);
   CGLM_INLINE void glm_quat_lerpc(versor from, versor to, float t, versor dest);
   CGLM_INLINE void glm_quat_slerp(versor q, versor r, float t, versor dest);
   CGLM_INLINE void glm_quat_slerp_longest(versor q, versor r, float t, versor dest);
   CGLM_INLINE void glm_quat_nlerp(versor q, versor r, float t, versor dest);
   CGLM_INLINE void glm_quat_look(vec3 eye, versor ori, mat4 dest);
   CGLM_INLINE void glm_quat_for(vec3 dir, vec3 fwd, vec3 up, versor dest);
@@ -122,7 +123,7 @@ glm_quat_identity_array(versor * __restrict q, size_t count) {
 }
 /*!
- * @brief inits quaterion with raw values
+ * @brief inits quaternion with raw values
 *
 * @param[out]  q     quaternion
 * @param[in]   x     x
@@ -742,6 +743,52 @@ glm_quat_slerp(versor from, versor to, float t, versor dest) {
  glm_vec4_scale(q1, 1.0f / sinTheta, dest);
 }
 /*!
 * @brief spherical linear interpolation (SLERP) between two quaternions
 *        that always travels along the long path
 *
 * Copies `from` to dest when |dot(from, to)| >= 1, and falls back to
 * glm_quat_lerp when the sine of the angle is smaller than 0.001.
 *
 * @param[in]   from  from
 * @param[in]   to    to
 * @param[in]   t     amount
 * @param[out]  dest  result quaternion
 */
 CGLM_INLINE
 void
 glm_quat_slerp_longest(versor from, versor to, float t, versor dest) {
  CGLM_ALIGN(16) vec4 qa, qb;
  float cosAngle, sinAngle, theta, wa, wb, invSin;
  glm_quat_copy(from, qa);
  cosAngle = glm_quat_dot(from, to);
  /* dot product out of the open interval (-1, 1): hand back the start */
  if (fabsf(cosAngle) >= 1.0f) {
    glm_quat_copy(qa, dest);
    return;
  }
  /* longest path: flip the start unless the dot product is negative */
  if (!(cosAngle < 0.0f)) {
    glm_vec4_negate(qa);
    cosAngle = -cosAngle;
  }
  sinAngle = sqrtf(1.0f - cosAngle * cosAngle);
  /* sine too small to divide by: use plain LERP on the inputs instead */
  if (fabsf(sinAngle) < 0.001f) {
    glm_quat_lerp(from, to, t, dest);
    return;
  }
  /* SLERP: dest = (qa * sin((1 - t) * theta) + to * sin(t * theta)) / sin(theta) */
  theta  = acosf(cosAngle);
  wa     = sinf((1.0f - t) * theta);
  wb     = sinf(t * theta);
  invSin = 1.0f / sinAngle;
  glm_vec4_scale(qa, wa, qa);
  glm_vec4_scale(to, wb, qb);
  glm_vec4_add(qa, qb, qa);
  glm_vec4_scale(qa, invSin, dest);
 }
 /*!
 * @brief creates view matrix using quaternion as camera orientation
 *
--- a/include/cglm/simd/arm.h
+++ b/include/cglm/simd/arm.h
@@ -63,8 +63,21 @@ static inline float32x4_t glmm_max(float32x4_t a, float32x4_t b) { return vmaxq_
 static inline
 float32x4_t
 glmm_vhadd(float32x4_t v) {
 #if CGLM_ARM64
  float32x4_t p;
  p = vpaddq_f32(v, v); /* [a+b, c+d, a+b, c+d] */
  return vpaddq_f32(p, p); /* [t, t, t, t] */;
 #else
  return vaddq_f32(vaddq_f32(glmm_splat_x(v), glmm_splat_y(v)),
                   vaddq_f32(glmm_splat_z(v), glmm_splat_w(v)));
 #endif
  /* TODO: measure speed of this compare to above */
  /* return vdupq_n_f32(vaddvq_f32(v)); */
  /*
  return vaddq_f32(vaddq_f32(glmm_splat_x(v), glmm_splat_y(v)),
                   vaddq_f32(glmm_splat_z(v), glmm_splat_w(v)));
   */
  /*
   this seems slower:
   v = vaddq_f32(v, vrev64q_f32(v));
@@ -108,6 +121,12 @@ glmm_dot(float32x4_t a, float32x4_t b) {
  return glmm_hadd(vmulq_f32(a, b));
 }
 /* dot product of a and b: lane-wise multiply, then glmm_vhadd so the
    sum is returned as a vector rather than a scalar */
 static inline
 float32x4_t
 glmm_vdot(float32x4_t a, float32x4_t b) {
  float32x4_t prod;
  prod = vmulq_f32(a, b);
  return glmm_vhadd(prod);
 }
 static inline
 float
 glmm_norm(float32x4_t a) {
--- a/include/cglm/simd/avx/mat4.h
+++ b/include/cglm/simd/avx/mat4.h
@@ -12,16 +12,46 @@
 #include "../../common.h"
 #include "../intrin.h"
 #include <immintrin.h>
 CGLM_INLINE
 void
 glm_mat4_scale_avx(mat4 m, float s) {
-  __m256 y0;
+  __m256 y0, y1, y2, y3, y4;
  y0 = _mm256_set1_ps(s);
-  glmm_store256(m[0], _mm256_mul_ps(y0, glmm_load256(m[0])));
+  y0 = glmm_load256(m[0]);            /* h g f e d c b a */
-  glmm_store256(m[2], _mm256_mul_ps(y0, glmm_load256(m[2])));
+  y1 = glmm_load256(m[2]);            /* p o n m l k j i */
  y2 = _mm256_broadcast_ss(&s);
  y3 = _mm256_mul_ps(y0, y2);
  y4 = _mm256_mul_ps(y1, y2);
  glmm_store256(m[0], y3);
  glmm_store256(m[2], y4);
 }
 /*
  * Transposes m into dest with AVX.
  *
  * The matrix is read with two 256-bit loads (m[0] and m[2], eight floats
  * each, see the lane comments) and rearranged by two rounds of
  * unpacklo/unpackhi followed by 128-bit half permutes; the result is written
  * with two 256-bit stores to dest[0] and dest[2]. Lane comments list the
  * elements from the highest lane down to lane 0, with a..p naming the
  * sixteen floats of m in memory order.
  *
  * TODO: this must be tested and compared to SSE version, may be slower!!!
  */
 CGLM_INLINE
 void
 glm_mat4_transp_avx(mat4 m, mat4 dest) {
  __m256 y0, y1, y2, y3;
  y0 = glmm_load256(m[0]);                   /* h g f e d c b a */
  y1 = glmm_load256(m[2]);                   /* p o n m l k j i */
  /* round 1: interleave the two halves of the matrix */
  y2 = _mm256_unpacklo_ps(y0, y1);           /* n f m e j b i a */
  y3 = _mm256_unpackhi_ps(y0, y1);           /* p h o g l d k c */
  y0 = _mm256_permute2f128_ps(y2, y3, 0x20); /* l d k c j b i a */
  y1 = _mm256_permute2f128_ps(y2, y3, 0x31); /* p h o g n f m e */
  /* round 2: interleave again so every 128-bit half holds one output column */
  y2 = _mm256_unpacklo_ps(y0, y1);           /* o k g c m i e a */
  y3 = _mm256_unpackhi_ps(y0, y1);           /* p l h d n j f b */
  y0 = _mm256_permute2f128_ps(y2, y3, 0x20); /* n j f b m i e a */
  y1 = _mm256_permute2f128_ps(y2, y3, 0x31); /* p l h d o k g c */
  glmm_store256(dest[0], y0);
  glmm_store256(dest[2], y1);
 }
 CGLM_INLINE
@@ -29,7 +59,8 @@ void
 glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
  /* D = R * L (Column-Major) */
-  __m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
+  __m256  y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13;
  __m256i yi0, yi1, yi2, yi3;
  y0 = glmm_load256(m2[0]); /* h g f e d c b a */
  y1 = glmm_load256(m2[2]); /* p o n m l k j i */
@@ -41,35 +72,43 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
  y4 = _mm256_permute2f128_ps(y2, y2, 0x03); /* d c b a h g f e */
  y5 = _mm256_permute2f128_ps(y3, y3, 0x03); /* l k j i p o n m */
  yi0 = _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0);
  yi1 = _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2);
  yi2 = _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1);
  yi3 = _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3);
  /* f f f f a a a a */
  /* h h h h c c c c */
  /* e e e e b b b b */
  /* g g g g d d d d */
-  y6 = _mm256_permutevar_ps(y0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
+  y6 = _mm256_permutevar_ps(y0, yi0);
-  y7 = _mm256_permutevar_ps(y0, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2));
+  y7 = _mm256_permutevar_ps(y0, yi1);
-  y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
+  y8 = _mm256_permutevar_ps(y0, yi2);
-  y9 = _mm256_permutevar_ps(y0, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
+  y9 = _mm256_permutevar_ps(y0, yi3);
  glmm_store256(dest[0],
                _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
                                            _mm256_mul_ps(y3, y7)),
                              _mm256_add_ps(_mm256_mul_ps(y4, y8),
                                            _mm256_mul_ps(y5, y9))));
  /* n n n n i i i i */
  /* p p p p k k k k */
  /* m m m m j j j j */
  /* o o o o l l l l */
-  y6 = _mm256_permutevar_ps(y1, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
+  y10 = _mm256_permutevar_ps(y1, yi0);
-  y7 = _mm256_permutevar_ps(y1, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2));
+  y11 = _mm256_permutevar_ps(y1, yi1);
-  y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
+  y12 = _mm256_permutevar_ps(y1, yi2);
-  y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
+  y13 = _mm256_permutevar_ps(y1, yi3);
-  glmm_store256(dest[2],
+  y0 = _mm256_mul_ps(y2, y6);
-                _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
+  y1 = _mm256_mul_ps(y2, y10);
-                                            _mm256_mul_ps(y3, y7)),
+
-                              _mm256_add_ps(_mm256_mul_ps(y4, y8),
+  y0 = glmm256_fmadd(y3, y7, y0);
-                                            _mm256_mul_ps(y5, y9))));
+  y1 = glmm256_fmadd(y3, y11, y1);
  y0 = glmm256_fmadd(y4, y8, y0);
  y1 = glmm256_fmadd(y4, y12, y1);
  y0 = glmm256_fmadd(y5, y9, y0);
  y1 = glmm256_fmadd(y5, y13, y1);
  glmm_store256(dest[0], y0);
  glmm_store256(dest[2], y1);
 }
 #endif
--- a/include/cglm/simd/intrin.h
+++ b/include/cglm/simd/intrin.h
@@ -8,7 +8,7 @@
 #ifndef cglm_intrin_h
 #define cglm_intrin_h
-#if defined( _MSC_VER )
+#if defined(_MSC_VER) && !defined(_M_ARM64EC)
 #  if (defined(_M_AMD64) || defined(_M_X64)) || _M_IX86_FP == 2
 #    ifndef __SSE__
 #      define __SSE__
@@ -20,13 +20,37 @@
 #    ifndef __SSE__
 #      define __SSE__
 #    endif
-#endif
+#  endif
 /* do not use alignment for older visual studio versions */
-#  if _MSC_VER < 1913     /* Visual Studio 2017 version 15.6 */
+/* ARM32 causes a similar error, so disable alignment on ARM32 too for now */
 #  if _MSC_VER < 1913 || _M_ARM     /* Visual Studio 2017 version 15.6 */
 #    define CGLM_ALL_UNALIGNED
 #  endif
 #endif
 #ifdef __AVX__
 #  include <immintrin.h>
 #  define CGLM_AVX_FP 1
 #    ifndef __SSE2__
 #      define __SSE2__
 #    endif
 #    ifndef __SSE3__
 #      define __SSE3__
 #    endif
 #    ifndef __SSE4__
 #      define __SSE4__
 #    endif
 #    ifndef __SSE4_1__
 #      define __SSE4_1__
 #    endif
 #    ifndef __SSE4_2__
 #      define __SSE4_2__
 #    endif
 #  ifndef CGLM_SIMD_x86
 #    define CGLM_SIMD_x86
 #  endif
 #endif
 #if defined(__SSE__)
 #  include <xmmintrin.h>
 #  define CGLM_SSE_FP 1
@@ -64,14 +88,6 @@
 #  endif
 #endif
 #ifdef __AVX__
 #  include <immintrin.h>
 #  define CGLM_AVX_FP 1
 #  ifndef CGLM_SIMD_x86
 #    define CGLM_SIMD_x86
 #  endif
 #endif
 /* ARM Neon */
 #if defined(_WIN32) && defined(_MSC_VER)
 /* TODO: non-ARM stuff is already imported; would this be a better option? */
@@ -100,7 +116,7 @@
 #else /* non-windows */
 #  if defined(__ARM_NEON) || defined(__ARM_NEON__)
 #    include <arm_neon.h>
-#    if defined(__ARM_NEON_FP)
+#    if defined(__ARM_NEON_FP) || defined(__ARM_FP)
 #      define CGLM_NEON_FP 1
 #    endif
 #    ifndef CGLM_SIMD_ARM
--- a/include/cglm/simd/neon/mat4.h
+++ b/include/cglm/simd/neon/mat4.h
@@ -172,6 +172,8 @@ glm_mat4_det_neon(mat4 mat) {
  return glmm_hadd(vmulq_f32(x2, r0));
 }
 /* old one */
 #if 0
 CGLM_INLINE
 void
 glm_mat4_inv_neon(mat4 mat, mat4 dest) {
@@ -297,7 +299,7 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) {
                    vget_low_f32(vzipq_f32(v2, v3).val[0]));
  /*
-  x0 = glmm_div(glmm_set1(1.0f), glmm_vhadd(vmulq_f32(x0, r0)));
+  x0 = glmm_div(glmm_set1_rval(1.0f), glmm_vhadd(vmulq_f32(x0, r0)));
  glmm_store(dest[0], vmulq_f32(v0, x0));
  glmm_store(dest[1], vmulq_f32(v1, x0));
@@ -312,6 +314,155 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) {
  glmm_store(dest[2], glmm_div(v2, x0));
  glmm_store(dest[3], glmm_div(v3, x0));
 }
 #endif
 /*
  * Inverts mat into dest using NEON.
  *
  * Steps, as laid out below: load the four columns, build transposed and
  * rotated views of them, compute the twelve 2x2 terms c1..c12, combine them
  * into the determinant, take its reciprocal (idt), pre-multiply the c terms
  * by idt, form the sixteen cofactor sums listed in the "result" comment and
  * finally flip the sign of alternating lanes with xor masks.
  *
  * No check for a zero determinant is made before the division.
  * Letters a..p in the lane comments name the sixteen floats of mat in memory
  * order, written from the highest lane down to lane 0.
  */
 CGLM_INLINE
 void
 glm_mat4_inv_neon(mat4 mat, mat4 dest) {
  float32x4_t   r0, r1, r2, r3,
                v0, v1, v2, v3, v4, v5,
                t0, t1, t2;
  float32x4x2_t a0, a1, a2, a3, a4;
  float32x4_t   s1 = glmm_float32x4_SIGNMASK_PNPN, s2;
 #if !CGLM_ARM64
  /* 64-bit halves used by the ARMv7 horizontal-add fallback below */
  float32x2_t   l0, l1;
 #endif
  /* s2 = s1 with the two lanes of every 64-bit half swapped */
  s2 = vrev64q_f32(s1);
  /* 127 <- 0 */
  r0 = glmm_load(mat[0]);                  /* d c b a */
  r1 = glmm_load(mat[1]);                  /* h g f e */
  r2 = glmm_load(mat[2]);                  /* l k j i */
  r3 = glmm_load(mat[3]);                  /* p o n m */
  a1 = vzipq_f32(r0, r2);                  /* l d k c, j b i a */
  a2 = vzipq_f32(r1, r3);                  /* p h o g, n f m e */
  a3 = vzipq_f32(a2.val[0], a1.val[0]);    /* j n b f, i m a e */
  a4 = vzipq_f32(a2.val[1], a1.val[1]);    /* l p d h, k o c g */
  v0 = vextq_f32(a1.val[0], a1.val[1], 2); /* k c j b */
  v1 = vextq_f32(a2.val[0], a2.val[1], 2); /* o g n f */
  v2 = vextq_f32(a1.val[1], a2.val[0], 2); /* m e l d */
  v3 = vextq_f32(a2.val[1], a1.val[0], 2); /* i a p h */
  v4 = vextq_f32(v1, v2, 2);               /* l d o g */
  v5 = vextq_f32(v0, v3, 2);               /* p h k c */
  /* c2 = c * h - g * d   c12 = a * g - c * e   c8  = a * f - b * e
     c1 = k * p - o * l   c11 = i * o - k * m   c7  = i * n - j * m
     c4 = h * a - d * e   c6  = b * h - d * f   c10 = b * g - c * f
     c3 = p * i - l * m   c5  = j * p - l * n   c9  = j * o - k * n */
  t0 = vmulq_f32(v5, v3);
  t1 = vmulq_f32(a1.val[0], a2.val[1]);
  t2 = vmulq_f32(a1.val[0], v1);
  t0 = glmm_fnmadd(v4, v2, t0);
  t1 = glmm_fnmadd(a1.val[1], a2.val[0], t1);
  t2 = glmm_fnmadd(v0, a2.val[0], t2);
  t0 = vrev64q_f32(t0);
  t1 = vrev64q_f32(t1);
  t2 = vrev64q_f32(t2);
  /* det */
  v0 = vrev64q_f32(t2);
  v1 = vextq_f32(t1, t1, 2);
  v0 = vmulq_f32(t0, v0);
  v1 = vrev64q_f32(v1);
  v1 = vmulq_f32(v1, t1);
  /* c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */
 #if CGLM_ARM64
  /* two pairwise adds leave the full four-lane sum in every lane */
  v0 = vpaddq_f32(v0, v0);
  v0 = vpaddq_f32(v0, v0);
 #else
  l0 = vget_low_f32(v0);
  l1 = vget_high_f32(v0);
  l0 = vpadd_f32(l0, l0); /* [a+b, a+b] */
  l1 = vpadd_f32(l1, l1); /* [c+d, c+d] */
  l0 = vadd_f32(l0, l1);  /* [sum, sum] */
  v0 = vcombine_f32(l0, l0);
 #endif
  /* c5 * c12 + c6 * c11 */
  /* NOTE(review): the ARM64 branch yields [a+b, c+d, a+b, c+d] while the
     fallback yields [a+b, a+b, c+d, c+d]; v1 is t1 times its own lane
     reversal, so a+b should equal c+d and every lane should match in both
     branches - confirm */
 #if CGLM_ARM64
  v1 = vpaddq_f32(v1, v1);
 #else
  l0 = vget_low_f32(v1);
  l1 = vget_high_f32(v1);
  l0 = vpadd_f32(l0, l0); /* [a+b, a+b] */
  l1 = vpadd_f32(l1, l1); /* [c+d, c+d] */
  v1 = vcombine_f32(l0, l1);
 #endif
  v0 = vsubq_f32(v0, v1);    /* det */
  /* inv div */
  v1 = vdupq_n_f32(1.0f);
  v0 = glmm_div(v1, v0);     /* idt = 1 / det */
  /* multiply t0,t1,t2 by idt to reduce 1mul below: 2eor+4mul vs 3mul+4eor */
  t0 = vmulq_f32(t0, v0);
  t1 = vmulq_f32(t1, v0);
  t2 = vmulq_f32(t2, v0);
  a0 = vzipq_f32(t0, t0);    /* c4  c4  c3 c3, c2  c2  c1  c1  */
  a1 = vzipq_f32(t1, t1);    /* c6  c6  c5 c5, c12 c12 c11 c11 */
  a2 = vzipq_f32(t2, t2);    /* c10 c10 c9 c9, c8  c8  c7  c7  */
  /* result */
  /* dest[0][0] = (f * c1  - g * c5  + h * c9)  * idt;
     dest[0][1] = (b * c1  - c * c5  + d * c9)  * ndt;
     dest[0][2] = (n * c2  - o * c6  + p * c10) * idt;
     dest[0][3] = (j * c2  - k * c6  + l * c10) * ndt;
     dest[1][0] = (e * c1  - g * c3  + h * c11) * ndt;
     dest[1][1] = (a * c1  - c * c3  + d * c11) * idt;
     dest[1][2] = (m * c2  - o * c4  + p * c12) * ndt;
     dest[1][3] = (i * c2  - k * c4  + l * c12) * idt;
     dest[2][0] = (e * c5  - f * c3  + h * c7)  * idt;
     dest[2][1] = (a * c5  - b * c3  + d * c7)  * ndt;
     dest[2][2] = (m * c6  - n * c4  + p * c8)  * idt;
     dest[2][3] = (i * c6  - j * c4  + l * c8)  * ndt;
     dest[3][0] = (e * c9  - f * c11 + g * c7)  * ndt;
     dest[3][1] = (a * c9  - b * c11 + c * c7)  * idt;
     dest[3][2] = (m * c10 - n * c12 + o * c8)  * ndt;
     dest[3][3] = (i * c10 - j * c12 + k * c8)  * idt; */
  /* first product of every row above */
  r0 = vmulq_f32(a3.val[1], a0.val[0]);
  r1 = vmulq_f32(a3.val[0], a0.val[0]);
  r2 = vmulq_f32(a3.val[0], a1.val[1]);
  r3 = vmulq_f32(a3.val[0], a2.val[1]);
  /* subtract the second product */
  r0 = glmm_fnmadd(a4.val[0], a1.val[1], r0);
  r1 = glmm_fnmadd(a4.val[0], a0.val[1], r1);
  r2 = glmm_fnmadd(a3.val[1], a0.val[1], r2);
  r3 = glmm_fnmadd(a3.val[1], a1.val[0], r3);
  /* add the third product */
  r0 = glmm_fmadd(a4.val[1], a2.val[1], r0);
  r1 = glmm_fmadd(a4.val[1], a1.val[0], r1);
  r2 = glmm_fmadd(a4.val[1], a2.val[0], r2);
  r3 = glmm_fmadd(a4.val[0], a2.val[0], r3);
  /* 4 xor may be faster than 4 mul, see above */
  r0 = glmm_xor(r0, s1);
  r1 = glmm_xor(r1, s2);
  r2 = glmm_xor(r2, s1);
  r3 = glmm_xor(r3, s2);
  glmm_store(dest[0], r0);
  glmm_store(dest[1], r1);
  glmm_store(dest[2], r2);
  glmm_store(dest[3], r3);
 }
 #endif
 #endif /* cglm_mat4_neon_h */
--- a/include/cglm/simd/sse2/mat4.h
+++ b/include/cglm/simd/sse2/mat4.h
@@ -18,7 +18,7 @@ CGLM_INLINE
 void
 glm_mat4_scale_sse2(mat4 m, float s) {
  __m128 x0;
-  x0 = _mm_set1_ps(s);
+  x0 = glmm_set1(s);
  glmm_store(m[0], _mm_mul_ps(glmm_load(m[0]), x0));
  glmm_store(m[1], _mm_mul_ps(glmm_load(m[1]), x0));
@@ -295,6 +295,8 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
  glmm_store(dest[3], _mm_mul_ps(v3, x0));
 }
 /* old one */
 #if 0
 CGLM_INLINE
 void
 glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
@@ -424,13 +426,148 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
  x1 = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 0, 0));
  x0 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0));
-  x0 = _mm_div_ps(_mm_set1_ps(1.0f), glmm_vhadd(_mm_mul_ps(x0, r0)));
+  x0 = _mm_div_ps(glmm_set1(1.0f), glmm_vhadd(_mm_mul_ps(x0, r0)));
  glmm_store(dest[0], _mm_mul_ps(v0, x0));
  glmm_store(dest[1], _mm_mul_ps(v1, x0));
  glmm_store(dest[2], _mm_mul_ps(v2, x0));
  glmm_store(dest[3], _mm_mul_ps(v3, x0));
 }
 #endif
 /*
  * Inverts mat into dest using SSE2.
  *
  * Steps, as laid out below: load the four columns, build transposed and
  * shuffled views of them, compute the twelve 2x2 terms c1..c12, combine them
  * into the determinant, take its reciprocal (idt), pre-multiply the c terms
  * by idt, form the sixteen cofactor sums listed in the "result" comment and
  * finally flip the sign of alternating lanes with xor masks.
  *
  * No check for a zero determinant is made before the division.
  * Letters a..p in the lane comments name the sixteen floats of mat in memory
  * order, written from the highest lane down to lane 0.
  */
 CGLM_INLINE
 void
 glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
  __m128 r0, r1, r2, r3, s1, s2,
         v0, v1, v2, v3, v4, v5,
         t0, t1, t2,
         x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13;
  /* s1 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); */
  s1 = glmm_float32x4_SIGNMASK_NPNP;
  /* s2: the same mask with its lanes rotated by one, used for rows 1 and 3 */
  s2 = glmm_shuff1(s1, 2, 1, 2, 1);
  /* 127 <- 0 */
  r1 = glmm_load(mat[1]); /* h g f e */
  r0 = glmm_load(mat[0]); /* d c b a */
  r3 = glmm_load(mat[3]); /* p o n m */
  r2 = glmm_load(mat[2]); /* l k j i */
  x4  = _mm_unpackhi_ps(r0, r2); /* l d k c */
  x5  = _mm_unpacklo_ps(r0, r2); /* j b i a */
  x6  = _mm_unpackhi_ps(r1, r3); /* p h o g */
  x7  = _mm_unpacklo_ps(r1, r3); /* n f m e */
  x0  = _mm_unpackhi_ps(x7, x5); /* j n b f */
  x1  = _mm_unpacklo_ps(x7, x5); /* i m a e */
  x2  = _mm_unpackhi_ps(x6, x4); /* l p d h */
  x3  = _mm_unpacklo_ps(x6, x4); /* k o c g */
  /* c2 = c * h - d * g   c12 = a * g - c * e    c8  = a * f - b * e
     c1 = k * p - l * o   c11 = i * o - k * m    c7  = i * n - j * m
     c4 = a * h - d * e   c6  = b * h - d * f    c10 = b * g - c * f
     c3 = i * p - l * m   c5  = j * p - l * n    c9  = j * o - k * n */
  x8  = _mm_shuffle_ps(x0, x3, _MM_SHUFFLE(3, 1, 3, 1)); /* k c j b */
  x9  = _mm_shuffle_ps(x0, x3, _MM_SHUFFLE(2, 0, 2, 0)); /* o g n f */
  x10 = glmm_shuff1(x2, 2, 0, 2, 0);                     /* p h p h */
  x11 = glmm_shuff1(x2, 3, 1, 3, 1);                     /* l d l d */
 #if 0 /* TODO measure both */
  x12 = _mm_shuffle_ps(x4, x5, _MM_SHUFFLE(1, 0, 1, 0)); /* i a k c */
  x13 = _mm_shuffle_ps(x6, x7, _MM_SHUFFLE(1, 0, 1, 0)); /* m e o g */
 #else
  x12 = _mm_movelh_ps(x4, x5);                           /* i a k c */
  x13 = _mm_movelh_ps(x6, x7);                           /* m e o g */
 #endif
  /* t0, t1, t2 hold the c terms: first product, then minus the second */
  t0 = _mm_mul_ps(x12, x10);
  t1 = _mm_mul_ps(x5, x6);
  t2 = _mm_mul_ps(x5, x9);
  t0 = glmm_fnmadd(x11, x13, t0);
  t1 = glmm_fnmadd(x4, x7, t1);
  t2 = glmm_fnmadd(x8, x7, t2);
  /* det */
  /* v0: c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */
  /* v1: c5 * c12 + c6 * c11 */
  v5 = glmm_set1_rval(1.0f);
  v0 = glmm_shuff1(t2, 2, 3, 0, 1);
  v1 = glmm_shuff1(t1, 0, 1, 2, 3);
  v0 = _mm_mul_ps(t0, v0);
  v1 = _mm_mul_ps(t1, v1);
  /* shuffle + add pairs to sum the products across lanes */
  v2 = glmm_shuff1(v1, 1, 0, 0, 1);
  v3 = glmm_shuff1(v0, 0, 1, 2, 3);
  v1 = _mm_add_ps(v1, v2);
  v0 = _mm_add_ps(v0, v3);
  v2 = glmm_shuff1(v0, 1, 0, 0, 1);
  v0 = _mm_add_ps(v0, v2);
  v0 = _mm_sub_ps(v0, v1); /* det */
  v0 = _mm_div_ps(v5, v0); /* idt */
  /* multiply t0,t1,t2 by idt to reduce 1mul below: 2eor+4mul vs 3mul+4eor */
  t0 = _mm_mul_ps(t0, v0);
  t1 = _mm_mul_ps(t1, v0);
  t2 = _mm_mul_ps(t2, v0);
  v0 = glmm_shuff1(t0, 0, 0, 1, 1); /* c2  c2  c1  c1  */
  v1 = glmm_shuff1(t0, 2, 2, 3, 3); /* c4  c4  c3 c3   */
  v2 = glmm_shuff1(t1, 0, 0, 1, 1); /* c12 c12 c11 c11 */
  v3 = glmm_shuff1(t1, 2, 2, 3, 3); /* c6  c6  c5 c5   */
  v4 = glmm_shuff1(t2, 0, 0, 1, 1); /* c8  c8  c7  c7  */
  v5 = glmm_shuff1(t2, 2, 2, 3, 3); /* c10 c10 c9 c9   */
  /* result */
  /* dest[0][0] = (f * c1  - g * c5  + h * c9)  * idt;
     dest[0][1] = (b * c1  - c * c5  + d * c9)  * ndt;
     dest[0][2] = (n * c2  - o * c6  + p * c10) * idt;
     dest[0][3] = (j * c2  - k * c6  + l * c10) * ndt;
     dest[1][0] = (e * c1  - g * c3  + h * c11) * ndt;
     dest[1][1] = (a * c1  - c * c3  + d * c11) * idt;
     dest[1][2] = (m * c2  - o * c4  + p * c12) * ndt;
     dest[1][3] = (i * c2  - k * c4  + l * c12) * idt;
     dest[2][0] = (e * c5  - f * c3  + h * c7)  * idt;
     dest[2][1] = (a * c5  - b * c3  + d * c7)  * ndt;
     dest[2][2] = (m * c6  - n * c4  + p * c8)  * idt;
     dest[2][3] = (i * c6  - j * c4  + l * c8)  * ndt;
     dest[3][0] = (e * c9  - f * c11 + g * c7)  * ndt;
     dest[3][1] = (a * c9  - b * c11 + c * c7)  * idt;
     dest[3][2] = (m * c10 - n * c12 + o * c8)  * ndt;
     dest[3][3] = (i * c10 - j * c12 + k * c8)  * idt; */
  /* first product of every row above */
  r0 = _mm_mul_ps(x0, v0);
  r1 = _mm_mul_ps(x1, v0);
  r2 = _mm_mul_ps(x1, v3);
  r3 = _mm_mul_ps(x1, v5);
  /* subtract the second product */
  r0 = glmm_fnmadd(x3, v3, r0);
  r1 = glmm_fnmadd(x3, v1, r1);
  r2 = glmm_fnmadd(x0, v1, r2);
  r3 = glmm_fnmadd(x0, v2, r3);
  /* add the third product */
  r0 = glmm_fmadd(x2, v5, r0);
  r1 = glmm_fmadd(x2, v2, r1);
  r2 = glmm_fmadd(x2, v4, r2);
  r3 = glmm_fmadd(x3, v4, r3);
  /* 4 xor may be faster than 4 mul, see above */
  r0 = _mm_xor_ps(r0, s1);
  r1 = _mm_xor_ps(r1, s2);
  r2 = _mm_xor_ps(r2, s1);
  r3 = _mm_xor_ps(r3, s2);
  glmm_store(dest[0], r0);
  glmm_store(dest[1], r1);
  glmm_store(dest[2], r2);
  glmm_store(dest[3], r3);
 }
 #endif
 #endif /* cglm_mat_sse_h */
--- a/include/cglm/simd/wasm.h
+++ b/include/cglm/simd/wasm.h
@@ -14,8 +14,9 @@
 #define glmm_load(p)      wasm_v128_load(p)
 #define glmm_store(p, a)  wasm_v128_store(p, (a))
-#define glmm_set1(x) wasm_f32x4_splat(x)
+#define glmm_set1(x)      wasm_f32x4_splat(x)
-#define glmm_128     v128_t
+#define glmm_set1_rval(x) wasm_f32x4_splat(x)
 #define glmm_128          v128_t
 #define glmm_shuff1(xmm, z, y, x, w) wasm_i32x4_shuffle(xmm, xmm, w, x, y, z)
--- a/include/cglm/simd/x86.h
+++ b/include/cglm/simd/x86.h
@@ -18,31 +18,46 @@
 #  define glmm_store(p, a)  _mm_store_ps(p, a)
 #endif
 #define glmm_set1(x) _mm_set1_ps(x)
 #define glmm_128     __m128
-#if defined(CGLM_USE_INT_DOMAIN) && defined(__SSE2__)
+#ifdef __AVX__
 #  define glmm_shuff1(xmm, z, y, x, w)                                        \
-     _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm),                \
+     _mm_permute_ps((xmm), _MM_SHUFFLE(z, y, x, w))
                                        _MM_SHUFFLE(z, y, x, w)))
 #else
-#  define glmm_shuff1(xmm, z, y, x, w)                                        \
+#  if !defined(CGLM_NO_INT_DOMAIN) && defined(__SSE2__)
 #    define glmm_shuff1(xmm, z, y, x, w)                                      \
       _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm),              \
                                          _MM_SHUFFLE(z, y, x, w)))
 #  else
 #    define glmm_shuff1(xmm, z, y, x, w)                                      \
       _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w))
 #  endif
 #endif
 #define glmm_splat(x, lane) glmm_shuff1(x, lane, lane, lane, lane)
-#define glmm_splat_x(x) glmm_splat(x, 0)
+#ifdef __AVX__
-#define glmm_splat_y(x) glmm_splat(x, 1)
+#  define glmm_set1(x)      _mm_broadcast_ss(&x)
-#define glmm_splat_z(x) glmm_splat(x, 2)
+#  define glmm_set1_ptr(x)  _mm_broadcast_ss(x)
-#define glmm_splat_w(x) glmm_splat(x, 3)
+#  define glmm_set1_rval(x) _mm_set1_ps(x)
 #  ifdef __AVX2__
 #    define glmm_splat_x(x) _mm_broadcastss_ps(x)
 #  else
 #    define glmm_splat_x(x) _mm_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0))
 #  endif
 #  define glmm_splat_y(x)   _mm_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1))
 #  define glmm_splat_z(x)   _mm_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2))
 #  define glmm_splat_w(x)   _mm_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3))
 #else
 #  define glmm_set1(x)      _mm_set1_ps(x)
 #  define glmm_set1_ptr(x)  _mm_set1_ps(*x)
 #  define glmm_set1_rval(x) _mm_set1_ps(x)
-/* glmm_shuff1x() is DEPRECATED!, use glmm_splat() */
+#  define glmm_splat_x(x)   glmm_splat(x, 0)
-#define glmm_shuff1x(xmm, x) glmm_shuff1(xmm, x, x, x, x)
+#  define glmm_splat_y(x)   glmm_splat(x, 1)
-
+#  define glmm_splat_z(x)   glmm_splat(x, 2)
-#define glmm_shuff2(a, b, z0, y0, x0, w0, z1, y1, x1, w1)                     \
+#  define glmm_splat_w(x)   glmm_splat(x, 3)
-     glmm_shuff1(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)),           \
+#endif
                 z1, y1, x1, w1)
 #ifdef __AVX__
 #  ifdef CGLM_ALL_UNALIGNED
@@ -86,7 +101,7 @@
 #if defined(__SSE2__)
 #  define glmm_float32x4_SIGNMASK_NEG _mm_castsi128_ps(_mm_set1_epi32(GLMM_NEGZEROf)) /* _mm_set1_ps(-0.0f) */
 #else
-#  define glmm_float32x4_SIGNMASK_NEG _mm_set1_ps(GLMM_NEGZEROf)
+#  define glmm_float32x4_SIGNMASK_NEG glmm_set1(GLMM_NEGZEROf)
 #endif
 #define glmm_float32x8_SIGNMASK_NEG _mm256_castsi256_ps(_mm256_set1_epi32(GLMM_NEGZEROf))
--- a/include/cglm/struct.h
+++ b/include/cglm/struct.h
@@ -31,6 +31,7 @@ extern "C" {
 #include "struct/affine.h"
 #include "struct/frustum.h"
 #include "struct/plane.h"
 #include "struct/noise.h"
 #include "struct/box.h"
 #include "struct/color.h"
 #include "struct/io.h"
--- a/include/cglm/struct/aabb2d.h
+++ b/include/cglm/struct/aabb2d.h
@@ -62,7 +62,7 @@ glms_aabb2d_(merge)(vec2s aabb1[2], vec2s aabb2[2], vec2s dest[2]) {
 /*!
 * @brief crops a bounding box with another one.
 *
- * this could be useful for gettng a bbox which fits with view frustum and
+ * this could be useful for getting a bbox which fits with view frustum and
 * object bounding boxes. In this case you crop view frustum box with objects
 * box
 *
@@ -86,7 +86,7 @@ glms_aabb2d_(crop)(vec2s aabb[2], vec2s cropAabb[2], vec2s dest[2]) {
 /*!
 * @brief crops a bounding box with another one.
 *
- * this could be useful for gettng a bbox which fits with view frustum and
+ * this could be useful for getting a bbox which fits with view frustum and
 * object bounding boxes. In this case you crop view frustum box with objects
 * box
 *
@@ -137,8 +137,27 @@ glms_aabb2d_(isvalid)(vec2s aabb[2]) {
 */
 CGLM_INLINE
 float
-glms_aabb2d_(size)(vec2s aabb[2]) {
+glms_aabb2d_(diag)(vec2s aabb[2]) {
-  return glm_vec2_distance(aabb[0].raw, aabb[1].raw);
+  vec2 rawAabb[2];
  glms_vec2_(unpack)(rawAabb, aabb, 2);
  return glm_aabb2d_diag(rawAabb);
 }
 /*!
 * @brief size of aabb
 *
 * struct-API wrapper around glm_aabb2d_sizev(): the result is returned by
 * value instead of being written to an output parameter
 *
 * @param[in]  aabb bounding aabb
 * @returns         size
 */
 CGLM_INLINE
 vec2s
 glms_aabb2d_(sizev)(vec2s aabb[2]) {
  vec2s size;
  vec2  rawAabb[2];
  /* unpack the two struct corners into a raw vec2 array for the array API */
  glms_vec2_(unpack)(rawAabb, aabb, 2);
  glm_aabb2d_sizev(rawAabb, size.raw);
  return size;
 }
 /*!
@@ -232,4 +251,3 @@ glms_aabb2d_(contains)(vec2s aabb[2], vec2s other[2]) {
 }
 #endif /* cglms_aabb2ds_h */
--- a/include/cglm/struct/box.h
+++ b/include/cglm/struct/box.h
@@ -62,7 +62,7 @@ glms_aabb_(merge)(vec3s box1[2], vec3s box2[2], vec3s dest[2]) {
 /*!
 * @brief crops a bounding box with another one.
 *
- * this could be useful for gettng a bbox which fits with view frustum and
+ * this could be useful for getting a bbox which fits with view frustum and
 * object bounding boxes. In this case you crop view frustum box with objects
 * box
 *
@@ -86,7 +86,7 @@ glms_aabb_(crop)(vec3s box[2], vec3s cropBox[2], vec3s dest[2]) {
 /*!
 * @brief crops a bounding box with another one.
 *
- * this could be useful for gettng a bbox which fits with view frustum and
+ * this could be useful for getting a bbox which fits with view frustum and
 * object bounding boxes. In this case you crop view frustum box with objects
 * box
 *
--- a/include/cglm/struct/ivec2.h
+++ b/include/cglm/struct/ivec2.h
@@ -238,7 +238,7 @@ glms_ivec2_(subs)(ivec2s v, int s) {
 /*!
 * @brief multiply vector [a] with vector [b] and store result in [dest]
 *
- * @param[in]  a    frist vector
+ * @param[in]  a    first vector
 * @param[in]  b    second vector
 * @returns         destination
 */
--- a/include/cglm/struct/ivec3.h
+++ b/include/cglm/struct/ivec3.h
@@ -163,7 +163,7 @@ glms_ivec3_(dot)(ivec3s a, ivec3s b) {
 * @brief norm * norm (magnitude) of vec
 *
 * we can use this func instead of calling norm * norm, because it would call
- * sqrtf fuction twice but with this func we can avoid func call, maybe this is
+ * sqrtf function twice but with this func we can avoid func call, maybe this is
 * not good name for this func
 *
 * @param[in] v vector
@@ -253,7 +253,7 @@ glms_ivec3_(subs)(ivec3s v, int s) {
 /*!
 * @brief multiply vector [a] with vector [b] and store result in [dest]
 *
- * @param[in]  a    frist vector
+ * @param[in]  a    first vector
 * @param[in]  b    second vector
 * @returns         destination
 */
--- a/include/cglm/struct/ivec4.h
+++ b/include/cglm/struct/ivec4.h
@@ -201,7 +201,7 @@ glms_ivec4_(subs)(ivec4s v, int s) {
 /*!
 * @brief multiply vector [a] with vector [b] and store result in [dest]
 *
- * @param[in]  a    frist vector
+ * @param[in]  a    first vector
 * @param[in]  b    second vector
 * @returns         destination
 */
--- a/include/cglm/struct/mat3x4.h
+++ b/include/cglm/struct/mat3x4.h
@@ -13,8 +13,8 @@
 Functions:
   CGLM_INLINE mat3x4s glms_mat3x4_zero(void);
   CGLM_INLINE mat3x4s glms_mat3x4_make(const float * __restrict src);
-   CGLM_INLINE mat3s   glms_mat3x4_mul(mat3x4s m1, mat4x3s m2);
+   CGLM_INLINE mat4s   glms_mat3x4_mul(mat3x4s m1, mat4x3s m2);
-   CGLM_INLINE vec3s   glms_mat3x4_mulv(mat3x4s m, vec4s v);
+   CGLM_INLINE vec4s   glms_mat3x4_mulv(mat3x4s m, vec3s v);
   CGLM_INLINE mat4x3s glms_mat3x4_transpose(mat3x4s m);
   CGLM_INLINE mat3x4s glms_mat3x4_scale(mat3x4s m, float s);
 */
@@ -70,27 +70,27 @@ glms_mat3x4_(make)(const float * __restrict src) {
 *
 * @param[in]  m1   left matrix (mat3x4s)
 * @param[in]  m2   right matrix (mat4x3s)
- * @returns destination matrix (mat3s)
+ * @returns destination matrix (mat4s)
 */
 CGLM_INLINE
-mat3s
+mat4s
 glms_mat3x4_(mul)(mat3x4s m1, mat4x3s m2) {
-  mat3s r;
+  mat4s r;
  glm_mat3x4_mul(m1.raw, m2.raw, r.raw);
  return r;
 }
 /*!
- * @brief multiply matrix with column vector and store in dest vector
+ * @brief multiply matrix with column vector and store in dest column vector
 *
 * @param[in]  m    matrix (left)
 * @param[in]  v    vector (right, column vector)
- * @param[out] dest result vector
+ * @returns destination vector (vec4s)
 */
 CGLM_INLINE
-vec3s
+vec4s
-glms_mat3x4_(mulv)(mat3x4s m, vec4s v) {
+glms_mat3x4_(mulv)(mat3x4s m, vec3s v) {
-  vec3s r;
+  vec4s r;
  glm_mat3x4_mulv(m.raw, v.raw, r.raw);
  return r;
 }
--- a/include/cglm/struct/mat4x2.h
+++ b/include/cglm/struct/mat4x2.h
@@ -13,8 +13,8 @@
 Functions:
   CGLM_INLINE mat4x2s glms_mat4x2_zero(void);
   CGLM_INLINE mat4x2s glms_mat4x2_make(const float * __restrict src);
-   CGLM_INLINE mat4s   glms_mat4x2_mul(mat4x2s m1, mat2x4s m2);
+   CGLM_INLINE mat2s   glms_mat4x2_mul(mat4x2s m1, mat2x4s m2);
-   CGLM_INLINE vec4s   glms_mat4x2_mulv(mat4x2s m, vec2s v);
+   CGLM_INLINE vec2s   glms_mat4x2_mulv(mat4x2s m, vec4s v);
   CGLM_INLINE mat2x4s glms_mat4x2_transpose(mat4x2s m);
   CGLM_INLINE mat4x2s glms_mat4x2_scale(mat4x2s m, float s);
 */
@@ -71,27 +71,27 @@ glms_mat4x2_(make)(const float * __restrict src) {
 *
 * @param[in]  m1   left matrix (mat4x2s)
 * @param[in]  m2   right matrix (mat2x4s)
- * @returns destination matrix (mat4s)
+ * @returns destination matrix (mat2s)
 */
 CGLM_INLINE
-mat4s
+mat2s
 glms_mat4x2_(mul)(mat4x2s m1, mat2x4s m2) {
-  mat4s r;
+  mat2s r;
  glm_mat4x2_mul(m1.raw, m2.raw, r.raw);
  return r;
 }
 /*!
- * @brief multiply matrix with column vector and store in dest vector
+ * @brief multiply matrix with column vector and store in dest column vector
 *
 * @param[in]  m    matrix (left)
 * @param[in]  v    vector (right, column vector)
- * @param[out] dest result vector
+ * @returns destination vector (vec2s)
 */
 CGLM_INLINE
-vec4s
+vec2s
-glms_mat4x2_(mulv)(mat4x2s m, vec2s v) {
+glms_mat4x2_(mulv)(mat4x2s m, vec4s v) {
-  vec4s r;
+  vec2s r;
  glm_mat4x2_mulv(m.raw, v.raw, r.raw);
  return r;
 }
--- a/include/cglm/struct/mat4x3.h
+++ b/include/cglm/struct/mat4x3.h
@@ -13,8 +13,8 @@
 Functions:
   CGLM_INLINE mat4x3s glms_mat4x3_zero(void);
   CGLM_INLINE mat4x3s glms_mat4x3_make(const float * __restrict src);
-   CGLM_INLINE mat4s   glms_mat4x3_mul(mat4x3s m1, mat3x4s m2);
+   CGLM_INLINE mat3s   glms_mat4x3_mul(mat4x3s m1, mat3x4s m2);
-   CGLM_INLINE vec4s   glms_mat4x3_mulv(mat4x3s m, vec3s v);
+   CGLM_INLINE vec3s   glms_mat4x3_mulv(mat4x3s m, vec4s v);
   CGLM_INLINE mat3x4s glms_mat4x3_transpose(mat4x3s m);
   CGLM_INLINE mat4x3s glms_mat4x3_scale(mat4x3s m, float s);
 */
@@ -70,12 +70,12 @@ glms_mat4x3_(make)(const float * __restrict src) {
 *
 * @param[in]  m1   left matrix (mat4x3s)
 * @param[in]  m2   right matrix (mat3x4s)
- * @returns destination matrix (mat4s)
+ * @returns destination matrix (mat3s)
 */
 CGLM_INLINE
-mat4s
+mat3s
 glms_mat4x3_(mul)(mat4x3s m1, mat3x4s m2) {
-  mat4s r;
+  mat3s r;
  glm_mat4x3_mul(m1.raw, m2.raw, r.raw);
  return r;
 }
@@ -85,12 +85,12 @@ glms_mat4x3_(mul)(mat4x3s m1, mat3x4s m2) {
 *
 * @param[in]  m    matrix (left)
 * @param[in]  v    vector (right, column vector)
- * @param[out] dest result vector
+ * @returns destination vector (vec3s)
 */
 CGLM_INLINE
-vec4s
+vec3s
-glms_mat4x3_(mulv)(mat4x3s m, vec3s v) {
+glms_mat4x3_(mulv)(mat4x3s m, vec4s v) {
-  vec4s r;
+  vec3s r;
  glm_mat4x3_mulv(m.raw, v.raw, r.raw);
  return r;
 }
--- a/include/cglm/struct/noise.h
+++ b/include/cglm/struct/noise.h
@@ -0,0 +1,57 @@
 /*
 * Copyright (c), Recep Aslantas.
 *
 * MIT License (MIT), http://opensource.org/licenses/MIT
 * Full license can be found in the LICENSE file
 */
 #ifndef cglms_noises_h
 #define cglms_noises_h
 #include "../common.h"
 #include "../types-struct.h"
 #include "../noise.h"
 #include "vec4.h"
 /*
 Functions:
   CGLM_INLINE float glms_perlin_vec4(vec4s point);
   CGLM_INLINE float glms_perlin_vec3(vec3s point);
   CGLM_INLINE float glms_perlin_vec2(vec2s point);
 */
 /*!
 * @brief Classic perlin noise
 *
 * @param[in]  point  4D vector
 * @returns           perlin noise value
 */
 CGLM_INLINE
 float
 glms_perlin_vec4(vec4s point) {
  return glm_perlin_vec4(point.raw);
 }
 /*!
 * @brief Classic perlin noise
 *
 * @param[in]  point  3D vector
 * @returns           perlin noise value
 */
 CGLM_INLINE
 float
 glms_perlin_vec3(vec3s point) {
  return glm_perlin_vec3(point.raw);
 }
 /*!
 * @brief Classic perlin noise
 *
 * @param[in]  point  2D vector
 * @returns           perlin noise value
 */
 CGLM_INLINE
 float
 glms_perlin_vec2(vec2s point) {
  return glm_perlin_vec2(point.raw);
 }
 #endif /* cglms_noises_h */
--- a/include/cglm/struct/quat.h
+++ b/include/cglm/struct/quat.h
@@ -37,6 +37,7 @@
   CGLM_INLINE versors glms_quat_lerpc(versors from, versors to, float t)
   CGLM_INLINE versors glms_quat_nlerp(versors from, versors to, float t)
   CGLM_INLINE versors glms_quat_slerp(versors from, versors to, float t)
   CGLM_INLINE versors glms_quat_slerp_longest(versors from, versors to, float t)
   CGLM_INLINE mat4s   glms_quat_look(vec3s eye, versors ori)
   CGLM_INLINE versors glms_quat_for(vec3s dir, vec3s fwd, vec3s up)
   CGLM_INLINE versors glms_quat_forp(vec3s from, vec3s to, vec3s fwd, vec3s up)
@@ -104,7 +105,7 @@ glms_quat_(identity_array)(versors * __restrict q, size_t count) {
 }
 /*!
- * @brief inits quaterion with raw values
+ * @brief inits quaternion with raw values
 *
 * @param[in]   x     x
 * @param[in]   y     y
@@ -457,6 +458,23 @@ glms_quat_(slerp)(versors from, versors to, float t) {
  return dest;
 }
 /*!
 * @brief interpolates between two quaternions
 *        using spherical linear interpolation (SLERP) and always takes the longest path
 *
 * @param[in]   from  from
 * @param[in]   to    to
 * @param[in]   t     amount
 * @returns result quaternion
 */
 CGLM_INLINE
 versors
 glms_quat_(slerp_longest)(versors from, versors to, float t) {
  versors dest;
  glm_quat_slerp_longest(from.raw, to.raw, t, dest.raw);
  return dest;
 }
 /*!
 * @brief creates view matrix using quaternion as camera orientation
 *
--- a/include/cglm/struct/vec2-ext.h
+++ b/include/cglm/struct/vec2-ext.h
@@ -23,6 +23,12 @@
   CGLM_INLINE bool  glms_vec2_isinf(vec2s v)
   CGLM_INLINE bool  glms_vec2_isvalid(vec2s v)
   CGLM_INLINE vec2s glms_vec2_sign(vec2s v)
   CGLM_INLINE vec2s glms_vec2_abs(vec2s v)
   CGLM_INLINE vec2s glms_vec2_fract(vec2s v)
   CGLM_INLINE vec2s glms_vec2_floor(vec2s v)
   CGLM_INLINE vec2s glms_vec2_mods(vec2s v, float s)
   CGLM_INLINE vec2s glms_vec2_steps(float edge, vec2s v)
   CGLM_INLINE vec2s glms_vec2_stepr(vec2s edge, float v)
   CGLM_INLINE vec2s glms_vec2_sqrt(vec2s v)
 */
@@ -133,7 +139,7 @@ glms_vec2_min(vec2s v) {
 }
 /*!
- * @brief check if all items are NaN (not a number)
+ * @brief check if one of items is NaN (not a number)
 *        you should only use this in DEBUG mode or very critical asserts
 *
 * @param[in] v vector
@@ -145,7 +151,7 @@ glms_vec2_(isnan)(vec2s v) {
 }
 /*!
- * @brief check if all items are INFINITY
+ * @brief check if one of items is INFINITY
 *        you should only use this in DEBUG mode or very critical asserts
 *
 * @param[in] v vector
@@ -184,6 +190,95 @@ glms_vec2_(sign)(vec2s v) {
  return r;
 }
 /*!
 * @brief absolute value of each vector item
 *
 * @param[in]  v    vector
 * @returns         vector holding the absolute value of each item
 */
 CGLM_INLINE
 vec2s
 glms_vec2_(abs)(vec2s v) {
  vec2s r;
  glm_vec2_abs(v.raw, r.raw);
  return r;
 }
 /*!
 * @brief fractional part of each vector item
 *
 * @param[in]  v    vector
 * @returns         destination vector
 */
 CGLM_INLINE
 vec2s
 glms_vec2_(fract)(vec2s v) {
  vec2s r;
  glm_vec2_fract(v.raw, r.raw);
  return r;
 }
 /*!
 * @brief floor of each vector item
 *
 * @param[in]  v    vector
 * @returns         destination vector
 */
 CGLM_INLINE
 vec2s
 glms_vec2_(floor)(vec2s v) {
  vec2s r;
  glm_vec2_floor(v.raw, r.raw);
  return r;
 }
 /*!
 * @brief mod of each vector item by scalar
 *
 * @param[in]  v    vector
 * @param[in]  s    scalar
 * @returns         destination vector
 */
 CGLM_INLINE
 vec2s
 glms_vec2_(mods)(vec2s v, float s) {
  vec2s r;
  glm_vec2_mods(v.raw, s, r.raw);
  return r;
 }
 /*!
 * @brief threshold each vector item with scalar
 *        condition is: (x[i] < edge) ? 0.0 : 1.0
 *
 * @param[in]   edge   threshold
 * @param[in]   x      vector to test against threshold
 * @returns            destination
 */
 CGLM_INLINE
 vec2s
 glms_vec2_(steps)(float edge, vec2s x) {
  vec2s r;
  glm_vec2_steps(edge, x.raw, r.raw);
  return r;
 }
 /*!
 * @brief threshold a value with *vector* as the threshold
 *        condition is: (x < edge[i]) ? 0.0 : 1.0
 *
 * @param[in]   edge   threshold vector
 * @param[in]   x      value to test against threshold
 * @returns            destination
 */
 CGLM_INLINE
 vec2s
 glms_vec2_(stepr)(vec2s edge, float x) {
  vec2s r;
  glm_vec2_stepr(edge.raw, x, r.raw);
  return r;
 }
 /*!
 * @brief square root of each vector item
 *
--- a/include/cglm/struct/vec2.h
+++ b/include/cglm/struct/vec2.h
@@ -53,6 +53,7 @@
   CGLM_INLINE vec2s glms_vec2_minv(vec2s a, vec2s b)
   CGLM_INLINE vec2s glms_vec2_clamp(vec2s v, float minVal, float maxVal)
   CGLM_INLINE vec2s glms_vec2_lerp(vec2s from, vec2s to, float t)
   CGLM_INLINE vec2s glms_vec2_step(vec2s edge, vec2s x)
   CGLM_INLINE vec2s glms_vec2_make(float * restrict src)
   CGLM_INLINE vec2s glms_vec2_reflect(vec2s v, vec2s n)
   CGLM_INLINE bool  glms_vec2_refract(vec2s v, vec2s n, float eta, vec2s *dest)
@@ -679,6 +680,21 @@ glms_vec2_(lerp)(vec2s from, vec2s to, float t) {
  return r;
 }
 /*!
 * @brief threshold function
 *
 * @param[in]   edge    threshold
 * @param[in]   x       value to test against threshold
 * @returns             destination
 */
 CGLM_INLINE
 vec2s
 glms_vec2_(step)(vec2s edge, vec2s x) {
  vec2s r;
  glm_vec2_step(edge.raw, x.raw, r.raw);
  return r;
 }
 /*!
 * @brief Create two dimensional vector from pointer
 *
--- a/include/cglm/struct/vec3-ext.h
+++ b/include/cglm/struct/vec3-ext.h
@@ -26,6 +26,10 @@
   CGLM_INLINE vec3s glms_vec3_sign(vec3s v);
   CGLM_INLINE vec3s glms_vec3_abs(vec3s v);
   CGLM_INLINE vec3s glms_vec3_fract(vec3s v);
   CGLM_INLINE vec3s glms_vec3_floor(vec3s v);
   CGLM_INLINE vec3s glms_vec3_mods(vec3s v, float s);
   CGLM_INLINE vec3s glms_vec3_steps(float edge, vec3s v);
   CGLM_INLINE vec3s glms_vec3_stepr(vec3s edge, float v);
   CGLM_INLINE float glms_vec3_hadd(vec3s v);
   CGLM_INLINE vec3s glms_vec3_sqrt(vec3s v);
 */
@@ -151,7 +155,7 @@ glms_vec3_(min)(vec3s v) {
 }
 /*!
- * @brief check if all items are NaN (not a number)
+ * @brief check if one of items is NaN (not a number)
 *        you should only use this in DEBUG mode or very critical asserts
 *
 * @param[in] v vector
@@ -163,7 +167,7 @@ glms_vec3_(isnan)(vec3s v) {
 }
 /*!
- * @brief check if all items are INFINITY
+ * @brief check if one of items is INFINITY
 *        you should only use this in DEBUG mode or very critical asserts
 *
 * @param[in] v vector
@@ -230,6 +234,67 @@ glms_vec3_(fract)(vec3s v) {
  return r;
 }
 /*!
 * @brief floor of each vector item
 *
 * @param[in]  v    vector
 * @returns         destination vector
 */
 CGLM_INLINE
 vec3s
 glms_vec3_(floor)(vec3s v) {
  vec3s r;
  glm_vec3_floor(v.raw, r.raw);
  return r;
 }
 /*!
 * @brief mod of each vector item by scalar
 *
 * @param[in]  v    vector
 * @param[in]  s    scalar
 * @returns         destination vector
 */
 CGLM_INLINE
 vec3s
 glms_vec3_(mods)(vec3s v, float s) {
  vec3s r;
  glm_vec3_mods(v.raw, s, r.raw);
  return r;
 }
 /*!
 * @brief threshold each vector item with scalar
 *        condition is: (x[i] < edge) ? 0.0 : 1.0
 *
 * @param[in]   edge   threshold
 * @param[in]   x      vector to test against threshold
 * @returns            destination
 */
 CGLM_INLINE
 vec3s
 glms_vec3_(steps)(float edge, vec3s x) {
  vec3s r;
  glm_vec3_steps(edge, x.raw, r.raw);
  return r;
 }
 /*!
 * @brief threshold a value with *vector* as the threshold
 *        condition is: (x < edge[i]) ? 0.0 : 1.0
 *
 * @param[in]   edge   threshold vector
 * @param[in]   x      value to test against threshold
 * @returns            destination
 */
 CGLM_INLINE
 vec3s
 glms_vec3_(stepr)(vec3s edge, float x) {
  vec3s r;
  glm_vec3_stepr(edge.raw, x, r.raw);
  return r;
 }
 /*!
 * @brief vector reduction by summation
 * @warning could overflow
--- a/include/cglm/struct/vec3.h
+++ b/include/cglm/struct/vec3.h
@@ -68,7 +68,6 @@
   CGLM_INLINE vec3s glms_vec3_lerpc(vec3s from, vec3s to, float t);
   CGLM_INLINE vec3s glms_vec3_mix(vec3s from, vec3s to, float t);
   CGLM_INLINE vec3s glms_vec3_mixc(vec3s from, vec3s to, float t);
   CGLM_INLINE vec3s glms_vec3_step_uni(float edge, vec3s x);
   CGLM_INLINE vec3s glms_vec3_step(vec3s edge, vec3s x);
   CGLM_INLINE vec3s glms_vec3_smoothstep_uni(float edge0, float edge1, vec3s x);
   CGLM_INLINE vec3s glms_vec3_smoothstep(vec3s edge0, vec3s edge1, vec3s x);
@@ -84,6 +83,9 @@
   CGLM_INLINE vec3s glms_cross(vec3s a, vec3s b);
   CGLM_INLINE float glms_dot(vec3s a, vec3s b);
   CGLM_INLINE vec3s glms_normalize(vec3s v);
 Deprecated:
   glms_vec3_step_uni  -->  use glms_vec3_steps
 */
 #ifndef cglms_vec3s_h
@@ -95,6 +97,9 @@
 #include "../vec3.h"
 #include "vec3-ext.h"
 /* DEPRECATED! */
 #define glms_vec3_step_uni(edge, x) glms_vec3_steps(edge, x)
 #define GLMS_VEC3_ONE_INIT   {GLM_VEC3_ONE_INIT}
 #define GLMS_VEC3_ZERO_INIT  {GLM_VEC3_ZERO_INIT}
@@ -910,21 +915,6 @@ glms_vec3_(mixc)(vec3s from, vec3s to, float t) {
  return r;
 }
 /*!
 * @brief threshold function (unidimensional)
 *
 * @param[in]   edge    threshold
 * @param[in]   x       value to test against threshold
 * @returns             0.0 if x < edge, else 1.0
 */
 CGLM_INLINE
 vec3s
 glms_vec3_(step_uni)(float edge, vec3s x) {
  vec3s r;
  glm_vec3_step_uni(edge, x.raw, r.raw);
  return r;
 }
 /*!
 * @brief threshold function
 *
--- a/include/cglm/struct/vec4-ext.h
+++ b/include/cglm/struct/vec4-ext.h
@@ -26,6 +26,10 @@
   CGLM_INLINE vec4s glms_vec4_sign(vec4s v);
   CGLM_INLINE vec4s glms_vec4_abs(vec4s v);
   CGLM_INLINE vec4s glms_vec4_fract(vec4s v);
   CGLM_INLINE vec4s glms_vec4_floor(vec4s v);
   CGLM_INLINE vec4s glms_vec4_mods(vec4s v, float s);
   CGLM_INLINE vec4s glms_vec4_steps(float edge, vec4s v);
   CGLM_INLINE vec4s glms_vec4_stepr(vec4s edge, float v);
   CGLM_INLINE float glms_vec4_hadd(vec4s v);
   CGLM_INLINE vec4s glms_vec4_sqrt(vec4s v);
 */
@@ -230,6 +234,67 @@ glms_vec4_(fract)(vec4s v) {
  return r;
 }
 /*!
 * @brief floor of each vector item
 *
 * @param[in]  v    vector
 * @returns         destination vector
 */
 CGLM_INLINE
 vec4s
 glms_vec4_(floor)(vec4s v) {
  vec4s r;
  glm_vec4_floor(v.raw, r.raw);
  return r;
 }
 /*!
 * @brief mod of each vector item by scalar
 *
 * @param[in]  v    vector
 * @param[in]  s    scalar
 * @returns         destination vector
 */
 CGLM_INLINE
 vec4s
 glms_vec4_(mods)(vec4s v, float s) {
  vec4s r;
  glm_vec4_mods(v.raw, s, r.raw);
  return r;
 }
 /*!
 * @brief threshold each vector item with scalar
 *        condition is: (x[i] < edge) ? 0.0 : 1.0
 *
 * @param[in]   edge   threshold
 * @param[in]   x      vector to test against threshold
 * @returns            destination
 */
 CGLM_INLINE
 vec4s
 glms_vec4_(steps)(float edge, vec4s x) {
  vec4s r;
  glm_vec4_steps(edge, x.raw, r.raw);
  return r;
 }
 /*!
 * @brief threshold a value with *vector* as the threshold
 *        condition is: (x < edge[i]) ? 0.0 : 1.0
 *
 * @param[in]   edge   threshold vector
 * @param[in]   x      value to test against threshold
 * @returns            destination
 */
 CGLM_INLINE
 vec4s
 glms_vec4_(stepr)(vec4s edge, float x) {
  vec4s r;
  glm_vec4_stepr(edge.raw, x, r.raw);
  return r;
 }
 /*!
 * @brief vector reduction by summation
 * @warning could overflow
--- a/include/cglm/struct/vec4.h
+++ b/include/cglm/struct/vec4.h
@@ -58,7 +58,6 @@
   CGLM_INLINE vec4s glms_vec4_lerpc(vec4s from, vec4s to, float t);
   CGLM_INLINE vec4s glms_vec4_mix(vec4s from, vec4s to, float t);
   CGLM_INLINE vec4s glms_vec4_mixc(vec4s from, vec4s to, float t);
   CGLM_INLINE vec4s glms_vec4_step_uni(float edge, vec4s x);
   CGLM_INLINE vec4s glms_vec4_step(vec4s edge, vec4s x);
   CGLM_INLINE vec4s glms_vec4_smoothstep_uni(float edge0, float edge1, vec4s x);
   CGLM_INLINE vec4s glms_vec4_smoothstep(vec4s edge0, vec4s edge1, vec4s x);
@@ -69,6 +68,9 @@
   CGLM_INLINE vec4s glms_vec4_make(float * restrict src);
   CGLM_INLINE vec4s glms_vec4_reflect(vec4s v, vec4s n);
   CGLM_INLINE bool  glms_vec4_refract(vec4s v, vec4s n, float eta, vec4s *dest)
 Deprecated:
   glms_vec4_step_uni  -->  use glms_vec4_steps
 */
 #ifndef cglms_vec4s_h
@@ -80,6 +82,9 @@
 #include "../vec4.h"
 #include "vec4-ext.h"
 /* DEPRECATED! */
 #define glms_vec4_step_uni(edge, x) glms_vec4_steps(edge, x)
 #define GLMS_VEC4_ONE_INIT   {GLM_VEC4_ONE_INIT}
 #define GLMS_VEC4_BLACK_INIT {GLM_VEC4_BLACK_INIT}
 #define GLMS_VEC4_ZERO_INIT  {GLM_VEC4_ZERO_INIT}
@@ -786,21 +791,6 @@ glms_vec4_(mixc)(vec4s from, vec4s to, float t) {
  return r;
 }
 /*!
 * @brief threshold function (unidimensional)
 *
 * @param[in]   edge    threshold
 * @param[in]   x       value to test against threshold
 * @returns             0.0 if x < edge, else 1.0
 */
 CGLM_INLINE
 vec4s
 glms_vec4_(step_uni)(float edge, vec4s x) {
  vec4s r;
  glm_vec4_step_uni(edge, x.raw, r.raw);
  return r;
 }
 /*!
 * @brief threshold function
 *
--- a/include/cglm/types-struct.h
+++ b/include/cglm/types-struct.h
@@ -24,6 +24,12 @@
     /* The user has defined CGLM_NO_ANONYMOUS_STRUCT. This used to be the
      * only #define governing the use of anonymous structs, so for backward
      * compatibility, we still honor that choice and disable them. */
 #    define CGLM_USE_ANONYMOUS_STRUCT 0
     /* Disable anonymous structs if strict ANSI mode is enabled for C89 or C99 */
 #  elif defined(__STRICT_ANSI__) && \
        (!defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L))
     /* __STRICT_ANSI__ is defined and we're in C89
      * or C99 mode (C11 or later not detected) */
 #    define CGLM_USE_ANONYMOUS_STRUCT 0
 #  elif (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \
        (defined(__cplusplus)      && __cplusplus >= 201103L)
--- a/include/cglm/types.h
+++ b/include/cglm/types.h
@@ -14,7 +14,8 @@
 #if defined(_MSC_VER)
 /* do not use alignment for older visual studio versions */
-#  if _MSC_VER < 1913 /*  Visual Studio 2017 version 15.6  */
+/* ARM32 also causes a similar error, so alignment is disabled there too for now */
 #  if _MSC_VER < 1913 || _M_ARM /*  Visual Studio 2017 version 15.6  */
 #    define CGLM_ALL_UNALIGNED
 #    define CGLM_ALIGN(X) /* no alignment */
 #  else
--- a/include/cglm/vec2-ext.h
+++ b/include/cglm/vec2-ext.h
@@ -20,6 +20,11 @@
   CGLM_INLINE bool  glm_vec2_isvalid(vec2 v);
   CGLM_INLINE void  glm_vec2_sign(vec2 v, vec2 dest);
   CGLM_INLINE void  glm_vec2_abs(vec2 v, vec2 dest);
   CGLM_INLINE void  glm_vec2_fract(vec2 v, vec2 dest);
   CGLM_INLINE void  glm_vec2_floor(vec2 v, vec2 dest);
   CGLM_INLINE void  glm_vec2_mods(vec2 v, float s, vec2 dest);
   CGLM_INLINE void  glm_vec2_steps(float edge, vec2 v, vec2 dest);
   CGLM_INLINE void  glm_vec2_stepr(vec2 edge, float v, vec2 dest);
   CGLM_INLINE void  glm_vec2_sqrt(vec2 v, vec2 dest);
   CGLM_INLINE void  glm_vec2_complex_mul(vec2 a, vec2 b, vec2 dest)
   CGLM_INLINE void  glm_vec2_complex_div(vec2 a, vec2 b, vec2 dest)
@@ -128,7 +133,7 @@ glm_vec2_min(vec2 v) {
 }
 /*!
- * @brief check if all items are NaN (not a number)
+ * @brief check if one of items is NaN (not a number)
 *        you should only use this in DEBUG mode or very critical asserts
 *
 * @param[in] v vector
@@ -136,11 +141,15 @@ glm_vec2_min(vec2 v) {
 CGLM_INLINE
 bool
 glm_vec2_isnan(vec2 v) {
 #ifndef CGLM_FAST_MATH
  return isnan(v[0]) || isnan(v[1]);
 #else
  return false;
 #endif
 }
 /*!
- * @brief check if all items are INFINITY
+ * @brief check if one of items is INFINITY
 *        you should only use this in DEBUG mode or very critical asserts
 *
 * @param[in] v vector
@@ -148,7 +157,11 @@ glm_vec2_isnan(vec2 v) {
 CGLM_INLINE
 bool
 glm_vec2_isinf(vec2 v) {
 #ifndef CGLM_FAST_MATH
  return isinf(v[0]) || isinf(v[1]);
 #else
  return false;
 #endif
 }
 /*!
@@ -190,6 +203,46 @@ glm_vec2_abs(vec2 v, vec2 dest) {
  dest[1] = fabsf(v[1]);
 }
 /*!
 * @brief fractional part of each vector item
 *
 * each item is computed as v[i] - floorf(v[i]) and then clamped with fminf
 * so the stored result never reaches 1.0
 *
 * @param[in]  v    vector
 * @param[out] dest destination vector
 */
 CGLM_INLINE
 void
 glm_vec2_fract(vec2 v, vec2 dest) {
  /* the constant is 1 - 2^-24, the largest 32-bit float below 1.0; the clamp
     covers inputs where v - floorf(v) would round up to exactly 1.0
     (e.g. a very small negative v) */
  dest[0] = fminf(v[0] - floorf(v[0]), 0.999999940395355224609375f);
  dest[1] = fminf(v[1] - floorf(v[1]), 0.999999940395355224609375f);
 }
 /*!
 * @brief floor of each vector item
 *
 * @param[in]  v    vector
 * @param[out] dest destination vector
 */
 CGLM_INLINE
 void
 glm_vec2_floor(vec2 v, vec2 dest) {
  dest[0] = floorf(v[0]);
  dest[1] = floorf(v[1]);
 }
 /*!
 * @brief mod of each vector item, result is written to dest (dest = v % s)
 *
 * @param[in]  v    vector
 * @param[in]  s    scalar
 * @param[out] dest destination vector
 */
 CGLM_INLINE
 void
 glm_vec2_mods(vec2 v, float s, vec2 dest) {
  dest[0] = fmodf(v[0], s);
  dest[1] = fmodf(v[1], s);
 }
 /*!
 * @brief square root of each vector item
 *
@@ -220,6 +273,36 @@ glm_vec2_complex_mul(vec2 a, vec2 b, vec2 dest) {
  dest[1] = ti;
 }
 /*!
 * @brief threshold each vector item with scalar
 *        condition is: (x[i] < edge) ? 0.0 : 1.0
 *
 * @param[in]   edge    threshold
 * @param[in]   x       vector to test against threshold
 * @param[out]  dest    destination
 */
 CGLM_INLINE
 void
 glm_vec2_steps(float edge, vec2 x, vec2 dest) {
  dest[0] = glm_step(edge, x[0]);
  dest[1] = glm_step(edge, x[1]);
 }
 /*!
 * @brief threshold a value with *vector* as the threshold
 *        condition is: (x < edge[i]) ? 0.0 : 1.0
 *
 * @param[in]   edge    threshold vector
 * @param[in]   x       value to test against threshold
 * @param[out]  dest    destination
 */
 CGLM_INLINE
 void
 glm_vec2_stepr(vec2 edge, float x, vec2 dest) {
  dest[0] = glm_step(edge[0], x);
  dest[1] = glm_step(edge[1], x);
 }
 /*!
 * @brief treat vectors as complex numbers and divide them as such.
 *
--- a/include/cglm/vec2.h
+++ b/include/cglm/vec2.h
@@ -53,7 +53,9 @@
   CGLM_INLINE void  glm_vec2_maxv(vec2 v1, vec2 v2, vec2 dest)
   CGLM_INLINE void  glm_vec2_minv(vec2 v1, vec2 v2, vec2 dest)
   CGLM_INLINE void  glm_vec2_clamp(vec2 v, float minVal, float maxVal)
   CGLM_INLINE void  glm_vec2_swizzle(vec2 v, int mask, vec2 dest)
   CGLM_INLINE void  glm_vec2_lerp(vec2 from, vec2 to, float t, vec2 dest)
   CGLM_INLINE void  glm_vec2_step(vec2 edge, vec2 x, vec2 dest)
   CGLM_INLINE void  glm_vec2_make(float * restrict src, vec2 dest)
   CGLM_INLINE void  glm_vec2_reflect(vec2 v, vec2 n, vec2 dest)
   CGLM_INLINE void  glm_vec2_refract(vec2 v, vec2 n, float eta, vec2 dest)
@@ -679,6 +681,24 @@ glm_vec2_clamp(vec2 v, float minval, float maxval) {
  v[1] = glm_clamp(v[1], minval, maxval);
 }
 /*!
 * @brief swizzle vector components
 *
 * mask packs one 2-bit source index per destination item: bits 0-1 select
 * the item of v copied to dest[0], bits 2-3 select the item copied to dest[1]
 *
 * NOTE(review): a 2-bit field can encode 0..3 but only items 0 and 1 of v are
 * used as a vec2 here; presumably callers must keep both fields at 0 or 1 -
 * confirm against the mask helper macros
 *
 * @param[in]  v    source
 * @param[in]  mask mask
 * @param[out] dest destination
 */
 CGLM_INLINE
 void
 glm_vec2_swizzle(vec2 v, int mask, vec2 dest) {
  vec2 t;
  /* build the result in a temporary and copy it out afterwards; presumably
     this is what allows dest to be the same vector as v - TODO confirm */
  t[0] = v[(mask & (3 << 0))];
  t[1] = v[(mask & (3 << 2)) >> 2];
  glm_vec2_copy(t, dest);
 }
 /*!
 * @brief linear interpolation between two vector
 *
@@ -701,6 +721,20 @@ glm_vec2_lerp(vec2 from, vec2 to, float t, vec2 dest) {
  glm_vec2_add(from, v, dest);
 }
 /*!
 * @brief threshold function
 *
 * @param[in]   edge    threshold
 * @param[in]   x       value to test against threshold
 * @param[out]  dest    destination
 */
 CGLM_INLINE
 void
 glm_vec2_step(vec2 edge, vec2 x, vec2 dest) {
  dest[0] = glm_step(edge[0], x[0]);
  dest[1] = glm_step(edge[1], x[1]);
 }
 /*!
 * @brief Create two dimensional vector from pointer
 *
@@ -749,7 +783,7 @@ glm_vec2_refract(vec2 v, vec2 n, float eta, vec2 dest) {
  ndi = glm_vec2_dot(n, v);
  eni = eta * ndi;
-  k   = 1.0f + eta * eta - eni * eni;
+  k   = 1.0f - eta * eta + eni * eni;
  if (k < 0.0f) {
    glm_vec2_zero(dest);
--- a/include/cglm/vec3-ext.h
+++ b/include/cglm/vec3-ext.h
@@ -26,6 +26,10 @@
   CGLM_INLINE void  glm_vec3_sign(vec3 v, vec3 dest);
   CGLM_INLINE void  glm_vec3_abs(vec3 v, vec3 dest);
   CGLM_INLINE void  glm_vec3_fract(vec3 v, vec3 dest);
   CGLM_INLINE void  glm_vec3_floor(vec3 v, vec3 dest);
   CGLM_INLINE void  glm_vec3_mods(vec3 v, float s, vec3 dest);
   CGLM_INLINE void  glm_vec3_steps(float edge, vec3 v, vec3 dest);
   CGLM_INLINE void  glm_vec3_stepr(vec3 edge, float v, vec3 dest);
   CGLM_INLINE float glm_vec3_hadd(vec3 v);
   CGLM_INLINE void  glm_vec3_sqrt(vec3 v, vec3 dest);
 */
@@ -164,7 +168,7 @@ glm_vec3_min(vec3 v) {
 }
 /*!
- * @brief check if all items are NaN (not a number)
+ * @brief check if one of items is NaN (not a number)
 *        you should only use this in DEBUG mode or very critical asserts
 *
 * @param[in] v vector
@@ -172,11 +176,15 @@ glm_vec3_min(vec3 v) {
 CGLM_INLINE
 bool
 glm_vec3_isnan(vec3 v) {
 #ifndef CGLM_FAST_MATH
  return isnan(v[0]) || isnan(v[1]) || isnan(v[2]);
 #else
  return false;
 #endif
 }
 /*!
- * @brief check if all items are INFINITY
+ * @brief check if one of items is INFINITY
 *        you should only use this in DEBUG mode or very critical asserts
 *
 * @param[in] v vector
@@ -184,7 +192,11 @@ glm_vec3_isnan(vec3 v) {
 CGLM_INLINE
 bool
 glm_vec3_isinf(vec3 v) {
 #ifndef CGLM_FAST_MATH
  return isinf(v[0]) || isinf(v[1]) || isinf(v[2]);
 #else
  return false;
 #endif
 }
 /*!
@@ -242,6 +254,67 @@ glm_vec3_fract(vec3 v, vec3 dest) {
  dest[2] = fminf(v[2] - floorf(v[2]), 0.999999940395355224609375f);
 }
 /*!
 * @brief floor of each vector item
 *
 * @param[in]  v    vector
 * @param[out] dest destination vector
 */
 CGLM_INLINE
 void
 glm_vec3_floor(vec3 v, vec3 dest) {
  dest[0] = floorf(v[0]);
  dest[1] = floorf(v[1]);
  dest[2] = floorf(v[2]);
 }
 /*!
 * @brief mod of each vector item, result is written to dest (dest = v % s)
 *
 * @param[in]  v    vector
 * @param[in]  s    scalar
 * @param[out] dest destination vector
 */
 CGLM_INLINE
 void
 glm_vec3_mods(vec3 v, float s, vec3 dest) {
  dest[0] = fmodf(v[0], s);
  dest[1] = fmodf(v[1], s);
  dest[2] = fmodf(v[2], s);
 }
 /*!
 * @brief threshold each vector item with scalar
 *        condition is: (x[i] < edge) ? 0.0 : 1.0
 *
 * @param[in]   edge    threshold
 * @param[in]   x       vector to test against threshold
 * @param[out]  dest    destination
 */
 CGLM_INLINE
 void
 glm_vec3_steps(float edge, vec3 x, vec3 dest) {
  dest[0] = glm_step(edge, x[0]);
  dest[1] = glm_step(edge, x[1]);
  dest[2] = glm_step(edge, x[2]);
 }
 /*!
 * @brief threshold a value with *vector* as the threshold
 *        condition is: (x < edge[i]) ? 0.0 : 1.0
 *
 * @param[in]   edge    threshold vector
 * @param[in]   x       value to test against threshold
 * @param[out]  dest    destination
 */
 CGLM_INLINE
 void
 glm_vec3_stepr(vec3 edge, float x, vec3 dest) {
  dest[0] = glm_step(edge[0], x);
  dest[1] = glm_step(edge[1], x);
  dest[2] = glm_step(edge[2], x);
 }
 /*!
 * @brief vector reduction by summation
 * @warning could overflow
--- a/include/cglm/vec3.h
+++ b/include/cglm/vec3.h
@@ -72,7 +72,6 @@
   CGLM_INLINE void  glm_vec3_lerpc(vec3 from, vec3 to, float t, vec3 dest);
   CGLM_INLINE void  glm_vec3_mix(vec3 from, vec3 to, float t, vec3 dest);
   CGLM_INLINE void  glm_vec3_mixc(vec3 from, vec3 to, float t, vec3 dest);
   CGLM_INLINE void  glm_vec3_step_uni(float edge, vec3 x, vec3 dest);
   CGLM_INLINE void  glm_vec3_step(vec3 edge, vec3 x, vec3 dest);
   CGLM_INLINE void  glm_vec3_smoothstep_uni(float edge0, float edge1, vec3 x, vec3 dest);
   CGLM_INLINE void  glm_vec3_smoothstep(vec3 edge0, vec3 edge1, vec3 x, vec3 dest);
@@ -97,6 +96,7 @@
   glm_vec3_inv
   glm_vec3_inv_to
   glm_vec3_mulv
   glm_vec3_step_uni  -->  use glm_vec3_steps
 */
 #ifndef cglm_vec3_h
@@ -114,6 +114,7 @@
 #define glm_vec3_inv(v)               glm_vec3_negate(v)
 #define glm_vec3_inv_to(v, dest)      glm_vec3_negate_to(v, dest)
 #define glm_vec3_mulv(a, b, d)        glm_vec3_mul(a, b, d)
 #define glm_vec3_step_uni(edge, x, dest) glm_vec3_steps(edge, x, dest)
 #define GLM_VEC3_ONE_INIT   {1.0f, 1.0f, 1.0f}
 #define GLM_VEC3_ZERO_INIT  {0.0f, 0.0f, 0.0f}
@@ -1012,21 +1013,6 @@ glm_vec3_mixc(vec3 from, vec3 to, float t, vec3 dest) {
  glm_vec3_lerpc(from, to, t, dest);
 }
 /*!
 * @brief threshold function (unidimensional)
 *
 * @param[in]   edge    threshold
 * @param[in]   x       value to test against threshold
 * @param[out]  dest    destination
 */
 CGLM_INLINE
 void
 glm_vec3_step_uni(float edge, vec3 x, vec3 dest) {
  dest[0] = glm_step(edge, x[0]);
  dest[1] = glm_step(edge, x[1]);
  dest[2] = glm_step(edge, x[2]);
 }
 /*!
 * @brief threshold function
 *
@@ -1263,7 +1249,7 @@ glm_vec3_refract(vec3 v, vec3 n, float eta, vec3 dest) {
  ndi = glm_vec3_dot(n, v);
  eni = eta * ndi;
-  k   = 1.0f + eta * eta - eni * eni;
+  k   = 1.0f - eta * eta + eni * eni;
  if (k < 0.0f) {
    glm_vec3_zero(dest);
--- a/include/cglm/vec4-ext.h
+++ b/include/cglm/vec4-ext.h
@@ -26,6 +26,10 @@
   CGLM_INLINE void  glm_vec4_sign(vec4 v, vec4 dest);
   CGLM_INLINE void  glm_vec4_abs(vec4 v, vec4 dest);
   CGLM_INLINE void  glm_vec4_fract(vec4 v, vec4 dest);
   CGLM_INLINE void  glm_vec4_floor(vec4 v, vec4 dest);
   CGLM_INLINE void  glm_vec4_mods(vec4 v, float s, vec4 dest);
   CGLM_INLINE void  glm_vec4_steps(float edge, vec4 v, vec4 dest);
   CGLM_INLINE void  glm_vec4_stepr(vec4 edge, float v, vec4 dest);
   CGLM_INLINE float glm_vec4_hadd(vec4 v);
   CGLM_INLINE void  glm_vec4_sqrt(vec4 v, vec4 dest);
 */
@@ -48,7 +52,7 @@ glm_vec4_broadcast(float val, vec4 d) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
  glmm_store(d, wasm_f32x4_splat(val));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(d, _mm_set1_ps(val));
+  glmm_store(d, glmm_set1(val));
 #else
  d[0] = d[1] = d[2] = d[3] = val;
 #endif
@@ -66,7 +70,7 @@ glm_vec4_fill(vec4 v, float val) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
  glmm_store(v, wasm_f32x4_splat(val));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(v, _mm_set1_ps(val));
+  glmm_store(v, glmm_set1(val));
 #else
  v[0] = v[1] = v[2] = v[3] = val;
 #endif
@@ -186,7 +190,11 @@ glm_vec4_min(vec4 v) {
 CGLM_INLINE
 bool
 glm_vec4_isnan(vec4 v) {
 #ifdef CGLM_FAST_MATH
  /* fast-math build: the check is compiled out and always reports false */
  return false;
 #else
  int i;
  /* report true as soon as any of the four components is NaN */
  for (i = 0; i < 4; i++) {
    if (isnan(v[i]))
      return true;
  }
  return false;
 #endif
 }
 /*!
@@ -198,7 +206,11 @@ glm_vec4_isnan(vec4 v) {
 CGLM_INLINE
 bool
 glm_vec4_isinf(vec4 v) {
 #ifdef CGLM_FAST_MATH
  /* fast-math build: the check is compiled out and always reports false */
  return false;
 #else
  int i;
  /* report true as soon as any of the four components is infinite */
  for (i = 0; i < 4; i++) {
    if (isinf(v[i]))
      return true;
  }
  return false;
 #endif
 }
 /*!
@@ -280,6 +292,71 @@ glm_vec4_fract(vec4 v, vec4 dest) {
  dest[3] = fminf(v[3] - floorf(v[3]), 0.999999940395355224609375f);
 }
 /*!
  * @brief component-wise floor: dest[i] = floorf(v[i]) for i = 0..3
  *
  * @param[in]  v    source vector
  * @param[out] dest vector receiving the floored components
  */
 CGLM_INLINE
 void
 glm_vec4_floor(vec4 v, vec4 dest) {
  int i;
  /* each component is rounded down independently of the others */
  for (i = 0; i < 4; i++) {
    dest[i] = floorf(v[i]);
  }
 }
 /*!
  * @brief component-wise remainder against a scalar:
  *        dest[i] = fmodf(v[i], s) for i = 0..3
  *
  * @param[in]  v    source vector (dividends)
  * @param[in]  s    scalar divisor shared by all components
  * @param[out] dest vector receiving the remainders
  */
 CGLM_INLINE
 void
 glm_vec4_mods(vec4 v, float s, vec4 dest) {
  int i;
  /* remainder semantics are exactly those of C fmodf */
  for (i = 0; i < 4; i++) {
    dest[i] = fmodf(v[i], s);
  }
 }
 /*!
  * @brief step each vector item against one scalar edge
  *        per item: (x[i] < edge) ? 0.0 : 1.0
  *
  * @param[in]   edge    scalar threshold applied to every item
  * @param[in]   x       vector whose items are tested
  * @param[out]  dest    vector receiving the per-item step results
  */
 CGLM_INLINE
 void
 glm_vec4_steps(float edge, vec4 x, vec4 dest) {
  int i;
  /* same edge for every lane; glm_step does the scalar comparison */
  for (i = 0; i < 4; i++) {
    dest[i] = glm_step(edge, x[i]);
  }
 }
 /*!
  * @brief step one scalar value against a *vector* of edges
  *        per item: (x < edge[i]) ? 0.0 : 1.0
  *
  * @param[in]   edge    vector of thresholds, one per item
  * @param[in]   x       scalar value tested against each threshold
  * @param[out]  dest    vector receiving the per-item step results
  */
 CGLM_INLINE
 void
 glm_vec4_stepr(vec4 edge, float x, vec4 dest) {
  int i;
  /* the edge varies per lane while the tested value stays fixed */
  for (i = 0; i < 4; i++) {
    dest[i] = glm_step(edge[i], x);
  }
 }
 /*!
 * @brief vector reduction by summation
 * @warning could overflow
--- a/include/cglm/vec4.h
+++ b/include/cglm/vec4.h
@@ -57,7 +57,6 @@
   CGLM_INLINE void  glm_vec4_clamp(vec4 v, float minVal, float maxVal);
   CGLM_INLINE void  glm_vec4_lerp(vec4 from, vec4 to, float t, vec4 dest);
   CGLM_INLINE void  glm_vec4_lerpc(vec4 from, vec4 to, float t, vec4 dest);
   CGLM_INLINE void  glm_vec4_step_uni(float edge, vec4 x, vec4 dest);
   CGLM_INLINE void  glm_vec4_step(vec4 edge, vec4 x, vec4 dest);
   CGLM_INLINE void  glm_vec4_smoothstep_uni(float edge0, float edge1, vec4 x, vec4 dest);
   CGLM_INLINE void  glm_vec4_smoothstep(vec4 edge0, vec4 edge1, vec4 x, vec4 dest);
@@ -75,6 +74,7 @@
   glm_vec4_inv
   glm_vec4_inv_to
   glm_vec4_mulv
   glm_vec4_step_uni  --> use glm_vec4_steps
 */
 #ifndef cglm_vec4_h
@@ -92,6 +92,7 @@
 #define glm_vec4_inv(v)                glm_vec4_negate(v)
 #define glm_vec4_inv_to(v, dest)       glm_vec4_negate_to(v, dest)
 #define glm_vec4_mulv(a, b, d)         glm_vec4_mul(a, b, d)
 #define glm_vec4_step_uni(edge, x, dest) glm_vec4_steps(edge, x, dest)
 #define GLM_VEC4_ONE_INIT   {1.0f, 1.0f, 1.0f, 1.0f}
 #define GLM_VEC4_BLACK_INIT {0.0f, 0.0f, 0.0f, 1.0f}
@@ -215,7 +216,7 @@ glm_vec4_one(vec4 v) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
  glmm_store(v, wasm_f32x4_const_splat(1.0f));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(v, _mm_set1_ps(1.0f));
+  glmm_store(v, glmm_set1_rval(1.0f));
 #elif defined(CGLM_NEON_FP)
  vst1q_f32(v, vdupq_n_f32(1.0f));
 #else
@@ -367,7 +368,7 @@ glm_vec4_adds(vec4 v, float s, vec4 dest) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
  glmm_store(dest, wasm_f32x4_add(glmm_load(v), wasm_f32x4_splat(s)));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_add_ps(glmm_load(v), _mm_set1_ps(s)));
+  glmm_store(dest, _mm_add_ps(glmm_load(v), glmm_set1(s)));
 #elif defined(CGLM_NEON_FP)
  vst1q_f32(dest, vaddq_f32(vld1q_f32(v), vdupq_n_f32(s)));
 #else
@@ -415,7 +416,7 @@ glm_vec4_subs(vec4 v, float s, vec4 dest) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
  glmm_store(dest, wasm_f32x4_sub(glmm_load(v), wasm_f32x4_splat(s)));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_sub_ps(glmm_load(v), _mm_set1_ps(s)));
+  glmm_store(dest, _mm_sub_ps(glmm_load(v), glmm_set1(s)));
 #elif defined(CGLM_NEON_FP)
  vst1q_f32(dest, vsubq_f32(vld1q_f32(v), vdupq_n_f32(s)));
 #else
@@ -463,7 +464,7 @@ glm_vec4_scale(vec4 v, float s, vec4 dest) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
  glmm_store(dest, wasm_f32x4_mul(glmm_load(v), wasm_f32x4_splat(s)));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_mul_ps(glmm_load(v), _mm_set1_ps(s)));
+  glmm_store(dest, _mm_mul_ps(glmm_load(v), glmm_set1(s)));
 #elif defined(CGLM_NEON_FP)
  vst1q_f32(dest, vmulq_f32(vld1q_f32(v), vdupq_n_f32(s)));
 #else
@@ -525,10 +526,8 @@ glm_vec4_div(vec4 a, vec4 b, vec4 dest) {
 CGLM_INLINE
 void
 glm_vec4_divs(vec4 v, float s, vec4 dest) {
-#if defined(__wasm__) && defined(__wasm_simd128__)
+#if defined(CGLM_SIMD)
-  glmm_store(dest, wasm_f32x4_div(glmm_load(v), wasm_f32x4_splat(s)));
+  glmm_store(dest, glmm_div(glmm_load(v), glmm_set1(s)));
-#elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_div_ps(glmm_load(v), _mm_set1_ps(s)));
 #else
  glm_vec4_scale(v, 1.0f / s, dest);
 #endif
@@ -923,7 +922,7 @@ glm_vec4_normalize_to(vec4 v, vec4 dest) {
    return;
  }
-  glmm_store(dest, wasm_f32x4_div(x0, wasm_f32x4_sqrt(xdot)));
+  glmm_store(dest, glmm_div(x0, wasm_f32x4_sqrt(xdot)));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
  __m128 xdot, x0;
  float  dot;
@@ -937,7 +936,7 @@ glm_vec4_normalize_to(vec4 v, vec4 dest) {
    return;
  }
-  glmm_store(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot)));
+  glmm_store(dest, glmm_div(x0, _mm_sqrt_ps(xdot)));
 #else
  float norm;
@@ -1065,8 +1064,8 @@ glm_vec4_clamp(vec4 v, float minVal, float maxVal) {
  glmm_store(v, glmm_min(glmm_max(glmm_load(v), wasm_f32x4_splat(minVal)),
                         wasm_f32x4_splat(maxVal)));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(v, glmm_min(glmm_max(glmm_load(v), _mm_set1_ps(minVal)),
+  glmm_store(v, glmm_min(glmm_max(glmm_load(v), glmm_set1(minVal)),
-                         _mm_set1_ps(maxVal)));
+                         glmm_set1(maxVal)));
 #elif defined(CGLM_NEON_FP)
  glmm_store(v, glmm_min(glmm_max(vld1q_f32(v), vdupq_n_f32(minVal)),
                         vdupq_n_f32(maxVal)));
@@ -1148,22 +1147,6 @@ glm_vec4_mixc(vec4 from, vec4 to, float t, vec4 dest) {
  glm_vec4_lerpc(from, to, t, dest);
 }
 /*!
 * @brief threshold function (unidimensional)
 *
 * @param[in]   edge    threshold
 * @param[in]   x       value to test against threshold
 * @param[out]  dest    destination
 */
 CGLM_INLINE
 void
 glm_vec4_step_uni(float edge, vec4 x, vec4 dest) {
  dest[0] = glm_step(edge, x[0]);
  dest[1] = glm_step(edge, x[1]);
  dest[2] = glm_step(edge, x[2]);
  dest[3] = glm_step(edge, x[3]);
 }
 /*!
 * @brief threshold function
 *
@@ -1350,7 +1333,7 @@ glm_vec4_refract(vec4 v, vec4 n, float eta, vec4 dest) {
  ndi = glm_vec4_dot(n, v);
  eni = eta * ndi;
-  k   = 1.0f + eta * eta - eni * eni;
+  k   = 1.0f - eta * eta + eni * eni;
  if (k < 0.0f) {
    glm_vec4_zero(dest);
--- a/include/cglm/version.h
+++ b/include/cglm/version.h
@@ -10,6 +10,6 @@
 #define CGLM_VERSION_MAJOR 0
 #define CGLM_VERSION_MINOR 9
-#define CGLM_VERSION_PATCH 4
+#define CGLM_VERSION_PATCH 6
 #endif /* cglm_version_h */
--- a/meson.build
+++ b/meson.build
@@ -1,5 +1,5 @@
 project('cglm', 'c',
-    version : '0.9.4',
+    version : '0.9.6',
    license : 'mit',
    default_options : [
        'c_std=c11',
@@ -56,8 +56,10 @@ cglm_src = files(
    'src/mat4x2.c',
    'src/mat4x3.c',
    'src/plane.c',
    'src/noise.c',
    'src/frustum.c',
    'src/box.c',
    'src/aabb2d.c',
    'src/project.c',
    'src/sphere.c',
    'src/ease.c',
--- a/src/euler.c
+++ b/src/euler.c
@@ -97,4 +97,3 @@ void
 glmc_euler_zyx_quat(vec3 angles, versor dest) {
  glm_euler_zyx_quat(angles, dest);
 }
--- a/src/ivec2.c
+++ b/src/ivec2.c
@@ -247,4 +247,3 @@ void
 glmc_ivec2_abs(ivec2 v, ivec2 dest) {
  glm_ivec2_abs(v, dest);
 }
--- a/src/ivec3.c
+++ b/src/ivec3.c
@@ -253,4 +253,3 @@ void
 glmc_ivec3_abs(ivec3 v, ivec3 dest) {
  glm_ivec3_abs(v, dest);
 }
--- a/src/ivec4.c
+++ b/src/ivec4.c
@@ -199,4 +199,3 @@ void
 glmc_ivec4_abs(ivec4 v, ivec4 dest) {
  glm_ivec4_abs(v, dest);
 }
--- a/src/mat3x4.c
+++ b/src/mat3x4.c
@@ -28,13 +28,13 @@ glmc_mat3x4_make(const float * __restrict src, mat3x4 dest) {
 CGLM_EXPORT
 void
-glmc_mat3x4_mul(mat3x4 m1, mat4x3 m2, mat3 dest) {
+glmc_mat3x4_mul(mat3x4 m1, mat4x3 m2, mat4 dest) {
  glm_mat3x4_mul(m1, m2, dest);
 }
 CGLM_EXPORT
 void
-glmc_mat3x4_mulv(mat3x4 m, vec4 v, vec3 dest) {
+glmc_mat3x4_mulv(mat3x4 m, vec3 v, vec4 dest) {
  glm_mat3x4_mulv(m, v, dest);
 }
--- a/src/mat4x2.c
+++ b/src/mat4x2.c
@@ -28,13 +28,13 @@ glmc_mat4x2_make(const float * __restrict src, mat4x2 dest) {
 CGLM_EXPORT
 void
-glmc_mat4x2_mul(mat4x2 m1, mat2x4 m2, mat4 dest) {
+glmc_mat4x2_mul(mat4x2 m1, mat2x4 m2, mat2 dest) {
  glm_mat4x2_mul(m1, m2, dest);
 }
 CGLM_EXPORT
 void
-glmc_mat4x2_mulv(mat4x2 m, vec2 v, vec4 dest) {
+glmc_mat4x2_mulv(mat4x2 m, vec4 v, vec2 dest) {
  glm_mat4x2_mulv(m, v, dest);
 }
--- a/src/mat4x3.c
+++ b/src/mat4x3.c
@@ -28,13 +28,13 @@ glmc_mat4x3_make(const float * __restrict src, mat4x3 dest) {
 CGLM_EXPORT
 void
-glmc_mat4x3_mul(mat4x3 m1, mat3x4 m2, mat4 dest) {
+glmc_mat4x3_mul(mat4x3 m1, mat3x4 m2, mat3 dest) {
  glm_mat4x3_mul(m1, m2, dest);
 }
 CGLM_EXPORT
 void
-glmc_mat4x3_mulv(mat4x3 m, vec3 v, vec4 dest) {
+glmc_mat4x3_mulv(mat4x3 m, vec4 v, vec3 dest) {
  glm_mat4x3_mulv(m, v, dest);
 }
--- a/src/noise.c
+++ b/src/noise.c
@@ -0,0 +1,27 @@
 /*
 * Copyright (c), Recep Aslantas.
 *
 * MIT License (MIT), http://opensource.org/licenses/MIT
 * Full license can be found in the LICENSE file
 */
 #include "../include/cglm/cglm.h"
 #include "../include/cglm/call.h"
 /* Exported wrapper: forwards p to glm_perlin_vec4 and returns its result. */
 CGLM_EXPORT
 float
 glmc_perlin_vec4(vec4 p) {
  return glm_perlin_vec4(p);
 }
 /* Exported wrapper: forwards p to glm_perlin_vec3 and returns its result. */
 CGLM_EXPORT
 float
 glmc_perlin_vec3(vec3 p) {
  return glm_perlin_vec3(p);
 }
 /* Exported wrapper: forwards p to glm_perlin_vec2 and returns its result. */
 CGLM_EXPORT
 float
 glmc_perlin_vec2(vec2 p) {
  return glm_perlin_vec2(p);
 }
--- a/src/quat.c
+++ b/src/quat.c
@@ -188,6 +188,12 @@ glmc_quat_slerp(versor from, versor to, float t, versor dest) {
  glm_quat_slerp(from, to, t, dest);
 }
 /* Exported wrapper: passes from, to, t and dest unchanged to
  * glm_quat_slerp_longest; the result is written through dest. */
 CGLM_EXPORT
 void
 glmc_quat_slerp_longest(versor from, versor to, float t, versor dest) {
  glm_quat_slerp_longest(from, to, t, dest);
 }
 CGLM_EXPORT
 void
 glmc_quat_look(vec3 eye, versor ori, mat4 dest) {
--- a/src/vec2.c
+++ b/src/vec2.c
@@ -273,6 +273,48 @@ glmc_vec2_abs(vec2 v, vec2 dest) {
  glm_vec2_abs(v, dest);
 }
 /* Exported wrapper: forwards v and dest to glm_vec2_fract. */
 CGLM_EXPORT
 void
 glmc_vec2_fract(vec2 v, vec2 dest) {
  glm_vec2_fract(v, dest);
 }
 /* Exported wrapper: forwards v and dest to glm_vec2_floor. */
 CGLM_EXPORT
 void
 glmc_vec2_floor(vec2 v, vec2 dest) {
  glm_vec2_floor(v, dest);
 }
 /* Exported wrapper: forwards v, the scalar s and dest to glm_vec2_mods. */
 CGLM_EXPORT
 void
 glmc_vec2_mods(vec2 v, float s, vec2 dest) {
  glm_vec2_mods(v, s, dest);
 }
 /* Exported wrapper: forwards the edge vector, v and dest to glm_vec2_step. */
 CGLM_EXPORT
 void
 glmc_vec2_step(vec2 edge, vec2 v, vec2 dest) {
  glm_vec2_step(edge, v, dest);
 }
 /* Exported wrapper: forwards the scalar edge, v and dest to glm_vec2_steps. */
 CGLM_EXPORT
 void
 glmc_vec2_steps(float edge, vec2 v, vec2 dest) {
  glm_vec2_steps(edge, v, dest);
 }
 /* Exported wrapper: forwards the edge vector, the scalar v and dest to
  * glm_vec2_stepr. */
 CGLM_EXPORT
 void
 glmc_vec2_stepr(vec2 edge, float v, vec2 dest) {
  glm_vec2_stepr(edge, v, dest);
 }
 /* Exported wrapper: forwards v, the swizzle mask and dest to
  * glm_vec2_swizzle. */
 CGLM_EXPORT
 void
 glmc_vec2_swizzle(vec2 v, int mask, vec2 dest) {
  glm_vec2_swizzle(v, mask, dest);
 }
 CGLM_EXPORT
 void
 glmc_vec2_lerp(vec2 from, vec2 to, float t, vec2 dest) {
--- a/src/vec3.c
+++ b/src/vec3.c
@@ -308,12 +308,6 @@ glmc_vec3_lerpc(vec3 from, vec3 to, float t, vec3 dest) {
  glm_vec3_lerpc(from, to, t, dest);
 }
 CGLM_EXPORT
 void
 glmc_vec3_step_uni(float edge, vec3 x, vec3 dest) {
  glm_vec3_step_uni(edge, x, dest);
 }
 CGLM_EXPORT
 void
 glmc_vec3_step(vec3 edge, vec3 x, vec3 dest) {
@@ -344,6 +338,12 @@ glmc_vec3_smoothinterpc(vec3 from, vec3 to, float t, vec3 dest) {
  glm_vec3_smoothinterpc(from, to, t, dest);
 }
 /* Exported wrapper: forwards v, the swizzle mask and dest to
  * glm_vec3_swizzle. */
 CGLM_EXPORT
 void
 glmc_vec3_swizzle(vec3 v, int mask, vec3 dest) {
  glm_vec3_swizzle(v, mask, dest);
 }
 /* ext */
 CGLM_EXPORT
@@ -442,6 +442,30 @@ glmc_vec3_fract(vec3 v, vec3 dest) {
  glm_vec3_fract(v, dest);
 }
 /* Exported wrapper: forwards v and dest to glm_vec3_floor. */
 CGLM_EXPORT
 void
 glmc_vec3_floor(vec3 v, vec3 dest) {
  glm_vec3_floor(v, dest);
 }
 /* Exported wrapper: forwards v, the scalar s and dest to glm_vec3_mods. */
 CGLM_EXPORT
 void
 glmc_vec3_mods(vec3 v, float s, vec3 dest) {
  glm_vec3_mods(v, s, dest);
 }
 /* Exported wrapper: forwards the scalar edge, v and dest to glm_vec3_steps. */
 CGLM_EXPORT
 void
 glmc_vec3_steps(float edge, vec3 v, vec3 dest) {
  glm_vec3_steps(edge, v, dest);
 }
 /* Exported wrapper: forwards the edge vector, the scalar v and dest to
  * glm_vec3_stepr. */
 CGLM_EXPORT
 void
 glmc_vec3_stepr(vec3 edge, float v, vec3 dest) {
  glm_vec3_stepr(edge, v, dest);
 }
 CGLM_EXPORT
 float
 glmc_vec3_hadd(vec3 v) {
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Recep Aslantas	144d1e7c29	build: add missing call/aabb2d.h	2025-02-13 22:35:16 +03:00
Recep Aslantas	643700c05e	now working on v0.9.6	2025-02-13 12:25:10 +03:00
Recep Aslantas	fb4eac2ee0	Merge pull request #413 from recp/optimize-inv WIP: More Optimizations and SIMD fixes for MSVC & ARM	2025-02-12 23:08:07 +03:00
Recep Aslantas	4d0a0a7025	Update wasm.h	2025-02-09 15:30:49 +03:00
Recep Aslantas	a88d32c776	Merge branch 'master' into optimize-inv	2025-02-09 15:13:28 +03:00
Recep Aslantas	3bfd31af99	build, win: add missing files	2025-02-09 15:01:55 +03:00
Recep Aslantas	3ee809b9b3	Merge pull request #444 from wethu/wethu-docs-minor-fix-example-numbering Fix numbering - opengl matrix uniform examples (docs)	2025-01-30 23:10:03 +03:00
Ellis	8cdf496baf	Fix numbering	2025-01-30 21:39:17 +10:00
Recep Aslantas	d42bff7773	Update ci.yml	2025-01-25 11:53:27 +03:00
Recep Aslantas	365d43bef4	Update ci.yml	2025-01-25 11:47:32 +03:00
Recep Aslantas	5770fdb336	ci: add arm builds	2025-01-25 11:08:06 +03:00
Recep Aslantas	042d84f058	ci: add arm builds	2025-01-25 10:58:48 +03:00
Recep Aslantas	70a1a946ee	arm: use glmm_div where possible to fix ARMv7 <-> ARM64	2025-01-24 22:21:11 +03:00
Recep Aslantas	8ca1f19aa2	arm: use glmm_div where possible to fix ARMv7 <-> ARM64	2025-01-24 22:20:57 +03:00
Recep Aslantas	adb3ac18c0	Merge branch 'master' into optimize-inv	2025-01-24 16:04:49 +03:00
Recep Aslantas	441f2657ab	suppress param-name warns	2025-01-24 11:20:49 +03:00
Recep Aslantas	9b660e8bd0	Update cmake-wasm.yml	2025-01-23 14:10:04 +03:00
Recep Aslantas	40a2aca7ec	Update cmake-wasm.yml	2025-01-23 13:55:14 +03:00
Recep Aslantas	a48fa8be65	Update cmake-wasm.yml	2025-01-23 13:54:37 +03:00
Recep Aslantas	2f36faa70a	Update cmake-wasm.yml	2025-01-23 13:45:19 +03:00
Recep Aslantas	36f59b7b6c	Update cmake-wasm.yml	2025-01-23 13:39:30 +03:00
Recep Aslantas	f4993318d2	Update ci.yml	2025-01-23 13:04:37 +03:00
Recep Aslantas	5672370e31	Update ci.yml	2025-01-23 12:49:52 +03:00
Recep Aslantas	ae1b9a3982	Update ci.yml	2025-01-23 12:44:04 +03:00
Recep Aslantas	a18d9c28e7	Update ci.yml	2025-01-23 12:41:08 +03:00
Recep Aslantas	fc91f95bb2	Merge branch 'master' of https://github.com/recp/cglm	2025-01-23 12:24:49 +03:00
Recep Aslantas	39052494a2	Revert "Update ci.yml" This reverts commit `488f782704`.	2025-01-23 12:22:27 +03:00
Recep Aslantas	488f782704	Update ci.yml Update ci.yml Update ci.yml	2025-01-23 12:22:09 +03:00
Recep Aslantas	814c354337	Update ci.yml	2025-01-23 12:18:12 +03:00
Recep Aslantas	1ad56f0e94	Update ci.yml	2025-01-23 12:16:07 +03:00
Recep Aslantas	cc54ad3f92	Update ci.yml	2025-01-23 12:15:22 +03:00
Recep Aslantas	3971ef8ef1	Update ci.yml	2025-01-23 01:09:20 +03:00
Recep Aslantas	b4a3ed32d9	Update ci.yml	2025-01-23 01:07:32 +03:00
Recep Aslantas	eb37a28ff5	Update ci.yml	2025-01-23 00:58:02 +03:00
Recep Aslantas	925f9c1d1a	Update ci.yml	2025-01-23 00:50:36 +03:00
Recep Aslantas	9f74fd9597	Update ci.yml	2025-01-23 00:44:12 +03:00
Recep Aslantas	99937807cf	Update ci.yml	2025-01-23 00:19:52 +03:00
Recep Aslantas	cc79b440e7	Update ci.yml	2025-01-23 00:13:59 +03:00
Recep Aslantas	e161c4d0a7	Update ci.yml	2025-01-23 00:06:00 +03:00
Recep Aslantas	e8c791e91e	Merge pull request #441 from MarcinKonowalczyk/perlin Implement Perlin noise	2025-01-22 23:23:03 +03:00
Recep Aslantas	241b751d8c	build, autotools: add missing file	2025-01-22 23:17:23 +03:00
Marcin	dfc9969f85	vectorise fades fix fade for vec2	2025-01-22 16:49:27 +00:00
Marcin	2b4aef2a29	glm__noiseDetail_fade_vec2 arg restrict	2025-01-22 16:49:26 +00:00
Marcin	fd0131734f	fix granNorm arg order	2025-01-22 16:49:26 +00:00
Marcin	9cfa40f423	glm__noiseDetail_taylorInvSqrt	2025-01-22 16:49:26 +00:00
Marcin	b79347eb13	vdivq_f32	2025-01-22 16:49:26 +00:00
Marcin	450d747867	1/7 patch	2025-01-22 14:40:18 +00:00
Recep Aslantas	26e174db46	Merge pull request #442 from duarm/master Rename struct/aabb2d.h functions to match aabb2d.h, add tests	2025-01-21 10:00:12 +03:00
Recep Aslantas	e605c1d585	Update aabb2d.h	2025-01-21 09:59:48 +03:00
duarm	f815918a74	rename struct/aabb2d.h functions to match aabb2d.h, add tests	2025-01-20 13:29:21 -03:00
Marcin	948642ff33	_glm_ -> glm__ for internal macros	2025-01-20 14:09:05 +00:00
Marcin	23c0f5f660	couple more // comments	2025-01-18 20:11:00 +00:00
Marcin	d3ad1645fc	purged // comments in noise.h	2025-01-18 20:11:00 +00:00
Marcin	fa7bc07ae9	changed noiseDetail to #defines _glm_noiseDetail_mod289 _glm_noiseDetail_permute _glm_noiseDetail_fade_vec4 _glm_noiseDetail_fade_vec3 _glm_noiseDetail_fade_vec2 _glm_noiseDetail_taylorInvSqrt _glm_noiseDetail_gradNorm_vec4 _glm_noiseDetail_gradNorm_vec3 _glm_noiseDetail_gradNorm_vec2 _glm_noiseDetail_i2gxyzw _glm_noiseDetail_i2gxyz _glm_noiseDetail_i2gxy	2025-01-18 20:11:00 +00:00
Marcin	f32f18a373	sets -> fill	2025-01-18 20:10:59 +00:00
Marcin	082f1878dd	glms_vec2_mods doc	2025-01-18 20:10:59 +00:00
Marcin	4b0e7dadd6	vec2 swizzle	2025-01-18 20:10:59 +00:00
Marcin	b45bf1d571	switch deprecation in cglm/call.h to #define	2025-01-18 20:10:59 +00:00
Marcin	8493a6c0d3	fix cglm_vec3/4_swizzle	2025-01-18 20:10:59 +00:00
Marcin	f50736aee7	vec2_step test	2025-01-18 20:10:59 +00:00
Marcin	6bc980f3d9	add missing vec2 step	2025-01-18 20:10:59 +00:00
Marcin	5cffcf74c4	missing glmc_vec2_stepr	2025-01-18 20:10:58 +00:00
Marcin	fb469c779d	missing doc	2025-01-18 20:10:58 +00:00
Marcin	8e69157052	missing vec4 steps and stepr	2025-01-18 20:10:58 +00:00
Marcin	6c0e3e9460	deprecate step_uni in favour of steps This seems to be the newer naming system	2025-01-18 20:10:58 +00:00
Marcin	4ca0c536af	steps and stepr test	2025-01-18 20:10:58 +00:00
Marcin	9a1206f3f1	steps and stepr boilerplate	2025-01-18 20:10:58 +00:00
Marcin	c2ebef3867	change steps -> stepr and move to ext	2025-01-18 20:10:57 +00:00
Marcin	0e0eff71ce	correct vec4_mods doc	2025-01-18 20:10:57 +00:00
Marcin	6620adcc16	fix invalid types in vec2_frac/floor	2025-01-18 20:10:57 +00:00
Marcin	a986a4d741	add missing floor tests	2025-01-18 20:10:57 +00:00
Marcin	32e7d5cceb	mods test	2025-01-18 20:10:57 +00:00
Marcin	e14c730d5c	mods boilerplate	2025-01-18 20:10:57 +00:00
Marcin	fbf0014c82	missing vec4_floor doc	2025-01-18 20:10:56 +00:00
Marcin	0483362f5c	move mods to ext	2025-01-18 20:10:56 +00:00
Marcin	35af0c04fe	remove extra spaces from vec3.h and vec4.h	2025-01-18 13:19:29 +00:00
Marcin	66d51e5771	double spaces in other files	2025-01-18 13:19:29 +00:00
Marcin	5a3a16d9ad	double spaces in noise.h	2025-01-18 13:19:29 +00:00
Marcin	651ad8ca32	vec2-ext docs	2025-01-18 12:57:33 +00:00
Marcin	b1192c8638	vec3-ext docs	2025-01-18 12:57:33 +00:00
Marcin	eaf2d7314e	vec4-ext docs	2025-01-18 12:57:33 +00:00
Marcin	200b0875ba	vec2_floor test	2025-01-18 12:57:32 +00:00
Marcin	9a25fab6f0	vec2_floor boilerplate	2025-01-18 12:57:32 +00:00
Marcin	bfaf413a5d	vec2_floor impl	2025-01-18 12:57:32 +00:00
Marcin	9594d0cc86	add missing glms_vec2_abs	2025-01-18 12:57:32 +00:00
Marcin	2890472a0b	add missing test entry for vec2_abs	2025-01-18 12:57:32 +00:00
Marcin	c48befca37	vec2_fract test	2025-01-18 12:57:32 +00:00
Marcin	3c9eecd0be	vec2_fract boilerplate	2025-01-18 12:57:32 +00:00
Marcin	a9fee1b4d7	vec2_fract	2025-01-18 12:57:31 +00:00
Marcin	52753672bb	vec4_fract test	2025-01-18 12:57:31 +00:00
Marcin	68215526cf	vec4_floor boilerplate	2025-01-18 12:57:31 +00:00
Marcin	c27ef7e93b	vec3_floor struct doc	2025-01-18 12:57:31 +00:00
Marcin	2ba561cc92	move vec4_floor to ext	2025-01-18 12:57:31 +00:00
Marcin	e66f2f3df4	vec3_floor test	2025-01-18 12:57:31 +00:00
Marcin	967c9e0a09	vec3_floor boilerplate	2025-01-18 12:57:31 +00:00
Marcin	1637d2cef1	move vec3_floor to ext	2025-01-18 12:57:30 +00:00
Marcin	2acdd1e4d0	fix _glm_noiseDetail_gradNorm_vec2	2025-01-18 12:57:30 +00:00
Marcin	606ecbceaa	vac_muls -> vec_scale	2025-01-18 12:57:30 +00:00
Marcin	f1a72241b1	docs	2025-01-17 20:34:47 +00:00
Marcin	9085ed020a	glm_perlin_vec2 test	2025-01-17 20:22:00 +00:00
Marcin	1377a94a17	glm_perlin_vec2 boilerplate	2025-01-17 20:21:59 +00:00
Marcin	a98c270eee	glm_perlin_vec2 impl	2025-01-17 20:21:59 +00:00
Marcin	83b67baa23	glm_perlin_vec3 docs	2025-01-17 16:40:07 +00:00
Marcin	ae82a493f7	note	2025-01-17 16:38:19 +00:00
Marcin	98ab6fcbe0	glm_perlin_vec3 test	2025-01-17 16:38:19 +00:00
Marcin	f0529646b2	glm_perlin_vec3 boilerplate	2025-01-17 16:38:19 +00:00
Marcin	585a999d79	docs	2025-01-17 16:38:19 +00:00
Marcin	5f241a2daf	glm_perlin_vec3 impl	2025-01-17 16:38:19 +00:00
Marcin	fbdc46b205	more doc	2025-01-17 16:38:18 +00:00
Marcin	ae1bee7481	doc	2025-01-17 16:38:18 +00:00
Marcin	a0d8803f76	perlin.h -> noise.h	2025-01-15 14:05:08 +00:00
Marcin	43c9f84c8c	test_perlin	2025-01-15 13:35:10 +00:00
Marcin	71a0dc6c35	minor reshuffle	2025-01-15 13:35:10 +00:00
Marcin	5d34a04496	refactor gNorm	2025-01-15 13:35:10 +00:00
Marcin	b54dff0124	minor comment	2025-01-15 13:35:10 +00:00
Marcin	fda5406ac0	unnecessary zero init	2025-01-15 13:35:10 +00:00
Marcin	f19dc13e39	missed bracket	2025-01-15 13:35:09 +00:00
Marcin	f3f75a2727	impl but buggy	2025-01-15 13:35:09 +00:00
Marcin	c3e16a53f4	work in progress	2025-01-15 13:35:09 +00:00
Marcin	8a2fd9cda9	docs boilerplate	2025-01-14 17:34:51 +00:00
Marcin	98f53c750d	test boilerplate	2025-01-14 17:32:56 +00:00
Marcin	3e52d90ecb	boilerplate	2025-01-14 17:32:46 +00:00
Marcin	a4cd7e008d	initial impl	2025-01-14 17:31:35 +00:00
Recep Aslantas	5861c37a93	Merge pull request #437 from master30f/readme-restructure Restructure and rephrase README	2025-01-08 11:28:30 +03:00
zorby	aae82c1d4a	README: Add chapter about alignment	2025-01-04 11:51:43 +01:00
zorby	7e51ed88e8	README: Add Features	2025-01-04 11:23:37 +01:00
zorby	32a05a579b	README: Move Struct API chapter	2025-01-04 11:22:52 +01:00
Recep Aslantas	9b67866154	Merge pull request #438 from master30f/file-cleanup Remove .vscode and .gitmodules	2025-01-03 23:42:41 +03:00
zorby	bab7d7bb12	Add .vscode to .gitignore	2025-01-03 17:09:44 +01:00
zorby	5a207d84bb	Remove .gitmodules See issue #436.	2025-01-03 10:43:47 +01:00
zorby	7783acae10	Remove .vscode See issue #436.	2025-01-03 10:40:17 +01:00
zorby	9d079d3cc3	Restructure README	2024-12-30 13:50:11 +01:00
Recep Aslantas	054b2df004	vscode: disable formatOnSave	2024-12-03 14:27:06 +03:00
Recep Aslantas	93a2926a13	Merge pull request #430 from nitrix/feat/cmake-3.13 Bump CMake minimum version to 3.13	2024-09-10 14:06:45 +03:00
Alex Belanger	5484259328	Undid the formatting.	2024-09-09 10:35:24 -04:00
Alex Belanger	e930737807	Bump CMake minimum version to 3.13	2024-09-09 10:06:09 -04:00
Recep Aslantas	c12f318fab	supress fast-math warns	2024-08-27 13:40:34 +03:00
Recep Aslantas	b0f35203f9	suppress warns	2024-08-27 12:32:58 +03:00
Recep Aslantas	ad009d4e49	doc: improve briefs	2024-08-27 12:29:43 +03:00
Recep Aslantas	2f619cdd6f	newlines	2024-08-27 12:22:42 +03:00
Recep Aslantas	33e78ca2ad	suppress warns about va-args	2024-08-27 12:13:28 +03:00
Recep Aslantas	f82d570dec	take -pedantic into account to validate CGLM_USE_ANONYMOUS_STRUCT	2024-08-27 11:58:34 +03:00
Recep Aslantas	48839a38a1	fix refract	2024-07-15 12:41:23 +03:00
Recep Aslantas	d491108386	Merge pull request #423 from waywardmonkeys/reduce-typo-count Reduce typo count.	2024-07-15 11:29:42 +03:00
Bruce Mitchener	068f6951b3	Reduce typo count.	2024-07-14 09:10:19 +07:00
Recep Aslantas	ed731f991d	Merge pull request #421 from myfreeer/patch-1 mat4: wasm simd128 for glm_mat4_inv	2024-07-03 17:06:53 +03:00
myfreeer	07dc9520a4	mat4: wasm simd128 for glm_mat4_inv The function `glm_mat4_inv_wasm` has been implemented, but not used in `glm_mat4_inv`. This commit adds a conditional macro to add the case of wasm and simd128 case for calling `glm_mat4_inv_wasm`.	2024-07-03 20:04:50 +08:00
Recep Aslantas	be0defb7ac	Merge pull request #420 from not-kaz/master Fixed 'missing-prototypes' warnings for some vec2 funcs.	2024-06-11 14:52:02 +03:00
Kaz	5b2c37f73b	Fixed 'missing-prototypes' warnings for some vec2 funcs. Using gcc and missing-prototypes flag shows warnings for: glmc_vec2_fill, glmc_vec2_eq and glmc_vec2_eqv. Seems they were not added to call/vec2.h for some reason. This should fix the warnings.	2024-06-09 11:06:18 +02:00
Recep Aslantas	a93a9ef9a2	Merge pull request #416 from telephone001/quat_slerp_longest added quat_slerp_longest	2024-05-13 17:26:49 +03:00
Recep Aslantas	c5b2afc1c4	Update quat.h	2024-05-13 17:20:28 +03:00
John Choi	cd5ed1f4c4	added quat_slerp_longest	2024-05-10 22:32:25 -05:00
Recep Aslantas	eb3a51e591	win: suppress C4996; use snprintf() instead of sprintf()	2024-04-22 21:33:31 +03:00
Recep Aslantas	44cd0ae4fd	avx: optimize avx mat4 scale and mat4 mul	2024-04-13 00:33:57 +03:00
Recep Aslantas	d75467f93f	avx: implement transpose with AVX	2024-04-13 00:12:14 +03:00
Recep Aslantas	45c1beff51	simd: fix glmm_set1, glmm_splat	2024-04-12 21:53:20 +03:00
Recep Aslantas	14c567d9d9	sse: drop unused macros: glmm_shuff1x, glmm_shuff2	2024-04-11 21:57:46 +03:00
Recep Aslantas	480e1de048	sse: make use of int domain as default behavior if possible ( compiler may ignore it ) also use AVX's `_mm_permute_ps`for shuffling single vector	2024-04-11 21:57:16 +03:00
Recep Aslantas	de66f0a67f	glmm, avx: optimize splat macros	2024-04-10 23:49:18 +03:00
Recep Aslantas	68bdec4510	simd: use glmm_set1() to optimize broadcasting single float	2024-04-10 22:52:53 +03:00
Recep Aslantas	62c0448e25	simd, msvc: ensure required definitions are exist on msvc	2024-04-07 22:48:11 +03:00
Recep Aslantas	4f00ce0e52	sse: reduce some instructions in mat4 inv	2024-04-07 22:33:37 +03:00
Recep Aslantas	a7845ffc44	msvc, simd: fix simd headers for _M_ARM64EC	2024-04-07 00:54:29 +03:00
Recep Aslantas	bd941ed7fb	arm, neon: fix neon support on GCC ARM	2024-04-06 14:23:36 +03:00
Recep Aslantas	87350f809b	msvc bug: dont align types due to "ARM32 = C2719: formal parameter with requested alignment of 16 won't be aligned." on ARM32/MSVC until a good solution.	2024-04-06 14:11:46 +03:00
Recep Aslantas	f50a7a7d00	arm, neon: improve glm_mat4_inv_neon	2024-04-06 14:10:13 +03:00
Recep Aslantas	b3308af146	arm: fix glmm_vhadd on ARM32	2024-04-06 14:09:52 +03:00
Recep Aslantas	a94f839d6d	Merge branch 'master' into optimize-inv	2024-04-04 00:31:08 +03:00
Recep Aslantas	0ff0e8948f	Update mat4.h	2024-04-03 00:05:42 +03:00
Recep Aslantas	5b772d0eb4	neon: mat4_inv, reduce 1mul for two extra 2xor	2024-04-03 00:03:55 +03:00
Recep Aslantas	c528ca1095	neon: mat4_inv remastered	2024-04-02 13:21:49 +03:00
Recep Aslantas	f0e09776d7	arm, neon: optimize glmm_vhadd and add glmm_vdot	2024-04-02 02:36:16 +03:00
Recep Aslantas	da4224ba32	now working on v0.9.5	2024-04-02 00:52:54 +03:00
Recep Aslantas	1796cc5ce2	Merge pull request #412 from recp/sse_only separate SSE and SSE2	2024-04-01 17:52:06 +03:00
Recep Aslantas	9ad7dd3fbc	Merge pull request #409 from EasyIP2023/feature/expand-content-width docs: expand wy-nav-content width to edge of screen	2024-03-31 23:44:40 +03:00
Recep Aslantas	28142b5912	Merge pull request #411 from EasyIP2023/bugfix/mat4x3-multiplication mat4x3: fix multiplication functions	2024-03-31 23:44:00 +03:00
Recep Aslantas	f07d75c680	Merge pull request #410 from EasyIP2023/bugfix/mat4x2-multiplication mat4x2: fix multiplication functions	2024-03-31 23:43:11 +03:00
Recep Aslantas	17d8b83a38	Merge pull request #408 from EasyIP2023/bugfix/mat3x4-multiplication mat3x4: fix multiplication functions	2024-03-31 23:41:40 +03:00
Vincent Davis Jr	013ac5dd07	docs: mat4x3 account for latest mulitplication changes This also includes tables to explain how mat4x3, column vectors, and row vectors are represented. Also includes how resulting matrix or vector is formed. Signed-off-by: Vincent Davis Jr <vince@underview.tech>	2024-03-31 14:43:31 -04:00
Vincent Davis Jr	a0e3d3766f	mat4x3: fix multiplication functions Signed-off-by: Vincent Davis Jr <vince@underview.tech>	2024-03-31 14:43:30 -04:00
Vincent Davis Jr	fc7f0e13fd	docs: mat3x4 account for latest mulitplication changes This also includes tables to explain how mat3x4, column vectors, and row vectors are represented. Also includes how resulting matrix or vector is formed. Signed-off-by: Vincent Davis Jr <vince@underview.tech>	2024-03-31 13:43:41 -04:00
Vincent Davis Jr	1340b5d512	mat3x4: fix multiplication functions Signed-off-by: Vincent Davis Jr <vince@underview.tech>	2024-03-31 13:43:39 -04:00
Vincent Davis Jr	85165dd3e3	docs: mat4x2 account for latest mulitplication changes This also includes tables to explain how mat4x2, column vectors, and row vectors are represented. Also includes how resulting matrix or vector is formed. Signed-off-by: Vincent Davis Jr <vince@underview.tech>	2024-03-31 13:40:43 -04:00
Vincent Davis Jr	3445f93fbc	mat4x2: fix multiplication functions Signed-off-by: Vincent Davis Jr <vince@underview.tech>	2024-03-31 13:40:41 -04:00
Vincent Davis Jr	d2642eb206	docs: expand wy-nav-content width to edge of screen RTD theme's default is 800px as max width for the content, but we have tables with tons of columns, which need the full width of the view-port. Comment from yocto project theme_overrides.css Signed-off-by: Vincent Davis Jr <vince@underview.tech>	2024-03-31 12:22:13 -04:00
Recep Aslantas	8366e51b47	optimize mat4 scalar inv	2024-03-29 22:13:23 +03:00
Recep Aslantas	30b4ea80a9	optimize mat3 scalar inv	2024-03-29 20:59:54 +03:00