Added sse2neon to fix compile errors on macOS for custom hashcat codes on phc-winner-argon2

2025-07-22 22:48:47 +00:00 · 2025-05-30 21:57:05 +02:00 · 2025-05-30 21:57:05 +02:00 · 70825ebac4
commit 70825ebac4
parent dc50bdbc72
29 changed files with 24422 additions and 0 deletions
--- a/deps/phc-winner-argon2-20190702/_hashcat/blake2/blamka-round-opt.h
+++ b/deps/phc-winner-argon2-20190702/_hashcat/blake2/blamka-round-opt.h
@ -20,7 +20,12 @@

 #include "blake2-impl.h"

+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
 #include <emmintrin.h>
+#elif defined(__aarch64__)
+#include <sse2neon.h>
+#endif
+
 #if defined(__SSSE3__)
 #include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
 #endif
--- a/deps/sse2neon/.ci/check-format.sh
+++ b/deps/sse2neon/.ci/check-format.sh
@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+. .ci/common.sh
+
+set -x
+
+for file in ${SOURCES};
+do
+    clang-format-18 ${file} > expected-format
+    diff -u -p --label="${file}" --label="expected coding style" ${file} expected-format
+done
+exit $(clang-format-18 --output-replacements-xml ${SOURCES} | egrep -c "</replacement>")
--- a/deps/sse2neon/.ci/common.sh
+++ b/deps/sse2neon/.ci/common.sh
@ -0,0 +1,18 @@
+# Shared settings for the CI helper scripts; this file is sourced, not executed.
+
+# Version string embedded in the cross-toolchain archive and directory names,
+# and the mirror the archives are downloaded from.
+GCC_REL=14.2.rel1
+ARM_MIRROR=https://github.com/DLTcollab/toolchain-arm/raw/main
+
+# Every .cpp/.h file below the repository root, excluding anything inside the
+# unpacked cross toolchains.
+SOURCES=$(find "$(git rev-parse --show-toplevel)" | grep -E '\.(cpp|h)$' | grep -E -v "arm-gnu-toolchain-${GCC_REL}-x86_64-aarch64-none-linux-gnu|arm-gnu-toolchain-${GCC_REL}-x86_64-arm-none-linux-gnueabihf")
+
+# Expect host is Linux/x86_64
+# On any other host the calling script is ended quietly via a bare `exit`.
+check_platform()
+{
+    MACHINE_TYPE=$(uname -m)
+    # Quoted so that an empty value cannot turn the test into a syntax error.
+    if [ "${MACHINE_TYPE}" != 'x86_64' ]; then
+        exit
+    fi
+
+    OS_TYPE=$(uname -s)
+    if [ "${OS_TYPE}" != 'Linux' ]; then
+        exit
+    fi
+}
--- a/deps/sse2neon/.ci/cross-check.sh
+++ b/deps/sse2neon/.ci/cross-check.sh
@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+. .ci/common.sh
+
+check_platform
+
+# Clang/LLVM is natively a cross-compiler.
+# TODO: Do cross-compilation using Clang
+# https://clang.llvm.org/docs/CrossCompilation.html
+if [ $(printenv CXX | grep clang) ]; then
+    exit
+fi
+
+set -x
+
+make clean
+export PATH=arm-gnu-toolchain-${GCC_REL}-x86_64-aarch64-none-linux-gnu/bin:$PATH
+make CROSS_COMPILE=aarch64-none-linux-gnu- check || exit 1 # ARMv8-A
+
+make clean
+export PATH=arm-gnu-toolchain-${GCC_REL}-x86_64-arm-none-linux-gnueabihf/bin:$PATH
+make CROSS_COMPILE=arm-none-linux-gnueabihf- check || exit 1 # ARMv7-A
--- a/deps/sse2neon/.ci/cross-tool.sh
+++ b/deps/sse2neon/.ci/cross-tool.sh
@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+. .ci/common.sh
+
+check_platform
+
+sudo apt-get update -q -y
+sudo apt-get install -q -y qemu-user
+
+# Clang/LLVM is natively a cross-compiler, meaning that one set of programs
+# can compile to all targets by setting the -target option.
+if [ $(printenv CXX | grep clang) ]; then
+    exit
+fi
+
+set -x
+
+sudo apt-get install -y curl xz-utils
+
+curl -L \
+    ${ARM_MIRROR}/arm-gnu-toolchain-${GCC_REL}-x86_64-arm-none-linux-gnueabihf.tar.xz \
+    | tar -Jx || exit 1
+
+curl -L \
+    ${ARM_MIRROR}/arm-gnu-toolchain-${GCC_REL}-x86_64-aarch64-none-linux-gnu.tar.xz \
+    | tar -Jx || exit 1
--- a/deps/sse2neon/.clang-format
+++ b/deps/sse2neon/.clang-format
@ -0,0 +1,22 @@
+BasedOnStyle: Chromium
+Language: Cpp
+MaxEmptyLinesToKeep: 3
+IndentCaseLabels: false
+AllowShortIfStatementsOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+DerivePointerAlignment: false
+PointerAlignment: Right
+SpaceAfterCStyleCast: true
+TabWidth: 4
+UseTab: Never
+IndentWidth: 4
+BreakBeforeBraces: Linux
+AccessModifierOffset: -4
+ForEachMacros:
+  - SET_FOREACH
+  - RB_FOREACH
+AlignEscapedNewlines: Left
+AttributeMacros:
+  - FORCE_INLINE
+  - ALIGN_STRUCT
--- a/deps/sse2neon/.gitattributes
+++ b/deps/sse2neon/.gitattributes
@ -0,0 +1,4 @@
+*.md text=auto
+LICENSE text=auto
+
+sse2neon.h -text linguist-language=c
--- a/deps/sse2neon/.github/CODEOWNERS
+++ b/deps/sse2neon/.github/CODEOWNERS
@ -0,0 +1,5 @@
+# Lines starting with '#' are comments.
+# More details are here: https://help.github.com/articles/about-codeowners/
+
+# Global codeowners:
+* @jserv @howjmay
--- a/deps/sse2neon/.github/workflows/main.yml
+++ b/deps/sse2neon/.github/workflows/main.yml
@ -0,0 +1,127 @@
+name: GitHub Actions
+
+on: [push, pull_request]
+
+jobs:
+  host-x86:
+    runs-on: ubuntu-24.04
+    strategy:
+      matrix:
+        arch: [x86_64]
+        cxx_compiler: [g++, clang++]
+    steps:
+      - name: checkout code
+        uses: actions/checkout@v4
+      - name: build artifact
+        env:
+          CXX: ${{ matrix.cxx_compiler }}
+        run: |
+          sh .ci/cross-tool.sh
+          make check
+          sh .ci/cross-check.sh
+
+  host-win:
+    runs-on: windows-2022
+    strategy:
+      matrix:
+        arch:
+          - x86_64
+          - armv7
+          - aarch64
+    env:
+      LLVM_MINGW_URL: https://github.com/mstorsjo/llvm-mingw/releases/download/20241217/llvm-mingw-20241217-msvcrt-x86_64.zip
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: unpack llvm-mingw
+        run: |
+          curl -L -O $LLVM_MINGW_URL
+          unzip -q llvm-mingw-*.zip
+          rm llvm-mingw-*.zip
+          mv llvm-mingw-* "$HOME/llvm-mingw"
+          echo "$HOME/llvm-mingw/bin" >> $GITHUB_PATH
+      - name: checkout code
+        uses: actions/checkout@v4
+      - name: build artifact
+        env:
+          CXX: ${{ matrix.arch }}-w64-mingw32-clang++
+        run: mingw32-make processor=${{ matrix.arch }}
+      - name: run tests
+        if: matrix.arch == 'x86_64'
+        run: mingw32-make check
+
+  host-arm:
+    runs-on: ubuntu-24.04
+    strategy:
+      matrix:
+        arch_with_features: [
+          {arch: armv7, feature: none, arch_cflags: none},
+          {arch: aarch64, feature: none, arch_cflags: none},
+          {arch: aarch64, feature: crypto+crc, arch_cflags: none},
+          {arch: armv7, feature: none, arch_cflags: '-mcpu=cortex-a32 -mfpu=neon-fp-armv8'}
+        ]
+        cxx_compiler: [g++, clang++-15]
+    steps:
+      - name: checkout code
+        uses: actions/checkout@v4
+      - name: build artifact
+        # The Github Action for non-x86 CPU
+        # https://github.com/uraimo/run-on-arch-action
+        uses: uraimo/run-on-arch-action@v2
+        with:
+          arch: ${{ matrix.arch_with_features.arch }}
+          distro: ubuntu22.04
+          # Speed up builds by storing container images in a GitHub package registry.
+          githubToken: ${{ github.token }}
+          env: |
+            CXX: ${{ matrix.cxx_compiler }}
+            ARCH_CFLAGS: ${{ matrix.arch_with_features.arch_cflags }}
+          install: |
+            apt-get update -q -y
+            apt-get install -q -y gcc "${{ matrix.cxx_compiler }}" make
+          run: |
+            make FEATURE=${{ matrix.arch_with_features.feature }} check
+
+  host-win-msvc:
+    runs-on: windows-2022
+    steps:
+      - name: checkout code
+        uses: actions/checkout@v4
+
+      - name: add msbuild to PATH
+        uses: microsoft/setup-msbuild@v2
+
+      - name: build artifact
+        run: msbuild sse2neon.vcxproj -t:rebuild -property:Configuration=Release -property:Platform=ARM64
+
+      - name: upload artifact
+        uses: actions/upload-artifact@master
+        with:
+          name: msvc-arm64-artifact
+          path: ARM64
+
+  test-win-msvc:
+    runs-on: ubuntu-24.04
+    container: linaro/wine-arm64
+    needs: host-win-msvc
+    steps:
+      - name: download artifact
+        uses: actions/download-artifact@master
+        with:
+          name: msvc-arm64-artifact
+
+      - name: Run tests
+        run: wine-arm64 cmd.exe /c 'Release\sse2neon.exe'
+
+
+  coding-style:
+    runs-on: ubuntu-24.04
+    steps:
+      - name: checkout code
+        uses: actions/checkout@v4
+      - name: style check
+        run: |
+            sudo apt-get install -q -y clang-format-18
+            sh .ci/check-format.sh
+        shell: bash
--- a/deps/sse2neon/.gitignore
+++ b/deps/sse2neon/.gitignore
@ -0,0 +1,10 @@
+*.exe
+*.o
+*.gch
+tests/*.d
+tests/main
+gcc-arm-*
+.vs/
+Debug/
+Release/
+*.vcxproj.user
--- a/deps/sse2neon/CONTRIBUTING.md
+++ b/deps/sse2neon/CONTRIBUTING.md
@ -0,0 +1,462 @@
+# Contributing to SSE2NEON
+
+:+1::tada: First off, thanks for taking the time to contribute! :tada::+1:
+
+The following is a set of guidelines for contributing to [SSE2NEON](https://github.com/DLTcollab/sse2neon),
+hosted on GitHub. These are mostly guidelines, not rules. Use your best
+judgment, and feel free to propose changes to this document in a pull request.
+
+## Issues
+
+This project uses GitHub Issues to track ongoing development, discuss project plans, and keep track of bugs. Be sure to search for existing issues before you create another one.
+
+Visit our [Issues page on GitHub](https://github.com/DLTcollab/sse2neon/issues) to search and submit.
+
+## Add New Intrinsic
+
+The new intrinsic conversion should be added to the `sse2neon.h` file,
+and it should be placed under the correct classification in alphabetical order.
+The classification can be referenced from [Intel Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html).
+
+Classification: `SSE`, `SSE2`, `SSE3`, `SSSE3`, `SSE4.1`, `SSE4.2`
+
+## Coding Convention
+
+We welcome all contributions from corporate, academic and individual developers. However, there are a number of fundamental ground rules that you must adhere to in order to participate. These rules are outlined as follows:
+* All code must adhere to the existing C coding style (see below). While we are somewhat flexible in basic style, you will adhere to what is currently in place. Uncommented, complicated algorithmic constructs will be rejected.
+* All external pull requests must contain sufficient documentation in the pull request comments in order to be accepted.
+
+Software requirement: [clang-format](https://clang.llvm.org/docs/ClangFormat.html) version 18 or later.
+
+Use the command `$ clang-format -i *.[ch]` to enforce a consistent coding style.
+
+## Naming Conventions
+
+There are some general rules.
+* Names with leading and trailing underscores are reserved for system purposes, and most systems use them for names that the user should not have to know.
+* Function, typedef, and variable names, as well as struct, union, and enum tag names should be in lower case.
+* Many function-like macros are in all CAPS.
+* Avoid names that differ only in case, like `foo` and `Foo`. Similarly, avoid `foobar` and `foo_bar`. The potential for confusion is considerable.
+* Similarly, avoid names that look like each other. On many terminals and printers, `l`, `1` and `I` look quite similar. A variable named `l` is particularly bad because it looks so much like the constant `1`.
+
+In general, global names (including enums) should have a common prefix (`SSE2NEON_` for macros and enum constants; `_sse2neon_` for functions) identifying the module that they belong with. Globals may alternatively be grouped in a global structure. Typedeffed names often have `_t` appended to their name.
+
+Avoid using names that might conflict with other names used in standard libraries. There may be more library code included in some systems than you need. Your program could also be extended in the future.
+
+## Coding Style for Modern C
+
+This coding style is a variation of the K&R style. Some general principles: honor tradition, but accept progress; be consistent;
+embrace the latest C standards; embrace modern compilers, their static analysis
+capabilities and sanitizers.
+
+### Indentation
+
+Use 4 spaces rather than tabs.
+
+### Line length
+
+All lines should generally be within 80 characters.  Wrap long lines.
+There are some good reasons behind this:
+* It forces the developer to write more succinct code;
+* Humans are better at processing information in smaller quantity portions;
+* It helps users of vi/vim (and potentially other editors) who use vertical splits.
+
+### Comments
+
+Multi-line comments shall have the opening and closing characters
+in a separate line, with the lines containing the content prefixed by a space
+and the `*` characters for alignment, e.g.,
+```c
+/*
+ * This is a multi-line comment.
+ */
+
+/* One line comment. */
+```
+
+Use multi-line comments for more elaborate descriptions or before more
+significant logical blocks of code.
+
+Single-line comments shall be written in C89 style:
+```c
+    return (uintptr_t) val;  /* return a bitfield */
+```
+
+Leave two spaces between the statement and the inline comment.
+
+### Spacing and brackets
+
+Use one space after the conditional or loop keyword, no spaces around
+their brackets, and one space before the opening curly bracket.
+
+Functions (their declarations or calls), `sizeof` operator or similar
+macros shall not have a space after their name/keyword or around the
+brackets, e.g.,
+```c
+unsigned total_len = offsetof(obj_t, items[n]);
+unsigned obj_len = sizeof(obj_t);
+```
+
+Use brackets to avoid ambiguity and with operators such as `sizeof`,
+but otherwise avoid redundant or excessive brackets.
+
+### Variable names and declarations
+
+- Use descriptive names for global variables and short names for locals.
+Find the right balance between descriptive and succinct.
+
+- Use [snakecase](https://en.wikipedia.org/wiki/Snake_case).
+Do not use "camelcase".
+
+- Do not use Hungarian notation or other unnecessary prefixing or suffixing.
+
+- Use the following spacing for pointers:
+```c
+const char *name;  /* pointer to const data; '*' with the name and space before it */
+conf_t * const cfg;  /* const pointer; spaces around 'const' */
+const uint8_t * const charmap;  /* const pointer and const data */
+const void * restrict key;  /* pointer to const data which does not alias */
+```
+
+### Type definitions
+
+Declarations shall be on the same line, e.g.,
+```c
+typedef void (*dir_iter_t)(void *, const char *, struct dirent *);
+```
+
+_Typedef_ structures rather than pointers.  Note that structures can be kept
+opaque if they are not dereferenced outside the translation unit where they
+are defined.  Pointers can be _typedefed_ only if there is a very compelling
+reason.
+
+New types may be suffixed with `_t`.  Structure name, when used within the
+translation unit, may be omitted, e.g.:
+
+```c
+typedef struct {
+    unsigned if_index;
+    unsigned addr_len;
+    addr_t next_hop;
+} route_info_t;
+```
+
+### Initialization
+
+Embrace C99 structure initialization where reasonable, e.g.,
+```c
+static const crypto_ops_t openssl_ops = {
+    .create = openssl_crypto_create,
+    .destroy = openssl_crypto_destroy,
+    .encrypt = openssl_crypto_encrypt,
+    .decrypt = openssl_crypto_decrypt,
+    .hmac = openssl_crypto_hmac,
+};
+```
+
+Embrace C99 array initialization, especially for the state machines, e.g.,
+```c
+static const uint8_t tcp_fsm[TCP_NSTATES][2][TCPFC_COUNT] = {
+    [TCPS_CLOSED] = {
+        [FLOW_FORW] = {
+            /* Handshake (1): initial SYN. */
+            [TCPFC_SYN]	= TCPS_SYN_SENT,
+        },
+    },
+    ...
+}
+```
+
+### Control structures
+
+Try to make the control flow easy to follow.  Avoid long convoluted logic
+expressions; try to split them where possible (into inline functions,
+separate if-statements, etc).
+
+The control structure keyword and the expression in the brackets should be
+separated by a single space.  The opening curly bracket shall be in the
+same line, also separated by a single space.  Example:
+
+```c
+    for (;;) {
+        obj = get_first();
+        while ((obj = get_next(obj))) {
+            ...
+        }
+        if (done)
+            break;
+    }
+```
+
+Do not add inner spaces around the brackets. There should be one space after
+the semicolon when `for` has expressions:
+```c
+    for (unsigned i = 0; i < __arraycount(items); i++) {
+        ...
+    }
+```
+
+#### Avoid unnecessary nesting levels
+
+Avoid:
+```c
+int inspect(obj_t *obj)
+{
+    if (cond) {
+        ...
+        /* long code block */
+        ...
+        return 0;
+    }
+    return -1;
+}
+```
+
+Consider:
+```c
+int inspect(obj_t *obj)
+{
+    if (!cond)
+        return -1;
+
+    ...
+    return 0;
+}
+```
+
+However, do not make logic more convoluted.
+
+### `if` statements
+
+Curly brackets and spacing follow the K&R style:
+```c
+    if (a == b) {
+        ..
+    } else if (a < b) {
+        ...
+    } else {
+        ...
+    }
+```
+
+Simple and succinct one-line if-statements may omit curly brackets:
+```c
+    if (!valid)
+        return -1;
+```
+
+However, do prefer curly brackets with multi-line or more complex statements.
+If one branch uses curly brackets, then all other branches shall use the
+curly brackets too.
+
+Wrap long conditions to the if-statement indentation adding extra 4 spaces:
+```c
+    if (some_long_expression &&
+        another_expression) {
+        ...
+    }
+```
+
+#### Avoid redundant `else`
+
+Avoid:
+```c
+    if (flag & F_FEATURE_X) {
+        ...
+        return 0;
+    } else {
+        return -1;
+    }
+```
+
+Consider:
+```c
+    if (flag & F_FEATURE_X) {
+        ...
+        return 0;
+    }
+    return -1;
+```
+
+### `switch` statements
+
+Switch statements should have the `case` blocks at the same indentation
+level, e.g.:
+```c
+    switch (expr) {
+    case A:
+        ...
+        break;
+    case B:
+        /* fallthrough */
+    case C:
+        ...
+        break;
+    }
+```
+
+If the case block does not break, then it is strongly recommended to add a
+comment containing "fallthrough" to indicate it.  Modern compilers can also
+be configured to require such comment (see gcc `-Wimplicit-fallthrough`).
+
+### Function definitions
+
+The opening and closing curly brackets shall also be in the separate lines (K&R style).
+
+```c
+ssize_t hex_write(FILE *stream, const void *buf, size_t len)
+{
+    ...
+}
+```
+
+Do not use old style K&R style C definitions.
+
+### Object abstraction
+
+Objects are often "simulated" by the C programmers with a `struct` and
+its "public API".  To enforce the information hiding principle, it is a
+good idea to define the structure in the source file (translation unit)
+and provide only the _declaration_ in the header.  For example, `obj.c`:
+
+```c
+#include "obj.h"
+
+struct obj {
+    int value;
+};
+
+obj_t *obj_create(void)
+{
+    return calloc(1, sizeof(obj_t));
+}
+
+void obj_destroy(obj_t *obj)
+{
+    free(obj);
+}
+```
+
+With an example `obj.h`:
+```c
+#ifndef _OBJ_H_
+#define _OBJ_H_
+
+typedef struct obj obj_t;
+
+obj_t *obj_create(void);
+void obj_destroy(obj_t *);
+
+#endif
+```
+
+Such structuring will prevent direct access of the `obj_t` members outside
+the `obj.c` source file.  The implementation (of such "class" or "module")
+may be large and abstracted within separate source files.  In such case,
+consider separating structures and "methods" into separate headers (think of
+different visibility), for example `obj_impl.h` (private) and `obj.h` (public).
+
+Consider `crypto_impl.h`:
+```c
+#ifndef _CRYPTO_IMPL_H_
+#define _CRYPTO_IMPL_H_
+
+#if !defined(__CRYPTO_PRIVATE)
+#error "only to be used by the crypto modules"
+#endif
+
+#include "crypto.h"
+
+typedef struct crypto {
+    crypto_cipher_t cipher;
+    void *key;
+    size_t key_len;
+    ...
+}
+...
+
+#endif
+```
+
+And `crypto.h` (public API):
+
+```c
+#ifndef _CRYPTO_H_
+#define _CRYPTO_H_
+
+typedef struct crypto crypto_t;
+
+crypto_t *crypto_create(crypto_cipher_t);
+void crypto_destroy(crypto_t *);
+...
+
+#endif
+```
+
+### Use reasonable types
+
+Use `unsigned` for general iterators; use `size_t` for general sizes; use
+`ssize_t` to return a size which may include an error.  Of course, consider
+possible overflows.
+
+Avoid using `uint8_t` or `uint16_t` or other sub-word types for general
+iterators and similar cases, unless programming for micro-controllers or
+other constrained environments.
+
+C has rather peculiar _type promotion rules_ and unnecessary use of sub-word
+types might contribute to a bug once in a while.
+
+### Embrace portability
+
+#### Byte-order
+
+Do not assume an x86 or little-endian architecture.  Use endian conversion
+functions when operating on on-disk and on-the-wire structures, and in other
+cases where it is appropriate.
+
+#### Types
+
+- Do not assume a particular 32-bit vs 64-bit architecture, e.g., do not
+assume the size of `long` or `unsigned long`.  Use `int64_t` or `uint64_t`
+for the 8-byte integers.
+
+- Do not assume `char` is signed; for example, on Arm it is unsigned.
+
+- Use C99 macros for constant prefixes or formatting of the fixed-width
+types.
+
+Use:
+```c
+#define	SOME_CONSTANT (UINT64_C(1) << 48)
+printf("val %" PRIu64 "\n", SOME_CONSTANT);
+```
+
+Do not use:
+```c
+#define	SOME_CONSTANT (1ULL << 48)
+printf("val %lld\n", SOME_CONSTANT);
+```
+
+#### Avoid unaligned access
+
+Do not assume unaligned access is safe.  It is not safe on Arm, POWER,
+and various other architectures.  Moreover, even on x86 unaligned access
+is slower.
+
+#### Avoid extreme portability
+
+Unless programming for micro-controllers or exotic CPU architectures,
+focus on the common denominator of the modern CPU architectures, avoiding
+the very maximum portability which can make the code unnecessarily cumbersome.
+
+Some examples:
+- It is fair to assume `sizeof(int) == 4` since it is the case on all modern
+mainstream architectures.  PDP-11 era is long gone.
+- Using `1U` instead of `UINT32_C(1)` or `(uint32_t) 1` is also fine.
+- It is fair to assume that `NULL` is matching `(uintptr_t) 0` and it is fair
+to `memset()` structures with zero.  Non-zero `NULL` is for retro computing.
+
+## References
+- [Linux kernel coding style](https://www.kernel.org/doc/html/latest/process/coding-style.html)
+- 1999, Brian W. Kernighan and Rob Pike, The Practice of Programming, Addison–Wesley.
+- 1993, Bill Shannon, [C Style and Coding Standards for SunOS](https://devnull-cz.github.io/unix-linux-prog-in-c/cstyle.ms.pdf)
--- a/deps/sse2neon/LICENSE
+++ b/deps/sse2neon/LICENSE
@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2015-2025 SSE2NEON Contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/deps/sse2neon/Makefile
+++ b/deps/sse2neon/Makefile
@ -0,0 +1,93 @@
+# Fall back to gcc/g++ only when CC/CXX have no value at all.
+# NOTE(review): GNU Make predefines CC and CXX, so these two guards look like
+# they never fire in practice; confirm before relying on the gcc fallback.
+ifndef CC
+override CC = gcc
+endif
+
+ifndef CXX
+override CXX = g++
+endif
+
+# Native build: take the target processor from the host's `uname -m`.
+# Cross build: switch to the CROSS_COMPILE-prefixed gcc/g++, link statically,
+# and derive the processor from the __ARM_ARCH value that the cross
+# preprocessor predefines (only 7 and 8 are accepted).
+ifndef CROSS_COMPILE
+    processor := $(shell uname -m)
+else # CROSS_COMPILE was set
+    CC = $(CROSS_COMPILE)gcc
+    CXX = $(CROSS_COMPILE)g++
+    CXXFLAGS += -static
+    LDFLAGS += -static
+    # "#define __ARM_ARCH " is 19 characters long, so column 20 onward is the value.
+    check_arm := $(shell echo | $(CROSS_COMPILE)cpp -dM - | grep " __ARM_ARCH " | cut -c20-)
+    ifeq ($(check_arm),8)
+        processor = aarch64
+    else ifeq ($(check_arm),7) # detect ARMv7-A only
+        processor = arm
+    else
+        $(error Unsupported cross-compiler)
+    endif
+endif
+
+# Command prefix used by `check` to run the test binary: empty for native
+# builds, qemu-<processor> for cross builds.
+EXEC_WRAPPER =
+ifdef CROSS_COMPILE
+EXEC_WRAPPER = qemu-$(processor)
+endif
+
+# Follow platform-specific configurations
+# NOTE: despite its name, ARCH_CFLAGS_IS_SET becomes "true" when ARCH_CFLAGS
+# is empty or the literal word "none", i.e. when the per-processor defaults
+# below have to be applied. Any other value of ARCH_CFLAGS is kept as given.
+ARCH_CFLAGS ?=
+ARCH_CFLAGS_IS_SET =
+ifeq ($(ARCH_CFLAGS),)
+    ARCH_CFLAGS_IS_SET = true
+endif
+ifeq ($(ARCH_CFLAGS),none)
+    ARCH_CFLAGS_IS_SET = true
+endif
+ifdef ARCH_CFLAGS_IS_SET
+    ifeq ($(processor),$(filter $(processor),aarch64 arm64))
+        override ARCH_CFLAGS := -march=armv8-a+fp+simd
+    else ifeq ($(processor),$(filter $(processor),i386 x86_64))
+        override ARCH_CFLAGS := -maes -mpclmul -mssse3 -msse4.2
+    else ifeq ($(processor),$(filter $(processor),arm armv7 armv7l))
+        override ARCH_CFLAGS := -mfpu=neon
+    else
+        $(error Unsupported architecture)
+    endif
+endif
+
+# FEATURE lists extra extensions; commas are turned into '+' and the result is
+# appended to ARCH_CFLAGS as a "+ext" suffix. Empty or "none" changes nothing.
+# NOTE(review): the suffix is glued onto whatever ARCH_CFLAGS ends with, so
+# this presumably expects the last flag to be a -march= option; confirm.
+FEATURE ?=
+ifneq ($(FEATURE),)
+ifneq ($(FEATURE),none)
+COMMA:= ,
+ARCH_CFLAGS := $(ARCH_CFLAGS)+$(subst $(COMMA),+,$(FEATURE))
+endif
+endif
+
+# Flags for building the test suite: warnings, the repository root on the
+# include path, the architecture flags chosen above, and GNU C++14.
+CXXFLAGS += -Wall -Wcast-qual -Wconversion -I. $(ARCH_CFLAGS) -std=gnu++14
+LDFLAGS	+= -lm
+OBJS = \
+    tests/binding.o \
+    tests/common.o \
+    tests/impl.o \
+    tests/main.o
+# One dependency file per object (<object>.d), written by -MMD -MF below.
+deps := $(OBJS:%.o=%.o.d)
+
+# Suffix rule: compile each .cpp into its .o and emit the header dependencies
+# next to the object.
+.SUFFIXES: .o .cpp
+.cpp.o:
+	$(CXX) -o $@ $(CXXFLAGS) -c -MMD -MF $@.d $<
+
+EXEC = tests/main
+
+# Link the test runner from all test objects.
+$(EXEC): $(OBJS)
+	$(CXX) $(LDFLAGS) -o $@ $^
+
+# Build the test runner and execute it (through EXEC_WRAPPER when set).
+# For Arm processors the header is additionally compiled on its own with the C
+# compiler first, presumably to prove that sse2neon.h is valid C and not only
+# valid C++. The processor list includes armv7 so that it agrees with the
+# ARCH_CFLAGS selection, which accepts "arm armv7 armv7l".
+check: tests/main
+ifeq ($(processor),$(filter $(processor),aarch64 arm64 arm armv7 armv7l))
+	$(CC) $(ARCH_CFLAGS) -c sse2neon.h
+endif
+	$(EXEC_WRAPPER) $^
+
+# Reformat the header and the test sources in place with clang-format-18.
+# The guard stops the recipe when the tool is missing instead of only printing
+# a hint and then running the missing command anyway.
+indent:
+	@echo "Formatting files with clang-format.."
+	@if ! hash clang-format-18; then echo "clang-format-18 is required to indent"; exit 1; fi
+	clang-format-18 -i sse2neon.h tests/*.cpp tests/*.h
+
+# Declare the targets that actually exist: the formatting target is named
+# `indent`; there is no `format` rule in this file.
+.PHONY: clean check indent
+# Remove the objects, the test binary, the dependency files and the
+# precompiled header that `check` leaves behind on Arm.
+clean:
+	$(RM) $(OBJS) $(EXEC) $(deps) sse2neon.h.gch
+
+-include $(deps)
--- a/deps/sse2neon/README.md
+++ b/deps/sse2neon/README.md
@ -0,0 +1,300 @@
+# sse2neon
+![GitHub Actions](https://github.com/DLTcollab/sse2neon/workflows/GitHub%20Actions/badge.svg)
+
+A C/C++ header file that converts Intel SSE intrinsics to Arm/Aarch64 NEON intrinsics.
+
+## Introduction
+
+`sse2neon` is a translator of Intel SSE (Streaming SIMD Extensions) intrinsics
+to [Arm NEON](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon),
+shortening the time needed to get an Arm working program that then can be used to
+extract profiles and to identify hot paths in the code.
+The header file `sse2neon.h` contains several of the functions provided by Intel
+intrinsic headers such as `<xmmintrin.h>`, only implemented with NEON-based counterparts
+to produce the exact semantics of the intrinsics.
+
+## Mapping and Coverage
+
+Header file | Extension |
+---|---|
+`<mmintrin.h>` | MMX |
+`<xmmintrin.h>` | SSE |
+`<emmintrin.h>` | SSE2 |
+`<pmmintrin.h>` | SSE3 |
+`<tmmintrin.h>` | SSSE3 |
+`<smmintrin.h>` | SSE4.1 |
+`<nmmintrin.h>` | SSE4.2 |
+`<wmmintrin.h>` | AES  |
+
+`sse2neon` aims to support the SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 and AES extensions.
+
+In order to deliver NEON-equivalent intrinsics for all SSE intrinsics used widely,
+please be aware that some SSE intrinsics have a direct mapping to a concrete
+NEON-equivalent intrinsic. Others, unfortunately, lack a 1:1 mapping, meaning that
+their equivalents are built utilizing a number of NEON intrinsics.
+
+For example, SSE intrinsic `_mm_loadu_si128` has a direct NEON mapping (`vld1q_s32`),
+but SSE intrinsic `_mm_maddubs_epi16` has to be implemented with 13+ NEON instructions.
+
+### Floating-point compatibility
+
+Some conversions require several NEON intrinsics, which may produce inconsistent results
+compared to their SSE counterparts due to differences in the arithmetic rules of IEEE-754.
+
+Taking a possible conversion of `_mm_rsqrt_ps` as example:
+
+```c
+__m128 _mm_rsqrt_ps(__m128 in)
+{
+    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+
+    return vreinterpretq_m128_f32(out);
+}
+```
+
+The `_mm_rsqrt_ps` conversion will produce NaN if a source value is `0.0` (first INF for the
+reciprocal square root of `0.0`, then INF * `0.0` using `vmulq_f32`). In contrast,
+the SSE counterpart produces INF if a source value is `0.0`.
+As a result, additional treatments should be applied to ensure consistency between the conversion and its SSE counterpart.
+
+## Requirement
+
+Developers are advised to utilize sse2neon.h with GCC version 10 or higher, or Clang version 11 or higher. While sse2neon.h might be compatible with earlier versions, certain vector operation errors have been identified in those versions. For further details, refer to the discussion in issue [#622](https://github.com/DLTcollab/sse2neon/issues/622).
+
+## Usage
+
+- Put the file `sse2neon.h` into your source code directory.
+
+- Locate the following SSE header files included in the code:
+```C
+#include <xmmintrin.h>
+#include <emmintrin.h>
+```
+  {p,t,s,n,w}mmintrin.h can be replaced as well.
+
+- Replace them with:
+```C
+#include "sse2neon.h"
+```
+- If you target Windows Arm64EC, pass `/D_DISABLE_SOFTINTRIN_=1` to MSVC or add `#define _DISABLE_SOFTINTRIN_ 1` before including any Windows header files to disable implicit inclusion of SSE header files.
+- Explicitly specify platform-specific options to gcc/clang compilers.
+  * On ARMv8-A 64-bit targets, you should specify the following compiler option: (Remove `crypto` and/or `crc` if your architecture does not support cryptographic and/or CRC32 extensions)
+  ```shell
+  -march=armv8-a+fp+simd+crypto+crc
+  ```
+  * On ARMv8-A 32-bit targets, you should specify the following compiler option:
+  ```shell
+  -mfpu=neon-fp-armv8
+  ```
+  * On ARMv7-A targets, you need to append the following compiler option:
+  ```shell
+  -mfpu=neon
+  ```
+
+## Compile-time Configurations
+
+Though floating-point operations in NEON use the IEEE single-precision format, NEON does not fully comply with the IEEE standard when inputs or results are denormal or NaN values, in order to minimize power consumption and maximize performance.
+Considering the balance between correctness and performance, `sse2neon` recognizes the following compile-time configurations:
+* `SSE2NEON_PRECISE_MINMAX`: Enable precise implementation of `_mm_min_{ps,pd}` and `_mm_max_{ps,pd}`. If you need consistent results such as handling with NaN values, enable it.
+* `SSE2NEON_PRECISE_DIV`: Enable precise implementation of `_mm_rcp_ps` and `_mm_div_ps` by additional Newton-Raphson iteration for accuracy.
+* `SSE2NEON_PRECISE_SQRT`: Enable precise implementation of `_mm_sqrt_ps` and `_mm_rsqrt_ps` by additional Newton-Raphson iteration for accuracy.
+* `SSE2NEON_PRECISE_DP`: Enable precise implementation of `_mm_dp_pd`. When the conditional bit is not set, the corresponding multiplication would not be executed.
+* `SSE2NEON_SUPPRESS_WARNINGS`: Set this macro to disable the warning which is emitted by default when optimizations are enabled.
+
+The above are turned off by default, and you should define the corresponding macro(s) as `1` before including `sse2neon.h` if you need the precise implementations.
+
+## Run Built-in Test Suite
+
+`sse2neon` provides a unified interface for developing test cases. These test
+cases are located in `tests` directory, and the input data is specified at
+runtime. Use the following commands to perform test cases:
+```shell
+$ make check
+```
+
+To run the check with extra features enabled, assign them to the `FEATURE` variable.
+If `none` is assigned, then the command will be the same as simply calling `make check`.
+The following command enable `crypto` and `crc` features in the tests.
+```
+$ make FEATURE=crypto+crc check
+```
+
+For running check on certain CPU, setting the mode of FPU, etc.,
+you can also assign the desired options to the `ARCH_CFLAGS` variable.
+If `none` is assigned, the command acts the same as calling `make check`.
+For instance, to run tests on Cortex-A53 with enabling ARM VFPv4 extension and NEON:
+```
+$ make ARCH_CFLAGS="-mcpu=cortex-a53 -mfpu=neon-vfpv4" check
+```
+
+### Running tests on hosts other than ARM platform
+
+For running tests on hosts other than ARM platform,
+you can specify a GNU toolchain for cross compilation with the `CROSS_COMPILE` variable.
+[QEMU](https://www.qemu.org/) should be installed in advance.
+
+For ARMv8-A running in 64-bit mode type:
+```shell
+$ make CROSS_COMPILE=aarch64-linux-gnu- check # ARMv8-A
+```
+
+For ARMv7-A type:
+```shell
+$ make CROSS_COMPILE=arm-linux-gnueabihf- check # ARMv7-A
+```
+
+For ARMv8-A running in 32-bit mode (A32 instruction set) type:
+```shell
+$ make \
+  CROSS_COMPILE=arm-linux-gnueabihf- \
+  ARCH_CFLAGS="-mcpu=cortex-a32 -mfpu=neon-fp-armv8" \
+  check 
+```
+
+Check the details via [Test Suite for SSE2NEON](tests/README.md).
+
+### Optimization
+
+The SSE2NEON project is designed with performance-sensitive scenarios in mind, and as such, optimization options (e.g. `O1`, `O2`) can lead to misbehavior under specific circumstances. For example, frequent changes to the rounding mode or repeated calls to `_MM_SET_DENORMALS_ZERO_MODE()` may introduce unintended behavior.
+
+Enforcing no optimizations for specific intrinsics could solve these boundary cases but may negatively impact general performance. Therefore, we have decided to prioritize performance and shift the responsibility for handling such edge cases to developers.
+
+It is important to be aware of these potential pitfalls when enabling optimizations and ensure that your code accounts for these scenarios if necessary.
+
+
+## Adoptions
+Here is a partial list of open source projects that have adopted `sse2neon` for Arm/Aarch64 support.
+* [Aaru Data Preservation Suite](https://www.aaru.app/) is a fully-featured software package to preserve all storage media from the very old to the cutting edge, as well as to give detailed information about any supported image file (whether from Aaru or not) and to extract the files from those images.
+* [aether-game-utils](https://github.com/johnhues/aether-game-utils) is a collection of cross platform utilities for quickly creating small game prototypes in C++.
+* [ALE](https://github.com/sc932/ALE), aka Assembly Likelihood Evaluation, is a tool for evaluating accuracy of assemblies without the need of a reference genome.
+* [AnchorWave](https://github.com/baoxingsong/AnchorWave), Anchored Wavefront Alignment, identifies collinear regions via conserved anchors (full-length CDS and full-length exon have been implemented currently) and breaks collinear regions into shorter fragments, i.e., anchor and inter-anchor intervals.
+* [ATAK-CIV](https://github.com/deptofdefense/AndroidTacticalAssaultKit-CIV), Android Tactical Assault Kit for Civilian Use, is the official geospatial-temporal and situational awareness tool used by the US Government.
+* [Apache Doris](https://doris.apache.org/) is a Massively Parallel Processing (MPP) based interactive SQL data warehousing for reporting and analysis.
+* [Apache Impala](https://impala.apache.org/) is a lightning-fast, distributed SQL queries for petabytes of data stored in Apache Hadoop clusters.
+* [Apache Kudu](https://kudu.apache.org/) completes Hadoop's storage layer to enable fast analytics on fast data.
+* [apollo](https://github.com/ApolloAuto/apollo) is a high performance, flexible architecture which accelerates the development of Autonomous Vehicles.
+* [ares](https://github.com/ares-emulator/ares) is a cross-platform, open source, multi-system emulator, focusing on accuracy and preservation.
+* [ART](https://github.com/dinosaure/art) is an implementation in OCaml of [Adaptive Radix Tree](https://db.in.tum.de/~leis/papers/ART.pdf) (ART).
+* [Async](https://github.com/romange/async) is a set of c++ primitives that allows efficient and rapid development in C++17 on GNU/Linux systems.
+* [avec](https://github.com/unevens/avec) is a little library for using SIMD instructions on both x86 and Arm.
+* [BEAGLE](https://github.com/beagle-dev/beagle-lib) is a high-performance library that can perform the core calculations at the heart of most Bayesian and Maximum Likelihood phylogenetics packages.
+* [BitMagic](https://github.com/tlk00/BitMagic) implements compressed bit-vectors and containers (vectors) based on ideas of bit-slicing transform and Rank-Select compression, offering sets of method to architect your applications to use HPC techniques to save memory (thus be able to fit more data in one compute unit) and improve storage and traffic patterns when storing data vectors and models in files or object stores.
+* [bipartite\_motif\_finder](https://github.com/soedinglab/bipartite_motif_finder), also known as BMF (Bipartite Motif Finder), is an open source tool for finding co-occurrences of sequence motifs in genomic sequences.
+* [Blender](https://www.blender.org/) is the free and open source 3D creation suite, supporting the entirety of the 3D pipeline.
+* [Boo](https://github.com/AxioDL/boo) is a cross-platform windowing and event manager similar to SDL or SFML, with additional 3D rendering functionality.
+* [Brickworks](https://github.com/sdangelo/brickworks) is a music DSP toolkit that supplies with the fundamental building blocks for creating and enhancing audio engines on any platform.
+* [CARTA](https://github.com/CARTAvis/carta-backend) is a new visualization tool designed for viewing radio astronomy images in CASA, FITS, MIRIAD, and HDF5 formats (using the IDIA custom schema for HDF5).
+* [Catcoon](https://github.com/i-evi/catcoon) is a [feedforward neural network](https://en.wikipedia.org/wiki/Feedforward_neural_network) implementation in C.
+* [compute-runtime](https://github.com/intel/compute-runtime), the Intel Graphics Compute Runtime for oneAPI Level Zero and OpenCL Driver, provides compute API support (Level Zero, OpenCL) for Intel graphics hardware architectures (HD Graphics, Xe).
+* [contour](https://github.com/contour-terminal/contour) is a modern and actually fast virtual terminal emulator.
+* [Cog](https://github.com/losnoco/Cog) is a free and open source audio player for macOS.
+* [dab-cmdline](https://github.com/JvanKatwijk/dab-cmdline) provides entries for the functionality to handle Digital audio broadcasting (DAB)/DAB+ through some simple calls.
+* [DISTRHO](https://distrho.sourceforge.io/) is an open-source project for Cross-Platform Audio Plugins.
+* [Dragonfly](https://github.com/dragonflydb/dragonfly) is a modern in-memory datastore, fully compatible with Redis and Memcached APIs.
+* [EDGE](https://github.com/3dfxdev/EDGE) is an advanced OpenGL source port spawned from the DOOM engine, with focus on easy development and expansion for modders and end-users.
+* [Embree](https://github.com/embree/embree) is a collection of high-performance ray tracing kernels. Its target users are graphics application engineers who want to improve the performance of their photo-realistic rendering application by leveraging Embree's performance-optimized ray tracing kernels.
+* [emp-tool](https://github.com/emp-toolkit/emp-tool) aims to provide a benchmark for secure computation and allowing other researchers to experiment and extend.
+* [Exudyn](https://github.com/jgerstmayr/EXUDYN) is a C++ based Python library for efficient simulation of flexible multibody dynamics systems.
+* [FoundationDB](https://www.foundationdb.org) is a distributed database designed to handle large volumes of structured data across clusters of commodity servers.
+* [fsrc](https://github.com/elsamuko/fsrc) is capable of searching large codebases for text snippets.
+* [GDAL](https://gdal.org) is a translator library for raster and vector geospatial data formats that comes with a variety of useful command line utilities for data translation and processing.
+* [gmmlib](https://github.com/intel/gmmlib) is the Intel Graphics Memory Management Library that provides device specific and buffer management for the Intel Graphics Compute Runtime for OpenCL and the Intel Media Driver for VAAPI.
+* [HISE](https://github.com/christophhart/HISE) is a cross-platform open source audio application for building virtual instruments, emphasizing on sampling, but includes some basic synthesis features for making hybrid instruments as well as audio effects.
+* [iqtree2](https://github.com/iqtree/iqtree2) is an efficient and versatile stochastic implementation to infer phylogenetic trees by maximum likelihood.
+* [indelPost](https://github.com/stjude/indelPost) is a Python library for indel processing via realignment and read-based phasing to resolve alignment ambiguities.
+* [IResearch](https://github.com/iresearch-toolkit/iresearch) is a cross-platform, high-performance document oriented search engine library written entirely in C++ with the focus on a pluggability of different ranking/similarity models.
+* [Kraken](https://github.com/Wabi-Studios/Kraken) is a 3D animation platform redefining animation composition, collaborative workflows, simulation engines, skeletal rigging systems, and look development from storyboard to final render.
+* [kram](https://github.com/alecazam/kram) is a wrapper to several popular encoders to and from PNG/[KTX](https://www.khronos.org/opengles/sdk/tools/KTX/file_format_spec/) files with [LDR/HDR and BC/ASTC/ETC2](https://developer.arm.com/solutions/graphics-and-gaming/developer-guides/learn-the-basics/adaptive-scalable-texture-compression/single-page).
+* [Krita](https://invent.kde.org/graphics/krita) is a cross-platform application that offers an end-to-end solution for creating digital art files from scratch built on the KDE and Qt frameworks.
+* [libCML](https://github.com/belosthomas/libCML) is a SLAM library and scientific tool, which include a novel fast thread-safe graph map implementation.
+* [libhdfs3](https://github.com/ClickHouse/libhdfs3) is implemented based on native Hadoop RPC protocol and Hadoop Distributed File System (HDFS), a highly fault-tolerant distributed fs, data transfer protocol.
+* [libpostal](https://github.com/openvenues/libpostal) is a C library for parsing/normalizing street addresses around the world using statistical NLP and open data.
+* [libscapi](https://github.com/cryptobiu/libscapi) stands for the "Secure Computation API", providing  reliable, efficient, and highly flexible cryptographic infrastructure.
+* [libstreamvbyte](https://github.com/wst24365888/libstreamvbyte) is a C++ implementation of [StreamVByte](https://arxiv.org/abs/1709.08990).
+* [libmatoya](https://github.com/matoya/libmatoya) is a cross-platform application development library, providing various features such as common cryptography tasks.
+* [Loosejaw](https://github.com/TheHolyDiver/Loosejaw) provides deep hybrid CPU/GPU digital signal processing.
+* [Madronalib](https://github.com/madronalabs/madronalib) enables efficient audio DSP on SIMD processors with readable and brief C++ code.
+* [minimap2](https://github.com/lh3/minimap2) is a versatile sequence alignment program that aligns DNA or mRNA sequences against a large reference database.
+* [mixed-fem](https://github.com/tytrusty/mixed-fem) is an open source reference implementation of Mixed Variational Finite Elements for Implicit Simulation of Deformables.
+* [MMseqs2](https://github.com/soedinglab/MMseqs2) (Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets.
+* [MRIcroGL](https://github.com/rordenlab/MRIcroGL) is a cross-platform tool for viewing NIfTI, DICOM, MGH, MHD, NRRD, AFNI format medical images.
+* [N2](https://github.com/oddconcepts/n2o) is an approximate nearest neighborhoods algorithm library written in C++, providing a much faster search speed than other implementations when modeling large dataset.
+* [nanors](https://github.com/sleepybishop/nanors) is a tiny, performant implementation of [Reed-Solomon codes](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction), capable of reaching multi-gigabit speeds on a single core.
+* [niimath](https://github.com/rordenlab/niimath) is a general image calculator with superior performance.
+* [NVIDIA GameWorks](https://developer.nvidia.com/gameworks-source-github) has been already used in a lot of games. These repositories are public on GitHub.
+* [Nx Meta Platform Open Source Components](https://github.com/networkoptix/nx_open) are used to build all Powered-by-Nx products including Nx Witness Video Management System (VMS).
+* [ofxNDI](https://github.com/leadedge/ofxNDI) is an [openFrameworks](https://openframeworks.cc/) addon to allow sending and receiving images over a network using the [NewTek](https://en.wikipedia.org/wiki/NewTek) Network Device Protocol.
+* [OGRE](https://github.com/OGRECave/ogre) is a scene-oriented, flexible 3D engine written in C++ designed to make it easier and more intuitive for developers to produce games and demos utilising 3D hardware.
+* [Olive](https://github.com/olive-editor/olive) is a free non-linear video editor for Windows, macOS, and Linux.
+* [OpenColorIO](https://github.com/AcademySoftwareFoundation/OpenColorIO) is a complete color management solution geared towards motion picture production with an emphasis on visual effects and computer animation.
+* [OpenXRay](https://github.com/OpenXRay/xray-16) is an improved version of the X-Ray engine, used in world famous S.T.A.L.K.E.R. game series by GSC Game World.
+* [parallel-n64](https://github.com/libretro/parallel-n64) is an optimized/rewritten Nintendo 64 emulator made specifically for [Libretro](https://www.libretro.com/).
+* [Pathfinder C++](https://github.com/floppyhammer/pathfinder-cpp) is a fast, practical, GPU-based rasterizer for fonts and vector graphics using Vulkan and C++.
+* [PFFFT](https://github.com/marton78/pffft) does 1D Fast Fourier Transforms, of single precision real and complex vectors.
+* [pixaccess](https://github.com/oliverue/pixaccess) provides the abstractions for integer and float bitmaps, pixels, and aliased (nearest neighbor) and anti-aliased (bi-linearly interpolated) pixel access.
+* [PlutoSDR Firmware](https://github.com/seanstone/plutosdr-fw) is the customized firmware for the [PlutoSDR](https://wiki.analog.com/university/tools/pluto) that can be used to introduce fundamentals of Software Defined Radio (SDR) or Radio Frequency (RF) or Communications as advanced topics in electrical engineering in a self or instructor-led setting.
+* [PowerToys](https://github.com/microsoft/PowerToys) is a set of utilities for power users to tune and streamline their Windows experience for greater productivity.
+* [Pygame](https://www.pygame.org) is cross-platform and designed to make it easy to write multimedia software, such as games, in Python.
+* [R:RandomFieldsUtils](https://cran.r-project.org/web/packages/RandomFieldsUtils) provides various utilities that might be used in spatial statistics and elsewhere. (CRAN)
+* [RAxML](https://github.com/stamatak/standard-RAxML) is a tool for Phylogenetic Analysis and Post-Analysis of Large Phylogenies.
+* [ReHLDS](https://github.com/gennadykataev/rehlds) is fully compatible with latest Half-Life Dedicated Server (HLDS) with a lot of defects and (potential) bugs fixed.
+* [rkcommon](https://github.com/ospray/rkcommon) represents a common set of C++ infrastructure and CMake utilities used by various components of [Intel oneAPI Rendering Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/rendering-toolkit.html).
+* [RPCS3](https://github.com/RPCS3/rpcs3) is the world's first free and open-source PlayStation 3 emulator/debugger, written in C++.
+* [simd\_utils](https://github.com/JishinMaster/simd_utils) is a header-only library implementing common mathematical functions using SIMD intrinsics.
+* [Sire](https://github.com/OpenBioSim/sire) is a molecular modelling framework that provides extensive functionality to manipulate representations of biomolecular systems.
+* [SMhasher](https://github.com/rurban/smhasher) provides comprehensive Hash function quality and speed tests.
+* [SNN++](https://github.com/ianmkim/snnpp) implements a single layer non linear Spiking Neural Network for images classification and generation.
+* [Spack](https://github.com/spack/spack) is a multi-platform package manager that builds and installs multiple versions and configurations of software.
+* [SRA](https://github.com/ncbi/sra-tools) is a collection of tools and libraries for using data in the [INSDC Sequence Read Archives](https://www.ncbi.nlm.nih.gov/sra/docs/).
+* [srsLTE](https://github.com/srsLTE/srsLTE) is an open source SDR LTE software suite.
+* [SSW](https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library) is a fast implementation of the [Smith-Waterman algorithm](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm), which uses the SIMD instructions to parallelize the algorithm at the instruction level.
+* [Surge](https://github.com/surge-synthesizer/surge) is an open source digital synthesizer.
+* [The Forge](https://github.com/ConfettiFX/The-Forge) is a cross-platform rendering framework, providing building blocks to write your own game engine.
+* [Typesense](https://github.com/typesense/typesense) is a fast, typo-tolerant search engine for building delightful search experiences.
+* [Vcpkg](https://github.com/microsoft/vcpkg) is a C++ Library Manager for Windows, Linux, and macOS.
+* [VelocyPack](https://github.com/arangodb/velocypack) is a fast and compact format for serialization and storage.
+* [VOLK](https://github.com/gnuradio/volk), Vector-Optimized Library of Kernel, is a sub-project of [GNU Radio](https://www.gnuradio.org/).
+* [Vowpal Wabbit](https://github.com/VowpalWabbit/vowpal_wabbit) is a machine learning system which pushes the frontier of machine learning with techniques such as online, hashing, allreduce, reductions, learning2search, active, and interactive learning.
+* [Winter](https://github.com/rosenthj/Winter) is the top rated chess engine from Switzerland and has competed at top invite only computer chess events.
+* [XEVE](https://github.com/mpeg5/xeve) (eXtra-fast Essential Video Encoder) is an open sourced and fast MPEG-5 EVC encoder.
+* [XMRig](https://github.com/xmrig/xmrig) is an open source CPU miner for [Monero](https://web.getmonero.org/) cryptocurrency.
+* [xsimd](https://github.com/xtensor-stack/xsimd) provides a unified means for using SIMD intrinsics and parallelized, optimized mathematical functions.
+* [YACL](https://github.com/secretflow/yasl) is a C++ library containing modules and utilities which [SecretFlow](https://github.com/secretflow) code depends on.
+
+## Related Projects
+* [SIMDe](https://github.com/simd-everywhere/simde): fast and portable implementations of SIMD
+  intrinsics on hardware which doesn't natively support them, such as calling SSE functions on ARM.
+* [CatBoost's sse2neon](https://github.com/catboost/catboost/blob/master/library/cpp/sse/sse2neon.h)
+* [ARM\_NEON\_2\_x86\_SSE](https://github.com/intel/ARM_NEON_2_x86_SSE)
+* [AvxToNeon](https://github.com/kunpengcompute/AvxToNeon)
+* [sse2rvv](https://github.com/FeddrickAquino/sse2rvv): C header file that converts Intel SSE intrinsics to RISC-V Vector intrinsic.
+* [sse2msa](https://github.com/i-evi/sse2msa): A C/C++ header file that converts Intel SSE intrinsics to MIPS/MIPS64 MSA intrinsics.
+* [sse2zig](https://github.com/aqrit/sse2zig): Intel SSE intrinsics mapped to [Zig](https://ziglang.org/) vector extensions.
+* [POWER/PowerPC support for GCC](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000) contains a series of headers simplifying porting x86\_64 code that makes explicit use of Intel intrinsics to powerpc64le (pure little-endian mode that has been introduced with the [POWER8](https://en.wikipedia.org/wiki/POWER8)).
+    - implementation: [xmmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/xmmintrin.h), [emmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/emmintrin.h), [pmmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/pmmintrin.h), [tmmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/tmmintrin.h), [smmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/smmintrin.h)
+
+## Reference
+* [Intel Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html)
+* [Microsoft: x86 intrinsics list](https://learn.microsoft.com/en-us/cpp/intrinsics/x86-intrinsics-list)
+* [Arm Neon Intrinsics Reference](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)
+* [Neon Programmer's Guide for Armv8-A](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/neon-programmers-guide-for-armv8-a)
+* [NEON Programmer's Guide](https://static.docs.arm.com/den0018/a/DEN0018A_neon_programmers_guide_en.pdf)
+* [qemu/target/i386/ops\_sse.h](https://github.com/qemu/qemu/blob/master/target/i386/ops_sse.h): Comprehensive SSE instruction emulation in C. Ideal for semantic checks.
+* [Porting Takua Renderer to 64-bit ARM- Part 1](https://blog.yiningkarlli.com/2021/05/porting-takua-to-arm-pt1.html)
+* [Porting Takua Renderer to 64-bit ARM- Part 2](https://blog.yiningkarlli.com/2021/07/porting-takua-to-arm-pt2.html)
+* [Comparing SIMD on x86-64 and arm64](https://blog.yiningkarlli.com/2021/09/neon-vs-sse.html)
+* [Port with SSE2Neon and SIMDe](https://developer.arm.com/documentation/102581/0200/Port-with-SSE2Neon-and-SIMDe)
+* [Genomics: Optimizing the BWA aligner for Arm Servers](https://community.arm.com/arm-community-blogs/b/high-performance-computing-blog/posts/optimizing-genomics-and-the-bwa-aligner-for-arm-servers)
+* [Bit twiddling with Arm Neon: beating SSE movemasks, counting bits and more](https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon)
+* [C/C++ on Graviton](https://github.com/aws/aws-graviton-getting-started/blob/main/c-c%2B%2B.md)
+* [C/C++ on NVIDIA Grace](https://nvidia.github.io/grace-cpu-benchmarking-guide/developer/languages/c-c++.html)
+* [Tune graphics-intensive games for Apple silicon](https://developer.apple.com/games/pathway/)
+* [Benchmarking and Testing of Qualcomm Snapdragon System-on-Chip for JPL Space Applications and Missions](https://ieeexplore.ieee.org/abstract/document/9843518)
+* [Spotlight: Petrobras Speeds Up Linear Solvers for Reservoir Simulation Using NVIDIA Grace CPU](https://developer.nvidia.com/blog/spotlight-petrobras-accelerates-linear-solvers-for-reservoir-simulation-using-nvidia-grace-cpu/)
+
+## Licensing
+
+`sse2neon` is freely redistributable under the MIT License.
--- a/deps/sse2neon/sse2neon.h
+++ b/deps/sse2neon/sse2neon.h
--- a/deps/sse2neon/sse2neon.sln
+++ b/deps/sse2neon/sse2neon.sln
@ -0,0 +1,37 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 17
+VisualStudioVersion = 17.3.32901.215
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sse2neon", "sse2neon.vcxproj", "{341BF194-865B-4DA6-8120-93173498E774}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|ARM = Debug|ARM
+		Debug|ARM64 = Debug|ARM64
+		Debug|ARM64EC = Debug|ARM64EC
+		Release|ARM = Release|ARM
+		Release|ARM64 = Release|ARM64
+		Release|ARM64EC = Release|ARM64EC
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{341BF194-865B-4DA6-8120-93173498E774}.Debug|ARM.ActiveCfg = Debug|ARM
+		{341BF194-865B-4DA6-8120-93173498E774}.Debug|ARM.Build.0 = Debug|ARM
+		{341BF194-865B-4DA6-8120-93173498E774}.Debug|ARM64.ActiveCfg = Debug|ARM64
+		{341BF194-865B-4DA6-8120-93173498E774}.Debug|ARM64.Build.0 = Debug|ARM64
+		{341BF194-865B-4DA6-8120-93173498E774}.Debug|ARM64EC.ActiveCfg = Debug|ARM64EC
+		{341BF194-865B-4DA6-8120-93173498E774}.Debug|ARM64EC.Build.0 = Debug|ARM64EC
+		{341BF194-865B-4DA6-8120-93173498E774}.Release|ARM.ActiveCfg = Release|ARM
+		{341BF194-865B-4DA6-8120-93173498E774}.Release|ARM.Build.0 = Release|ARM
+		{341BF194-865B-4DA6-8120-93173498E774}.Release|ARM64.ActiveCfg = Release|ARM64
+		{341BF194-865B-4DA6-8120-93173498E774}.Release|ARM64.Build.0 = Release|ARM64
+		{341BF194-865B-4DA6-8120-93173498E774}.Release|ARM64EC.ActiveCfg = Release|ARM64EC
+		{341BF194-865B-4DA6-8120-93173498E774}.Release|ARM64EC.Build.0 = Release|ARM64EC
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {D503B299-AA05-4E05-A8D9-37C8D229ACB1}
+	EndGlobalSection
+EndGlobal
--- a/deps/sse2neon/sse2neon.vcxproj
+++ b/deps/sse2neon/sse2neon.vcxproj
@ -0,0 +1,217 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|ARM">
+      <Configuration>Debug</Configuration>
+      <Platform>ARM</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|ARM64">
+      <Configuration>Debug</Configuration>
+      <Platform>ARM64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|ARM64EC">
+      <Configuration>Debug</Configuration>
+      <Platform>ARM64EC</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|ARM">
+      <Configuration>Release</Configuration>
+      <Platform>ARM</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|ARM64">
+      <Configuration>Release</Configuration>
+      <Platform>ARM64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|ARM64EC">
+      <Configuration>Release</Configuration>
+      <Platform>ARM64EC</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>16.0</VCProjectVersion>
+    <Keyword>Win32Proj</Keyword>
+    <ProjectGuid>{341bf194-865b-4da6-8120-93173498e774}</ProjectGuid>
+    <RootNamespace>sse2neon</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64EC'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM64EC'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64EC'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM64EC'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>__i386__;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <AdditionalIncludeDirectories>.;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>__i386__;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <AdditionalIncludeDirectories>.;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <!-- Compiler and linker settings for the Debug|ARM64 configuration. -->
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <AdditionalIncludeDirectories>.;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <!-- Append to inherited options instead of replacing them, matching the Release configurations. -->
+      <AdditionalOptions>/Zc:preprocessor %(AdditionalOptions)</AdditionalOptions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <!-- Compiler and linker settings for the Debug|ARM64EC configuration. -->
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64EC'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <AdditionalIncludeDirectories>.;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <!-- Append to inherited options instead of replacing them, matching the Release configurations. -->
+      <AdditionalOptions>/Zc:preprocessor %(AdditionalOptions)</AdditionalOptions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <LanguageStandard>stdcpp20</LanguageStandard>
+      <PreprocessToFile>false</PreprocessToFile>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <AdditionalIncludeDirectories>.;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalOptions>/Zc:preprocessor %(AdditionalOptions)</AdditionalOptions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM64EC'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <AdditionalIncludeDirectories>.;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalOptions>/Zc:preprocessor %(AdditionalOptions)</AdditionalOptions>
+      <PreprocessToFile>false</PreprocessToFile>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="tests\binding.cpp">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64EC'">
+      </ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="tests\common.cpp" />
+    <ClCompile Include="tests\impl.cpp" />
+    <ClCompile Include="tests\main.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="sse2neon.h" />
+    <ClInclude Include="tests\binding.h" />
+    <ClInclude Include="tests\common.h" />
+    <ClInclude Include="tests\impl.h" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/deps/sse2neon/sse2neon.vcxproj.filters
+++ b/deps/sse2neon/sse2neon.vcxproj.filters
@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
+      <Extensions>cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
+      <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>
+    </Filter>
+    <Filter Include="Resource Files">
+      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
+      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="tests\binding.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="tests\common.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="tests\impl.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="tests\main.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="tests\binding.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="tests\common.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="tests\impl.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="sse2neon.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+  </ItemGroup>
+</Project>
--- a/deps/sse2neon/tests/README.md
+++ b/deps/sse2neon/tests/README.md
@ -0,0 +1,29 @@
+# Test Suite for SSE2NEON
+
+:warning: **Warning: The test suite assumes a little-endian architecture.**
+
+## Add More Test Items
+Once a conversion is implemented, add its test with the following steps:
+
+* File `tests/impl.h`
+
+  Add the intrinsic to the `INTRIN_LIST` macro, following the `mm_xxx`
+  naming convention.
+  Place it under the correct classification, in alphabetical order.
+  The classifications can be looked up in the [Intel Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html).
+
+* File `tests/impl.cpp`
+    ```c
+    result_t test_mm_xxx()
+    {
+        // The C implementation
+        ...
+
+        // The Neon implementation
+        ret = _mm_xxx();
+
+        // Compare the result of two implementations and return either
+        // TEST_SUCCESS, TEST_FAIL, or TEST_UNIMPL
+        ...
+    }
+    ```
--- a/deps/sse2neon/tests/binding.cpp
+++ b/deps/sse2neon/tests/binding.cpp
@ -0,0 +1,35 @@
+#include "binding.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+namespace SSE2NEON
+{
+// Allocates `size` bytes aligned to a 16-byte boundary.
+//
+// If the platform allocator reports a failure, an error message naming this
+// file and line is written to stderr and the process exits with
+// EXIT_FAILURE, so the function returns only when the allocator reported
+// success.
+void *platformAlignedAlloc(size_t size)
+{
+#if defined(_WIN32)
+    void *block = _aligned_malloc(size, 16);
+    const bool failed = (block == NULL);
+#else
+    void *block;
+    const bool failed = (posix_memalign(&block, 16, size) != 0);
+#endif
+    if (failed) {
+        fprintf(stderr, "Error at File %s line number %d\n", __FILE__,
+                __LINE__);
+        exit(EXIT_FAILURE);
+    }
+    return block;
+}
+
+// Releases a block with the deallocator that matches the platform branch
+// used for allocation: free() everywhere except Windows, where
+// _aligned_free() is used.
+void platformAlignedFree(void *ptr)
+{
+#if !defined(_WIN32)
+    free(ptr);
+#else
+    _aligned_free(ptr);
+#endif
+}
+
+
+}  // namespace SSE2NEON
--- a/deps/sse2neon/tests/binding.h
+++ b/deps/sse2neon/tests/binding.h
@ -0,0 +1,19 @@
+#ifndef SSE2NEONBINDING_H
+#define SSE2NEONBINDING_H
+
+#include <stdlib.h>
+
+// The SSE2NEON unit tests run both within our own internal project
+// as well as within the open source framework.
+// This header file is used to abstract any distinctions between
+// those two build environments.
+//
+// Initially, this is for how 16 byte aligned memory is allocated
+namespace SSE2NEON
+{
+// Allocates `size` bytes of 16-byte aligned memory.
+void *platformAlignedAlloc(size_t size);
+// Releases memory previously obtained from platformAlignedAlloc.
+void platformAlignedFree(void *ptr);
+
+}  // namespace SSE2NEON
+
+#endif
--- a/deps/sse2neon/tests/common.cpp
+++ b/deps/sse2neon/tests/common.cpp
@ -0,0 +1,414 @@
+#include "common.h"
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+
+namespace SSE2NEON
+{
+// Integer values with every bit set (~0).
+// NOTE(review): the names suggest the tests reinterpret these as NaN bit
+// patterns - confirm against the callers.
+int32_t NaN = ~0;
+int64_t NaN64 = ~0;
+
+// Compares the two signed 64-bit lanes of `a`, in memory order, with i0 and
+// i1. Returns TEST_SUCCESS when both match and TEST_FAIL otherwise.
+result_t validateInt64(__m128i a, int64_t i0, int64_t i1)
+{
+    const int64_t *lanes = (const int64_t *) &a;
+    const bool matches = (lanes[0] == i0) && (lanes[1] == i1);
+    return matches ? TEST_SUCCESS : TEST_FAIL;
+}
+
+// Compares the 64-bit content of `a`, read as a signed integer, with i0.
+// Returns TEST_SUCCESS on a match and TEST_FAIL otherwise.
+result_t validateInt64(__m64 a, int64_t i0)
+{
+    const int64_t actual = *((const int64_t *) &a);
+    return (actual == i0) ? TEST_SUCCESS : TEST_FAIL;
+}
+
+// Compares the two unsigned 64-bit lanes of `a`, in memory order, with u0
+// and u1. Returns TEST_SUCCESS when both match and TEST_FAIL otherwise.
+result_t validateUInt64(__m128i a, uint64_t u0, uint64_t u1)
+{
+    const uint64_t *lanes = (const uint64_t *) &a;
+    const bool matches = (lanes[0] == u0) && (lanes[1] == u1);
+    return matches ? TEST_SUCCESS : TEST_FAIL;
+}
+
+// Compares the 64-bit content of `a`, read as an unsigned integer, with u0.
+// Returns TEST_SUCCESS on a match and TEST_FAIL otherwise.
+result_t validateUInt64(__m64 a, uint64_t u0)
+{
+    const uint64_t actual = *((const uint64_t *) &a);
+    return (actual == u0) ? TEST_SUCCESS : TEST_FAIL;
+}
+
+// Compares the four signed 32-bit lanes of `a`, in memory order, with
+// i0..i3. Returns TEST_FAIL at the first mismatching lane, TEST_SUCCESS when
+// all lanes match.
+result_t validateInt32(__m128i a,
+                       int32_t i0,
+                       int32_t i1,
+                       int32_t i2,
+                       int32_t i3)
+{
+    const int32_t *lanes = (const int32_t *) &a;
+    const int32_t expected[4] = {i0, i1, i2, i3};
+    for (int lane = 0; lane < 4; lane++) {
+        if (lanes[lane] != expected[lane])
+            return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+}
+
+// Compares the four unsigned 32-bit lanes of `a`, in memory order, with
+// u0..u3. Returns TEST_FAIL at the first mismatching lane, TEST_SUCCESS when
+// all lanes match.
+result_t validateUInt32(__m128i a,
+                        uint32_t u0,
+                        uint32_t u1,
+                        uint32_t u2,
+                        uint32_t u3)
+{
+    const uint32_t *lanes = (const uint32_t *) &a;
+    const uint32_t expected[4] = {u0, u1, u2, u3};
+    for (int lane = 0; lane < 4; lane++) {
+        if (lanes[lane] != expected[lane])
+            return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+}
+
+// Compares the two unsigned 32-bit lanes of `a`, in memory order, with u0
+// and u1. Returns TEST_SUCCESS when both match and TEST_FAIL otherwise.
+result_t validateUInt32(__m64 a, uint32_t u0, uint32_t u1)
+{
+    const uint32_t *lanes = (const uint32_t *) &a;
+    const bool matches = (lanes[0] == u0) && (lanes[1] == u1);
+    return matches ? TEST_SUCCESS : TEST_FAIL;
+}
+
+// Compares the eight signed 16-bit lanes of `a`, in memory order, with
+// i0..i7. Returns TEST_FAIL at the first mismatching lane, TEST_SUCCESS when
+// all lanes match.
+result_t validateInt16(__m128i a,
+                       int16_t i0,
+                       int16_t i1,
+                       int16_t i2,
+                       int16_t i3,
+                       int16_t i4,
+                       int16_t i5,
+                       int16_t i6,
+                       int16_t i7)
+{
+    const int16_t *lanes = (const int16_t *) &a;
+    const int16_t expected[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+    for (int lane = 0; lane < 8; lane++) {
+        if (lanes[lane] != expected[lane])
+            return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+}
+
+// Compares the four signed 16-bit lanes of `a`, in memory order, with
+// i0..i3. Returns TEST_FAIL at the first mismatching lane, TEST_SUCCESS when
+// all lanes match.
+result_t validateInt16(__m64 a, int16_t i0, int16_t i1, int16_t i2, int16_t i3)
+{
+    const int16_t *lanes = (const int16_t *) &a;
+    const int16_t expected[4] = {i0, i1, i2, i3};
+    for (int lane = 0; lane < 4; lane++) {
+        if (lanes[lane] != expected[lane])
+            return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+}
+
+// Compares the eight unsigned 16-bit lanes of `a`, in memory order, with
+// u0..u7. Returns TEST_FAIL at the first mismatching lane, TEST_SUCCESS when
+// all lanes match.
+result_t validateUInt16(__m128i a,
+                        uint16_t u0,
+                        uint16_t u1,
+                        uint16_t u2,
+                        uint16_t u3,
+                        uint16_t u4,
+                        uint16_t u5,
+                        uint16_t u6,
+                        uint16_t u7)
+{
+    const uint16_t *lanes = (const uint16_t *) &a;
+    const uint16_t expected[8] = {u0, u1, u2, u3, u4, u5, u6, u7};
+    for (int lane = 0; lane < 8; lane++) {
+        if (lanes[lane] != expected[lane])
+            return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+}
+
+// Compares the two signed 32-bit lanes of `a`, in memory order, with u0 and
+// u1. Returns TEST_SUCCESS when both match and TEST_FAIL otherwise.
+result_t validateInt32(__m64 a, int32_t u0, int32_t u1)
+{
+    const int32_t *lanes = (const int32_t *) &a;
+    const bool matches = (lanes[0] == u0) && (lanes[1] == u1);
+    return matches ? TEST_SUCCESS : TEST_FAIL;
+}
+
+// Compares the four unsigned 16-bit lanes of `a`, in memory order, with
+// u0..u3. Returns TEST_FAIL at the first mismatching lane, TEST_SUCCESS when
+// all lanes match.
+result_t validateUInt16(__m64 a,
+                        uint16_t u0,
+                        uint16_t u1,
+                        uint16_t u2,
+                        uint16_t u3)
+{
+    const uint16_t *lanes = (const uint16_t *) &a;
+    const uint16_t expected[4] = {u0, u1, u2, u3};
+    for (int lane = 0; lane < 4; lane++) {
+        if (lanes[lane] != expected[lane])
+            return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+}
+
+// Compares the sixteen signed 8-bit lanes of `a`, in memory order, with
+// i0..i15. Returns TEST_FAIL at the first mismatching lane, TEST_SUCCESS
+// when all lanes match.
+result_t validateInt8(__m128i a,
+                      int8_t i0,
+                      int8_t i1,
+                      int8_t i2,
+                      int8_t i3,
+                      int8_t i4,
+                      int8_t i5,
+                      int8_t i6,
+                      int8_t i7,
+                      int8_t i8,
+                      int8_t i9,
+                      int8_t i10,
+                      int8_t i11,
+                      int8_t i12,
+                      int8_t i13,
+                      int8_t i14,
+                      int8_t i15)
+{
+    const int8_t *lanes = (const int8_t *) &a;
+    const int8_t expected[16] = {i0, i1, i2,  i3,  i4,  i5,  i6,  i7,
+                                 i8, i9, i10, i11, i12, i13, i14, i15};
+    for (int lane = 0; lane < 16; lane++) {
+        if (lanes[lane] != expected[lane])
+            return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+}
+
+// Compares the eight signed 8-bit lanes of `a`, in memory order, with
+// i0..i7. Returns TEST_FAIL at the first mismatching lane, TEST_SUCCESS when
+// all lanes match.
+result_t validateInt8(__m64 a,
+                      int8_t i0,
+                      int8_t i1,
+                      int8_t i2,
+                      int8_t i3,
+                      int8_t i4,
+                      int8_t i5,
+                      int8_t i6,
+                      int8_t i7)
+{
+    const int8_t *lanes = (const int8_t *) &a;
+    const int8_t expected[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+    for (int lane = 0; lane < 8; lane++) {
+        if (lanes[lane] != expected[lane])
+            return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+}
+
+// Compares the sixteen unsigned 8-bit lanes of `a`, in memory order, with
+// u0..u15. Returns TEST_FAIL at the first mismatching lane, TEST_SUCCESS
+// when all lanes match.
+result_t validateUInt8(__m128i a,
+                       uint8_t u0,
+                       uint8_t u1,
+                       uint8_t u2,
+                       uint8_t u3,
+                       uint8_t u4,
+                       uint8_t u5,
+                       uint8_t u6,
+                       uint8_t u7,
+                       uint8_t u8,
+                       uint8_t u9,
+                       uint8_t u10,
+                       uint8_t u11,
+                       uint8_t u12,
+                       uint8_t u13,
+                       uint8_t u14,
+                       uint8_t u15)
+{
+    const uint8_t *lanes = (const uint8_t *) &a;
+    const uint8_t expected[16] = {u0, u1, u2,  u3,  u4,  u5,  u6,  u7,
+                                  u8, u9, u10, u11, u12, u13, u14, u15};
+    for (int lane = 0; lane < 16; lane++) {
+        if (lanes[lane] != expected[lane])
+            return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+}
+
+// Compares the eight unsigned 8-bit lanes of `a`, in memory order, with
+// u0..u7. Returns TEST_FAIL at the first mismatching lane, TEST_SUCCESS when
+// all lanes match.
+result_t validateUInt8(__m64 a,
+                       uint8_t u0,
+                       uint8_t u1,
+                       uint8_t u2,
+                       uint8_t u3,
+                       uint8_t u4,
+                       uint8_t u5,
+                       uint8_t u6,
+                       uint8_t u7)
+{
+    const uint8_t *lanes = (const uint8_t *) &a;
+    const uint8_t expected[8] = {u0, u1, u2, u3, u4, u5, u6, u7};
+    for (int lane = 0; lane < 8; lane++) {
+        if (lanes[lane] != expected[lane])
+            return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+}
+
+// Returns TEST_SUCCESS when `a` and `b` have identical 32-bit encodings and
+// TEST_FAIL otherwise.
+result_t validateSingleFloatPair(float a, float b)
+{
+    // Compare the raw bits rather than the floating-point values so that
+    // NaNs and infinities are taken into account as well. The bits are
+    // copied out with memcpy because reading a float through a uint32_t
+    // pointer violates the strict-aliasing rule.
+    uint32_t ua;
+    uint32_t ub;
+    std::memcpy(&ua, &a, sizeof(ua));
+    std::memcpy(&ub, &b, sizeof(ub));
+    return ua == ub ? TEST_SUCCESS : TEST_FAIL;
+}
+
+// Returns TEST_SUCCESS when `a` and `b` are both NaN (whatever their
+// payloads) or have identical 64-bit encodings, and TEST_FAIL otherwise.
+result_t validateSingleDoublePair(double a, double b)
+{
+    // Any two NaNs are accepted as equal, regardless of sign or payload.
+    if (std::isnan(a) && std::isnan(b)) {
+        return TEST_SUCCESS;
+    }
+
+    // Everything else is compared by raw bits rather than by floating-point
+    // value, so infinities and signed zeros must match exactly. The bits are
+    // copied out with memcpy because reading a double through a uint64_t
+    // pointer violates the strict-aliasing rule.
+    uint64_t ua;
+    uint64_t ub;
+    std::memcpy(&ua, &a, sizeof(ua));
+    std::memcpy(&ub, &b, sizeof(ub));
+    return ua == ub ? TEST_SUCCESS : TEST_FAIL;
+}
+
+// Compares the four float lanes of `a`, in memory order, with f0..f3 using
+// validateSingleFloatPair. Returns TEST_FAIL at the first lane that does not
+// match, TEST_SUCCESS when all lanes match.
+result_t validateFloat(__m128 a, float f0, float f1, float f2, float f3)
+{
+    const float *lanes = (const float *) &a;
+    const float expected[4] = {f0, f1, f2, f3};
+    for (int lane = 0; lane < 4; lane++) {
+        if (!validateSingleFloatPair(lanes[lane], expected[lane]))
+            return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+}
+
+// Checks that each float lane of `a` differs from its expected value
+// (f0..f3, in memory order) by strictly less than `epsilon`.
+//
+// A lane where both values are NaN, or both are zero, is treated as having a
+// difference of zero. Returns TEST_FAIL at the first lane outside the
+// tolerance and TEST_SUCCESS otherwise.
+result_t validateFloatEpsilon(__m128 a,
+                              float f0,
+                              float f1,
+                              float f2,
+                              float f3,
+                              float epsilon)
+{
+    const float *lanes = (const float *) &a;
+    const float expected[4] = {f0, f1, f2, f3};
+
+    for (int lane = 0; lane < 4; lane++) {
+        const float got = lanes[lane];
+        const float want = expected[lane];
+        float diff = fabsf(got - want);
+
+        // Subtracting NaNs yields NaN, which would fail the comparison
+        // below, so a NaN/NaN pair (and a zero/zero pair) is defined to have
+        // no difference.
+        const bool bothNaN = std::isnan(got) && std::isnan(want);
+        const bool bothZero = (got == 0 && want == 0);
+        if (bothNaN || bothZero) {
+            diff = 0;
+        }
+
+        // Written as !(diff < epsilon) so that a NaN difference is rejected.
+        if (!(diff < epsilon))
+            return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+}
+
+// Checks that the relative error of each float lane of `a` against its
+// expected value (f0..f3, in memory order) is strictly less than `err`.
+//
+// A lane where both values are NaN, both are zero, or both are infinite
+// (the sign of the infinities is not compared) is treated as having no
+// error. Returns TEST_FAIL at the first lane outside the tolerance and
+// TEST_SUCCESS otherwise.
+result_t validateFloatError(__m128 a,
+                            float f0,
+                            float f1,
+                            float f2,
+                            float f3,
+                            float err)
+{
+    const float *lanes = (const float *) &a;
+    const float expected[4] = {f0, f1, f2, f3};
+
+    for (int lane = 0; lane < 4; lane++) {
+        const float got = lanes[lane];
+        const float want = expected[lane];
+        float relErr = fabsf((got - want) / want);
+
+        const bool bothNaN = std::isnan(got) && std::isnan(want);
+        const bool bothZero = (got == 0 && want == 0);
+        const bool bothInf = std::isinf(got) && std::isinf(want);
+        if (bothNaN || bothZero || bothInf) {
+            relErr = 0;
+        }
+
+        // Written as !(relErr < err) so that a NaN error is rejected.
+        if (!(relErr < err))
+            return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+}
+
+// Compares the two double lanes of `a`, in memory order, with d0 and d1
+// using validateSingleDoublePair. Returns TEST_FAIL at the first lane that
+// does not match, TEST_SUCCESS when both lanes match.
+result_t validateDouble(__m128d a, double d0, double d1)
+{
+    const double *lanes = (const double *) &a;
+    const double expected[2] = {d0, d1};
+    for (int lane = 0; lane < 2; lane++) {
+        if (!validateSingleDoublePair(lanes[lane], expected[lane]))
+            return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+}
+
+// Checks that the relative error of each double lane of `a` against its
+// expected value (d0 and d1, in memory order) is strictly less than `err`.
+//
+// A lane where both values are NaN, both are zero, or both are infinite
+// (the sign of the infinities is not compared) is treated as having no
+// error, matching the single-precision overload. Returns TEST_FAIL at the
+// first lane outside the tolerance and TEST_SUCCESS otherwise.
+result_t validateFloatError(__m128d a, double d0, double d1, double err)
+{
+    const double *t = (const double *) &a;
+    const double expected[2] = {d0, d1};
+
+    for (int lane = 0; lane < 2; lane++) {
+        const double got = t[lane];
+        const double want = expected[lane];
+        double relErr = fabs((got - want) / want);
+
+        // The quotient is NaN for NaN/NaN, 0/0 and inf/inf pairs, and NaN
+        // fails the `<` comparison below, so these exact matches are given
+        // an error of zero explicitly.
+        if ((std::isnan(got) && std::isnan(want)) ||
+            (got == 0 && want == 0) ||
+            (std::isinf(got) && std::isinf(want))) {
+            relErr = 0;
+        }
+
+        ASSERT_RETURN(relErr < err);
+    }
+    return TEST_SUCCESS;
+}
+
+}  // namespace SSE2NEON
--- a/deps/sse2neon/tests/common.h
+++ b/deps/sse2neon/tests/common.h
@ -0,0 +1,527 @@
+#ifndef SSE2NEONCOMMON_H
+#define SSE2NEONCOMMON_H
+#include <cstdint>
+#include <string.h>
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \
+    defined(__arm__)
+#include "sse2neon.h"
+#elif defined(__x86_64__) || defined(__i386__)
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
+#include <wmmintrin.h>
+#include <x86intrin.h>
+#include <xmmintrin.h>
+
+// The Intrinsics Guide uses the type name __int64, whose underlying type
+// depends on the data model. Provide it as a macro unless we are on Windows
+// or a macro of that name already exists.
+// NOTE(review): this block sits inside the x86 `#elif` branch above, which
+// tests the same __x86_64__ / __i386__ macros, so the inner `#else`
+// (int64_t) looks unreachable here -- confirm.
+#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
+#if (defined(__x86_64__) || defined(__i386__))
+#define __int64 long long
+#else
+#define __int64 int64_t
+#endif
+#endif
+
+// ALIGN_STRUCT(x) expands to the compiler-specific alignment specifier. On
+// GCC/Clang any existing ALIGN_STRUCT definition is saved with push_macro
+// here and restored with pop_macro right after the union below.
+#if defined(__GNUC__) || defined(__clang__)
+#pragma push_macro("ALIGN_STRUCT")
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+#else
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+
+// 16-byte-aligned union exposing the same 16 bytes as arrays of each lane
+// type. Only declared on the x86 branch of the surrounding #if/#elif.
+typedef union ALIGN_STRUCT(16) SIMDVec {
+    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
+    int8_t m128_i8[16];    // as signed 8-bit integers.
+    int16_t m128_i16[8];   // as signed 16-bit integers.
+    int32_t m128_i32[4];   // as signed 32-bit integers.
+    int64_t m128_i64[2];   // as signed 64-bit integers.
+    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
+    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
+    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
+    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
+} SIMDVec;
+
+// Restore whatever ALIGN_STRUCT meant before this header (GCC/Clang only).
+#if defined(__GNUC__) || defined(__clang__)
+#pragma pop_macro("ALIGN_STRUCT")
+#endif
+
+/* Tunable testing configuration for precise testing */
+/* _mm_min|max_ps|ss|pd|sd */
+/* Defaults to 0 unless it was defined before this point. This default is
+ * only provided on the x86 branch of the surrounding #if/#elif.
+ * NOTE(review): presumably the ARM branch gets the macro from sse2neon.h --
+ * confirm. */
+#ifndef SSE2NEON_PRECISE_MINMAX
+#define SSE2NEON_PRECISE_MINMAX (0)
+#endif
+#endif
+
+// Early-return helper for validators: when condition `x` is false the
+// enclosing function returns TEST_FAIL. TEST_FAIL is named unqualified, so the
+// macro must be used where SSE2NEON::TEST_FAIL is visible and the enclosing
+// function's return type accepts it.
+// NOTE(review): the expansion is a bare `if` statement that already ends in
+// `;`, so using it as the body of an unbraced if/else would mis-bind the
+// `else`. Left as is because call sites outside this header are not visible.
+#define ASSERT_RETURN(x) \
+    if (!(x))            \
+        return TEST_FAIL;
+
+namespace SSE2NEON
+{
+// Result code returned by the validation helpers in this header.
+// TEST_FAIL is 0, so `!result` is true only for a failure; TEST_UNIMPL is
+// non-zero and therefore is not treated as a failure by ASSERT_RETURN.
+enum result_t {
+    TEST_SUCCESS = 1,
+    TEST_FAIL = 0,
+    // NOTE(review): presumably marks a test that is not implemented -- confirm.
+    TEST_UNIMPL = -1,
+};
+// Declared here, defined in another translation unit.
+// NOTE(review): presumably these hold the integer bit patterns of a float /
+// double NaN -- confirm against the definitions.
+extern int32_t NaN;
+extern int64_t NaN64;
+
+// OPTNONE is a function attribute that turns optimization off for the
+// annotated function: optimize("O0") on GCC, optnone on Clang, and nothing on
+// other compilers. On GCC/Clang the previous definition is saved with
+// push_macro; no matching pop_macro("OPTNONE") appears in this header.
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma push_macro("OPTNONE")
+#define OPTNONE __attribute__((optimize("O0")))
+#elif defined(__clang__)
+#pragma push_macro("OPTNONE")
+#define OPTNONE __attribute__((optnone))
+#else
+#define OPTNONE
+#endif
+
+// Bit-pattern reinterpretation helpers. Each one copies the object
+// representation of its argument into a value of the target type with
+// memcpy(), so no pointer-cast type punning is involved.
+//
+// memcpy() comes from <string.h>, which is included at the top of this header
+// at global scope. It must not be included here: this point is inside
+// namespace SSE2NEON, and a first inclusion here would declare the C library
+// inside that namespace while the include guard blocks any later global
+// inclusion.
+
+// Returns the double whose 64-bit representation is `u64`.
+static inline double sse2neon_tool_recast_f64(uint64_t u64)
+{
+    double f64;
+    memcpy(&f64, &u64, sizeof(uint64_t));
+    return f64;
+}
+
+// Returns the 64-bit representation of `f64` as a signed integer.
+static inline int64_t sse2neon_tool_recast_i64(double f64)
+{
+    int64_t i64;
+    memcpy(&i64, &f64, sizeof(int64_t));
+    return i64;
+}
+
+// Returns the float whose 32-bit representation is `u32`.
+static inline float sse2neon_tool_recast_f32(uint32_t u32)
+{
+    float f32;
+    memcpy(&f32, &u32, sizeof(uint32_t));
+    return f32;
+}
+
+// Returns the float whose 32-bit representation is `i32` (signed overload).
+static inline float sse2neon_tool_recast_f32(int32_t i32)
+{
+    float f32;
+    memcpy(&f32, &i32, sizeof(int32_t));
+    return f32;
+}
+// float / double values whose 32 / 64 bits are all set, built by recasting
+// UINT32_MAX / UINT64_MAX. The macros expand to a function call, so they are
+// not constant expressions.
+#define ALL_BIT_1_32 sse2neon_tool_recast_f32(UINT32_MAX)
+#define ALL_BIT_1_64 sse2neon_tool_recast_f64(UINT64_MAX)
+
+// Returns TEST_SUCCESS when the first 16 bytes (4 x int32_t) of `a` and `b`
+// are bit-for-bit identical, TEST_FAIL otherwise. Exactly 16 bytes are
+// compared regardless of T, so T must be at least that large -- presumably a
+// 128-bit vector type, going by the function name.
+template <typename T>
+result_t validate128(T a, T b)
+{
+    // Compare the object representations with memcmp() instead of reading the
+    // vectors through int32_t pointers, which would violate strict aliasing.
+    // Equality of all 16 bytes is exactly equality of the four 32-bit lanes.
+    ASSERT_RETURN(memcmp(&a, &b, 4 * sizeof(int32_t)) == 0);
+    return TEST_SUCCESS;
+}
+// Validators for 64-bit integer lanes (128-bit and 64-bit vector overloads).
+// Declarations only; the definitions are not in this header.
+// NOTE(review): presumably each compares the lanes of `a` with the expected
+// values in ascending lane order and returns TEST_SUCCESS / TEST_FAIL --
+// confirm against the implementation.
+result_t validateInt64(__m128i a, int64_t i0, int64_t i1);
+result_t validateInt64(__m64 a, int64_t i0);
+result_t validateUInt64(__m128i a, uint64_t u0, uint64_t u1);
+result_t validateUInt64(__m64 a, uint64_t u0);
+// Validators for 32-bit integer lanes: four expected values for __m128i, two
+// for __m64. Declarations only; the definitions are not in this header.
+// NOTE(review): presumably each compares the lanes of `a` with the expected
+// values in ascending lane order -- confirm against the implementation.
+result_t validateInt32(__m128i a,
+                       int32_t i0,
+                       int32_t i1,
+                       int32_t i2,
+                       int32_t i3);
+result_t validateUInt32(__m128i a,
+                        uint32_t u0,
+                        uint32_t u1,
+                        uint32_t u2,
+                        uint32_t u3);
+result_t validateUInt32(__m64 a, uint32_t u0, uint32_t u1);
+// The parameters are named u0/u1 although they are signed int32_t values.
+result_t validateInt32(__m64 a, int32_t u0, int32_t u1);
+// Validators for 16-bit integer lanes: eight expected values for __m128i,
+// four for __m64. Declarations only; the definitions are not in this header.
+// NOTE(review): presumably each compares the lanes of `a` with the expected
+// values in ascending lane order -- confirm against the implementation.
+result_t validateInt16(__m128i a,
+                       int16_t i0,
+                       int16_t i1,
+                       int16_t i2,
+                       int16_t i3,
+                       int16_t i4,
+                       int16_t i5,
+                       int16_t i6,
+                       int16_t i7);
+result_t validateInt16(__m64 a, int16_t i0, int16_t i1, int16_t i2, int16_t i3);
+result_t validateUInt16(__m128i a,
+                        uint16_t u0,
+                        uint16_t u1,
+                        uint16_t u2,
+                        uint16_t u3,
+                        uint16_t u4,
+                        uint16_t u5,
+                        uint16_t u6,
+                        uint16_t u7);
+result_t validateUInt16(__m64 a,
+                        uint16_t u0,
+                        uint16_t u1,
+                        uint16_t u2,
+                        uint16_t u3);
+// Validators for 8-bit integer lanes: sixteen expected values for __m128i,
+// eight for __m64. Declarations only; the definitions are not in this header.
+// NOTE(review): presumably each compares the lanes of `a` with the expected
+// values in ascending lane order -- confirm against the implementation.
+result_t validateInt8(__m128i a,
+                      int8_t i0,
+                      int8_t i1,
+                      int8_t i2,
+                      int8_t i3,
+                      int8_t i4,
+                      int8_t i5,
+                      int8_t i6,
+                      int8_t i7,
+                      int8_t i8,
+                      int8_t i9,
+                      int8_t i10,
+                      int8_t i11,
+                      int8_t i12,
+                      int8_t i13,
+                      int8_t i14,
+                      int8_t i15);
+result_t validateInt8(__m64 a,
+                      int8_t i0,
+                      int8_t i1,
+                      int8_t i2,
+                      int8_t i3,
+                      int8_t i4,
+                      int8_t i5,
+                      int8_t i6,
+                      int8_t i7);
+result_t validateUInt8(__m128i a,
+                       uint8_t u0,
+                       uint8_t u1,
+                       uint8_t u2,
+                       uint8_t u3,
+                       uint8_t u4,
+                       uint8_t u5,
+                       uint8_t u6,
+                       uint8_t u7,
+                       uint8_t u8,
+                       uint8_t u9,
+                       uint8_t u10,
+                       uint8_t u11,
+                       uint8_t u12,
+                       uint8_t u13,
+                       uint8_t u14,
+                       uint8_t u15);
+result_t validateUInt8(__m64 a,
+                       uint8_t u0,
+                       uint8_t u1,
+                       uint8_t u2,
+                       uint8_t u3,
+                       uint8_t u4,
+                       uint8_t u5,
+                       uint8_t u6,
+                       uint8_t u7);
+// Floating-point validators. Declarations only.
+// NOTE(review): validateSingleFloatPair / validateSingleDoublePair presumably
+// compare one scalar pair, and validateFloat / validateFloatEpsilon presumably
+// compare four float lanes (the latter within `epsilon`) -- confirm against
+// the implementation.
+result_t validateSingleFloatPair(float a, float b);
+result_t validateSingleDoublePair(double a, double b);
+result_t validateFloat(__m128 a, float f0, float f1, float f2, float f3);
+result_t validateFloatEpsilon(__m128 a,
+                              float f0,
+                              float f1,
+                              float f2,
+                              float f3,
+                              float epsilon);
+// Relative-error check: each lane passes when
+// |(actual - expected) / expected| is strictly less than `err`.
+result_t validateFloatError(__m128 a,
+                            float f0,
+                            float f1,
+                            float f2,
+                            float f3,
+                            float err);
+// Checks lane 0 against d0 and lane 1 against d1 with
+// validateSingleDoublePair().
+result_t validateDouble(__m128d a, double d0, double d1);
+// Double-precision overload of the relative-error check (two lanes).
+result_t validateFloatError(__m128d a, double d0, double d1, double err);
+
+// Convenience wrappers that expand an indexable expression B (array or
+// pointer) into the per-lane arguments of the matching validate*() overload;
+// A is the vector under test. B is parenthesized so that an expression such
+// as `ptr + 1` is indexed as a whole instead of expanding to `ptr + 1[0]`.
+// B is evaluated once per lane, so it must be free of side effects.
+#define VALIDATE_INT8_M128(A, B)                                             \
+    validateInt8(A, (B)[0], (B)[1], (B)[2], (B)[3], (B)[4], (B)[5], (B)[6],  \
+                 (B)[7], (B)[8], (B)[9], (B)[10], (B)[11], (B)[12], (B)[13], \
+                 (B)[14], (B)[15])
+#define VALIDATE_UINT8_M128(A, B)                                             \
+    validateUInt8(A, (B)[0], (B)[1], (B)[2], (B)[3], (B)[4], (B)[5], (B)[6],  \
+                  (B)[7], (B)[8], (B)[9], (B)[10], (B)[11], (B)[12], (B)[13], \
+                  (B)[14], (B)[15])
+#define VALIDATE_INT16_M128(A, B)                                            \
+    validateInt16(A, (B)[0], (B)[1], (B)[2], (B)[3], (B)[4], (B)[5], (B)[6], \
+                  (B)[7])
+#define VALIDATE_UINT16_M128(A, B)                                            \
+    validateUInt16(A, (B)[0], (B)[1], (B)[2], (B)[3], (B)[4], (B)[5], (B)[6], \
+                   (B)[7])
+#define VALIDATE_INT32_M128(A, B) \
+    validateInt32(A, (B)[0], (B)[1], (B)[2], (B)[3])
+#define VALIDATE_UINT32_M128(A, B) \
+    validateUInt32(A, (B)[0], (B)[1], (B)[2], (B)[3])
+
+#define VALIDATE_INT8_M64(A, B)                                             \
+    validateInt8(A, (B)[0], (B)[1], (B)[2], (B)[3], (B)[4], (B)[5], (B)[6], \
+                 (B)[7])
+#define VALIDATE_UINT8_M64(A, B)                                             \
+    validateUInt8(A, (B)[0], (B)[1], (B)[2], (B)[3], (B)[4], (B)[5], (B)[6], \
+                  (B)[7])
+#define VALIDATE_INT16_M64(A, B) \
+    validateInt16(A, (B)[0], (B)[1], (B)[2], (B)[3])
+#define VALIDATE_UINT16_M64(A, B) \
+    validateUInt16(A, (B)[0], (B)[1], (B)[2], (B)[3])
+#define VALIDATE_INT32_M64(A, B) validateInt32(A, (B)[0], (B)[1])
+#define VALIDATE_UINT32_M64(A, B) validateUInt32(A, (B)[0], (B)[1])
+// Makes the enclosing function return TEST_FAIL unless EXP evaluates to
+// TEST_SUCCESS. EXP is parenthesized so that an argument containing
+// lower-precedence operators (e.g. a ?: expression) is compared as a whole.
+#define CHECK_RESULT(EXP)        \
+    if ((EXP) != TEST_SUCCESS) { \
+        return TEST_FAIL;        \
+    }
+// Expands TEST_IMPL(i) for i = 0..1. TEST_IMPL is not defined in this header;
+// the user must define it before expanding any IMM_*_ITER macro.
+#define IMM_2_ITER \
+    TEST_IMPL(0)   \
+    TEST_IMPL(1)
+// Expands TEST_IMPL(i) for i = 0..3: IMM_2_ITER followed by 2 and 3.
+#define IMM_4_ITER \
+    IMM_2_ITER     \
+    TEST_IMPL(2)   \
+    TEST_IMPL(3)
+// Expands TEST_IMPL(i) for i = 0..7: IMM_4_ITER followed by 4..7.
+#define IMM_8_ITER \
+    IMM_4_ITER     \
+    TEST_IMPL(4)   \
+    TEST_IMPL(5)   \
+    TEST_IMPL(6)   \
+    TEST_IMPL(7)
+// Expands TEST_IMPL(i) for i = 0..15: IMM_8_ITER followed by 8..15.
+#define IMM_16_ITER \
+    IMM_8_ITER      \
+    TEST_IMPL(8)    \
+    TEST_IMPL(9)    \
+    TEST_IMPL(10)   \
+    TEST_IMPL(11)   \
+    TEST_IMPL(12)   \
+    TEST_IMPL(13)   \
+    TEST_IMPL(14)   \
+    TEST_IMPL(15)
+// Expands TEST_IMPL(i) for i = 0..31: IMM_16_ITER followed by 16..31.
+#define IMM_32_ITER \
+    IMM_16_ITER     \
+    TEST_IMPL(16)   \
+    TEST_IMPL(17)   \
+    TEST_IMPL(18)   \
+    TEST_IMPL(19)   \
+    TEST_IMPL(20)   \
+    TEST_IMPL(21)   \
+    TEST_IMPL(22)   \
+    TEST_IMPL(23)   \
+    TEST_IMPL(24)   \
+    TEST_IMPL(25)   \
+    TEST_IMPL(26)   \
+    TEST_IMPL(27)   \
+    TEST_IMPL(28)   \
+    TEST_IMPL(29)   \
+    TEST_IMPL(30)   \
+    TEST_IMPL(31)
+// Expands TEST_IMPL(i) for i = 0..63: IMM_32_ITER followed by 32..63.
+#define IMM_64_ITER \
+    IMM_32_ITER     \
+    TEST_IMPL(32)   \
+    TEST_IMPL(33)   \
+    TEST_IMPL(34)   \
+    TEST_IMPL(35)   \
+    TEST_IMPL(36)   \
+    TEST_IMPL(37)   \
+    TEST_IMPL(38)   \
+    TEST_IMPL(39)   \
+    TEST_IMPL(40)   \
+    TEST_IMPL(41)   \
+    TEST_IMPL(42)   \
+    TEST_IMPL(43)   \
+    TEST_IMPL(44)   \
+    TEST_IMPL(45)   \
+    TEST_IMPL(46)   \
+    TEST_IMPL(47)   \
+    TEST_IMPL(48)   \
+    TEST_IMPL(49)   \
+    TEST_IMPL(50)   \
+    TEST_IMPL(51)   \
+    TEST_IMPL(52)   \
+    TEST_IMPL(53)   \
+    TEST_IMPL(54)   \
+    TEST_IMPL(55)   \
+    TEST_IMPL(56)   \
+    TEST_IMPL(57)   \
+    TEST_IMPL(58)   \
+    TEST_IMPL(59)   \
+    TEST_IMPL(60)   \
+    TEST_IMPL(61)   \
+    TEST_IMPL(62)   \
+    TEST_IMPL(63)
+// Expands TEST_IMPL(i) for i = 0..127: IMM_64_ITER followed by 64..127.
+#define IMM_128_ITER \
+    IMM_64_ITER      \
+    TEST_IMPL(64)    \
+    TEST_IMPL(65)    \
+    TEST_IMPL(66)    \
+    TEST_IMPL(67)    \
+    TEST_IMPL(68)    \
+    TEST_IMPL(69)    \
+    TEST_IMPL(70)    \
+    TEST_IMPL(71)    \
+    TEST_IMPL(72)    \
+    TEST_IMPL(73)    \
+    TEST_IMPL(74)    \
+    TEST_IMPL(75)    \
+    TEST_IMPL(76)    \
+    TEST_IMPL(77)    \
+    TEST_IMPL(78)    \
+    TEST_IMPL(79)    \
+    TEST_IMPL(80)    \
+    TEST_IMPL(81)    \
+    TEST_IMPL(82)    \
+    TEST_IMPL(83)    \
+    TEST_IMPL(84)    \
+    TEST_IMPL(85)    \
+    TEST_IMPL(86)    \
+    TEST_IMPL(87)    \
+    TEST_IMPL(88)    \
+    TEST_IMPL(89)    \
+    TEST_IMPL(90)    \
+    TEST_IMPL(91)    \
+    TEST_IMPL(92)    \
+    TEST_IMPL(93)    \
+    TEST_IMPL(94)    \
+    TEST_IMPL(95)    \
+    TEST_IMPL(96)    \
+    TEST_IMPL(97)    \
+    TEST_IMPL(98)    \
+    TEST_IMPL(99)    \
+    TEST_IMPL(100)   \
+    TEST_IMPL(101)   \
+    TEST_IMPL(102)   \
+    TEST_IMPL(103)   \
+    TEST_IMPL(104)   \
+    TEST_IMPL(105)   \
+    TEST_IMPL(106)   \
+    TEST_IMPL(107)   \
+    TEST_IMPL(108)   \
+    TEST_IMPL(109)   \
+    TEST_IMPL(110)   \
+    TEST_IMPL(111)   \
+    TEST_IMPL(112)   \
+    TEST_IMPL(113)   \
+    TEST_IMPL(114)   \
+    TEST_IMPL(115)   \
+    TEST_IMPL(116)   \
+    TEST_IMPL(117)   \
+    TEST_IMPL(118)   \
+    TEST_IMPL(119)   \
+    TEST_IMPL(120)   \
+    TEST_IMPL(121)   \
+    TEST_IMPL(122)   \
+    TEST_IMPL(123)   \
+    TEST_IMPL(124)   \
+    TEST_IMPL(125)   \
+    TEST_IMPL(126)   \
+    TEST_IMPL(127)
+// Expands TEST_IMPL(i) for i = 0..255: IMM_128_ITER followed by 128..255.
+#define IMM_256_ITER \
+    IMM_128_ITER     \
+    TEST_IMPL(128)   \
+    TEST_IMPL(129)   \
+    TEST_IMPL(130)   \
+    TEST_IMPL(131)   \
+    TEST_IMPL(132)   \
+    TEST_IMPL(133)   \
+    TEST_IMPL(134)   \
+    TEST_IMPL(135)   \
+    TEST_IMPL(136)   \
+    TEST_IMPL(137)   \
+    TEST_IMPL(138)   \
+    TEST_IMPL(139)   \
+    TEST_IMPL(140)   \
+    TEST_IMPL(141)   \
+    TEST_IMPL(142)   \
+    TEST_IMPL(143)   \
+    TEST_IMPL(144)   \
+    TEST_IMPL(145)   \
+    TEST_IMPL(146)   \
+    TEST_IMPL(147)   \
+    TEST_IMPL(148)   \
+    TEST_IMPL(149)   \
+    TEST_IMPL(150)   \
+    TEST_IMPL(151)   \
+    TEST_IMPL(152)   \
+    TEST_IMPL(153)   \
+    TEST_IMPL(154)   \
+    TEST_IMPL(155)   \
+    TEST_IMPL(156)   \
+    TEST_IMPL(157)   \
+    TEST_IMPL(158)   \
+    TEST_IMPL(159)   \
+    TEST_IMPL(160)   \
+    TEST_IMPL(161)   \
+    TEST_IMPL(162)   \
+    TEST_IMPL(163)   \
+    TEST_IMPL(164)   \
+    TEST_IMPL(165)   \
+    TEST_IMPL(166)   \
+    TEST_IMPL(167)   \
+    TEST_IMPL(168)   \
+    TEST_IMPL(169)   \
+    TEST_IMPL(170)   \
+    TEST_IMPL(171)   \
+    TEST_IMPL(172)   \
+    TEST_IMPL(173)   \
+    TEST_IMPL(174)   \
+    TEST_IMPL(175)   \
+    TEST_IMPL(176)   \
+    TEST_IMPL(177)   \
+    TEST_IMPL(178)   \
+    TEST_IMPL(179)   \
+    TEST_IMPL(180)   \
+    TEST_IMPL(181)   \
+    TEST_IMPL(182)   \
+    TEST_IMPL(183)   \
+    TEST_IMPL(184)   \
+    TEST_IMPL(185)   \
+    TEST_IMPL(186)   \
+    TEST_IMPL(187)   \
+    TEST_IMPL(188)   \
+    TEST_IMPL(189)   \
+    TEST_IMPL(190)   \
+    TEST_IMPL(191)   \
+    TEST_IMPL(192)   \
+    TEST_IMPL(193)   \
+    TEST_IMPL(194)   \
+    TEST_IMPL(195)   \
+    TEST_IMPL(196)   \
+    TEST_IMPL(197)   \
+    TEST_IMPL(198)   \
+    TEST_IMPL(199)   \
+    TEST_IMPL(200)   \
+    TEST_IMPL(201)   \
+    TEST_IMPL(202)   \
+    TEST_IMPL(203)   \
+    TEST_IMPL(204)   \
+    TEST_IMPL(205)   \
+    TEST_IMPL(206)   \
+    TEST_IMPL(207)   \
+    TEST_IMPL(208)   \
+    TEST_IMPL(209)   \
+    TEST_IMPL(210)   \
+    TEST_IMPL(211)   \
+    TEST_IMPL(212)   \
+    TEST_IMPL(213)   \
+    TEST_IMPL(214)   \
+    TEST_IMPL(215)   \
+    TEST_IMPL(216)   \
+    TEST_IMPL(217)   \
+    TEST_IMPL(218)   \
+    TEST_IMPL(219)   \
+    TEST_IMPL(220)   \
+    TEST_IMPL(221)   \
+    TEST_IMPL(222)   \
+    TEST_IMPL(223)   \
+    TEST_IMPL(224)   \
+    TEST_IMPL(225)   \
+    TEST_IMPL(226)   \
+    TEST_IMPL(227)   \
+    TEST_IMPL(228)   \
+    TEST_IMPL(229)   \
+    TEST_IMPL(230)   \
+    TEST_IMPL(231)   \
+    TEST_IMPL(232)   \
+    TEST_IMPL(233)   \
+    TEST_IMPL(234)   \
+    TEST_IMPL(235)   \
+    TEST_IMPL(236)   \
+    TEST_IMPL(237)   \
+    TEST_IMPL(238)   \
+    TEST_IMPL(239)   \
+    TEST_IMPL(240)   \
+    TEST_IMPL(241)   \
+    TEST_IMPL(242)   \
+    TEST_IMPL(243)   \
+    TEST_IMPL(244)   \
+    TEST_IMPL(245)   \
+    TEST_IMPL(246)   \
+    TEST_IMPL(247)   \
+    TEST_IMPL(248)   \
+    TEST_IMPL(249)   \
+    TEST_IMPL(250)   \
+    TEST_IMPL(251)   \
+    TEST_IMPL(252)   \
+    TEST_IMPL(253)   \
+    TEST_IMPL(254)   \
+    TEST_IMPL(255)
+}  // namespace SSE2NEON
+
+#endif
--- a/deps/sse2neon/tests/impl.cpp
+++ b/deps/sse2neon/tests/impl.cpp
--- a/deps/sse2neon/tests/impl.h
+++ b/deps/sse2neon/tests/impl.h
@ -0,0 +1,572 @@
+#ifndef SSE2NEONTEST_H
+#define SSE2NEONTEST_H
+
+#include "common.h"
+
+#define INTRIN_LIST               \
+    /* MMX */                     \
+    _(mm_empty)                   \
+    /* SSE */                     \
+    _(mm_add_ps)                  \
+    _(mm_add_ss)                  \
+    _(mm_and_ps)                  \
+    _(mm_andnot_ps)               \
+    _(mm_avg_pu16)                \
+    _(mm_avg_pu8)                 \
+    _(mm_cmpeq_ps)                \
+    _(mm_cmpeq_ss)                \
+    _(mm_cmpge_ps)                \
+    _(mm_cmpge_ss)                \
+    _(mm_cmpgt_ps)                \
+    _(mm_cmpgt_ss)                \
+    _(mm_cmple_ps)                \
+    _(mm_cmple_ss)                \
+    _(mm_cmplt_ps)                \
+    _(mm_cmplt_ss)                \
+    _(mm_cmpneq_ps)               \
+    _(mm_cmpneq_ss)               \
+    _(mm_cmpnge_ps)               \
+    _(mm_cmpnge_ss)               \
+    _(mm_cmpngt_ps)               \
+    _(mm_cmpngt_ss)               \
+    _(mm_cmpnle_ps)               \
+    _(mm_cmpnle_ss)               \
+    _(mm_cmpnlt_ps)               \
+    _(mm_cmpnlt_ss)               \
+    _(mm_cmpord_ps)               \
+    _(mm_cmpord_ss)               \
+    _(mm_cmpunord_ps)             \
+    _(mm_cmpunord_ss)             \
+    _(mm_comieq_ss)               \
+    _(mm_comige_ss)               \
+    _(mm_comigt_ss)               \
+    _(mm_comile_ss)               \
+    _(mm_comilt_ss)               \
+    _(mm_comineq_ss)              \
+    _(mm_cvt_pi2ps)               \
+    _(mm_cvt_ps2pi)               \
+    _(mm_cvt_si2ss)               \
+    _(mm_cvt_ss2si)               \
+    _(mm_cvtpi16_ps)              \
+    _(mm_cvtpi32_ps)              \
+    _(mm_cvtpi32x2_ps)            \
+    _(mm_cvtpi8_ps)               \
+    _(mm_cvtps_pi16)              \
+    _(mm_cvtps_pi32)              \
+    _(mm_cvtps_pi8)               \
+    _(mm_cvtpu16_ps)              \
+    _(mm_cvtpu8_ps)               \
+    _(mm_cvtsi32_ss)              \
+    _(mm_cvtsi64_ss)              \
+    _(mm_cvtss_f32)               \
+    _(mm_cvtss_si32)              \
+    _(mm_cvtss_si64)              \
+    _(mm_cvtt_ps2pi)              \
+    _(mm_cvtt_ss2si)              \
+    _(mm_cvttps_pi32)             \
+    _(mm_cvttss_si32)             \
+    _(mm_cvttss_si64)             \
+    _(mm_div_ps)                  \
+    _(mm_div_ss)                  \
+    _(mm_extract_pi16)            \
+    _(mm_free)                    \
+    _(mm_get_flush_zero_mode)     \
+    _(mm_get_rounding_mode)       \
+    _(mm_getcsr)                  \
+    _(mm_insert_pi16)             \
+    _(mm_load_ps)                 \
+    _(mm_load_ps1)                \
+    _(mm_load_ss)                 \
+    _(mm_load1_ps)                \
+    _(mm_loadh_pi)                \
+    _(mm_loadl_pi)                \
+    _(mm_loadr_ps)                \
+    _(mm_loadu_ps)                \
+    _(mm_loadu_si16)              \
+    _(mm_loadu_si64)              \
+    _(mm_malloc)                  \
+    _(mm_maskmove_si64)           \
+    _(m_maskmovq)                 \
+    _(mm_max_pi16)                \
+    _(mm_max_ps)                  \
+    _(mm_max_pu8)                 \
+    _(mm_max_ss)                  \
+    _(mm_min_pi16)                \
+    _(mm_min_ps)                  \
+    _(mm_min_pu8)                 \
+    _(mm_min_ss)                  \
+    _(mm_move_ss)                 \
+    _(mm_movehl_ps)               \
+    _(mm_movelh_ps)               \
+    _(mm_movemask_pi8)            \
+    _(mm_movemask_ps)             \
+    _(mm_mul_ps)                  \
+    _(mm_mul_ss)                  \
+    _(mm_mulhi_pu16)              \
+    _(mm_or_ps)                   \
+    _(m_pavgb)                    \
+    _(m_pavgw)                    \
+    _(m_pextrw)                   \
+    _(m_pinsrw)                   \
+    _(m_pmaxsw)                   \
+    _(m_pmaxub)                   \
+    _(m_pminsw)                   \
+    _(m_pminub)                   \
+    _(m_pmovmskb)                 \
+    _(m_pmulhuw)                  \
+    _(mm_prefetch)                \
+    _(m_psadbw)                   \
+    _(m_pshufw)                   \
+    _(mm_rcp_ps)                  \
+    _(mm_rcp_ss)                  \
+    _(mm_rsqrt_ps)                \
+    _(mm_rsqrt_ss)                \
+    _(mm_sad_pu8)                 \
+    _(mm_set_flush_zero_mode)     \
+    _(mm_set_ps)                  \
+    _(mm_set_ps1)                 \
+    _(mm_set_rounding_mode)       \
+    _(mm_set_ss)                  \
+    _(mm_set1_ps)                 \
+    _(mm_setcsr)                  \
+    _(mm_setr_ps)                 \
+    _(mm_setzero_ps)              \
+    _(mm_sfence)                  \
+    _(mm_shuffle_pi16)            \
+    _(mm_shuffle_ps)              \
+    _(mm_sqrt_ps)                 \
+    _(mm_sqrt_ss)                 \
+    _(mm_store_ps)                \
+    _(mm_store_ps1)               \
+    _(mm_store_ss)                \
+    _(mm_store1_ps)               \
+    _(mm_storeh_pi)               \
+    _(mm_storel_pi)               \
+    _(mm_storer_ps)               \
+    _(mm_storeu_ps)               \
+    _(mm_storeu_si16)             \
+    _(mm_storeu_si64)             \
+    _(mm_stream_pi)               \
+    _(mm_stream_ps)               \
+    _(mm_sub_ps)                  \
+    _(mm_sub_ss)                  \
+    _(mm_ucomieq_ss)              \
+    _(mm_ucomige_ss)              \
+    _(mm_ucomigt_ss)              \
+    _(mm_ucomile_ss)              \
+    _(mm_ucomilt_ss)              \
+    _(mm_ucomineq_ss)             \
+    _(mm_undefined_ps)            \
+    _(mm_unpackhi_ps)             \
+    _(mm_unpacklo_ps)             \
+    _(mm_xor_ps)                  \
+    /* SSE2 */                    \
+    _(mm_add_epi16)               \
+    _(mm_add_epi32)               \
+    _(mm_add_epi64)               \
+    _(mm_add_epi8)                \
+    _(mm_add_pd)                  \
+    _(mm_add_sd)                  \
+    _(mm_add_si64)                \
+    _(mm_adds_epi16)              \
+    _(mm_adds_epi8)               \
+    _(mm_adds_epu16)              \
+    _(mm_adds_epu8)               \
+    _(mm_and_pd)                  \
+    _(mm_and_si128)               \
+    _(mm_andnot_pd)               \
+    _(mm_andnot_si128)            \
+    _(mm_avg_epu16)               \
+    _(mm_avg_epu8)                \
+    _(mm_bslli_si128)             \
+    _(mm_bsrli_si128)             \
+    _(mm_castpd_ps)               \
+    _(mm_castpd_si128)            \
+    _(mm_castps_pd)               \
+    _(mm_castps_si128)            \
+    _(mm_castsi128_pd)            \
+    _(mm_castsi128_ps)            \
+    _(mm_clflush)                 \
+    _(mm_cmpeq_epi16)             \
+    _(mm_cmpeq_epi32)             \
+    _(mm_cmpeq_epi8)              \
+    _(mm_cmpeq_pd)                \
+    _(mm_cmpeq_sd)                \
+    _(mm_cmpge_pd)                \
+    _(mm_cmpge_sd)                \
+    _(mm_cmpgt_epi16)             \
+    _(mm_cmpgt_epi32)             \
+    _(mm_cmpgt_epi8)              \
+    _(mm_cmpgt_pd)                \
+    _(mm_cmpgt_sd)                \
+    _(mm_cmple_pd)                \
+    _(mm_cmple_sd)                \
+    _(mm_cmplt_epi16)             \
+    _(mm_cmplt_epi32)             \
+    _(mm_cmplt_epi8)              \
+    _(mm_cmplt_pd)                \
+    _(mm_cmplt_sd)                \
+    _(mm_cmpneq_pd)               \
+    _(mm_cmpneq_sd)               \
+    _(mm_cmpnge_pd)               \
+    _(mm_cmpnge_sd)               \
+    _(mm_cmpngt_pd)               \
+    _(mm_cmpngt_sd)               \
+    _(mm_cmpnle_pd)               \
+    _(mm_cmpnle_sd)               \
+    _(mm_cmpnlt_pd)               \
+    _(mm_cmpnlt_sd)               \
+    _(mm_cmpord_pd)               \
+    _(mm_cmpord_sd)               \
+    _(mm_cmpunord_pd)             \
+    _(mm_cmpunord_sd)             \
+    _(mm_comieq_sd)               \
+    _(mm_comige_sd)               \
+    _(mm_comigt_sd)               \
+    _(mm_comile_sd)               \
+    _(mm_comilt_sd)               \
+    _(mm_comineq_sd)              \
+    _(mm_cvtepi32_pd)             \
+    _(mm_cvtepi32_ps)             \
+    _(mm_cvtpd_epi32)             \
+    _(mm_cvtpd_pi32)              \
+    _(mm_cvtpd_ps)                \
+    _(mm_cvtpi32_pd)              \
+    _(mm_cvtps_epi32)             \
+    _(mm_cvtps_pd)                \
+    _(mm_cvtsd_f64)               \
+    _(mm_cvtsd_si32)              \
+    _(mm_cvtsd_si64)              \
+    _(mm_cvtsd_si64x)             \
+    _(mm_cvtsd_ss)                \
+    _(mm_cvtsi128_si32)           \
+    _(mm_cvtsi128_si64)           \
+    _(mm_cvtsi128_si64x)          \
+    _(mm_cvtsi32_sd)              \
+    _(mm_cvtsi32_si128)           \
+    _(mm_cvtsi64_sd)              \
+    _(mm_cvtsi64_si128)           \
+    _(mm_cvtsi64x_sd)             \
+    _(mm_cvtsi64x_si128)          \
+    _(mm_cvtss_sd)                \
+    _(mm_cvttpd_epi32)            \
+    _(mm_cvttpd_pi32)             \
+    _(mm_cvttps_epi32)            \
+    _(mm_cvttsd_si32)             \
+    _(mm_cvttsd_si64)             \
+    _(mm_cvttsd_si64x)            \
+    _(mm_div_pd)                  \
+    _(mm_div_sd)                  \
+    _(mm_extract_epi16)           \
+    _(mm_insert_epi16)            \
+    _(mm_lfence)                  \
+    _(mm_load_pd)                 \
+    _(mm_load_pd1)                \
+    _(mm_load_sd)                 \
+    _(mm_load_si128)              \
+    _(mm_load1_pd)                \
+    _(mm_loadh_pd)                \
+    _(mm_loadl_epi64)             \
+    _(mm_loadl_pd)                \
+    _(mm_loadr_pd)                \
+    _(mm_loadu_pd)                \
+    _(mm_loadu_si128)             \
+    _(mm_loadu_si32)              \
+    _(mm_madd_epi16)              \
+    _(mm_maskmoveu_si128)         \
+    _(mm_max_epi16)               \
+    _(mm_max_epu8)                \
+    _(mm_max_pd)                  \
+    _(mm_max_sd)                  \
+    _(mm_mfence)                  \
+    _(mm_min_epi16)               \
+    _(mm_min_epu8)                \
+    _(mm_min_pd)                  \
+    _(mm_min_sd)                  \
+    _(mm_move_epi64)              \
+    _(mm_move_sd)                 \
+    _(mm_movemask_epi8)           \
+    _(mm_movemask_pd)             \
+    _(mm_movepi64_pi64)           \
+    _(mm_movpi64_epi64)           \
+    _(mm_mul_epu32)               \
+    _(mm_mul_pd)                  \
+    _(mm_mul_sd)                  \
+    _(mm_mul_su32)                \
+    _(mm_mulhi_epi16)             \
+    _(mm_mulhi_epu16)             \
+    _(mm_mullo_epi16)             \
+    _(mm_or_pd)                   \
+    _(mm_or_si128)                \
+    _(mm_packs_epi16)             \
+    _(mm_packs_epi32)             \
+    _(mm_packus_epi16)            \
+    _(mm_pause)                   \
+    _(mm_sad_epu8)                \
+    _(mm_set_epi16)               \
+    _(mm_set_epi32)               \
+    _(mm_set_epi64)               \
+    _(mm_set_epi64x)              \
+    _(mm_set_epi8)                \
+    _(mm_set_pd)                  \
+    _(mm_set_pd1)                 \
+    _(mm_set_sd)                  \
+    _(mm_set1_epi16)              \
+    _(mm_set1_epi32)              \
+    _(mm_set1_epi64)              \
+    _(mm_set1_epi64x)             \
+    _(mm_set1_epi8)               \
+    _(mm_set1_pd)                 \
+    _(mm_setr_epi16)              \
+    _(mm_setr_epi32)              \
+    _(mm_setr_epi64)              \
+    _(mm_setr_epi8)               \
+    _(mm_setr_pd)                 \
+    _(mm_setzero_pd)              \
+    _(mm_setzero_si128)           \
+    _(mm_shuffle_epi32)           \
+    _(mm_shuffle_pd)              \
+    _(mm_shufflehi_epi16)         \
+    _(mm_shufflelo_epi16)         \
+    _(mm_sll_epi16)               \
+    _(mm_sll_epi32)               \
+    _(mm_sll_epi64)               \
+    _(mm_slli_epi16)              \
+    _(mm_slli_epi32)              \
+    _(mm_slli_epi64)              \
+    _(mm_slli_si128)              \
+    _(mm_sqrt_pd)                 \
+    _(mm_sqrt_sd)                 \
+    _(mm_sra_epi16)               \
+    _(mm_sra_epi32)               \
+    _(mm_srai_epi16)              \
+    _(mm_srai_epi32)              \
+    _(mm_srl_epi16)               \
+    _(mm_srl_epi32)               \
+    _(mm_srl_epi64)               \
+    _(mm_srli_epi16)              \
+    _(mm_srli_epi32)              \
+    _(mm_srli_epi64)              \
+    _(mm_srli_si128)              \
+    _(mm_store_pd)                \
+    _(mm_store_pd1)               \
+    _(mm_store_sd)                \
+    _(mm_store_si128)             \
+    _(mm_store1_pd)               \
+    _(mm_storeh_pd)               \
+    _(mm_storel_epi64)            \
+    _(mm_storel_pd)               \
+    _(mm_storer_pd)               \
+    _(mm_storeu_pd)               \
+    _(mm_storeu_si128)            \
+    _(mm_storeu_si32)             \
+    _(mm_stream_pd)               \
+    _(mm_stream_si128)            \
+    _(mm_stream_si32)             \
+    _(mm_stream_si64)             \
+    _(mm_sub_epi16)               \
+    _(mm_sub_epi32)               \
+    _(mm_sub_epi64)               \
+    _(mm_sub_epi8)                \
+    _(mm_sub_pd)                  \
+    _(mm_sub_sd)                  \
+    _(mm_sub_si64)                \
+    _(mm_subs_epi16)              \
+    _(mm_subs_epi8)               \
+    _(mm_subs_epu16)              \
+    _(mm_subs_epu8)               \
+    _(mm_ucomieq_sd)              \
+    _(mm_ucomige_sd)              \
+    _(mm_ucomigt_sd)              \
+    _(mm_ucomile_sd)              \
+    _(mm_ucomilt_sd)              \
+    _(mm_ucomineq_sd)             \
+    _(mm_undefined_pd)            \
+    _(mm_undefined_si128)         \
+    _(mm_unpackhi_epi16)          \
+    _(mm_unpackhi_epi32)          \
+    _(mm_unpackhi_epi64)          \
+    _(mm_unpackhi_epi8)           \
+    _(mm_unpackhi_pd)             \
+    _(mm_unpacklo_epi16)          \
+    _(mm_unpacklo_epi32)          \
+    _(mm_unpacklo_epi64)          \
+    _(mm_unpacklo_epi8)           \
+    _(mm_unpacklo_pd)             \
+    _(mm_xor_pd)                  \
+    _(mm_xor_si128)               \
+    /* SSE3 */                    \
+    _(mm_addsub_pd)               \
+    _(mm_addsub_ps)               \
+    _(mm_hadd_pd)                 \
+    _(mm_hadd_ps)                 \
+    _(mm_hsub_pd)                 \
+    _(mm_hsub_ps)                 \
+    _(mm_lddqu_si128)             \
+    _(mm_loaddup_pd)              \
+    _(mm_movedup_pd)              \
+    _(mm_movehdup_ps)             \
+    _(mm_moveldup_ps)             \
+    /* SSSE3 */                   \
+    _(mm_abs_epi16)               \
+    _(mm_abs_epi32)               \
+    _(mm_abs_epi8)                \
+    _(mm_abs_pi16)                \
+    _(mm_abs_pi32)                \
+    _(mm_abs_pi8)                 \
+    _(mm_alignr_epi8)             \
+    _(mm_alignr_pi8)              \
+    _(mm_hadd_epi16)              \
+    _(mm_hadd_epi32)              \
+    _(mm_hadd_pi16)               \
+    _(mm_hadd_pi32)               \
+    _(mm_hadds_epi16)             \
+    _(mm_hadds_pi16)              \
+    _(mm_hsub_epi16)              \
+    _(mm_hsub_epi32)              \
+    _(mm_hsub_pi16)               \
+    _(mm_hsub_pi32)               \
+    _(mm_hsubs_epi16)             \
+    _(mm_hsubs_pi16)              \
+    _(mm_maddubs_epi16)           \
+    _(mm_maddubs_pi16)            \
+    _(mm_mulhrs_epi16)            \
+    _(mm_mulhrs_pi16)             \
+    _(mm_shuffle_epi8)            \
+    _(mm_shuffle_pi8)             \
+    _(mm_sign_epi16)              \
+    _(mm_sign_epi32)              \
+    _(mm_sign_epi8)               \
+    _(mm_sign_pi16)               \
+    _(mm_sign_pi32)               \
+    _(mm_sign_pi8)                \
+    /* SSE4.1 */                  \
+    _(mm_blend_epi16)             \
+    _(mm_blend_pd)                \
+    _(mm_blend_ps)                \
+    _(mm_blendv_epi8)             \
+    _(mm_blendv_pd)               \
+    _(mm_blendv_ps)               \
+    _(mm_ceil_pd)                 \
+    _(mm_ceil_ps)                 \
+    _(mm_ceil_sd)                 \
+    _(mm_ceil_ss)                 \
+    _(mm_cmpeq_epi64)             \
+    _(mm_cvtepi16_epi32)          \
+    _(mm_cvtepi16_epi64)          \
+    _(mm_cvtepi32_epi64)          \
+    _(mm_cvtepi8_epi16)           \
+    _(mm_cvtepi8_epi32)           \
+    _(mm_cvtepi8_epi64)           \
+    _(mm_cvtepu16_epi32)          \
+    _(mm_cvtepu16_epi64)          \
+    _(mm_cvtepu32_epi64)          \
+    _(mm_cvtepu8_epi16)           \
+    _(mm_cvtepu8_epi32)           \
+    _(mm_cvtepu8_epi64)           \
+    _(mm_dp_pd)                   \
+    _(mm_dp_ps)                   \
+    _(mm_extract_epi32)           \
+    _(mm_extract_epi64)           \
+    _(mm_extract_epi8)            \
+    _(mm_extract_ps)              \
+    _(mm_floor_pd)                \
+    _(mm_floor_ps)                \
+    _(mm_floor_sd)                \
+    _(mm_floor_ss)                \
+    _(mm_insert_epi32)            \
+    _(mm_insert_epi64)            \
+    _(mm_insert_epi8)             \
+    _(mm_insert_ps)               \
+    _(mm_max_epi32)               \
+    _(mm_max_epi8)                \
+    _(mm_max_epu16)               \
+    _(mm_max_epu32)               \
+    _(mm_min_epi32)               \
+    _(mm_min_epi8)                \
+    _(mm_min_epu16)               \
+    _(mm_min_epu32)               \
+    _(mm_minpos_epu16)            \
+    _(mm_mpsadbw_epu8)            \
+    _(mm_mul_epi32)               \
+    _(mm_mullo_epi32)             \
+    _(mm_packus_epi32)            \
+    _(mm_round_pd)                \
+    _(mm_round_ps)                \
+    _(mm_round_sd)                \
+    _(mm_round_ss)                \
+    _(mm_stream_load_si128)       \
+    _(mm_test_all_ones)           \
+    _(mm_test_all_zeros)          \
+    _(mm_test_mix_ones_zeros)     \
+    _(mm_testc_si128)             \
+    _(mm_testnzc_si128)           \
+    _(mm_testz_si128)             \
+    /* SSE4.2 */                  \
+    _(mm_cmpestra)                \
+    _(mm_cmpestrc)                \
+    _(mm_cmpestri)                \
+    _(mm_cmpestrm)                \
+    _(mm_cmpestro)                \
+    _(mm_cmpestrs)                \
+    _(mm_cmpestrz)                \
+    _(mm_cmpgt_epi64)             \
+    _(mm_cmpistra)                \
+    _(mm_cmpistrc)                \
+    _(mm_cmpistri)                \
+    _(mm_cmpistrm)                \
+    _(mm_cmpistro)                \
+    _(mm_cmpistrs)                \
+    _(mm_cmpistrz)                \
+    _(mm_crc32_u16)               \
+    _(mm_crc32_u32)               \
+    _(mm_crc32_u64)               \
+    _(mm_crc32_u8)                \
+    /* AES */                     \
+    _(mm_aesenc_si128)            \
+    _(mm_aesdec_si128)            \
+    _(mm_aesenclast_si128)        \
+    _(mm_aesdeclast_si128)        \
+    _(mm_aesimc_si128)            \
+    _(mm_aeskeygenassist_si128)   \
+    /* Others */                  \
+    _(mm_clmulepi64_si128)        \
+    _(mm_get_denormals_zero_mode) \
+    _(mm_popcnt_u32)              \
+    _(mm_popcnt_u64)              \
+    _(mm_set_denormals_zero_mode) \
+    _(rdtsc)                      \
+    _(last) /* This indicates the end of macros */
+
+namespace SSE2NEON
+{
+// How the unit tests work: 10,000 random floating point and integer vec4
+// values are generated as sample data.
+//
+// Each intrinsic has a short reference implementation in C whose output is
+// compared with the result of the corresponding SSE intrinsic for all 10,000
+// randomized input vectors. When running on ARM, the reference output is
+// compared with the NEON-based approximation instead.
+extern const char *instructionString[];  // indexed by InstructionTest value
+enum InstructionTest {  // one it_<name> per INTRIN_LIST entry; it_last ends it
+#define _(x) it_##x,
+    INTRIN_LIST
+#undef _
+};
+
+class SSE2NEONTest
+{
+public:
+    static SSE2NEONTest *create(void);  // Factory: returns the test object.
+
+    // Run the test for the given instruction and report the outcome:
+    // Passed: TEST_SUCCESS (1)
+    // Failed: TEST_FAIL (0)
+    // Unimplemented: TEST_UNIMPL (-1)
+    virtual result_t runTest(InstructionTest test) = 0;
+    virtual void release(void) = 0;  // presumably frees the object - confirm
+};
+
+}  // namespace SSE2NEON
+
+#endif
--- a/deps/sse2neon/tests/main.cpp
+++ b/deps/sse2neon/tests/main.cpp
@ -0,0 +1,39 @@
+#include <stdint.h>
+#include <stdio.h>
+#include "impl.h"
+
+// Test driver: runs every instruction test of the InstructionTest enum (all
+// values below it_last), prints one result line per test followed by a
+// summary, and returns -1 if at least one test failed, 0 otherwise.
+int main(int /*argc*/, const char ** /*argv*/)
+{
+    SSE2NEON::SSE2NEONTest *test = SSE2NEON::SSE2NEONTest::create();
+    uint32_t passCount = 0;
+    uint32_t failedCount = 0;
+    uint32_t ignoreCount = 0;
+    for (uint32_t i = 0; i < SSE2NEON::it_last; i++) {
+        SSE2NEON::InstructionTest it = SSE2NEON::InstructionTest(i);
+        SSE2NEON::result_t ret = test->runTest(it);
+        // Tally the outcome. Any result that is neither TEST_FAIL nor
+        // TEST_UNIMPL is counted as a pass.
+        if (ret == SSE2NEON::TEST_FAIL) {
+            printf("Test %-30s failed\n", SSE2NEON::instructionString[it]);
+            failedCount++;
+        } else if (ret == SSE2NEON::TEST_UNIMPL) {
+            printf("Test %-30s skipped\n", SSE2NEON::instructionString[it]);
+            ignoreCount++;
+        } else {
+            printf("Test %-30s passed\n", SSE2NEON::instructionString[it]);
+            passCount++;
+        }
+    }
+    test->release();
+    // The counters are unsigned, so they are printed with %u (after an
+    // explicit conversion to unsigned) rather than with the signed %d.
+    printf(
+        "SSE2NEONTest Complete!\n"
+        "Passed:  %u\n"
+        "Failed:  %u\n"
+        "Ignored: %u\n"
+        "Coverage rate: %.2f%%\n",
+        (unsigned) passCount, (unsigned) failedCount, (unsigned) ignoreCount,
+        (float) passCount / (float) (passCount + failedCount + ignoreCount) *
+            100);
+
+    return failedCount ? -1 : 0;
+}
--- a/docs/changes.txt
+++ b/docs/changes.txt
@ -119,6 +119,7 @@
 - Apple Driver: Updated requirements to use Apple OpenCL API to macOS 13.0 - use
 - Backend Checks: Describe workaround in error message when detecting more than 64 backend devices
 - Brain: Added sanity check and corresponding error message for invalid --brain-port values
+- Dependencies: Added sse2neon v1.8.0 (commit 658eeac)
 - Dependencies: Updated LZMA SDK to 24.09
 - Dependencies: Updated unrar source to 6.2.7
 - Dependencies: Updated xxHash to 0.8.3 (commit 50f4226)
--- a/docs/license_libs/SSE2NEON_LICENSE.txt
+++ b/docs/license_libs/SSE2NEON_LICENSE.txt
@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2015-2025 SSE2NEON Contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/src/Makefile
+++ b/src/Makefile
@ -149,6 +149,8 @@ DEPS_UNRAR_PATH         := $(LIBRARY_DEV_ROOT_FOLDER)
 endif
 endif

+DEPS_SSE2NEON           := deps/sse2neon
+
 ##
 ## Filenames for library and frontend
 ##
@ -361,6 +363,8 @@ LFLAGS_NATIVE           += -lpthread
 LFLAGS_NATIVE           += -liconv

 ifeq ($(IS_APPLE_SILICON),1)
+CFLAGS_NATIVE           += -DSSE2NEON_SUPPRESS_WARNINGS
+CFLAGS_NATIVE           += -I$(DEPS_SSE2NEON)
 CFLAGS_NATIVE           += -arch arm64
 CFLAGS_NATIVE           += -arch x86_64
 ifeq ($(SHARED),1)
@ -820,6 +824,11 @@ CFLAGS_LZMA_WIN         += -Wno-misleading-indentation

 CFLAGS_UNRAR_WIN        += -Wno-misleading-indentation
 CFLAGS_UNRAR_WIN        += -Wno-class-memaccess
+
+ifeq ($(IS_APPLE_SILICON),1)
+# sse2neon: define SSE2NEON_SUPPRESS_WARNINGS and add the bundled headers to the include path
+CFLAGS_CROSS_LINUX      += -DSSE2NEON_SUPPRESS_WARNINGS -I$(DEPS_SSE2NEON)
+endif
 endif

 ##