#ifndef SSE2NEON_H
#define SSE2NEON_H
// This header file provides a simple API translation layer
// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
//
// This header file does not yet translate all of the SSE intrinsics.
//
// Contributors to this work are:
// John W. Ratcliff <
[email protected]>
// Brandon Rowlett <
[email protected]>
// Ken Fast <
[email protected]>
// Eric van Beurden <
[email protected]>
// Alexander Potylitsin <
[email protected]>
// Hasindu Gamaarachchi <
[email protected]>
// Jim Huang <
[email protected]>
// Mark Cheng <
[email protected]>
// Malcolm James MacLeod <
[email protected]>
// Devin Hussey (easyaspi314) <
[email protected]>
// Sebastian Pop <
[email protected]>
// Developer Ecosystem Engineering <
[email protected]>
// Danila Kutenin <
[email protected]>
// François Turban (JishinMaster) <
[email protected]>
// Pei-Hsuan Hung <
[email protected]>
// Yang-Hao Yuan <
[email protected]>
/*
* sse2neon is freely redistributable under the MIT License.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/* Tunable configurations */
/* Enable precise implementation of _mm_min_ps and _mm_max_ps
* This would slow down the computation a bit, but gives consistent result with
* x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result)
*/
#ifndef SSE2NEON_PRECISE_MINMAX
#define SSE2NEON_PRECISE_MINMAX (0)
#endif
#if defined(__GNUC__) || defined(__clang__)
#pragma push_macro("FORCE_INLINE")
#pragma push_macro("ALIGN_STRUCT")
#define FORCE_INLINE static inline __attribute__((always_inline))
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
#else
#error "Macro name collisions may happen with unsupported compiler."
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline
#ifndef ALIGN_STRUCT
#define ALIGN_STRUCT(x) __declspec(align(x))
#endif
#endif
#include <stdint.h>
#include <stdlib.h>
// These cause the build to fail on raspberry pi with 'unsupported target'
// and don't seem to do anything particularly useful
///* Architecture-specific build options */
///* FIXME: #pragma GCC push_options is only available on GCC */
//#if defined(__GNUC__)
//#if defined(__arm__) && __ARM_ARCH == 7
///* According to ARM C Language Extensions Architecture specification,
// * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
// * architecture supported.
// */
//#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
//#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
//#endif
//#pragma GCC push_options
//#pragma GCC target("fpu=neon")
//#elif defined(__aarch64__)
//#pragma GCC push_options
//#pragma GCC target("+simd")
//#else
//#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
//#endif
//#endif
#include <arm_neon.h>
/* Rounding functions require either Aarch64 instructions or libm failback */
#if !defined(__aarch64__)
#include <math.h>
#endif
/* "__has_builtin" can be used to query support for built-in functions
* provided by gcc/clang and other compilers that support it.
*/
#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
/* Compatibility with gcc <= 9 */
#if __GNUC__ <= 9
#define __has_builtin(x) HAS##x
#define HAS__builtin_popcount 1
#define HAS__builtin_popcountll 1
#else
#define __has_builtin(x) 0
#endif
#endif
/**
* MACRO for shuffle parameter for _mm_shuffle_ps().
* Argument fp3 is a digit[0123] that represents the fp from argument "b"
* of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
* for fp2 in result. fp1 is a digit[0123] that represents the fp from
* argument "a" of mm_shuffle_ps that will be places in fp1 of result.
* fp0 is the same for fp0 of result.
*/
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_NO_EXC 0x08
/* indicate immediate constant argument in a given range */
#define __constrange(a, b) const
/* A few intrinsics accept traditional data types like ints or floats, but
* most operate on data types that are specific to SSE.
* If a vector type ends in d, it contains doubles, and if it does not have
* a suffix, it contains floats. An integer vector type can contain any type
* of integer, from chars to shorts to unsigned long longs.
*/
typedef int64x1_t __m64;
typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
// On ARM 32-bit architecture, the float64x2_t is not supported.
// The data type __m128d should be represented in a different way for related
// intrinsic conversion.
#if defined(__aarch64__)
typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
#else
typedef float32x4_t __m128d;
#endif
typedef int64x2_t __m128i; /* 128-bit vector containing integers */
/* type-safe casting between types */
#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
#define vreinterpretq_m128_f32(x) (x)
#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
#define vreinterpretq_f32_m128(x) (x)
#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
#define vreinterpretq_m128i_s64(x) (x)
#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
#define vreinterpretq_s32_m128i(x) vreinterpretq_s