// src/xnnpack/intrinsics-polyfill.h - platform/external/XNNPACK - Git at Google

 // Copyright 2019 Google LLC
 //
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.

 #pragma once

 #include <xnnpack/common.h>
 #include <xnnpack/unaligned.h>


 #if defined(__SSE2__)
 #include <emmintrin.h>

 // GCC pre-11, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-16
 #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && __GNUC__ < 11) || \
     (defined(__clang__) && !defined(__apple_build_version__) && (__clang_major__ < 8)) || \
     (defined(__clang__) && defined(__ANDROID__) && (__clang_major__ == 8) && (__clang_minor__ == 0) && (__clang_patchlevel__ < 7)) || \
     (defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \
     (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1600))

 // Polyfill for _mm_storeu_si32: takes the lowest 32-bit lane of v and writes
 // it to address through unaligned_store_u32.
 static XNN_INTRINSIC
 void _mm_storeu_si32(void* address, __m128i v) {
   const uint32_t low_bits = (uint32_t) _mm_cvtsi128_si32(v);
   unaligned_store_u32(address, low_bits);
 }

 // Polyfill for _mm_storeu_si16: takes the lowest 16-bit lane of v and writes
 // it to address through unaligned_store_u16.
 static XNN_INTRINSIC
 void _mm_storeu_si16(void* address, __m128i v) {
   const uint16_t low_bits = (uint16_t) _mm_extract_epi16(v, 0);
   unaligned_store_u16(address, low_bits);
 }
 #endif  // GCC pre-11, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-16
 #endif  // SSE2

 #ifdef __AVX512F__
 #include <immintrin.h>

 // GCC pre-7, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, ICC pre-18, and MSVC pre-2019
 #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 7)) || \
     (defined(__clang__) && !defined(__apple_build_version__) && (__clang_major__ < 8)) || \
     (defined(__clang__) && defined(__ANDROID__) && (__clang_major__ == 8) && (__clang_minor__ == 0) && (__clang_patchlevel__ < 7)) || \
     (defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \
     (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800)) || \
     (defined(_MSC_VER) && !defined(__clang__) && !defined(__GNUC__) && (_MSC_VER <= 1916))

 // Polyfill for _cvtu32_mask16: converts an unsigned integer into a 16-bit
 // mask value with a plain cast to __mmask16.
 static XNN_INTRINSIC
 __mmask16 _cvtu32_mask16(unsigned int mask) {
   const __mmask16 k = (__mmask16) mask;
   return k;
 }

 // Polyfill for _cvtu64_mask64: converts a 64-bit unsigned integer into a
 // 64-bit mask value with a plain cast to __mmask64.
 static XNN_INTRINSIC
 __mmask64 _cvtu64_mask64(unsigned long long mask) {
   const __mmask64 k = (__mmask64) mask;
   return k;
 }

 // Polyfill for _kshiftli_mask64: shifts the 64-bit mask a left by count bits,
 // shifting in zeros.
 //
 // A count above 63 yields an all-zero mask. Shifting a 64-bit integer by 64
 // or more is undefined behavior in C, so that case is handled explicitly
 // instead of being passed to the << operator.
 static XNN_INTRINSIC
 __mmask64 _kshiftli_mask64(__mmask64 a, unsigned int count) {
   if (count > 63) {
     return (__mmask64) 0;
   }
   return (__mmask64) ((unsigned long long) a << count);
 }

 // Polyfill for _kshiftri_mask64: shifts the 64-bit mask a right by count bits,
 // shifting in zeros.
 //
 // A count above 63 yields an all-zero mask. Shifting a 64-bit integer by 64
 // or more is undefined behavior in C, so that case is handled explicitly
 // instead of being passed to the >> operator.
 static XNN_INTRINSIC
 __mmask64 _kshiftri_mask64(__mmask64 a, unsigned int count) {
   if (count > 63) {
     return (__mmask64) 0;
   }
   return (__mmask64) ((unsigned long long) a >> count);
 }

 #endif  // GCC pre-7, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, ICC pre-18, and MSVC pre-2019

 // GCC pre-7, Clang pre-4, and ICC pre-18
 #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 7)) || \
     (defined(__clang__) && (__clang_major__ < 4)) || \
     (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800))

 // Polyfill for _mm512_reduce_add_ps: horizontally adds the 16 floats of v and
 // returns the scalar total.
 //
 // The vector is folded in halves (512 -> 256 -> 128 -> 64 -> 32 bits); the
 // order of the additions is fixed by this folding pattern.
 static XNN_INTRINSIC
 float _mm512_reduce_add_ps(__m512 v) {
   const __m256 lower256 = _mm512_castps512_ps256(v);
 #if __AVX512DQ__
   const __m256 upper256 = _mm512_extractf32x8_ps(v, 1);
 #else
   // Without AVX512DQ the upper half is extracted through the f64x4 variant.
   const __m256 upper256 = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1));
 #endif
   const __m256 acc8 = _mm256_add_ps(lower256, upper256);

   const __m128 lower128 = _mm256_castps256_ps128(acc8);
   const __m128 upper128 = _mm256_extractf128_ps(acc8, 1);
   const __m128 acc4 = _mm_add_ps(lower128, upper128);

   // Fold lanes 2-3 onto lanes 0-1, then lane 1 onto lane 0.
   const __m128 acc2 = _mm_add_ps(acc4, _mm_movehl_ps(acc4, acc4));
   const __m128 acc1 = _mm_add_ss(acc2, _mm_movehdup_ps(acc2));
   return _mm_cvtss_f32(acc1);
 }

 // Polyfill for _mm512_reduce_max_ps: reduces the 16 floats of v with the
 // packed/scalar max operations and returns the scalar result.
 //
 // The vector is folded in halves (512 -> 256 -> 128 -> 64 -> 32 bits); the
 // operand order of every max operation is kept as-is.
 static XNN_INTRINSIC
 float _mm512_reduce_max_ps(__m512 v) {
   const __m256 lower256 = _mm512_castps512_ps256(v);
 #if __AVX512DQ__
   const __m256 upper256 = _mm512_extractf32x8_ps(v, 1);
 #else
   // Without AVX512DQ the upper half is extracted through the f64x4 variant.
   const __m256 upper256 = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1));
 #endif
   const __m256 max8 = _mm256_max_ps(lower256, upper256);

   const __m128 lower128 = _mm256_castps256_ps128(max8);
   const __m128 upper128 = _mm256_extractf128_ps(max8, 1);
   const __m128 max4 = _mm_max_ps(lower128, upper128);

   // Fold lanes 2-3 onto lanes 0-1, then lane 1 onto lane 0.
   const __m128 max2 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
   const __m128 max1 = _mm_max_ss(max2, _mm_movehdup_ps(max2));
   return _mm_cvtss_f32(max1);
 }

 #endif  // GCC pre-7, Clang pre-4, and ICC pre-18

 #if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 9)
 // Polyfill for _mm512_set_epi8: builds a 512-bit integer vector from 64
 // individual 8-bit values.
 //
 // Arguments are listed from the highest element (e63) down to the lowest
 // (e00). The vector literal below lists elements in the opposite order,
 // so e00 becomes element 0 of the result and e63 becomes element 63.
 static XNN_INTRINSIC
 __m512i _mm512_set_epi8(
   char e63, char e62, char e61, char e60,
   char e59, char e58, char e57, char e56,
   char e55, char e54, char e53, char e52,
   char e51, char e50, char e49, char e48,
   char e47, char e46, char e45, char e44,
   char e43, char e42, char e41, char e40,
   char e39, char e38, char e37, char e36,
   char e35, char e34, char e33, char e32,
   char e31, char e30, char e29, char e28,
   char e27, char e26, char e25, char e24,
   char e23, char e22, char e21, char e20,
   char e19, char e18, char e17, char e16,
   char e15, char e14, char e13, char e12,
   char e11, char e10, char e09, char e08,
   char e07, char e06, char e05, char e04,
   char e03, char e02, char e01, char e00)
 {
   // GCC vector-extension literal: initializers are assigned to elements
   // 0, 1, 2, ... in the order written.
   return (__m512i) (__v64qi) {
     e00, e01, e02, e03, e04, e05, e06, e07,
     e08, e09, e10, e11, e12, e13, e14, e15,
     e16, e17, e18, e19, e20, e21, e22, e23,
     e24, e25, e26, e27, e28, e29, e30, e31,
     e32, e33, e34, e35, e36, e37, e38, e39,
     e40, e41, e42, e43, e44, e45, e46, e47,
     e48, e49, e50, e51, e52, e53, e54, e55,
     e56, e57, e58, e59, e60, e61, e62, e63
   };
 }
 #endif  // GCC pre-9

 #endif  // __AVX512F__

 #if XNN_ARCH_ARM

 // AArch32 GCC 10+ implements arm_acle.h header, but lacks __ror intrinsic
 #if defined(__GNUC__) && !defined(__clang__)
 // Polyfill for the ACLE __ror intrinsic: rotates the 32-bit value x right by
 // y bits.
 //
 // The rotation amount is reduced modulo 32 so that any y is accepted. Without
 // the masking, y == 0 would evaluate x << 32 (and y >= 32 would evaluate
 // x >> y), both of which are undefined behavior for a 32-bit operand.
 static XNN_INTRINSIC uint32_t __ror(uint32_t x, uint32_t y) {
    const uint32_t shift = y & 31;
    return (x >> shift) | (x << ((32 - shift) & 31));
 }
 #endif  // AArch32 GCC

 #endif  // ARM

 #if XNN_ARCH_ARM && (defined(__ARM_NEON) || defined(__ARM_NEON__))
 #include <arm_neon.h>

 // AArch32 GCC targeting ARMv8 NEON, see
 // - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71233
 // - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95399
 #if defined(__GNUC__) && !defined(__clang__) && (__ARM_ARCH >= 8)
 // Polyfill for vcvtnq_s32_f32: first rounds each lane with vrndnq_f32, then
 // converts the rounded lanes to signed 32-bit integers with vcvtq_s32_f32.
 static XNN_INTRINSIC
 int32x4_t vcvtnq_s32_f32(float32x4_t v) {
   const float32x4_t rounded = vrndnq_f32(v);
   return vcvtq_s32_f32(rounded);
 }
 #endif  // AArch32 GCC targeting ARMv8 NEON

 #endif  // ARM NEON

 #if XNN_ARCH_ARM64
 #include <arm_neon.h>

 // AArch64 GCC pre-8, 8.1-8.4, 9.1-9.3
 #if defined(__GNUC__) && !defined(__clang__) && \
   (__GNUC__ < 8 || __GNUC__ == 8 && __GNUC_MINOR__ < 5 || __GNUC__ == 9 && __GNUC_MINOR__ < 4)
 // Polyfill for vld1q_u8_x4: loads four consecutive 16-byte vectors (64 bytes
 // in total) starting at address, using one vld1q_u8 per vector.
 static XNN_INTRINSIC
 uint8x16x4_t vld1q_u8_x4(const uint8_t* address) {
   uint8x16x4_t result;
   for (int i = 0; i < 4; i++) {
     result.val[i] = vld1q_u8(address + 16 * i);
   }
   return result;
 }
 #endif  // AArch64 GCC pre-8, 8.1-8.4, 9.1-9.3

 #endif  // ARM64 NEON
	// Copyright 2019 Google LLC
	//
	// This source code is licensed under the BSD-style license found in the
	// LICENSE file in the root directory of this source tree.

	#pragma once

	#include <xnnpack/common.h>
	#include <xnnpack/unaligned.h>


	#if defined(__SSE2__)
	#include <emmintrin.h>

	// GCC pre-11, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-16
	#if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && __GNUC__ < 11) || \
	    (defined(__clang__) && !defined(__apple_build_version__) && (__clang_major__ < 8)) || \
	    (defined(__clang__) && defined(__ANDROID__) && (__clang_major__ == 8) && (__clang_minor__ == 0) && (__clang_patchlevel__ < 7)) || \
	    (defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \
	    (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1600))

	// Polyfill for _mm_storeu_si32: writes the lowest 32-bit lane of v to
	// address through unaligned_store_u32.
	static XNN_INTRINSIC
	void _mm_storeu_si32(void* address, __m128i v) {
	  unaligned_store_u32(address, (uint32_t) _mm_cvtsi128_si32(v));
	}

	// Polyfill for _mm_storeu_si16: writes the lowest 16-bit lane of v to
	// address through unaligned_store_u16.
	static XNN_INTRINSIC
	void _mm_storeu_si16(void* address, __m128i v) {
	  unaligned_store_u16(address, (uint16_t) _mm_extract_epi16(v, 0));
	}
	#endif  // GCC pre-11, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-16
	#endif // SSE2

	#ifdef __AVX512F__
	#include <immintrin.h>

	// GCC pre-7, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, ICC pre-18, and MSVC pre-2019
	#if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 7)) || \
	    (defined(__clang__) && !defined(__apple_build_version__) && (__clang_major__ < 8)) || \
	    (defined(__clang__) && defined(__ANDROID__) && (__clang_major__ == 8) && (__clang_minor__ == 0) && (__clang_patchlevel__ < 7)) || \
	    (defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \
	    (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800)) || \
	    (defined(_MSC_VER) && !defined(__clang__) && !defined(__GNUC__) && (_MSC_VER <= 1916))

	// Polyfill for _cvtu32_mask16: converts an unsigned integer into a 16-bit
	// mask value with a plain cast.
	static XNN_INTRINSIC
	__mmask16 _cvtu32_mask16(unsigned int mask) {
	  return (__mmask16) mask;
	}

	// Polyfill for _cvtu64_mask64: converts a 64-bit unsigned integer into a
	// 64-bit mask value with a plain cast.
	static XNN_INTRINSIC
	__mmask64 _cvtu64_mask64(unsigned long long mask) {
	  return (__mmask64) mask;
	}

	// Polyfill for _kshiftli_mask64: shifts the mask a left by count bits,
	// shifting in zeros. A count above 63 yields zero; it is handled explicitly
	// because shifting a 64-bit integer by 64 or more is undefined behavior in C.
	static XNN_INTRINSIC
	__mmask64 _kshiftli_mask64(__mmask64 a, unsigned int count) {
	  if (count > 63) {
	    return (__mmask64) 0;
	  }
	  return (__mmask64) ((unsigned long long) a << count);
	}

	// Polyfill for _kshiftri_mask64: shifts the mask a right by count bits,
	// shifting in zeros. A count above 63 yields zero; it is handled explicitly
	// because shifting a 64-bit integer by 64 or more is undefined behavior in C.
	static XNN_INTRINSIC
	__mmask64 _kshiftri_mask64(__mmask64 a, unsigned int count) {
	  if (count > 63) {
	    return (__mmask64) 0;
	  }
	  return (__mmask64) ((unsigned long long) a >> count);
	}

	#endif  // GCC pre-7, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, ICC pre-18, and MSVC pre-2019

	// GCC pre-7, Clang pre-4, and ICC pre-18
	#if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 7)) || \
	    (defined(__clang__) && (__clang_major__ < 4)) || \
	    (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800))

	// Polyfill for _mm512_reduce_add_ps: horizontally adds the 16 floats of v
	// by folding the vector in halves (512 -> 256 -> 128 -> 64 -> 32 bits) and
	// returns the scalar total.
	static XNN_INTRINSIC
	float _mm512_reduce_add_ps(__m512 v) {
	#if __AVX512DQ__
	  const __m256 sum2 = _mm256_add_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1));
	#else
	  // Without AVX512DQ the upper half is extracted through the f64x4 variant.
	  const __m256 sum2 = _mm256_add_ps(_mm512_castps512_ps256(v), _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)));
	#endif
	  const __m128 sum4 = _mm_add_ps(_mm256_castps256_ps128(sum2), _mm256_extractf128_ps(sum2, 1));
	  const __m128 sum8 = _mm_add_ps(sum4, _mm_movehl_ps(sum4, sum4));
	  const __m128 sum16 = _mm_add_ss(sum8, _mm_movehdup_ps(sum8));
	  return _mm_cvtss_f32(sum16);
	}

	// Polyfill for _mm512_reduce_max_ps: reduces the 16 floats of v with the
	// packed/scalar max operations, folding the vector in halves
	// (512 -> 256 -> 128 -> 64 -> 32 bits), and returns the scalar result.
	static XNN_INTRINSIC
	float _mm512_reduce_max_ps(__m512 v) {
	#if __AVX512DQ__
	  const __m256 max2 = _mm256_max_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1));
	#else
	  // Without AVX512DQ the upper half is extracted through the f64x4 variant.
	  const __m256 max2 = _mm256_max_ps(_mm512_castps512_ps256(v), _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)));
	#endif
	  const __m128 max4 = _mm_max_ps(_mm256_castps256_ps128(max2), _mm256_extractf128_ps(max2, 1));
	  const __m128 max8 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
	  const __m128 max16 = _mm_max_ss(max8, _mm_movehdup_ps(max8));
	  return _mm_cvtss_f32(max16);
	}

	#endif  // GCC pre-7, Clang pre-4, and ICC pre-18

	#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 9)
	// Polyfill for _mm512_set_epi8: builds a 512-bit integer vector from 64
	// individual 8-bit values.
	//
	// Arguments are listed from the highest element (e63) down to the lowest
	// (e00). The vector literal below lists elements in the opposite order,
	// so e00 becomes element 0 of the result and e63 becomes element 63.
	static XNN_INTRINSIC
	__m512i _mm512_set_epi8(
	char e63, char e62, char e61, char e60,
	char e59, char e58, char e57, char e56,
	char e55, char e54, char e53, char e52,
	char e51, char e50, char e49, char e48,
	char e47, char e46, char e45, char e44,
	char e43, char e42, char e41, char e40,
	char e39, char e38, char e37, char e36,
	char e35, char e34, char e33, char e32,
	char e31, char e30, char e29, char e28,
	char e27, char e26, char e25, char e24,
	char e23, char e22, char e21, char e20,
	char e19, char e18, char e17, char e16,
	char e15, char e14, char e13, char e12,
	char e11, char e10, char e09, char e08,
	char e07, char e06, char e05, char e04,
	char e03, char e02, char e01, char e00)
	{
	// GCC vector-extension literal: initializers are assigned to elements
	// 0, 1, 2, ... in the order written.
	return (__m512i) (__v64qi) {
	e00, e01, e02, e03, e04, e05, e06, e07,
	e08, e09, e10, e11, e12, e13, e14, e15,
	e16, e17, e18, e19, e20, e21, e22, e23,
	e24, e25, e26, e27, e28, e29, e30, e31,
	e32, e33, e34, e35, e36, e37, e38, e39,
	e40, e41, e42, e43, e44, e45, e46, e47,
	e48, e49, e50, e51, e52, e53, e54, e55,
	e56, e57, e58, e59, e60, e61, e62, e63
	};
	}
	#endif // GCC pre-9

	#endif // __AVX512F__

	#if XNN_ARCH_ARM

	// AArch32 GCC 10+ implements arm_acle.h header, but lacks __ror intrinsic
	#if defined(__GNUC__) && !defined(__clang__)
	// Polyfill for the ACLE __ror intrinsic: rotates the 32-bit value x right
	// by y bits.
	//
	// The rotation amount is reduced modulo 32 so that any y is accepted.
	// Without the masking, y == 0 would evaluate x << 32 (and y >= 32 would
	// evaluate x >> y), both undefined behavior for a 32-bit operand.
	static XNN_INTRINSIC uint32_t __ror(uint32_t x, uint32_t y) {
	  const uint32_t shift = y & 31;
	  return (x >> shift) | (x << ((32 - shift) & 31));
	}
	#endif // AArch32 GCC

	#endif // ARM

	#if XNN_ARCH_ARM && (defined(__ARM_NEON) || defined(__ARM_NEON__))
	#include <arm_neon.h>

	// AArch32 GCC targeting ARMv8 NEON, see
	// - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71233
	// - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95399
	#if defined(__GNUC__) && !defined(__clang__) && (__ARM_ARCH >= 8)
	// Polyfill for vcvtnq_s32_f32: first rounds each lane with vrndnq_f32, then
	// converts the rounded lanes to signed 32-bit integers with vcvtq_s32_f32.
	static XNN_INTRINSIC
	int32x4_t vcvtnq_s32_f32(float32x4_t v) {
	  return vcvtq_s32_f32(vrndnq_f32(v));
	}
	#endif  // AArch32 GCC targeting ARMv8 NEON

	#endif  // ARM NEON

	#if XNN_ARCH_ARM64
	#include <arm_neon.h>

	// AArch64 GCC pre-8, 8.1-8.4, 9.1-9.3
	#if defined(__GNUC__) && !defined(__clang__) && \
	    (__GNUC__ < 8 || (__GNUC__ == 8 && __GNUC_MINOR__ < 5) || (__GNUC__ == 9 && __GNUC_MINOR__ < 4))
	// Polyfill for vld1q_u8_x4: loads four consecutive 16-byte vectors (64 bytes
	// in total) starting at address, using one vld1q_u8 per vector.
	static XNN_INTRINSIC
	uint8x16x4_t vld1q_u8_x4(const uint8_t* address) {
	  uint8x16x4_t result;
	  result.val[0] = vld1q_u8(address);
	  result.val[1] = vld1q_u8(address + 16);
	  result.val[2] = vld1q_u8(address + 32);
	  result.val[3] = vld1q_u8(address + 48);
	  return result;
	}
	#endif  // AArch64 GCC pre-8, 8.1-8.4, 9.1-9.3

	#endif // ARM64 NEON