Initialize F16 microkernel pointers on x86

Add the production x86 F16 microkernels (F16C, FMA3, and AVX2 variants of GEMM/IGEMM,
DWCONV, GAVGPOOL, VADD/VADDC, VMUL/VMULC, VHSWISH, and VMULCADDC) to the prod source
lists in BUILD.bazel and CMakeLists.txt, and include their implementations in the
per-ISA amalgam files.
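
All of these kernels emulate half-precision arithmetic with F16C conversions around
FP32 math: operands are widened with _mm256_cvtph_ps, computed in FP32, and results
are immediately narrowed back with _mm256_cvtps_ph so intermediates round to FP16 at
every step. A minimal standalone sketch of that round-trip, for reviewers (illustrative
only, not part of this patch; the helper name f16_madd8 is made up; assumes the file is
compiled with -mf16c -mfma):

  #include <immintrin.h>
  #include <stdint.h>

  // acc := fp16(a * b + acc) for 8 half-precision lanes stored as uint16_t.
  static inline void f16_madd8(uint16_t acc[8], const uint16_t a[8], const uint16_t b[8]) {
    const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));    // widen FP16 -> FP32
    const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
    const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) acc));
    const __m256 vr = _mm256_fmadd_ps(va, vb, vacc);                           // FP32 FMA
    _mm_storeu_si128((__m128i*) acc, _mm256_cvtps_ph(vr, _MM_FROUND_NO_EXC));  // narrow back to FP16
  }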
PiperOrigin-RevId: 422911260
diff --git a/BUILD.bazel b/BUILD.bazel
index 2812c32..1347c6f 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -5979,6 +5979,13 @@
PROD_F16C_MICROKERNEL_SRCS = [
"src/f16-f32-vcvt/gen/vcvt-f16c-x16.c",
+ "src/f16-gavgpool/gen/7p7x-minmax-f16c-c8.c",
+ "src/f16-gavgpool/gen/7x-minmax-f16c-c8.c",
+ "src/f16-vbinary/gen/vadd-minmax-f16c-x16.c",
+ "src/f16-vbinary/gen/vaddc-minmax-f16c-x16.c",
+ "src/f16-vbinary/gen/vmul-minmax-f16c-x16.c",
+ "src/f16-vbinary/gen/vmulc-minmax-f16c-x16.c",
+ "src/f16-vhswish/gen/vhswish-f16c-x16.c",
"src/f32-f16-vcvt/gen/vcvt-f16c-x16.c",
]
@@ -6189,6 +6196,10 @@
]
PROD_FMA3_MICROKERNEL_SRCS = [
+ "src/f16-dwconv/gen/up8x25-minmax-fma3-acc2.c",
+ "src/f16-dwconv/gen/up16x4-minmax-fma3.c",
+ "src/f16-dwconv/gen/up16x9-minmax-fma3.c",
+ "src/f16-vmulcaddc/gen/c8-minmax-fma3-2x.c",
"src/f32-dwconv/gen/up8x25-minmax-fma3.c",
"src/f32-dwconv/gen/up16x3-minmax-fma3.c",
"src/f32-dwconv/gen/up16x4-minmax-fma3.c",
@@ -6299,6 +6310,10 @@
]
PROD_AVX2_MICROKERNEL_SRCS = [
+ "src/f16-gemm/gen/1x16-minmax-avx2-broadcast.c",
+ "src/f16-gemm/gen/4x16-minmax-avx2-broadcast.c",
+ "src/f16-igemm/gen/1x16-minmax-avx2-broadcast.c",
+ "src/f16-igemm/gen/4x16-minmax-avx2-broadcast.c",
"src/f32-qs8-vcvt/gen/vcvt-avx2-x64.c",
"src/f32-qu8-vcvt/gen/vcvt-avx2-x64.c",
"src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x56.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6696e86..93605b7 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4727,6 +4727,13 @@
SET(PROD_F16C_MICROKERNEL_SRCS
src/f16-f32-vcvt/gen/vcvt-f16c-x16.c
+ src/f16-gavgpool/gen/7p7x-minmax-f16c-c8.c
+ src/f16-gavgpool/gen/7x-minmax-f16c-c8.c
+ src/f16-vbinary/gen/vadd-minmax-f16c-x16.c
+ src/f16-vbinary/gen/vaddc-minmax-f16c-x16.c
+ src/f16-vbinary/gen/vmul-minmax-f16c-x16.c
+ src/f16-vbinary/gen/vmulc-minmax-f16c-x16.c
+ src/f16-vhswish/gen/vhswish-f16c-x16.c
src/f32-f16-vcvt/gen/vcvt-f16c-x16.c)
SET(ALL_F16C_MICROKERNEL_SRCS
@@ -4933,6 +4940,10 @@
src/qu8-vaddc/gen/minmax-xop-mul32-ld32-x16.c)
SET(PROD_FMA3_MICROKERNEL_SRCS
+ src/f16-dwconv/gen/up8x25-minmax-fma3-acc2.c
+ src/f16-dwconv/gen/up16x4-minmax-fma3.c
+ src/f16-dwconv/gen/up16x9-minmax-fma3.c
+ src/f16-vmulcaddc/gen/c8-minmax-fma3-2x.c
src/f32-dwconv/gen/up8x25-minmax-fma3.c
src/f32-dwconv/gen/up16x3-minmax-fma3.c
src/f32-dwconv/gen/up16x4-minmax-fma3.c
@@ -5041,6 +5052,10 @@
src/math/sqrt-fma3-nr2fma.c)
SET(PROD_AVX2_MICROKERNEL_SRCS
+ src/f16-gemm/gen/1x16-minmax-avx2-broadcast.c
+ src/f16-gemm/gen/4x16-minmax-avx2-broadcast.c
+ src/f16-igemm/gen/1x16-minmax-avx2-broadcast.c
+ src/f16-igemm/gen/4x16-minmax-avx2-broadcast.c
src/f32-qs8-vcvt/gen/vcvt-avx2-x64.c
src/f32-qu8-vcvt/gen/vcvt-avx2-x64.c
src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x56.c
diff --git a/src/amalgam/avx2.c b/src/amalgam/avx2.c
index b534f88..ab30cc7 100644
--- a/src/amalgam/avx2.c
+++ b/src/amalgam/avx2.c
@@ -19,6 +19,622 @@
#include <xnnpack/vunary.h>
+void xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const void*restrict a,
+ size_t a_stride,
+ const void*restrict w,
+ void*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(uint16_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const uint16_t* a0 = a;
+ uint16_t* c0 = c;
+
+ do {
+ __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+ __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
+ w = (const uint16_t*) w + 16;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
+ a0 += 1;
+
+ const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+ const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
+ w = (const uint16_t*) w + 16;
+
+ vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
+ vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
+
+ k -= sizeof(uint16_t);
+ } while (k != 0);
+
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x01234567, vscale), _MM_FROUND_NO_EXC));
+ vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+
+ const __m256 vmin = _mm256_load_ps(params->avx.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
+
+ const __m256 vmax = _mm256_load_ps(params->avx.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
+
+ if XNN_LIKELY(nc >= 16) {
+ _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
+ _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
+ c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
+
+ a0 = (const uint16_t*) ((uintptr_t) a0 - kc);
+
+ nc -= 16;
+ } else {
+ __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
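+ // nc < 16 tail: store 8, 4, 2, then 1 remaining columns, shifting the packed FP16 lanes down after each partial store.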
+ if (nc & 8) {
+ _mm_storeu_si128((__m128i*) c0, vh0x01234567);
+
+ vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
+
+ c0 += 8;
+ }
+ if (nc & 4) {
+ _mm_storel_epi64((__m128i*) c0, vh0x01234567);
+
+ vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
+
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storeu_si32(c0, vh0x01234567);
+
+ vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
+
+ c0 += 2;
+ }
+ if (nc & 1) {
+ *c0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
+
+void xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const void*restrict a,
+ size_t a_stride,
+ const void*restrict w,
+ void*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(uint16_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const uint16_t* a0 = a;
+ uint16_t* c0 = c;
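+ // Rows beyond mr alias the previous row's pointers, so out-of-range rows redundantly recompute into already-valid memory.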
+ const uint16_t* a1 = (const uint16_t*) ((uintptr_t) a0 + a_stride);
+ uint16_t* c1 = (uint16_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const uint16_t* a2 = (const uint16_t*) ((uintptr_t) a1 + a_stride);
+ uint16_t* c2 = (uint16_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const uint16_t* a3 = (const uint16_t*) ((uintptr_t) a2 + a_stride);
+ uint16_t* c3 = (uint16_t*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+ __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
+ w = (const uint16_t*) w + 16;
+
+ size_t k = kc;
+ do {
+ const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
+ a0 += 1;
+ const __m256 va1 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a1));
+ a1 += 1;
+ const __m256 va2 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a2));
+ a2 += 1;
+ const __m256 va3 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a3));
+ a3 += 1;
+
+ const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+ const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
+ w = (const uint16_t*) w + 16;
+
+ vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
+ vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb01234567, vacc1x01234567), _MM_FROUND_NO_EXC));
+ vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb01234567, vacc2x01234567), _MM_FROUND_NO_EXC));
+ vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb01234567, vacc3x01234567), _MM_FROUND_NO_EXC));
+ vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
+ vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb89ABCDEF, vacc1x89ABCDEF), _MM_FROUND_NO_EXC));
+ vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF), _MM_FROUND_NO_EXC));
+ vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF), _MM_FROUND_NO_EXC));
+
+ k -= sizeof(uint16_t);
+ } while (k != 0);
+
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x01234567, vscale), _MM_FROUND_NO_EXC));
+ vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc1x01234567, vscale), _MM_FROUND_NO_EXC));
+ vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc2x01234567, vscale), _MM_FROUND_NO_EXC));
+ vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc3x01234567, vscale), _MM_FROUND_NO_EXC));
+ vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+ vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc1x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+ vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc2x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+ vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc3x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+
+ const __m256 vmin = _mm256_load_ps(params->avx.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
+ vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
+ vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
+ vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
+
+ const __m256 vmax = _mm256_load_ps(params->avx.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
+ vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
+ vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
+ vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
+
+ if XNN_LIKELY(nc >= 16) {
+ _mm_storeu_si128((__m128i*) c3, _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC));
+ _mm_storeu_si128((__m128i*) (c3 + 8), _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC));
+ c3 = (uint16_t*) ((uintptr_t) c3 + cn_stride);
+ _mm_storeu_si128((__m128i*) c2, _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC));
+ _mm_storeu_si128((__m128i*) (c2 + 8), _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC));
+ c2 = (uint16_t*) ((uintptr_t) c2 + cn_stride);
+ _mm_storeu_si128((__m128i*) c1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC));
+ _mm_storeu_si128((__m128i*) (c1 + 8), _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC));
+ c1 = (uint16_t*) ((uintptr_t) c1 + cn_stride);
+ _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
+ _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
+ c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
+
+ a3 = (const uint16_t*) ((uintptr_t) a3 - kc);
+ a2 = (const uint16_t*) ((uintptr_t) a2 - kc);
+ a1 = (const uint16_t*) ((uintptr_t) a1 - kc);
+ a0 = (const uint16_t*) ((uintptr_t) a0 - kc);
+
+ nc -= 16;
+ } else {
+ __m128i vh3x01234567 = _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC);
+ __m128i vh2x01234567 = _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC);
+ __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC);
+ __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
+ if (nc & 8) {
+ _mm_storeu_si128((__m128i*) c3, vh3x01234567);
+ _mm_storeu_si128((__m128i*) c2, vh2x01234567);
+ _mm_storeu_si128((__m128i*) c1, vh1x01234567);
+ _mm_storeu_si128((__m128i*) c0, vh0x01234567);
+
+ vh3x01234567 = _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC);
+ vh2x01234567 = _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC);
+ vh1x01234567 = _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC);
+ vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
+
+ c3 += 8;
+ c2 += 8;
+ c1 += 8;
+ c0 += 8;
+ }
+ if (nc & 4) {
+ _mm_storel_epi64((__m128i*) c3, vh3x01234567);
+ _mm_storel_epi64((__m128i*) c2, vh2x01234567);
+ _mm_storel_epi64((__m128i*) c1, vh1x01234567);
+ _mm_storel_epi64((__m128i*) c0, vh0x01234567);
+
+ vh3x01234567 = _mm_unpackhi_epi64(vh3x01234567, vh3x01234567);
+ vh2x01234567 = _mm_unpackhi_epi64(vh2x01234567, vh2x01234567);
+ vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567);
+ vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
+
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storeu_si32(c3, vh3x01234567);
+ _mm_storeu_si32(c2, vh2x01234567);
+ _mm_storeu_si32(c1, vh1x01234567);
+ _mm_storeu_si32(c0, vh0x01234567);
+
+ vh3x01234567 = _mm_srli_epi64(vh3x01234567, 32);
+ vh2x01234567 = _mm_srli_epi64(vh2x01234567, 32);
+ vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32);
+ vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
+
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ *c3 = (uint16_t) _mm_extract_epi16(vh3x01234567, 0);
+ *c2 = (uint16_t) _mm_extract_epi16(vh2x01234567, 0);
+ *c1 = (uint16_t) _mm_extract_epi16(vh1x01234567, 0);
+ *c0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
+
+void xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const void**restrict a,
+ const void*restrict w,
+ void*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const void* zero,
+ const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(uint16_t) == 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(uint16_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ uint16_t* c0 = c;
+
+ do {
+ __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+ __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
+ w = (const uint16_t*) w + 16;
+
+ size_t p = ks;
+ do {
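+ // a[] is an indirection buffer of row pointers; entries equal to the zero buffer are padding rows and are not adjusted by a_offset.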
+ const uint16_t* restrict a0 = (const uint16_t*) a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const uint16_t*) ((uintptr_t) a0 + a_offset);
+ }
+ a += 1;
+
+ size_t k = kc;
+ do {
+ const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+ const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
+ w = (const uint16_t*) w + 16;
+
+ const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
+ a0 += 1;
+
+ vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
+ vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
+
+ k -= sizeof(uint16_t);
+ } while (k != 0);
+ p -= 1 * sizeof(void*);
+ } while (p != 0);
+
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x01234567, vscale), _MM_FROUND_NO_EXC));
+ vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+
+ const __m256 vmin = _mm256_load_ps(params->avx.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
+
+ const __m256 vmax = _mm256_load_ps(params->avx.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
+
+ if XNN_LIKELY(nc >= 16) {
+ _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
+ _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
+ c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const void**restrict) ((uintptr_t) a - ks);
+ nc -= 16;
+ } else {
+ __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
+ if (nc & 8) {
+ _mm_storeu_si128((__m128i*) c0, vh0x01234567);
+
+ vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
+
+ c0 += 8;
+ }
+ if (nc & 4) {
+ _mm_storel_epi64((__m128i*) c0, vh0x01234567);
+
+ vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
+
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storeu_si32(c0, vh0x01234567);
+
+ vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
+
+ c0 += 2;
+ }
+ if (nc & 1) {
+ *c0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
+
+void xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const void**restrict a,
+ const void*restrict w,
+ void*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const void* zero,
+ const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(uint16_t) == 0);
+ assert(ks != 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(uint16_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ uint16_t* c0 = c;
+ uint16_t* c1 = (uint16_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ uint16_t* c2 = (uint16_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ uint16_t* c3 = (uint16_t*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ c3 = c2;
+ }
+
+ do {
+ __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+ __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
+ __m256 vacc1x01234567 = vacc0x01234567;
+ __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
+ __m256 vacc2x01234567 = vacc0x01234567;
+ __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
+ __m256 vacc3x01234567 = vacc0x01234567;
+ __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
+ w = (const uint16_t*) w + 16;
+
+ size_t p = ks;
+ do {
+ const uint16_t* restrict a0 = (const uint16_t*) a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const uint16_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const uint16_t* restrict a1 = (const uint16_t*) a[1];
+ assert(a1 != NULL);
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const uint16_t*) ((uintptr_t) a1 + a_offset);
+ }
+ const uint16_t* restrict a2 = (const uint16_t*) a[2];
+ assert(a2 != NULL);
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const uint16_t*) ((uintptr_t) a2 + a_offset);
+ }
+ const uint16_t* restrict a3 = (const uint16_t*) a[3];
+ assert(a3 != NULL);
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const uint16_t*) ((uintptr_t) a3 + a_offset);
+ }
+ a += 4;
+
+ size_t k = kc;
+ do {
+ const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+ const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
+ w = (const uint16_t*) w + 16;
+
+ const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
+ a0 += 1;
+ const __m256 va1 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a1));
+ a1 += 1;
+ const __m256 va2 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a2));
+ a2 += 1;
+ const __m256 va3 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a3));
+ a3 += 1;
+
+ vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
+ vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
+ vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb01234567, vacc1x01234567), _MM_FROUND_NO_EXC));
+ vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb89ABCDEF, vacc1x89ABCDEF), _MM_FROUND_NO_EXC));
+ vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb01234567, vacc2x01234567), _MM_FROUND_NO_EXC));
+ vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF), _MM_FROUND_NO_EXC));
+ vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb01234567, vacc3x01234567), _MM_FROUND_NO_EXC));
+ vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF), _MM_FROUND_NO_EXC));
+
+ k -= sizeof(uint16_t);
+ } while (k != 0);
+ p -= 4 * sizeof(void*);
+ } while (p != 0);
+
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x01234567, vscale), _MM_FROUND_NO_EXC));
+ vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc1x01234567, vscale), _MM_FROUND_NO_EXC));
+ vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc2x01234567, vscale), _MM_FROUND_NO_EXC));
+ vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc3x01234567, vscale), _MM_FROUND_NO_EXC));
+ vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+ vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc1x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+ vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc2x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+ vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc3x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+
+ const __m256 vmin = _mm256_load_ps(params->avx.min);
+ vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+ vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+ vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+ vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+ vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
+ vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
+ vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
+ vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
+
+ const __m256 vmax = _mm256_load_ps(params->avx.max);
+ vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+ vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+ vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+ vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+ vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
+ vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
+ vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
+ vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
+
+ if XNN_LIKELY(nc >= 16) {
+ _mm_storeu_si128((__m128i*) c3, _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC));
+ _mm_storeu_si128((__m128i*) (c3 + 8), _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC));
+ c3 = (uint16_t*) ((uintptr_t) c3 + cn_stride);
+ _mm_storeu_si128((__m128i*) c2, _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC));
+ _mm_storeu_si128((__m128i*) (c2 + 8), _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC));
+ c2 = (uint16_t*) ((uintptr_t) c2 + cn_stride);
+ _mm_storeu_si128((__m128i*) c1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC));
+ _mm_storeu_si128((__m128i*) (c1 + 8), _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC));
+ c1 = (uint16_t*) ((uintptr_t) c1 + cn_stride);
+ _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
+ _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
+ c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const void**restrict) ((uintptr_t) a - ks);
+ nc -= 16;
+ } else {
+ __m128i vh3x01234567 = _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC);
+ __m128i vh2x01234567 = _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC);
+ __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC);
+ __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
+ if (nc & 8) {
+ _mm_storeu_si128((__m128i*) c3, vh3x01234567);
+ _mm_storeu_si128((__m128i*) c2, vh2x01234567);
+ _mm_storeu_si128((__m128i*) c1, vh1x01234567);
+ _mm_storeu_si128((__m128i*) c0, vh0x01234567);
+
+ vh3x01234567 = _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC);
+ vh2x01234567 = _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC);
+ vh1x01234567 = _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC);
+ vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
+
+ c3 += 8;
+ c2 += 8;
+ c1 += 8;
+ c0 += 8;
+ }
+ if (nc & 4) {
+ _mm_storel_epi64((__m128i*) c3, vh3x01234567);
+ _mm_storel_epi64((__m128i*) c2, vh2x01234567);
+ _mm_storel_epi64((__m128i*) c1, vh1x01234567);
+ _mm_storel_epi64((__m128i*) c0, vh0x01234567);
+
+ vh3x01234567 = _mm_unpackhi_epi64(vh3x01234567, vh3x01234567);
+ vh2x01234567 = _mm_unpackhi_epi64(vh2x01234567, vh2x01234567);
+ vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567);
+ vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
+
+ c3 += 4;
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+ }
+ if (nc & 2) {
+ _mm_storeu_si32(c3, vh3x01234567);
+ _mm_storeu_si32(c2, vh2x01234567);
+ _mm_storeu_si32(c1, vh1x01234567);
+ _mm_storeu_si32(c0, vh0x01234567);
+
+ vh3x01234567 = _mm_srli_epi64(vh3x01234567, 32);
+ vh2x01234567 = _mm_srli_epi64(vh2x01234567, 32);
+ vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32);
+ vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
+
+ c3 += 2;
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+ }
+ if (nc & 1) {
+ *c3 = (uint16_t) _mm_extract_epi16(vh3x01234567, 0);
+ *c2 = (uint16_t) _mm_extract_epi16(vh2x01234567, 0);
+ *c1 = (uint16_t) _mm_extract_epi16(vh1x01234567, 0);
+ *c0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
+
void xnn_f32_qs8_vcvt_ukernel__avx2_x64(
size_t n,
const float* x,
diff --git a/src/amalgam/f16c.c b/src/amalgam/f16c.c
index 34df7ed..7cbaef9 100644
--- a/src/amalgam/f16c.c
+++ b/src/amalgam/f16c.c
@@ -8,8 +8,12 @@
#include <immintrin.h>
#include <xnnpack/common.h>
+#include <xnnpack/gavgpool.h>
#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
#include <xnnpack/vcvt.h>
+#include <xnnpack/vunary.h>
void xnn_f16_f32_vcvt_ukernel__f16c_x16(
@@ -62,6 +66,690 @@
}
}
+void xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8(
+ size_t rows,
+ size_t channels,
+ const void* input,
+ size_t input_stride,
+ const void* zero,
+ void* buffer,
+ void* output,
+ const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
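+ // Multipass average pooling: the first 7 rows are summed into the scratch buffer, each further group of 7 rows is accumulated into it, and the final pass adds the remaining rows (padding with the zero buffer), applies the scale from params, and clamps to [min, max].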
+ const uint16_t* i0 = input;
+ const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
+ const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
+ const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
+ const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
+ const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
+ const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t);
+
+ uint16_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 8)) {
+ const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
+ const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
+
+ const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
+ __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
+
+ const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
+
+ _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
+ i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
+ i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
+ i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
+ i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
+ i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
+
+ uint16_t* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 8)) {
+ __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) b);
+
+ const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
+
+ const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
+
+ _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
+ }
+ }
+
+ i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
+ i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = (const uint16_t*) zero;
+ }
+ i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = (const uint16_t*) zero;
+ }
+ i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = (const uint16_t*) zero;
+ }
+ i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = (const uint16_t*) zero;
+ }
+ i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = (const uint16_t*) zero;
+ }
+ i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = (const uint16_t*) zero;
+ }
+
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ const __m256 vmin = _mm256_load_ps(params->avx.min);
+ const __m256 vmax = _mm256_load_ps(params->avx.max);
+ for (; channels >= 8; channels -= 8) {
+ __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
+
+ const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
+
+ const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
+
+ vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
+
+ __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
+
+ vout01234567 = _mm256_min_ps(vout01234567, vmax);
+
+ _mm_storeu_si128((__m128i*) output, _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC));
+ output = (uint16_t*) output + 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ {
+ __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
+
+ const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
+ const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
+
+ vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
+ __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
+ vout01234567 = _mm256_min_ps(vout01234567, vmax);
+
+ __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC);
+ if (channels & 4) {
+ _mm_storel_epi64((__m128i*) output, vh01234567);
+ output = (uint16_t*) output + 4;
+ vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
+ }
+ if (channels & 2) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vh01234567);
+ output = (uint16_t*) output + 2;
+ vh01234567 = _mm_srli_epi64(vh01234567, 32);
+ }
+ if (channels & 1) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vh01234567, 0);
+ }
+ }
+ }
+}
+
+void xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8(
+ size_t rows,
+ size_t channels,
+ const void* input,
+ size_t input_stride,
+ const void* zero,
+ void* output,
+ const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
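+ // Single-pass variant for up to 7 rows: row pointers beyond rows are redirected to the zero buffer so they do not affect the sum.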
+ const uint16_t* i0 = input;
+ const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = (const uint16_t*) zero;
+ }
+ const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = (const uint16_t*) zero;
+ }
+ const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = (const uint16_t*) zero;
+ }
+ const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = (const uint16_t*) zero;
+ }
+ const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = (const uint16_t*) zero;
+ }
+ const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = (const uint16_t*) zero;
+ }
+
+ const __m256 vscale = _mm256_load_ps(params->avx.scale);
+ const __m256 vmin = _mm256_load_ps(params->avx.min);
+ const __m256 vmax = _mm256_load_ps(params->avx.max);
+ for (; channels >= 8; channels -= 8) {
+ const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+ i0 += 8;
+ const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+ i1 += 8;
+
+ const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+ __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
+ i2 += 8;
+
+ const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+ i3 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+ i4 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+ i5 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+ i6 += 8;
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
+
+ vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
+
+ __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
+
+ vout01234567 = _mm256_min_ps(vout01234567, vmax);
+
+ _mm_storeu_si128((__m128i*) output, _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC));
+ output = (uint16_t*) output + 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ {
+ const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+ const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+
+ const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+ __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
+
+ const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
+ const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
+ vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
+
+ vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
+ __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
+ vout01234567 = _mm256_min_ps(vout01234567, vmax);
+
+ __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC);
+ if (channels & 4) {
+ _mm_storel_epi64((__m128i*) output, vh01234567);
+ output = (uint16_t*) output + 4;
+ vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
+ }
+ if (channels & 2) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vh01234567);
+ output = (uint16_t*) output + 2;
+ vh01234567 = _mm_srli_epi64(vh01234567, 32);
+ }
+ if (channels & 1) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vh01234567, 0);
+ }
+ }
+ }
+}
+
+void xnn_f16_vadd_minmax_ukernel__f16c_x16(
+ size_t n,
+ const void* restrict a_ptr,
+ const void* restrict b_ptr,
+ void* restrict y_ptr,
+ const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(uint16_t) == 0);
+ assert(a_ptr != NULL);
+ assert(b_ptr != NULL);
+ assert(y_ptr != NULL);
+
+ const uint16_t* a = (const uint16_t*) a_ptr;
+ const uint16_t* b = (const uint16_t*) b_ptr;
+ uint16_t* y = (uint16_t*) y_ptr;
+
+ const __m256 vy_min = _mm256_load_ps(params->avx.min);
+ const __m256 vy_max = _mm256_load_ps(params->avx.max);
+
+ for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
+ const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+ const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
+ const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
+ const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
+ a += 16;
+ b += 16;
+
+ __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
+ __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
+
+
+ vy01234567 = _mm256_max_ps(vy01234567, vy_min);
+ vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
+
+ vy01234567 = _mm256_min_ps(vy01234567, vy_max);
+ vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
+
+ _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
+ _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
+ y += 16;
+ }
+ for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
+ const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+ const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
+ a += 8;
+ b += 8;
+
+ __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
+
+ vy = _mm256_max_ps(vy, vy_min);
+ vy = _mm256_min_ps(vy, vy_max);
+
+ _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+ const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
+
+ __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
+
+ vy = _mm256_max_ps(vy, vy_min);
+ vy = _mm256_min_ps(vy, vy_max);
+
+ __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
+ if (n & (4 * sizeof(uint16_t))) {
+ _mm_storel_epi64((__m128i*) y, vh);
+ vh = _mm_unpackhi_epi64(vh, vh);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint16_t))) {
+ *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vh);
+ vh = _mm_srli_epi64(vh, 32);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint16_t))) {
+ *y = (uint16_t) _mm_extract_epi16(vh, 0);
+ }
+ }
+}
+
+void xnn_f16_vaddc_minmax_ukernel__f16c_x16(
+ size_t n,
+ const void* restrict a_ptr,
+ const void* restrict b_ptr,
+ void* restrict y_ptr,
+ const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(uint16_t) == 0);
+ assert(a_ptr != NULL);
+ assert(b_ptr != NULL);
+ assert(y_ptr != NULL);
+
+ const uint16_t* a = (const uint16_t*) a_ptr;
+ const uint16_t* b = (const uint16_t*) b_ptr;
+ uint16_t* y = (uint16_t*) y_ptr;
+
+ const __m256 vy_min = _mm256_load_ps(params->avx.min);
+ const __m256 vy_max = _mm256_load_ps(params->avx.max);
+
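+ // The second operand is a single FP16 scalar; broadcast it once outside the loops.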
+ const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
+ for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
+ const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+ const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
+ a += 16;
+
+ __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va01234567, vb), _MM_FROUND_NO_EXC));
+ __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
+
+
+ vy01234567 = _mm256_max_ps(vy01234567, vy_min);
+ vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
+
+ vy01234567 = _mm256_min_ps(vy01234567, vy_max);
+ vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
+
+ _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
+ _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
+ y += 16;
+ }
+ for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
+ const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+ a += 8;
+
+ __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
+
+ vy = _mm256_max_ps(vy, vy_min);
+ vy = _mm256_min_ps(vy, vy_max);
+
+ _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+
+ __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
+
+ vy = _mm256_max_ps(vy, vy_min);
+ vy = _mm256_min_ps(vy, vy_max);
+
+ __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
+ if (n & (4 * sizeof(uint16_t))) {
+ _mm_storel_epi64((__m128i*) y, vh);
+ vh = _mm_unpackhi_epi64(vh, vh);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint16_t))) {
+ *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vh);
+ vh = _mm_srli_epi64(vh, 32);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint16_t))) {
+ *y = (uint16_t) _mm_extract_epi16(vh, 0);
+ }
+ }
+}
+
+void xnn_f16_vmul_minmax_ukernel__f16c_x16(
+ size_t n,
+ const void* restrict a_ptr,
+ const void* restrict b_ptr,
+ void* restrict y_ptr,
+ const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(uint16_t) == 0);
+ assert(a_ptr != NULL);
+ assert(b_ptr != NULL);
+ assert(y_ptr != NULL);
+
+ const uint16_t* a = (const uint16_t*) a_ptr;
+ const uint16_t* b = (const uint16_t*) b_ptr;
+ uint16_t* y = (uint16_t*) y_ptr;
+
+ const __m256 vy_min = _mm256_load_ps(params->avx.min);
+ const __m256 vy_max = _mm256_load_ps(params->avx.max);
+
+ for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
+ const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+ const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
+ const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
+ const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
+ a += 16;
+ b += 16;
+
+ __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
+ __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
+
+
+ vy01234567 = _mm256_max_ps(vy01234567, vy_min);
+ vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
+
+ vy01234567 = _mm256_min_ps(vy01234567, vy_max);
+ vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
+
+ _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
+ _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
+ y += 16;
+ }
+ for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
+ const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+ const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
+ a += 8;
+ b += 8;
+
+ __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
+
+ vy = _mm256_max_ps(vy, vy_min);
+ vy = _mm256_min_ps(vy, vy_max);
+
+ _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+ const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
+
+ __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
+
+ vy = _mm256_max_ps(vy, vy_min);
+ vy = _mm256_min_ps(vy, vy_max);
+
+ __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
+ if (n & (4 * sizeof(uint16_t))) {
+ _mm_storel_epi64((__m128i*) y, vh);
+ vh = _mm_unpackhi_epi64(vh, vh);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint16_t))) {
+ *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vh);
+ vh = _mm_srli_epi64(vh, 32);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint16_t))) {
+ *y = (uint16_t) _mm_extract_epi16(vh, 0);
+ }
+ }
+}
+
+void xnn_f16_vmulc_minmax_ukernel__f16c_x16(
+ size_t n,
+ const void* restrict a_ptr,
+ const void* restrict b_ptr,
+ void* restrict y_ptr,
+ const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(uint16_t) == 0);
+ assert(a_ptr != NULL);
+ assert(b_ptr != NULL);
+ assert(y_ptr != NULL);
+
+ const uint16_t* a = (const uint16_t*) a_ptr;
+ const uint16_t* b = (const uint16_t*) b_ptr;
+ uint16_t* y = (uint16_t*) y_ptr;
+
+ const __m256 vy_min = _mm256_load_ps(params->avx.min);
+ const __m256 vy_max = _mm256_load_ps(params->avx.max);
+
+ const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
+ for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
+ const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+ const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
+ a += 16;
+
+ __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va01234567, vb), _MM_FROUND_NO_EXC));
+ __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
+
+
+ vy01234567 = _mm256_max_ps(vy01234567, vy_min);
+ vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
+
+ vy01234567 = _mm256_min_ps(vy01234567, vy_max);
+ vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
+
+ _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
+ _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
+ y += 16;
+ }
+ for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
+ const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+ a += 8;
+
+ __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
+
+ vy = _mm256_max_ps(vy, vy_min);
+ vy = _mm256_min_ps(vy, vy_max);
+
+ _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+
+ __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
+
+ vy = _mm256_max_ps(vy, vy_min);
+ vy = _mm256_min_ps(vy, vy_max);
+
+ __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
+ if (n & (4 * sizeof(uint16_t))) {
+ _mm_storel_epi64((__m128i*) y, vh);
+ vh = _mm_unpackhi_epi64(vh, vh);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint16_t))) {
+ *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vh);
+ vh = _mm_srli_epi64(vh, 32);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint16_t))) {
+ *y = (uint16_t) _mm_extract_epi16(vh, 0);
+ }
+ }
+}
+
+void xnn_f16_vhswish_ukernel__f16c_x16(
+ size_t n,
+ const void* restrict x_ptr,
+ void* restrict y_ptr,
+ const union xnn_f16_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(uint16_t) == 0);
+
+ const uint16_t* x = (const uint16_t*) x_ptr;
+ uint16_t* y = (uint16_t*) y_ptr;
+
+ const __m256 vsixth = _mm256_load_ps(params->avx.sixth);
+ const __m256 vthree = _mm256_load_ps(params->avx.three);
+ const __m128i vsix = _mm_load_si128((const __m128i*) params->avx.six);
+ const __m128i vzero = _mm_setzero_si128();
+
+ for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
+ __m256 vx01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
+ __m256 vx89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (x + 8)));
+ x += 16;
+
+ __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vx01234567, vthree), _MM_FROUND_NO_EXC);
+ vx01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx01234567, vsixth), _MM_FROUND_NO_EXC));
+ __m128i vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(vx89ABCDEF, vthree), _MM_FROUND_NO_EXC);
+ vx89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx89ABCDEF, vsixth), _MM_FROUND_NO_EXC));
+
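+ // Clamp (x + 3) to [0, 6] with integer min/max on the FP16 bit patterns: non-negative half-precision values order the same as their int16 encodings, and negative inputs (sign bit set) compare below zero and clamp to 0.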
+ vacc01234567 = _mm_max_epi16(vacc01234567, vzero);
+ vacc89ABCDEF = _mm_max_epi16(vacc89ABCDEF, vzero);
+
+ vacc01234567 = _mm_min_epi16(vacc01234567, vsix);
+ vacc89ABCDEF = _mm_min_epi16(vacc89ABCDEF, vsix);
+
+ vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vx01234567), _MM_FROUND_NO_EXC);
+ vacc89ABCDEF = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc89ABCDEF), vx89ABCDEF), _MM_FROUND_NO_EXC);
+
+ _mm_storeu_si128((__m128i*) y, vacc01234567);
+ _mm_storeu_si128((__m128i*) (y + 8), vacc89ABCDEF);
+ y += 16;
+ }
+ for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
+ __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
+ x += 8;
+ __m128i vacc = _mm256_cvtps_ph(_mm256_add_ps(vx, vthree), _MM_FROUND_NO_EXC);
+ vx = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx, vsixth), _MM_FROUND_NO_EXC));
+ vacc = _mm_max_epi16(vacc, vzero);
+ vacc = _mm_min_epi16(vacc, vsix);
+ vacc = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc), vx), _MM_FROUND_NO_EXC);
+ _mm_storeu_si128((__m128i*) y, vacc);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
+ __m128i vacc = _mm256_cvtps_ph(_mm256_add_ps(vx, vthree), _MM_FROUND_NO_EXC);
+ vx = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx, vsixth), _MM_FROUND_NO_EXC));
+ vacc = _mm_max_epi16(vacc, vzero);
+ vacc = _mm_min_epi16(vacc, vsix);
+ vacc = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc), vx), _MM_FROUND_NO_EXC);
+
+ if (n & (4 * sizeof(uint16_t))) {
+ _mm_storel_epi64((__m128i*) y, vacc);
+ vacc = _mm_unpackhi_epi64(vacc, vacc);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint16_t))) {
+ *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vacc);
+ vacc = _mm_srli_epi64(vacc, 32);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint16_t))) {
+ *y = (uint16_t) _mm_extract_epi16(vacc, 0);
+ }
+ }
+}
+
void xnn_f32_f16_vcvt_ukernel__f16c_x16(
size_t n,
const float* input,
diff --git a/src/amalgam/fma3.c b/src/amalgam/fma3.c
index 634210e..13053d6 100644
--- a/src/amalgam/fma3.c
+++ b/src/amalgam/fma3.c
@@ -11,9 +11,1091 @@
#include <xnnpack/dwconv.h>
#include <xnnpack/gemm.h>
#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vmulcaddc.h>
#include <xnnpack/vunary.h>
+void xnn_f16_dwconv_minmax_ukernel_up16x4__fma3(
+ size_t channels,
+ size_t output_width,
+ const void** input,
+ const void* weights,
+ void* output,
+ size_t input_stride,
+ size_t output_increment,
+ size_t input_offset,
+ const void* zero,
+ const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(channels != 0);
+ assert(output_width != 0);
+
+ const __m256 vmax = _mm256_load_ps(params->avx.max);
+ const __m256 vmin = _mm256_load_ps(params->avx.min);
+
+ uint16_t* o = (uint16_t*) output;
+ do {
+ const uint16_t* i0 = input[0];
+ assert(i0 != NULL);
+ if XNN_UNPREDICTABLE(i0 != zero) {
+ i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
+ }
+ const uint16_t* i1 = input[1];
+ assert(i1 != NULL);
+ if XNN_UNPREDICTABLE(i1 != zero) {
+ i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
+ }
+ const uint16_t* i2 = input[2];
+ assert(i2 != NULL);
+ if XNN_UNPREDICTABLE(i2 != zero) {
+ i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
+ }
+ const uint16_t* i3 = input[3];
+ assert(i3 != NULL);
+ if XNN_UNPREDICTABLE(i3 != zero) {
+ i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
+ }
+ input = (const void**) ((uintptr_t) input + input_stride);
+
+ size_t c = channels;
+ const uint16_t* w = weights;
+ for (; c >= 16; c -= 16) {
+ __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+ __m256 vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 8)));
+
+
+ const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+ const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 8)));
+ i0 += 16;
+
+ const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 16)));
+ const __m256 vk0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 24)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+ vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x89ABCDEF, vk0x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+ const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 8)));
+ i1 += 16;
+
+ const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 32)));
+ const __m256 vk1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 40)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+ vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x89ABCDEF, vk1x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+ const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i2 + 8)));
+ i2 += 16;
+
+ const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 48)));
+ const __m256 vk2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 56)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+ vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x89ABCDEF, vk2x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+ const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i3 + 8)));
+ i3 += 16;
+
+ const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 64)));
+ const __m256 vk3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 72)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+ vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x89ABCDEF, vk3x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+ w += 80;
+
+
+ __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
+ __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin);
+ vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
+ vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vmax);
+
+ _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
+ _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc89ABCDEF, _MM_FROUND_NO_EXC));
+ o += 16;
+ }
+ for (; c >= 8; c -= 8) {
+ __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+
+ const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+ i0 += 8;
+
+ const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 16)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+ i1 += 8;
+
+ const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 32)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+ i2 += 8;
+
+ const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 48)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+ i3 += 8;
+
+ const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 64)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ w += 8;
+
+
+ __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
+ vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
+
+ _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
+ o += 8;
+ }
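+    // Last 1-7 channels: compute a full 8-wide result (reads past the end are permitted by
+    // XNN_OOB_READS) and store it in 4/2/1-element pieces.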
+ if XNN_UNLIKELY(c != 0) {
+ assert(c >= 1);
+ assert(c <= 7);
+
+ __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+
+ const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+
+ const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 16)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+
+ const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 32)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+
+ const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 48)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+
+ const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 64)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+
+ __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
+ vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
+
+ __m128i vh01234567 = _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC);
+ if (c & 4) {
+ _mm_storel_epi64((__m128i*) o, vh01234567);
+ vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
+ o += 4;
+ }
+ if (c & 2) {
+ *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vh01234567);
+ vh01234567 = _mm_srli_epi64(vh01234567, 32);
+ o += 2;
+ }
+ if (c & 1) {
+ *((uint16_t*) o) = (uint16_t) _mm_extract_epi16(vh01234567, 0);
+ o += 1;
+ }
+ }
+
+ o = (uint16_t*) ((uintptr_t) o + output_increment);
+ } while (--output_width != 0);
+}
+
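+// 9-tap (3x3) depthwise convolution with the same structure as the 4-tap kernel above: 16 channels
+// per iteration and a single fp16-rounded accumulator per group of 8 channels.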
+void xnn_f16_dwconv_minmax_ukernel_up16x9__fma3(
+ size_t channels,
+ size_t output_width,
+ const void** input,
+ const void* weights,
+ void* output,
+ size_t input_stride,
+ size_t output_increment,
+ size_t input_offset,
+ const void* zero,
+ const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(channels != 0);
+ assert(output_width != 0);
+
+ const __m256 vmax = _mm256_load_ps(params->avx.max);
+ const __m256 vmin = _mm256_load_ps(params->avx.min);
+
+ uint16_t* o = (uint16_t*) output;
+ do {
+ const uint16_t* i0 = input[0];
+ assert(i0 != NULL);
+ if XNN_UNPREDICTABLE(i0 != zero) {
+ i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
+ }
+ const uint16_t* i1 = input[1];
+ assert(i1 != NULL);
+ if XNN_UNPREDICTABLE(i1 != zero) {
+ i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
+ }
+ const uint16_t* i2 = input[2];
+ assert(i2 != NULL);
+ if XNN_UNPREDICTABLE(i2 != zero) {
+ i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
+ }
+ const uint16_t* i3 = input[3];
+ assert(i3 != NULL);
+ if XNN_UNPREDICTABLE(i3 != zero) {
+ i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
+ }
+ const uint16_t* i4 = input[4];
+ assert(i4 != NULL);
+ if XNN_UNPREDICTABLE(i4 != zero) {
+ i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
+ }
+ const uint16_t* i5 = input[5];
+ assert(i5 != NULL);
+ if XNN_UNPREDICTABLE(i5 != zero) {
+ i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
+ }
+ const uint16_t* i6 = input[6];
+ assert(i6 != NULL);
+ if XNN_UNPREDICTABLE(i6 != zero) {
+ i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
+ }
+ const uint16_t* i7 = input[7];
+ assert(i7 != NULL);
+ if XNN_UNPREDICTABLE(i7 != zero) {
+ i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
+ }
+ const uint16_t* i8 = input[8];
+ assert(i8 != NULL);
+ if XNN_UNPREDICTABLE(i8 != zero) {
+ i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
+ }
+ input = (const void**) ((uintptr_t) input + input_stride);
+
+ size_t c = channels;
+ const uint16_t* w = weights;
+ for (; c >= 16; c -= 16) {
+ __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+ __m256 vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 8)));
+
+
+ const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+ const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 8)));
+ i0 += 16;
+
+ const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 16)));
+ const __m256 vk0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 24)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+ vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x89ABCDEF, vk0x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+ const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 8)));
+ i1 += 16;
+
+ const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 32)));
+ const __m256 vk1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 40)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+ vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x89ABCDEF, vk1x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+ const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i2 + 8)));
+ i2 += 16;
+
+ const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 48)));
+ const __m256 vk2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 56)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+ vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x89ABCDEF, vk2x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+ const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i3 + 8)));
+ i3 += 16;
+
+ const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 64)));
+ const __m256 vk3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 72)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+ vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x89ABCDEF, vk3x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+ const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i4 + 8)));
+ i4 += 16;
+
+ const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 80)));
+ const __m256 vk4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 88)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+ vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x89ABCDEF, vk4x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+ const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i5 + 8)));
+ i5 += 16;
+
+ const __m256 vk5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 96)));
+ const __m256 vk5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 104)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+ vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x89ABCDEF, vk5x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+ const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i6 + 8)));
+ i6 += 16;
+
+ const __m256 vk6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 112)));
+ const __m256 vk6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 120)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+ vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x89ABCDEF, vk6x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
+ const __m256 vi7x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i7 + 8)));
+ i7 += 16;
+
+ const __m256 vk7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 128)));
+ const __m256 vk7x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 136)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+ vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x89ABCDEF, vk7x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
+ const __m256 vi8x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i8 + 8)));
+ i8 += 16;
+
+ const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 144)));
+ const __m256 vk8x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 152)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+ vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x89ABCDEF, vk8x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+ w += 160;
+
+
+ __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
+ __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin);
+ vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
+ vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vmax);
+
+ _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
+ _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc89ABCDEF, _MM_FROUND_NO_EXC));
+ o += 16;
+ }
+ for (; c >= 8; c -= 8) {
+ __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+
+ const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+ i0 += 8;
+
+ const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 16)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+ i1 += 8;
+
+ const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 32)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+ i2 += 8;
+
+ const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 48)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+ i3 += 8;
+
+ const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 64)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+ i4 += 8;
+
+ const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 80)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+ i5 += 8;
+
+ const __m256 vk5x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 96)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+ i6 += 8;
+
+ const __m256 vk6x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 112)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
+ i7 += 8;
+
+ const __m256 vk7x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 128)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
+ i8 += 8;
+
+ const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 144)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ w += 8;
+
+
+ __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
+ vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
+
+ _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
+ o += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ assert(c >= 1);
+ assert(c <= 7);
+
+ __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+
+ const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+
+ const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 16)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+
+ const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 32)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+
+ const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 48)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+
+ const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 64)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+
+ const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 80)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+
+ const __m256 vk5x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 96)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+
+ const __m256 vk6x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 112)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
+
+ const __m256 vk7x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 128)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
+
+ const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 144)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+
+ __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
+ vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
+
+ __m128i vh01234567 = _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC);
+ if (c & 4) {
+ _mm_storel_epi64((__m128i*) o, vh01234567);
+ vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
+ o += 4;
+ }
+ if (c & 2) {
+ *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vh01234567);
+ vh01234567 = _mm_srli_epi64(vh01234567, 32);
+ o += 2;
+ }
+ if (c & 1) {
+ *((uint16_t*) o) = (uint16_t) _mm_extract_epi16(vh01234567, 0);
+ o += 1;
+ }
+ }
+
+ o = (uint16_t*) ((uintptr_t) o + output_increment);
+ } while (--output_width != 0);
+}
+
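+// 25-tap (5x5) depthwise convolution, 8 channels per iteration. The acc2 variant alternates taps
+// between two accumulators (p0/p1) to shorten the fp16 round-trip dependency chain, then sums them
+// before clamping.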
+void xnn_f16_dwconv_minmax_ukernel_up8x25__fma3_acc2(
+ size_t channels,
+ size_t output_width,
+ const void** input,
+ const void* weights,
+ void* output,
+ size_t input_stride,
+ size_t output_increment,
+ size_t input_offset,
+ const void* zero,
+ const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(channels != 0);
+ assert(output_width != 0);
+
+ const __m256 vmax = _mm256_load_ps(params->avx.max);
+ const __m256 vmin = _mm256_load_ps(params->avx.min);
+
+ uint16_t* o = (uint16_t*) output;
+ do {
+ const uint16_t* i0 = input[0];
+ assert(i0 != NULL);
+ if XNN_UNPREDICTABLE(i0 != zero) {
+ i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
+ }
+ const uint16_t* i1 = input[1];
+ assert(i1 != NULL);
+ if XNN_UNPREDICTABLE(i1 != zero) {
+ i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
+ }
+ const uint16_t* i2 = input[2];
+ assert(i2 != NULL);
+ if XNN_UNPREDICTABLE(i2 != zero) {
+ i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
+ }
+ const uint16_t* i3 = input[3];
+ assert(i3 != NULL);
+ if XNN_UNPREDICTABLE(i3 != zero) {
+ i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
+ }
+ const uint16_t* i4 = input[4];
+ assert(i4 != NULL);
+ if XNN_UNPREDICTABLE(i4 != zero) {
+ i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
+ }
+ const uint16_t* i5 = input[5];
+ assert(i5 != NULL);
+ if XNN_UNPREDICTABLE(i5 != zero) {
+ i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
+ }
+ const uint16_t* i6 = input[6];
+ assert(i6 != NULL);
+ if XNN_UNPREDICTABLE(i6 != zero) {
+ i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
+ }
+ const uint16_t* i7 = input[7];
+ assert(i7 != NULL);
+ if XNN_UNPREDICTABLE(i7 != zero) {
+ i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
+ }
+ const uint16_t* i8 = input[8];
+ assert(i8 != NULL);
+ if XNN_UNPREDICTABLE(i8 != zero) {
+ i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
+ }
+ const uint16_t* i9 = input[9];
+ assert(i9 != NULL);
+ if XNN_UNPREDICTABLE(i9 != zero) {
+ i9 = (const uint16_t*) ((uintptr_t) i9 + input_offset);
+ }
+ const uint16_t* i10 = input[10];
+ assert(i10 != NULL);
+ if XNN_UNPREDICTABLE(i10 != zero) {
+ i10 = (const uint16_t*) ((uintptr_t) i10 + input_offset);
+ }
+ const uint16_t* i11 = input[11];
+ assert(i11 != NULL);
+ if XNN_UNPREDICTABLE(i11 != zero) {
+ i11 = (const uint16_t*) ((uintptr_t) i11 + input_offset);
+ }
+ const uint16_t* i12 = input[12];
+ assert(i12 != NULL);
+ if XNN_UNPREDICTABLE(i12 != zero) {
+ i12 = (const uint16_t*) ((uintptr_t) i12 + input_offset);
+ }
+ const uint16_t* i13 = input[13];
+ assert(i13 != NULL);
+ if XNN_UNPREDICTABLE(i13 != zero) {
+ i13 = (const uint16_t*) ((uintptr_t) i13 + input_offset);
+ }
+ const uint16_t* i14 = input[14];
+ assert(i14 != NULL);
+ if XNN_UNPREDICTABLE(i14 != zero) {
+ i14 = (const uint16_t*) ((uintptr_t) i14 + input_offset);
+ }
+ const uint16_t* i15 = input[15];
+ assert(i15 != NULL);
+ if XNN_UNPREDICTABLE(i15 != zero) {
+ i15 = (const uint16_t*) ((uintptr_t) i15 + input_offset);
+ }
+ const uint16_t* i16 = input[16];
+ assert(i16 != NULL);
+ if XNN_UNPREDICTABLE(i16 != zero) {
+ i16 = (const uint16_t*) ((uintptr_t) i16 + input_offset);
+ }
+ const uint16_t* i17 = input[17];
+ assert(i17 != NULL);
+ if XNN_UNPREDICTABLE(i17 != zero) {
+ i17 = (const uint16_t*) ((uintptr_t) i17 + input_offset);
+ }
+ const uint16_t* i18 = input[18];
+ assert(i18 != NULL);
+ if XNN_UNPREDICTABLE(i18 != zero) {
+ i18 = (const uint16_t*) ((uintptr_t) i18 + input_offset);
+ }
+ const uint16_t* i19 = input[19];
+ assert(i19 != NULL);
+ if XNN_UNPREDICTABLE(i19 != zero) {
+ i19 = (const uint16_t*) ((uintptr_t) i19 + input_offset);
+ }
+ const uint16_t* i20 = input[20];
+ assert(i20 != NULL);
+ if XNN_UNPREDICTABLE(i20 != zero) {
+ i20 = (const uint16_t*) ((uintptr_t) i20 + input_offset);
+ }
+ const uint16_t* i21 = input[21];
+ assert(i21 != NULL);
+ if XNN_UNPREDICTABLE(i21 != zero) {
+ i21 = (const uint16_t*) ((uintptr_t) i21 + input_offset);
+ }
+ const uint16_t* i22 = input[22];
+ assert(i22 != NULL);
+ if XNN_UNPREDICTABLE(i22 != zero) {
+ i22 = (const uint16_t*) ((uintptr_t) i22 + input_offset);
+ }
+ const uint16_t* i23 = input[23];
+ assert(i23 != NULL);
+ if XNN_UNPREDICTABLE(i23 != zero) {
+ i23 = (const uint16_t*) ((uintptr_t) i23 + input_offset);
+ }
+ const uint16_t* i24 = input[24];
+ assert(i24 != NULL);
+ if XNN_UNPREDICTABLE(i24 != zero) {
+ i24 = (const uint16_t*) ((uintptr_t) i24 + input_offset);
+ }
+ input = (const void**) ((uintptr_t) input + input_stride);
+
+ size_t c = channels;
+ const uint16_t* w = weights;
+ for (; c >= 8; c -= 8) {
+ __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+
+
+ const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+ i0 += 8;
+
+ const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 8)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+ i1 += 8;
+
+ const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 16)));
+ __m256 vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vi1x01234567, vk1x01234567), _MM_FROUND_NO_EXC));
+
+ const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+ i2 += 8;
+
+ const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 24)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+ i3 += 8;
+
+ const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 32)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+ i4 += 8;
+
+ const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 40)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+ i5 += 8;
+
+ const __m256 vk5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 48)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+ i6 += 8;
+
+ const __m256 vk6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 56)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
+ i7 += 8;
+
+ const __m256 vk7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 64)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
+ i8 += 8;
+
+ const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 72)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi9x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i9));
+ i9 += 8;
+
+ const __m256 vk9x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 80)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi9x01234567, vk9x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi10x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i10));
+ i10 += 8;
+
+ const __m256 vk10x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 88)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi10x01234567, vk10x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi11x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i11));
+ i11 += 8;
+
+ const __m256 vk11x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 96)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi11x01234567, vk11x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi12x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i12));
+ i12 += 8;
+
+ const __m256 vk12x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 104)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi12x01234567, vk12x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi13x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i13));
+ i13 += 8;
+
+ const __m256 vk13x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 112)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi13x01234567, vk13x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi14x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i14));
+ i14 += 8;
+
+ const __m256 vk14x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 120)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi15x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i15));
+ i15 += 8;
+
+ const __m256 vk15x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 128)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi15x01234567, vk15x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi16x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i16));
+ i16 += 8;
+
+ const __m256 vk16x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 136)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi16x01234567, vk16x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi17x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i17));
+ i17 += 8;
+
+ const __m256 vk17x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 144)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi17x01234567, vk17x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi18x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i18));
+ i18 += 8;
+
+ const __m256 vk18x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 152)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi18x01234567, vk18x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi19x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i19));
+ i19 += 8;
+
+ const __m256 vk19x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 160)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi19x01234567, vk19x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi20x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i20));
+ i20 += 8;
+
+ const __m256 vk20x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 168)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi20x01234567, vk20x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi21x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i21));
+ i21 += 8;
+
+ const __m256 vk21x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 176)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi21x01234567, vk21x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi22x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i22));
+ i22 += 8;
+
+ const __m256 vk22x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 184)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi22x01234567, vk22x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi23x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i23));
+ i23 += 8;
+
+ const __m256 vk23x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 192)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi23x01234567, vk23x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi24x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i24));
+ i24 += 8;
+
+ const __m256 vk24x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 200)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi24x01234567, vk24x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ w += 208;
+
+      // Sum both accumulators into vacc01234567p0.
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vacc01234567p0, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
+ vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
+
+ _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
+ o += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ assert(c >= 1);
+ assert(c <= 7);
+
+ __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+
+ const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+
+ const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 8)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+
+ const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 16)));
+ __m256 vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vi1x01234567, vk1x01234567), _MM_FROUND_NO_EXC));
+
+ const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+
+ const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 24)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+
+ const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 32)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+
+ const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 40)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+
+ const __m256 vk5x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 48)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+
+ const __m256 vk6x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 56)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
+
+ const __m256 vk7x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 64)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
+
+ const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 72)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi9x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i9));
+
+ const __m256 vk9x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 80)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi9x01234567, vk9x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi10x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i10));
+
+ const __m256 vk10x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 88)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi10x01234567, vk10x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi11x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i11));
+
+ const __m256 vk11x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 96)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi11x01234567, vk11x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi12x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i12));
+
+ const __m256 vk12x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 104)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi12x01234567, vk12x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi13x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i13));
+
+ const __m256 vk13x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 112)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi13x01234567, vk13x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi14x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i14));
+
+ const __m256 vk14x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 120)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi15x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i15));
+
+ const __m256 vk15x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 128)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi15x01234567, vk15x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi16x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i16));
+
+ const __m256 vk16x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 136)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi16x01234567, vk16x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi17x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i17));
+
+ const __m256 vk17x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 144)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi17x01234567, vk17x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi18x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i18));
+
+ const __m256 vk18x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 152)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi18x01234567, vk18x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi19x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i19));
+
+ const __m256 vk19x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 160)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi19x01234567, vk19x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi20x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i20));
+
+ const __m256 vk20x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 168)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi20x01234567, vk20x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi21x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i21));
+
+ const __m256 vk21x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 176)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi21x01234567, vk21x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi22x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i22));
+
+ const __m256 vk22x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 184)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi22x01234567, vk22x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+ const __m256 vi23x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i23));
+
+ const __m256 vk23x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 192)));
+ vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi23x01234567, vk23x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ const __m256 vi24x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i24));
+
+ const __m256 vk24x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 200)));
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi24x01234567, vk24x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      // Sum both accumulators into vacc01234567p0.
+ vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vacc01234567p0, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+ __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
+ vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
+
+ __m128i vh01234567 = _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC);
+ if (c & 4) {
+ _mm_storel_epi64((__m128i*) o, vh01234567);
+ vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
+ o += 4;
+ }
+ if (c & 2) {
+ *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vh01234567);
+ vh01234567 = _mm_srli_epi64(vh01234567, 32);
+ o += 2;
+ }
+ if (c & 1) {
+ *((uint16_t*) o) = (uint16_t) _mm_extract_epi16(vh01234567, 0);
+ o += 1;
+ }
+ }
+
+ o = (uint16_t*) ((uintptr_t) o + output_increment);
+ } while (--output_width != 0);
+}
+
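+// Fused per-channel multiply-add: y = x * scale + bias with per-channel scale and bias, processed
+// two rows at a time and 8 channels per iteration, then clamped to [min, max].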
+void xnn_f16_vmulcaddc_minmax_ukernel_c8__fma3_2x(
+ size_t rows,
+ size_t channels,
+ const void*restrict input,
+ size_t input_stride,
+ const void*restrict weights,
+ void*restrict output,
+ size_t output_stride,
+ const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(channels != 0);
+ assert(channels % sizeof(uint16_t) == 0);
+
+ const uint16_t* i0 = (const uint16_t*) input;
+ uint16_t* o0 = (uint16_t*) output;
+ const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
+ uint16_t* o1 = (uint16_t*) ((uintptr_t) o0 + output_stride);
+
+ const size_t input_increment = input_stride * 2 - channels;
+ const size_t output_increment = output_stride * 2 - channels;
+
+ const __m256 vmin = _mm256_load_ps(params->avx.min);
+ const __m256 vmax = _mm256_load_ps(params->avx.max);
+ do {
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = i0;
+ o1 = o0;
+ }
+
+ const uint16_t* w = (const uint16_t*) weights;
+ size_t c = channels;
+ for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) {
+ const __m256 vscale = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
+
+ __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+ i0 += 8;
+ __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+ i1 += 8;
+
+ const __m256 vbias = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 8)));
+ w += 16;
+
+ vacc0 = _mm256_fmadd_ps(vacc0, vscale, vbias);
+ vacc1 = _mm256_fmadd_ps(vacc1, vscale, vbias);
+
+ vacc0 = _mm256_max_ps(vacc0, vmin);
+ vacc1 = _mm256_max_ps(vacc1, vmin);
+
+ vacc0 = _mm256_min_ps(vacc0, vmax);
+ vacc1 = _mm256_min_ps(vacc1, vmax);
+
+ _mm_storeu_si128((__m128i*) o0, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
+ o0 += 8;
+ _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
+ o1 += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ const __m256 vscale = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
+
+ __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+ i0 = (const uint16_t*) ((uintptr_t) i0 + c);
+ __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+ i1 = (const uint16_t*) ((uintptr_t) i1 + c);
+
+ const __m256 vbias = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 8)));
+
+ vacc0 = _mm256_fmadd_ps(vacc0, vscale, vbias);
+ vacc1 = _mm256_fmadd_ps(vacc1, vscale, vbias);
+
+ vacc0 = _mm256_max_ps(vacc0, vmin);
+ vacc1 = _mm256_max_ps(vacc1, vmin);
+
+ vacc0 = _mm256_min_ps(vacc0, vmax);
+ vacc1 = _mm256_min_ps(vacc1, vmax);
+
+ __m128i vh0 = _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC);
+ __m128i vh1 = _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC);
+
+ if (c & (4 * sizeof(uint16_t))) {
+ _mm_storel_epi64((__m128i*) o0, vh0);
+ _mm_storel_epi64((__m128i*) o1, vh1);
+
+ vh0 = _mm_unpackhi_epi64(vh0, vh0);
+ vh1 = _mm_unpackhi_epi64(vh1, vh1);
+
+ o0 += 4;
+ o1 += 4;
+ }
+ if (c & (2 * sizeof(uint16_t))) {
+ *((uint32_t*) o0) = (uint32_t) _mm_cvtsi128_si32(vh0);
+ *((uint32_t*) o1) = (uint32_t) _mm_cvtsi128_si32(vh1);
+
+ vh0 = _mm_srli_epi64(vh0, 32);
+ vh1 = _mm_srli_epi64(vh1, 32);
+
+ o0 += 2;
+ o1 += 2;
+ }
+ if (c & (1 * sizeof(uint16_t))) {
+ *o0 = (uint16_t) _mm_extract_epi16(vh0, 0);
+ *o1 = (uint16_t) _mm_extract_epi16(vh1, 0);
+
+ o0 += 1;
+ o1 += 1;
+ }
+ }
+ i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
+ o0 = (uint16_t*) ((uintptr_t) o0 + output_increment);
+ i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
+ o1 = (uint16_t*) ((uintptr_t) o1 + output_increment);
+ rows = doz(rows, 2);
+ } while (rows != 0);
+}
+
void xnn_f32_dwconv_minmax_ukernel_up16x3__fma3(
size_t channels,
size_t output_width,
diff --git a/src/init.c b/src/init.c
index 8ae5145..4c73574 100644
--- a/src/init.c
+++ b/src/init.c
@@ -3315,6 +3315,70 @@
};
#endif // XNN_NO_X8_OPERATORS
+ /**************************** F16 x86 micro-kernels ****************************/
+ #ifndef XNN_NO_F16_OPERATORS
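+  // A single AVX2 check gates all of the F16 kernels below; in practice AVX2-capable x86 CPUs also
+  // provide the F16C and FMA3 extensions that the conversion and depthwise kernels use.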
+ if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx2()) {
+ init_flags |= XNN_INIT_FLAG_F16;
+
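+    // The 4x16 AVX2 GEMM/IGEMM cover the general case; the 1x16 variants handle single-row tiles.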
+ xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast);
+ xnn_params.f16.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast);
+ xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast);
+ xnn_params.f16.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast);
+ xnn_params.f16.gemm.init.f16 = xnn_init_f16_scaleminmax_avx_params;
+ xnn_params.f16.gemm.mr = 4;
+ xnn_params.f16.gemm.nr = 16;
+
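+    // Unipass depthwise kernels, indexed by primary tile: 4, 9, and 25 taps. The 25-tap kernel drops
+    // to an 8-channel tile and uses the two-accumulator variant.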
+ xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__fma3;
+ xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_avx_params;
+ xnn_params.f16.dwconv[0].channel_tile = 16;
+ xnn_params.f16.dwconv[0].primary_tile = 4;
+
+ xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__fma3;
+ xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_avx_params;
+ xnn_params.f16.dwconv[1].channel_tile = 16;
+ xnn_params.f16.dwconv[1].primary_tile = 9;
+
+ xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__fma3_acc2;
+ xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_avx_params;
+ xnn_params.f16.dwconv[2].channel_tile = 8;
+ xnn_params.f16.dwconv[2].primary_tile = 25;
+
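+    // Global average pooling: a 7-row unipass kernel, plus a multipass kernel (7 rows per pass) for
+    // inputs taller than the row tile.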
+ xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
+ .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8,
+ .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8,
+ .init.f16 = xnn_init_f16_scaleminmax_avx_params,
+ .update.f16 = xnn_update_f16_scaleminmax_avx_params,
+ .row_tile = 7,
+ .channel_tile = 8,
+ };
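+    // Addition and multiplication are commutative, so the broadcast (opc) and reverse-broadcast
+    // (ropc) entries reuse the same vaddc/vmulc kernels.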
+ xnn_params.f16.vadd = (struct vbinary_parameters) {
+ .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__f16c_x16,
+ .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
+ .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
+ .init.f16_minmax = xnn_init_f16_minmax_avx_params,
+ .element_tile = 16,
+ };
+ xnn_params.f16.vmul = (struct vbinary_parameters) {
+ .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__f16c_x16,
+ .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
+ .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
+ .init.f16_minmax = xnn_init_f16_minmax_avx_params,
+ .element_tile = 16,
+ };
+ xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
+ .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__fma3_2x,
+ .init.f16 = xnn_init_f16_minmax_avx_params,
+ .channel_tile = 8,
+ .row_tile = 2,
+ };
+ xnn_params.f16.hswish = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f16_vhswish_ukernel__f16c_x16,
+ .init.f16_hswish = xnn_init_f16_hswish_avx_params,
+ .element_tile = 16,
+ };
+ }
+ #endif // XNN_NO_F16_OPERATORS
+
/**************************** F32 x86 micro-kernels ****************************/
#ifndef XNN_NO_F32_OPERATORS
init_flags |= XNN_INIT_FLAG_F32;