Initialize F16 microkernel pointers on x86

PiperOrigin-RevId: 422911260
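
Note: the F16C, FMA3 and AVX2 sources added to the production lists below all
emulate half-precision arithmetic with single-precision intermediates: inputs
are widened with VCVTPH2PS, computed in FP32, and rounded back to FP16 after
every step (_mm256_cvtps_ph with _MM_FROUND_NO_EXC). A minimal sketch of that
pattern follows; it is not part of the patch, and the helper name f16_add8 is
hypothetical. Build with -mf16c.

    #include <immintrin.h>
    #include <stdint.h>

    // Adds 8 half-precision values from `a` and `b` into `y`, using the same
    // widen / compute-in-FP32 / round-back-to-FP16 discipline as the kernels
    // in this change.
    static void f16_add8(const uint16_t* a, const uint16_t* b, uint16_t* y) {
      const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
      const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
      // Round the FP32 sum back to FP16 immediately so the result matches
      // native half-precision addition.
      const __m128i vy = _mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC);
      _mm_storeu_si128((__m128i*) y, vy);
    }
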
diff --git a/BUILD.bazel b/BUILD.bazel
index 2812c32..1347c6f 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -5979,6 +5979,13 @@
 
 PROD_F16C_MICROKERNEL_SRCS = [
     "src/f16-f32-vcvt/gen/vcvt-f16c-x16.c",
+    "src/f16-gavgpool/gen/7p7x-minmax-f16c-c8.c",
+    "src/f16-gavgpool/gen/7x-minmax-f16c-c8.c",
+    "src/f16-vbinary/gen/vadd-minmax-f16c-x16.c",
+    "src/f16-vbinary/gen/vaddc-minmax-f16c-x16.c",
+    "src/f16-vbinary/gen/vmul-minmax-f16c-x16.c",
+    "src/f16-vbinary/gen/vmulc-minmax-f16c-x16.c",
+    "src/f16-vhswish/gen/vhswish-f16c-x16.c",
     "src/f32-f16-vcvt/gen/vcvt-f16c-x16.c",
 ]
 
@@ -6189,6 +6196,10 @@
 ]
 
 PROD_FMA3_MICROKERNEL_SRCS = [
+    "src/f16-dwconv/gen/up8x25-minmax-fma3-acc2.c",
+    "src/f16-dwconv/gen/up16x4-minmax-fma3.c",
+    "src/f16-dwconv/gen/up16x9-minmax-fma3.c",
+    "src/f16-vmulcaddc/gen/c8-minmax-fma3-2x.c",
     "src/f32-dwconv/gen/up8x25-minmax-fma3.c",
     "src/f32-dwconv/gen/up16x3-minmax-fma3.c",
     "src/f32-dwconv/gen/up16x4-minmax-fma3.c",
@@ -6299,6 +6310,10 @@
 ]
 
 PROD_AVX2_MICROKERNEL_SRCS = [
+    "src/f16-gemm/gen/1x16-minmax-avx2-broadcast.c",
+    "src/f16-gemm/gen/4x16-minmax-avx2-broadcast.c",
+    "src/f16-igemm/gen/1x16-minmax-avx2-broadcast.c",
+    "src/f16-igemm/gen/4x16-minmax-avx2-broadcast.c",
     "src/f32-qs8-vcvt/gen/vcvt-avx2-x64.c",
     "src/f32-qu8-vcvt/gen/vcvt-avx2-x64.c",
     "src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x56.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6696e86..93605b7 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4727,6 +4727,13 @@
 
 SET(PROD_F16C_MICROKERNEL_SRCS
   src/f16-f32-vcvt/gen/vcvt-f16c-x16.c
+  src/f16-gavgpool/gen/7p7x-minmax-f16c-c8.c
+  src/f16-gavgpool/gen/7x-minmax-f16c-c8.c
+  src/f16-vbinary/gen/vadd-minmax-f16c-x16.c
+  src/f16-vbinary/gen/vaddc-minmax-f16c-x16.c
+  src/f16-vbinary/gen/vmul-minmax-f16c-x16.c
+  src/f16-vbinary/gen/vmulc-minmax-f16c-x16.c
+  src/f16-vhswish/gen/vhswish-f16c-x16.c
   src/f32-f16-vcvt/gen/vcvt-f16c-x16.c)
 
 SET(ALL_F16C_MICROKERNEL_SRCS
@@ -4933,6 +4940,10 @@
   src/qu8-vaddc/gen/minmax-xop-mul32-ld32-x16.c)
 
 SET(PROD_FMA3_MICROKERNEL_SRCS
+  src/f16-dwconv/gen/up8x25-minmax-fma3-acc2.c
+  src/f16-dwconv/gen/up16x4-minmax-fma3.c
+  src/f16-dwconv/gen/up16x9-minmax-fma3.c
+  src/f16-vmulcaddc/gen/c8-minmax-fma3-2x.c
   src/f32-dwconv/gen/up8x25-minmax-fma3.c
   src/f32-dwconv/gen/up16x3-minmax-fma3.c
   src/f32-dwconv/gen/up16x4-minmax-fma3.c
@@ -5041,6 +5052,10 @@
   src/math/sqrt-fma3-nr2fma.c)
 
 SET(PROD_AVX2_MICROKERNEL_SRCS
+  src/f16-gemm/gen/1x16-minmax-avx2-broadcast.c
+  src/f16-gemm/gen/4x16-minmax-avx2-broadcast.c
+  src/f16-igemm/gen/1x16-minmax-avx2-broadcast.c
+  src/f16-igemm/gen/4x16-minmax-avx2-broadcast.c
   src/f32-qs8-vcvt/gen/vcvt-avx2-x64.c
   src/f32-qu8-vcvt/gen/vcvt-avx2-x64.c
   src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x56.c
diff --git a/src/amalgam/avx2.c b/src/amalgam/avx2.c
index b534f88..ab30cc7 100644
--- a/src/amalgam/avx2.c
+++ b/src/amalgam/avx2.c
@@ -19,6 +19,622 @@
 #include <xnnpack/vunary.h>
 
 
+void xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const void*restrict a,
+    size_t a_stride,
+    const void*restrict w,
+    void*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(uint16_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const uint16_t* a0 = a;
+  uint16_t* c0 = c;
+
+  do {
+    __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+    __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
+    w = (const uint16_t*) w + 16;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
+      a0 += 1;
+
+      const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+      const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
+      w = (const uint16_t*) w + 16;
+
+      vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
+      vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
+
+      k -= sizeof(uint16_t);
+    } while (k != 0);
+
+    const __m256 vscale = _mm256_load_ps(params->avx.scale);
+    vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x01234567, vscale), _MM_FROUND_NO_EXC));
+    vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+
+    const __m256 vmin = _mm256_load_ps(params->avx.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
+
+    const __m256 vmax = _mm256_load_ps(params->avx.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
+      _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
+      c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const uint16_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
+      if (nc & 8) {
+        _mm_storeu_si128((__m128i*) c0, vh0x01234567);
+
+        vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
+
+        c0 += 8;
+      }
+      if (nc & 4) {
+        _mm_storel_epi64((__m128i*) c0, vh0x01234567);
+
+        vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
+
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storeu_si32(c0, vh0x01234567);
+
+        vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
+
+        c0 += 2;
+      }
+      if (nc & 1) {
+        *c0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
+
+void xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const void*restrict a,
+    size_t a_stride,
+    const void*restrict w,
+    void*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(uint16_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const uint16_t* a0 = a;
+  uint16_t* c0 = c;
+  const uint16_t* a1 = (const uint16_t*) ((uintptr_t) a0 + a_stride);
+  uint16_t* c1 = (uint16_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const uint16_t* a2 = (const uint16_t*) ((uintptr_t) a1 + a_stride);
+  uint16_t* c2 = (uint16_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const uint16_t* a3 = (const uint16_t*) ((uintptr_t) a2 + a_stride);
+  uint16_t* c3 = (uint16_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+    __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
+    w = (const uint16_t*) w + 16;
+
+    size_t k = kc;
+    do {
+      const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
+      a0 += 1;
+      const __m256 va1 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a1));
+      a1 += 1;
+      const __m256 va2 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a2));
+      a2 += 1;
+      const __m256 va3 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a3));
+      a3 += 1;
+
+      const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+      const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
+      w = (const uint16_t*) w + 16;
+
+      vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
+      vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb01234567, vacc1x01234567), _MM_FROUND_NO_EXC));
+      vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb01234567, vacc2x01234567), _MM_FROUND_NO_EXC));
+      vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb01234567, vacc3x01234567), _MM_FROUND_NO_EXC));
+      vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
+      vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb89ABCDEF, vacc1x89ABCDEF), _MM_FROUND_NO_EXC));
+      vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF), _MM_FROUND_NO_EXC));
+      vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF), _MM_FROUND_NO_EXC));
+
+      k -= sizeof(uint16_t);
+    } while (k != 0);
+
+    const __m256 vscale = _mm256_load_ps(params->avx.scale);
+    vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x01234567, vscale), _MM_FROUND_NO_EXC));
+    vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc1x01234567, vscale), _MM_FROUND_NO_EXC));
+    vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc2x01234567, vscale), _MM_FROUND_NO_EXC));
+    vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc3x01234567, vscale), _MM_FROUND_NO_EXC));
+    vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+    vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc1x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+    vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc2x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+    vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc3x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+
+    const __m256 vmin = _mm256_load_ps(params->avx.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
+    vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
+    vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
+    vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
+
+    const __m256 vmax = _mm256_load_ps(params->avx.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
+    vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
+    vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
+    vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm_storeu_si128((__m128i*) c3, _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC));
+      _mm_storeu_si128((__m128i*) (c3 + 8), _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC));
+      c3 = (uint16_t*) ((uintptr_t) c3 + cn_stride);
+      _mm_storeu_si128((__m128i*) c2, _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC));
+      _mm_storeu_si128((__m128i*) (c2 + 8), _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC));
+      c2 = (uint16_t*) ((uintptr_t) c2 + cn_stride);
+      _mm_storeu_si128((__m128i*) c1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC));
+      _mm_storeu_si128((__m128i*) (c1 + 8), _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC));
+      c1 = (uint16_t*) ((uintptr_t) c1 + cn_stride);
+      _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
+      _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
+      c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
+
+      a3 = (const uint16_t*) ((uintptr_t) a3 - kc);
+      a2 = (const uint16_t*) ((uintptr_t) a2 - kc);
+      a1 = (const uint16_t*) ((uintptr_t) a1 - kc);
+      a0 = (const uint16_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 16;
+    } else {
+      __m128i vh3x01234567 = _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC);
+      __m128i vh2x01234567 = _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC);
+      __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC);
+      __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
+      if (nc & 8) {
+        _mm_storeu_si128((__m128i*) c3, vh3x01234567);
+        _mm_storeu_si128((__m128i*) c2, vh2x01234567);
+        _mm_storeu_si128((__m128i*) c1, vh1x01234567);
+        _mm_storeu_si128((__m128i*) c0, vh0x01234567);
+
+        vh3x01234567 = _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC);
+        vh2x01234567 = _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC);
+        vh1x01234567 = _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC);
+        vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
+
+        c3 += 8;
+        c2 += 8;
+        c1 += 8;
+        c0 += 8;
+      }
+      if (nc & 4) {
+        _mm_storel_epi64((__m128i*) c3, vh3x01234567);
+        _mm_storel_epi64((__m128i*) c2, vh2x01234567);
+        _mm_storel_epi64((__m128i*) c1, vh1x01234567);
+        _mm_storel_epi64((__m128i*) c0, vh0x01234567);
+
+        vh3x01234567 = _mm_unpackhi_epi64(vh3x01234567, vh3x01234567);
+        vh2x01234567 = _mm_unpackhi_epi64(vh2x01234567, vh2x01234567);
+        vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567);
+        vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
+
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storeu_si32(c3, vh3x01234567);
+        _mm_storeu_si32(c2, vh2x01234567);
+        _mm_storeu_si32(c1, vh1x01234567);
+        _mm_storeu_si32(c0, vh0x01234567);
+
+        vh3x01234567 = _mm_srli_epi64(vh3x01234567, 32);
+        vh2x01234567 = _mm_srli_epi64(vh2x01234567, 32);
+        vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32);
+        vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
+
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        *c3 = (uint16_t) _mm_extract_epi16(vh3x01234567, 0);
+        *c2 = (uint16_t) _mm_extract_epi16(vh2x01234567, 0);
+        *c1 = (uint16_t) _mm_extract_epi16(vh1x01234567, 0);
+        *c0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
+
+void xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const void**restrict a,
+    const void*restrict w,
+    void*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const void* zero,
+    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(uint16_t) == 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(uint16_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  uint16_t* c0 = c;
+
+  do {
+    __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+    __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
+    w = (const uint16_t*) w + 16;
+
+    size_t p = ks;
+    do {
+      const uint16_t* restrict a0 = (const uint16_t*) a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const uint16_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+      do {
+        const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+        const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
+        w = (const uint16_t*) w + 16;
+
+        const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
+        a0 += 1;
+
+        vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
+        vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
+
+        k -= sizeof(uint16_t);
+      } while (k != 0);
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const __m256 vscale = _mm256_load_ps(params->avx.scale);
+    vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x01234567, vscale), _MM_FROUND_NO_EXC));
+    vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+
+    const __m256 vmin = _mm256_load_ps(params->avx.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
+
+    const __m256 vmax = _mm256_load_ps(params->avx.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
+      _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
+      c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const void**restrict) ((uintptr_t) a - ks);
+      nc -= 16;
+    } else {
+      __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
+      if (nc & 8) {
+        _mm_storeu_si128((__m128i*) c0, vh0x01234567);
+
+        vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
+
+        c0 += 8;
+      }
+      if (nc & 4) {
+        _mm_storel_epi64((__m128i*) c0, vh0x01234567);
+
+        vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
+
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storeu_si32(c0, vh0x01234567);
+
+        vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
+
+        c0 += 2;
+      }
+      if (nc & 1) {
+        *c0 = _mm_extract_epi16(vh0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
+
+void xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const void**restrict a,
+    const void*restrict w,
+    void*restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const void* zero,
+    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(uint16_t) == 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(uint16_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  uint16_t* c0 = c;
+  uint16_t* c1 = (uint16_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  uint16_t* c2 = (uint16_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  uint16_t* c3 = (uint16_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    __m256 vacc0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+    __m256 vacc0x89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
+    __m256 vacc1x01234567 = vacc0x01234567;
+    __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
+    __m256 vacc2x01234567 = vacc0x01234567;
+    __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
+    __m256 vacc3x01234567 = vacc0x01234567;
+    __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
+    w = (const uint16_t*) w + 16;
+
+    size_t p = ks;
+    do {
+      const uint16_t* restrict a0 = (const uint16_t*) a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const uint16_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const uint16_t* restrict a1 = (const uint16_t*) a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const uint16_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const uint16_t* restrict a2 = (const uint16_t*) a[2];
+      assert(a2 != NULL);
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const uint16_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const uint16_t* restrict a3 = (const uint16_t*) a[3];
+      assert(a3 != NULL);
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const uint16_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      do {
+        const __m256 vb01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+        const __m256 vb89ABCDEF = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) ((const uint16_t*) w + 8)));
+        w = (const uint16_t*) w + 16;
+
+        const __m256 va0 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a0));
+        a0 += 1;
+        const __m256 va1 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a1));
+        a1 += 1;
+        const __m256 va2 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a2));
+        a2 += 1;
+        const __m256 va3 = _mm256_cvtph_ps(_mm_set1_epi16((short) *a3));
+        a3 += 1;
+
+        vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb01234567, vacc0x01234567), _MM_FROUND_NO_EXC));
+        vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va0, vb89ABCDEF, vacc0x89ABCDEF), _MM_FROUND_NO_EXC));
+        vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb01234567, vacc1x01234567), _MM_FROUND_NO_EXC));
+        vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va1, vb89ABCDEF, vacc1x89ABCDEF), _MM_FROUND_NO_EXC));
+        vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb01234567, vacc2x01234567), _MM_FROUND_NO_EXC));
+        vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF), _MM_FROUND_NO_EXC));
+        vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb01234567, vacc3x01234567), _MM_FROUND_NO_EXC));
+        vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF), _MM_FROUND_NO_EXC));
+
+        k -= sizeof(uint16_t);
+      } while (k != 0);
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    const __m256 vscale = _mm256_load_ps(params->avx.scale);
+    vacc0x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x01234567, vscale), _MM_FROUND_NO_EXC));
+    vacc1x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc1x01234567, vscale), _MM_FROUND_NO_EXC));
+    vacc2x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc2x01234567, vscale), _MM_FROUND_NO_EXC));
+    vacc3x01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc3x01234567, vscale), _MM_FROUND_NO_EXC));
+    vacc0x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc0x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+    vacc1x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc1x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+    vacc2x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc2x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+    vacc3x89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vacc3x89ABCDEF, vscale), _MM_FROUND_NO_EXC));
+
+    const __m256 vmin = _mm256_load_ps(params->avx.min);
+    vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
+    vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
+    vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
+    vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
+    vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
+    vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
+    vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
+    vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
+
+    const __m256 vmax = _mm256_load_ps(params->avx.max);
+    vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
+    vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
+    vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
+    vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
+    vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
+    vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
+    vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
+    vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
+
+    if XNN_LIKELY(nc >= 16) {
+      _mm_storeu_si128((__m128i*) c3, _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC));
+      _mm_storeu_si128((__m128i*) (c3 + 8), _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC));
+      c3 = (uint16_t*) ((uintptr_t) c3 + cn_stride);
+      _mm_storeu_si128((__m128i*) c2, _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC));
+      _mm_storeu_si128((__m128i*) (c2 + 8), _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC));
+      c2 = (uint16_t*) ((uintptr_t) c2 + cn_stride);
+      _mm_storeu_si128((__m128i*) c1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC));
+      _mm_storeu_si128((__m128i*) (c1 + 8), _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC));
+      c1 = (uint16_t*) ((uintptr_t) c1 + cn_stride);
+      _mm_storeu_si128((__m128i*) c0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
+      _mm_storeu_si128((__m128i*) (c0 + 8), _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC));
+      c0 = (uint16_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const void**restrict) ((uintptr_t) a - ks);
+      nc -= 16;
+    } else {
+      __m128i vh3x01234567 = _mm256_cvtps_ph(vacc3x01234567, _MM_FROUND_NO_EXC);
+      __m128i vh2x01234567 = _mm256_cvtps_ph(vacc2x01234567, _MM_FROUND_NO_EXC);
+      __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC);
+      __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
+      if (nc & 8) {
+        _mm_storeu_si128((__m128i*) c3, vh3x01234567);
+        _mm_storeu_si128((__m128i*) c2, vh2x01234567);
+        _mm_storeu_si128((__m128i*) c1, vh1x01234567);
+        _mm_storeu_si128((__m128i*) c0, vh0x01234567);
+
+        vh3x01234567 = _mm256_cvtps_ph(vacc3x89ABCDEF, _MM_FROUND_NO_EXC);
+        vh2x01234567 = _mm256_cvtps_ph(vacc2x89ABCDEF, _MM_FROUND_NO_EXC);
+        vh1x01234567 = _mm256_cvtps_ph(vacc1x89ABCDEF, _MM_FROUND_NO_EXC);
+        vh0x01234567 = _mm256_cvtps_ph(vacc0x89ABCDEF, _MM_FROUND_NO_EXC);
+
+        c3 += 8;
+        c2 += 8;
+        c1 += 8;
+        c0 += 8;
+      }
+      if (nc & 4) {
+        _mm_storel_epi64((__m128i*) c3, vh3x01234567);
+        _mm_storel_epi64((__m128i*) c2, vh2x01234567);
+        _mm_storel_epi64((__m128i*) c1, vh1x01234567);
+        _mm_storel_epi64((__m128i*) c0, vh0x01234567);
+
+        vh3x01234567 = _mm_unpackhi_epi64(vh3x01234567, vh3x01234567);
+        vh2x01234567 = _mm_unpackhi_epi64(vh2x01234567, vh2x01234567);
+        vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567);
+        vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
+
+        c3 += 4;
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+      }
+      if (nc & 2) {
+        _mm_storeu_si32(c3, vh3x01234567);
+        _mm_storeu_si32(c2, vh2x01234567);
+        _mm_storeu_si32(c1, vh1x01234567);
+        _mm_storeu_si32(c0, vh0x01234567);
+
+        vh3x01234567 = _mm_srli_epi64(vh3x01234567, 32);
+        vh2x01234567 = _mm_srli_epi64(vh2x01234567, 32);
+        vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32);
+        vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
+
+        c3 += 2;
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        *c3 = _mm_extract_epi16(vh3x01234567, 0);
+        *c2 = _mm_extract_epi16(vh2x01234567, 0);
+        *c1 = _mm_extract_epi16(vh1x01234567, 0);
+        *c0 = _mm_extract_epi16(vh0x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
+
 void xnn_f32_qs8_vcvt_ukernel__avx2_x64(
     size_t n,
     const float* x,
diff --git a/src/amalgam/f16c.c b/src/amalgam/f16c.c
index 34df7ed..7cbaef9 100644
--- a/src/amalgam/f16c.c
+++ b/src/amalgam/f16c.c
@@ -8,8 +8,12 @@
 #include <immintrin.h>
 
 #include <xnnpack/common.h>
+#include <xnnpack/gavgpool.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
 #include <xnnpack/vcvt.h>
+#include <xnnpack/vunary.h>
 
 
 void xnn_f16_f32_vcvt_ukernel__f16c_x16(
@@ -62,6 +66,690 @@
   }
 }
 
+void xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8(
+    size_t rows,
+    size_t channels,
+    const void* input,
+    size_t input_stride,
+    const void* zero,
+    void* buffer,
+    void* output,
+    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(rows > 7);
+  assert(channels != 0);
+
+  const uint16_t* i0 = input;
+  const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
+  const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
+  const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
+  const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
+  const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
+  const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t);
+
+  uint16_t* b = buffer;
+  size_t c = channels;
+  for (; c != 0; c = doz(c, 8)) {
+    const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
+    const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
+
+    const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
+    __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
+
+    const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
+    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
+    const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
+    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
+    const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
+    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
+    const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
+    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
+    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
+
+    _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
+  }
+
+  for (rows -= 7; rows > 7; rows -= 7) {
+    i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
+    i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
+    i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
+    i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
+    i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
+    i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
+    i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
+
+    uint16_t* b = buffer;
+    size_t c = channels;
+    for (; c != 0; c = doz(c, 8)) {
+      __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) b);
+
+      const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
+
+      const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
+      const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
+      const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
+      const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
+      const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
+      const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
+
+      _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
+    }
+  }
+
+  i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
+  i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
+  if XNN_UNPREDICTABLE(rows < 2) {
+    i1 = (const uint16_t*) zero;
+  }
+  i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
+  if XNN_UNPREDICTABLE(rows <= 2) {
+    i2 = (const uint16_t*) zero;
+  }
+  i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
+  if XNN_UNPREDICTABLE(rows < 4) {
+    i3 = (const uint16_t*) zero;
+  }
+  i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
+  if XNN_UNPREDICTABLE(rows <= 4) {
+    i4 = (const uint16_t*) zero;
+  }
+  i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
+  if XNN_UNPREDICTABLE(rows < 6) {
+    i5 = (const uint16_t*) zero;
+  }
+  i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
+  if XNN_UNPREDICTABLE(rows <= 6) {
+    i6 = (const uint16_t*) zero;
+  }
+
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  const __m256 vmin = _mm256_load_ps(params->avx.min);
+  const __m256 vmax = _mm256_load_ps(params->avx.max);
+  for (; channels >= 8; channels -= 8) {
+    __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
+
+    const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
+
+    const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
+    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
+    const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
+    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
+    const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
+    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
+    const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
+    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
+    const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
+    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
+    const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
+    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
+    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
+
+    vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
+
+    __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
+
+    vout01234567 = _mm256_min_ps(vout01234567, vmax);
+
+    _mm_storeu_si128((__m128i*) output, _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC));
+    output = (uint16_t*) output + 8;
+  }
+  if XNN_UNLIKELY(channels != 0) {
+    {
+      __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
+
+      const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
+      const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
+      const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
+      const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
+      const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
+      const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
+      const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
+
+      vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
+      __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
+      vout01234567 = _mm256_min_ps(vout01234567, vmax);
+
+      __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC);
+      if (channels & 4) {
+        _mm_storel_epi64((__m128i*) output, vh01234567);
+        output = (uint16_t*) output + 4;
+        vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
+      }
+      if (channels & 2) {
+        *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vh01234567);
+        output = (uint16_t*) output + 2;
+        vh01234567 = _mm_srli_epi64(vh01234567, 32);
+      }
+      if (channels & 1) {
+        *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vh01234567, 0);
+      }
+    }
+  }
+}
+
+void xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8(
+    size_t rows,
+    size_t channels,
+    const void* input,
+    size_t input_stride,
+    const void* zero,
+    void* output,
+    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(rows != 0);
+  assert(rows <= 7);
+  assert(channels != 0);
+
+  const uint16_t* i0 = input;
+  const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
+  if XNN_UNPREDICTABLE(rows < 2) {
+    i1 = (const uint16_t*) zero;
+  }
+  const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
+  if XNN_UNPREDICTABLE(rows <= 2) {
+    i2 = (const uint16_t*) zero;
+  }
+  const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
+  if XNN_UNPREDICTABLE(rows < 4) {
+    i3 = (const uint16_t*) zero;
+  }
+  const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
+  if XNN_UNPREDICTABLE(rows <= 4) {
+    i4 = (const uint16_t*) zero;
+  }
+  const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
+  if XNN_UNPREDICTABLE(rows < 6) {
+    i5 = (const uint16_t*) zero;
+  }
+  const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
+  if XNN_UNPREDICTABLE(rows <= 6) {
+    i6 = (const uint16_t*) zero;
+  }
+
+  const __m256 vscale = _mm256_load_ps(params->avx.scale);
+  const __m256 vmin = _mm256_load_ps(params->avx.min);
+  const __m256 vmax = _mm256_load_ps(params->avx.max);
+  for (; channels >= 8; channels -= 8) {
+    const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+    i0 += 8;
+    const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+    i1 += 8;
+
+    const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+    __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
+    i2 += 8;
+
+    const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+    i3 += 8;
+    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
+    const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+    i4 += 8;
+    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
+    const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+    i5 += 8;
+    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
+    const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+    i6 += 8;
+    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
+    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
+
+    vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
+
+    __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
+
+    vout01234567 = _mm256_min_ps(vout01234567, vmax);
+
+    _mm_storeu_si128((__m128i*) output, _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC));
+    output = (uint16_t*) output + 8;
+  }
+  if XNN_UNLIKELY(channels != 0) {
+    {
+      const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+      const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+
+      const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+      __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
+
+      const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
+      const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
+      const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
+      const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
+      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
+
+      vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
+      __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
+      vout01234567 = _mm256_min_ps(vout01234567, vmax);
+
+      __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC);
+      if (channels & 4) {
+        _mm_storel_epi64((__m128i*) output, vh01234567);
+        output = (uint16_t*) output + 4;
+        vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
+      }
+      if (channels & 2) {
+        *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vh01234567);
+        output = (uint16_t*) output + 2;
+        vh01234567 = _mm_srli_epi64(vh01234567, 32);
+      }
+      if (channels & 1) {
+        *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vh01234567, 0);
+      }
+    }
+  }
+}
+
+void xnn_f16_vadd_minmax_ukernel__f16c_x16(
+    size_t n,
+    const void* restrict a_ptr,
+    const void* restrict b_ptr,
+    void* restrict y_ptr,
+    const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(uint16_t) == 0);
+  assert(a_ptr != NULL);
+  assert(b_ptr != NULL);
+  assert(y_ptr != NULL);
+
+  const uint16_t* a = (const uint16_t*) a_ptr;
+  const uint16_t* b = (const uint16_t*) b_ptr;
+  uint16_t* y = (uint16_t*) y_ptr;
+
+  const __m256 vy_min = _mm256_load_ps(params->avx.min);
+  const __m256 vy_max = _mm256_load_ps(params->avx.max);
+
+  for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
+    const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+    const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
+    const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
+    const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
+    a += 16;
+    b += 16;
+
+    __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
+    __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
+
+
+    vy01234567 = _mm256_max_ps(vy01234567, vy_min);
+    vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
+
+    vy01234567 = _mm256_min_ps(vy01234567, vy_max);
+    vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
+
+    _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
+    _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
+    y += 16;
+  }
+  for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
+    const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+    const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
+    a += 8;
+    b += 8;
+
+    __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
+
+    vy = _mm256_max_ps(vy, vy_min);
+    vy = _mm256_min_ps(vy, vy_max);
+
+    _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+    const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
+
+    __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
+
+    vy = _mm256_max_ps(vy, vy_min);
+    vy = _mm256_min_ps(vy, vy_max);
+
+    __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
+    if (n & (4 * sizeof(uint16_t))) {
+      _mm_storel_epi64((__m128i*) y, vh);
+      vh = _mm_unpackhi_epi64(vh, vh);
+      y += 4;
+    }
+    if (n & (2 * sizeof(uint16_t))) {
+      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vh);
+      vh = _mm_srli_epi64(vh, 32);
+      y += 2;
+    }
+    if (n & (1 * sizeof(uint16_t))) {
+      *y = (uint16_t) _mm_extract_epi16(vh, 0);
+    }
+  }
+}
+
+void xnn_f16_vaddc_minmax_ukernel__f16c_x16(
+    size_t n,
+    const void* restrict a_ptr,
+    const void* restrict b_ptr,
+    void* restrict y_ptr,
+    const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(uint16_t) == 0);
+  assert(a_ptr != NULL);
+  assert(b_ptr != NULL);
+  assert(y_ptr != NULL);
+
+  const uint16_t* a = (const uint16_t*) a_ptr;
+  const uint16_t* b = (const uint16_t*) b_ptr;
+  uint16_t* y = (uint16_t*) y_ptr;
+
+  const __m256 vy_min = _mm256_load_ps(params->avx.min);
+  const __m256 vy_max = _mm256_load_ps(params->avx.max);
+
+  const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
+  for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
+    const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+    const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
+    a += 16;
+
+    __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va01234567, vb), _MM_FROUND_NO_EXC));
+    __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
+
+
+    vy01234567 = _mm256_max_ps(vy01234567, vy_min);
+    vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
+
+    vy01234567 = _mm256_min_ps(vy01234567, vy_max);
+    vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
+
+    _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
+    _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
+    y += 16;
+  }
+  for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
+    const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+    a += 8;
+
+    __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
+
+    vy = _mm256_max_ps(vy, vy_min);
+    vy = _mm256_min_ps(vy, vy_max);
+
+    _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+
+    __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
+
+    vy = _mm256_max_ps(vy, vy_min);
+    vy = _mm256_min_ps(vy, vy_max);
+
+    __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
+    if (n & (4 * sizeof(uint16_t))) {
+      _mm_storel_epi64((__m128i*) y, vh);
+      vh = _mm_unpackhi_epi64(vh, vh);
+      y += 4;
+    }
+    if (n & (2 * sizeof(uint16_t))) {
+      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vh);
+      vh = _mm_srli_epi64(vh, 32);
+      y += 2;
+    }
+    if (n & (1 * sizeof(uint16_t))) {
+      *y = (uint16_t) _mm_extract_epi16(vh, 0);
+    }
+  }
+}
+
+void xnn_f16_vmul_minmax_ukernel__f16c_x16(
+    size_t n,
+    const void* restrict a_ptr,
+    const void* restrict b_ptr,
+    void* restrict y_ptr,
+    const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(uint16_t) == 0);
+  assert(a_ptr != NULL);
+  assert(b_ptr != NULL);
+  assert(y_ptr != NULL);
+
+  const uint16_t* a = (const uint16_t*) a_ptr;
+  const uint16_t* b = (const uint16_t*) b_ptr;
+  uint16_t* y = (uint16_t*) y_ptr;
+
+  const __m256 vy_min = _mm256_load_ps(params->avx.min);
+  const __m256 vy_max = _mm256_load_ps(params->avx.max);
+
+  for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
+    const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+    const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
+    const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
+    const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
+    a += 16;
+    b += 16;
+
+    __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
+    __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
+
+
+    vy01234567 = _mm256_max_ps(vy01234567, vy_min);
+    vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
+
+    vy01234567 = _mm256_min_ps(vy01234567, vy_max);
+    vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
+
+    _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
+    _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
+    y += 16;
+  }
+  for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
+    const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+    const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
+    a += 8;
+    b += 8;
+
+    __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
+
+    vy = _mm256_max_ps(vy, vy_min);
+    vy = _mm256_min_ps(vy, vy_max);
+
+    _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+    const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
+
+    __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
+
+    vy = _mm256_max_ps(vy, vy_min);
+    vy = _mm256_min_ps(vy, vy_max);
+
+    __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
+    if (n & (4 * sizeof(uint16_t))) {
+      _mm_storel_epi64((__m128i*) y, vh);
+      vh = _mm_unpackhi_epi64(vh, vh);
+      y += 4;
+    }
+    if (n & (2 * sizeof(uint16_t))) {
+      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vh);
+      vh = _mm_srli_epi64(vh, 32);
+      y += 2;
+    }
+    if (n & (1 * sizeof(uint16_t))) {
+      *y = (uint16_t) _mm_extract_epi16(vh, 0);
+    }
+  }
+}
+
+void xnn_f16_vmulc_minmax_ukernel__f16c_x16(
+    size_t n,
+    const void* restrict a_ptr,
+    const void* restrict b_ptr,
+    void* restrict y_ptr,
+    const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(uint16_t) == 0);
+  assert(a_ptr != NULL);
+  assert(b_ptr != NULL);
+  assert(y_ptr != NULL);
+
+  const uint16_t* a = (const uint16_t*) a_ptr;
+  const uint16_t* b = (const uint16_t*) b_ptr;
+  uint16_t* y = (uint16_t*) y_ptr;
+
+  const __m256 vy_min = _mm256_load_ps(params->avx.min);
+  const __m256 vy_max = _mm256_load_ps(params->avx.max);
+
+  const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
+  for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
+    const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+    const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
+    a += 16;
+
+    __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va01234567, vb), _MM_FROUND_NO_EXC));
+    __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
+
+
+    vy01234567 = _mm256_max_ps(vy01234567, vy_min);
+    vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
+
+    vy01234567 = _mm256_min_ps(vy01234567, vy_max);
+    vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
+
+    _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
+    _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
+    y += 16;
+  }
+  for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
+    const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+    a += 8;
+
+    __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
+
+    vy = _mm256_max_ps(vy, vy_min);
+    vy = _mm256_min_ps(vy, vy_max);
+
+    _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
+
+    __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
+
+    vy = _mm256_max_ps(vy, vy_min);
+    vy = _mm256_min_ps(vy, vy_max);
+
+    __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
+    if (n & (4 * sizeof(uint16_t))) {
+      _mm_storel_epi64((__m128i*) y, vh);
+      vh = _mm_unpackhi_epi64(vh, vh);
+      y += 4;
+    }
+    if (n & (2 * sizeof(uint16_t))) {
+      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vh);
+      vh = _mm_srli_epi64(vh, 32);
+      y += 2;
+    }
+    if (n & (1 * sizeof(uint16_t))) {
+      *y = (uint16_t) _mm_extract_epi16(vh, 0);
+    }
+  }
+}
+
+void xnn_f16_vhswish_ukernel__f16c_x16(
+    size_t n,
+    const void* restrict x_ptr,
+    void* restrict y_ptr,
+    const union xnn_f16_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(n != 0);
+  assert(n % sizeof(uint16_t) == 0);
+
+  const uint16_t* x = (const uint16_t*) x_ptr;
+  uint16_t* y = (uint16_t*) y_ptr;
+
+  const __m256 vsixth = _mm256_load_ps(params->avx.sixth);
+  const __m256 vthree = _mm256_load_ps(params->avx.three);
+  const __m128i vsix = _mm_load_si128((const __m128i*) params->avx.six);
+  const __m128i vzero = _mm_setzero_si128();
+
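+  // hswish(x) = x * min(max(x + 3, 0), 6) / 6. The clamp to [0, 6] is done with 16-bit integer min/max
+  // directly on the FP16 bit patterns: negative values compare below zero via the sign bit, and
+  // non-negative FP16 values order the same as their integer encodings.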
+  for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
+    __m256 vx01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
+    __m256 vx89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (x + 8)));
+    x += 16;
+
+    __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vx01234567, vthree), _MM_FROUND_NO_EXC);
+    vx01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx01234567, vsixth), _MM_FROUND_NO_EXC));
+    __m128i vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(vx89ABCDEF, vthree), _MM_FROUND_NO_EXC);
+    vx89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx89ABCDEF, vsixth), _MM_FROUND_NO_EXC));
+
+    vacc01234567 = _mm_max_epi16(vacc01234567, vzero);
+    vacc89ABCDEF = _mm_max_epi16(vacc89ABCDEF, vzero);
+
+    vacc01234567 = _mm_min_epi16(vacc01234567, vsix);
+    vacc89ABCDEF = _mm_min_epi16(vacc89ABCDEF, vsix);
+
+    vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vx01234567), _MM_FROUND_NO_EXC);
+    vacc89ABCDEF = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc89ABCDEF), vx89ABCDEF), _MM_FROUND_NO_EXC);
+
+    _mm_storeu_si128((__m128i*) y, vacc01234567);
+    _mm_storeu_si128((__m128i*) (y + 8), vacc89ABCDEF);
+    y += 16;
+  }
+  for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
+    __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
+    x += 8;
+    __m128i vacc = _mm256_cvtps_ph(_mm256_add_ps(vx, vthree), _MM_FROUND_NO_EXC);
+    vx = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx, vsixth), _MM_FROUND_NO_EXC));
+    vacc = _mm_max_epi16(vacc, vzero);
+    vacc = _mm_min_epi16(vacc, vsix);
+    vacc = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc), vx), _MM_FROUND_NO_EXC);
+    _mm_storeu_si128((__m128i*) y, vacc);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
+    __m128i vacc = _mm256_cvtps_ph(_mm256_add_ps(vx, vthree), _MM_FROUND_NO_EXC);
+    vx = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx, vsixth), _MM_FROUND_NO_EXC));
+    vacc = _mm_max_epi16(vacc, vzero);
+    vacc = _mm_min_epi16(vacc, vsix);
+    vacc = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc), vx), _MM_FROUND_NO_EXC);
+
+    if (n & (4 * sizeof(uint16_t))) {
+      _mm_storel_epi64((__m128i*) y, vacc);
+      vacc = _mm_unpackhi_epi64(vacc, vacc);
+      y += 4;
+    }
+    if (n & (2 * sizeof(uint16_t))) {
+      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vacc);
+      vacc = _mm_srli_epi64(vacc, 32);
+      y += 2;
+    }
+    if (n & (1 * sizeof(uint16_t))) {
+      *y = (uint16_t) _mm_extract_epi16(vacc, 0);
+    }
+  }
+}
+
 void xnn_f32_f16_vcvt_ukernel__f16c_x16(
     size_t n,
     const float* input,
diff --git a/src/amalgam/fma3.c b/src/amalgam/fma3.c
index 634210e..13053d6 100644
--- a/src/amalgam/fma3.c
+++ b/src/amalgam/fma3.c
@@ -11,9 +11,1091 @@
 #include <xnnpack/dwconv.h>
 #include <xnnpack/gemm.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vmulcaddc.h>
 #include <xnnpack/vunary.h>
 
 
+void xnn_f16_dwconv_minmax_ukernel_up16x4__fma3(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
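+  // 4-tap depthwise convolution: for each output pixel, four input pointers (one per kernel tap) are
+  // combined per channel. Accumulation happens in FP32 but is rounded back to FP16 after every FMA so
+  // results match a purely half-precision computation.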
+  const __m256 vmax = _mm256_load_ps(params->avx.max);
+  const __m256 vmin = _mm256_load_ps(params->avx.min);
+
+  uint16_t* o = (uint16_t*) output;
+  do {
+    const uint16_t* i0 = input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != zero) {
+      i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
+    }
+    const uint16_t* i1 = input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != zero) {
+      i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
+    }
+    const uint16_t* i2 = input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != zero) {
+      i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
+    }
+    const uint16_t* i3 = input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != zero) {
+      i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
+    }
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const uint16_t* w = weights;
+    for (; c >= 16; c -= 16) {
+      __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+      __m256 vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 8)));
+
+
+      const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+      const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 8)));
+      i0 += 16;
+
+      const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 16)));
+      const __m256 vk0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 24)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+      vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x89ABCDEF, vk0x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+      const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 8)));
+      i1 += 16;
+
+      const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 32)));
+      const __m256 vk1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 40)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+      vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x89ABCDEF, vk1x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+      const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i2 + 8)));
+      i2 += 16;
+
+      const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 48)));
+      const __m256 vk2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 56)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+      vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x89ABCDEF, vk2x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+      const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i3 + 8)));
+      i3 += 16;
+
+      const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 64)));
+      const __m256 vk3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 72)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+      vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x89ABCDEF, vk3x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+      w += 80;
+
+
+      __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
+      __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin);
+      vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
+      vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vmax);
+
+      _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
+      _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc89ABCDEF, _MM_FROUND_NO_EXC));
+      o += 16;
+    }
+    for (; c >= 8; c -= 8) {
+      __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+
+      const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+      i0 += 8;
+
+      const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 16)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+      i1 += 8;
+
+      const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 32)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+      i2 += 8;
+
+      const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 48)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+      i3 += 8;
+
+      const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 64)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      w += 8;
+
+
+      __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
+      vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
+
+      _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
+      o += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      assert(c >= 1);
+      assert(c <= 7);
+
+      __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+
+      const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+
+      const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 16)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+
+      const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 32)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+
+      const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 48)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+
+      const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 64)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+
+      __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
+      vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
+
+      __m128i vh01234567 = _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC);
+      if (c & 4) {
+        _mm_storel_epi64((__m128i*) o, vh01234567);
+        vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
+        o += 4;
+      }
+      if (c & 2) {
+        *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vh01234567);
+        vh01234567 = _mm_srli_epi64(vh01234567, 32);
+        o += 2;
+      }
+      if (c & 1) {
+        *((uint16_t*) o) = (uint16_t) _mm_extract_epi16(vh01234567, 0);
+        o += 1;
+      }
+    }
+
+    o = (uint16_t*) ((uintptr_t) o + output_increment);
+  } while (--output_width != 0);
+}
+
+void xnn_f16_dwconv_minmax_ukernel_up16x9__fma3(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m256 vmax = _mm256_load_ps(params->avx.max);
+  const __m256 vmin = _mm256_load_ps(params->avx.min);
+
+  uint16_t* o = (uint16_t*) output;
+  do {
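+    // One input pointer per tap of the 9-tap (typically 3x3) kernel window; taps that fall into padding
+    // point at the shared `zero` buffer and are not shifted by input_offset.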
+    const uint16_t* i0 = input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != zero) {
+      i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
+    }
+    const uint16_t* i1 = input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != zero) {
+      i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
+    }
+    const uint16_t* i2 = input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != zero) {
+      i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
+    }
+    const uint16_t* i3 = input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != zero) {
+      i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
+    }
+    const uint16_t* i4 = input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != zero) {
+      i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
+    }
+    const uint16_t* i5 = input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != zero) {
+      i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
+    }
+    const uint16_t* i6 = input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != zero) {
+      i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
+    }
+    const uint16_t* i7 = input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != zero) {
+      i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
+    }
+    const uint16_t* i8 = input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != zero) {
+      i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
+    }
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const uint16_t* w = weights;
+    for (; c >= 16; c -= 16) {
+      __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+      __m256 vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 8)));
+
+
+      const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+      const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 8)));
+      i0 += 16;
+
+      const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 16)));
+      const __m256 vk0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 24)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+      vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x89ABCDEF, vk0x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+      const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 8)));
+      i1 += 16;
+
+      const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 32)));
+      const __m256 vk1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 40)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+      vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x89ABCDEF, vk1x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+      const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i2 + 8)));
+      i2 += 16;
+
+      const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 48)));
+      const __m256 vk2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 56)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+      vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x89ABCDEF, vk2x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+      const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i3 + 8)));
+      i3 += 16;
+
+      const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 64)));
+      const __m256 vk3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 72)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+      vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x89ABCDEF, vk3x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+      const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i4 + 8)));
+      i4 += 16;
+
+      const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 80)));
+      const __m256 vk4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 88)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+      vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x89ABCDEF, vk4x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+      const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i5 + 8)));
+      i5 += 16;
+
+      const __m256 vk5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 96)));
+      const __m256 vk5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 104)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+      vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x89ABCDEF, vk5x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+      const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i6 + 8)));
+      i6 += 16;
+
+      const __m256 vk6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 112)));
+      const __m256 vk6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 120)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+      vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x89ABCDEF, vk6x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
+      const __m256 vi7x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i7 + 8)));
+      i7 += 16;
+
+      const __m256 vk7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 128)));
+      const __m256 vk7x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 136)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+      vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x89ABCDEF, vk7x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
+      const __m256 vi8x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i8 + 8)));
+      i8 += 16;
+
+      const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 144)));
+      const __m256 vk8x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 152)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+      vacc89ABCDEFp0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x89ABCDEF, vk8x89ABCDEF, vacc89ABCDEFp0), _MM_FROUND_NO_EXC));
+
+      w += 160;
+
+
+      __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
+      __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin);
+      vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
+      vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vmax);
+
+      _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
+      _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc89ABCDEF, _MM_FROUND_NO_EXC));
+      o += 16;
+    }
+    for (; c >= 8; c -= 8) {
+      __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+
+      const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+      i0 += 8;
+
+      const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 16)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+      i1 += 8;
+
+      const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 32)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+      i2 += 8;
+
+      const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 48)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+      i3 += 8;
+
+      const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 64)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+      i4 += 8;
+
+      const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 80)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+      i5 += 8;
+
+      const __m256 vk5x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 96)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+      i6 += 8;
+
+      const __m256 vk6x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 112)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
+      i7 += 8;
+
+      const __m256 vk7x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 128)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
+      i8 += 8;
+
+      const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 144)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      w += 8;
+
+
+      __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
+      vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
+
+      _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
+      o += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      assert(c >= 1);
+      assert(c <= 7);
+
+      __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+
+      const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+
+      const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 16)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+
+      const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 32)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+
+      const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 48)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+
+      const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 64)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+
+      const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 80)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+
+      const __m256 vk5x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 96)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+
+      const __m256 vk6x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 112)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
+
+      const __m256 vk7x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 128)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
+
+      const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 144)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+
+      __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
+      vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
+
+      __m128i vh01234567 = _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC);
+      if (c & 4) {
+        _mm_storel_epi64((__m128i*) o, vh01234567);
+        vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
+        o += 4;
+      }
+      if (c & 2) {
+        *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vh01234567);
+        vh01234567 = _mm_srli_epi64(vh01234567, 32);
+        o += 2;
+      }
+      if (c & 1) {
+        *((uint16_t*) o) = (uint16_t) _mm_extract_epi16(vh01234567, 0);
+        o += 1;
+      }
+    }
+
+    o = (uint16_t*) ((uintptr_t) o + output_increment);
+  } while (--output_width != 0);
+}
+
+void xnn_f16_dwconv_minmax_ukernel_up8x25__fma3_acc2(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m256 vmax = _mm256_load_ps(params->avx.max);
+  const __m256 vmin = _mm256_load_ps(params->avx.min);
+
+  uint16_t* o = (uint16_t*) output;
+  do {
+    const uint16_t* i0 = input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != zero) {
+      i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
+    }
+    const uint16_t* i1 = input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != zero) {
+      i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
+    }
+    const uint16_t* i2 = input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != zero) {
+      i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
+    }
+    const uint16_t* i3 = input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != zero) {
+      i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
+    }
+    const uint16_t* i4 = input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != zero) {
+      i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
+    }
+    const uint16_t* i5 = input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != zero) {
+      i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
+    }
+    const uint16_t* i6 = input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != zero) {
+      i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
+    }
+    const uint16_t* i7 = input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != zero) {
+      i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
+    }
+    const uint16_t* i8 = input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != zero) {
+      i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
+    }
+    const uint16_t* i9 = input[9];
+    assert(i9 != NULL);
+    if XNN_UNPREDICTABLE(i9 != zero) {
+      i9 = (const uint16_t*) ((uintptr_t) i9 + input_offset);
+    }
+    const uint16_t* i10 = input[10];
+    assert(i10 != NULL);
+    if XNN_UNPREDICTABLE(i10 != zero) {
+      i10 = (const uint16_t*) ((uintptr_t) i10 + input_offset);
+    }
+    const uint16_t* i11 = input[11];
+    assert(i11 != NULL);
+    if XNN_UNPREDICTABLE(i11 != zero) {
+      i11 = (const uint16_t*) ((uintptr_t) i11 + input_offset);
+    }
+    const uint16_t* i12 = input[12];
+    assert(i12 != NULL);
+    if XNN_UNPREDICTABLE(i12 != zero) {
+      i12 = (const uint16_t*) ((uintptr_t) i12 + input_offset);
+    }
+    const uint16_t* i13 = input[13];
+    assert(i13 != NULL);
+    if XNN_UNPREDICTABLE(i13 != zero) {
+      i13 = (const uint16_t*) ((uintptr_t) i13 + input_offset);
+    }
+    const uint16_t* i14 = input[14];
+    assert(i14 != NULL);
+    if XNN_UNPREDICTABLE(i14 != zero) {
+      i14 = (const uint16_t*) ((uintptr_t) i14 + input_offset);
+    }
+    const uint16_t* i15 = input[15];
+    assert(i15 != NULL);
+    if XNN_UNPREDICTABLE(i15 != zero) {
+      i15 = (const uint16_t*) ((uintptr_t) i15 + input_offset);
+    }
+    const uint16_t* i16 = input[16];
+    assert(i16 != NULL);
+    if XNN_UNPREDICTABLE(i16 != zero) {
+      i16 = (const uint16_t*) ((uintptr_t) i16 + input_offset);
+    }
+    const uint16_t* i17 = input[17];
+    assert(i17 != NULL);
+    if XNN_UNPREDICTABLE(i17 != zero) {
+      i17 = (const uint16_t*) ((uintptr_t) i17 + input_offset);
+    }
+    const uint16_t* i18 = input[18];
+    assert(i18 != NULL);
+    if XNN_UNPREDICTABLE(i18 != zero) {
+      i18 = (const uint16_t*) ((uintptr_t) i18 + input_offset);
+    }
+    const uint16_t* i19 = input[19];
+    assert(i19 != NULL);
+    if XNN_UNPREDICTABLE(i19 != zero) {
+      i19 = (const uint16_t*) ((uintptr_t) i19 + input_offset);
+    }
+    const uint16_t* i20 = input[20];
+    assert(i20 != NULL);
+    if XNN_UNPREDICTABLE(i20 != zero) {
+      i20 = (const uint16_t*) ((uintptr_t) i20 + input_offset);
+    }
+    const uint16_t* i21 = input[21];
+    assert(i21 != NULL);
+    if XNN_UNPREDICTABLE(i21 != zero) {
+      i21 = (const uint16_t*) ((uintptr_t) i21 + input_offset);
+    }
+    const uint16_t* i22 = input[22];
+    assert(i22 != NULL);
+    if XNN_UNPREDICTABLE(i22 != zero) {
+      i22 = (const uint16_t*) ((uintptr_t) i22 + input_offset);
+    }
+    const uint16_t* i23 = input[23];
+    assert(i23 != NULL);
+    if XNN_UNPREDICTABLE(i23 != zero) {
+      i23 = (const uint16_t*) ((uintptr_t) i23 + input_offset);
+    }
+    const uint16_t* i24 = input[24];
+    assert(i24 != NULL);
+    if XNN_UNPREDICTABLE(i24 != zero) {
+      i24 = (const uint16_t*) ((uintptr_t) i24 + input_offset);
+    }
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const uint16_t* w = weights;
+    for (; c >= 8; c -= 8) {
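+      // Two accumulators (vacc...p0 and vacc...p1) alternate between taps to shorten the FMA dependency
+      // chain; they are summed back into p0 before the clamp.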
+      __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+
+
+      const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+      i0 += 8;
+
+      const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 8)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+      i1 += 8;
+
+      const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 16)));
+      __m256 vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vi1x01234567, vk1x01234567), _MM_FROUND_NO_EXC));
+
+      const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+      i2 += 8;
+
+      const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 24)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+      i3 += 8;
+
+      const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 32)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+      i4 += 8;
+
+      const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 40)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+      i5 += 8;
+
+      const __m256 vk5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 48)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+      i6 += 8;
+
+      const __m256 vk6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 56)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
+      i7 += 8;
+
+      const __m256 vk7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 64)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
+      i8 += 8;
+
+      const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 72)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi9x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i9));
+      i9 += 8;
+
+      const __m256 vk9x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 80)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi9x01234567, vk9x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi10x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i10));
+      i10 += 8;
+
+      const __m256 vk10x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 88)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi10x01234567, vk10x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi11x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i11));
+      i11 += 8;
+
+      const __m256 vk11x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 96)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi11x01234567, vk11x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi12x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i12));
+      i12 += 8;
+
+      const __m256 vk12x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 104)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi12x01234567, vk12x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi13x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i13));
+      i13 += 8;
+
+      const __m256 vk13x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 112)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi13x01234567, vk13x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi14x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i14));
+      i14 += 8;
+
+      const __m256 vk14x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 120)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi15x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i15));
+      i15 += 8;
+
+      const __m256 vk15x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 128)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi15x01234567, vk15x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi16x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i16));
+      i16 += 8;
+
+      const __m256 vk16x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 136)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi16x01234567, vk16x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi17x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i17));
+      i17 += 8;
+
+      const __m256 vk17x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 144)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi17x01234567, vk17x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi18x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i18));
+      i18 += 8;
+
+      const __m256 vk18x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 152)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi18x01234567, vk18x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi19x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i19));
+      i19 += 8;
+
+      const __m256 vk19x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 160)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi19x01234567, vk19x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi20x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i20));
+      i20 += 8;
+
+      const __m256 vk20x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 168)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi20x01234567, vk20x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi21x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i21));
+      i21 += 8;
+
+      const __m256 vk21x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 176)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi21x01234567, vk21x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi22x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i22));
+      i22 += 8;
+
+      const __m256 vk22x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 184)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi22x01234567, vk22x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi23x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i23));
+      i23 += 8;
+
+      const __m256 vk23x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 192)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi23x01234567, vk23x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi24x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i24));
+      i24 += 8;
+
+      const __m256 vk24x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 200)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi24x01234567, vk24x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      w += 208;
+
+      // Add up all accumulators into vacc01234567p0
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vacc01234567p0, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
+      vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
+
+      _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
+      o += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      assert(c >= 1);
+      assert(c <= 7);
+
+      __m256 vacc01234567p0 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) w));
+
+      const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+
+      const __m256 vk0x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 8)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+
+      const __m256 vk1x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 16)));
+      __m256 vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vi1x01234567, vk1x01234567), _MM_FROUND_NO_EXC));
+
+      const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
+
+      const __m256 vk2x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 24)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
+
+      const __m256 vk3x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 32)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
+
+      const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 40)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
+
+      const __m256 vk5x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 48)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
+
+      const __m256 vk6x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 56)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi7x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
+
+      const __m256 vk7x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 64)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
+
+      const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 72)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi9x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i9));
+
+      const __m256 vk9x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 80)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi9x01234567, vk9x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi10x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i10));
+
+      const __m256 vk10x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 88)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi10x01234567, vk10x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi11x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i11));
+
+      const __m256 vk11x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 96)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi11x01234567, vk11x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi12x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i12));
+
+      const __m256 vk12x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 104)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi12x01234567, vk12x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi13x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i13));
+
+      const __m256 vk13x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 112)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi13x01234567, vk13x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi14x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i14));
+
+      const __m256 vk14x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 120)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi15x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i15));
+
+      const __m256 vk15x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 128)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi15x01234567, vk15x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi16x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i16));
+
+      const __m256 vk16x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 136)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi16x01234567, vk16x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi17x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i17));
+
+      const __m256 vk17x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 144)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi17x01234567, vk17x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi18x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i18));
+
+      const __m256 vk18x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 152)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi18x01234567, vk18x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi19x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i19));
+
+      const __m256 vk19x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 160)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi19x01234567, vk19x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi20x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i20));
+
+      const __m256 vk20x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 168)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi20x01234567, vk20x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi21x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i21));
+
+      const __m256 vk21x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 176)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi21x01234567, vk21x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi22x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i22));
+
+      const __m256 vk22x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 184)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi22x01234567, vk22x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      const __m256 vi23x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i23));
+
+      const __m256 vk23x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 192)));
+      vacc01234567p1 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi23x01234567, vk23x01234567, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      const __m256 vi24x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i24));
+
+      const __m256 vk24x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 200)));
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi24x01234567, vk24x01234567, vacc01234567p0), _MM_FROUND_NO_EXC));
+
+      // Add up all accumulators into vacc01234567p0
+      vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vacc01234567p0, vacc01234567p1), _MM_FROUND_NO_EXC));
+
+      __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
+      vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
+
+      __m128i vh01234567 = _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC);
+      if (c & 4) {
+        _mm_storel_epi64((__m128i*) o, vh01234567);
+        vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
+        o += 4;
+      }
+      if (c & 2) {
+        *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vh01234567);
+        vh01234567 = _mm_srli_epi64(vh01234567, 32);
+        o += 2;
+      }
+      if (c & 1) {
+        *((uint16_t*) o) = (uint16_t) _mm_extract_epi16(vh01234567, 0);
+        o += 1;
+      }
+    }
+
+    o = (uint16_t*) ((uintptr_t) o + output_increment);
+  } while (--output_width != 0);
+}
+
+void xnn_f16_vmulcaddc_minmax_ukernel_c8__fma3_2x(
+    size_t rows,
+    size_t channels,
+    const void*restrict input,
+    size_t input_stride,
+    const void*restrict weights,
+    void*restrict output,
+    size_t output_stride,
+    const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(rows != 0);
+  assert(channels != 0);
+  assert(channels % sizeof(uint16_t) == 0);
+
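+  // Per-channel multiply-add: y = x * scale + bias, with scale and bias packed in `weights`
+  // (8 scales followed by 8 biases per channel group). Two rows are processed per iteration so the
+  // scale/bias loads are amortized; with an odd row count the second row aliases the first.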
+  const uint16_t* i0 = (const uint16_t*) input;
+  uint16_t* o0 = (uint16_t*) output;
+  const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
+  uint16_t* o1 = (uint16_t*) ((uintptr_t) o0 + output_stride);
+
+  const size_t input_increment = input_stride * 2 - channels;
+  const size_t output_increment = output_stride * 2 - channels;
+
+  const __m256 vmin = _mm256_load_ps(params->avx.min);
+  const __m256 vmax = _mm256_load_ps(params->avx.max);
+  do {
+    if XNN_UNPREDICTABLE(rows < 2) {
+      i1 = i0;
+      o1 = o0;
+    }
+
+    const uint16_t* w = (const uint16_t*) weights;
+    size_t c = channels;
+    for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) {
+      const __m256 vscale = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
+
+      __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+      i0 += 8;
+      __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+      i1 += 8;
+
+      const __m256 vbias = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 8)));
+      w += 16;
+
+      vacc0 = _mm256_fmadd_ps(vacc0, vscale, vbias);
+      vacc1 = _mm256_fmadd_ps(vacc1, vscale, vbias);
+
+      vacc0 = _mm256_max_ps(vacc0, vmin);
+      vacc1 = _mm256_max_ps(vacc1, vmin);
+
+      vacc0 = _mm256_min_ps(vacc0, vmax);
+      vacc1 = _mm256_min_ps(vacc1, vmax);
+
+      _mm_storeu_si128((__m128i*) o0, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
+      o0 += 8;
+      _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
+      o1 += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      const __m256 vscale = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
+
+      __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
+      i0 = (const uint16_t*) ((uintptr_t) i0 + c);
+      __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
+      i1 = (const uint16_t*) ((uintptr_t) i1 + c);
+
+      const __m256 vbias = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 8)));
+
+      vacc0 = _mm256_fmadd_ps(vacc0, vscale, vbias);
+      vacc1 = _mm256_fmadd_ps(vacc1, vscale, vbias);
+
+      vacc0 = _mm256_max_ps(vacc0, vmin);
+      vacc1 = _mm256_max_ps(vacc1, vmin);
+
+      vacc0 = _mm256_min_ps(vacc0, vmax);
+      vacc1 = _mm256_min_ps(vacc1, vmax);
+
+      __m128i vh0 = _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC);
+      __m128i vh1 = _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC);
+
+      if (c & (4 * sizeof(uint16_t))) {
+        _mm_storel_epi64((__m128i*) o0, vh0);
+        _mm_storel_epi64((__m128i*) o1, vh1);
+
+        vh0 = _mm_unpackhi_epi64(vh0, vh0);
+        vh1 = _mm_unpackhi_epi64(vh1, vh1);
+
+        o0 += 4;
+        o1 += 4;
+      }
+      if (c & (2 * sizeof(uint16_t))) {
+        *((uint32_t*) o0) = (uint32_t) _mm_cvtsi128_si32(vh0);
+        *((uint32_t*) o1) = (uint32_t) _mm_cvtsi128_si32(vh1);
+
+        vh0 = _mm_srli_epi64(vh0, 32);
+        vh1 = _mm_srli_epi64(vh1, 32);
+
+        o0 += 2;
+        o1 += 2;
+      }
+      if (c & (1 * sizeof(uint16_t))) {
+        *o0 = (uint16_t) _mm_extract_epi16(vh0, 0);
+        *o1 = (uint16_t) _mm_extract_epi16(vh1, 0);
+
+        o0 += 1;
+        o1 += 1;
+      }
+    }
+    i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
+    o0 = (uint16_t*) ((uintptr_t) o0 + output_increment);
+    i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
+    o1 = (uint16_t*) ((uintptr_t) o1 + output_increment);
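+    // doz() (difference-or-zero, from <xnnpack/math.h>) decrements the row count by 2 without underflow.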
+    rows = doz(rows, 2);
+  } while (rows != 0);
+}
+
 void xnn_f32_dwconv_minmax_ukernel_up16x3__fma3(
     size_t channels,
     size_t output_width,
diff --git a/src/init.c b/src/init.c
index 8ae5145..4c73574 100644
--- a/src/init.c
+++ b/src/init.c
@@ -3315,6 +3315,70 @@
     };
   #endif  // XNN_NO_X8_OPERATORS
 
+  /**************************** F16 x86 micro-kernels ****************************/
+  #ifndef XNN_NO_F16_OPERATORS
+    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx2()) {
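+      // F16 operators on x86 are gated on AVX2; the F16C and FMA3 microkernels below are registered under
+      // the same check, on the assumption that AVX2-capable CPUs also support those extensions.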
+      init_flags |= XNN_INIT_FLAG_F16;
+
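+      // GEMM/IGEMM use 4x16 AVX2 tiles for the general case and 1x16 variants for single-row remainders.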
+      xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast);
+      xnn_params.f16.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast);
+      xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast);
+      xnn_params.f16.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast);
+      xnn_params.f16.gemm.init.f16 = xnn_init_f16_scaleminmax_avx_params;
+      xnn_params.f16.gemm.mr = 4;
+      xnn_params.f16.gemm.nr = 16;
+
+      xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__fma3;
+      xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_avx_params;
+      xnn_params.f16.dwconv[0].channel_tile = 16;
+      xnn_params.f16.dwconv[0].primary_tile = 4;
+
+      xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__fma3;
+      xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_avx_params;
+      xnn_params.f16.dwconv[1].channel_tile = 16;
+      xnn_params.f16.dwconv[1].primary_tile = 9;
+
+      xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__fma3_acc2;
+      xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_avx_params;
+      xnn_params.f16.dwconv[2].channel_tile = 8;
+      xnn_params.f16.dwconv[2].primary_tile = 25;
+
+      xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
+        .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8,
+        .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8,
+        .init.f16 = xnn_init_f16_scaleminmax_avx_params,
+        .update.f16 = xnn_update_f16_scaleminmax_avx_params,
+        .row_tile = 7,
+        .channel_tile = 8,
+      };
+      xnn_params.f16.vadd = (struct vbinary_parameters) {
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__f16c_x16,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
+        .init.f16_minmax = xnn_init_f16_minmax_avx_params,
+        .element_tile = 16,
+      };
+      xnn_params.f16.vmul = (struct vbinary_parameters) {
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__f16c_x16,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
+        .init.f16_minmax = xnn_init_f16_minmax_avx_params,
+        .element_tile = 16,
+      };
+      xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
+        .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__fma3_2x,
+        .init.f16 = xnn_init_f16_minmax_avx_params,
+        .channel_tile = 8,
+        .row_tile = 2,
+      };
+      xnn_params.f16.hswish = (struct vunary_parameters) {
+        .ukernel = (xnn_univector_ukernel_function) xnn_f16_vhswish_ukernel__f16c_x16,
+        .init.f16_hswish = xnn_init_f16_hswish_avx_params,
+        .element_tile = 16,
+      };
+    }
+  #endif  // XNN_NO_F16_OPERATORS
+
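The F16 block above initializes its pointers only when cpuinfo reports AVX2, even though the selected kernels also rely on F16C (gavgpool, vbinary, vhswish) and FMA3 (dwconv, vmulcaddc); the single check assumes those extensions accompany AVX2 on shipping x86 CPUs. A minimal sketch of that gate, with a hypothetical wrapper name:

#include <stdbool.h>
#include <cpuinfo.h>

/* Returns true when the F16 x86 code path above may be enabled.
 * cpuinfo_initialize() and cpuinfo_has_x86_avx2() are the cpuinfo entry points
 * used by init.c; the wrapper name itself is hypothetical. */
static bool can_use_x86_f16_kernels(void) {
  if (!cpuinfo_initialize()) {
    return false;  /* CPU detection unavailable: leave XNN_INIT_FLAG_F16 unset */
  }
  /* A single AVX2 check gates kernels built with AVX2, F16C, and FMA3, on the
   * assumption that CPUs reporting AVX2 also support the latter two. */
  return cpuinfo_has_x86_avx2();
}
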
   /**************************** F32 x86 micro-kernels ****************************/
   #ifndef XNN_NO_F32_OPERATORS
     init_flags |= XNN_INIT_FLAG_F32;