// src/xnnpack/microparams.h - platform/external/XNNPACK - Git at Google

 // Copyright 2022 Google LLC
 //
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.

 #pragma once

 #include <stddef.h>
 #include <stdint.h>

 #include <xnnpack/common.h>


 // Default: serves to differentiate pointer types for micro-kernels without fused activation.

 // Parameter type for F16 microkernels that have no fused activation.
 // It carries no real data: the single dummy byte only exists so that the
 // union is a valid, non-empty C type with its own distinct pointer type.
 union xnn_f16_default_params {
   char _; // Dummy member variable to comply with the C standard
 };

 // Parameter type for F32 microkernels that have no fused activation.
 // On x86/x86-64 builds it additionally exposes an `avx` variant holding a
 // 14-entry int32 mask table; on every other architecture only the dummy
 // byte is present.
 union xnn_f32_default_params {
   char _; // Dummy member variable to comply with the C standard
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     // NOTE(review): presumably a sliding window of per-lane masks used for
     // masked loads/stores of remainder elements -- confirm against the AVX kernels.
     int32_t mask_table[14];
   } avx;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };


 // ReLU: serves to differentiate pointer types for micro-kernels with fused ReLU activation.

 // Parameter type for F32 microkernels with a fused ReLU activation.
 // It carries no real data: the dummy byte only makes the union a valid,
 // non-empty C type so these kernels get their own pointer type.
 union xnn_f32_relu_params {
   char _; // Dummy member variable to comply with the C standard
 };


 // Scale+Min+Max: used by AVGPOOL/GAVGPOOL microkernels.

 // Scale + min + max parameters for F16 AVGPOOL/GAVGPOOL microkernels.
 // All variants overlay the same storage; which ones exist depends on the
 // target architecture:
 //   - neon (ARM/ARM64): one 16-bit value each for scale, min and max.
 //   - avx (x86/x86-64): scale, min and max each replicated into 8 floats.
 // On any other architecture only the dummy byte remains.
 union xnn_f16_scaleminmax_params {
   char _; // Dummy member variable to comply with the C standard
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     // NOTE(review): presumably raw IEEE half-precision bit patterns stored
     // in uint16_t -- confirm against the init function.
     uint16_t scale;
     uint16_t min;
     uint16_t max;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     // Note that this variant stores single-precision floats even though the
     // union is for F16 kernels.
     XNN_ALIGN(32) float scale[8];
     XNN_ALIGN(32) float min[8];
     XNN_ALIGN(32) float max[8];
   } avx;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };

 // Scale + min + max parameters for F32 AVGPOOL/GAVGPOOL microkernels.
 // The `scalar` variant is always available; x86/x86-64 builds add an `sse`
 // variant in which each value is replicated into 4 floats.
 // All variants overlay the same storage.
 union xnn_f32_scaleminmax_params {
   // Portable variant: one float per parameter.
   struct {
     float scale;
     float min;
     float max;
   } scalar;
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // SSE variant: 4-element arrays, each annotated XNN_ALIGN(16).
   struct {
     XNN_ALIGN(16) float scale[4];
     XNN_ALIGN(16) float min[4];
     XNN_ALIGN(16) float max[4];
   } sse;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };


 // Min+Max: used by VCLAMP and GEMM/IGEMM/DWCONV/MAXPOOL/etc with MINMAX activation.

 // Min + max clamping parameters for BF16 microkernels with MINMAX activation.
 // Only a scalar variant is defined, and it stores the bounds as
 // single-precision floats (not as 16-bit bfloat values).
 union xnn_bf16_minmax_params {
   struct {
     float min;
     float max;
   } scalar;
 };

 // Min + max clamping parameters for F16 microkernels with MINMAX activation.
 // Variants (all overlay the same storage):
 //   - neon (ARM/ARM64): one 16-bit value each for min and max.
 //   - avx (x86/x86-64): min and max each replicated into 8 floats.
 // On any other architecture only the dummy byte remains.
 union xnn_f16_minmax_params {
   char _; // Dummy member variable to comply with the C standard
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     // NOTE(review): presumably raw IEEE half-precision bit patterns --
     // confirm against the init function.
     uint16_t min;
     uint16_t max;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     // Stored as single-precision floats even though the union is for F16.
     XNN_ALIGN(32) float min[8];
     XNN_ALIGN(32) float max[8];
   } avx;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };

 // Min + max clamping parameters for F32 microkernels with MINMAX activation.
 // Variants (all overlay the same storage):
 //   - scalar: always available, one float per bound.
 //   - sse / avx (x86/x86-64): bounds replicated into 4 / 8 floats; the avx
 //     variant additionally carries a 14-entry int32 mask table.
 //   - wasmsimd (WAsm SIMD / Relaxed SIMD): bounds replicated into 2 floats.
 union xnn_f32_minmax_params {
   struct {
     float min;
     float max;
   } scalar;
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) float min[4];
     XNN_ALIGN(16) float max[4];
   } sse;
   struct {
     XNN_ALIGN(32) float min[8];
     XNN_ALIGN(32) float max[8];
     // NOTE(review): presumably per-lane masks for masked loads/stores of
     // remainder elements -- confirm against the AVX kernels.
     int32_t mask_table[14];
   } avx;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) float min[2];
     XNN_ALIGN(8) float max[2];
   } wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };

 // Min + max clamping parameters for signed 8-bit (S8) microkernels.
 // Variants (all overlay the same storage):
 //   - scalar: bounds widened to int32_t.
 //   - sse2 (x86/x86-64): unsigned 8-bit arrays holding a bias plus biased
 //     min/max bounds.
 //   - sse4 (x86/x86-64): signed 8-bit bounds replicated into 16 lanes.
 //   - neon (ARM/ARM64): a single int8_t per bound.
 //   - wasmsimd (WAsm SIMD / Relaxed SIMD): bounds replicated into 8 lanes.
 union xnn_s8_minmax_params {
   struct {
     int32_t min;
     int32_t max;
   } scalar;
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     // NOTE(review): presumably the bias shifts signed values into the
     // unsigned domain so unsigned byte min/max can be used -- confirm
     // against the SSE2 kernels and the init function.
     XNN_ALIGN(16) uint8_t bias[16];
     XNN_ALIGN(16) uint8_t min_with_bias[16];
     XNN_ALIGN(16) uint8_t max_with_bias[16];
   } sse2;
   struct {
     XNN_ALIGN(16) int8_t min[16];
     XNN_ALIGN(16) int8_t max[16];
   } sse4;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     int8_t min;
     int8_t max;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) int8_t min[8];
     XNN_ALIGN(8) int8_t max[8];
   } wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };

 // Min + max clamping parameters for unsigned 8-bit (U8) microkernels.
 // Variants (all overlay the same storage):
 //   - scalar: bounds widened to uint32_t.
 //   - sse2 (x86/x86-64): bounds replicated into 16 uint8_t lanes.
 //   - neon (ARM/ARM64): a single uint8_t per bound.
 //   - wasmsimd (WAsm SIMD / Relaxed SIMD): bounds replicated into 8 lanes.
 union xnn_u8_minmax_params {
   struct {
     uint32_t min;
     uint32_t max;
   } scalar;
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) uint8_t min[16];
     XNN_ALIGN(16) uint8_t max[16];
   } sse2;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     uint8_t min;
     uint8_t max;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) uint8_t min[8];
     XNN_ALIGN(8) uint8_t max[8];
   } wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };


 // Conv w. Min+Max: used by quantized GEMM/IGEMM/DWCONV microkernels with MINMAX activation.

 // Requantization + clamping parameters for QC8 GEMM/IGEMM/DWCONV microkernels
 // with MINMAX activation.
 //
 // Unlike the QS8/QU8 counterparts, no variant here carries a `scale` field.
 // NOTE(review): presumably because QC8 scales are per-channel and supplied
 // alongside the packed weights -- confirm against the packing code.
 //
 // All variants overlay the same storage. Which ones exist depends on the
 // target architecture:
 //   - fp32_scalar_{imagic,fmagic,lrintf}: always available.
 //   - fp32_armsimd32: 32-bit ARM only.
 //   - fp32_neon, fp32_neonv8: ARM/ARM64.
 //   - fp32_sse2, fp32_sse4, fp32_avx2, fp32_avx512: x86/x86-64, with values
 //     replicated across 128/128/256/512 bits worth of lanes respectively.
 //   - fp32_wasmsimd: WAsm SIMD / Relaxed SIMD.
 union xnn_qc8_conv_minmax_params {
   struct {
     float magic_bias;
     int32_t magic_min;
     int32_t magic_max;
     int32_t magic_bias_less_zero_point;
   } fp32_scalar_imagic;
   struct {
     float output_min_less_zero_point;
     float output_max_less_zero_point;
     float magic_bias;
     int32_t magic_bias_less_output_zero_point;
   } fp32_scalar_fmagic;
   struct {
     float output_min_less_zero_point;
     float output_max_less_zero_point;
     int32_t output_zero_point;
   } fp32_scalar_lrintf;
 #if XNN_ARCH_ARM
   struct {
     float magic_bias;
     int32_t magic_bias_less_zero_point;
     uint32_t output_min;
     uint32_t output_max;
   } fp32_armsimd32;
 #endif  // XNN_ARCH_ARM
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     float magic_bias;
     int32_t magic_bias_less_output_zero_point;
     int8_t output_min;
     int8_t output_max;
   } fp32_neon;
   struct {
     int16_t output_zero_point;
     // QC8 outputs are signed 8-bit, so the clamping bounds are int8_t, the
     // same as in the fp32_neon variant. They were previously declared
     // uint8_t, which made negative bounds read back as 128..255. Size and
     // offsets are unchanged.
     int8_t output_min;
     int8_t output_max;
   } fp32_neonv8;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) float output_max_less_zero_point[4];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     // SSE2 variant keeps the lower bound as 16-bit lanes; the SSE4.1 and
     // wider variants below keep it as 8-bit lanes.
     XNN_ALIGN(16) int16_t output_min[8];
   } fp32_sse2;
   struct {
     XNN_ALIGN(16) float output_max_less_zero_point[4];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) int8_t output_min[16];
   } fp32_sse4;
   struct {
     XNN_ALIGN(32) float output_max_less_zero_point[8];
     XNN_ALIGN(32) int16_t output_zero_point[16];
     XNN_ALIGN(32) int8_t output_min[32];
   } fp32_avx2;
   struct {
     XNN_ALIGN(64) float output_max_less_zero_point[16];
     XNN_ALIGN(64) int16_t output_zero_point[32];
     XNN_ALIGN(64) int8_t output_min[64];
   } fp32_avx512;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) float magic_bias[2];
     XNN_ALIGN(8) int32_t magic_min[2];
     XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
     XNN_ALIGN(8) int8_t output_max[8];
   } fp32_wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };

 // Requantization + clamping parameters for QS8 GEMM/IGEMM/DWCONV microkernels
 // with MINMAX activation.
 //
 // All variants overlay the same storage. Which ones exist depends on the
 // target architecture:
 //   - fp32_scalar_{fmagic,imagic,lrintf}: always available.
 //   - fp32_armsimd32: 32-bit ARM only.
 //   - fp32_neon, fp32_neonv8, rndnu_neon: ARM/ARM64. The rndnu variant holds
 //     integer shift/multiplier fields instead of a float scale.
 //   - fp32_sse2, fp32_sse4, fp32_avx2, fp32_avx512: x86/x86-64, with values
 //     replicated across the vector width.
 //   - fp32_wasmsimd: WAsm SIMD / Relaxed SIMD.
 union xnn_qs8_conv_minmax_params {
   struct {
     float scale;
     float output_min_less_zero_point;
     float output_max_less_zero_point;
     float magic_bias;
     int32_t magic_bias_less_output_zero_point;
   } fp32_scalar_fmagic;
   struct {
     float scale;
     float magic_bias;
     int32_t magic_min;
     int32_t magic_max;
     int32_t magic_bias_less_zero_point;
   } fp32_scalar_imagic;
   struct {
     float scale;
     float output_min_less_zero_point;
     float output_max_less_zero_point;
     int32_t output_zero_point;
   } fp32_scalar_lrintf;
 #if XNN_ARCH_ARM
   struct {
     float scale;
     float magic_bias;
     int32_t magic_bias_less_zero_point;
     uint32_t output_min;
     uint32_t output_max;
   } fp32_armsimd32;
 #endif  // XNN_ARCH_ARM
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     float scale;
     float magic_bias;
     int32_t magic_bias_less_output_zero_point;
     int8_t output_min;
     int8_t output_max;
   } fp32_neon;
   struct {
     float scale;
     int16_t output_zero_point;
     int8_t output_min;
     int8_t output_max;
   } fp32_neonv8;
   // Fixed-point variant: shift / multiplier / shift, no float scale.
   struct {
     int32_t right_pre_shift;
     int32_t multiplier;
     int32_t right_post_shift;
     int16_t output_zero_point;
     int8_t output_min;
     int8_t output_max;
   } rndnu_neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) float scale[4];
     XNN_ALIGN(16) float output_max_less_zero_point[4];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     // SSE2 variant keeps the lower bound as 16-bit lanes; the variants
     // below keep it as 8-bit lanes.
     XNN_ALIGN(16) int16_t output_min[8];
   } fp32_sse2;
   struct {
     XNN_ALIGN(16) float scale[4];
     XNN_ALIGN(16) float output_max_less_zero_point[4];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) int8_t output_min[16];
   } fp32_sse4;
   struct {
     XNN_ALIGN(32) float scale[8];
     XNN_ALIGN(32) float output_max_less_zero_point[8];
     XNN_ALIGN(32) int16_t output_zero_point[16];
     XNN_ALIGN(32) int8_t output_min[32];
   } fp32_avx2;
   struct {
     XNN_ALIGN(64) float scale[16];
     XNN_ALIGN(64) float output_max_less_zero_point[16];
     XNN_ALIGN(64) int16_t output_zero_point[32];
     XNN_ALIGN(64) int8_t output_min[64];
   } fp32_avx512;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) float scale[2];
     XNN_ALIGN(8) float magic_bias[2];
     XNN_ALIGN(8) int32_t magic_min[2];
     XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
     XNN_ALIGN(8) int8_t output_max[8];
   } fp32_wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };

 // Requantization + clamping parameters for QU8 GEMM/IGEMM/DWCONV microkernels
 // with MINMAX activation.
 //
 // Compared with the QS8 layout, every variant additionally carries the
 // kernel (weights) zero point, in a form that differs per variant (int32_t,
 // a negated uint32_t, a 4-byte array, or 16-bit lanes).
 //
 // All variants overlay the same storage. Which ones exist depends on the
 // target architecture:
 //   - fp32_scalar_{fmagic,imagic,lrintf}: always available.
 //   - fp32_armsimd32: 32-bit ARM only.
 //   - fp32_neon, fp32_neonv8, rndnu_neon: ARM/ARM64.
 //   - fp32_sse2, fp32_avx2, fp32_avx512: x86/x86-64.
 //   - fp32_wasmsimd: WAsm SIMD / Relaxed SIMD.
 union xnn_qu8_conv_minmax_params {
   struct {
     int32_t kernel_zero_point;
     float scale;
     float output_min_less_zero_point;
     float output_max_less_zero_point;
     float magic_bias;
     int32_t magic_bias_less_output_zero_point;
   } fp32_scalar_fmagic;
   struct {
     int32_t kernel_zero_point;
     float scale;
     float magic_bias;
     int32_t magic_min;
     int32_t magic_max;
     int32_t magic_bias_less_zero_point;
   } fp32_scalar_imagic;
   struct {
     int32_t kernel_zero_point;
     float scale;
     float output_min_less_zero_point;
     float output_max_less_zero_point;
     int32_t output_zero_point;
   } fp32_scalar_lrintf;
 #if XNN_ARCH_ARM
   struct {
     float scale;
     float magic_bias;
     // Note: this variant stores the kernel zero point negated (per its
     // name) and places it after scale/magic_bias, unlike the others.
     uint32_t minus_kernel_zero_point;
     int32_t magic_bias_less_zero_point;
     uint32_t output_min;
     uint32_t output_max;
   } fp32_armsimd32;
 #endif  // XNN_ARCH_ARM
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     uint8_t kernel_zero_point[4];
     float scale;
     float magic_bias;
     int32_t magic_bias_less_output_zero_point;
     uint8_t output_min;
     uint8_t output_max;
   } fp32_neon;
   struct {
     uint8_t kernel_zero_point[4];
     float scale;
     int16_t output_zero_point;
     uint8_t output_min;
     uint8_t output_max;
   } fp32_neonv8;
   // Fixed-point variant: shift / multiplier / shift, no float scale.
   struct {
     uint8_t kernel_zero_point[4];
     int32_t right_pre_shift;
     int32_t multiplier;
     int32_t right_post_shift;
     int16_t output_zero_point;
     uint8_t output_min;
     uint8_t output_max;
   } rndnu_neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) int16_t kernel_zero_point[8];
     XNN_ALIGN(16) float scale[4];
     XNN_ALIGN(16) float output_max_less_zero_point[4];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) uint8_t output_min[16];
   } fp32_sse2;
   struct {
     XNN_ALIGN(32) int16_t kernel_zero_point[16];
     XNN_ALIGN(32) float scale[8];
     XNN_ALIGN(32) float output_max_less_zero_point[8];
     XNN_ALIGN(32) int16_t output_zero_point[16];
     XNN_ALIGN(32) uint8_t output_min[32];
   } fp32_avx2;
   struct {
     XNN_ALIGN(64) int16_t kernel_zero_point[32];
     XNN_ALIGN(64) float scale[16];
     XNN_ALIGN(64) float output_max_less_zero_point[16];
     XNN_ALIGN(64) int16_t output_zero_point[32];
     XNN_ALIGN(64) uint8_t output_min[64];
   } fp32_avx512;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) int16_t kernel_zero_point[4];
     XNN_ALIGN(8) float scale[2];
     XNN_ALIGN(8) float magic_bias[2];
     XNN_ALIGN(8) int32_t magic_min[2];
     XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
     // QU8 outputs are unsigned 8-bit, so the upper bound is uint8_t like
     // every other output bound in this union. It was previously declared
     // int8_t, so bounds above 127 did not round-trip. Size and offsets are
     // unchanged.
     XNN_ALIGN(8) uint8_t output_max[8];
   } fp32_wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };


 // Add w. Min+Max: used by quantized VADD[C] microkernels with MINMAX activation.

 // Parameters for QS8 VADD/VADDC microkernels with MINMAX activation.
 //
 // All variants overlay the same storage. Which ones exist depends on the
 // target architecture:
 //   - scalar: always available.
 //   - neon: ARM/ARM64; keeps the input zero points as separate int8_t fields
 //     and uses a signed `right_shift`.
 //   - sse2, sse4_mul16: x86/x86-64; multipliers are split into 16-bit lo/hi
 //     halves.
 //   - sse4_mul32, avx2, avx512: x86/x86-64; multipliers are 32-bit lanes and
 //     the shift is replicated into 64-bit lanes.
 //   - wasmsimd: WAsm SIMD / Relaxed SIMD.
 union xnn_qs8_add_minmax_params {
   struct {
     int32_t bias;
     int32_t a_multiplier;
     int32_t b_multiplier;
     uint32_t shift;
     int32_t output_min_less_zero_point;
     int32_t output_max_less_zero_point;
     int32_t output_zero_point;
   } scalar;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     int8_t a_zero_point;
     int8_t b_zero_point;
     int16_t output_zero_point;
     int32_t a_multiplier;
     int32_t b_multiplier;
     int32_t right_shift;
     int8_t output_min;
     int8_t output_max;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) int32_t bias[4];
     XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
     XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
     XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
     XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
     // shift and b_multiplier are plain scalars (no XNN_ALIGN, not replicated).
     uint32_t shift;
     uint32_t b_multiplier;
     XNN_ALIGN(16) int16_t output_zero_point[8];
     // SSE2 variant keeps both bounds as 16-bit lanes.
     XNN_ALIGN(16) int16_t output_min[8];
     XNN_ALIGN(16) int16_t output_max[8];
   } sse2;
   struct {
     XNN_ALIGN(16) int32_t bias[4];
     XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
     XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
     XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
     XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
     uint32_t shift;
     uint32_t b_multiplier;
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) int8_t output_min[16];
     XNN_ALIGN(16) int8_t output_max[16];
   } sse4_mul16;
   struct {
     XNN_ALIGN(16) int32_t bias[4];
     XNN_ALIGN(16) int32_t a_multiplier[4];
     XNN_ALIGN(16) int32_t b_multiplier[4];
     XNN_ALIGN(16) uint64_t shift[2];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) int8_t output_min[16];
     XNN_ALIGN(16) int8_t output_max[16];
   } sse4_mul32;
   struct {
     XNN_ALIGN(32) int32_t bias[8];
     XNN_ALIGN(32) int32_t a_multiplier[8];
     XNN_ALIGN(32) int32_t b_multiplier[8];
     XNN_ALIGN(32) uint64_t shift[4];
     XNN_ALIGN(32) int16_t output_zero_point[16];
     // Bounds are only 16 lanes / XNN_ALIGN(16) here, i.e. half the width of
     // the other fields in this variant.
     XNN_ALIGN(16) int8_t output_min[16];
     XNN_ALIGN(16) int8_t output_max[16];
   } avx2;
   struct {
     XNN_ALIGN(64) int32_t bias[16];
     XNN_ALIGN(64) int32_t a_multiplier[16];
     XNN_ALIGN(64) int32_t b_multiplier[16];
     XNN_ALIGN(64) uint64_t shift[8];
     XNN_ALIGN(64) int16_t output_zero_point[32];
     // Bounds are only 32 lanes / XNN_ALIGN(32) here, i.e. half the width of
     // the other fields in this variant.
     XNN_ALIGN(32) int8_t output_min[32];
     XNN_ALIGN(32) int8_t output_max[32];
   } avx512;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) int32_t bias[2];
     XNN_ALIGN(8) int32_t a_multiplier[2];
     XNN_ALIGN(8) int32_t b_multiplier[2];
     uint32_t shift;
     XNN_ALIGN(8) int16_t output_zero_point[4];
     XNN_ALIGN(8) int8_t output_min[8];
     XNN_ALIGN(8) int8_t output_max[8];
   } wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };

 // Parameters for QU8 VADD/VADDC microkernels with MINMAX activation.
 //
 // All variants overlay the same storage. Which ones exist depends on the
 // target architecture:
 //   - scalar: always available; unlike the QS8 scalar layout it has an
 //     explicit `rounding` field.
 //   - neon: ARM/ARM64.
 //   - sse2: x86/x86-64; multipliers split into 16-bit lo/hi halves.
 //   - sse4, avx2, avx512: x86/x86-64; 32-bit multiplier lanes and 64-bit
 //     shift lanes.
 //   - wasmsimd: WAsm SIMD / Relaxed SIMD.
 union xnn_qu8_add_minmax_params {
   struct {
     int32_t bias;
     int32_t a_multiplier;
     int32_t b_multiplier;
     int32_t rounding;
     uint32_t shift;
     int32_t output_min_less_zero_point;
     int32_t output_max_less_zero_point;
     int32_t output_zero_point;
   } scalar;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     uint8_t a_zero_point;
     uint8_t b_zero_point;
     int16_t output_zero_point;
     int32_t a_multiplier;
     int32_t b_multiplier;
     int32_t right_shift;
     uint8_t output_min;
     uint8_t output_max;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) int32_t bias[4];
     XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
     XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
     XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
     XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
     // shift and b_multiplier are plain scalars (no XNN_ALIGN, not replicated).
     uint32_t shift;
     uint32_t b_multiplier;
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) uint8_t output_min[16];
     XNN_ALIGN(16) uint8_t output_max[16];
   } sse2;
   struct {
     XNN_ALIGN(16) int32_t bias[4];
     XNN_ALIGN(16) int32_t a_multiplier[4];
     XNN_ALIGN(16) int32_t b_multiplier[4];
     XNN_ALIGN(16) uint64_t shift[2];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) uint8_t output_min[16];
     XNN_ALIGN(16) uint8_t output_max[16];
   } sse4;
   struct {
     XNN_ALIGN(32) int32_t bias[8];
     XNN_ALIGN(32) int32_t a_multiplier[8];
     XNN_ALIGN(32) int32_t b_multiplier[8];
     XNN_ALIGN(32) uint64_t shift[4];
     XNN_ALIGN(32) int16_t output_zero_point[16];
     // Bounds are only 16 lanes / XNN_ALIGN(16) here, i.e. half the width of
     // the other fields in this variant.
     XNN_ALIGN(16) uint8_t output_min[16];
     XNN_ALIGN(16) uint8_t output_max[16];
   } avx2;
   struct {
     XNN_ALIGN(64) int32_t bias[16];
     XNN_ALIGN(64) int32_t a_multiplier[16];
     XNN_ALIGN(64) int32_t b_multiplier[16];
     XNN_ALIGN(64) uint64_t shift[8];
     XNN_ALIGN(64) int16_t output_zero_point[32];
     // Bounds are only 32 lanes / XNN_ALIGN(32) here, i.e. half the width of
     // the other fields in this variant.
     XNN_ALIGN(32) uint8_t output_min[32];
     XNN_ALIGN(32) uint8_t output_max[32];
   } avx512;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) int32_t bias[2];
     XNN_ALIGN(8) int32_t a_multiplier[2];
     XNN_ALIGN(8) int32_t b_multiplier[2];
     uint32_t shift;
     XNN_ALIGN(8) int16_t output_zero_point[4];
     XNN_ALIGN(8) uint8_t output_min[8];
     XNN_ALIGN(8) uint8_t output_max[8];
   } wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };


 // Mul w. Min+Max: used by quantized VMUL[C] microkernels with MINMAX activation.

 // Parameters for QS8 VMUL/VMULC microkernels with MINMAX activation.
 //
 // Every variant carries both input zero points plus requantization and
 // clamping values. All variants overlay the same storage; which ones exist
 // depends on the target architecture:
 //   - fp32_scalar: always available.
 //   - fp32_neon, fp32_neonv8, rndnu_neon: ARM/ARM64. The rndnu variant holds
 //     integer shift/multiplier fields instead of a float scale.
 //   - fp32_sse2, fp32_sse4: x86/x86-64.
 //   - fp32_wasmsimd: WAsm SIMD / Relaxed SIMD.
 union xnn_qs8_mul_minmax_params {
   struct {
     int32_t a_zero_point;
     int32_t b_zero_point;
     float scale;
     float output_min_less_zero_point;
     float output_max_less_zero_point;
     float magic_bias;
     int32_t magic_bias_less_output_zero_point;
   } fp32_scalar;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     // Zero points are stored as 2-element int8_t arrays in the NEON variants.
     int8_t a_zero_point[2];
     int8_t b_zero_point[2];
     float scale;
     float magic_bias;
     int32_t magic_bias_less_output_zero_point;
     int8_t output_min;
     int8_t output_max;
   } fp32_neon;
   struct {
     int8_t a_zero_point[2];
     int8_t b_zero_point[2];
     float scale;
     int16_t output_zero_point;
     int8_t output_min;
     int8_t output_max;
   } fp32_neonv8;
   struct {
     int8_t a_zero_point[2];
     int8_t b_zero_point[2];
     // Shift fields are named "left_*" here, whereas the conv rndnu layouts
     // in this file name theirs "right_*".
     int32_t left_pre_shift;
     int32_t multiplier;
     int32_t left_post_shift;
     int16_t output_zero_point;
     int8_t output_min;
     int8_t output_max;
   } rndnu_neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) int16_t a_zero_point[8];
     XNN_ALIGN(16) int16_t b_zero_point[8];
     XNN_ALIGN(16) float scale[4];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     // SSE2 variant keeps both bounds as 16-bit lanes.
     XNN_ALIGN(16) int16_t output_min[8];
     XNN_ALIGN(16) int16_t output_max[8];
   } fp32_sse2;
   struct {
     XNN_ALIGN(16) int16_t a_zero_point[8];
     XNN_ALIGN(16) int16_t b_zero_point[8];
     XNN_ALIGN(16) float scale[4];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) int8_t output_min[16];
     XNN_ALIGN(16) int8_t output_max[16];
   } fp32_sse4;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) int16_t a_zero_point[4];
     XNN_ALIGN(8) int16_t b_zero_point[4];
     XNN_ALIGN(8) float scale[2];
     XNN_ALIGN(8) float magic_bias[2];
     XNN_ALIGN(8) int32_t magic_min[2];
     XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
     XNN_ALIGN(8) int8_t output_max[8];
   } fp32_wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };

 // Parameters for QU8 VMUL/VMULC microkernels with MINMAX activation.
 //
 // Every variant carries both input zero points plus requantization and
 // clamping values. All variants overlay the same storage; which ones exist
 // depends on the target architecture:
 //   - fp32_scalar: always available.
 //   - fp32_neon, fp32_neonv8, rndnu_neon: ARM/ARM64. The rndnu variant holds
 //     integer shift/multiplier fields instead of a float scale.
 //   - fp32_sse2: x86/x86-64 (no separate SSE4 variant is defined for QU8).
 //   - fp32_wasmsimd: WAsm SIMD / Relaxed SIMD.
 union xnn_qu8_mul_minmax_params {
   struct {
     int32_t a_zero_point;
     int32_t b_zero_point;
     float scale;
     float output_min_less_zero_point;
     float output_max_less_zero_point;
     float magic_bias;
     int32_t magic_bias_less_output_zero_point;
   } fp32_scalar;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     // Zero points are stored as 2-element uint8_t arrays in the NEON variants.
     uint8_t a_zero_point[2];
     uint8_t b_zero_point[2];
     float scale;
     float magic_bias;
     int32_t magic_bias_less_output_zero_point;
     uint8_t output_min;
     uint8_t output_max;
   } fp32_neon;
   struct {
     uint8_t a_zero_point[2];
     uint8_t b_zero_point[2];
     float scale;
     int16_t output_zero_point;
     uint8_t output_min;
     uint8_t output_max;
   } fp32_neonv8;
   struct {
     uint8_t a_zero_point[2];
     uint8_t b_zero_point[2];
     // Shift fields are named "left_*" here, whereas the conv rndnu layouts
     // in this file name theirs "right_*".
     int32_t left_pre_shift;
     int32_t multiplier;
     int32_t left_post_shift;
     int16_t output_zero_point;
     uint8_t output_min;
     uint8_t output_max;
   } rndnu_neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) int16_t a_zero_point[8];
     XNN_ALIGN(16) int16_t b_zero_point[8];
     XNN_ALIGN(16) float scale[4];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) uint8_t output_min[16];
     XNN_ALIGN(16) uint8_t output_max[16];
   } fp32_sse2;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) int16_t a_zero_point[4];
     XNN_ALIGN(8) int16_t b_zero_point[4];
     XNN_ALIGN(8) float scale[2];
     XNN_ALIGN(8) float magic_bias[2];
     XNN_ALIGN(8) int32_t magic_min[2];
     XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
     XNN_ALIGN(8) uint8_t output_max[8];
   } fp32_wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };


 // AvgPool w. Min+Max: used by quantized GAVGPOOL microkernels with MINMAX activation.

 // Parameters for QS8 GAVGPOOL microkernels with MINMAX activation.
 //
 // Every variant starts with an `init_bias` value followed by requantization
 // and clamping values. All variants overlay the same storage; which ones
 // exist depends on the target architecture:
 //   - fp32_scalar_{fmagic,imagic,lrintf}: always available.
 //   - fp32_neon, fp32_neonv8, rndnu_neon: ARM/ARM64. The rndnu variant holds
 //     integer shift/multiplier fields instead of a float scale.
 //   - fp32_sse2, fp32_sse4: x86/x86-64.
 //   - fp32_wasmsimd: WAsm SIMD / Relaxed SIMD.
 union xnn_qs8_avgpool_minmax_params {
   struct {
     int32_t init_bias;
     float scale;
     float output_min_less_zero_point;
     float output_max_less_zero_point;
     float magic_bias;
     int32_t magic_bias_less_output_zero_point;
   } fp32_scalar_fmagic;
   struct {
     int32_t init_bias;
     float scale;
     float magic_bias;
     int32_t magic_min;
     int32_t magic_max;
     int32_t magic_bias_less_zero_point;
   } fp32_scalar_imagic;
   struct {
     int32_t init_bias;
     float scale;
     float output_min_less_zero_point;
     float output_max_less_zero_point;
     int32_t output_zero_point;
   } fp32_scalar_lrintf;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     int32_t init_bias;
     float scale;
     float magic_bias;
     int32_t magic_bias_less_output_zero_point;
     int8_t output_min;
     int8_t output_max;
   } fp32_neon;
   struct {
     int32_t init_bias;
     float scale;
     int16_t output_zero_point;
     int8_t output_min;
     int8_t output_max;
   } fp32_neonv8;
   struct {
     int32_t init_bias;
     // Shift fields are named "left_*" here, whereas the conv rndnu layouts
     // in this file name theirs "right_*".
     int32_t left_pre_shift;
     int32_t multiplier;
     int32_t left_post_shift;
     int16_t output_zero_point;
     int8_t output_min;
     int8_t output_max;
   } rndnu_neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) int32_t init_bias[4];
     XNN_ALIGN(16) float scale[4];
     XNN_ALIGN(16) float output_max_less_zero_point[4];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     // SSE2 variant keeps the lower bound as 16-bit lanes; SSE4 uses 8-bit.
     XNN_ALIGN(16) int16_t output_min[8];
   } fp32_sse2;
   struct {
     XNN_ALIGN(16) int32_t init_bias[4];
     XNN_ALIGN(16) float scale[4];
     XNN_ALIGN(16) float output_max_less_zero_point[4];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) int8_t output_min[16];
   } fp32_sse4;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) int32_t init_bias[2];
     XNN_ALIGN(8) float scale[2];
     XNN_ALIGN(8) float magic_bias[2];
     XNN_ALIGN(8) int32_t magic_min[2];
     XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
     XNN_ALIGN(8) int8_t output_max[8];
   } fp32_wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };

 // Parameters for QU8 GAVGPOOL microkernels with MINMAX activation, plus a
 // set of legacy layouts used by QU8 AVGPOOL microkernels.
 //
 // All variants overlay the same storage. Two families live in this union:
 //   1. fp32_* variants: start with `init_bias` and carry a float scale.
 //      - fp32_scalar_{fmagic,imagic,lrintf}: always available.
 //      - fp32_neon, fp32_neonv8, rndnu_neon: ARM/ARM64.
 //      - fp32_sse2, fp32_sse4: x86/x86-64 (the two layouts are identical).
 //      - fp32_wasmsimd: WAsm SIMD / Relaxed SIMD.
 //   2. Legacy variants (scalar, neon, sse2): integer multiplier plus
 //      rounding/shift fields, no float scale.
 union xnn_qu8_avgpool_minmax_params {
   struct {
     int32_t init_bias;
     float scale;
     float output_min_less_zero_point;
     float output_max_less_zero_point;
     float magic_bias;
     int32_t magic_bias_less_output_zero_point;
   } fp32_scalar_fmagic;
   struct {
     int32_t init_bias;
     float scale;
     float magic_bias;
     int32_t magic_min;
     int32_t magic_max;
     int32_t magic_bias_less_zero_point;
   } fp32_scalar_imagic;
   struct {
     int32_t init_bias;
     float scale;
     float output_min_less_zero_point;
     float output_max_less_zero_point;
     int32_t output_zero_point;
   } fp32_scalar_lrintf;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     int32_t init_bias;
     float scale;
     float magic_bias;
     int32_t magic_bias_less_output_zero_point;
     uint8_t output_min;
     uint8_t output_max;
   } fp32_neon;
   struct {
     int32_t init_bias;
     float scale;
     int16_t output_zero_point;
     uint8_t output_min;
     uint8_t output_max;
   } fp32_neonv8;
   struct {
     int32_t init_bias;
     // Shift fields are named "left_*" here, whereas the conv rndnu layouts
     // in this file name theirs "right_*".
     int32_t left_pre_shift;
     int32_t multiplier;
     int32_t left_post_shift;
     int16_t output_zero_point;
     uint8_t output_min;
     uint8_t output_max;
   } rndnu_neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) int32_t init_bias[4];
     XNN_ALIGN(16) float scale[4];
     XNN_ALIGN(16) float output_max_less_zero_point[4];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) uint8_t output_min[16];
   } fp32_sse2;
   struct {
     XNN_ALIGN(16) int32_t init_bias[4];
     XNN_ALIGN(16) float scale[4];
     XNN_ALIGN(16) float output_max_less_zero_point[4];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) uint8_t output_min[16];
   } fp32_sse4;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) int32_t init_bias[2];
     XNN_ALIGN(8) float scale[2];
     XNN_ALIGN(8) float magic_bias[2];
     XNN_ALIGN(8) int32_t magic_min[2];
     XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
     XNN_ALIGN(8) uint8_t output_max[8];
   } fp32_wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD

   // Legacy parameters used by QU8 AVGPOOL microkernels
   struct {
     int32_t bias;
     int32_t multiplier;
     // 64-bit rounding term, unlike the 32-bit fields around it.
     int64_t rounding;
     uint32_t right_shift;
     int32_t output_min_less_zero_point;
     int32_t output_max_less_zero_point;
     int32_t output_zero_point;
   } scalar;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     int32_t bias;
     int32_t multiplier;
     // NOTE(review): named left_shift while the scalar/sse2 legacy layouts
     // use right_shift; presumably a negated amount for a NEON shift-left
     // instruction -- confirm against the init function.
     int64_t left_shift;
     int16_t output_zero_point;
     uint8_t output_min;
     uint8_t output_max;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) int32_t bias[4];
     XNN_ALIGN(16) uint32_t multiplier[4];
     XNN_ALIGN(16) uint64_t rounding[2];
     XNN_ALIGN(16) uint64_t right_shift[2];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) uint8_t output_min[16];
     XNN_ALIGN(16) uint8_t output_max[16];
   } sse2;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };


 // Abs: used by VABS microkernels.

 // Parameters for F16 VABS microkernels.
 // Only x86/x86-64 builds carry data: an `sse` variant with a mask replicated
 // into eight 16-bit lanes. On other architectures only the dummy byte exists.
 union xnn_f16_abs_params {
   char _; // Dummy member variable to comply with the C standard
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     // NOTE(review): presumably every bit set except the sign bit, so that
     // AND-ing with it yields the absolute value -- confirm against init code.
     XNN_ALIGN(16) uint16_t nonsign_mask[8];
   } sse;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };

 // Parameters for F32 VABS microkernels.
 // Variants (all overlay the same storage):
 //   - sse (x86/x86-64): mask replicated into 4 float lanes.
 //   - avx (x86/x86-64): mask replicated into 8 float lanes plus a 14-entry
 //     int32 mask table.
 //   - avx512 (x86/x86-64): a single 32-bit mask value.
 //   - wasmsimd (WAsm SIMD / Relaxed SIMD): mask replicated into 2 float lanes.
 // On other architectures only the dummy byte exists.
 union xnn_f32_abs_params {
   char _; // Dummy member variable to comply with the C standard
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     // The mask is typed as float here but as uint32_t in the avx512 variant.
     XNN_ALIGN(16) float nonsign_mask[4];
   } sse;
   struct {
     XNN_ALIGN(32) float nonsign_mask[8];
     // NOTE(review): presumably per-lane masks for masked loads/stores of
     // remainder elements -- confirm against the AVX kernels.
     int32_t mask_table[14];
   } avx;
   struct {
     uint32_t nonsign_mask;
   } avx512;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) float nonsign_mask[2];
   } wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };


 // Cvt (Convert): used by VCVT microkernels.

 // Parameters for F16 -> F32 VCVT (convert) microkernels.
 //
 // All variants overlay the same storage. Which ones exist depends on the
 // target architecture:
 //   - scalar: always available; the full constant set as 32-bit values.
 //   - neon (ARM/ARM64): only `exp_scale`.
 //   - sse_int16 / sse_int32 (x86/x86-64): constants replicated across
 //     128 bits, using 16-bit or 32-bit integer lanes respectively.
 //   - wasmsimd_int16 / wasmsimd_int32 (WAsm SIMD / Relaxed SIMD): the same
 //     two flavours replicated across 64 bits.
 // The *_int32 variants have no `magic_mask` field and store `magic_bias` as
 // an unsigned integer rather than a float.
 union xnn_f16_f32_cvt_params {
   struct {
     uint32_t sign_mask;
     uint32_t exp_offset;
     float exp_scale;
     uint32_t magic_mask;
     float magic_bias;
     uint32_t denorm_cutoff;
   } scalar;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     float exp_scale;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) uint16_t sign_mask[8];
     XNN_ALIGN(16) uint16_t exp_offset[8];
     XNN_ALIGN(16) float exp_scale[4];
     XNN_ALIGN(16) uint16_t magic_mask[8];
     XNN_ALIGN(16) float magic_bias[4];
     // Signed, unlike the unsigned scalar denorm_cutoff.
     XNN_ALIGN(16) int16_t denorm_cutoff[8];
   } sse_int16;
   struct {
     XNN_ALIGN(16) uint32_t sign_mask[4];
     XNN_ALIGN(16) uint32_t exp_offset[4];
     XNN_ALIGN(16) float exp_scale[4];
     XNN_ALIGN(16) uint32_t magic_bias[4];
     XNN_ALIGN(16) int32_t denorm_cutoff[4];
   } sse_int32;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) uint16_t sign_mask[4];
     XNN_ALIGN(8) uint16_t exp_offset[4];
     XNN_ALIGN(8) float exp_scale[2];
     XNN_ALIGN(8) uint16_t magic_mask[4];
     XNN_ALIGN(8) float magic_bias[2];
     XNN_ALIGN(8) int16_t denorm_cutoff[4];
   } wasmsimd_int16;
   struct {
     XNN_ALIGN(8) uint32_t sign_mask[2];
     XNN_ALIGN(8) uint32_t exp_offset[2];
     XNN_ALIGN(8) float exp_scale[2];
     XNN_ALIGN(8) uint32_t magic_bias[2];
     XNN_ALIGN(8) int32_t denorm_cutoff[2];
   } wasmsimd_int32;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };

 // Parameters for F32 -> F16 VCVT (convert) microkernels.
 //
 // All variants overlay the same storage. Which ones exist depends on the
 // target architecture:
 //   - scalar_bitcast / scalar_fabsf: always available. They hold the same
 //     constants except that scalar_fabsf has no `nonsign_mask`, and the
 //     field order differs between the two.
 //   - neon (ARM/ARM64): exponent bias/limit and the two scale factors only.
 //   - sse2 (x86/x86-64): the full constant set replicated across 128 bits.
 //   - f16c (x86/x86-64): only a 14-entry int32 mask table.
 //   - wasmsimd (WAsm SIMD / Relaxed SIMD): constants replicated across
 //     64 bits, without `nonsign_mask`.
 union xnn_f32_f16_cvt_params {
   struct {
     uint32_t nonsign_mask;
     uint32_t exp_bias;
     float scale_to_inf;
     uint32_t expw_max;
     float scale_to_zero;
     uint32_t bias_min;
     uint16_t exph_mask;
     uint16_t manth_mask;
     uint16_t nanh;
   } scalar_bitcast;
   struct {
     float scale_to_inf;
     uint32_t exp_bias;
     float scale_to_zero;
     uint32_t expw_max;
     uint32_t bias_min;
     uint16_t exph_mask;
     uint16_t manth_mask;
     uint16_t nanh;
   } scalar_fabsf;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     uint32_t exp_bias;
     float scale_to_inf;
     uint32_t expw_max;
     float scale_to_zero;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) uint32_t nonsign_mask[4];
     XNN_ALIGN(16) uint32_t exp_bias[4];
     XNN_ALIGN(16) float scale_to_inf[4];
     XNN_ALIGN(16) uint32_t expw_max[4];
     XNN_ALIGN(16) float scale_to_zero[4];
     // 16-bit signed lanes here, versus a single uint32_t in the scalar variants.
     XNN_ALIGN(16) int16_t bias_min[8];
     // manth_mask / exph_mask are 32-bit lanes here (16-bit in the scalar
     // variants) and appear in the opposite order.
     XNN_ALIGN(16) uint32_t manth_mask[4];
     XNN_ALIGN(16) uint32_t exph_mask[4];
     XNN_ALIGN(16) uint16_t nanh[8];
   } sse2;
   struct {
     // NOTE(review): presumably per-lane masks for masked loads/stores of
     // remainder elements -- confirm against the F16C kernels.
     int32_t mask_table[14];
   } f16c;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) uint32_t exp_bias[2];
     XNN_ALIGN(8) float scale_to_inf[2];
     XNN_ALIGN(8) uint32_t expw_max[2];
     XNN_ALIGN(8) float scale_to_zero[2];
     XNN_ALIGN(8) int16_t bias_min[4];
     XNN_ALIGN(8) uint32_t manth_mask[2];
     XNN_ALIGN(8) uint32_t exph_mask[2];
     XNN_ALIGN(8) uint16_t nanh[4];
   } wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };

 // Parameters for the f32 -> qs8 conversion micro-kernels (direction inferred from the
 // type name). Each member struct is the parameter layout for one kernel variant;
 // only the scalar_* members are unconditional, the others are compiled in under the
 // matching XNN_ARCH_* guard. Field order and alignment are part of the layout and
 // must not be rearranged.
 union xnn_f32_qs8_cvt_params {
   // Portable scalar variants (available on every target).
   struct {
     float scale;
     float output_min_less_zero_point;
     float output_max_less_zero_point;
     float magic_bias;
     int32_t magic_bias_less_zero_point;
   } scalar_fmagic;
   struct {
     float scale;
     float magic_bias;
     int32_t magic_min;
     int32_t magic_max;
     int32_t magic_bias_less_zero_point;
   } scalar_imagic;
   struct {
     float scale;
     float output_min_less_zero_point;
     float output_max_less_zero_point;
     int32_t output_zero_point;
   } scalar_lrintf;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   // ARM variants store one scalar per constant; the output bounds are int8_t.
   struct {
     float scale;
     float magic_bias;
     int32_t magic_bias_less_zero_point;
     int8_t output_min;
     int8_t output_max;
   } neon;
   struct {
     float scale;
     int16_t output_zero_point;
     int8_t output_min;
     int8_t output_max;
   } neonv8;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // x86 variants store each constant as an XNN_ALIGN-ed array.
   // Note that output_min is int16_t[8] in sse2 but int8_t[16] in sse4.
   struct {
     XNN_ALIGN(16) float scale[4];
     XNN_ALIGN(16) float output_max_less_zero_point[4];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) int16_t output_min[8];
   } sse2;
   struct {
     XNN_ALIGN(16) float scale[4];
     XNN_ALIGN(16) float output_max_less_zero_point[4];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) int8_t output_min[16];
   } sse4;
   struct {
     XNN_ALIGN(32) float scale[8];
     XNN_ALIGN(32) float output_max_less_zero_point[8];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) int8_t output_min[16];
     // mask_table carries no explicit XNN_ALIGN, unlike the fields above.
     int32_t mask_table[14];
   } avx;
   struct {
     XNN_ALIGN(32) float scale[8];
     XNN_ALIGN(32) float output_max_less_zero_point[8];
     XNN_ALIGN(32) int16_t output_zero_point[16];
     XNN_ALIGN(32) uint32_t shuffle_mask[8];
     XNN_ALIGN(32) int8_t output_min[32];
     int32_t mask_table[14];
   } avx2;
   struct {
     XNN_ALIGN(64) float scale[16];
     XNN_ALIGN(64) float output_max_less_zero_point[16];
     XNN_ALIGN(64) int16_t output_zero_point[32];
     XNN_ALIGN(64) int8_t output_min[64];
     XNN_ALIGN(64) uint32_t shuffle512_mask[16];
     XNN_ALIGN(32) uint32_t shuffle256_mask[8];
   } avx512;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   // WAsm SIMD variants store each constant as an 8-byte-aligned array.
   struct {
     XNN_ALIGN(8) float scale[2];
     XNN_ALIGN(8) int16_t output_zero_point[4];
     XNN_ALIGN(8) int8_t output_min[8];
     XNN_ALIGN(8) int8_t output_max[8];
   } wasmsimd_cvt;
   struct {
     XNN_ALIGN(8) float scale[2];
     XNN_ALIGN(8) float magic_bias[2];
     XNN_ALIGN(8) int32_t magic_min[2];
     XNN_ALIGN(8) int32_t magic_bias_less_zero_point[2];
     XNN_ALIGN(8) int8_t output_max[8];
   } wasmsimd_magic;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };

 // Parameters for the f32 -> qu8 conversion micro-kernels (direction inferred from the
 // type name). Mirrors xnn_f32_qs8_cvt_params, except that the 8-bit output bounds
 // are uint8_t and there is no sse4 member. Only the scalar_* members are
 // unconditional; the others are compiled in under the matching XNN_ARCH_* guard.
 union xnn_f32_qu8_cvt_params {
   // Portable scalar variants (available on every target).
   struct {
     float scale;
     float output_min_less_zero_point;
     float output_max_less_zero_point;
     float magic_bias;
     int32_t magic_bias_less_zero_point;
   } scalar_fmagic;
   struct {
     float scale;
     float magic_bias;
     int32_t magic_min;
     int32_t magic_max;
     int32_t magic_bias_less_zero_point;
   } scalar_imagic;
   struct {
     float scale;
     float output_min_less_zero_point;
     float output_max_less_zero_point;
     int32_t output_zero_point;
   } scalar_lrintf;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   // ARM variants store one scalar per constant; the output bounds are uint8_t.
   struct {
     float scale;
     float magic_bias;
     int32_t magic_bias_less_zero_point;
     uint8_t output_min;
     uint8_t output_max;
   } neon;
   struct {
     float scale;
     int16_t output_zero_point;
     uint8_t output_min;
     uint8_t output_max;
   } neonv8;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // x86 variants store each constant as an XNN_ALIGN-ed array.
   struct {
     XNN_ALIGN(16) float scale[4];
     XNN_ALIGN(16) float output_max_less_zero_point[4];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) uint8_t output_min[16];
   } sse2;
   struct {
     XNN_ALIGN(32) float scale[8];
     XNN_ALIGN(32) float output_max_less_zero_point[8];
     XNN_ALIGN(16) int16_t output_zero_point[8];
     XNN_ALIGN(16) uint8_t output_min[16];
     // mask_table carries no explicit XNN_ALIGN, unlike the fields above.
     int32_t mask_table[14];
   } avx;
   struct {
     XNN_ALIGN(32) float scale[8];
     XNN_ALIGN(32) float output_max_less_zero_point[8];
     XNN_ALIGN(32) int16_t output_zero_point[16];
     XNN_ALIGN(32) uint32_t shuffle_mask[8];
     XNN_ALIGN(32) uint8_t output_min[32];
     int32_t mask_table[14];
   } avx2;
   struct {
     XNN_ALIGN(64) float scale[16];
     XNN_ALIGN(64) float output_max_less_zero_point[16];
     XNN_ALIGN(64) int16_t output_zero_point[32];
     XNN_ALIGN(64) uint8_t output_min[64];
     XNN_ALIGN(64) uint32_t shuffle512_mask[16];
     XNN_ALIGN(32) uint32_t shuffle256_mask[8];
   } avx512;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   // WAsm SIMD variants store each constant as an 8-byte-aligned array.
   struct {
     XNN_ALIGN(8) float scale[2];
     XNN_ALIGN(8) int16_t output_zero_point[4];
     XNN_ALIGN(8) uint8_t output_min[8];
     XNN_ALIGN(8) uint8_t output_max[8];
   } wasmsimd_cvt;
   struct {
     XNN_ALIGN(8) float scale[2];
     XNN_ALIGN(8) float magic_bias[2];
     XNN_ALIGN(8) int32_t magic_min[2];
     XNN_ALIGN(8) int32_t magic_bias_less_zero_point[2];
     XNN_ALIGN(8) uint8_t output_max[8];
   } wasmsimd_magic;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };

 // Parameters for the qs8 -> qs8 conversion (requantization) micro-kernels
 // (purpose inferred from the type name — TODO confirm). Two encodings are visible:
 // a {bias, multiplier} form (scalar, armsimd32, sse2) and an
 // {input_zero_point, multiplier, output_zero_point} form (neon, ssse3, avx2, wasmsimd).
 union xnn_qs8_cvt_params {
   // Portable scalar variant (available on every target).
   struct {
     int32_t bias;
     int32_t multiplier;
   } scalar;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     // Stored already negated (per the field name) and as an unsigned 32-bit word.
     uint32_t minus_input_zero_point;
     int32_t multiplier;
     int32_t bias;
   } armsimd32;
   struct {
     int16_t input_zero_point;
     int16_t multiplier;
     int16_t output_zero_point;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // x86 variants store each constant as an XNN_ALIGN-ed array.
   struct {
     XNN_ALIGN(16) int16_t multiplier[8];
     XNN_ALIGN(16) int32_t bias[4];
   } sse2;
   struct {
     XNN_ALIGN(16) int16_t input_zero_point[8];
     XNN_ALIGN(16) int16_t multiplier[8];
     XNN_ALIGN(16) int16_t output_zero_point[8];
   } ssse3;
   struct {
     XNN_ALIGN(32) int16_t input_zero_point[16];
     XNN_ALIGN(32) int16_t multiplier[16];
     XNN_ALIGN(32) int16_t output_zero_point[16];
   } avx2;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) int16_t input_zero_point[4];
     XNN_ALIGN(8) int16_t multiplier[4];
     XNN_ALIGN(8) int16_t output_zero_point[4];
   } wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };

 // Parameters for the qs8 -> f32 conversion micro-kernels (direction inferred from the
 // type name). Every variant carries a zero point (stored negated in all but the
 // scalar and sse2 members, per the field names) and a float scale.
 union xnn_qs8_f32_cvt_params {
   // Portable scalar variant (available on every target).
   struct {
     int32_t zero_point;
     float scale;
   } scalar;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     // Two int16_t copies occupy 4 bytes ahead of the float scale.
     int16_t minus_zero_point[2];
     float scale;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // The sse2 member has no zero-point field; it carries sign_mask/magic_exp/magic_bias
   // instead. NOTE(review): presumably the zero point is folded into magic_bias — confirm.
   struct {
     XNN_ALIGN(16) uint8_t sign_mask[16];
     XNN_ALIGN(16) uint16_t magic_exp[8];
     XNN_ALIGN(16) float magic_bias[4];
     XNN_ALIGN(16) float scale[4];
   } sse2;
   struct {
     XNN_ALIGN(16) int32_t minus_zero_point[4];
     XNN_ALIGN(16) float scale[4];
   } sse4;
   struct {
     XNN_ALIGN(32) int32_t minus_zero_point[8];
     XNN_ALIGN(32) float scale[8];
   } avx;
   struct {
     XNN_ALIGN(64) int32_t minus_zero_point[16];
     XNN_ALIGN(64) float scale[16];
   } avx512;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) int16_t minus_zero_point[4];
     XNN_ALIGN(8) float scale[2];
   } wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };

 // Parameters for the qu8 -> qu8 conversion (requantization) micro-kernels
 // (purpose inferred from the type name — TODO confirm). Same member set as
 // xnn_qs8_cvt_params, but input_zero_point fields and the sse2 multiplier are
 // unsigned 16-bit here.
 union xnn_qu8_cvt_params {
   // Portable scalar variant (available on every target).
   struct {
     int32_t bias;
     int32_t multiplier;
   } scalar;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     // Stored already negated (per the field name) and as an unsigned 32-bit word.
     uint32_t minus_input_zero_point;
     int32_t multiplier;
     int32_t bias;
   } armsimd32;
   struct {
     uint16_t input_zero_point;
     int16_t multiplier;
     int16_t output_zero_point;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // x86 variants store each constant as an XNN_ALIGN-ed array.
   struct {
     // Unsigned here, whereas ssse3/avx2 below keep a signed int16_t multiplier.
     XNN_ALIGN(16) uint16_t multiplier[8];
     XNN_ALIGN(16) int32_t bias[4];
   } sse2;
   struct {
     XNN_ALIGN(16) uint16_t input_zero_point[8];
     XNN_ALIGN(16) int16_t multiplier[8];
     XNN_ALIGN(16) int16_t output_zero_point[8];
   } ssse3;
   struct {
     XNN_ALIGN(32) uint16_t input_zero_point[16];
     XNN_ALIGN(32) int16_t multiplier[16];
     XNN_ALIGN(32) int16_t output_zero_point[16];
   } avx2;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) uint16_t input_zero_point[4];
     XNN_ALIGN(8) int16_t multiplier[4];
     XNN_ALIGN(8) int16_t output_zero_point[4];
   } wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };

 // Parameters for the qu8 -> f32 conversion micro-kernels (direction inferred from the
 // type name). Same shape as xnn_qs8_f32_cvt_params, except that the sse2 member has
 // no sign_mask field.
 union xnn_qu8_f32_cvt_params {
   // Portable scalar variant (available on every target).
   struct {
     int32_t zero_point;
     float scale;
   } scalar;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     // Two int16_t copies occupy 4 bytes ahead of the float scale.
     int16_t minus_zero_point[2];
     float scale;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // The sse2 member has no zero-point field; it carries magic_exp/magic_bias instead.
   // NOTE(review): presumably the zero point is folded into magic_bias — confirm.
   struct {
     XNN_ALIGN(16) uint16_t magic_exp[8];
     XNN_ALIGN(16) float magic_bias[4];
     XNN_ALIGN(16) float scale[4];
   } sse2;
   struct {
     XNN_ALIGN(16) int32_t minus_zero_point[4];
     XNN_ALIGN(16) float scale[4];
   } sse4;
   struct {
     XNN_ALIGN(32) int32_t minus_zero_point[8];
     XNN_ALIGN(32) float scale[8];
   } avx;
   struct {
     XNN_ALIGN(64) int32_t minus_zero_point[16];
     XNN_ALIGN(64) float scale[16];
   } avx512;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) int16_t minus_zero_point[4];
     XNN_ALIGN(8) float scale[2];
   } wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };


 // ELU: used by VELU microkernels.

 // Parameters for the f16 ELU micro-kernels. There is no portable scalar variant, so
 // the dummy member keeps the union non-empty on targets where neither guard is active.
 union xnn_f16_elu_params {
   char _; // Dummy member variable to comply with the C standard
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   // Constants are held in uint16_t fields (presumably raw IEEE half-precision bit
   // patterns — TODO confirm). This variant stores minus_alpha and has no c1,
   // unlike the x86 member below.
   struct {
     uint16_t prescale;
     uint16_t sat_cutoff;
     uint16_t magic_bias;
     uint16_t log2e;
     uint16_t minus_ln2;
     uint16_t c3;
     uint16_t c2;
     uint16_t minus_alpha;
     uint16_t beta;
   } neonfp16arith_rr1_p3;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // The x86 variant holds its constants as 32-byte-aligned float[8] arrays, even
   // though the kernel data type is f16.
   struct {
     XNN_ALIGN(32) float prescale[8];
     XNN_ALIGN(32) float sat_cutoff[8];
     XNN_ALIGN(32) float magic_bias[8];
     XNN_ALIGN(32) float log2e[8];
     XNN_ALIGN(32) float minus_ln2[8];
     XNN_ALIGN(32) float c3[8];
     XNN_ALIGN(32) float c2[8];
     XNN_ALIGN(32) float c1[8];
     XNN_ALIGN(32) float alpha[8];
     XNN_ALIGN(32) float beta[8];
   } avx2_rr1_p3;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };

 // Parameters for the f32 ELU micro-kernels; one member struct per kernel variant.
 //
 // Observable naming pattern in this union: *_rr2_* members carry a split
 // minus_ln2_hi/minus_ln2_lo pair while *_rr1_* members carry a single minus_ln2;
 // *_p6 members carry coefficients c6..c2, *_p4 carry c4..c2 and *_p3 carry c3..c2.
 // NOTE(review): presumably rrN = N-step range reduction, lutK = K-entry lookup
 // table, pD = degree-D polynomial — confirm against the kernels.
 //
 // Field order and alignment are part of the layout and must not be rearranged.
 union xnn_f32_elu_params {
   // Portable scalar variants (available on every target).
   struct {
     float prescale;
     float alpha;
     float beta;
     float sat_cutoff;
     float magic_bias;
     float log2e;
     float minus_ln2_hi;
     float minus_ln2_lo;
     float c3;
     float c2;
     float one;
   } scalar_rr2_lut16_p3;
   struct {
     float prescale;
     float alpha;
     float beta;
     float sat_cutoff;
     float magic_bias;
     float log2e;
     float minus_ln2_hi;
     float minus_ln2_lo;
     float c6;
     float c5;
     float c4;
     float c3;
     float c2;
     float one;
   } scalar_rr2_p6;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   // ARM variants store one float per constant and, unlike the scalar variants,
   // have no `one` field.
   struct {
     float prescale;
     float alpha;
     float beta;
     float sat_cutoff;
     float magic_bias;
     float log2e;
     float minus_ln2_hi;
     float minus_ln2_lo;
     float c6;
     float c5;
     float c4;
     float c3;
     float c2;
   } neon_rr2_p6;
   struct {
     float prescale;
     float alpha;
     float beta;
     float sat_cutoff;
     float magic_bias;
     float log2e;
     float minus_ln2_hi;
     float minus_ln2_lo;
     float c3;
     float c2;
   } neon_rr2_lut16_p3;
   struct {
     float prescale;
     float alpha;
     float beta;
     float sat_cutoff;
     float magic_bias;
     float log2e;
     float minus_ln2;
     float c6;
     float c5;
     float c4;
     float c3;
     float c2;
   } neonfma_rr1_p6;
   struct {
     float prescale;
     float alpha;
     float beta;
     float sat_cutoff;
     float magic_bias;
     float log2e;
     float minus_ln2;
     float c3;
     float c2;
   } neonfma_rr1_lut16_p3;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // SSE2 variants: every constant is a 16-byte-aligned 4-element array.
   struct {
     XNN_ALIGN(16) float prescale[4];
     XNN_ALIGN(16) float alpha[4];
     XNN_ALIGN(16) float beta[4];
     XNN_ALIGN(16) float sat_cutoff[4];
     XNN_ALIGN(16) float magic_bias[4];
     XNN_ALIGN(16) float log2e[4];
     XNN_ALIGN(16) uint32_t index_mask[4];
     XNN_ALIGN(16) float minus_ln2_hi[4];
     XNN_ALIGN(16) float minus_ln2_lo[4];
     XNN_ALIGN(16) float c3[4];
     XNN_ALIGN(16) float c2[4];
     XNN_ALIGN(16) float one[4];
   } sse2_rr2_lut16_p3;
   struct {
     XNN_ALIGN(16) float prescale[4];
     XNN_ALIGN(16) float alpha[4];
     XNN_ALIGN(16) float beta[4];
     XNN_ALIGN(16) float sat_cutoff[4];
     XNN_ALIGN(16) float magic_bias[4];
     XNN_ALIGN(16) float log2e[4];
     XNN_ALIGN(16) float minus_ln2_hi[4];
     XNN_ALIGN(16) float minus_ln2_lo[4];
     XNN_ALIGN(16) float c6[4];
     XNN_ALIGN(16) float c5[4];
     XNN_ALIGN(16) float c4[4];
     XNN_ALIGN(16) float c3[4];
     XNN_ALIGN(16) float c2[4];
     XNN_ALIGN(16) float one[4];
   } sse2_rr2_p6;
   // AVX / AVX2 variants: every constant is a 32-byte-aligned 8-element array,
   // followed by a mask_table that carries no explicit XNN_ALIGN.
   struct {
     XNN_ALIGN(32) float prescale[8];
     XNN_ALIGN(32) float alpha[8];
     XNN_ALIGN(32) float beta[8];
     XNN_ALIGN(32) float sat_cutoff[8];
     XNN_ALIGN(32) float magic_bias[8];
     XNN_ALIGN(32) float log2e[8];
     XNN_ALIGN(32) uint32_t index_mask[8];
     XNN_ALIGN(32) float minus_ln2_hi[8];
     XNN_ALIGN(32) float minus_ln2_lo[8];
     XNN_ALIGN(32) float c3[8];
     XNN_ALIGN(32) float c2[8];
     XNN_ALIGN(32) float one[8];
     int32_t mask_table[14];
   } avx_rr2_lut16_p3;
   struct {
     XNN_ALIGN(32) float prescale[8];
     XNN_ALIGN(32) float alpha[8];
     XNN_ALIGN(32) float beta[8];
     XNN_ALIGN(32) float sat_cutoff[8];
     XNN_ALIGN(32) float magic_bias[8];
     XNN_ALIGN(32) float log2e[8];
     XNN_ALIGN(32) uint32_t index_mask[8];
     // The lookup table is embedded in the parameter block for this variant.
     XNN_ALIGN(32) float table[8];
     XNN_ALIGN(32) float minus_ln2_hi[8];
     XNN_ALIGN(32) float minus_ln2_lo[8];
     XNN_ALIGN(32) float c4[8];
     XNN_ALIGN(32) float c3[8];
     XNN_ALIGN(32) float c2[8];
     XNN_ALIGN(32) float one[8];
     int32_t mask_table[14];
   } avx_rr2_lut4_p4;
   struct {
     XNN_ALIGN(32) float prescale[8];
     XNN_ALIGN(32) float alpha[8];
     XNN_ALIGN(32) float beta[8];
     XNN_ALIGN(32) float sat_cutoff[8];
     XNN_ALIGN(32) float magic_bias[8];
     XNN_ALIGN(32) float log2e[8];
     XNN_ALIGN(32) float minus_ln2_hi[8];
     XNN_ALIGN(32) float minus_ln2_lo[8];
     XNN_ALIGN(32) float c6[8];
     XNN_ALIGN(32) float c5[8];
     XNN_ALIGN(32) float c4[8];
     XNN_ALIGN(32) float c3[8];
     XNN_ALIGN(32) float c2[8];
     XNN_ALIGN(32) float one[8];
     int32_t mask_table[14];
   } avx_rr2_p6;
   struct {
     XNN_ALIGN(32) float prescale[8];
     XNN_ALIGN(32) float alpha[8];
     XNN_ALIGN(32) float beta[8];
     XNN_ALIGN(32) float sat_cutoff[8];
     XNN_ALIGN(32) float magic_bias[8];
     XNN_ALIGN(32) float log2e[8];
     XNN_ALIGN(32) uint32_t index_mask[8];
     XNN_ALIGN(32) float minus_ln2[8];
     XNN_ALIGN(32) float c3[8];
     XNN_ALIGN(32) float c2[8];
     int32_t mask_table[14];
   } avx2_rr1_lut16_p3;
   struct {
     XNN_ALIGN(32) float prescale[8];
     XNN_ALIGN(32) float alpha[8];
     XNN_ALIGN(32) float beta[8];
     XNN_ALIGN(32) float sat_cutoff[8];
     XNN_ALIGN(32) float magic_bias[8];
     XNN_ALIGN(32) float log2e[8];
     // Note: table is uint32_t here but float in the lut4 variants.
     XNN_ALIGN(32) uint32_t table[8];
     XNN_ALIGN(32) float minus_ln2[8];
     XNN_ALIGN(32) float c4[8];
     XNN_ALIGN(32) float c3[8];
     XNN_ALIGN(32) float c2[8];
     int32_t mask_table[14];
   } avx2_rr1_lut8_p4;
   struct {
     XNN_ALIGN(32) float prescale[8];
     XNN_ALIGN(32) float alpha[8];
     XNN_ALIGN(32) float beta[8];
     XNN_ALIGN(32) float sat_cutoff[8];
     XNN_ALIGN(32) float magic_bias[8];
     XNN_ALIGN(32) float log2e[8];
     XNN_ALIGN(32) float table[8];
     XNN_ALIGN(32) float minus_ln2[8];
     XNN_ALIGN(32) float c4[8];
     XNN_ALIGN(32) float c3[8];
     XNN_ALIGN(32) float c2[8];
     int32_t mask_table[14];
   } avx2_rr1_lut4_p4;
   struct {
     XNN_ALIGN(32) float prescale[8];
     XNN_ALIGN(32) float alpha[8];
     XNN_ALIGN(32) float beta[8];
     XNN_ALIGN(32) float sat_cutoff[8];
     XNN_ALIGN(32) float magic_bias[8];
     XNN_ALIGN(32) float log2e[8];
     XNN_ALIGN(32) float minus_ln2[8];
     XNN_ALIGN(32) float c6[8];
     XNN_ALIGN(32) float c5[8];
     XNN_ALIGN(32) float c4[8];
     XNN_ALIGN(32) float c3[8];
     XNN_ALIGN(32) float c2[8];
     int32_t mask_table[14];
   } avx2_rr1_p6;
   // AVX512 variants store one float per constant (not arrays); only the lookup
   // table is a 64-byte-aligned array.
   struct {
     float prescale;
     float alpha;
     float beta;
     float sat_cutoff;
     float magic_bias;
     float log2e;
     float minus_ln2;
     float c3;
     float c2;
     XNN_ALIGN(64) uint32_t table[16];
   } avx512_rr1_lut16_p3;
   struct {
     float prescale;
     float alpha;
     float beta;
     float sat_cutoff;
     float magic_bias;
     float log2e;
     float minus_ln2;
     float c6;
     float c5;
     float c4;
     float c3;
     float c2;
   } avx512_rr1_p6;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   // WAsm SIMD variants: every constant is an 8-byte-aligned 2-element array.
   struct {
     XNN_ALIGN(8) float prescale[2];
     XNN_ALIGN(8) float alpha[2];
     XNN_ALIGN(8) float beta[2];
     XNN_ALIGN(8) float sat_cutoff[2];
     XNN_ALIGN(8) float magic_bias[2];
     XNN_ALIGN(8) float log2e[2];
     XNN_ALIGN(8) uint32_t index_mask[2];
     XNN_ALIGN(8) float minus_ln2_hi[2];
     XNN_ALIGN(8) float minus_ln2_lo[2];
     XNN_ALIGN(8) float c3[2];
     XNN_ALIGN(8) float c2[2];
     XNN_ALIGN(8) float one[2];
   } wasmsimd_rr2_lut16_p3;
   struct {
     XNN_ALIGN(8) float prescale[2];
     XNN_ALIGN(8) float alpha[2];
     XNN_ALIGN(8) float beta[2];
     XNN_ALIGN(8) float sat_cutoff[2];
     XNN_ALIGN(8) float magic_bias[2];
     XNN_ALIGN(8) float log2e[2];
     XNN_ALIGN(8) float minus_ln2_hi[2];
     XNN_ALIGN(8) float minus_ln2_lo[2];
     XNN_ALIGN(8) float c6[2];
     XNN_ALIGN(8) float c5[2];
     XNN_ALIGN(8) float c4[2];
     XNN_ALIGN(8) float c3[2];
     XNN_ALIGN(8) float c2[2];
     XNN_ALIGN(8) float one[2];
   } wasmsimd_rr2_p6;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };


 // ExpMinus: used by RADDEXPMINUSMAX microkernels.

 // Parameters for the f16 ExpMinus micro-kernels. There is no portable scalar variant,
 // so the dummy member keeps the union non-empty on targets where neither guard is active.
 union xnn_f16_expminus_params {
   char _; // Dummy member variable to comply with the C standard
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   // Constants are held in uint16_t fields (presumably raw IEEE half-precision bit
   // patterns — TODO confirm). ln2 is split into hi/lo parts in this variant.
   struct {
     uint16_t magic_bias;
     uint16_t log2e;
     uint16_t minus_ln2_hi;
     uint16_t minus_ln2_lo;
     uint16_t c2;
     uint16_t c1;
     uint16_t denorm_cutoff;
   } neonfp16arith_rr2_p2;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // The x86 variant holds its constants as 32-byte-aligned float[8] arrays and uses
   // a single minus_ln2.
   struct {
     XNN_ALIGN(32) float magic_bias[8];
     XNN_ALIGN(32) float log2e[8];
     XNN_ALIGN(32) float minus_ln2[8];
     XNN_ALIGN(32) float c2[8];
     XNN_ALIGN(32) float c1[8];
     XNN_ALIGN(32) float denorm_cutoff[8];
   } avx2_rr1_p2;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };

 // Parameters for the f32 ExpMinus micro-kernels; one member struct per kernel variant.
 //
 // Observable naming pattern in this union: *_rr2_* members carry a split
 // minus_ln2_hi/minus_ln2_lo pair while *_rr1_* members carry a single minus_ln2;
 // *_p5 members carry coefficients c5..c1, *_lut64_p2 members carry only c2.
 // NOTE(review): presumably rrN = N-step range reduction, lutK = K-entry lookup
 // table, pD = degree-D polynomial — confirm against the kernels.
 union xnn_f32_expminus_params {
   // Portable scalar variants (available on every target).
   struct {
     float log2e;
     float magic_bias;
     float minus_ln2_hi;
     float minus_ln2_lo;
     float c5;
     float c4;
     float c3;
     float c2;
     float c1;
     float denorm_cutoff;
   } scalar_rr2_p5;
   struct {
     float log2e;
     float magic_bias;
     float minus_ln2_hi;
     float minus_ln2_lo;
     float c2;
     float denorm_cutoff;
   } scalar_rr2_lut64_p2;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   // ARM variants store one float per constant.
   struct {
     float log2e;
     float magic_bias;
     float minus_ln2_hi;
     float minus_ln2_lo;
     float c5;
     float c4;
     float c3;
     float c2;
     float c1;
     float denorm_cutoff;
   } neon_rr2_p5;
   struct {
     float log2e;
     float magic_bias;
     float minus_ln2_hi;
     float minus_ln2_lo;
     float c2;
     float denorm_cutoff;
   } neon_rr2_lut64_p2;
   struct {
     float log2e;
     float magic_bias;
     float minus_ln2;
     float c5;
     float c4;
     float c3;
     float c2;
     float c1;
     float denorm_cutoff;
   } neonfma_rr1_p5;
   struct {
     float log2e;
     float magic_bias;
     float minus_ln2;
     float c2;
     float denorm_cutoff;
   } neonfma_rr1_lut64_p2;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) float log2e[4];
     XNN_ALIGN(16) float magic_bias[4];
     XNN_ALIGN(16) float minus_ln2_hi[4];
     XNN_ALIGN(16) float minus_ln2_lo[4];
     XNN_ALIGN(16) float c5[4];
     XNN_ALIGN(16) float c4[4];
     XNN_ALIGN(16) float c3[4];
     XNN_ALIGN(16) float c2[4];
     XNN_ALIGN(16) float c1[4];
     XNN_ALIGN(16) float denorm_cutoff[4];
   } sse2_rr2_p5;
   struct {
     XNN_ALIGN(32) float log2e[8];
     XNN_ALIGN(32) float magic_bias[8];
     XNN_ALIGN(32) float minus_ln2[8];
     XNN_ALIGN(32) float c5[8];
     XNN_ALIGN(32) float c4[8];
     XNN_ALIGN(32) float c3[8];
     XNN_ALIGN(32) float c2[8];
     XNN_ALIGN(32) float c1[8];
     XNN_ALIGN(32) float denorm_cutoff[8];
     // mask_table carries no explicit XNN_ALIGN, unlike the fields above.
     int32_t mask_table[14];
   } avx2_rr1_p5;
   // The AVX512 variant stores one float per constant; it has a c0 coefficient and
   // no magic_bias or denorm_cutoff, unlike every other member of this union.
   struct {
     float log2e;
     float minus_ln2;
     float c5;
     float c4;
     float c3;
     float c2;
     float c1;
     float c0;
   } avx512_rr1_p5;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) float log2e[2];
     XNN_ALIGN(8) float magic_bias[2];
     XNN_ALIGN(8) float minus_ln2_hi[2];
     XNN_ALIGN(8) float minus_ln2_lo[2];
     XNN_ALIGN(8) float c5[2];
     XNN_ALIGN(8) float c4[2];
     XNN_ALIGN(8) float c3[2];
     XNN_ALIGN(8) float c2[2];
     XNN_ALIGN(8) float c1[2];
     XNN_ALIGN(8) float denorm_cutoff[2];
   } wasmsimd_rr2_p5;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };


 // HSwish: used by VHSWISH microkernels.

 // Parameters for the f16 HSwish micro-kernels. There is no portable scalar variant:
 // every real member is architecture-specific, so the dummy member below keeps the
 // union non-empty (and therefore valid standard C) on targets where neither guard
 // is active. As a union member it does not enlarge the type on ARM or x86.
 union xnn_f16_hswish_params {
   char _; // Dummy member variable to comply with the C standard
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   // Constants are held in uint16_t fields (presumably raw IEEE half-precision bit
   // patterns — TODO confirm).
   struct {
     uint16_t sixth;
     uint16_t three;
     uint16_t six;
     uint16_t pad;  // pad to 8 bytes for neonfp16arith assembly.
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // sixth/three are 32-byte-aligned float[8] arrays, while six is a 16-byte-aligned
   // uint16_t[8] array.
   struct {
     XNN_ALIGN(32) float sixth[8];
     XNN_ALIGN(32) float three[8];
     XNN_ALIGN(16) uint16_t six[8];
   } avx;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };

 // Parameters for the f32 HSwish micro-kernels. Two constant sets are visible:
 // {sixth, three, six} for the scalar and wasmsimd members, and {sixth, half, one}
 // for the x86 members.
 union xnn_f32_hswish_params {
   // Portable scalar variant (available on every target).
   struct {
     float sixth;
     float three;
     float six;
   } scalar;
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) float sixth[4];
     XNN_ALIGN(16) float half[4];
     XNN_ALIGN(16) float one[4];
   } sse;
   struct {
     XNN_ALIGN(32) float sixth[8];
     XNN_ALIGN(32) float half[8];
     XNN_ALIGN(32) float one[8];
     // mask_table carries no explicit XNN_ALIGN, unlike the fields above.
     int32_t mask_table[14];
   } avx;
   // The AVX512 variant stores one float per constant rather than arrays.
   struct {
     float sixth;
     float half;
     float one;
   } avx512;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) float sixth[2];
     XNN_ALIGN(8) float three[2];
     XNN_ALIGN(8) float six[2];
   } wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };


 // LReLU (Leaky ReLU): used by VLRELU microkernels.

 // Parameters for the f16 Leaky ReLU micro-kernels. There is no portable scalar
 // variant: every real member is architecture-specific, so the dummy member below
 // keeps the union non-empty (and therefore valid standard C) on targets where
 // neither guard is active. As a union member it does not enlarge the type on ARM
 // or x86.
 union xnn_f16_lrelu_params {
   char _; // Dummy member variable to comply with the C standard
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   // slope is held in a uint16_t (presumably a raw IEEE half-precision bit pattern —
   // TODO confirm).
   struct {
     uint16_t slope;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // The x86 variant holds slope as a 32-byte-aligned float[8] array.
   struct {
     XNN_ALIGN(32) float slope[8];
   } avx;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };

 // Parameters for the f32 Leaky ReLU micro-kernels: a single slope constant, stored
 // as a scalar or as an aligned array depending on the kernel variant.
 union xnn_f32_lrelu_params {
   // Portable scalar variant (available on every target).
   struct {
     float slope;
   } scalar;
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) float slope[4];
   } sse;
   struct {
     XNN_ALIGN(32) float slope[8];
     // mask_table carries no explicit XNN_ALIGN, unlike slope above.
     int32_t mask_table[14];
   } avx;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) float slope[2];
   } wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };

 // Parameters for the qs8 Leaky ReLU micro-kernels. Two multiplier encodings are
 // visible: {positive_multiplier, negative_multiplier} ("select"-style members) and
 // {multiplier_diff, multiplier_base} ("andxor"-style members). 32-bit members end
 // with `bias`, 16-bit members end with `output_zero_point`.
 union xnn_qs8_lrelu_params {
   // Portable scalar variants (available on every target).
   struct {
     int32_t input_zero_point;
     int32_t positive_multiplier;
     int32_t negative_multiplier;
     int32_t bias;
   } scalar_select;
   struct {
     int32_t input_zero_point;
     int32_t multiplier_diff;
     int32_t multiplier_base;
     int32_t bias;
   } scalar_andxor;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     // Zero point and multipliers are unsigned 32-bit words here; only bias is signed.
     uint32_t input_zero_point;
     uint32_t positive_multiplier;
     uint32_t negative_multiplier;
     int32_t bias;
   } armsimd32;
   struct {
     int16_t input_zero_point;
     int16_t positive_multiplier;
     int16_t negative_multiplier;
     int16_t output_zero_point;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // sse2 uses the diff/base encoding; avx and avx2 use positive/negative multipliers.
   struct {
     XNN_ALIGN(16) int16_t input_zero_point[8];
     XNN_ALIGN(16) int16_t multiplier_diff[8];
     XNN_ALIGN(16) int16_t multiplier_base[8];
     XNN_ALIGN(16) int16_t output_zero_point[8];
   } sse2;
   struct {
     XNN_ALIGN(16) int16_t input_zero_point[8];
     XNN_ALIGN(16) int16_t positive_multiplier[8];
     XNN_ALIGN(16) int16_t negative_multiplier[8];
     XNN_ALIGN(16) int16_t output_zero_point[8];
   } avx;
   struct {
     XNN_ALIGN(32) int16_t input_zero_point[16];
     XNN_ALIGN(32) int16_t positive_multiplier[16];
     XNN_ALIGN(32) int16_t negative_multiplier[16];
     XNN_ALIGN(32) int16_t output_zero_point[16];
   } avx2;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   // wasmsimd_arm uses positive/negative multipliers; wasmsimd_x86 uses diff/base.
   struct {
     XNN_ALIGN(8) int16_t input_zero_point[4];
     XNN_ALIGN(8) int16_t positive_multiplier[4];
     XNN_ALIGN(8) int16_t negative_multiplier[4];
     XNN_ALIGN(8) int16_t output_zero_point[4];
   } wasmsimd_arm;
   struct {
     XNN_ALIGN(8) int16_t input_zero_point[4];
     XNN_ALIGN(8) int16_t multiplier_diff[4];
     XNN_ALIGN(8) int16_t multiplier_base[4];
     XNN_ALIGN(8) int16_t output_zero_point[4];
   } wasmsimd_x86;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };

 // Parameters for the qu8 Leaky ReLU micro-kernels. Two multiplier encodings are
 // visible: {positive_multiplier, negative_multiplier} ("select"-style members) and
 // {multiplier_diff, multiplier_base} ("andxor"-style members). 32-bit members end
 // with `bias`, 16-bit members end with `output_zero_point`.
 union xnn_qu8_lrelu_params {
   // Portable scalar variants (available on every target).
   struct {
     int32_t input_zero_point;
     int32_t positive_multiplier;
     int32_t negative_multiplier;
     int32_t bias;
   } scalar_select;
   // NOTE(review): multiplier_base precedes multiplier_diff here, the reverse of the
   // field order in xnn_qs8_lrelu_params.scalar_andxor. Harmless if the fields are
   // only ever accessed by name — confirm before relying on the raw layout.
   struct {
     int32_t input_zero_point;
     int32_t multiplier_base;
     int32_t multiplier_diff;
     int32_t bias;
   } scalar_andxor;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     // Zero point and multipliers are unsigned 32-bit words here; only bias is signed.
     uint32_t input_zero_point;
     uint32_t positive_multiplier;
     uint32_t negative_multiplier;
     int32_t bias;
   } armsimd32;
   struct {
     // Unsigned zero point; the x86 and wasmsimd members below keep it as int16_t.
     uint16_t input_zero_point;
     int16_t positive_multiplier;
     int16_t negative_multiplier;
     int16_t output_zero_point;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // sse2 uses the diff/base encoding; avx and avx2 use positive/negative multipliers.
   struct {
     XNN_ALIGN(16) int16_t input_zero_point[8];
     XNN_ALIGN(16) int16_t multiplier_diff[8];
     XNN_ALIGN(16) int16_t multiplier_base[8];
     XNN_ALIGN(16) int16_t output_zero_point[8];
   } sse2;
   struct {
     XNN_ALIGN(16) int16_t input_zero_point[8];
     XNN_ALIGN(16) int16_t positive_multiplier[8];
     XNN_ALIGN(16) int16_t negative_multiplier[8];
     XNN_ALIGN(16) int16_t output_zero_point[8];
   } avx;
   struct {
     XNN_ALIGN(32) int16_t input_zero_point[16];
     XNN_ALIGN(32) int16_t positive_multiplier[16];
     XNN_ALIGN(32) int16_t negative_multiplier[16];
     XNN_ALIGN(32) int16_t output_zero_point[16];
   } avx2;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   // wasmsimd_arm uses positive/negative multipliers; wasmsimd_x86 uses diff/base.
   struct {
     XNN_ALIGN(8) int16_t input_zero_point[4];
     XNN_ALIGN(8) int16_t positive_multiplier[4];
     XNN_ALIGN(8) int16_t negative_multiplier[4];
     XNN_ALIGN(8) int16_t output_zero_point[4];
   } wasmsimd_arm;
   struct {
     XNN_ALIGN(8) int16_t input_zero_point[4];
     XNN_ALIGN(8) int16_t multiplier_diff[4];
     XNN_ALIGN(8) int16_t multiplier_base[4];
     XNN_ALIGN(8) int16_t output_zero_point[4];
   } wasmsimd_x86;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };


 // Neg: used by VNEG microkernels.

 // Parameters for the f16 negation micro-kernels. Only x86 needs real data (a sign
 // mask); elsewhere the union contains just the dummy member.
 union xnn_f16_neg_params {
   char _; // Dummy member variable to comply with the C standard
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // 16-byte-aligned array of eight 16-bit sign masks.
   struct {
     XNN_ALIGN(16) uint16_t sign_mask[8];
   } sse;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };

 // Parameters for the f32 negation micro-kernels. Each architecture-specific member
 // carries a sign mask; targets without one see only the dummy member.
 union xnn_f32_neg_params {
   char _; // Dummy member variable to comply with the C standard
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // Note that sign_mask is typed float in sse/avx but uint32_t in avx512.
   struct {
     XNN_ALIGN(16) float sign_mask[4];
   } sse;
   struct {
     XNN_ALIGN(32) float sign_mask[8];
     // mask_table carries no explicit XNN_ALIGN, unlike sign_mask above.
     int32_t mask_table[14];
   } avx;
   // The AVX512 variant stores a single 32-bit mask rather than an array.
   struct {
     uint32_t sign_mask;
   } avx512;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) float sign_mask[2];
   } wasmsimd;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };


 // Rnd (Round): used by VRNDNE/VRNDU/VRNDD/VRNDZ microkernels.

 // Parameters for the f16 rounding micro-kernels. No variant carries any data; the
 // type exists only so these kernels have a distinct parameter type.
 union xnn_f16_rnd_params {
   char _; // Dummy member variable to comply with the C standard
 };

 // Parameters for the f32 rounding micro-kernels. Only the x86 variants carry data;
 // on other targets the union contains just the dummy member.
 union xnn_f32_rnd_params {
   char _; // Dummy member variable to comply with the C standard
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) float sign_mask[4];
     XNN_ALIGN(16) float one[4];
   } sse2;
   // The avx member holds only a mask_table (no explicit XNN_ALIGN).
   struct {
     int32_t mask_table[14];
   } avx;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };


 // Sigmoid: used by VSIGMOID microkernels.

 // Parameters for the f16 Sigmoid micro-kernels. There is no portable scalar variant,
 // so the dummy member keeps the union non-empty on targets where neither guard is active.
 union xnn_f16_sigmoid_params {
   char _; // Dummy member variable to comply with the C standard
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   // Constants are held in uint16_t fields (presumably raw IEEE half-precision bit
   // patterns — TODO confirm). This variant stores minus_log2e with positive
   // ln2_hi/ln2_lo, the opposite sign convention to the x86 member below.
   struct {
     uint16_t magic_bias;
     uint16_t minus_log2e;
     uint16_t ln2_hi;
     uint16_t ln2_lo;
     uint16_t c2;
     uint16_t c1;
     uint16_t denorm_cutoff;
   } neonfp16arith_rr2_p2;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // The x86 variant holds its constants as 32-byte-aligned float[8] arrays and adds
   // sign_mask and one, which the ARM member does not have.
   struct {
     XNN_ALIGN(32) float sign_mask[8];
     XNN_ALIGN(32) float magic_bias[8];
     XNN_ALIGN(32) float log2e[8];
     XNN_ALIGN(32) float minus_ln2[8];
     XNN_ALIGN(32) float c2[8];
     XNN_ALIGN(32) float c1[8];
     XNN_ALIGN(32) float one[8];
     XNN_ALIGN(32) float denorm_cutoff[8];
   } avx2_rr1_p2;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };

 union xnn_f32_sigmoid_params {
   struct {
     float magic_bias;
     float minus_log2e;
     float ln2_hi;
     float ln2_lo;
     float c1;
     float one;
     float denorm_cutoff;
   } scalar_rr2_lut2048_p1;
   struct {
     float magic_bias;
     float minus_log2e;
     float ln2_hi;
     float ln2_lo;
     float c2;
     float one;
     float denorm_cutoff;
   } scalar_rr2_lut64_p2;
   struct {
     float magic_bias;
     float minus_log2e;
     float ln2_hi;
     float ln2_lo;
     float c5;
     float c4;
     float c3;
     float c2;
     float c1;
     float one;
     float denorm_cutoff;
   } scalar_rr2_p5;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     float magic_bias;
     float minus_log2e;
     float ln2_hi;
     float ln2_lo;
     float c1;
     float denorm_cutoff;
   } neon_rr2_lut2048_p1;
   struct {
     float magic_bias;
     float minus_log2e;
     float ln2_hi;
     float ln2_lo;
     float c2;
     float denorm_cutoff;
   } neon_rr2_lut64_p2;
   struct {
     float magic_bias;
     float minus_log2e;
     float ln2_hi;
     float ln2_lo;
     float c5;
     float c4;
     float c3;
     float c2;
     float c1;
     float denorm_cutoff;
   } neon_rr2_p5;
   struct {
     float magic_bias;
     float minus_log2e;
     float ln2;
     float c1;
     float denorm_cutoff;
   } neonfma_rr1_lut2048_p1;
   struct {
     float magic_bias;
     float minus_log2e;
     float ln2;
     float c2;
     float denorm_cutoff;
   } neonfma_rr1_lut64_p2;
   struct {
     float magic_bias;
     float minus_log2e;
     float ln2;
     float c5;
     float c4;
     float c3;
     float c2;
     float c1;
     float denorm_cutoff;
   } neonfma_rr1_p5;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     XNN_ALIGN(16) float sign_mask[4];
     XNN_ALIGN(16) float magic_bias[4];
     XNN_ALIGN(16) float log2e[4];
     XNN_ALIGN(16) uint32_t index_mask[4];
     XNN_ALIGN(16) float minus_ln2_hi[4];
     XNN_ALIGN(16) float minus_ln2_lo[4];
     XNN_ALIGN(16) float c2[4];
     XNN_ALIGN(16) float one[4];
     XNN_ALIGN(16) float denorm_cutoff[4];
   } sse2_rr2_lut64_p2;
   struct {
     XNN_ALIGN(16) float sign_mask[4];
     XNN_ALIGN(16) float magic_bias[4];
     XNN_ALIGN(16) float log2e[4];
     XNN_ALIGN(16) float minus_ln2_hi[4];
     XNN_ALIGN(16) float minus_ln2_lo[4];
     XNN_ALIGN(16) float c5[4];
     XNN_ALIGN(16) float c4[4];
     XNN_ALIGN(16) float c3[4];
     XNN_ALIGN(16) float c2[4];
     XNN_ALIGN(16) float c1[4];
     XNN_ALIGN(16) float one[4];
     XNN_ALIGN(16) float denorm_cutoff[4];
   } sse2_rr2_p5;
   struct {
     XNN_ALIGN(32) float sign_mask[8];
     XNN_ALIGN(32) float magic_bias[8];
     XNN_ALIGN(32) float log2e[8];
     XNN_ALIGN(32) float minus_ln2_hi[8];
     XNN_ALIGN(32) float minus_ln2_lo[8];
     XNN_ALIGN(32) float c5[8];
     XNN_ALIGN(32) float c4[8];
     XNN_ALIGN(32) float c3[8];
     XNN_ALIGN(32) float c2[8];
     XNN_ALIGN(32) float c1[8];
     XNN_ALIGN(32) float one[8];
     XNN_ALIGN(32) float two[8];
     XNN_ALIGN(32) float denorm_cutoff[8];
     int32_t mask_table[14];
   } avx_rr2_p5;
   struct {
     XNN_ALIGN(32) float sign_mask[8];
     XNN_ALIGN(32) float magic_bias[8];
     XNN_ALIGN(32) float log2e[8];
     XNN_ALIGN(32) float minus_ln2[8];
     XNN_ALIGN(32) float c5[8];
     XNN_ALIGN(32) float c4[8];
     XNN_ALIGN(32) float c3[8];
     XNN_ALIGN(32) float c2[8];
     XNN_ALIGN(32) float c1[8];
     XNN_ALIGN(32) float one[8];
     XNN_ALIGN(32) float denorm_cutoff[8];
     int32_t mask_table[14];
   } avx2_rr1_p5;
   struct {
     uint32_t sign_mask;
     float magic_bias;
     float log2e;
     float minus_ln2;
     float c3;
     float c2;
     float one;
     XNN_ALIGN(64) float table[16];
   } avx512_rr1_lut16_p3;
   struct {
     uint32_t sign_mask;
     float magic_bias;
     float log2e;
     float minus_ln2_hi;
     float minus_ln2_lo;
     float c2;
     float c1;
     float one;
     XNN_ALIGN(64) float table_lo[16];
     XNN_ALIGN(64) float table_hi[16];
   } avx512_rr2_lut32_p2;
   struct {
     uint32_t sign_mask;
     float log2e;
     float minus_ln2;
     float c5;
     float c4;
     float c3;
     float c2;
     float c1;
     float one;
   } avx512_rr1_p5;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   struct {
     XNN_ALIGN(8) float magic_bias[2];
     XNN_ALIGN(8) float minus_log2e[2];
     XNN_ALIGN(8) uint32_t index_mask[2];
     XNN_ALIGN(8) float ln2_hi[2];
     XNN_ALIGN(8) float ln2_lo[2];
     XNN_ALIGN(8) float c2[2];
     XNN_ALIGN(8) float one[2];
     XNN_ALIGN(8) float denorm_cutoff[2];
   } wasmsimd_rr2_lut64_p2;
   struct {
     XNN_ALIGN(8) float magic_bias[2];
     XNN_ALIGN(8) float minus_log2e[2];
     XNN_ALIGN(8) float ln2_hi[2];
     XNN_ALIGN(8) float ln2_lo[2];
     XNN_ALIGN(8) float c5[2];
     XNN_ALIGN(8) float c4[2];
     XNN_ALIGN(8) float c3[2];
     XNN_ALIGN(8) float c2[2];
     XNN_ALIGN(8) float c1[2];
     XNN_ALIGN(8) float one[2];
     XNN_ALIGN(8) float denorm_cutoff[2];
   } wasmsimd_rr2_p5;
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 };


 // Sqrt (Square Root): used by VSQRT microkernels.

 // Parameter type for F16 VSQRT microkernels. It declares no real fields:
 // the single char member is a placeholder, because a union must contain
 // at least one member.
 union xnn_f16_sqrt_params {
   char _; // Dummy member variable to comply with the C standard
 };

 // Parameter type for F32 VSQRT microkernels. The portable part is only a
 // placeholder member; x86 / x86-64 builds add three alternative layouts
 // named avx, fma and avx512. Being a union, all layouts share storage.
 union xnn_f32_sqrt_params {
   char _; // Dummy member variable to comply with the C standard
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
     // 14-entry integer table. NOTE(review): presumably used to build a lane
     // mask for a partial final vector -- confirm against the init function.
     int32_t mask_table[14];
   } avx;
   struct {
     // Eight floats at 32-byte alignment. NOTE(review): presumably the
     // constant 0.5f replicated per lane -- confirm.
     XNN_ALIGN(32) float half[8];
     int32_t mask_table[14];
   } fma;
   struct {
     // A single float here, unlike the 8-element array in the fma layout.
     float half;
   } avx512;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };


 // SqrtShift (Square Root + Shift): used by VSQRTSHIFT microkernels.

 // Parameter type for VSQRTSHIFT microkernels. Only a scalar layout is
 // declared, and it is available on every architecture.
 union xnn_u64_u32_sqrtshift_params {
   struct {
     // Shift amount. NOTE(review): presumably a right shift applied to the
     // square-root result -- confirm against the kernels.
     uint32_t shift;
   } scalar;
 };

 // CHW: used by CONV/DWCONV microkernels in CHW layout with Min+Max parameters.

 // Parameter type for F16 CONV/DWCONV microkernels in CHW layout. Data is
 // present only on ARM / ARM64 builds (neonfp16arith layout); on other
 // architectures the union holds just the placeholder member.
 union xnn_f16_chw_params {
   char _; // Dummy member variable to comply with the C standard
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     // Min / max bounds stored as 16-bit integers. NOTE(review): presumably
     // raw half-precision bit patterns -- confirm.
     uint16_t min;
     uint16_t max;
     // Four-element masks at 8-byte alignment.
     XNN_ALIGN(8) uint16_t mask_even[4]; // used by stride 2 kernels
     XNN_ALIGN(8) uint16_t mask_odd[4];  // used by stride 2 kernels
     XNN_ALIGN(8) uint16_t mask[4]; // used by stride 1 kernels
     // Eight-element mask at 16-byte alignment.
     XNN_ALIGN(16) uint16_t maskx8[8]; // used by stride 1 x8 kernels
   } neonfp16arith;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 };

 // Parameter type for F32 CONV/DWCONV microkernels in CHW layout. The scalar
 // layout is always available; ARM / ARM64 builds add a neon layout and
 // x86 / x86-64 builds add an sse layout. All layouts share storage.
 union xnn_f32_chw_params {
   // Scalar layout: the three masks come first, followed by the min / max
   // bounds as single floats.
   // NOTE(review): the masks are int32_t here but uint32_t in the neon and
   // sse layouts below.
   struct {
     XNN_ALIGN(16) int32_t mask_even[4]; // used by stride 2 kernels
     XNN_ALIGN(16) int32_t mask_odd[4];  // used by stride 2 kernels
     XNN_ALIGN(16) int32_t mask[4]; // used by stride 1 kernels
     float min;
     float max;
   } scalar;
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   // NEON layout: single-float bounds first, then the 16-byte-aligned masks.
   struct {
     float min;
     float max;
     XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
     XNN_ALIGN(16) uint32_t mask_odd[4];  // used by stride 2 kernels
     XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // SSE layout: bounds are 4-element, 16-byte-aligned arrays rather than
   // single floats, followed by the masks.
   struct {
     XNN_ALIGN(16) float min[4];
     XNN_ALIGN(16) float max[4];
     XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
     XNN_ALIGN(16) uint32_t mask_odd[4];  // used by stride 2 kernels
     XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels
   } sse;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };


 // GAvgPool (Global Average Pool): used by GAVGPOOL microkernels in CHW layout with Scale+Min+Max parameters.

 // Parameter type for F16 GAVGPOOL microkernels in CHW layout. Data is
 // present only on ARM / ARM64 builds (neonfp16arith layout); on other
 // architectures the union holds just the placeholder member.
 union xnn_f16_gavgpool_params {
   char _; // Dummy member variable to comply with the C standard
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   struct {
     // Eight-element mask at 16-byte alignment.
     XNN_ALIGN(16) uint16_t mask[8];
     // Scale and output bounds stored as 16-bit integers. NOTE(review):
     // presumably raw half-precision bit patterns -- confirm.
     uint16_t multiplier;
     uint16_t output_min;
     uint16_t output_max;
   } neonfp16arith;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 };

 // Parameter type for F32 GAVGPOOL microkernels in CHW layout. The scalar
 // layout is always available; x86 / x86-64 builds add an sse layout and
 // ARM / ARM64 builds add a neon layout. All layouts share storage.
 union xnn_f32_gavgpool_params {
   // Scalar layout: 16-byte-aligned mask, then the multiplier and output
   // bounds as single floats.
   struct {
     XNN_ALIGN(16) int32_t mask[4];
     float multiplier;
     float output_min;
     float output_max;
   } scalar;
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   // SSE layout: multiplier and bounds are 4-element, 16-byte-aligned arrays,
   // and the mask comes last instead of first.
   struct {
     XNN_ALIGN(16) float multiplier[4];
     XNN_ALIGN(16) float output_min[4];
     XNN_ALIGN(16) float output_max[4];
     XNN_ALIGN(16) uint32_t mask[4];
   } sse;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   // NEON layout: same member order as the scalar layout, with an unsigned
   // mask element type.
   struct {
     XNN_ALIGN(16) uint32_t mask[4];
     float multiplier;
     float output_min;
     float output_max;
   } neon;
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 };

 // Forward declare for use in microkernel headers for JIT generator functions.
 // Only the incomplete type is declared here, so it can be used through
 // pointers without this header pulling in the full definition.
 struct xnn_code_buffer;

 // Status type expressed as a plain int.
 // NOTE(review): presumably stands in for the public status enum so this
 // header does not need to include the public API header -- confirm.
 typedef int xnn_status_t;