// src/qs8-igemm/c4-armsimd32.c.in - platform/external/XNNPACK - Git at Google

 // Copyright 2022 Google LLC
 //
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.

 $assert REQUANTIZATION == "FP32"
 $assert DATATYPE in ["QC8", "QS8", "QU8"]
 $assert 1 <= MR <= 2
 $assert 1 <= NR <= 2
 #include <assert.h>

 #include <arm_acle.h>

 #include <xnnpack/intrinsics-polyfill.h>
 #include <xnnpack/math.h>
 #include <xnnpack/gemm.h>
 #include <xnnpack/unaligned.h>


 $PARAMS_STRUCT = REQUANTIZATION.lower() + "_armsimd32"
 $PARAMS_UNION = "xnn_%s_conv_minmax_params" % DATATYPE.lower()
 $__XXTB16 = "__uxtb16" if DATATYPE == "QU8" else "__sxtb16"
 $__XSAT = "__usat" if DATATYPE == "QU8" else "__ssat"
 $__XSUB8 = "__usub8" if DATATYPE == "QU8" else "__ssub8"
 $XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
 // Indirect GEMM (IGEMM) micro-kernel template with FP32 requantization and
 // min/max clamping, written with the Arm 32-bit SIMD intrinsics from
 // <arm_acle.h>. The template parameters MR and NR give the output tile size
 // (rows x columns); DATATYPE selects signed (QS8/QC8) or unsigned (QU8)
 // 8-bit element handling.
 //
 // Arguments, as used by the code below:
 //   mr        - number of valid output rows, 1..MR. The pointer of a row at or
 //               beyond mr aliases the previous row.
 //   nc        - number of output columns still to be produced.
 //   kc        - bytes read through each indirection pointer; rounded up to a
 //               multiple of 4 before use.
 //   ks        - bytes of indirection entries consumed per column tile; must be
 //               a non-zero multiple of MR * sizeof(void*).
 //   a         - indirection buffer holding MR input pointers per step.
 //   w         - packed weights. Per column tile: NR int32 initial accumulators,
 //               then 4-byte kernel groups (NR per K step), then - for QC8
 //               only - NR float scales.
 //   c         - output of row 0; each further row is cm_stride bytes after the
 //               previous one.
 //   cn_stride - bytes added to every row pointer after a full-width tile has
 //               been stored.
 //   a_offset  - byte offset added to each indirection pointer that differs
 //               from zero.
 //   zero      - pointer value that is used as-is, without a_offset.
 //   params    - requantization and clamping constants.
 void xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x${NR}c4__armsimd32(
     size_t mr,
     size_t nc,
     size_t kc,
     size_t ks,
     const ${XINT8_T}**restrict a,
     const void*restrict w,
     ${XINT8_T}*restrict c,
     size_t cm_stride,
     size_t cn_stride,
     size_t a_offset,
     const ${XINT8_T}* zero,
     const union ${PARAMS_UNION} params[restrict XNN_MIN_ELEMENTS(1)])
 {
   assert(mr != 0);
   assert(mr <= ${MR});
   assert(nc != 0);
   assert(kc != 0);
   assert(ks != 0);
   assert(ks % (${MR} * sizeof(void*)) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);

   // The K loop below consumes 4 bytes per step and only stops at k == 0, so
   // kc has to be a multiple of 4 (round_up_po2 presumably rounds up to one).
   kc = round_up_po2(kc, 4 * sizeof(int8_t));
   // One output pointer per row; a row that is not valid reuses the previous
   // row's pointer so its stores land on memory that is written anyway.
   ${XINT8_T}* c0 = c;
   $for M in range(1, MR):
     ${XINT8_T}* c${M} = (${XINT8_T}*) ((uintptr_t) c${M-1} + cm_stride);
     $if M % 2 == 0:
       if XNN_UNPREDICTABLE(mr <= ${M}) {
         c${M} = c${M-1};
       }
     $elif M + 1 == MR:
       if XNN_UNPREDICTABLE(mr != ${M+1}) {
         c${M} = c${M-1};
       }
     $else:
       if XNN_UNPREDICTABLE(mr < ${M+1}) {
         c${M} = c${M-1};
       }

   // Loop-invariant constants read from params.
   $if DATATYPE == "QU8":
     const int16x2_t vb_minus_zero_point = (int16x2_t) params->${PARAMS_STRUCT}.minus_kernel_zero_point;
   $if REQUANTIZATION == "FP32":
     $if DATATYPE != "QC8":
       const float vscale = params->${PARAMS_STRUCT}.scale;
     const float vmagic_bias = params->${PARAMS_STRUCT}.magic_bias;
   do {
     // Start every row's accumulators from the NR int32 values at the head of
     // the packed weights (presumably the per-column bias), then step past them.
     $for N in range(NR):
       int32_t vacc0x${N} = ((const int32_t*) w)[${N}];
     $for M in range(1, MR):
       $for N in range(NR):
         int32_t vacc${M}x${N} = vacc0x${N};
     w = (const void*) ((const int32_t*) w + ${NR});

     // Walk the indirection buffer; p counts the remaining bytes of entries.
     size_t p = ks;
     do {
       // Fetch this step's row pointers. a_offset is applied to every pointer
       // except one that equals zero.
       $for M in range(MR):
         const ${XINT8_T}* restrict a${M} = a[${M}];
         assert(a${M} != NULL);
         if XNN_UNPREDICTABLE(a${M} != zero) {
           a${M} = (const ${XINT8_T}*) ((uintptr_t) a${M} + a_offset);
         }
       a += ${MR};

       // Accumulate kc bytes from each row, 4 bytes per iteration.
       size_t k = kc;
       do {
         // Load 4 consecutive input bytes per row as one packed 32-bit word.
         $for M in range(MR):
           const int8x4_t va${M} = (int8x4_t) unaligned_load_s32(a${M}); a${M} += 4;

         // Widen to 16-bit lane pairs: c02 holds bytes 0 and 2; c13 holds
         // bytes 1 and 3, brought into place by rotating right by 8 bits.
         $for M in range(MR):
           const int16x2_t va${M}c02 = ${__XXTB16}(va${M});
           const int16x2_t va${M}c13 = ${__XXTB16}(__ror(va${M}, 8));

         // Per column: load 4 kernel bytes, widen them the same way (QU8 also
         // adds minus_kernel_zero_point to each 16-bit lane), and accumulate
         // with __smlad, which adds both 16-bit lane products to the accumulator.
         $for N in range(NR):
           const int8x4_t vb${N} = *((const int8x4_t*) w); w = (const int8_t*) w + 4;
           $if DATATYPE == "QU8":
             const int16x2_t vb${N}c02 = __uxtab16(vb_minus_zero_point, vb${N});
           $else:
             const int16x2_t vb${N}c02 = __sxtb16(vb${N});

           $for M in range(MR):
             vacc${M}x${N} = __smlad(va${M}c02, vb${N}c02, vacc${M}x${N});

           $if DATATYPE == "QU8":
             const int16x2_t vb${N}c13 = __uxtab16(vb_minus_zero_point, __ror(vb${N}, 8));
           $else:
             const int16x2_t vb${N}c13 = __sxtb16(__ror(vb${N}, 8));
           $for M in range(MR):
             vacc${M}x${N} = __smlad(va${M}c13, vb${N}c13, vacc${M}x${N});

         k -= 4 * sizeof(${XINT8_T});
       } while (k != 0);
       p -= ${MR} * sizeof(void*);
     } while (p != 0);

     // Requantize in floating point: start by converting the accumulators.
     $for M in range(MR):
       $for N in range(NR):
         float vfpacc${M}x${N} = (float) vacc${M}x${N};

     // Scale. QC8 reads NR per-column scales from w and steps past them; the
     // other datatypes use the single scale taken from params.
     $if DATATYPE == "QC8":
       $for N in range(NR):
         const float vscale${N} = ((const float*) w)[${N}];
         $for M in range(MR):
           vfpacc${M}x${N} *= vscale${N};
       w = (const void*) ((const float*) w + ${NR});
     $else:
       $for M in range(MR):
         $for N in range(NR):
           vfpacc${M}x${N} *= vscale;

     // Float-to-integer conversion via the magic bias: add the bias, reinterpret
     // the float bits as an integer, then saturating-subtract
     // magic_bias_less_zero_point. Both constants are supplied through params.
     $for M in range(MR):
       $for N in range(NR):
         vfpacc${M}x${N} += vmagic_bias;

     $for M in range(MR):
       $for N in range(NR):
         int32_t vout${M}x${N} = (int32_t) float_as_uint32(vfpacc${M}x${N});

     const int32_t vmagic_bias_less_zero_point = params->${PARAMS_STRUCT}.magic_bias_less_zero_point;
     $for M in range(MR):
       $for N in range(NR):
         vout${M}x${N} = __qsub(vout${M}x${N}, vmagic_bias_less_zero_point);

     // Saturate every value to the 8-bit range (signed or unsigned per DATATYPE).
     $for M in range(MR):
       $for N in range(NR):
         vout${M}x${N} = ${__XSAT}(vout${M}x${N}, 8);

     // Pack each row's outputs into one word, column 0 in the lowest byte.
     $for M in range(MR):
       $if NR == 1:
         const uint32_t vout${M} = (uint32_t) vout${M}x0;
       $else:
         const uint32_t vout${M} = (uint32_t) (uint8_t) vout${M}x0 | ((uint32_t) vout${M}x1 << 8);

     // Combine the rows: with two rows, row 1 occupies the low 16 bits and
     // row 0 the high 16 bits.
     $if MR == 1:
       uint32_t vout = vout0;
     $else:
       uint32_t vout = (uint32_t) (uint16_t) vout1 | (vout0 << 16);

     // Bytewise clamp. The byte-wise subtract is called only for its effect on
     // the APSR.GE flags (its result is unused); __sel then takes, per byte,
     // its first operand where GE is set and its second operand otherwise.
     // First pair: keep vout where vout >= output_min, else output_min.
     const int8x4_t voutput_min = (int8x4_t) params->${PARAMS_STRUCT}.output_min;
     ${__XSUB8}((int8x4_t) vout, voutput_min);
     vout = (uint32_t) __sel((uint8x4_t) vout, (uint8x4_t) voutput_min);

     // Second pair: take output_max where vout >= output_max, else keep vout.
     const int8x4_t voutput_max = (int8x4_t) params->${PARAMS_STRUCT}.output_max;
     ${__XSUB8}((int8x4_t) vout, voutput_max);
     vout = (uint32_t) __sel((uint8x4_t) voutput_max, (uint8x4_t) vout);

     // Store the tile. Rows are written last-to-first because the last row sits
     // in the lowest bits of vout; vout is shifted right by 16 between rows.
     $if NR == 2:
       if XNN_LIKELY(nc >= ${NR}) {
         // Full tile: 2 bytes per row, then advance the row pointers and rewind
         // the indirection pointer by ks bytes for the next column tile.
         $for M in reversed(range(MR)):
           unaligned_store_u16(c${M}, (uint16_t) vout);
           $if M != 0:
             vout >>= 16;

         $for M in reversed(range(MR)):
           c${M} = (${XINT8_T}*) ((uintptr_t) c${M} + cn_stride);

         a = (const ${XINT8_T}**restrict) ((uintptr_t) a - ks);
         nc -= ${NR};
       } else {
         // Remainder: one column is left; write 1 byte per row and finish.
         $for M in reversed(range(MR)):
           *c${M} = (${XINT8_T}) vout;
           $if M != 0:
             vout >>= 16;

         nc = 0;
       }
     $else:
       // Single-column tile: 1 byte per row, then advance the row pointers and
       // rewind the indirection pointer by ks bytes for the next column.
       $for M in reversed(range(MR)):
         *c${M} = (${XINT8_T}) vout;
         $if M != 0:
           vout >>= 16;

       $for M in reversed(range(MR)):
         c${M} = (${XINT8_T}*) ((uintptr_t) c${M} + cn_stride);

       a = (const ${XINT8_T}**restrict) ((uintptr_t) a - ks);
       nc -= 1;
   } while (nc != 0);
 }
	// Copyright 2022 Google LLC
	//
	// This source code is licensed under the BSD-style license found in the
	// LICENSE file in the root directory of this source tree.

	$assert REQUANTIZATION == "FP32"
	$assert DATATYPE in ["QC8", "QS8", "QU8"]
	$assert 1 <= MR <= 2
	$assert 1 <= NR <= 2
	#include <assert.h>

	#include <arm_acle.h>

	#include <xnnpack/intrinsics-polyfill.h>
	#include <xnnpack/math.h>
	#include <xnnpack/gemm.h>
	#include <xnnpack/unaligned.h>


	$PARAMS_STRUCT = REQUANTIZATION.lower() + "_armsimd32"
	$PARAMS_UNION = "xnn_%s_conv_minmax_params" % DATATYPE.lower()
	$__XXTB16 = "__uxtb16" if DATATYPE == "QU8" else "__sxtb16"
	$__XSAT = "__usat" if DATATYPE == "QU8" else "__ssat"
	$__XSUB8 = "__usub8" if DATATYPE == "QU8" else "__ssub8"
	$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
	// Indirect GEMM (IGEMM) micro-kernel template with FP32 requantization and
	// min/max clamping, written with the Arm 32-bit SIMD intrinsics from
	// <arm_acle.h>. The template parameters MR and NR give the output tile size
	// (rows x columns); DATATYPE selects signed (QS8/QC8) or unsigned (QU8)
	// 8-bit element handling.
	//
	// Arguments, as used by the code below:
	//   mr        - number of valid output rows, 1..MR. The pointer of a row at or
	//               beyond mr aliases the previous row.
	//   nc        - number of output columns still to be produced.
	//   kc        - bytes read through each indirection pointer; rounded up to a
	//               multiple of 4 before use.
	//   ks        - bytes of indirection entries consumed per column tile; must be
	//               a non-zero multiple of MR * sizeof(void*).
	//   a         - indirection buffer holding MR input pointers per step.
	//   w         - packed weights. Per column tile: NR int32 initial accumulators,
	//               then 4-byte kernel groups (NR per K step), then - for QC8
	//               only - NR float scales.
	//   c         - output of row 0; each further row is cm_stride bytes after the
	//               previous one.
	//   cn_stride - bytes added to every row pointer after a full-width tile has
	//               been stored.
	//   a_offset  - byte offset added to each indirection pointer that differs
	//               from zero.
	//   zero      - pointer value that is used as-is, without a_offset.
	//   params    - requantization and clamping constants.
	void xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x${NR}c4__armsimd32(
	    size_t mr,
	    size_t nc,
	    size_t kc,
	    size_t ks,
	    const ${XINT8_T}**restrict a,
	    const void*restrict w,
	    ${XINT8_T}*restrict c,
	    size_t cm_stride,
	    size_t cn_stride,
	    size_t a_offset,
	    const ${XINT8_T}* zero,
	    const union ${PARAMS_UNION} params[restrict XNN_MIN_ELEMENTS(1)])
	{
	  assert(mr != 0);
	  assert(mr <= ${MR});
	  assert(nc != 0);
	  assert(kc != 0);
	  assert(ks != 0);
	  assert(ks % (${MR} * sizeof(void*)) == 0);
	  assert(a != NULL);
	  assert(w != NULL);
	  assert(c != NULL);

	  // The K loop below consumes 4 bytes per step and only stops at k == 0, so
	  // kc has to be a multiple of 4 (round_up_po2 presumably rounds up to one).
	  kc = round_up_po2(kc, 4 * sizeof(int8_t));
	  // One output pointer per row; a row that is not valid reuses the previous
	  // row's pointer so its stores land on memory that is written anyway.
	  ${XINT8_T}* c0 = c;
	  $for M in range(1, MR):
	    ${XINT8_T}* c${M} = (${XINT8_T}*) ((uintptr_t) c${M-1} + cm_stride);
	    $if M % 2 == 0:
	      if XNN_UNPREDICTABLE(mr <= ${M}) {
	        c${M} = c${M-1};
	      }
	    $elif M + 1 == MR:
	      if XNN_UNPREDICTABLE(mr != ${M+1}) {
	        c${M} = c${M-1};
	      }
	    $else:
	      if XNN_UNPREDICTABLE(mr < ${M+1}) {
	        c${M} = c${M-1};
	      }

	  // Loop-invariant constants read from params.
	  $if DATATYPE == "QU8":
	    const int16x2_t vb_minus_zero_point = (int16x2_t) params->${PARAMS_STRUCT}.minus_kernel_zero_point;
	  $if REQUANTIZATION == "FP32":
	    $if DATATYPE != "QC8":
	      const float vscale = params->${PARAMS_STRUCT}.scale;
	    const float vmagic_bias = params->${PARAMS_STRUCT}.magic_bias;
	  do {
	    // Start every row's accumulators from the NR int32 values at the head of
	    // the packed weights (presumably the per-column bias), then step past them.
	    $for N in range(NR):
	      int32_t vacc0x${N} = ((const int32_t*) w)[${N}];
	    $for M in range(1, MR):
	      $for N in range(NR):
	        int32_t vacc${M}x${N} = vacc0x${N};
	    w = (const void*) ((const int32_t*) w + ${NR});

	    // Walk the indirection buffer; p counts the remaining bytes of entries.
	    size_t p = ks;
	    do {
	      // Fetch this step's row pointers. a_offset is applied to every pointer
	      // except one that equals zero.
	      $for M in range(MR):
	        const ${XINT8_T}* restrict a${M} = a[${M}];
	        assert(a${M} != NULL);
	        if XNN_UNPREDICTABLE(a${M} != zero) {
	          a${M} = (const ${XINT8_T}*) ((uintptr_t) a${M} + a_offset);
	        }
	      a += ${MR};

	      // Accumulate kc bytes from each row, 4 bytes per iteration.
	      size_t k = kc;
	      do {
	        // Load 4 consecutive input bytes per row as one packed 32-bit word.
	        $for M in range(MR):
	          const int8x4_t va${M} = (int8x4_t) unaligned_load_s32(a${M}); a${M} += 4;

	        // Widen to 16-bit lane pairs: c02 holds bytes 0 and 2; c13 holds
	        // bytes 1 and 3, brought into place by rotating right by 8 bits.
	        $for M in range(MR):
	          const int16x2_t va${M}c02 = ${__XXTB16}(va${M});
	          const int16x2_t va${M}c13 = ${__XXTB16}(__ror(va${M}, 8));

	        // Per column: load 4 kernel bytes, widen them the same way (QU8 also
	        // adds minus_kernel_zero_point to each 16-bit lane), and accumulate
	        // with __smlad, which adds both 16-bit lane products to the accumulator.
	        $for N in range(NR):
	          const int8x4_t vb${N} = *((const int8x4_t*) w); w = (const int8_t*) w + 4;
	          $if DATATYPE == "QU8":
	            const int16x2_t vb${N}c02 = __uxtab16(vb_minus_zero_point, vb${N});
	          $else:
	            const int16x2_t vb${N}c02 = __sxtb16(vb${N});

	          $for M in range(MR):
	            vacc${M}x${N} = __smlad(va${M}c02, vb${N}c02, vacc${M}x${N});

	          $if DATATYPE == "QU8":
	            const int16x2_t vb${N}c13 = __uxtab16(vb_minus_zero_point, __ror(vb${N}, 8));
	          $else:
	            const int16x2_t vb${N}c13 = __sxtb16(__ror(vb${N}, 8));
	          $for M in range(MR):
	            vacc${M}x${N} = __smlad(va${M}c13, vb${N}c13, vacc${M}x${N});

	        k -= 4 * sizeof(${XINT8_T});
	      } while (k != 0);
	      p -= ${MR} * sizeof(void*);
	    } while (p != 0);

	    // Requantize in floating point: start by converting the accumulators.
	    $for M in range(MR):
	      $for N in range(NR):
	        float vfpacc${M}x${N} = (float) vacc${M}x${N};

	    // Scale. QC8 reads NR per-column scales from w and steps past them; the
	    // other datatypes use the single scale taken from params.
	    $if DATATYPE == "QC8":
	      $for N in range(NR):
	        const float vscale${N} = ((const float*) w)[${N}];
	        $for M in range(MR):
	          vfpacc${M}x${N} *= vscale${N};
	      w = (const void*) ((const float*) w + ${NR});
	    $else:
	      $for M in range(MR):
	        $for N in range(NR):
	          vfpacc${M}x${N} *= vscale;

	    // Float-to-integer conversion via the magic bias: add the bias, reinterpret
	    // the float bits as an integer, then saturating-subtract
	    // magic_bias_less_zero_point. Both constants are supplied through params.
	    $for M in range(MR):
	      $for N in range(NR):
	        vfpacc${M}x${N} += vmagic_bias;

	    $for M in range(MR):
	      $for N in range(NR):
	        int32_t vout${M}x${N} = (int32_t) float_as_uint32(vfpacc${M}x${N});

	    const int32_t vmagic_bias_less_zero_point = params->${PARAMS_STRUCT}.magic_bias_less_zero_point;
	    $for M in range(MR):
	      $for N in range(NR):
	        vout${M}x${N} = __qsub(vout${M}x${N}, vmagic_bias_less_zero_point);

	    // Saturate every value to the 8-bit range (signed or unsigned per DATATYPE).
	    $for M in range(MR):
	      $for N in range(NR):
	        vout${M}x${N} = ${__XSAT}(vout${M}x${N}, 8);

	    // Pack each row's outputs into one word, column 0 in the lowest byte.
	    $for M in range(MR):
	      $if NR == 1:
	        const uint32_t vout${M} = (uint32_t) vout${M}x0;
	      $else:
	        const uint32_t vout${M} = (uint32_t) (uint8_t) vout${M}x0 | ((uint32_t) vout${M}x1 << 8);

	    // Combine the rows: with two rows, row 1 occupies the low 16 bits and
	    // row 0 the high 16 bits.
	    $if MR == 1:
	      uint32_t vout = vout0;
	    $else:
	      uint32_t vout = (uint32_t) (uint16_t) vout1 | (vout0 << 16);

	    // Bytewise clamp. The byte-wise subtract is called only for its effect on
	    // the APSR.GE flags (its result is unused); __sel then takes, per byte,
	    // its first operand where GE is set and its second operand otherwise.
	    // First pair: keep vout where vout >= output_min, else output_min.
	    const int8x4_t voutput_min = (int8x4_t) params->${PARAMS_STRUCT}.output_min;
	    ${__XSUB8}((int8x4_t) vout, voutput_min);
	    vout = (uint32_t) __sel((uint8x4_t) vout, (uint8x4_t) voutput_min);

	    // Second pair: take output_max where vout >= output_max, else keep vout.
	    const int8x4_t voutput_max = (int8x4_t) params->${PARAMS_STRUCT}.output_max;
	    ${__XSUB8}((int8x4_t) vout, voutput_max);
	    vout = (uint32_t) __sel((uint8x4_t) voutput_max, (uint8x4_t) vout);

	    // Store the tile. Rows are written last-to-first because the last row sits
	    // in the lowest bits of vout; vout is shifted right by 16 between rows.
	    $if NR == 2:
	      if XNN_LIKELY(nc >= ${NR}) {
	        // Full tile: 2 bytes per row, then advance the row pointers and rewind
	        // the indirection pointer by ks bytes for the next column tile.
	        $for M in reversed(range(MR)):
	          unaligned_store_u16(c${M}, (uint16_t) vout);
	          $if M != 0:
	            vout >>= 16;

	        $for M in reversed(range(MR)):
	          c${M} = (${XINT8_T}*) ((uintptr_t) c${M} + cn_stride);

	        a = (const ${XINT8_T}**restrict) ((uintptr_t) a - ks);
	        nc -= ${NR};
	      } else {
	        // Remainder: one column is left; write 1 byte per row and finish.
	        $for M in reversed(range(MR)):
	          *c${M} = (${XINT8_T}) vout;
	          $if M != 0:
	            vout >>= 16;

	        nc = 0;
	      }
	    $else:
	      // Single-column tile: 1 byte per row, then advance the row pointers and
	      // rewind the indirection pointer by ks bytes for the next column.
	      $for M in reversed(range(MR)):
	        *c${M} = (${XINT8_T}) vout;
	        $if M != 0:
	          vout >>= 16;

	      $for M in reversed(range(MR)):
	        c${M} = (${XINT8_T}*) ((uintptr_t) c${M} + cn_stride);

	      a = (const ${XINT8_T}**restrict) ((uintptr_t) a - ks);
	      nc -= 1;
	  } while (nc != 0);
	}