// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include <assert.h>
#include <immintrin.h>
#include <xnnpack/avgpool.h>
#include <xnnpack/common.h>
#include <xnnpack/conv.h>
#include <xnnpack/dwconv.h>
#include <xnnpack/gavgpool.h>
#include <xnnpack/gemm.h>
#include <xnnpack/ibilinear.h>
#include <xnnpack/igemm.h>
#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/math.h>
#include <xnnpack/maxpool.h>
#include <xnnpack/packx.h>
#include <xnnpack/pavgpool.h>
#include <xnnpack/rmax.h>
#include <xnnpack/spmm.h>
#include <xnnpack/transpose.h>
#include <xnnpack/vbinary.h>
#include <xnnpack/vmulcaddc.h>
#include <xnnpack/vunary.h>
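// f32 average pooling, multipass variant: the first pass sums 9 pooling
// elements, each subsequent pass adds 8 more, and the final pass scales and
// clamps the result. Processes 4 channels per SSE register (c4).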
void xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4(
size_t output_pixels,
size_t kernel_elements,
size_t channels,
const float** input,
size_t input_offset,
const float* zero,
float* buffer,
float* output,
size_t input_increment,
size_t output_increment,
const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(output_pixels != 0);
assert(kernel_elements > 9);
assert(channels != 0);
const __m128 vscale = _mm_load_ps(params->sse.scale);
const __m128 vmin = _mm_load_ps(params->sse.min);
const __m128 vmax = _mm_load_ps(params->sse.max);
do {
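    // First pass: sum the first 9 pooling elements into the scratch buffer,
    // 4 channels at a time.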
{
const float* i0 = *input++;
assert(i0 != NULL);
if XNN_UNPREDICTABLE(i0 != zero) {
i0 = (const float*) ((uintptr_t) i0 + input_offset);
}
const float* i1 = *input++;
assert(i1 != NULL);
if XNN_UNPREDICTABLE(i1 != zero) {
i1 = (const float*) ((uintptr_t) i1 + input_offset);
}
const float* i2 = *input++;
assert(i2 != NULL);
if XNN_UNPREDICTABLE(i2 != zero) {
i2 = (const float*) ((uintptr_t) i2 + input_offset);
}
const float* i3 = *input++;
assert(i3 != NULL);
if XNN_UNPREDICTABLE(i3 != zero) {
i3 = (const float*) ((uintptr_t) i3 + input_offset);
}
const float* i4 = *input++;
assert(i4 != NULL);
if XNN_UNPREDICTABLE(i4 != zero) {
i4 = (const float*) ((uintptr_t) i4 + input_offset);
}
const float* i5 = *input++;
assert(i5 != NULL);
if XNN_UNPREDICTABLE(i5 != zero) {
i5 = (const float*) ((uintptr_t) i5 + input_offset);
}
const float* i6 = *input++;
assert(i6 != NULL);
if XNN_UNPREDICTABLE(i6 != zero) {
i6 = (const float*) ((uintptr_t) i6 + input_offset);
}
const float* i7 = *input++;
assert(i7 != NULL);
if XNN_UNPREDICTABLE(i7 != zero) {
i7 = (const float*) ((uintptr_t) i7 + input_offset);
}
const float* i8 = *input++;
assert(i8 != NULL);
if XNN_UNPREDICTABLE(i8 != zero) {
i8 = (const float*) ((uintptr_t) i8 + input_offset);
}
float* b = buffer;
for (size_t c = 0; c < channels; c += 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vi2 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vi3 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vi4 = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vi5 = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vi6 = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vi7 = _mm_loadu_ps(i7);
i7 += 4;
const __m128 vi8 = _mm_loadu_ps(i8);
i8 += 4;
const __m128 vsum01 = _mm_add_ps(vi0, vi1);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum67 = _mm_add_ps(vi6, vi7);
const __m128 vsum018 = _mm_add_ps(vsum01, vi8);
const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
_mm_store_ps(b, vsum); b += 4;
}
}
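    // Intermediate passes: accumulate 8 more pooling elements per pass into
    // the scratch buffer while more than 8 elements remain.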
size_t k = kernel_elements;
for (k -= 9; k > 8; k -= 8) {
const float* i0 = *input++;
assert(i0 != NULL);
if XNN_UNPREDICTABLE(i0 != zero) {
i0 = (const float*) ((uintptr_t) i0 + input_offset);
}
const float* i1 = *input++;
assert(i1 != NULL);
if XNN_UNPREDICTABLE(i1 != zero) {
i1 = (const float*) ((uintptr_t) i1 + input_offset);
}
const float* i2 = *input++;
assert(i2 != NULL);
if XNN_UNPREDICTABLE(i2 != zero) {
i2 = (const float*) ((uintptr_t) i2 + input_offset);
}
const float* i3 = *input++;
assert(i3 != NULL);
if XNN_UNPREDICTABLE(i3 != zero) {
i3 = (const float*) ((uintptr_t) i3 + input_offset);
}
const float* i4 = *input++;
assert(i4 != NULL);
if XNN_UNPREDICTABLE(i4 != zero) {
i4 = (const float*) ((uintptr_t) i4 + input_offset);
}
const float* i5 = *input++;
assert(i5 != NULL);
if XNN_UNPREDICTABLE(i5 != zero) {
i5 = (const float*) ((uintptr_t) i5 + input_offset);
}
const float* i6 = *input++;
assert(i6 != NULL);
if XNN_UNPREDICTABLE(i6 != zero) {
i6 = (const float*) ((uintptr_t) i6 + input_offset);
}
const float* i7 = *input++;
assert(i7 != NULL);
if XNN_UNPREDICTABLE(i7 != zero) {
i7 = (const float*) ((uintptr_t) i7 + input_offset);
}
float* b = buffer;
for (size_t c = 0; c < channels; c += 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vi2 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vi3 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vi4 = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vi5 = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vi6 = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vi7 = _mm_loadu_ps(i7);
i7 += 4;
const __m128 vacc = _mm_load_ps(b);
const __m128 vsum01 = _mm_add_ps(vi0, vi1);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum67 = _mm_add_ps(vi6, vi7);
const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
_mm_store_ps(b, vsum); b += 4;
}
}
{
const float* i0 = input[0];
assert(i0 != NULL);
const float* i1 = input[1];
const float* i2 = input[2];
const float* i3 = input[3];
const float* i4 = input[4];
const float* i5 = input[5];
const float* i6 = input[6];
const float* i7 = input[7];
input = (const float**) ((uintptr_t) input + input_increment);
if (k < 2) {
i1 = zero;
}
assert(i1 != NULL);
if (k <= 2) {
i2 = zero;
}
assert(i2 != NULL);
if (k < 4) {
i3 = zero;
}
assert(i3 != NULL);
if (k <= 4) {
i4 = zero;
}
assert(i4 != NULL);
if (k < 6) {
i5 = zero;
}
assert(i5 != NULL);
if (k <= 6) {
i6 = zero;
}
assert(i6 != NULL);
if (k < 8) {
i7 = zero;
}
assert(i7 != NULL);
if XNN_UNPREDICTABLE(i0 != zero) {
i0 = (const float*) ((uintptr_t) i0 + input_offset);
}
if XNN_UNPREDICTABLE(i1 != zero) {
i1 = (const float*) ((uintptr_t) i1 + input_offset);
}
if XNN_UNPREDICTABLE(i2 != zero) {
i2 = (const float*) ((uintptr_t) i2 + input_offset);
}
if XNN_UNPREDICTABLE(i3 != zero) {
i3 = (const float*) ((uintptr_t) i3 + input_offset);
}
if XNN_UNPREDICTABLE(i4 != zero) {
i4 = (const float*) ((uintptr_t) i4 + input_offset);
}
if XNN_UNPREDICTABLE(i5 != zero) {
i5 = (const float*) ((uintptr_t) i5 + input_offset);
}
if XNN_UNPREDICTABLE(i6 != zero) {
i6 = (const float*) ((uintptr_t) i6 + input_offset);
}
if XNN_UNPREDICTABLE(i7 != zero) {
i7 = (const float*) ((uintptr_t) i7 + input_offset);
}
size_t c = channels;
float* b = buffer;
while (c >= 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vi2 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vi3 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vi4 = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vi5 = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vi6 = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vi7 = _mm_loadu_ps(i7);
i7 += 4;
const __m128 vacc = _mm_load_ps(b);
b += 4;
const __m128 vsum01 = _mm_add_ps(vi0, vi1);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum67 = _mm_add_ps(vi6, vi7);
const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
__m128 vout = _mm_mul_ps(vsum, vscale);
vout = _mm_max_ps(vout, vmin);
vout = _mm_min_ps(vout, vmax);
_mm_storeu_ps(output, vout);
output += 4;
c -= 4;
}
if (c != 0) {
const __m128 vi0 = _mm_loadu_ps(i0);
const __m128 vi1 = _mm_loadu_ps(i1);
const __m128 vi2 = _mm_loadu_ps(i2);
const __m128 vi3 = _mm_loadu_ps(i3);
const __m128 vi4 = _mm_loadu_ps(i4);
const __m128 vi5 = _mm_loadu_ps(i5);
const __m128 vi6 = _mm_loadu_ps(i6);
const __m128 vi7 = _mm_loadu_ps(i7);
const __m128 vacc = _mm_load_ps(b);
const __m128 vsum01 = _mm_add_ps(vi0, vi1);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum67 = _mm_add_ps(vi6, vi7);
const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
__m128 vout = _mm_mul_ps(vsum, vscale);
vout = _mm_max_ps(vout, vmin);
vout = _mm_min_ps(vout, vmax);
if (c & 2) {
_mm_storel_pi((__m64*) output, vout);
vout = _mm_movehl_ps(vout, vout);
output += 2;
}
if (c & 1) {
_mm_store_ss(output, vout);
output += 1;
}
}
}
output = (float*) ((uintptr_t) output + output_increment);
} while (--output_pixels != 0);
}
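// f32 average pooling, unipass variant: handles up to 9 pooling elements in a
// single pass, with output scaling and min/max clamping, 4 channels per SSE
// register (c4).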
void xnn_f32_avgpool_minmax_ukernel_9x__sse_c4(
size_t output_pixels,
size_t kernel_elements,
size_t channels,
const float** input,
size_t input_offset,
const float* zero,
float* output,
size_t input_increment,
size_t output_increment,
const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(output_pixels != 0);
assert(kernel_elements != 0);
assert(kernel_elements <= 9);
assert(channels != 0);
const __m128 vscale = _mm_load_ps(params->sse.scale);
const __m128 vmin = _mm_load_ps(params->sse.min);
const __m128 vmax = _mm_load_ps(params->sse.max);
do {
const float* i0 = input[0];
assert(i0 != NULL);
const float* i1 = input[1];
const float* i2 = input[2];
const float* i3 = input[3];
const float* i4 = input[4];
const float* i5 = input[5];
const float* i6 = input[6];
const float* i7 = input[7];
const float* i8 = input[8];
input = (const float**) ((uintptr_t) input + input_increment);
if (kernel_elements < 2) {
i1 = zero;
}
assert(i1 != NULL);
if (kernel_elements <= 2) {
i2 = zero;
}
assert(i2 != NULL);
if (kernel_elements < 4) {
i3 = zero;
}
assert(i3 != NULL);
if (kernel_elements <= 4) {
i4 = zero;
}
assert(i4 != NULL);
if (kernel_elements < 6) {
i5 = zero;
}
assert(i5 != NULL);
if (kernel_elements <= 6) {
i6 = zero;
}
assert(i6 != NULL);
if (kernel_elements < 8) {
i7 = zero;
}
assert(i7 != NULL);
if (kernel_elements <= 8) {
i8 = zero;
}
assert(i8 != NULL);
if XNN_UNPREDICTABLE(i0 != zero) {
i0 = (const float*) ((uintptr_t) i0 + input_offset);
}
if XNN_UNPREDICTABLE(i1 != zero) {
i1 = (const float*) ((uintptr_t) i1 + input_offset);
}
if XNN_UNPREDICTABLE(i2 != zero) {
i2 = (const float*) ((uintptr_t) i2 + input_offset);
}
if XNN_UNPREDICTABLE(i3 != zero) {
i3 = (const float*) ((uintptr_t) i3 + input_offset);
}
if XNN_UNPREDICTABLE(i4 != zero) {
i4 = (const float*) ((uintptr_t) i4 + input_offset);
}
if XNN_UNPREDICTABLE(i5 != zero) {
i5 = (const float*) ((uintptr_t) i5 + input_offset);
}
if XNN_UNPREDICTABLE(i6 != zero) {
i6 = (const float*) ((uintptr_t) i6 + input_offset);
}
if XNN_UNPREDICTABLE(i7 != zero) {
i7 = (const float*) ((uintptr_t) i7 + input_offset);
}
if XNN_UNPREDICTABLE(i8 != zero) {
i8 = (const float*) ((uintptr_t) i8 + input_offset);
}
size_t c = channels;
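    // Main channel loop: sum the 9 (zero-padded) inputs, multiply by the
    // precomputed scale, and clamp to [min, max].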
while (c >= 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vi2 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vi3 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vi4 = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vi5 = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vi6 = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vi7 = _mm_loadu_ps(i7);
i7 += 4;
const __m128 vi8 = _mm_loadu_ps(i8);
i8 += 4;
const __m128 vsum018 = _mm_add_ps(_mm_add_ps(vi0, vi1), vi8);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum67 = _mm_add_ps(vi6, vi7);
const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
__m128 vout = _mm_mul_ps(vsum, vscale);
vout = _mm_max_ps(vout, vmin);
vout = _mm_min_ps(vout, vmax);
_mm_storeu_ps(output, vout); output += 4;
c -= 4;
}
if (c != 0) {
const __m128 vi0 = _mm_loadu_ps(i0);
const __m128 vi1 = _mm_loadu_ps(i1);
const __m128 vi2 = _mm_loadu_ps(i2);
const __m128 vi3 = _mm_loadu_ps(i3);
const __m128 vi4 = _mm_loadu_ps(i4);
const __m128 vi5 = _mm_loadu_ps(i5);
const __m128 vi6 = _mm_loadu_ps(i6);
const __m128 vi7 = _mm_loadu_ps(i7);
const __m128 vi8 = _mm_loadu_ps(i8);
const __m128 vsum01 = _mm_add_ps(vi0, vi1);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum67 = _mm_add_ps(vi6, vi7);
const __m128 vsum018 = _mm_add_ps(vsum01, vi8);
const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
__m128 vout = _mm_mul_ps(vsum, vscale);
vout = _mm_max_ps(vout, vmin);
vout = _mm_min_ps(vout, vmax);
if (c & 2) {
_mm_storel_pi((__m64*) output, vout);
vout = _mm_movehl_ps(vout, vout);
output += 2;
}
if (c & 1) {
_mm_store_ss(output, vout);
output += 1;
}
}
output = (float*) ((uintptr_t) output + output_increment);
} while (--output_pixels != 0);
}
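// 3x3 stride-2 convolution with padding 1 and 3 input channels, reading HWC
// input and writing CHW output. Each weight block covers 4 output channels
// (c3x4), and each iteration computes a 2x2 tile of output pixels
// (2 output rows x 2 output columns).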
void xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2(
size_t input_height,
size_t input_width,
size_t output_y_start,
size_t output_y_end,
const float* input,
const float* zero,
const float* weights,
float* output,
size_t input_padding_top,
size_t output_channels,
size_t output_height_stride,
size_t output_channel_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(input_width != 0);
assert(output_y_end > output_y_start);
assert(input_padding_top <= 1);
assert(output_channels != 0);
const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
const size_t input_width_increment = round_down_po2(input_width, 4) * 3 /* channels */ * sizeof(float);
const size_t output_width = (input_width + 1) / 2;
const size_t output_channel_increment = output_channel_stride * 4 - output_width * sizeof(float);
// Adjustment for padding processed below
const float* i0 = (const float*) ((uintptr_t) input + input_height_stride * (output_y_start * 2 - input_padding_top));
const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
float* output0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
float* output1 = (float*) ((uintptr_t) output0 + output_height_stride);
if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
i0 = zero;
}
const __m128 vmin = _mm_load_ps(params->sse.min);
const __m128 vmax = _mm_load_ps(params->sse.max);
for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
const size_t input_y4 = input_y2 + 2;
if XNN_UNPREDICTABLE(input_y2 >= input_height) {
i2 = zero;
}
if XNN_UNPREDICTABLE(input_y4 > input_height) {
i3 = zero;
}
if XNN_UNPREDICTABLE(input_y4 >= input_height) {
i4 = zero;
}
if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
output1 = output0;
}
const float* w = weights;
size_t c = output_channels;
float* o0c0 = output0;
float* o1c0 = output1;
float* o0c1 = (float*) ((uintptr_t) o0c0 + output_channel_stride);
float* o1c1 = (float*) ((uintptr_t) o1c0 + output_channel_stride);
float* o0c2 = (float*) ((uintptr_t) o0c1 + output_channel_stride);
float* o1c2 = (float*) ((uintptr_t) o1c1 + output_channel_stride);
float* o0c3 = (float*) ((uintptr_t) o0c2 + output_channel_stride);
float* o1c3 = (float*) ((uintptr_t) o1c2 + output_channel_stride);
do {
if XNN_UNPREDICTABLE(c < 2) {
o0c1 = o0c0;
o1c1 = o1c0;
}
if XNN_UNPREDICTABLE(c <= 2) {
o0c2 = o0c1;
o1c2 = o1c1;
}
if XNN_UNPREDICTABLE(c < 4) {
o0c3 = o0c2;
o1c3 = o1c2;
}
// viMx0 = ( iM0c2, iM0c1, iM0c0, --- )
__m128 vi0x0 = _mm_setzero_ps();
__m128 vi1x0 = _mm_setzero_ps();
__m128 vi2x0 = _mm_setzero_ps();
__m128 vi3x0 = _mm_setzero_ps();
__m128 vi4x0 = _mm_setzero_ps();
size_t iw = input_width;
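      // Main loop over the input width: each iteration consumes 4 input pixels
      // (12 floats per row) and writes 2 output pixels to each of the 2 output
      // rows for the current block of 4 output channels.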
for (; iw >= 4; iw -= 4) {
__m128 vo0x0 = _mm_load_ps(w);
__m128 vo1x0 = vo0x0;
__m128 vo0x1 = vo0x0;
__m128 vo1x1 = vo0x0;
const __m128 vk00c0 = _mm_load_ps(w + 4);
// viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
const __m128 vi0x1 = _mm_loadu_ps(i0); i0 += 4;
const __m128 vi1x1 = _mm_loadu_ps(i1); i1 += 4;
const __m128 vi2x1 = _mm_loadu_ps(i2); i2 += 4;
const __m128 vi3x1 = _mm_loadu_ps(i3); i3 += 4;
const __m128 vi4x1 = _mm_loadu_ps(i4); i4 += 4;
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
const __m128 vk10c0 = _mm_load_ps(w + 8);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(1, 1, 1, 1))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
const __m128 vk20c0 = _mm_load_ps(w + 12);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(1, 1, 1, 1))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
const __m128 vk00c1 = _mm_load_ps(w + 16);
// viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
const __m128 vi0x2 = _mm_loadu_ps(i0); i0 += 4;
const __m128 vi1x2 = _mm_loadu_ps(i1); i1 += 4;
const __m128 vi2x2 = _mm_loadu_ps(i2); i2 += 4;
const __m128 vi3x2 = _mm_loadu_ps(i3); i3 += 4;
const __m128 vi4x2 = _mm_loadu_ps(i4); i4 += 4;
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
const __m128 vk10c1 = _mm_load_ps(w + 20);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(2, 2, 2, 2))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
const __m128 vk20c1 = _mm_load_ps(w + 24);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(2, 2, 2, 2))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
const __m128 vk00c2 = _mm_load_ps(w + 28);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
const __m128 vk10c2 = _mm_load_ps(w + 32);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(3, 3, 3, 3))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
const __m128 vk20c2 = _mm_load_ps(w + 36);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(3, 3, 3, 3))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
const __m128 vk01c0 = _mm_load_ps(w + 40);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
const __m128 vk11c0 = _mm_load_ps(w + 44);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(0, 0, 0, 0))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(2, 2, 2, 2))));
const __m128 vk21c0 = _mm_load_ps(w + 48);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(0, 0, 0, 0))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(2, 2, 2, 2))));
const __m128 vk01c1 = _mm_load_ps(w + 52);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
const __m128 vk11c1 = _mm_load_ps(w + 56);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(1, 1, 1, 1))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(3, 3, 3, 3))));
const __m128 vk21c1 = _mm_load_ps(w + 60);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(1, 1, 1, 1))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(3, 3, 3, 3))));
const __m128 vk01c2 = _mm_load_ps(w + 64);
// viMx3 = ( iM4c2, iM4c1, iM4c0, iM3c2 )
const __m128 vi0x3 = _mm_loadu_ps(i0); i0 += 4;
const __m128 vi1x3 = _mm_loadu_ps(i1); i1 += 4;
const __m128 vi2x3 = _mm_loadu_ps(i2); i2 += 4;
const __m128 vi3x3 = _mm_loadu_ps(i3); i3 += 4;
const __m128 vi4x3 = _mm_loadu_ps(i4); i4 += 4;
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
const __m128 vk11c2 = _mm_load_ps(w + 68);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(2, 2, 2, 2))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(0, 0, 0, 0))));
const __m128 vk21c2 = _mm_load_ps(w + 72);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(2, 2, 2, 2))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(0, 0, 0, 0))));
const __m128 vk02c0 = _mm_load_ps(w + 76);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(1, 1, 1, 1))));
const __m128 vk12c0 = _mm_load_ps(w + 80);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(1, 1, 1, 1))));
const __m128 vk22c0 = _mm_load_ps(w + 84);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(1, 1, 1, 1))));
const __m128 vk02c1 = _mm_load_ps(w + 88);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(2, 2, 2, 2))));
const __m128 vk12c1 = _mm_load_ps(w + 92);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(2, 2, 2, 2))));
const __m128 vk22c1 = _mm_load_ps(w + 96);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(2, 2, 2, 2))));
const __m128 vk02c2 = _mm_load_ps(w + 100);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(3, 3, 3, 3))));
const __m128 vk12c2 = _mm_load_ps(w + 104);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(3, 3, 3, 3))));
const __m128 vk22c2 = _mm_load_ps(w + 108);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(3, 3, 3, 3))));
vi0x0 = vi0x3;
vi1x0 = vi1x3;
vi2x0 = vi2x3;
vi3x0 = vi3x3;
vi4x0 = vi4x3;
vo0x0 = _mm_max_ps(vo0x0, vmin);
vo1x0 = _mm_max_ps(vo1x0, vmin);
vo0x1 = _mm_max_ps(vo0x1, vmin);
vo1x1 = _mm_max_ps(vo1x1, vmin);
vo0x0 = _mm_min_ps(vo0x0, vmax);
vo1x0 = _mm_min_ps(vo1x0, vmax);
vo0x1 = _mm_min_ps(vo0x1, vmax);
vo1x1 = _mm_min_ps(vo1x1, vmax);
const __m128 vo0c01 = _mm_unpacklo_ps(vo0x0, vo0x1);
const __m128 vo0c23 = _mm_unpackhi_ps(vo0x0, vo0x1);
const __m128 vo1c01 = _mm_unpacklo_ps(vo1x0, vo1x1);
const __m128 vo1c23 = _mm_unpackhi_ps(vo1x0, vo1x1);
// Always 2+ output width elements remaining
_mm_storel_pi((__m64 *)o1c0, vo1c01); o1c0 += 2;
_mm_storel_pi((__m64 *)o1c1, _mm_shuffle_ps(vo1c01, vo1c01, _MM_SHUFFLE(3, 2, 3, 2))); o1c1 += 2;
_mm_storel_pi((__m64 *)o1c2, vo1c23); o1c2 += 2;
_mm_storel_pi((__m64 *)o1c3, _mm_shuffle_ps(vo1c23, vo1c23, _MM_SHUFFLE(3, 2, 3, 2))); o1c3 += 2;
_mm_storel_pi((__m64 *)o0c0, vo0c01); o0c0 += 2;
_mm_storel_pi((__m64 *)o0c1, _mm_shuffle_ps(vo0c01, vo0c01, _MM_SHUFFLE(3, 2, 3, 2))); o0c1 += 2;
_mm_storel_pi((__m64 *)o0c2, vo0c23); o0c2 += 2;
_mm_storel_pi((__m64 *)o0c3, _mm_shuffle_ps(vo0c23, vo0c23, _MM_SHUFFLE(3, 2, 3, 2))); o0c3 += 2;
}
assert(iw < 4);
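      // Remainder: 1-3 input pixels left in the row, producing 1 output pixel
      // (iw == 1 or 2) or 2 output pixels (iw == 3).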
if XNN_UNLIKELY(iw != 0) {
__m128 vo0x0 = _mm_load_ps(w);
__m128 vo1x0 = vo0x0;
__m128 vo0x1 = vo0x0;
__m128 vo1x1 = vo0x0;
const __m128 vk00c0 = _mm_load_ps(w + 4);
// viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
__m128 vi0x1 = _mm_loadu_ps(i0);
__m128 vi1x1 = _mm_loadu_ps(i1);
__m128 vi2x1 = _mm_loadu_ps(i2);
__m128 vi3x1 = _mm_loadu_ps(i3);
__m128 vi4x1 = _mm_loadu_ps(i4);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
if (iw > 2) {
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
}
const __m128 vk10c0 = _mm_load_ps(w + 8);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(1, 1, 1, 1))));
if (iw > 2) {
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
}
const __m128 vk20c0 = _mm_load_ps(w + 12);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(1, 1, 1, 1))));
if (iw > 2) {
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
}
const __m128 vk00c1 = _mm_load_ps(w + 16);
__m128 vi0x2 = _mm_setzero_ps();
__m128 vi1x2 = _mm_setzero_ps();
__m128 vi2x2 = _mm_setzero_ps();
__m128 vi3x2 = _mm_setzero_ps();
__m128 vi4x2 = _mm_setzero_ps();
if (iw >= 2) {
// viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
vi0x2 = _mm_loadu_ps(i0 + 4);
vi1x2 = _mm_loadu_ps(i1 + 4);
vi2x2 = _mm_loadu_ps(i2 + 4);
vi3x2 = _mm_loadu_ps(i3 + 4);
vi4x2 = _mm_loadu_ps(i4 + 4);
}
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
const __m128 vk10c1 = _mm_load_ps(w + 20);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(2, 2, 2, 2))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
const __m128 vk20c1 = _mm_load_ps(w + 24);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(2, 2, 2, 2))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
const __m128 vk00c2 = _mm_load_ps(w + 28);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
const __m128 vk10c2 = _mm_load_ps(w + 32);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(3, 3, 3, 3))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
const __m128 vk20c2 = _mm_load_ps(w + 36);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(3, 3, 3, 3))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
const __m128 vk01c0 = _mm_load_ps(w + 40);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
if (iw > 2) {
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
}
const __m128 vk11c0 = _mm_load_ps(w + 44);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(0, 0, 0, 0))));
if (iw > 2) {
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(2, 2, 2, 2))));
}
const __m128 vk21c0 = _mm_load_ps(w + 48);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(0, 0, 0, 0))));
if (iw > 2) {
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(2, 2, 2, 2))));
}
const __m128 vk01c1 = _mm_load_ps(w + 52);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
if (iw > 2) {
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
}
const __m128 vk11c1 = _mm_load_ps(w + 56);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(1, 1, 1, 1))));
if (iw > 2) {
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(3, 3, 3, 3))));
}
const __m128 vk21c1 = _mm_load_ps(w + 60);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(1, 1, 1, 1))));
if (iw > 2) {
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(3, 3, 3, 3))));
}
const __m128 vk01c2 = _mm_load_ps(w + 64);
__m128 vi0x3 = _mm_setzero_ps();
__m128 vi1x3 = _mm_setzero_ps();
__m128 vi2x3 = _mm_setzero_ps();
__m128 vi3x3 = _mm_setzero_ps();
__m128 vi4x3 = _mm_setzero_ps();
if (iw > 2) {
// viMx3 = ( 0.0, 0.0, 0.0, iM3c2 )
vi0x3 = _mm_load_ss(i0 + 8);
vi1x3 = _mm_load_ss(i1 + 8);
vi2x3 = _mm_load_ss(i2 + 8);
vi3x3 = _mm_load_ss(i3 + 8);
vi4x3 = _mm_load_ss(i4 + 8);
}
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
const __m128 vk11c2 = _mm_load_ps(w + 68);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(2, 2, 2, 2))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(0, 0, 0, 0))));
const __m128 vk21c2 = _mm_load_ps(w + 72);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(2, 2, 2, 2))));
vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(0, 0, 0, 0))));
if (iw >= 2) {
const __m128 vk02c0 = _mm_load_ps(w + 76);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
const __m128 vk12c0 = _mm_load_ps(w + 80);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
const __m128 vk22c0 = _mm_load_ps(w + 84);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
const __m128 vk02c1 = _mm_load_ps(w + 88);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
const __m128 vk12c1 = _mm_load_ps(w + 92);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
const __m128 vk22c1 = _mm_load_ps(w + 96);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
const __m128 vk02c2 = _mm_load_ps(w + 100);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
const __m128 vk12c2 = _mm_load_ps(w + 104);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
const __m128 vk22c2 = _mm_load_ps(w + 108);
vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
}
vo0x0 = _mm_max_ps(vo0x0, vmin);
vo1x0 = _mm_max_ps(vo1x0, vmin);
vo0x1 = _mm_max_ps(vo0x1, vmin);
vo1x1 = _mm_max_ps(vo1x1, vmin);
vo0x0 = _mm_min_ps(vo0x0, vmax);
vo1x0 = _mm_min_ps(vo1x0, vmax);
vo0x1 = _mm_min_ps(vo0x1, vmax);
vo1x1 = _mm_min_ps(vo1x1, vmax);
if (iw == 3) {
// Exactly 2 output width elements remaining
const __m128 vo0c01 = _mm_unpacklo_ps(vo0x0, vo0x1);
const __m128 vo0c23 = _mm_unpackhi_ps(vo0x0, vo0x1);
const __m128 vo1c01 = _mm_unpacklo_ps(vo1x0, vo1x1);
const __m128 vo1c23 = _mm_unpackhi_ps(vo1x0, vo1x1);
_mm_storel_pi((__m64 *)o1c0, vo1c01); o1c0 += 2;
_mm_storel_pi((__m64 *)o1c1, _mm_shuffle_ps(vo1c01, vo1c01, _MM_SHUFFLE(3, 2, 3, 2))); o1c1 += 2;
_mm_storel_pi((__m64 *)o1c2, vo1c23); o1c2 += 2;
_mm_storel_pi((__m64 *)o1c3, _mm_shuffle_ps(vo1c23, vo1c23, _MM_SHUFFLE(3, 2, 3, 2))); o1c3 += 2;
_mm_storel_pi((__m64 *)o0c0, vo0c01); o0c0 += 2;
_mm_storel_pi((__m64 *)o0c1, _mm_shuffle_ps(vo0c01, vo0c01, _MM_SHUFFLE(3, 2, 3, 2))); o0c1 += 2;
_mm_storel_pi((__m64 *)o0c2, vo0c23); o0c2 += 2;
_mm_storel_pi((__m64 *)o0c3, _mm_shuffle_ps(vo0c23, vo0c23, _MM_SHUFFLE(3, 2, 3, 2))); o0c3 += 2;
} else {
// Exactly 1 output width element remaining
_mm_store_ss(o1c0, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(0, 0, 0, 0))); o1c0 += 1;
_mm_store_ss(o1c1, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(1, 1, 1, 1))); o1c1 += 1;
_mm_store_ss(o1c2, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(2, 2, 2, 2))); o1c2 += 1;
_mm_store_ss(o1c3, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(3, 3, 3, 3))); o1c3 += 1;
_mm_store_ss(o0c0, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(0, 0, 0, 0))); o0c0 += 1;
_mm_store_ss(o0c1, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(1, 1, 1, 1))); o0c1 += 1;
_mm_store_ss(o0c2, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(2, 2, 2, 2))); o0c2 += 1;
_mm_store_ss(o0c3, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(3, 3, 3, 3))); o0c3 += 1;
}
}
// Move output pointers back to the position of the first pixel in a row,
// and forward to the next block of output channels.
o0c0 = (float*) ((uintptr_t) o0c0 + output_channel_increment);
o0c1 = (float*) ((uintptr_t) o0c1 + output_channel_increment);
o0c2 = (float*) ((uintptr_t) o0c2 + output_channel_increment);
o0c3 = (float*) ((uintptr_t) o0c3 + output_channel_increment);
o1c0 = (float*) ((uintptr_t) o1c0 + output_channel_increment);
o1c1 = (float*) ((uintptr_t) o1c1 + output_channel_increment);
o1c2 = (float*) ((uintptr_t) o1c2 + output_channel_increment);
o1c3 = (float*) ((uintptr_t) o1c3 + output_channel_increment);
// Revert input pointers to the position of the first pixel in a row
i0 = (const float*) ((uintptr_t) i0 - input_width_increment);
i1 = (const float*) ((uintptr_t) i1 - input_width_increment);
i2 = (const float*) ((uintptr_t) i2 - input_width_increment);
i3 = (const float*) ((uintptr_t) i3 - input_width_increment);
i4 = (const float*) ((uintptr_t) i4 - input_width_increment);
// Move to the block of weights for the next 4 output channels
w += 112;
c = doz(c, 4);
} while (c != 0);
// Move output pointers forward to the next two rows
output0 = (float*) ((uintptr_t) output1 + output_height_stride);
output1 = (float*) ((uintptr_t) output0 + output_height_stride);
// Move input pointers forward to the next four rows
i0 = i4;
i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
}
}
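// f32 depthwise convolution with 25 taps (e.g. a 5x5 kernel) and min/max
// clamping. The main loop processes 8 channels per iteration (up8), followed
// by 4-channel and scalar tails.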
void xnn_f32_dwconv_minmax_ukernel_up8x25__sse(
size_t channels,
size_t output_width,
const float** input,
const float* weights,
float* output,
size_t input_stride,
size_t output_increment,
size_t input_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(channels != 0);
assert(output_width != 0);
const __m128 vmax = _mm_load_ps(params->sse.max);
const __m128 vmin = _mm_load_ps(params->sse.min);
do {
const float* i0 = input[0];
assert(i0 != NULL);
if XNN_UNPREDICTABLE(i0 != zero) {
i0 = (const float*) ((uintptr_t) i0 + input_offset);
}
const float* i1 = input[1];
assert(i1 != NULL);
if XNN_UNPREDICTABLE(i1 != zero) {
i1 = (const float*) ((uintptr_t) i1 + input_offset);
}
const float* i2 = input[2];
assert(i2 != NULL);
if XNN_UNPREDICTABLE(i2 != zero) {
i2 = (const float*) ((uintptr_t) i2 + input_offset);
}
const float* i3 = input[3];
assert(i3 != NULL);
if XNN_UNPREDICTABLE(i3 != zero) {
i3 = (const float*) ((uintptr_t) i3 + input_offset);
}
const float* i4 = input[4];
assert(i4 != NULL);
if XNN_UNPREDICTABLE(i4 != zero) {
i4 = (const float*) ((uintptr_t) i4 + input_offset);
}
const float* i5 = input[5];
assert(i5 != NULL);
if XNN_UNPREDICTABLE(i5 != zero) {
i5 = (const float*) ((uintptr_t) i5 + input_offset);
}
const float* i6 = input[6];
assert(i6 != NULL);
if XNN_UNPREDICTABLE(i6 != zero) {
i6 = (const float*) ((uintptr_t) i6 + input_offset);
}
const float* i7 = input[7];
assert(i7 != NULL);
if XNN_UNPREDICTABLE(i7 != zero) {
i7 = (const float*) ((uintptr_t) i7 + input_offset);
}
const float* i8 = input[8];
assert(i8 != NULL);
if XNN_UNPREDICTABLE(i8 != zero) {
i8 = (const float*) ((uintptr_t) i8 + input_offset);
}
const float* i9 = input[9];
assert(i9 != NULL);
if XNN_UNPREDICTABLE(i9 != zero) {
i9 = (const float*) ((uintptr_t) i9 + input_offset);
}
const float* i10 = input[10];
assert(i10 != NULL);
if XNN_UNPREDICTABLE(i10 != zero) {
i10 = (const float*) ((uintptr_t) i10 + input_offset);
}
const float* i11 = input[11];
assert(i11 != NULL);
if XNN_UNPREDICTABLE(i11 != zero) {
i11 = (const float*) ((uintptr_t) i11 + input_offset);
}
const float* i12 = input[12];
assert(i12 != NULL);
if XNN_UNPREDICTABLE(i12 != zero) {
i12 = (const float*) ((uintptr_t) i12 + input_offset);
}
const float* i13 = input[13];
assert(i13 != NULL);
if XNN_UNPREDICTABLE(i13 != zero) {
i13 = (const float*) ((uintptr_t) i13 + input_offset);
}
const float* i14 = input[14];
assert(i14 != NULL);
if XNN_UNPREDICTABLE(i14 != zero) {
i14 = (const float*) ((uintptr_t) i14 + input_offset);
}
const float* i15 = input[15];
assert(i15 != NULL);
if XNN_UNPREDICTABLE(i15 != zero) {
i15 = (const float*) ((uintptr_t) i15 + input_offset);
}
const float* i16 = input[16];
assert(i16 != NULL);
if XNN_UNPREDICTABLE(i16 != zero) {
i16 = (const float*) ((uintptr_t) i16 + input_offset);
}
const float* i17 = input[17];
assert(i17 != NULL);
if XNN_UNPREDICTABLE(i17 != zero) {
i17 = (const float*) ((uintptr_t) i17 + input_offset);
}
const float* i18 = input[18];
assert(i18 != NULL);
if XNN_UNPREDICTABLE(i18 != zero) {
i18 = (const float*) ((uintptr_t) i18 + input_offset);
}
const float* i19 = input[19];
assert(i19 != NULL);
if XNN_UNPREDICTABLE(i19 != zero) {
i19 = (const float*) ((uintptr_t) i19 + input_offset);
}
const float* i20 = input[20];
assert(i20 != NULL);
if XNN_UNPREDICTABLE(i20 != zero) {
i20 = (const float*) ((uintptr_t) i20 + input_offset);
}
const float* i21 = input[21];
assert(i21 != NULL);
if XNN_UNPREDICTABLE(i21 != zero) {
i21 = (const float*) ((uintptr_t) i21 + input_offset);
}
const float* i22 = input[22];
assert(i22 != NULL);
if XNN_UNPREDICTABLE(i22 != zero) {
i22 = (const float*) ((uintptr_t) i22 + input_offset);
}
const float* i23 = input[23];
assert(i23 != NULL);
if XNN_UNPREDICTABLE(i23 != zero) {
i23 = (const float*) ((uintptr_t) i23 + input_offset);
}
const float* i24 = input[24];
assert(i24 != NULL);
if XNN_UNPREDICTABLE(i24 != zero) {
i24 = (const float*) ((uintptr_t) i24 + input_offset);
}
input = (const float**) ((uintptr_t) input + input_stride);
size_t c = channels;
const float* w = weights;
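    // Main loop: 8 channels per iteration. The weights are packed as 8 bias
    // values followed by 8 values for each of the 25 taps (208 floats per
    // channel block).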
for (; c >= 8; c -= 8) {
__m128 vacc0123p0 = _mm_load_ps(w);
__m128 vacc4567p0 = _mm_load_ps(w + 4);
const __m128 vi0x0123 = _mm_loadu_ps(i0);
const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
i0 += 8;
const __m128 vk0x0123 = _mm_load_ps(w + 8);
const __m128 vk0x4567 = _mm_load_ps(w + 12);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
const __m128 vi1x0123 = _mm_loadu_ps(i1);
const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
i1 += 8;
const __m128 vk1x0123 = _mm_load_ps(w + 16);
const __m128 vk1x4567 = _mm_load_ps(w + 20);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
const __m128 vi2x0123 = _mm_loadu_ps(i2);
const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
i2 += 8;
const __m128 vk2x0123 = _mm_load_ps(w + 24);
const __m128 vk2x4567 = _mm_load_ps(w + 28);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
const __m128 vi3x0123 = _mm_loadu_ps(i3);
const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
i3 += 8;
const __m128 vk3x0123 = _mm_load_ps(w + 32);
const __m128 vk3x4567 = _mm_load_ps(w + 36);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi3x4567, vk3x4567));
const __m128 vi4x0123 = _mm_loadu_ps(i4);
const __m128 vi4x4567 = _mm_loadu_ps(i4 + 4);
i4 += 8;
const __m128 vk4x0123 = _mm_load_ps(w + 40);
const __m128 vk4x4567 = _mm_load_ps(w + 44);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi4x4567, vk4x4567));
const __m128 vi5x0123 = _mm_loadu_ps(i5);
const __m128 vi5x4567 = _mm_loadu_ps(i5 + 4);
i5 += 8;
const __m128 vk5x0123 = _mm_load_ps(w + 48);
const __m128 vk5x4567 = _mm_load_ps(w + 52);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi5x4567, vk5x4567));
const __m128 vi6x0123 = _mm_loadu_ps(i6);
const __m128 vi6x4567 = _mm_loadu_ps(i6 + 4);
i6 += 8;
const __m128 vk6x0123 = _mm_load_ps(w + 56);
const __m128 vk6x4567 = _mm_load_ps(w + 60);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi6x4567, vk6x4567));
const __m128 vi7x0123 = _mm_loadu_ps(i7);
const __m128 vi7x4567 = _mm_loadu_ps(i7 + 4);
i7 += 8;
const __m128 vk7x0123 = _mm_load_ps(w + 64);
const __m128 vk7x4567 = _mm_load_ps(w + 68);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi7x4567, vk7x4567));
const __m128 vi8x0123 = _mm_loadu_ps(i8);
const __m128 vi8x4567 = _mm_loadu_ps(i8 + 4);
i8 += 8;
const __m128 vk8x0123 = _mm_load_ps(w + 72);
const __m128 vk8x4567 = _mm_load_ps(w + 76);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi8x4567, vk8x4567));
const __m128 vi9x0123 = _mm_loadu_ps(i9);
const __m128 vi9x4567 = _mm_loadu_ps(i9 + 4);
i9 += 8;
const __m128 vk9x0123 = _mm_load_ps(w + 80);
const __m128 vk9x4567 = _mm_load_ps(w + 84);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi9x0123, vk9x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi9x4567, vk9x4567));
const __m128 vi10x0123 = _mm_loadu_ps(i10);
const __m128 vi10x4567 = _mm_loadu_ps(i10 + 4);
i10 += 8;
const __m128 vk10x0123 = _mm_load_ps(w + 88);
const __m128 vk10x4567 = _mm_load_ps(w + 92);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi10x4567, vk10x4567));
const __m128 vi11x0123 = _mm_loadu_ps(i11);
const __m128 vi11x4567 = _mm_loadu_ps(i11 + 4);
i11 += 8;
const __m128 vk11x0123 = _mm_load_ps(w + 96);
const __m128 vk11x4567 = _mm_load_ps(w + 100);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi11x0123, vk11x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi11x4567, vk11x4567));
const __m128 vi12x0123 = _mm_loadu_ps(i12);
const __m128 vi12x4567 = _mm_loadu_ps(i12 + 4);
i12 += 8;
const __m128 vk12x0123 = _mm_load_ps(w + 104);
const __m128 vk12x4567 = _mm_load_ps(w + 108);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi12x4567, vk12x4567));
const __m128 vi13x0123 = _mm_loadu_ps(i13);
const __m128 vi13x4567 = _mm_loadu_ps(i13 + 4);
i13 += 8;
const __m128 vk13x0123 = _mm_load_ps(w + 112);
const __m128 vk13x4567 = _mm_load_ps(w + 116);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi13x0123, vk13x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi13x4567, vk13x4567));
const __m128 vi14x0123 = _mm_loadu_ps(i14);
const __m128 vi14x4567 = _mm_loadu_ps(i14 + 4);
i14 += 8;
const __m128 vk14x0123 = _mm_load_ps(w + 120);
const __m128 vk14x4567 = _mm_load_ps(w + 124);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi14x4567, vk14x4567));
const __m128 vi15x0123 = _mm_loadu_ps(i15);
const __m128 vi15x4567 = _mm_loadu_ps(i15 + 4);
i15 += 8;
const __m128 vk15x0123 = _mm_load_ps(w + 128);
const __m128 vk15x4567 = _mm_load_ps(w + 132);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi15x0123, vk15x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi15x4567, vk15x4567));
const __m128 vi16x0123 = _mm_loadu_ps(i16);
const __m128 vi16x4567 = _mm_loadu_ps(i16 + 4);
i16 += 8;
const __m128 vk16x0123 = _mm_load_ps(w + 136);
const __m128 vk16x4567 = _mm_load_ps(w + 140);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi16x4567, vk16x4567));
const __m128 vi17x0123 = _mm_loadu_ps(i17);
const __m128 vi17x4567 = _mm_loadu_ps(i17 + 4);
i17 += 8;
const __m128 vk17x0123 = _mm_load_ps(w + 144);
const __m128 vk17x4567 = _mm_load_ps(w + 148);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi17x0123, vk17x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi17x4567, vk17x4567));
const __m128 vi18x0123 = _mm_loadu_ps(i18);
const __m128 vi18x4567 = _mm_loadu_ps(i18 + 4);
i18 += 8;
const __m128 vk18x0123 = _mm_load_ps(w + 152);
const __m128 vk18x4567 = _mm_load_ps(w + 156);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi18x4567, vk18x4567));
const __m128 vi19x0123 = _mm_loadu_ps(i19);
const __m128 vi19x4567 = _mm_loadu_ps(i19 + 4);
i19 += 8;
const __m128 vk19x0123 = _mm_load_ps(w + 160);
const __m128 vk19x4567 = _mm_load_ps(w + 164);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi19x0123, vk19x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi19x4567, vk19x4567));
const __m128 vi20x0123 = _mm_loadu_ps(i20);
const __m128 vi20x4567 = _mm_loadu_ps(i20 + 4);
i20 += 8;
const __m128 vk20x0123 = _mm_load_ps(w + 168);
const __m128 vk20x4567 = _mm_load_ps(w + 172);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi20x4567, vk20x4567));
const __m128 vi21x0123 = _mm_loadu_ps(i21);
const __m128 vi21x4567 = _mm_loadu_ps(i21 + 4);
i21 += 8;
const __m128 vk21x0123 = _mm_load_ps(w + 176);
const __m128 vk21x4567 = _mm_load_ps(w + 180);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi21x0123, vk21x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi21x4567, vk21x4567));
const __m128 vi22x0123 = _mm_loadu_ps(i22);
const __m128 vi22x4567 = _mm_loadu_ps(i22 + 4);
i22 += 8;
const __m128 vk22x0123 = _mm_load_ps(w + 184);
const __m128 vk22x4567 = _mm_load_ps(w + 188);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi22x4567, vk22x4567));
const __m128 vi23x0123 = _mm_loadu_ps(i23);
const __m128 vi23x4567 = _mm_loadu_ps(i23 + 4);
i23 += 8;
const __m128 vk23x0123 = _mm_load_ps(w + 192);
const __m128 vk23x4567 = _mm_load_ps(w + 196);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi23x0123, vk23x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi23x4567, vk23x4567));
const __m128 vi24x0123 = _mm_loadu_ps(i24);
const __m128 vi24x4567 = _mm_loadu_ps(i24 + 4);
i24 += 8;
const __m128 vk24x0123 = _mm_load_ps(w + 200);
const __m128 vk24x4567 = _mm_load_ps(w + 204);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi24x4567, vk24x4567));
w += 208;
__m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
__m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
vacc0123 = _mm_min_ps(vacc0123, vmax);
vacc4567 = _mm_min_ps(vacc4567, vmax);
_mm_storeu_ps(output, vacc0123);
_mm_storeu_ps(output + 4, vacc4567);
output += 8;
}
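// Remainder of 4..7 channels: the final weight group is still packed for a
// tile of 8 channels, so this pass consumes lanes 0..3 of the bias and of each
// kernel row, then advances w by only 4 floats so that the 1..3-channel tail
// below reads lanes 4..7 of the same group.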
for (; c >= 4; c -= 4) {
__m128 vacc0123p0 = _mm_load_ps(w);
const __m128 vi0x0123 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vk0x0123 = _mm_load_ps(w + 8);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
const __m128 vi1x0123 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vk1x0123 = _mm_load_ps(w + 16);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
const __m128 vi2x0123 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vk2x0123 = _mm_load_ps(w + 24);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
const __m128 vi3x0123 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vk3x0123 = _mm_load_ps(w + 32);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
const __m128 vi4x0123 = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vk4x0123 = _mm_load_ps(w + 40);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
const __m128 vi5x0123 = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vk5x0123 = _mm_load_ps(w + 48);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
const __m128 vi6x0123 = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vk6x0123 = _mm_load_ps(w + 56);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
const __m128 vi7x0123 = _mm_loadu_ps(i7);
i7 += 4;
const __m128 vk7x0123 = _mm_load_ps(w + 64);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
const __m128 vi8x0123 = _mm_loadu_ps(i8);
i8 += 4;
const __m128 vk8x0123 = _mm_load_ps(w + 72);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
const __m128 vi9x0123 = _mm_loadu_ps(i9);
i9 += 4;
const __m128 vk9x0123 = _mm_load_ps(w + 80);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi9x0123, vk9x0123));
const __m128 vi10x0123 = _mm_loadu_ps(i10);
i10 += 4;
const __m128 vk10x0123 = _mm_load_ps(w + 88);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
const __m128 vi11x0123 = _mm_loadu_ps(i11);
i11 += 4;
const __m128 vk11x0123 = _mm_load_ps(w + 96);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi11x0123, vk11x0123));
const __m128 vi12x0123 = _mm_loadu_ps(i12);
i12 += 4;
const __m128 vk12x0123 = _mm_load_ps(w + 104);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
const __m128 vi13x0123 = _mm_loadu_ps(i13);
i13 += 4;
const __m128 vk13x0123 = _mm_load_ps(w + 112);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi13x0123, vk13x0123));
const __m128 vi14x0123 = _mm_loadu_ps(i14);
i14 += 4;
const __m128 vk14x0123 = _mm_load_ps(w + 120);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
const __m128 vi15x0123 = _mm_loadu_ps(i15);
i15 += 4;
const __m128 vk15x0123 = _mm_load_ps(w + 128);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi15x0123, vk15x0123));
const __m128 vi16x0123 = _mm_loadu_ps(i16);
i16 += 4;
const __m128 vk16x0123 = _mm_load_ps(w + 136);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
const __m128 vi17x0123 = _mm_loadu_ps(i17);
i17 += 4;
const __m128 vk17x0123 = _mm_load_ps(w + 144);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi17x0123, vk17x0123));
const __m128 vi18x0123 = _mm_loadu_ps(i18);
i18 += 4;
const __m128 vk18x0123 = _mm_load_ps(w + 152);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
const __m128 vi19x0123 = _mm_loadu_ps(i19);
i19 += 4;
const __m128 vk19x0123 = _mm_load_ps(w + 160);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi19x0123, vk19x0123));
const __m128 vi20x0123 = _mm_loadu_ps(i20);
i20 += 4;
const __m128 vk20x0123 = _mm_load_ps(w + 168);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
const __m128 vi21x0123 = _mm_loadu_ps(i21);
i21 += 4;
const __m128 vk21x0123 = _mm_load_ps(w + 176);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi21x0123, vk21x0123));
const __m128 vi22x0123 = _mm_loadu_ps(i22);
i22 += 4;
const __m128 vk22x0123 = _mm_load_ps(w + 184);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
const __m128 vi23x0123 = _mm_loadu_ps(i23);
i23 += 4;
const __m128 vk23x0123 = _mm_load_ps(w + 192);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi23x0123, vk23x0123));
const __m128 vi24x0123 = _mm_loadu_ps(i24);
i24 += 4;
const __m128 vk24x0123 = _mm_load_ps(w + 200);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
w += 4;
__m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
vacc0123 = _mm_min_ps(vacc0123, vmax);
_mm_storeu_ps(output, vacc0123);
output += 4;
}
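// Tail of 1..3 channels: a full 4-lane result is computed (loads past the last
// channel are permitted under XNN_OOB_READS) and stored 2 and/or 1 element at
// a time.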
if XNN_UNLIKELY(c != 0) {
__m128 vacc0123p0 = _mm_load_ps(w);
const __m128 vi0x0123 = _mm_loadu_ps(i0);
const __m128 vk0x0123 = _mm_load_ps(w + 8);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
const __m128 vi1x0123 = _mm_loadu_ps(i1);
const __m128 vk1x0123 = _mm_load_ps(w + 16);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
const __m128 vi2x0123 = _mm_loadu_ps(i2);
const __m128 vk2x0123 = _mm_load_ps(w + 24);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
const __m128 vi3x0123 = _mm_loadu_ps(i3);
const __m128 vk3x0123 = _mm_load_ps(w + 32);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
const __m128 vi4x0123 = _mm_loadu_ps(i4);
const __m128 vk4x0123 = _mm_load_ps(w + 40);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
const __m128 vi5x0123 = _mm_loadu_ps(i5);
const __m128 vk5x0123 = _mm_load_ps(w + 48);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
const __m128 vi6x0123 = _mm_loadu_ps(i6);
const __m128 vk6x0123 = _mm_load_ps(w + 56);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
const __m128 vi7x0123 = _mm_loadu_ps(i7);
const __m128 vk7x0123 = _mm_load_ps(w + 64);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
const __m128 vi8x0123 = _mm_loadu_ps(i8);
const __m128 vk8x0123 = _mm_load_ps(w + 72);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
const __m128 vi9x0123 = _mm_loadu_ps(i9);
const __m128 vk9x0123 = _mm_load_ps(w + 80);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi9x0123, vk9x0123));
const __m128 vi10x0123 = _mm_loadu_ps(i10);
const __m128 vk10x0123 = _mm_load_ps(w + 88);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
const __m128 vi11x0123 = _mm_loadu_ps(i11);
const __m128 vk11x0123 = _mm_load_ps(w + 96);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi11x0123, vk11x0123));
const __m128 vi12x0123 = _mm_loadu_ps(i12);
const __m128 vk12x0123 = _mm_load_ps(w + 104);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
const __m128 vi13x0123 = _mm_loadu_ps(i13);
const __m128 vk13x0123 = _mm_load_ps(w + 112);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi13x0123, vk13x0123));
const __m128 vi14x0123 = _mm_loadu_ps(i14);
const __m128 vk14x0123 = _mm_load_ps(w + 120);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
const __m128 vi15x0123 = _mm_loadu_ps(i15);
const __m128 vk15x0123 = _mm_load_ps(w + 128);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi15x0123, vk15x0123));
const __m128 vi16x0123 = _mm_loadu_ps(i16);
const __m128 vk16x0123 = _mm_load_ps(w + 136);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
const __m128 vi17x0123 = _mm_loadu_ps(i17);
const __m128 vk17x0123 = _mm_load_ps(w + 144);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi17x0123, vk17x0123));
const __m128 vi18x0123 = _mm_loadu_ps(i18);
const __m128 vk18x0123 = _mm_load_ps(w + 152);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
const __m128 vi19x0123 = _mm_loadu_ps(i19);
const __m128 vk19x0123 = _mm_load_ps(w + 160);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi19x0123, vk19x0123));
const __m128 vi20x0123 = _mm_loadu_ps(i20);
const __m128 vk20x0123 = _mm_load_ps(w + 168);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
const __m128 vi21x0123 = _mm_loadu_ps(i21);
const __m128 vk21x0123 = _mm_load_ps(w + 176);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi21x0123, vk21x0123));
const __m128 vi22x0123 = _mm_loadu_ps(i22);
const __m128 vk22x0123 = _mm_load_ps(w + 184);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
const __m128 vi23x0123 = _mm_loadu_ps(i23);
const __m128 vk23x0123 = _mm_load_ps(w + 192);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi23x0123, vk23x0123));
const __m128 vi24x0123 = _mm_loadu_ps(i24);
const __m128 vk24x0123 = _mm_load_ps(w + 200);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
__m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
vacc0123 = _mm_min_ps(vacc0123, vmax);
if (c & 2) {
_mm_storel_pi((__m64*) output, vacc0123);
vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
output += 2;
}
if (c & 1) {
_mm_store_ss(output, vacc0123);
output += 1;
}
}
output = (float*) ((uintptr_t) output + output_increment);
} while (--output_width != 0);
}
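// Depthwise-convolution microkernel (f32, NHWC layout) with 3 kernel taps and
// a channel tile of 8 ("up8x3"). For every output pixel it reads 3 input-row
// pointers, accumulates bias + 3 products per channel, clamps the result to
// [min, max], and writes `channels` floats to the output.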
void xnn_f32_dwconv_minmax_ukernel_up8x3__sse(
size_t channels,
size_t output_width,
const float** input,
const float* weights,
float* output,
size_t input_stride,
size_t output_increment,
size_t input_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(channels != 0);
assert(output_width != 0);
const __m128 vmax = _mm_load_ps(params->sse.max);
const __m128 vmin = _mm_load_ps(params->sse.min);
do {
const float* i0 = input[0];
assert(i0 != NULL);
if XNN_UNPREDICTABLE(i0 != zero) {
i0 = (const float*) ((uintptr_t) i0 + input_offset);
}
const float* i1 = input[1];
assert(i1 != NULL);
if XNN_UNPREDICTABLE(i1 != zero) {
i1 = (const float*) ((uintptr_t) i1 + input_offset);
}
const float* i2 = input[2];
assert(i2 != NULL);
if XNN_UNPREDICTABLE(i2 != zero) {
i2 = (const float*) ((uintptr_t) i2 + input_offset);
}
input = (const float**) ((uintptr_t) input + input_stride);
size_t c = channels;
const float* w = weights;
for (; c >= 8; c -= 8) {
__m128 vacc0123p0 = _mm_load_ps(w);
__m128 vacc4567p0 = _mm_load_ps(w + 4);
const __m128 vi0x0123 = _mm_loadu_ps(i0);
const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
i0 += 8;
const __m128 vk0x0123 = _mm_load_ps(w + 8);
const __m128 vk0x4567 = _mm_load_ps(w + 12);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
const __m128 vi1x0123 = _mm_loadu_ps(i1);
const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
i1 += 8;
const __m128 vk1x0123 = _mm_load_ps(w + 16);
const __m128 vk1x4567 = _mm_load_ps(w + 20);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
const __m128 vi2x0123 = _mm_loadu_ps(i2);
const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
i2 += 8;
const __m128 vk2x0123 = _mm_load_ps(w + 24);
const __m128 vk2x4567 = _mm_load_ps(w + 28);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
w += 32;
__m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
__m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
vacc0123 = _mm_min_ps(vacc0123, vmax);
vacc4567 = _mm_min_ps(vacc4567, vmax);
_mm_storeu_ps(output, vacc0123);
_mm_storeu_ps(output + 4, vacc4567);
output += 8;
}
for (; c >= 4; c -= 4) {
__m128 vacc0123p0 = _mm_load_ps(w);
const __m128 vi0x0123 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vk0x0123 = _mm_load_ps(w + 8);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
const __m128 vi1x0123 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vk1x0123 = _mm_load_ps(w + 16);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
const __m128 vi2x0123 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vk2x0123 = _mm_load_ps(w + 24);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
w += 4;
__m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
vacc0123 = _mm_min_ps(vacc0123, vmax);
_mm_storeu_ps(output, vacc0123);
output += 4;
}
if XNN_UNLIKELY(c != 0) {
__m128 vacc0123p0 = _mm_load_ps(w);
const __m128 vi0x0123 = _mm_loadu_ps(i0);
const __m128 vk0x0123 = _mm_load_ps(w + 8);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
const __m128 vi1x0123 = _mm_loadu_ps(i1);
const __m128 vk1x0123 = _mm_load_ps(w + 16);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
const __m128 vi2x0123 = _mm_loadu_ps(i2);
const __m128 vk2x0123 = _mm_load_ps(w + 24);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
__m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
vacc0123 = _mm_min_ps(vacc0123, vmax);
if (c & 2) {
_mm_storel_pi((__m64*) output, vacc0123);
vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
output += 2;
}
if (c & 1) {
_mm_store_ss(output, vacc0123);
output += 1;
}
}
output = (float*) ((uintptr_t) output + output_increment);
} while (--output_width != 0);
}
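// Same structure as the kernel above, but with 4 kernel taps per output pixel
// ("up8x4").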
void xnn_f32_dwconv_minmax_ukernel_up8x4__sse(
size_t channels,
size_t output_width,
const float** input,
const float* weights,
float* output,
size_t input_stride,
size_t output_increment,
size_t input_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(channels != 0);
assert(output_width != 0);
const __m128 vmax = _mm_load_ps(params->sse.max);
const __m128 vmin = _mm_load_ps(params->sse.min);
do {
const float* i0 = input[0];
assert(i0 != NULL);
if XNN_UNPREDICTABLE(i0 != zero) {
i0 = (const float*) ((uintptr_t) i0 + input_offset);
}
const float* i1 = input[1];
assert(i1 != NULL);
if XNN_UNPREDICTABLE(i1 != zero) {
i1 = (const float*) ((uintptr_t) i1 + input_offset);
}
const float* i2 = input[2];
assert(i2 != NULL);
if XNN_UNPREDICTABLE(i2 != zero) {
i2 = (const float*) ((uintptr_t) i2 + input_offset);
}
const float* i3 = input[3];
assert(i3 != NULL);
if XNN_UNPREDICTABLE(i3 != zero) {
i3 = (const float*) ((uintptr_t) i3 + input_offset);
}
input = (const float**) ((uintptr_t) input + input_stride);
size_t c = channels;
const float* w = weights;
for (; c >= 8; c -= 8) {
__m128 vacc0123p0 = _mm_load_ps(w);
__m128 vacc4567p0 = _mm_load_ps(w + 4);
const __m128 vi0x0123 = _mm_loadu_ps(i0);
const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
i0 += 8;
const __m128 vk0x0123 = _mm_load_ps(w + 8);
const __m128 vk0x4567 = _mm_load_ps(w + 12);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
const __m128 vi1x0123 = _mm_loadu_ps(i1);
const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
i1 += 8;
const __m128 vk1x0123 = _mm_load_ps(w + 16);
const __m128 vk1x4567 = _mm_load_ps(w + 20);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
const __m128 vi2x0123 = _mm_loadu_ps(i2);
const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
i2 += 8;
const __m128 vk2x0123 = _mm_load_ps(w + 24);
const __m128 vk2x4567 = _mm_load_ps(w + 28);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
const __m128 vi3x0123 = _mm_loadu_ps(i3);
const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
i3 += 8;
const __m128 vk3x0123 = _mm_load_ps(w + 32);
const __m128 vk3x4567 = _mm_load_ps(w + 36);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi3x4567, vk3x4567));
w += 40;
__m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
__m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
vacc0123 = _mm_min_ps(vacc0123, vmax);
vacc4567 = _mm_min_ps(vacc4567, vmax);
_mm_storeu_ps(output, vacc0123);
_mm_storeu_ps(output + 4, vacc4567);
output += 8;
}
for (; c >= 4; c -= 4) {
__m128 vacc0123p0 = _mm_load_ps(w);
const __m128 vi0x0123 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vk0x0123 = _mm_load_ps(w + 8);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
const __m128 vi1x0123 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vk1x0123 = _mm_load_ps(w + 16);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
const __m128 vi2x0123 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vk2x0123 = _mm_load_ps(w + 24);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
const __m128 vi3x0123 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vk3x0123 = _mm_load_ps(w + 32);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
w += 4;
__m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
vacc0123 = _mm_min_ps(vacc0123, vmax);
_mm_storeu_ps(output, vacc0123);
output += 4;
}
if XNN_UNLIKELY(c != 0) {
__m128 vacc0123p0 = _mm_load_ps(w);
const __m128 vi0x0123 = _mm_loadu_ps(i0);
const __m128 vk0x0123 = _mm_load_ps(w + 8);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
const __m128 vi1x0123 = _mm_loadu_ps(i1);
const __m128 vk1x0123 = _mm_load_ps(w + 16);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
const __m128 vi2x0123 = _mm_loadu_ps(i2);
const __m128 vk2x0123 = _mm_load_ps(w + 24);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
const __m128 vi3x0123 = _mm_loadu_ps(i3);
const __m128 vk3x0123 = _mm_load_ps(w + 32);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
__m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
vacc0123 = _mm_min_ps(vacc0123, vmax);
if (c & 2) {
_mm_storel_pi((__m64*) output, vacc0123);
vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
output += 2;
}
if (c & 1) {
_mm_store_ss(output, vacc0123);
output += 1;
}
}
output = (float*) ((uintptr_t) output + output_increment);
} while (--output_width != 0);
}
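// Same structure as the kernels above, but with 9 kernel taps per output pixel
// ("up8x9", e.g. a 3x3 depthwise kernel).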
void xnn_f32_dwconv_minmax_ukernel_up8x9__sse(
size_t channels,
size_t output_width,
const float** input,
const float* weights,
float* output,
size_t input_stride,
size_t output_increment,
size_t input_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(channels != 0);
assert(output_width != 0);
const __m128 vmax = _mm_load_ps(params->sse.max);
const __m128 vmin = _mm_load_ps(params->sse.min);
do {
const float* i0 = input[0];
assert(i0 != NULL);
if XNN_UNPREDICTABLE(i0 != zero) {
i0 = (const float*) ((uintptr_t) i0 + input_offset);
}
const float* i1 = input[1];
assert(i1 != NULL);
if XNN_UNPREDICTABLE(i1 != zero) {
i1 = (const float*) ((uintptr_t) i1 + input_offset);
}
const float* i2 = input[2];
assert(i2 != NULL);
if XNN_UNPREDICTABLE(i2 != zero) {
i2 = (const float*) ((uintptr_t) i2 + input_offset);
}
const float* i3 = input[3];
assert(i3 != NULL);
if XNN_UNPREDICTABLE(i3 != zero) {
i3 = (const float*) ((uintptr_t) i3 + input_offset);
}
const float* i4 = input[4];
assert(i4 != NULL);
if XNN_UNPREDICTABLE(i4 != zero) {
i4 = (const float*) ((uintptr_t) i4 + input_offset);
}
const float* i5 = input[5];
assert(i5 != NULL);
if XNN_UNPREDICTABLE(i5 != zero) {
i5 = (const float*) ((uintptr_t) i5 + input_offset);
}
const float* i6 = input[6];
assert(i6 != NULL);
if XNN_UNPREDICTABLE(i6 != zero) {
i6 = (const float*) ((uintptr_t) i6 + input_offset);
}
const float* i7 = input[7];
assert(i7 != NULL);
if XNN_UNPREDICTABLE(i7 != zero) {
i7 = (const float*) ((uintptr_t) i7 + input_offset);
}
const float* i8 = input[8];
assert(i8 != NULL);
if XNN_UNPREDICTABLE(i8 != zero) {
i8 = (const float*) ((uintptr_t) i8 + input_offset);
}
input = (const float**) ((uintptr_t) input + input_stride);
size_t c = channels;
const float* w = weights;
for (; c >= 8; c -= 8) {
__m128 vacc0123p0 = _mm_load_ps(w);
__m128 vacc4567p0 = _mm_load_ps(w + 4);
const __m128 vi0x0123 = _mm_loadu_ps(i0);
const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
i0 += 8;
const __m128 vk0x0123 = _mm_load_ps(w + 8);
const __m128 vk0x4567 = _mm_load_ps(w + 12);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
const __m128 vi1x0123 = _mm_loadu_ps(i1);
const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
i1 += 8;
const __m128 vk1x0123 = _mm_load_ps(w + 16);
const __m128 vk1x4567 = _mm_load_ps(w + 20);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
const __m128 vi2x0123 = _mm_loadu_ps(i2);
const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
i2 += 8;
const __m128 vk2x0123 = _mm_load_ps(w + 24);
const __m128 vk2x4567 = _mm_load_ps(w + 28);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
const __m128 vi3x0123 = _mm_loadu_ps(i3);
const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
i3 += 8;
const __m128 vk3x0123 = _mm_load_ps(w + 32);
const __m128 vk3x4567 = _mm_load_ps(w + 36);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi3x4567, vk3x4567));
const __m128 vi4x0123 = _mm_loadu_ps(i4);
const __m128 vi4x4567 = _mm_loadu_ps(i4 + 4);
i4 += 8;
const __m128 vk4x0123 = _mm_load_ps(w + 40);
const __m128 vk4x4567 = _mm_load_ps(w + 44);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi4x4567, vk4x4567));
const __m128 vi5x0123 = _mm_loadu_ps(i5);
const __m128 vi5x4567 = _mm_loadu_ps(i5 + 4);
i5 += 8;
const __m128 vk5x0123 = _mm_load_ps(w + 48);
const __m128 vk5x4567 = _mm_load_ps(w + 52);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi5x4567, vk5x4567));
const __m128 vi6x0123 = _mm_loadu_ps(i6);
const __m128 vi6x4567 = _mm_loadu_ps(i6 + 4);
i6 += 8;
const __m128 vk6x0123 = _mm_load_ps(w + 56);
const __m128 vk6x4567 = _mm_load_ps(w + 60);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi6x4567, vk6x4567));
const __m128 vi7x0123 = _mm_loadu_ps(i7);
const __m128 vi7x4567 = _mm_loadu_ps(i7 + 4);
i7 += 8;
const __m128 vk7x0123 = _mm_load_ps(w + 64);
const __m128 vk7x4567 = _mm_load_ps(w + 68);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi7x4567, vk7x4567));
const __m128 vi8x0123 = _mm_loadu_ps(i8);
const __m128 vi8x4567 = _mm_loadu_ps(i8 + 4);
i8 += 8;
const __m128 vk8x0123 = _mm_load_ps(w + 72);
const __m128 vk8x4567 = _mm_load_ps(w + 76);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi8x4567, vk8x4567));
w += 80;
__m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
__m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
vacc0123 = _mm_min_ps(vacc0123, vmax);
vacc4567 = _mm_min_ps(vacc4567, vmax);
_mm_storeu_ps(output, vacc0123);
_mm_storeu_ps(output + 4, vacc4567);
output += 8;
}
for (; c >= 4; c -= 4) {
__m128 vacc0123p0 = _mm_load_ps(w);
const __m128 vi0x0123 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vk0x0123 = _mm_load_ps(w + 8);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
const __m128 vi1x0123 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vk1x0123 = _mm_load_ps(w + 16);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
const __m128 vi2x0123 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vk2x0123 = _mm_load_ps(w + 24);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
const __m128 vi3x0123 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vk3x0123 = _mm_load_ps(w + 32);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
const __m128 vi4x0123 = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vk4x0123 = _mm_load_ps(w + 40);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
const __m128 vi5x0123 = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vk5x0123 = _mm_load_ps(w + 48);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
const __m128 vi6x0123 = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vk6x0123 = _mm_load_ps(w + 56);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
const __m128 vi7x0123 = _mm_loadu_ps(i7);
i7 += 4;
const __m128 vk7x0123 = _mm_load_ps(w + 64);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
const __m128 vi8x0123 = _mm_loadu_ps(i8);
i8 += 4;
const __m128 vk8x0123 = _mm_load_ps(w + 72);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
w += 4;
__m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
vacc0123 = _mm_min_ps(vacc0123, vmax);
_mm_storeu_ps(output, vacc0123);
output += 4;
}
if XNN_UNLIKELY(c != 0) {
__m128 vacc0123p0 = _mm_load_ps(w);
const __m128 vi0x0123 = _mm_loadu_ps(i0);
const __m128 vk0x0123 = _mm_load_ps(w + 8);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
const __m128 vi1x0123 = _mm_loadu_ps(i1);
const __m128 vk1x0123 = _mm_load_ps(w + 16);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
const __m128 vi2x0123 = _mm_loadu_ps(i2);
const __m128 vk2x0123 = _mm_load_ps(w + 24);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
const __m128 vi3x0123 = _mm_loadu_ps(i3);
const __m128 vk3x0123 = _mm_load_ps(w + 32);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
const __m128 vi4x0123 = _mm_loadu_ps(i4);
const __m128 vk4x0123 = _mm_load_ps(w + 40);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
const __m128 vi5x0123 = _mm_loadu_ps(i5);
const __m128 vk5x0123 = _mm_load_ps(w + 48);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
const __m128 vi6x0123 = _mm_loadu_ps(i6);
const __m128 vk6x0123 = _mm_load_ps(w + 56);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
const __m128 vi7x0123 = _mm_loadu_ps(i7);
const __m128 vk7x0123 = _mm_load_ps(w + 64);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
const __m128 vi8x0123 = _mm_loadu_ps(i8);
const __m128 vk8x0123 = _mm_load_ps(w + 72);
vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
__m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
vacc0123 = _mm_min_ps(vacc0123, vmax);
if (c & 2) {
_mm_storel_pi((__m64*) output, vacc0123);
vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
output += 2;
}
if (c & 1) {
_mm_store_ss(output, vacc0123);
output += 1;
}
}
output = (float*) ((uintptr_t) output + output_increment);
} while (--output_width != 0);
}
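// Depthwise 3x3 convolution in CHW layout with 1-pixel padding ("3x3p1").
// Each iteration produces 2 output rows by 4 output columns ("2x4") and keeps
// 2 partial accumulators per output vector ("acc2") to shorten the dependency
// chain of the adds.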
void xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2(
size_t input_height,
size_t input_width,
const float* input,
const float* weights,
const float* zero,
float* output,
uint32_t padding_top,
const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(input_height != 0);
assert(input_width != 0);
assert(input_width % sizeof(float) == 0);
assert(padding_top == 1);
const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
const __m128 vmax = _mm_load_ps(params->sse.max);
const __m128 vmin = _mm_load_ps(params->sse.min);
const __m128 vbias = _mm_load1_ps(weights);
const __m128 vk00 = _mm_load1_ps(weights + 1);
const __m128 vk01 = _mm_load1_ps(weights + 2);
const __m128 vk02 = _mm_load1_ps(weights + 3);
const __m128 vk10 = _mm_load1_ps(weights + 4);
const __m128 vk11 = _mm_load1_ps(weights + 5);
const __m128 vk12 = _mm_load1_ps(weights + 6);
const __m128 vk20 = _mm_load1_ps(weights + 7);
const __m128 vk21 = _mm_load1_ps(weights + 8);
const __m128 vk22 = _mm_load1_ps(weights + 9);
const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));
const float* i0 = zero;
const float* i1 = input;
const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
float* o0 = output;
float* o1 = (float*) ((uintptr_t) o0 + input_width);
size_t output_height = input_height;
do {
if XNN_UNPREDICTABLE(output_height < 2) {
i2 = zero;
o1 = o0;
}
if XNN_UNPREDICTABLE(output_height < 3) {
i3 = zero;
}
// vi0x3012 = ( vi02, vi01, vi00, vi03 )

__m128 vi0x3012 = _mm_setzero_ps();
// vi1x3012 = ( vi12, vi11, vi10, vi13 )
__m128 vi1x3012 = _mm_setzero_ps();
// vi2x3012 = ( vi22, vi21, vi20, vi23 )
__m128 vi2x3012 = _mm_setzero_ps();
// vi3x3012 = ( vi32, vi31, vi30, vi33 )
__m128 vi3x3012 = _mm_setzero_ps();
__m128 vi0x4567 = _mm_loadu_ps(i0);
i0 += 4;
__m128 vi1x4567 = _mm_loadu_ps(i1);
i1 += 4;
__m128 vi2x4567 = _mm_loadu_ps(i2);
i2 += 4;
__m128 vi3x4567 = _mm_loadu_ps(i3);
i3 += 4;
size_t w = input_width;
for (; w > 4 * sizeof(float); w -= 4 * sizeof(float)) {
// vi0x89AB = ( vi0B, vi0A, vi09, vi08 )
const __m128 vi0x89AB = _mm_loadu_ps(i0);
i0 += 4;
// vi1x89AB = ( vi1B, vi1A, vi19, vi18 )
const __m128 vi1x89AB = _mm_loadu_ps(i1);
i1 += 4;
// vi2x89AB = ( vi2B, vi2A, vi29, vi28 )
const __m128 vi2x89AB = _mm_loadu_ps(i2);
i2 += 4;
// vi3x89AB = ( vi3B, vi3A, vi39, vi38 )
const __m128 vi3x89AB = _mm_loadu_ps(i3);
i3 += 4;
// vi0x7456 = ( vi06, vi05, vi04, vi07 )
const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
// vi1x7456 = ( vi16, vi15, vi14, vi17 )
const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
// vi2x7456 = ( vi26, vi25, vi24, vi27 )
const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
// vi3x7456 = ( vi36, vi35, vi34, vi37 )
const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
__m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk01));
__m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk01));
__m128 vo0p1 = _mm_mul_ps(vi1x4567, vk11);
__m128 vo1p1 = _mm_mul_ps(vi2x4567, vk11);
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk21));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk21));
// vi0x3456 = ( vi06, vi05, vi04, vi03 )
const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
// vi1x3456 = ( vi16, vi15, vi14, vi13 )
const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
// vi2x3456 = ( vi26, vi25, vi24, vi23 )
const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
// vi3x3456 = ( vi36, vi35, vi34, vi33 )
const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk00));
vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x3456, vk00));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk10));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk10));
vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk20));
vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk20));
vi0x3012 = vi0x7456;
vi1x3012 = vi1x7456;
vi2x3012 = vi2x7456;
vi3x3012 = vi3x7456;
// vi0x8567 = ( vi07, vi06, vi05, vi08 )
const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
// vi1x8567 = ( vi17, vi16, vi15, vi18 )
const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
// vi2x8567 = ( vi27, vi26, vi25, vi28 )
const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
// vi3x8567 = ( vi37, vi36, vi35, vi38 )
const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
// vi0x5678 = ( vi08, vi07, vi06, vi05 )
const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
// vi1x5678 = ( vi18, vi17, vi16, vi15 )
const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
// vi2x5678 = ( vi28, vi27, vi26, vi25 )
const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
// vi3x5678 = ( vi38, vi37, vi36, vi35 )
const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x5678, vk12));
vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x5678, vk12));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22));
vi0x4567 = vi0x89AB;
vi1x4567 = vi1x89AB;
vi2x4567 = vi2x89AB;
vi3x4567 = vi3x89AB;
vo0p0 = _mm_add_ps(vo0p0, vo0p1);
vo1p0 = _mm_add_ps(vo1p0, vo1p1);
__m128 vo0 = _mm_max_ps(vo0p0, vmin);
__m128 vo1 = _mm_max_ps(vo1p0, vmin);
vo0 = _mm_min_ps(vo0, vmax);
vo1 = _mm_min_ps(vo1, vmax);
_mm_storeu_ps(o1, vo1);
o1 += 4;
_mm_storeu_ps(o0, vo0);
o0 += 4;
}
// Always process the last block of 1..4 pixels.
assert(w >= 1 * sizeof(float));
assert(w <= 4 * sizeof(float));
{
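// The trailing 1..4 columns were already loaded into vi*x4567 above; vmask
// zeroes the lanes beyond input_width so the out-of-range columns contribute
// nothing to the products below.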
vi0x4567 = _mm_and_ps(vmask, vi0x4567);
vi1x4567 = _mm_and_ps(vmask, vi1x4567);
vi2x4567 = _mm_and_ps(vmask, vi2x4567);
vi3x4567 = _mm_and_ps(vmask, vi3x4567);
// vi0x7456 = ( vi06, vi05, vi04, vi07 )
const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
// vi1x7456 = ( vi16, vi15, vi14, vi17 )
const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
// vi2x7456 = ( vi26, vi25, vi24, vi27 )
const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
// vi3x7456 = ( vi36, vi35, vi34, vi37 )
const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
__m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk01));
__m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk01));
__m128 vo0p1 = _mm_mul_ps(vi1x4567, vk11);
__m128 vo1p1 = _mm_mul_ps(vi2x4567, vk11);
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk21));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk21));
// vi0x3456 = ( vi06, vi05, vi04, vi03 )
const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
// vi1x3456 = ( vi16, vi15, vi14, vi13 )
const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
// vi2x3456 = ( vi26, vi25, vi24, vi23 )
const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
// vi3x3456 = ( vi36, vi35, vi34, vi33 )
const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk00));
vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi1x3456, vk00));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk10));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk10));
vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk20));
vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi3x3456, vk20));
const __m128 vzero = _mm_setzero_ps();
// vi0x8567 = ( vi07, vi06, vi05, 0.0 )
const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vzero);
// vi1x8567 = ( vi17, vi16, vi15, 0.0 )
const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero);
// vi2x8567 = ( vi27, vi26, vi25, 0.0 )
const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vzero);
// vi3x8567 = ( vi37, vi36, vi35, 0.0 )
const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vzero);
// vi0x5678 = ( vi08, vi07, vi06, vi05 )
const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
// vi1x5678 = ( vi18, vi17, vi16, vi15 )
const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
// vi2x5678 = ( vi28, vi27, vi26, vi25 )
const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
// vi3x5678 = ( vi38, vi37, vi36, vi35 )
const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x5678, vk12));
vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi2x5678, vk12));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22));
vo0p0 = _mm_add_ps(vo0p0, vo0p1);
vo1p0 = _mm_add_ps(vo1p0, vo1p1);
__m128 vo0 = _mm_max_ps(vo0p0, vmin);
__m128 vo1 = _mm_max_ps(vo1p0, vmin);
vo0 = _mm_min_ps(vo0, vmax);
vo1 = _mm_min_ps(vo1, vmax);
if XNN_LIKELY(w == 4 * sizeof(float)) {
_mm_storeu_ps(o1, vo1);
o1 += 4;
_mm_storeu_ps(o0, vo0);
o0 += 4;
} else {
if (w & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) o1, vo1);
o1 += 2;
_mm_storel_pi((__m64*) o0, vo0);
o0 += 2;
vo0 = _mm_movehl_ps(vo0, vo0);
vo1 = _mm_movehl_ps(vo1, vo1);
}
if (w & (1 * sizeof(float))) {
_mm_store_ss(o1, vo1);
o1 += 1;
_mm_store_ss(o0, vo0);
o0 += 1;
}
}
}
i0 = (const float*) ((uintptr_t) i2 - input_decrement);
i1 = (const float*) ((uintptr_t) i3 - input_decrement);
i2 = (const float*) ((uintptr_t) i1 + input_width);
i3 = (const float*) ((uintptr_t) i2 + input_width);
o0 = o1;
o1 = (float*) ((uintptr_t) o0 + input_width);
output_height = doz(output_height, 2);
} while (output_height != 0);
}
void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3(
size_t input_height,
size_t input_width,
const float* input,
const float* weights,
const float* zero,
float* output,
uint32_t padding_top,
const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(input_height != 0);
assert(input_width != 0);
assert(input_width % sizeof(float) == 0);
assert(padding_top >= 0);
assert(padding_top <= 1);
const __m128 vmask_even = _mm_load_ps((const float*) params->sse.mask_even);
const __m128 vmask_odd = _mm_load_ps((const float*) params->sse.mask_odd);
const __m128 vmax = _mm_load_ps(params->sse.max);
const __m128 vmin = _mm_load_ps(params->sse.min);
const __m128 vbias = _mm_load1_ps(weights);
const __m128 vk00 = _mm_load1_ps(weights + 1);
const __m128 vk01 = _mm_load1_ps(weights + 2);
const __m128 vk02 = _mm_load1_ps(weights + 3);
const __m128 vk10 = _mm_load1_ps(weights + 4);
const __m128 vk11 = _mm_load1_ps(weights + 5);
const __m128 vk12 = _mm_load1_ps(weights + 6);
const __m128 vk20 = _mm_load1_ps(weights + 7);
const __m128 vk21 = _mm_load1_ps(weights + 8);
const __m128 vk22 = _mm_load1_ps(weights + 9);
const size_t input_decrement = round_down_po2(input_width, 4 /* SIMD output width */ * 2 /* subsampling */ * sizeof(float));
const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width));
const float* i1 = (const float*) ((uintptr_t) i0 + input_width);
if XNN_UNPREDICTABLE(padding_top != 0) {
i0 = zero;
}
const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
float* o0 = output;
size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
do {
if XNN_UNPREDICTABLE(padded_input_height < 4) {
i2 = zero;
}
__m128 vi0x7531 = _mm_setzero_ps();
__m128 vi1x7531 = _mm_setzero_ps();
__m128 vi2x7531 = _mm_setzero_ps();
size_t w = input_width;
for (; w >= 8 * sizeof(float); w -= 8 * sizeof(float)) {
const __m128 vi0x89AB = _mm_loadu_ps(i0);
const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
i0 += 8;
const __m128 vi1x89AB = _mm_loadu_ps(i1);
const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
i1 += 8;
const __m128 vi2x89AB = _mm_loadu_ps(i2);
const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
i2 += 8;
const __m128 vi0x8ACE = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vi0x9BDF = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
const __m128 vi1x8ACE = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vi1x9BDF = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
const __m128 vi2x8ACE = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vi2x9BDF = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
__m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk01));
__m128 vo0p1 = _mm_mul_ps(vi1x8ACE, vk11);
__m128 vo0p2 = _mm_mul_ps(vi2x8ACE, vk21);
const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk02));
vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x9BDF, vk12));
vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x9BDF, vk22));
const __m128 vi0x7BDF = _mm_move_ss(vi0xF9BD, vi0x7531);
const __m128 vi1x7BDF = _mm_move_ss(vi1xF9BD, vi1x7531);
const __m128 vi2x7BDF = _mm_move_ss(vi2xF9BD, vi2x7531);
vi0x7531 = vi0xF9BD;
vi1x7531 = vi1xF9BD;
vi2x7531 = vi2xF9BD;
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x7BDF, vk00));
vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x7BDF, vk10));
vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x7BDF, vk20));
vo0p0 = _mm_add_ps(vo0p0, vo0p1);
vo0p0 = _mm_add_ps(vo0p0, vo0p2);
__m128 vo0 = _mm_max_ps(vo0p0, vmin);
vo0 = _mm_min_ps(vo0, vmax);
_mm_storeu_ps(o0, vo0);
o0 += 4;
}
// Potentially process the last block of 0..7 pixels.
assert(w < 8 * sizeof(float));
if XNN_LIKELY(w != 0) {
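// Here 0 < w < 8 * sizeof(float): load two 4-wide blocks (reads past the row
// end are permitted under XNN_OOB_READS), deinterleave them into even (8ACE)
// and odd (9BDF) columns, and zero the out-of-range lanes with
// vmask_even / vmask_odd.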
const __m128 vi0x89AB = _mm_loadu_ps(i0);
const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
const __m128 vi1x89AB = _mm_loadu_ps(i1);
const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
const __m128 vi2x89AB = _mm_loadu_ps(i2);
const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
const __m128 vi0x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
const __m128 vi0x9BDF = _mm_and_ps(vmask_odd, _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
const __m128 vi1x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
const __m128 vi1x9BDF = _mm_and_ps(vmask_odd, _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
const __m128 vi2x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
const __m128 vi2x9BDF = _mm_and_ps(vmask_odd, _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
__m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk01));
__m128 vo0p1 = _mm_mul_ps(vi1x8ACE, vk11);
__m128 vo0p2 = _mm_mul_ps(vi2x8ACE, vk21);
const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk02));
vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x9BDF, vk12));
vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x9BDF, vk22));
const __m128 vi0x7BDF = _mm_move_ss(vi0xF9BD, vi0x7531);
const __m128 vi1x7BDF = _mm_move_ss(vi1xF9BD, vi1x7531);
const __m128 vi2x7BDF = _mm_move_ss(vi2xF9BD, vi2x7531);
vi0x7531 = vi0xF9BD;
vi1x7531 = vi1xF9BD;
vi2x7531 = vi2xF9BD;
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x7BDF, vk00));
vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x7BDF, vk10));
vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x7BDF, vk20));
vo0p0 = _mm_add_ps(vo0p0, vo0p1);
vo0p0 = _mm_add_ps(vo0p0, vo0p2);
__m128 vo0 = _mm_max_ps(vo0p0, vmin);
vo0 = _mm_min_ps(vo0, vmax);
if (w == 7 * sizeof(float)) {
_mm_storeu_ps(o0, vo0);
o0 += 4;
} else {
w += 1 * sizeof(float);
if (w & (4 * sizeof(float))) {
_mm_storel_pi((__m64*) o0, vo0);
o0 += 2;
vo0 = _mm_movehl_ps(vo0, vo0);
}
if (w & (2 * sizeof(float))) {
_mm_store_ss(o0, vo0);
o0 += 1;
}
}
}
i0 = (const float*) ((uintptr_t) i2 - input_decrement);
i1 = (const float*) ((uintptr_t) i0 + input_width);
i2 = (const float*) ((uintptr_t) i1 + input_width);
output_height -= 1;
padded_input_height -= 2;
} while (output_height != 0);
}
void xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4(
size_t input_height,
size_t input_width,
const float* input,
const float* weights,
const float* zero,
float* output,
uint32_t padding_top,
const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(input_height != 0);
assert(input_width != 0);
assert(input_width % sizeof(float) == 0);
assert(padding_top == 2);
const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
const __m128 vmax = _mm_load_ps(params->sse.max);
const __m128 vmin = _mm_load_ps(params->sse.min);
const __m128 vbias = _mm_load1_ps(weights);
const __m128 vk00 = _mm_load1_ps(weights + 1);
const __m128 vk01 = _mm_load1_ps(weights + 2);
const __m128 vk02 = _mm_load1_ps(weights + 3);
const __m128 vk03 = _mm_load1_ps(weights + 4);
const __m128 vk04 = _mm_load1_ps(weights + 5);
const __m128 vk10 = _mm_load1_ps(weights + 6);
const __m128 vk11 = _mm_load1_ps(weights + 7);
const __m128 vk12 = _mm_load1_ps(weights + 8);
const __m128 vk13 = _mm_load1_ps(weights + 9);
const __m128 vk14 = _mm_load1_ps(weights + 10);
const __m128 vk20 = _mm_load1_ps(weights + 11);
const __m128 vk21 = _mm_load1_ps(weights + 12);
const __m128 vk22 = _mm_load1_ps(weights + 13);
const __m128 vk23 = _mm_load1_ps(weights + 14);
const __m128 vk24 = _mm_load1_ps(weights + 15);
const __m128 vk30 = _mm_load1_ps(weights + 16);
const __m128 vk31 = _mm_load1_ps(weights + 17);
const __m128 vk32 = _mm_load1_ps(weights + 18);
const __m128 vk33 = _mm_load1_ps(weights + 19);
const __m128 vk34 = _mm_load1_ps(weights + 20);
const __m128 vk40 = _mm_load1_ps(weights + 21);
const __m128 vk41 = _mm_load1_ps(weights + 22);
const __m128 vk42 = _mm_load1_ps(weights + 23);
const __m128 vk43 = _mm_load1_ps(weights + 24);
const __m128 vk44 = _mm_load1_ps(weights + 25);
const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));
const float* i0 = zero;
const float* i1 = zero;
const float* i2 = input;
const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
const float* i7 = (const float*) ((uintptr_t) i6 + input_width);
float* o0 = output;
float* o1 = (float*) ((uintptr_t) o0 + input_width);
float* o2 = (float*) ((uintptr_t) o1 + input_width);
float* o3 = (float*) ((uintptr_t) o2 + input_width);
size_t output_height = input_height;
do {
if XNN_UNPREDICTABLE(output_height < 2) {
i3 = zero;
o1 = o0;
}
if XNN_UNPREDICTABLE(output_height < 3) {
i4 = zero;
o2 = o1;
}
if XNN_UNPREDICTABLE(output_height < 4) {
i5 = zero;
o3 = o2;
}
if XNN_UNPREDICTABLE(output_height < 5) {
i6 = zero;
}
if XNN_UNPREDICTABLE(output_height < 6) {
i7 = zero;
}
__m128 vi0x3012 = _mm_setzero_ps();
__m128 vi1x3012 = _mm_setzero_ps();
__m128 vi2x3012 = _mm_setzero_ps();
__m128 vi3x3012 = _mm_setzero_ps();
__m128 vi4x3012 = _mm_setzero_ps();
__m128 vi5x3012 = _mm_setzero_ps();
__m128 vi6x3012 = _mm_setzero_ps();
__m128 vi7x3012 = _mm_setzero_ps();
__m128 vi0x4567 = _mm_loadu_ps(i0);
i0 += 4;
__m128 vi1x4567 = _mm_loadu_ps(i1);
i1 += 4;
__m128 vi2x4567 = _mm_loadu_ps(i2);
i2 += 4;
__m128 vi3x4567 = _mm_loadu_ps(i3);
i3 += 4;
__m128 vi4x4567 = _mm_loadu_ps(i4);
i4 += 4;
__m128 vi5x4567 = _mm_loadu_ps(i5);
i5 += 4;
__m128 vi6x4567 = _mm_loadu_ps(i6);
i6 += 4;
__m128 vi7x4567 = _mm_loadu_ps(i7);
i7 += 4;
size_t w = input_width;
for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
__m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
__m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
__m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
__m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));
const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi0x89AB = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1x89AB = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vi2x89AB = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vi3x89AB = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vi4x89AB = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vi5x89AB = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vi6x89AB = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vi7x89AB = _mm_loadu_ps(i7);
i7 += 4;
const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
vi0x3012 = vi0x7456;
const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
vi1x3012 = vi1x7456;
const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
vi2x3012 = vi2x7456;
const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
vi3x3012 = vi3x7456;
const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
vi4x3012 = vi4x7456;
const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
vi5x3012 = vi5x7456;
const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
vi6x3012 = vi6x7456;
const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
vi7x3012 = vi7x7456;
const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
vi0x4567 = vi0x89AB;
const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
vi1x4567 = vi1x89AB;
const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
vi2x4567 = vi2x89AB;
const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
vi3x4567 = vi3x89AB;
const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
vi4x4567 = vi4x89AB;
const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
vi5x4567 = vi5x89AB;
const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vi6x89AB);
vi6x4567 = vi6x89AB;
const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vi7x89AB);
vi7x4567 = vi7x89AB;
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));
const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));
const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));
__m128 vo0 = _mm_max_ps(vo0p0, vmin);
__m128 vo1 = _mm_max_ps(vo1p0, vmin);
__m128 vo2 = _mm_max_ps(vo2p0, vmin);
__m128 vo3 = _mm_max_ps(vo3p0, vmin);
vo0 = _mm_min_ps(vo0, vmax);
vo1 = _mm_min_ps(vo1, vmax);
vo2 = _mm_min_ps(vo2, vmax);
vo3 = _mm_min_ps(vo3, vmax);
_mm_storeu_ps(o3, vo3);
o3 += 4;
_mm_storeu_ps(o2, vo2);
o2 += 4;
_mm_storeu_ps(o1, vo1);
o1 += 4;
_mm_storeu_ps(o0, vo0);
o0 += 4;
}
// Process one more block of 4 pixels if 5..8 pixels remain in the row.
if XNN_LIKELY(w > 4 * sizeof(float)) {
__m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
__m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
__m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
__m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));
const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));
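// Load the next 4 columns with the row-end mask applied, so any columns past the valid width contribute zero.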
const __m128 vi0x89AB = _mm_and_ps(_mm_loadu_ps(i0), vmask);
i0 += 4;
const __m128 vi1x89AB = _mm_and_ps(_mm_loadu_ps(i1), vmask);
i1 += 4;
const __m128 vi2x89AB = _mm_and_ps(_mm_loadu_ps(i2), vmask);
i2 += 4;
const __m128 vi3x89AB = _mm_and_ps(_mm_loadu_ps(i3), vmask);
i3 += 4;
const __m128 vi4x89AB = _mm_and_ps(_mm_loadu_ps(i4), vmask);
i4 += 4;
const __m128 vi5x89AB = _mm_and_ps(_mm_loadu_ps(i5), vmask);
i5 += 4;
const __m128 vi6x89AB = _mm_and_ps(_mm_loadu_ps(i6), vmask);
i6 += 4;
const __m128 vi7x89AB = _mm_and_ps(_mm_loadu_ps(i7), vmask);
i7 += 4;
const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
vi0x3012 = vi0x7456;
const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
vi1x3012 = vi1x7456;
const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
vi2x3012 = vi2x7456;
const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
vi3x3012 = vi3x7456;
const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
vi4x3012 = vi4x7456;
const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
vi5x3012 = vi5x7456;
const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
vi6x3012 = vi6x7456;
const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
vi7x3012 = vi7x7456;
const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
vi0x4567 = vi0x89AB;
const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
vi1x4567 = vi1x89AB;
const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
vi2x4567 = vi2x89AB;
const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
vi3x4567 = vi3x89AB;
const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
vi4x4567 = vi4x89AB;
const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
vi5x4567 = vi5x89AB;
const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vi6x89AB);
vi6x4567 = vi6x89AB;
const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vi7x89AB);
vi7x4567 = vi7x89AB;
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));
const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));
const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));
__m128 vo0 = _mm_max_ps(vo0p0, vmin);
__m128 vo1 = _mm_max_ps(vo1p0, vmin);
__m128 vo2 = _mm_max_ps(vo2p0, vmin);
__m128 vo3 = _mm_max_ps(vo3p0, vmin);
vo0 = _mm_min_ps(vo0, vmax);
vo1 = _mm_min_ps(vo1, vmax);
vo2 = _mm_min_ps(vo2, vmax);
vo3 = _mm_min_ps(vo3, vmax);
_mm_storeu_ps(o3, vo3);
o3 += 4;
_mm_storeu_ps(o2, vo2);
o2 += 4;
_mm_storeu_ps(o1, vo1);
o1 += 4;
_mm_storeu_ps(o0, vo0);
o0 += 4;
w -= 4 * sizeof(float);
}
assert(w >= 1 * sizeof(float));
assert(w <= 4 * sizeof(float));
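// Final block of 1..4 pixels: mask off the columns beyond the row end before accumulating.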
{
vi0x4567 = _mm_and_ps(vi0x4567, vmask);
vi1x4567 = _mm_and_ps(vi1x4567, vmask);
vi2x4567 = _mm_and_ps(vi2x4567, vmask);
vi3x4567 = _mm_and_ps(vi3x4567, vmask);
vi4x4567 = _mm_and_ps(vi4x4567, vmask);
vi5x4567 = _mm_and_ps(vi5x4567, vmask);
vi6x4567 = _mm_and_ps(vi6x4567, vmask);
vi7x4567 = _mm_and_ps(vi7x4567, vmask);
__m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
__m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
__m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
__m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));
const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vzero = _mm_setzero_ps();
const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vzero);
const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero);
const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vzero);
const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vzero);
const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vzero);
const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vzero);
const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vzero);
const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vzero);
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));
const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));
const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));
__m128 vo0 = _mm_max_ps(vo0p0, vmin);
__m128 vo1 = _mm_max_ps(vo1p0, vmin);
__m128 vo2 = _mm_max_ps(vo2p0, vmin);
__m128 vo3 = _mm_max_ps(vo3p0, vmin);
vo0 = _mm_min_ps(vo0, vmax);
vo1 = _mm_min_ps(vo1, vmax);
vo2 = _mm_min_ps(vo2, vmax);
vo3 = _mm_min_ps(vo3, vmax);
if XNN_LIKELY(w & (4 * sizeof(float))) {
_mm_storeu_ps(o3, vo3);
o3 += 4;
_mm_storeu_ps(o2, vo2);
o2 += 4;
_mm_storeu_ps(o1, vo1);
o1 += 4;
_mm_storeu_ps(o0, vo0);
o0 += 4;
} else {
if (w & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) o3, vo3);
o3 += 2;
_mm_storel_pi((__m64*) o2, vo2);
o2 += 2;
_mm_storel_pi((__m64*) o1, vo1);
o1 += 2;
_mm_storel_pi((__m64*) o0, vo0);
o0 += 2;
vo0 = _mm_movehl_ps(vo0, vo0);
vo1 = _mm_movehl_ps(vo1, vo1);
vo2 = _mm_movehl_ps(vo2, vo2);
vo3 = _mm_movehl_ps(vo3, vo3);
}
if (w & (1 * sizeof(float))) {
_mm_store_ss(o3, vo3);
o3 += 1;
_mm_store_ss(o2, vo2);
o2 += 1;
_mm_store_ss(o1, vo1);
o1 += 1;
_mm_store_ss(o0, vo0);
o0 += 1;
}
}
}
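// Rebase the row pointers for the next block of 4 output rows: the old i4/i5 (minus the over-read) become the new i0/i1, and the output pointers advance by 4 rows.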
i0 = (const float*) ((uintptr_t) i4 - input_decrement);
i1 = (const float*) ((uintptr_t) i5 - input_decrement);
i2 = (const float*) ((uintptr_t) i1 + input_width);
i3 = (const float*) ((uintptr_t) i2 + input_width);
i4 = (const float*) ((uintptr_t) i3 + input_width);
i5 = (const float*) ((uintptr_t) i4 + input_width);
i6 = (const float*) ((uintptr_t) i5 + input_width);
i7 = (const float*) ((uintptr_t) i6 + input_width);
o0 = o3;
o1 = (float*) ((uintptr_t) o0 + input_width);
o2 = (float*) ((uintptr_t) o1 + input_width);
o3 = (float*) ((uintptr_t) o2 + input_width);
output_height = doz(output_height, 4);
} while (output_height != 0);
}
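// Depthwise 2D convolution in CHW layout with a 5x5 kernel and stride 2 (the "5x5s2p2" in the name), computing 2 output rows of 4 pixels per main-loop iteration.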
void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4(
size_t input_height,
size_t input_width,
const float* input,
const float* weights,
const float* zero,
float* output,
uint32_t padding_top,
const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(input_height != 0);
assert(input_width != 0);
assert(input_width % sizeof(float) == 0);
assert(padding_top >= 1);
assert(padding_top <= 2);
const __m128 vmask_even = _mm_load_ps((const float*) params->sse.mask_even);
const __m128 vmask_odd = _mm_load_ps((const float*) params->sse.mask_odd);
const __m128 vmax = _mm_load_ps(params->sse.max);
const __m128 vmin = _mm_load_ps(params->sse.min);
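// weights[0] is the bias; weights[1..25] hold the 5x5 kernel taps in row-major order, each broadcast to all four lanes.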
const __m128 vbias = _mm_load1_ps(weights);
const __m128 vk00 = _mm_load1_ps(weights + 1);
const __m128 vk01 = _mm_load1_ps(weights + 2);
const __m128 vk02 = _mm_load1_ps(weights + 3);
const __m128 vk03 = _mm_load1_ps(weights + 4);
const __m128 vk04 = _mm_load1_ps(weights + 5);
const __m128 vk10 = _mm_load1_ps(weights + 6);
const __m128 vk11 = _mm_load1_ps(weights + 7);
const __m128 vk12 = _mm_load1_ps(weights + 8);
const __m128 vk13 = _mm_load1_ps(weights + 9);
const __m128 vk14 = _mm_load1_ps(weights + 10);
const __m128 vk20 = _mm_load1_ps(weights + 11);
const __m128 vk21 = _mm_load1_ps(weights + 12);
const __m128 vk22 = _mm_load1_ps(weights + 13);
const __m128 vk23 = _mm_load1_ps(weights + 14);
const __m128 vk24 = _mm_load1_ps(weights + 15);
const __m128 vk30 = _mm_load1_ps(weights + 16);
const __m128 vk31 = _mm_load1_ps(weights + 17);
const __m128 vk32 = _mm_load1_ps(weights + 18);
const __m128 vk33 = _mm_load1_ps(weights + 19);
const __m128 vk34 = _mm_load1_ps(weights + 20);
const __m128 vk40 = _mm_load1_ps(weights + 21);
const __m128 vk41 = _mm_load1_ps(weights + 22);
const __m128 vk42 = _mm_load1_ps(weights + 23);
const __m128 vk43 = _mm_load1_ps(weights + 24);
const __m128 vk44 = _mm_load1_ps(weights + 25);
const uint32_t padding_top_less_1 = padding_top - 1;
const size_t input_decrement = round_up_po2(input_width, 8 * sizeof(float));
const float* i0 = zero;
const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width));
const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
if XNN_UNPREDICTABLE(padding_top_less_1 != 0) {
i1 = zero;
}
const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
const size_t output_width = round_down_po2((input_width + (4 /* padding */ - 5 /* kernel size */ + 2 /* subsampling */) * sizeof(float)) / 2, sizeof(float));
float* o0 = output;
float* o1 = (float*) ((uintptr_t) o0 + output_width);
size_t padded_input_height = input_height + (padding_top_less_1 + 1) + 2 /* padding bottom */;
size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2;
do {
if XNN_UNPREDICTABLE(padded_input_height < 6) {
i3 = zero;
}
if XNN_UNPREDICTABLE(padded_input_height < 7) {
i4 = zero;
o1 = o0;
}
if XNN_UNPREDICTABLE(padded_input_height < 8) {
i5 = zero;
}
if XNN_UNPREDICTABLE(padded_input_height < 9) {
i6 = zero;
}
__m128 vi0x6024 = _mm_setzero_ps();
__m128 vi1x6024 = _mm_setzero_ps();
__m128 vi2x6024 = _mm_setzero_ps();
__m128 vi3x6024 = _mm_setzero_ps();
__m128 vi4x6024 = _mm_setzero_ps();
__m128 vi5x6024 = _mm_setzero_ps();
__m128 vi6x6024 = _mm_setzero_ps();
__m128 vi0x7135 = _mm_setzero_ps();
__m128 vi1x7135 = _mm_setzero_ps();
__m128 vi2x7135 = _mm_setzero_ps();
__m128 vi3x7135 = _mm_setzero_ps();
__m128 vi4x7135 = _mm_setzero_ps();
__m128 vi5x7135 = _mm_setzero_ps();
__m128 vi6x7135 = _mm_setzero_ps();
const __m128 vi0x89AB = _mm_loadu_ps(i0);
const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
i0 += 8;
const __m128 vi1x89AB = _mm_loadu_ps(i1);
const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
i1 += 8;
const __m128 vi2x89AB = _mm_loadu_ps(i2);
const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
i2 += 8;
const __m128 vi3x89AB = _mm_loadu_ps(i3);
const __m128 vi3xCDEF = _mm_loadu_ps(i3 + 4);
i3 += 8;
const __m128 vi4x89AB = _mm_loadu_ps(i4);
const __m128 vi4xCDEF = _mm_loadu_ps(i4 + 4);
i4 += 8;
const __m128 vi5x89AB = _mm_loadu_ps(i5);
const __m128 vi5xCDEF = _mm_loadu_ps(i5 + 4);
i5 += 8;
const __m128 vi6x89AB = _mm_loadu_ps(i6);
const __m128 vi6xCDEF = _mm_loadu_ps(i6 + 4);
i6 += 8;
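// Deinterleave the 8 loaded columns into even (8, A, C, E) and odd (9, B, D, F) lanes for the stride-2 convolution.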
__m128 vi0x8ACE = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
__m128 vi0x9BDF = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
__m128 vi1x8ACE = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
__m128 vi1x9BDF = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
__m128 vi2x8ACE = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
__m128 vi2x9BDF = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
__m128 vi3x8ACE = _mm_shuffle_ps(vi3x89AB, vi3xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
__m128 vi3x9BDF = _mm_shuffle_ps(vi3x89AB, vi3xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
__m128 vi4x8ACE = _mm_shuffle_ps(vi4x89AB, vi4xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
__m128 vi4x9BDF = _mm_shuffle_ps(vi4x89AB, vi4xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
__m128 vi5x8ACE = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
__m128 vi5x9BDF = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
__m128 vi6x8ACE = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
__m128 vi6x9BDF = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
size_t w = input_width;
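// Main loop: consume 8 input columns (4 output columns per row) per iteration while more than 8 columns remain.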
for (; w > 8 * sizeof(float); w -= 8 * sizeof(float)) {
__m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk02));
__m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x8ACE, vk02));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x8ACE, vk12));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x8ACE, vk12));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x8ACE, vk22));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x8ACE, vk22));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x8ACE, vk32));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x8ACE, vk42));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x8ACE, vk42));
const __m128 vi0xE8AC = _mm_shuffle_ps(vi0x8ACE, vi0x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi1xE8AC = _mm_shuffle_ps(vi1x8ACE, vi1x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi2xE8AC = _mm_shuffle_ps(vi2x8ACE, vi2x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi3xE8AC = _mm_shuffle_ps(vi3x8ACE, vi3x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi4xE8AC = _mm_shuffle_ps(vi4x8ACE, vi4x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi6xE8AC = _mm_shuffle_ps(vi6x8ACE, vi6x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk03));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x9BDF, vk03));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x9BDF, vk13));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x9BDF, vk13));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x9BDF, vk23));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x9BDF, vk23));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x9BDF, vk33));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x9BDF, vk33));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x9BDF, vk43));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x9BDF, vk43));
const __m128 vi0x68AC = _mm_move_ss(vi0xE8AC, vi0x6024);
vi0x6024 = vi0xE8AC;
const __m128 vi1x68AC = _mm_move_ss(vi1xE8AC, vi1x6024);
vi1x6024 = vi1xE8AC;
const __m128 vi2x68AC = _mm_move_ss(vi2xE8AC, vi2x6024);
vi2x6024 = vi2xE8AC;
const __m128 vi3x68AC = _mm_move_ss(vi3xE8AC, vi3x6024);
vi3x6024 = vi3xE8AC;
const __m128 vi4x68AC = _mm_move_ss(vi4xE8AC, vi4x6024);
vi4x6024 = vi4xE8AC;
const __m128 vi5x68AC = _mm_move_ss(vi5xE8AC, vi5x6024);
vi5x6024 = vi5xE8AC;
const __m128 vi6x68AC = _mm_move_ss(vi6xE8AC, vi6x6024);
vi6x6024 = vi6xE8AC;
const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi3xF9BD = _mm_shuffle_ps(vi3x9BDF, vi3x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi4xF9BD = _mm_shuffle_ps(vi4x9BDF, vi4x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi5xF9BD = _mm_shuffle_ps(vi5x9BDF, vi5x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi6xF9BD = _mm_shuffle_ps(vi6x9BDF, vi6x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x68AC, vk00));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x68AC, vk00));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x68AC, vk10));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x68AC, vk10));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x68AC, vk20));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x68AC, vk20));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x68AC, vk30));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x68AC, vk30));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x68AC, vk40));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x68AC, vk40));
const __m128 vi0xGHIJ = _mm_loadu_ps(i0);
const __m128 vi0xKLMN = _mm_loadu_ps(i0 + 4);
i0 += 8;
const __m128 vi1xGHIJ = _mm_loadu_ps(i1);
const __m128 vi1xKLMN = _mm_loadu_ps(i1 + 4);
i1 += 8;
const __m128 vi2xGHIJ = _mm_loadu_ps(i2);
const __m128 vi2xKLMN = _mm_loadu_ps(i2 + 4);
i2 += 8;
const __m128 vi3xGHIJ = _mm_loadu_ps(i3);
const __m128 vi3xKLMN = _mm_loadu_ps(i3 + 4);
i3 += 8;
const __m128 vi4xGHIJ = _mm_loadu_ps(i4);
const __m128 vi4xKLMN = _mm_loadu_ps(i4 + 4);
i4 += 8;
const __m128 vi5xGHIJ = _mm_loadu_ps(i5);
const __m128 vi5xKLMN = _mm_loadu_ps(i5 + 4);
i5 += 8;
const __m128 vi6xGHIJ = _mm_loadu_ps(i6);
const __m128 vi6xKLMN = _mm_loadu_ps(i6 + 4);
i6 += 8;
const __m128 vi0x79BD = _mm_move_ss(vi0xF9BD, vi0x7135);
vi0x7135 = vi0xF9BD;
const __m128 vi1x79BD = _mm_move_ss(vi1xF9BD, vi1x7135);
vi1x7135 = vi1xF9BD;
const __m128 vi2x79BD = _mm_move_ss(vi2xF9BD, vi2x7135);
vi2x7135 = vi2xF9BD;
const __m128 vi3x79BD = _mm_move_ss(vi3xF9BD, vi3x7135);
vi3x7135 = vi3xF9BD;
const __m128 vi4x79BD = _mm_move_ss(vi4xF9BD, vi4x7135);
vi4x7135 = vi4xF9BD;
const __m128 vi5x79BD = _mm_move_ss(vi5xF9BD, vi5x7135);
vi5x7135 = vi5xF9BD;
const __m128 vi6x79BD = _mm_move_ss(vi6xF9BD, vi6x7135);
vi6x7135 = vi6xF9BD;
const __m128 vi0xGIKM = _mm_shuffle_ps(vi0xGHIJ, vi0xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vi0xHJLN = _mm_shuffle_ps(vi0xGHIJ, vi0xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
vi0x9BDF = vi0xHJLN;
const __m128 vi1xGIKM = _mm_shuffle_ps(vi1xGHIJ, vi1xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vi1xHJLN = _mm_shuffle_ps(vi1xGHIJ, vi1xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
vi1x9BDF = vi1xHJLN;
const __m128 vi2xGIKM = _mm_shuffle_ps(vi2xGHIJ, vi2xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vi2xHJLN = _mm_shuffle_ps(vi2xGHIJ, vi2xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
vi2x9BDF = vi2xHJLN;
const __m128 vi3xGIKM = _mm_shuffle_ps(vi3xGHIJ, vi3xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vi3xHJLN = _mm_shuffle_ps(vi3xGHIJ, vi3xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
vi3x9BDF = vi3xHJLN;
const __m128 vi4xGIKM = _mm_shuffle_ps(vi4xGHIJ, vi4xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vi4xHJLN = _mm_shuffle_ps(vi4xGHIJ, vi4xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
vi4x9BDF = vi4xHJLN;
const __m128 vi5xGIKM = _mm_shuffle_ps(vi5xGHIJ, vi5xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vi5xHJLN = _mm_shuffle_ps(vi5xGHIJ, vi5xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
vi5x9BDF = vi5xHJLN;
const __m128 vi6xGIKM = _mm_shuffle_ps(vi6xGHIJ, vi6xKLMN, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vi6xHJLN = _mm_shuffle_ps(vi6xGHIJ, vi6xKLMN, _MM_SHUFFLE(3, 1, 3, 1));
vi6x9BDF = vi6xHJLN;
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x79BD, vk01));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x79BD, vk01));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x79BD, vk11));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x79BD, vk11));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x79BD, vk21));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x79BD, vk21));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x79BD, vk31));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x79BD, vk31));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x79BD, vk41));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x79BD, vk41));
const __m128 vi0xGACE = _mm_move_ss(vi0x8ACE, vi0xGIKM);
vi0x8ACE = vi0xGIKM;
const __m128 vi1xGACE = _mm_move_ss(vi1x8ACE, vi1xGIKM);
vi1x8ACE = vi1xGIKM;
const __m128 vi2xGACE = _mm_move_ss(vi2x8ACE, vi2xGIKM);
vi2x8ACE = vi2xGIKM;
const __m128 vi3xGACE = _mm_move_ss(vi3x8ACE, vi3xGIKM);
vi3x8ACE = vi3xGIKM;
const __m128 vi4xGACE = _mm_move_ss(vi4x8ACE, vi4xGIKM);
vi4x8ACE = vi4xGIKM;
const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vi5xGIKM);
vi5x8ACE = vi5xGIKM;
const __m128 vi6xGACE = _mm_move_ss(vi6x8ACE, vi6xGIKM);
vi6x8ACE = vi6xGIKM;
const __m128 vi0xACEG = _mm_shuffle_ps(vi0xGACE, vi0xGACE, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi1xACEG = _mm_shuffle_ps(vi1xGACE, vi1xGACE, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi2xACEG = _mm_shuffle_ps(vi2xGACE, vi2xGACE, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi3xACEG = _mm_shuffle_ps(vi3xGACE, vi3xGACE, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi4xACEG = _mm_shuffle_ps(vi4xGACE, vi4xGACE, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi5xACEG = _mm_shuffle_ps(vi5xGACE, vi5xGACE, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi6xACEG = _mm_shuffle_ps(vi6xGACE, vi6xGACE, _MM_SHUFFLE(0, 3, 2, 1));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0xACEG, vk04));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2xACEG, vk04));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1xACEG, vk14));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3xACEG, vk14));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2xACEG, vk24));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4xACEG, vk24));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3xACEG, vk34));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5xACEG, vk34));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4xACEG, vk44));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6xACEG, vk44));
__m128 vo0 = _mm_max_ps(vo0p0, vmin);
__m128 vo1 = _mm_max_ps(vo1p0, vmin);
vo0 = _mm_min_ps(vo0, vmax);
vo1 = _mm_min_ps(vo1, vmax);
_mm_storeu_ps(o1, vo1);
o1 += 4;
_mm_storeu_ps(o0, vo0);
o0 += 4;
}
// Last block has 1-8 pixels to process.
assert(w <= 8 * sizeof(float));
assert(w >= 1 * sizeof(float));
{
vi0x8ACE = _mm_and_ps(vi0x8ACE, vmask_even);
vi0x9BDF = _mm_and_ps(vi0x9BDF, vmask_odd);
vi1x8ACE = _mm_and_ps(vi1x8ACE, vmask_even);
vi1x9BDF = _mm_and_ps(vi1x9BDF, vmask_odd);
vi2x8ACE = _mm_and_ps(vi2x8ACE, vmask_even);
vi2x9BDF = _mm_and_ps(vi2x9BDF, vmask_odd);
vi3x8ACE = _mm_and_ps(vi3x8ACE, vmask_even);
vi3x9BDF = _mm_and_ps(vi3x9BDF, vmask_odd);
vi4x8ACE = _mm_and_ps(vi4x8ACE, vmask_even);
vi4x9BDF = _mm_and_ps(vi4x9BDF, vmask_odd);
vi5x8ACE = _mm_and_ps(vi5x8ACE, vmask_even);
vi5x9BDF = _mm_and_ps(vi5x9BDF, vmask_odd);
vi6x8ACE = _mm_and_ps(vi6x8ACE, vmask_even);
vi6x9BDF = _mm_and_ps(vi6x9BDF, vmask_odd);
__m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk02));
__m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x8ACE, vk02));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x8ACE, vk12));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x8ACE, vk12));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x8ACE, vk22));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x8ACE, vk22));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x8ACE, vk32));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x8ACE, vk42));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x8ACE, vk42));
const __m128 vi0xE8AC = _mm_shuffle_ps(vi0x8ACE, vi0x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi1xE8AC = _mm_shuffle_ps(vi1x8ACE, vi1x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi2xE8AC = _mm_shuffle_ps(vi2x8ACE, vi2x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi3xE8AC = _mm_shuffle_ps(vi3x8ACE, vi3x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi4xE8AC = _mm_shuffle_ps(vi4x8ACE, vi4x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi6xE8AC = _mm_shuffle_ps(vi6x8ACE, vi6x8ACE, _MM_SHUFFLE(2, 1, 0, 3));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk03));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x9BDF, vk03));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x9BDF, vk13));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x9BDF, vk13));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x9BDF, vk23));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x9BDF, vk23));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x9BDF, vk33));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x9BDF, vk33));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x9BDF, vk43));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x9BDF, vk43));
const __m128 vi0x68AC = _mm_move_ss(vi0xE8AC, vi0x6024);
const __m128 vi1x68AC = _mm_move_ss(vi1xE8AC, vi1x6024);
const __m128 vi2x68AC = _mm_move_ss(vi2xE8AC, vi2x6024);
const __m128 vi3x68AC = _mm_move_ss(vi3xE8AC, vi3x6024);
const __m128 vi4x68AC = _mm_move_ss(vi4xE8AC, vi4x6024);
const __m128 vi5x68AC = _mm_move_ss(vi5xE8AC, vi5x6024);
const __m128 vi6x68AC = _mm_move_ss(vi6xE8AC, vi6x6024);
const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi3xF9BD = _mm_shuffle_ps(vi3x9BDF, vi3x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi4xF9BD = _mm_shuffle_ps(vi4x9BDF, vi4x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi5xF9BD = _mm_shuffle_ps(vi5x9BDF, vi5x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
const __m128 vi6xF9BD = _mm_shuffle_ps(vi6x9BDF, vi6x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x68AC, vk00));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x68AC, vk00));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x68AC, vk10));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x68AC, vk10));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x68AC, vk20));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x68AC, vk20));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x68AC, vk30));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x68AC, vk30));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x68AC, vk40));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x68AC, vk40));
const __m128 vi0x79BD = _mm_move_ss(vi0xF9BD, vi0x7135);
const __m128 vi1x79BD = _mm_move_ss(vi1xF9BD, vi1x7135);
const __m128 vi2x79BD = _mm_move_ss(vi2xF9BD, vi2x7135);
const __m128 vi3x79BD = _mm_move_ss(vi3xF9BD, vi3x7135);
const __m128 vi4x79BD = _mm_move_ss(vi4xF9BD, vi4x7135);
const __m128 vi5x79BD = _mm_move_ss(vi5xF9BD, vi5x7135);
const __m128 vi6x79BD = _mm_move_ss(vi6xF9BD, vi6x7135);
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x79BD, vk01));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x79BD, vk01));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x79BD, vk11));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x79BD, vk11));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x79BD, vk21));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x79BD, vk21));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x79BD, vk31));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x79BD, vk31));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x79BD, vk41));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6x79BD, vk41));
const __m128 vzero = _mm_setzero_ps();
const __m128 vi0xGACE = _mm_move_ss(vi0x8ACE, vzero);
const __m128 vi1xGACE = _mm_move_ss(vi1x8ACE, vzero);
const __m128 vi2xGACE = _mm_move_ss(vi2x8ACE, vzero);
const __m128 vi3xGACE = _mm_move_ss(vi3x8ACE, vzero);
const __m128 vi4xGACE = _mm_move_ss(vi4x8ACE, vzero);
const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vzero);
const __m128 vi6xGACE = _mm_move_ss(vi6x8ACE, vzero);
const __m128 vi0xACEG = _mm_shuffle_ps(vi0xGACE, vi0xGACE, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi1xACEG = _mm_shuffle_ps(vi1xGACE, vi1xGACE, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi2xACEG = _mm_shuffle_ps(vi2xGACE, vi2xGACE, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi3xACEG = _mm_shuffle_ps(vi3xGACE, vi3xGACE, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi4xACEG = _mm_shuffle_ps(vi4xGACE, vi4xGACE, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi5xACEG = _mm_shuffle_ps(vi5xGACE, vi5xGACE, _MM_SHUFFLE(0, 3, 2, 1));
const __m128 vi6xACEG = _mm_shuffle_ps(vi6xGACE, vi6xGACE, _MM_SHUFFLE(0, 3, 2, 1));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0xACEG, vk04));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2xACEG, vk04));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1xACEG, vk14));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3xACEG, vk14));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2xACEG, vk24));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4xACEG, vk24));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3xACEG, vk34));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5xACEG, vk34));
vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4xACEG, vk44));
vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi6xACEG, vk44));
__m128 vo0 = _mm_max_ps(vo0p0, vmin);
__m128 vo1 = _mm_max_ps(vo1p0, vmin);
vo0 = _mm_min_ps(vo0, vmax);
vo1 = _mm_min_ps(vo1, vmax);
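// Number of output pixels left in this row pair: ceil(remaining input columns / 2) because of the stride-2 subsampling.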
size_t w_tmp = (w + 1 * sizeof(float)) / (2 * sizeof(float));
if XNN_LIKELY(w_tmp >= 4) {
_mm_storeu_ps(o1, vo1);
o1 += 4;
_mm_storeu_ps(o0, vo0);
o0 += 4;
} else {
if (w_tmp & 2) {
_mm_storel_pi((__m64*) o1, vo1);
o1 += 2;
_mm_storel_pi((__m64*) o0, vo0);
o0 += 2;
vo0 = _mm_movehl_ps(vo0, vo0);
vo1 = _mm_movehl_ps(vo1, vo1);
}
if (w_tmp & 1) {
_mm_store_ss(o1, vo1);
o1 += 1;
_mm_store_ss(o0, vo0);
o0 += 1;
}
}
}
i0 = (const float*) ((uintptr_t) i4 - input_decrement);
i1 = (const float*) ((uintptr_t) i5 - input_decrement);
i2 = (const float*) ((uintptr_t) i6 - input_decrement);
i3 = (const float*) ((uintptr_t) i2 + input_width);
i4 = (const float*) ((uintptr_t) i3 + input_width);
i5 = (const float*) ((uintptr_t) i4 + input_width);
i6 = (const float*) ((uintptr_t) i5 + input_width);
o0 = o1;
o1 = (float*) ((uintptr_t) o0 + output_width);
output_height = doz(output_height, 2);
padded_input_height = doz(padded_input_height, 4);
} while (output_height != 0);
}
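// Channel-wise global average pooling: sums the `elements` bytes of floats in each channel, multiplies by the multiplier from params, and clamps to [output_min, output_max]; 4 channels are reduced per iteration, with a masked load handling the final partial group of elements.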
void xnn_f32_gavgpool_cw_ukernel__sse_x4(
size_t elements,
size_t channels,
const float* input,
float* output,
const union xnn_f32_gavgpool_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(elements != 0);
assert(elements % sizeof(float) == 0);
assert(channels != 0);
const float* i0 = input;
const float* i1 = (const float*) ((uintptr_t) i0 + elements);
const float* i2 = (const float*) ((uintptr_t) i1 + elements);
const float* i3 = (const float*) ((uintptr_t) i2 + elements);
const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
const __m128 vmultiplier = _mm_load_ps(params->sse.multiplier);
const __m128 voutput_min = _mm_load_ps(params->sse.output_min);
const __m128 voutput_max = _mm_load_ps(params->sse.output_max);
while (channels >= 4) {
__m128 vsum0 = _mm_setzero_ps();
__m128 vsum1 = _mm_setzero_ps();
__m128 vsum2 = _mm_setzero_ps();
__m128 vsum3 = _mm_setzero_ps();
size_t n = elements;
while (n >= 4 * sizeof(float)) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vi2 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vi3 = _mm_loadu_ps(i3);
i3 += 4;
vsum0 = _mm_add_ps(vsum0, vi0);
vsum1 = _mm_add_ps(vsum1, vi1);
vsum2 = _mm_add_ps(vsum2, vi2);
vsum3 = _mm_add_ps(vsum3, vi3);
n -= 4 * sizeof(float);
}
if XNN_UNLIKELY(n != 0) {
const __m128 vi0 = _mm_and_ps(_mm_loadu_ps(i0), vmask);
i0 = (const float*) ((uintptr_t) i0 + n);
const __m128 vi1 = _mm_and_ps(_mm_loadu_ps(i1), vmask);
i1 = (const float*) ((uintptr_t) i1 + n);
const __m128 vi2 = _mm_and_ps(_mm_loadu_ps(i2), vmask);
i2 = (const float*) ((uintptr_t) i2 + n);
const __m128 vi3 = _mm_and_ps(_mm_loadu_ps(i3), vmask);
i3 = (const float*) ((uintptr_t) i3 + n);
vsum0 = _mm_add_ps(vsum0, vi0);
vsum1 = _mm_add_ps(vsum1, vi1);
vsum2 = _mm_add_ps(vsum2, vi2);
vsum3 = _mm_add_ps(vsum3, vi3);
}
// Having exactly 4 rows makes this work out nicely as we end up with
// the 4 totals in 4 different lanes of the same vector.
const __m128 vsum01 = _mm_add_ps(_mm_unpacklo_ps(vsum0, vsum1), _mm_unpackhi_ps(vsum0, vsum1));
const __m128 vsum23 = _mm_add_ps(_mm_unpacklo_ps(vsum2, vsum3), _mm_unpackhi_ps(vsum2, vsum3));
const __m128 vsum = _mm_add_ps(_mm_movelh_ps(vsum01, vsum23), _mm_movehl_ps(vsum23, vsum01));
__m128 vout = _mm_mul_ps(vsum, vmultiplier);
vout = _mm_max_ps(vout, voutput_min);
vout = _mm_min_ps(vout, voutput_max);
_mm_storeu_ps(output, vout);
output += 4;
i0 = i3;
i1 = (const float*) ((uintptr_t) i0 + elements);
i2 = (const float*) ((uintptr_t) i1 + elements);
i3 = (const float*) ((uintptr_t) i2 + elements);
channels -= 4;
}
while (channels != 0) {
__m128 vsum = _mm_setzero_ps();
size_t n = elements;
while (n >= 4 * sizeof(float)) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
vsum = _mm_add_ps(vsum, vi0);
n -= 4 * sizeof(float);
}
if XNN_UNLIKELY(n != 0) {
__m128 vi0 = _mm_and_ps(_mm_loadu_ps(i0), vmask);
i0 = (const float*) ((uintptr_t) i0 + n);
vsum = _mm_add_ps(vsum, vi0);
}
vsum = _mm_add_ps(vsum, _mm_movehl_ps(vsum, vsum));
vsum = _mm_add_ss(vsum, _mm_shuffle_ps(vsum, vsum, _MM_SHUFFLE(3, 2, 1, 1)));
__m128 vout = _mm_mul_ss(vsum, vmultiplier);
vout = _mm_max_ss(vout, voutput_min);
vout = _mm_min_ss(vout, voutput_max);
_mm_store_ss(output, vout);
output += 1;
channels -= 1;
}
}
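// Multi-pass global average pooling for more than 7 rows: the first pass sums rows 0..6 into the buffer, each middle pass accumulates 7 more rows, and the final pass adds the remaining rows before scaling and clamping.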
void xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4(
size_t rows,
size_t channels,
const float* input,
size_t input_stride,
const float* zero,
float* buffer,
float* output,
const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(rows > 7);
assert(channels != 0);
const float* i0 = input;
const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
const size_t packed_channels = round_up_po2(channels, 4);
const size_t input_increment = 7 * input_stride - packed_channels * sizeof(float);
float* b = buffer;
for (size_t c = 0; c < channels; c += 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vi2 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vi3 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vi4 = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vi5 = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vi6 = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vsum01 = _mm_add_ps(vi0, vi1);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum016 = _mm_add_ps(vsum01, vi6);
const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
const __m128 vsum = _mm_add_ps(vsum016, vsum2345);
_mm_store_ps(b, vsum); b += 4;
}
for (rows -= 7; rows > 7; rows -= 7) {
b = buffer;
i0 = (const float*) ((uintptr_t) i0 + input_increment);
i1 = (const float*) ((uintptr_t) i1 + input_increment);
i2 = (const float*) ((uintptr_t) i2 + input_increment);
i3 = (const float*) ((uintptr_t) i3 + input_increment);
i4 = (const float*) ((uintptr_t) i4 + input_increment);
i5 = (const float*) ((uintptr_t) i5 + input_increment);
i6 = (const float*) ((uintptr_t) i6 + input_increment);
for (size_t c = 0; c < channels; c += 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vi2 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vi3 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vi4 = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vi5 = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vi6 = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vacc = _mm_load_ps(b);
const __m128 vsum01 = _mm_add_ps(vi0, vi1);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum6a = _mm_add_ps(vi6, vacc);
const __m128 vsum0123 = _mm_add_ps(vsum01, vsum23);
const __m128 vsum456a = _mm_add_ps(vsum45, vsum6a);
const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);
_mm_store_ps(b, vsum); b += 4;
}
}
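// Final pass: 1..7 rows remain; pointers for rows past the remainder are redirected to the zero buffer.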
i0 = (const float*) ((uintptr_t) i0 + input_increment);
i1 = (const float*) ((uintptr_t) i1 + input_increment);
if (rows < 2) {
i1 = zero;
}
i2 = (const float*) ((uintptr_t) i2 + input_increment);
if (rows <= 2) {
i2 = zero;
}
i3 = (const float*) ((uintptr_t) i3 + input_increment);
if (rows < 4) {
i3 = zero;
}
i4 = (const float*) ((uintptr_t) i4 + input_increment);
if (rows <= 4) {
i4 = zero;
}
i5 = (const float*) ((uintptr_t) i5 + input_increment);
if (rows < 6) {
i5 = zero;
}
i6 = (const float*) ((uintptr_t) i6 + input_increment);
if (rows <= 6) {
i6 = zero;
}
const __m128 vscale = _mm_load_ps(params->sse.scale);
const __m128 vmin = _mm_load_ps(params->sse.min);
const __m128 vmax = _mm_load_ps(params->sse.max);
b = buffer;
while (channels >= 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vi2 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vi3 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vi4 = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vi5 = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vi6 = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vacc = _mm_load_ps(b);
b += 4;
const __m128 vsum01 = _mm_add_ps(vi0, vi1);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum6a = _mm_add_ps(vi6, vacc);
const __m128 vsum0123 = _mm_add_ps(vsum01, vsum23);
const __m128 vsum456a = _mm_add_ps(vsum45, vsum6a);
const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);
__m128 vout = _mm_mul_ps(vsum, vscale);
vout = _mm_max_ps(vout, vmin);
vout = _mm_min_ps(vout, vmax);
_mm_storeu_ps(output, vout);
output += 4;
channels -= 4;
}
if (channels != 0) {
const __m128 vi0 = _mm_loadu_ps(i0);
const __m128 vi1 = _mm_loadu_ps(i1);
const __m128 vi2 = _mm_loadu_ps(i2);
const __m128 vi3 = _mm_loadu_ps(i3);
const __m128 vi4 = _mm_loadu_ps(i4);
const __m128 vi5 = _mm_loadu_ps(i5);
const __m128 vi6 = _mm_loadu_ps(i6);
const __m128 vacc = _mm_loadu_ps(b);
const __m128 vsum01 = _mm_add_ps(vi0, vi1);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum6a = _mm_add_ps(vi6, vacc);
const __m128 vsum0123 = _mm_add_ps(vsum01, vsum23);
const __m128 vsum456a = _mm_add_ps(vsum45, vsum6a);
const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);
__m128 vout = _mm_mul_ps(vsum, vscale);
vout = _mm_max_ps(vout, vmin);
vout = _mm_min_ps(vout, vmax);
if (channels & 2) {
_mm_storel_pi((__m64*) output, vout);
vout = _mm_movehl_ps(vout, vout);
output += 2;
}
if (channels & 1) {
_mm_store_ss(output, vout);
}
}
}
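// Single-pass variant of the kernel above for 1..7 input rows: no intermediate buffer is needed, and rows beyond `rows` read from the zero buffer.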
void xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4(
size_t rows,
size_t channels,
const float* input,
size_t input_stride,
const float* zero,
float* output,
const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(rows != 0);
assert(rows <= 7);
assert(channels != 0);
const float* i0 = input;
const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
if (rows < 2) {
i1 = zero;
}
const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
if (rows <= 2) {
i2 = zero;
}
const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
if (rows < 4) {
i3 = zero;
}
const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
if (rows <= 4) {
i4 = zero;
}
const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
if (rows < 6) {
i5 = zero;
}
const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
if (rows <= 6) {
i6 = zero;
}
const __m128 vscale = _mm_load_ps(params->sse.scale);
const __m128 vmin = _mm_load_ps(params->sse.min);
const __m128 vmax = _mm_load_ps(params->sse.max);
while (channels >= 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vi2 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vi3 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vi4 = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vi5 = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vi6 = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vsum01 = _mm_add_ps(vi0, vi1);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum016 = _mm_add_ps(vsum01, vi6);
const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
const __m128 vsum = _mm_add_ps(vsum016, vsum2345);
__m128 vout = _mm_mul_ps(vsum, vscale);
vout = _mm_max_ps(vout, vmin);
vout = _mm_min_ps(vout, vmax);
_mm_storeu_ps(output, vout);
output += 4;
channels -= 4;
}
if (channels != 0) {
const __m128 vi0 = _mm_loadu_ps(i0);
const __m128 vi1 = _mm_loadu_ps(i1);
const __m128 vi2 = _mm_loadu_ps(i2);
const __m128 vi3 = _mm_loadu_ps(i3);
const __m128 vi4 = _mm_loadu_ps(i4);
const __m128 vi5 = _mm_loadu_ps(i5);
const __m128 vi6 = _mm_loadu_ps(i6);
const __m128 vsum01 = _mm_add_ps(vi0, vi1);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum016 = _mm_add_ps(vsum01, vi6);
const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
const __m128 vsum = _mm_add_ps(vsum016, vsum2345);
__m128 vout = _mm_mul_ps(vsum, vscale);
vout = _mm_max_ps(vout, vmin);
vout = _mm_min_ps(vout, vmax);
if (channels & 2) {
_mm_storel_pi((__m64*) output, vout);
vout = _mm_movehl_ps(vout, vout);
output += 2;
}
if (channels & 1) {
_mm_store_ss(output, vout);
}
}
}
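// GEMM microkernel computing a 1x8 output tile: for each block of 8 output columns, the packed weight stream starts with 8 accumulator-init (bias) values followed by 8 values of B per element of K; results are clamped to [min, max].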
void xnn_f32_gemm_minmax_ukernel_1x8__sse_load1(
size_t mr,
size_t nc,
size_t kc,
const float*restrict a,
size_t a_stride,
const float*restrict w,
float*restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
assert(mr != 0);
assert(mr <= 1);
assert(nc != 0);
assert(kc != 0);
assert(kc % sizeof(float) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
const float* a0 = a;
float* c0 = c;
do {
__m128 vacc0x0123 = _mm_load_ps(w + 0);
__m128 vacc0x4567 = _mm_load_ps(w + 4);
w += 8;
size_t k = kc;
do {
const __m128 va0 = _mm_load1_ps(a0);
a0 += 1;
const __m128 vb0123 = _mm_load_ps(w);
const __m128 vb4567 = _mm_load_ps(w + 4);
w += 8;
vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
k -= sizeof(float);
} while (k != 0);
const __m128 vmax = _mm_load_ps(params->sse.max);
vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
const __m128 vmin = _mm_load_ps(params->sse.min);
vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
if XNN_LIKELY(nc >= 8) {
_mm_storeu_ps(c0, vacc0x0123);
_mm_storeu_ps(c0 + 4, vacc0x4567);
c0 = (float*) ((uintptr_t) c0 + cn_stride);
a0 = (const float*) ((uintptr_t) a0 - kc);
nc -= 8;
} else {
if (nc & 4) {
_mm_storeu_ps(c0, vacc0x0123);
vacc0x0123 = vacc0x4567;
c0 += 4;
}
if (nc & 2) {
_mm_storel_pi((__m64*) c0, vacc0x0123);
vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
c0 += 2;
}
if (nc & 1) {
_mm_store_ss(c0, vacc0x0123);
}
nc = 0;
}
} while (nc != 0);
}
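// GEMM microkernel: computes a 4x2 tile with a c4 accumulator layout, i.e. each
// __m128 accumulator carries 4 partial sums along K for one (row, column) pair.
// The main loop consumes 4 elements of K per iteration; the partial sums are
// reduced with shuffles before the [min, max] clamp and store.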
void xnn_f32_gemm_minmax_ukernel_4x2c4__sse(
size_t mr,
size_t nc,
size_t kc,
const float* restrict a,
size_t a_stride,
const float* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(mr != 0);
assert(mr <= 4);
assert(nc != 0);
assert(kc != 0);
assert(kc % sizeof(float) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
const float* a0 = a;
float* c0 = c;
const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
if XNN_UNPREDICTABLE(mr < 2) {
a1 = a0;
c1 = c0;
}
const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
if XNN_UNPREDICTABLE(mr <= 2) {
a2 = a1;
c2 = c1;
}
const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
if XNN_UNPREDICTABLE(mr != 4) {
a3 = a2;
c3 = c2;
}
do {
__m128 vacc0x0c4 = _mm_load_ss(w);
__m128 vacc0x1c4 = _mm_load_ss(w + 1);
__m128 vacc1x0c4 = vacc0x0c4;
__m128 vacc1x1c4 = vacc0x1c4;
__m128 vacc2x0c4 = vacc0x0c4;
__m128 vacc2x1c4 = vacc0x1c4;
__m128 vacc3x0c4 = vacc0x0c4;
__m128 vacc3x1c4 = vacc0x1c4;
w += 2;
size_t k = kc;
for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
const __m128 va0 = _mm_loadu_ps(a0);
a0 += 4;
const __m128 va1 = _mm_loadu_ps(a1);
a1 += 4;
const __m128 va2 = _mm_loadu_ps(a2);
a2 += 4;
const __m128 va3 = _mm_loadu_ps(a3);
a3 += 4;
const __m128 vb0 = _mm_loadu_ps(w);
const __m128 vb1 = _mm_loadu_ps(w + 4);
w += 8;
vacc0x0c4 = _mm_add_ps(vacc0x0c4, _mm_mul_ps(va0, vb0));
vacc0x1c4 = _mm_add_ps(vacc0x1c4, _mm_mul_ps(va0, vb1));
vacc1x0c4 = _mm_add_ps(vacc1x0c4, _mm_mul_ps(va1, vb0));
vacc1x1c4 = _mm_add_ps(vacc1x1c4, _mm_mul_ps(va1, vb1));
vacc2x0c4 = _mm_add_ps(vacc2x0c4, _mm_mul_ps(va2, vb0));
vacc2x1c4 = _mm_add_ps(vacc2x1c4, _mm_mul_ps(va2, vb1));
vacc3x0c4 = _mm_add_ps(vacc3x0c4, _mm_mul_ps(va3, vb0));
vacc3x1c4 = _mm_add_ps(vacc3x1c4, _mm_mul_ps(va3, vb1));
}
if XNN_UNLIKELY(k != 0) {
const __m128 va0 = _mm_loadu_ps(a0);
a0 = (const float*) ((uintptr_t) a0 + k);
const __m128 va1 = _mm_loadu_ps(a1);
a1 = (const float*) ((uintptr_t) a1 + k);
const __m128 va2 = _mm_loadu_ps(a2);
a2 = (const float*) ((uintptr_t) a2 + k);
const __m128 va3 = _mm_loadu_ps(a3);
a3 = (const float*) ((uintptr_t) a3 + k);
const __m128 vb0 = _mm_loadu_ps(w);
const __m128 vb1 = _mm_loadu_ps(w + 4);
w += 8;
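// K remainder: full vectors are loaded (the kernel is tagged XNN_OOB_READS),
// so A lanes are zeroed wherever the corresponding packed-B lane is zero to
// keep out-of-range garbage from contributing to the sums.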
const __m128 vmask0 = _mm_cmpeq_ps(_mm_setzero_ps(), vb0);
const __m128 vmask1 = _mm_cmpeq_ps(_mm_setzero_ps(), vb1);
vacc0x0c4 = _mm_add_ps(vacc0x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va0), vb0));
vacc0x1c4 = _mm_add_ps(vacc0x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va0), vb1));
vacc1x0c4 = _mm_add_ps(vacc1x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va1), vb0));
vacc1x1c4 = _mm_add_ps(vacc1x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va1), vb1));
vacc2x0c4 = _mm_add_ps(vacc2x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va2), vb0));
vacc2x1c4 = _mm_add_ps(vacc2x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va2), vb1));
vacc3x0c4 = _mm_add_ps(vacc3x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va3), vb0));
vacc3x1c4 = _mm_add_ps(vacc3x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va3), vb1));
}
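// Reduce the c4 partial sums: unpacklo/unpackhi interleave and add pairs, then
// movelh/movehl combine rows so that each __m128 ends up holding the two
// output columns for two consecutive rows.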
const __m128 vacc0x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc0x0c4, vacc0x1c4), _mm_unpackhi_ps(vacc0x0c4, vacc0x1c4));
const __m128 vacc1x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc1x0c4, vacc1x1c4), _mm_unpackhi_ps(vacc1x0c4, vacc1x1c4));
const __m128 vacc2x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc2x0c4, vacc2x1c4), _mm_unpackhi_ps(vacc2x0c4, vacc2x1c4));
const __m128 vacc3x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc3x0c4, vacc3x1c4), _mm_unpackhi_ps(vacc3x0c4, vacc3x1c4));
__m128 vacc01x01 = _mm_add_ps(_mm_movelh_ps(vacc0x01c2, vacc1x01c2), _mm_movehl_ps(vacc1x01c2, vacc0x01c2));
__m128 vacc23x01 = _mm_add_ps(_mm_movelh_ps(vacc2x01c2, vacc3x01c2), _mm_movehl_ps(vacc3x01c2, vacc2x01c2));
const __m128 vmax = _mm_load_ps(params->sse.max);
vacc01x01 = _mm_min_ps(vacc01x01, vmax);
vacc23x01 = _mm_min_ps(vacc23x01, vmax);
const __m128 vmin = _mm_load_ps(params->sse.min);
vacc01x01 = _mm_max_ps(vacc01x01, vmin);
vacc23x01 = _mm_max_ps(vacc23x01, vmin);
if XNN_LIKELY(nc >= 2) {
_mm_storel_pi((__m64*) c2, vacc23x01);
c2 = (float*) ((uintptr_t) c2 + cn_stride);
a2 = (const float*) ((uintptr_t) a2 - kc);
_mm_storeh_pi((__m64*) c3, vacc23x01);
c3 = (float*) ((uintptr_t) c3 + cn_stride);
a3 = (const float*) ((uintptr_t) a3 - kc);
_mm_storel_pi((__m64*) c0, vacc01x01);
c0 = (float*) ((uintptr_t) c0 + cn_stride);
a0 = (const float*) ((uintptr_t) a0 - kc);
_mm_storeh_pi((__m64*) c1, vacc01x01);
c1 = (float*) ((uintptr_t) c1 + cn_stride);
a1 = (const float*) ((uintptr_t) a1 - kc);
nc -= 2;
} else {
assert(nc == 1);
_mm_store_ss(c2, vacc23x01);
_mm_store_ss(c3, _mm_movehl_ps(vacc23x01, vacc23x01));
_mm_store_ss(c0, vacc01x01);
_mm_store_ss(c1, _mm_movehl_ps(vacc01x01, vacc01x01));
nc = 0;
}
} while (nc != 0);
}
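// GEMM microkernel: computes a 4x8 tile. Each k step broadcasts one element per
// row of A and accumulates it against two packed 4-wide columns of B; the tile
// is clamped to [min, max] and stored row by row, with the nc remainder handled
// 4/2/1 columns at a time.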
void xnn_f32_gemm_minmax_ukernel_4x8__sse_load1(
size_t mr,
size_t nc,
size_t kc,
const float*restrict a,
size_t a_stride,
const float*restrict w,
float*restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
assert(mr != 0);
assert(mr <= 4);
assert(nc != 0);
assert(kc != 0);
assert(kc % sizeof(float) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
const float* a0 = a;
float* c0 = c;
const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
if XNN_UNPREDICTABLE(mr < 2) {
a1 = a0;
c1 = c0;
}
const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
if XNN_UNPREDICTABLE(mr <= 2) {
a2 = a1;
c2 = c1;
}
const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
if XNN_UNPREDICTABLE(mr != 4) {
a3 = a2;
c3 = c2;
}
do {
__m128 vacc0x0123 = _mm_load_ps(w + 0);
__m128 vacc0x4567 = _mm_load_ps(w + 4);
__m128 vacc1x0123 = vacc0x0123;
__m128 vacc1x4567 = vacc0x4567;
__m128 vacc2x0123 = vacc0x0123;
__m128 vacc2x4567 = vacc0x4567;
__m128 vacc3x0123 = vacc0x0123;
__m128 vacc3x4567 = vacc0x4567;
w += 8;
size_t k = kc;
do {
const __m128 va0 = _mm_load1_ps(a0);
a0 += 1;
const __m128 va1 = _mm_load1_ps(a1);
a1 += 1;
const __m128 va2 = _mm_load1_ps(a2);
a2 += 1;
const __m128 va3 = _mm_load1_ps(a3);
a3 += 1;
const __m128 vb0123 = _mm_load_ps(w);
const __m128 vb4567 = _mm_load_ps(w + 4);
w += 8;
vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123));
vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123));
vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1, vb4567));
vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567));
vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567));
k -= sizeof(float);
} while (k != 0);
const __m128 vmax = _mm_load_ps(params->sse.max);
vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
vacc3x0123 = _mm_min_ps(vacc3x0123, vmax);
vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
vacc1x4567 = _mm_min_ps(vacc1x4567, vmax);
vacc2x4567 = _mm_min_ps(vacc2x4567, vmax);
vacc3x4567 = _mm_min_ps(vacc3x4567, vmax);
const __m128 vmin = _mm_load_ps(params->sse.min);
vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
vacc3x0123 = _mm_max_ps(vacc3x0123, vmin);
vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
vacc1x4567 = _mm_max_ps(vacc1x4567, vmin);
vacc2x4567 = _mm_max_ps(vacc2x4567, vmin);
vacc3x4567 = _mm_max_ps(vacc3x4567, vmin);
if XNN_LIKELY(nc >= 8) {
_mm_storeu_ps(c3, vacc3x0123);
_mm_storeu_ps(c3 + 4, vacc3x4567);
c3 = (float*) ((uintptr_t) c3 + cn_stride);
_mm_storeu_ps(c2, vacc2x0123);
_mm_storeu_ps(c2 + 4, vacc2x4567);
c2 = (float*) ((uintptr_t) c2 + cn_stride);
_mm_storeu_ps(c1, vacc1x0123);
_mm_storeu_ps(c1 + 4, vacc1x4567);
c1 = (float*) ((uintptr_t) c1 + cn_stride);
_mm_storeu_ps(c0, vacc0x0123);
_mm_storeu_ps(c0 + 4, vacc0x4567);
c0 = (float*) ((uintptr_t) c0 + cn_stride);
a3 = (const float*) ((uintptr_t) a3 - kc);
a2 = (const float*) ((uintptr_t) a2 - kc);
a1 = (const float*) ((uintptr_t) a1 - kc);
a0 = (const float*) ((uintptr_t) a0 - kc);
nc -= 8;
} else {
if (nc & 4) {
_mm_storeu_ps(c3, vacc3x0123);
_mm_storeu_ps(c2, vacc2x0123);
_mm_storeu_ps(c1, vacc1x0123);
_mm_storeu_ps(c0, vacc0x0123);
vacc3x0123 = vacc3x4567;
vacc2x0123 = vacc2x4567;
vacc1x0123 = vacc1x4567;
vacc0x0123 = vacc0x4567;
c3 += 4;
c2 += 4;
c1 += 4;
c0 += 4;
}
if (nc & 2) {
_mm_storel_pi((__m64*) c3, vacc3x0123);
_mm_storel_pi((__m64*) c2, vacc2x0123);
_mm_storel_pi((__m64*) c1, vacc1x0123);
_mm_storel_pi((__m64*) c0, vacc0x0123);
vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
c3 += 2;
c2 += 2;
c1 += 2;
c0 += 2;
}
if (nc & 1) {
_mm_store_ss(c3, vacc3x0123);
_mm_store_ss(c2, vacc2x0123);
_mm_store_ss(c1, vacc1x0123);
_mm_store_ss(c0, vacc0x0123);
}
nc = 0;
}
} while (nc != 0);
}
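// Bilinear interpolation kernel for CHW layout: for each channel it walks the
// output pixels in groups of 8, then 4, then a 2/1 remainder. Each output pixel
// uses two input pointers (one at its top-left pair, one at its bottom-left
// pair; each 2-float load also fetches the right neighbor) and an
// (alpha_h, alpha_v) weight pair; interpolation is done vertically first,
// then horizontally.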
void xnn_f32_ibilinear_chw_ukernel__sse_p8(
size_t output_pixels,
size_t channels,
const float**restrict input,
size_t input_offset,
const float*restrict weights,
float*restrict output,
size_t input_increment) XNN_OOB_READS
{
assert(output_pixels != 0);
assert(channels != 0);
assert(input_increment % sizeof(float) == 0);
do {
const float** i = input;
const float* w = weights;
size_t p = output_pixels;
for (; p >= 8; p -= 8) {
const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
const float* itl4 = (const float*) ((uintptr_t) i[8] + input_offset);
const float* ibl4 = (const float*) ((uintptr_t) i[9] + input_offset);
const float* itl5 = (const float*) ((uintptr_t) i[10] + input_offset);
const float* ibl5 = (const float*) ((uintptr_t) i[11] + input_offset);
const float* itl6 = (const float*) ((uintptr_t) i[12] + input_offset);
const float* ibl6 = (const float*) ((uintptr_t) i[13] + input_offset);
const float* itl7 = (const float*) ((uintptr_t) i[14] + input_offset);
const float* ibl7 = (const float*) ((uintptr_t) i[15] + input_offset);
i += 2 * 8;
const __m128 vw0123p0 = _mm_loadu_ps(w + 0);
const __m128 vw0123p1 = _mm_loadu_ps(w + 4);
const __m128 vw4567p0 = _mm_loadu_ps(w + 8);
const __m128 vw4567p1 = _mm_loadu_ps(w + 12);
w += 2 * 8;
const __m128 vtltr0 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl0);
const __m128 vblbr0 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl0);
const __m128 vtltr2 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl2);
const __m128 vblbr2 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl2);
const __m128 vtltr4 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl4);
const __m128 vblbr4 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl4);
const __m128 vtltr6 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl6);
const __m128 vblbr6 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl6);
const __m128 valphah0123 = _mm_shuffle_ps(vw0123p0, vw0123p1, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 valphav0123 = _mm_shuffle_ps(vw0123p0, vw0123p1, _MM_SHUFFLE(3, 1, 3, 1));
const __m128 valphah4567 = _mm_shuffle_ps(vw4567p0, vw4567p1, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 valphav4567 = _mm_shuffle_ps(vw4567p0, vw4567p1, _MM_SHUFFLE(3, 1, 3, 1));
const __m128 vtltr01 = _mm_loadh_pi(vtltr0, (const __m64*) itl1);
const __m128 vblbr01 = _mm_loadh_pi(vblbr0, (const __m64*) ibl1);
const __m128 vtltr23 = _mm_loadh_pi(vtltr2, (const __m64*) itl3);
const __m128 vblbr23 = _mm_loadh_pi(vblbr2, (const __m64*) ibl3);
const __m128 vtltr45 = _mm_loadh_pi(vtltr4, (const __m64*) itl5);
const __m128 vblbr45 = _mm_loadh_pi(vblbr4, (const __m64*) ibl5);
const __m128 vtltr67 = _mm_loadh_pi(vtltr6, (const __m64*) itl7);
const __m128 vblbr67 = _mm_loadh_pi(vblbr6, (const __m64*) ibl7);
const __m128 vldrd01 = _mm_sub_ps(vblbr01, vtltr01);
const __m128 vldrd23 = _mm_sub_ps(vblbr23, vtltr23);
const __m128 vldrd45 = _mm_sub_ps(vblbr45, vtltr45);
const __m128 vldrd67 = _mm_sub_ps(vblbr67, vtltr67);
const __m128 vld0123 = _mm_shuffle_ps(vldrd01, vldrd23, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vrd0123 = _mm_shuffle_ps(vldrd01, vldrd23, _MM_SHUFFLE(3, 1, 3, 1));
const __m128 vld4567 = _mm_shuffle_ps(vldrd45, vldrd67, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vrd4567 = _mm_shuffle_ps(vldrd45, vldrd67, _MM_SHUFFLE(3, 1, 3, 1));
const __m128 vtl0123 = _mm_shuffle_ps(vtltr01, vtltr23, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vtr0123 = _mm_shuffle_ps(vtltr01, vtltr23, _MM_SHUFFLE(3, 1, 3, 1));
const __m128 vtl4567 = _mm_shuffle_ps(vtltr45, vtltr67, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vtr4567 = _mm_shuffle_ps(vtltr45, vtltr67, _MM_SHUFFLE(3, 1, 3, 1));
const __m128 vl0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vld0123, valphav0123));
const __m128 vr0123 = _mm_add_ps(vtr0123, _mm_mul_ps(vrd0123, valphav0123));
const __m128 vl4567 = _mm_add_ps(vtl4567, _mm_mul_ps(vld4567, valphav4567));
const __m128 vr4567 = _mm_add_ps(vtr4567, _mm_mul_ps(vrd4567, valphav4567));
const __m128 vd0123 = _mm_sub_ps(vr0123, vl0123);
const __m128 vd4567 = _mm_sub_ps(vr4567, vl4567);
const __m128 vo0123 = _mm_add_ps(vl0123, _mm_mul_ps(vd0123, valphah0123));
const __m128 vo4567 = _mm_add_ps(vl4567, _mm_mul_ps(vd4567, valphah4567));
_mm_storeu_ps(output + 0, vo0123);
_mm_storeu_ps(output + 4, vo4567);
output += 8;
}
for (; p >= 4; p -= 4) {
const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
i += 8;
const __m128 vw0 = _mm_loadu_ps(w);
const __m128 vw1 = _mm_loadu_ps(w + 4);
w += 8;
const __m128 vtltr0 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl0);
const __m128 vblbr0 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl0);
const __m128 vtltr2 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl2);
const __m128 vblbr2 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl2);
const __m128 valphah = _mm_shuffle_ps(vw0, vw1, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 valphav = _mm_shuffle_ps(vw0, vw1, _MM_SHUFFLE(3, 1, 3, 1));
const __m128 vtltr01 = _mm_loadh_pi(vtltr0, (const __m64*) itl1);
const __m128 vblbr01 = _mm_loadh_pi(vblbr0, (const __m64*) ibl1);
const __m128 vtltr23 = _mm_loadh_pi(vtltr2, (const __m64*) itl3);
const __m128 vblbr23 = _mm_loadh_pi(vblbr2, (const __m64*) ibl3);
const __m128 vldrd01 = _mm_sub_ps(vblbr01, vtltr01);
const __m128 vldrd23 = _mm_sub_ps(vblbr23, vtltr23);
const __m128 vld = _mm_shuffle_ps(vldrd01, vldrd23, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vrd = _mm_shuffle_ps(vldrd01, vldrd23, _MM_SHUFFLE(3, 1, 3, 1));
const __m128 vtl = _mm_shuffle_ps(vtltr01, vtltr23, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vtr = _mm_shuffle_ps(vtltr01, vtltr23, _MM_SHUFFLE(3, 1, 3, 1));
const __m128 vl = _mm_add_ps(vtl, _mm_mul_ps(vld, valphav));
const __m128 vr = _mm_add_ps(vtr, _mm_mul_ps(vrd, valphav));
const __m128 vd = _mm_sub_ps(vr, vl);
const __m128 vo = _mm_add_ps(vl, _mm_mul_ps(vd, valphah));
_mm_storeu_ps(output, vo);
output += 4;
}
if XNN_UNLIKELY(p != 0) {
if (p & 2) {
const __m128 vw = _mm_loadu_ps(w);
w += 4;
const __m128 valphah = _mm_shuffle_ps(vw, vw, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 valphav = _mm_shuffle_ps(vw, vw, _MM_SHUFFLE(3, 1, 3, 1));
const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
i += 4;
const __m128 vtltr = _mm_loadh_pi(_mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl0), (const __m64*) itl1);
const __m128 vblbr = _mm_loadh_pi(_mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl0), (const __m64*) ibl1);
const __m128 vldrd = _mm_sub_ps(vblbr, vtltr);
const __m128 vld = _mm_shuffle_ps(vldrd, vldrd, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vrd = _mm_shuffle_ps(vldrd, vldrd, _MM_SHUFFLE(3, 1, 3, 1));
const __m128 vtl = _mm_shuffle_ps(vtltr, vtltr, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 vtr = _mm_shuffle_ps(vtltr, vtltr, _MM_SHUFFLE(3, 1, 3, 1));
const __m128 vl = _mm_add_ps(vtl, _mm_mul_ps(vld, valphav));
const __m128 vr = _mm_add_ps(vtr, _mm_mul_ps(vrd, valphav));
const __m128 vd = _mm_sub_ps(vr, vl);
const __m128 vo = _mm_add_ps(vl, _mm_mul_ps(vd, valphah));
_mm_storel_pi((__m64*) output, vo);
output += 2;
}
if (p & 1) {
// We are computing the following formula:
// result = (1 - alpha_h) * (1 - alpha_v) * top_left +
// alpha_h * (1 - alpha_v) * top_right +
// (1 - alpha_h) * alpha_v * bottom_left +
// alpha_h * alpha_v * bottom_right.
//
// Rearranging gives
// result = left + alpha_h * (right - left),
// where
// left = top_left + alpha_v * (bottom_left - top_left),
// right = top_right + alpha_v * (bottom_right - top_right).
const float alphah = *w;
const __m128 valphav = _mm_load_ps1(w + 1);
w += 2;
const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
i += 2;
const __m128 vtltr = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) itl);
const __m128 vblbr = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) ibl);
// Compute at once
// left_diff = bottom_left - top_left
// right_diff = bottom_right - top_right
const __m128 vldrd = _mm_sub_ps(vblbr, vtltr);
const __m128 vlr = _mm_add_ps(vtltr, _mm_mul_ps(vldrd, valphav));
// Extract them and compute the result.
const float l = _mm_cvtss_f32(vlr);
const float r = _mm_cvtss_f32(_mm_shuffle_ps(vlr, vlr, 1));
*output++ = l + alphah * (r - l);
}
}
input_offset += input_increment;
} while (--channels != 0);
}
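// Bilinear interpolation kernel for NHWC layout: each output pixel has four
// input pointers (top-left, top-right, bottom-left, bottom-right rows) and one
// (alpha_h, alpha_v) weight pair broadcast across the channel vectors. Channels
// are processed 8 at a time, then 4, then a 2/1 remainder; interpolation is
// done horizontally first, then vertically.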
void xnn_f32_ibilinear_ukernel__sse_c8(
size_t output_pixels,
size_t channels,
const float**restrict input,
size_t input_offset,
const float*restrict weights,
float*restrict output,
size_t output_increment) XNN_OOB_READS
{
assert(output_pixels != 0);
assert(channels != 0);
assert(channels % sizeof(float) == 0);
do {
const float* i0 = (const float*) ((uintptr_t) input[0] + input_offset);
const float* i1 = (const float*) ((uintptr_t) input[1] + input_offset);
const float* i2 = (const float*) ((uintptr_t) input[2] + input_offset);
const float* i3 = (const float*) ((uintptr_t) input[3] + input_offset);
input += 4;
__m128 valphahv = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) weights);
valphahv = _mm_unpacklo_ps(valphahv, valphahv);
const __m128 valphah = _mm_movelh_ps(valphahv, valphahv);
const __m128 valphav = _mm_movehl_ps(valphahv, valphahv);
weights += 2;
size_t c = channels;
for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) {
const __m128 vtl0123 = _mm_loadu_ps(i0);
const __m128 vtr0123 = _mm_loadu_ps(i1);
const __m128 vbl0123 = _mm_loadu_ps(i2);
const __m128 vbr0123 = _mm_loadu_ps(i3);
const __m128 vtl4567 = _mm_loadu_ps(i0 + 4);
const __m128 vtr4567 = _mm_loadu_ps(i1 + 4);
const __m128 vbl4567 = _mm_loadu_ps(i2 + 4);
const __m128 vbr4567 = _mm_loadu_ps(i3 + 4);
i0 += 8;
i1 += 8;
i2 += 8;
i3 += 8;
const __m128 vtd0123 = _mm_sub_ps(vtr0123, vtl0123);
const __m128 vbd0123 = _mm_sub_ps(vbr0123, vbl0123);
const __m128 vtd4567 = _mm_sub_ps(vtr4567, vtl4567);
const __m128 vbd4567 = _mm_sub_ps(vbr4567, vbl4567);
const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah));
const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah));
const __m128 vt4567 = _mm_add_ps(vtl4567, _mm_mul_ps(vtd4567, valphah));
const __m128 vb4567 = _mm_add_ps(vbl4567, _mm_mul_ps(vbd4567, valphah));
const __m128 vd0123 = _mm_sub_ps(vb0123, vt0123);
const __m128 vd4567 = _mm_sub_ps(vb4567, vt4567);
const __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));
const __m128 vo4567 = _mm_add_ps(vt4567, _mm_mul_ps(vd4567, valphav));
_mm_storeu_ps(output, vo0123);
_mm_storeu_ps(output + 4, vo4567);
output += 8;
}
for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {
const __m128 vtl0123 = _mm_loadu_ps(i0);
const __m128 vtr0123 = _mm_loadu_ps(i1);
const __m128 vbl0123 = _mm_loadu_ps(i2);
const __m128 vbr0123 = _mm_loadu_ps(i3);
i0 += 4;
i1 += 4;
i2 += 4;
i3 += 4;
const __m128 vtd0123 = _mm_sub_ps(vtr0123, vtl0123);
const __m128 vbd0123 = _mm_sub_ps(vbr0123, vbl0123);
const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah));
const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah));
const __m128 vd0123 = _mm_sub_ps(vb0123, vt0123);
const __m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));
_mm_storeu_ps(output, vo0123);
output += 4;
}
if XNN_UNLIKELY(c != 0) {
const __m128 vtl0123 = _mm_loadu_ps(i0);
const __m128 vtr0123 = _mm_loadu_ps(i1);
const __m128 vbl0123 = _mm_loadu_ps(i2);
const __m128 vbr0123 = _mm_loadu_ps(i3);
const __m128 vtd0123 = _mm_sub_ps(vtr0123, vtl0123);
const __m128 vbd0123 = _mm_sub_ps(vbr0123, vbl0123);
const __m128 vt0123 = _mm_add_ps(vtl0123, _mm_mul_ps(vtd0123, valphah));
const __m128 vb0123 = _mm_add_ps(vbl0123, _mm_mul_ps(vbd0123, valphah));
const __m128 vd0123 = _mm_sub_ps(vb0123, vt0123);
__m128 vo0123 = _mm_add_ps(vt0123, _mm_mul_ps(vd0123, valphav));
if (c & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) output, vo0123);
vo0123 = _mm_movehl_ps(vo0123, vo0123);
output += 2;
}
if (c & (1 * sizeof(float))) {
_mm_store_ss(output, vo0123);
output += 1;
}
}
output = (float*) ((uintptr_t) output + output_increment);
} while (--output_pixels != 0);
}
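// IGEMM (indirect GEMM) microkernel: like the 1x8 GEMM, but A is supplied as an
// indirection buffer of ks row pointers per output row; pointers equal to the
// zero buffer are not adjusted by a_offset, so padding rows read zeros.
// Accumulation runs over all ks pointers before the [min, max] clamp and store.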
void xnn_f32_igemm_minmax_ukernel_1x8__sse_load1(
size_t mr,
size_t nc,
size_t kc,
size_t ks,
const float**restrict a,
const float*restrict w,
float*restrict c,
size_t cm_stride,
size_t cn_stride,
size_t a_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
assert(mr != 0);
assert(mr <= 1);
assert(nc != 0);
assert(kc != 0);
assert(kc % sizeof(float) == 0);
assert(ks != 0);
assert(ks % (1 * sizeof(void*)) == 0);
assert(a_offset % sizeof(float) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
float* c0 = c;
do {
__m128 vacc0x0123 = _mm_load_ps(w);
__m128 vacc0x4567 = _mm_load_ps(w + 4);
w += 8;
size_t p = ks;
do {
const float* restrict a0 = a[0];
assert(a0 != NULL);
if XNN_UNPREDICTABLE(a0 != zero) {
a0 = (const float*) ((uintptr_t) a0 + a_offset);
}
a += 1;
size_t k = kc;
do {
const __m128 vb0123 = _mm_load_ps(w);
const __m128 vb4567 = _mm_load_ps(w + 4);
w += 8;
const __m128 va0 = _mm_load1_ps(a0);
a0 += 1;
vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
k -= sizeof(float);
} while (k != 0);
p -= 1 * sizeof(void*);
} while (p != 0);
const __m128 vmax = _mm_load_ps(params->sse.max);
vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
const __m128 vmin = _mm_load_ps(params->sse.min);
vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
if XNN_LIKELY(nc >= 8) {
_mm_storeu_ps(c0, vacc0x0123);
_mm_storeu_ps(c0 + 4, vacc0x4567);
c0 = (float*) ((uintptr_t) c0 + cn_stride);
a = (const float**restrict) ((uintptr_t) a - ks);
nc -= 8;
} else {
if (nc & 4) {
_mm_storeu_ps(c0, vacc0x0123);
vacc0x0123 = vacc0x4567;
c0 += 4;
}
if (nc & 2) {
_mm_storel_pi((__m64*) c0, vacc0x0123);
vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
c0 += 2;
}
if (nc & 1) {
_mm_store_ss(c0, vacc0x0123);
}
nc = 0;
}
} while (nc != 0);
}
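// IGEMM variant of the 4x2c4 microkernel: 4 row pointers are fetched from the
// indirection buffer per ks step, the c4 partial sums accumulate across all ks
// steps, and the same zero-masked K remainder and shuffle reduction are used
// before storing two columns per row.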
void xnn_f32_igemm_minmax_ukernel_4x2c4__sse(
size_t mr,
size_t nc,
size_t kc,
size_t ks,
const float**restrict a,
const float*restrict w,
float*restrict c,
size_t cm_stride,
size_t cn_stride,
size_t a_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(mr != 0);
assert(mr <= 4);
assert(nc != 0);
assert(kc != 0);
assert(kc % sizeof(float) == 0);
assert(ks != 0);
assert(ks % (4 * sizeof(void*)) == 0);
assert(a_offset % sizeof(float) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
float* c0 = c;
float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
if XNN_UNPREDICTABLE(mr < 2) {
c1 = c0;
}
float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
if XNN_UNPREDICTABLE(mr <= 2) {
c2 = c1;
}
float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
if XNN_UNPREDICTABLE(mr != 4) {
c3 = c2;
}
do {
__m128 vacc0x0c4 = _mm_load_ss(w);
__m128 vacc0x1c4 = _mm_load_ss(w + 1);
__m128 vacc1x0c4 = vacc0x0c4;
__m128 vacc1x1c4 = vacc0x1c4;
__m128 vacc2x0c4 = vacc0x0c4;
__m128 vacc2x1c4 = vacc0x1c4;
__m128 vacc3x0c4 = vacc0x0c4;
__m128 vacc3x1c4 = vacc0x1c4;
w += 2;
size_t p = ks;
do {
const float* restrict a0 = a[0];
assert(a0 != NULL);
if XNN_UNPREDICTABLE(a0 != zero) {
a0 = (const float*) ((uintptr_t) a0 + a_offset);
}
const float* restrict a1 = a[1];
assert(a1 != NULL);
if XNN_UNPREDICTABLE(a1 != zero) {
a1 = (const float*) ((uintptr_t) a1 + a_offset);
}
const float* restrict a2 = a[2];
assert(a2 != NULL);
if XNN_UNPREDICTABLE(a2 != zero) {
a2 = (const float*) ((uintptr_t) a2 + a_offset);
}
const float* restrict a3 = a[3];
assert(a3 != NULL);
if XNN_UNPREDICTABLE(a3 != zero) {
a3 = (const float*) ((uintptr_t) a3 + a_offset);
}
a += 4;
size_t k = kc;
for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
const __m128 va0 = _mm_loadu_ps(a0);
a0 += 4;
const __m128 va1 = _mm_loadu_ps(a1);
a1 += 4;
const __m128 va2 = _mm_loadu_ps(a2);
a2 += 4;
const __m128 va3 = _mm_loadu_ps(a3);
a3 += 4;
const __m128 vb0 = _mm_loadu_ps(w);
const __m128 vb1 = _mm_loadu_ps(w + 4);
w += 8;
vacc0x0c4 = _mm_add_ps(vacc0x0c4, _mm_mul_ps(va0, vb0));
vacc0x1c4 = _mm_add_ps(vacc0x1c4, _mm_mul_ps(va0, vb1));
vacc1x0c4 = _mm_add_ps(vacc1x0c4, _mm_mul_ps(va1, vb0));
vacc1x1c4 = _mm_add_ps(vacc1x1c4, _mm_mul_ps(va1, vb1));
vacc2x0c4 = _mm_add_ps(vacc2x0c4, _mm_mul_ps(va2, vb0));
vacc2x1c4 = _mm_add_ps(vacc2x1c4, _mm_mul_ps(va2, vb1));
vacc3x0c4 = _mm_add_ps(vacc3x0c4, _mm_mul_ps(va3, vb0));
vacc3x1c4 = _mm_add_ps(vacc3x1c4, _mm_mul_ps(va3, vb1));
}
if XNN_UNLIKELY(k != 0) {
const __m128 va0 = _mm_loadu_ps(a0);
const __m128 va1 = _mm_loadu_ps(a1);
const __m128 va2 = _mm_loadu_ps(a2);
const __m128 va3 = _mm_loadu_ps(a3);
const __m128 vb0 = _mm_loadu_ps(w);
const __m128 vb1 = _mm_loadu_ps(w + 4);
w += 8;
const __m128 vmask0 = _mm_cmpeq_ps(_mm_setzero_ps(), vb0);
const __m128 vmask1 = _mm_cmpeq_ps(_mm_setzero_ps(), vb1);
vacc0x0c4 = _mm_add_ps(vacc0x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va0), vb0));
vacc0x1c4 = _mm_add_ps(vacc0x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va0), vb1));
vacc1x0c4 = _mm_add_ps(vacc1x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va1), vb0));
vacc1x1c4 = _mm_add_ps(vacc1x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va1), vb1));
vacc2x0c4 = _mm_add_ps(vacc2x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va2), vb0));
vacc2x1c4 = _mm_add_ps(vacc2x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va2), vb1));
vacc3x0c4 = _mm_add_ps(vacc3x0c4, _mm_mul_ps(_mm_andnot_ps(vmask0, va3), vb0));
vacc3x1c4 = _mm_add_ps(vacc3x1c4, _mm_mul_ps(_mm_andnot_ps(vmask1, va3), vb1));
}
p -= 4 * sizeof(void*);
} while (p != 0);
const __m128 vacc0x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc0x0c4, vacc0x1c4), _mm_unpackhi_ps(vacc0x0c4, vacc0x1c4));
const __m128 vacc1x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc1x0c4, vacc1x1c4), _mm_unpackhi_ps(vacc1x0c4, vacc1x1c4));
const __m128 vacc2x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc2x0c4, vacc2x1c4), _mm_unpackhi_ps(vacc2x0c4, vacc2x1c4));
const __m128 vacc3x01c2 = _mm_add_ps(_mm_unpacklo_ps(vacc3x0c4, vacc3x1c4), _mm_unpackhi_ps(vacc3x0c4, vacc3x1c4));
__m128 vacc01x01 = _mm_add_ps(_mm_movelh_ps(vacc0x01c2, vacc1x01c2), _mm_movehl_ps(vacc1x01c2, vacc0x01c2));
__m128 vacc23x01 = _mm_add_ps(_mm_movelh_ps(vacc2x01c2, vacc3x01c2), _mm_movehl_ps(vacc3x01c2, vacc2x01c2));
const __m128 vmax = _mm_load_ps(params->sse.max);
vacc01x01 = _mm_min_ps(vacc01x01, vmax);
vacc23x01 = _mm_min_ps(vacc23x01, vmax);
const __m128 vmin = _mm_load_ps(params->sse.min);
vacc01x01 = _mm_max_ps(vacc01x01, vmin);
vacc23x01 = _mm_max_ps(vacc23x01, vmin);
if XNN_LIKELY(nc >= 2) {
_mm_storeh_pi((__m64*) c3, vacc23x01);
c3 = (float*) ((uintptr_t) c3 + cn_stride);
_mm_storel_pi((__m64*) c2, vacc23x01);
c2 = (float*) ((uintptr_t) c2 + cn_stride);
_mm_storeh_pi((__m64*) c1, vacc01x01);
c1 = (float*) ((uintptr_t) c1 + cn_stride);
_mm_storel_pi((__m64*) c0, vacc01x01);
c0 = (float*) ((uintptr_t) c0 + cn_stride);
a = (const float**restrict) ((uintptr_t) a - ks);
nc -= 2;
} else {
assert(nc == 1);
_mm_store_ss(c3, _mm_movehl_ps(vacc23x01, vacc23x01));
_mm_store_ss(c2, vacc23x01);
_mm_store_ss(c1, _mm_movehl_ps(vacc01x01, vacc01x01));
_mm_store_ss(c0, vacc01x01);
nc = 0;
}
} while (nc != 0);
}
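// IGEMM variant of the 4x8 load1 microkernel: 4 row pointers per ks step, one
// broadcast element per row per k step, and a clamped 4x8 tile stored row by
// row with 4/2/1-column remainders.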
void xnn_f32_igemm_minmax_ukernel_4x8__sse_load1(
size_t mr,
size_t nc,
size_t kc,
size_t ks,
const float**restrict a,
const float*restrict w,
float*restrict c,
size_t cm_stride,
size_t cn_stride,
size_t a_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
assert(mr != 0);
assert(mr <= 4);
assert(nc != 0);
assert(kc != 0);
assert(kc % sizeof(float) == 0);
assert(ks != 0);
assert(ks % (4 * sizeof(void*)) == 0);
assert(a_offset % sizeof(float) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
float* c0 = c;
float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
if XNN_UNPREDICTABLE(mr < 2) {
c1 = c0;
}
float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
if XNN_UNPREDICTABLE(mr <= 2) {
c2 = c1;
}
float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
if XNN_UNPREDICTABLE(mr != 4) {
c3 = c2;
}
do {
__m128 vacc0x0123 = _mm_load_ps(w);
__m128 vacc0x4567 = _mm_load_ps(w + 4);
__m128 vacc1x0123 = vacc0x0123;
__m128 vacc1x4567 = vacc0x4567;
__m128 vacc2x0123 = vacc0x0123;
__m128 vacc2x4567 = vacc0x4567;
__m128 vacc3x0123 = vacc0x0123;
__m128 vacc3x4567 = vacc0x4567;
w += 8;
size_t p = ks;
do {
const float* restrict a0 = a[0];
assert(a0 != NULL);
if XNN_UNPREDICTABLE(a0 != zero) {
a0 = (const float*) ((uintptr_t) a0 + a_offset);
}
const float* restrict a1 = a[1];
assert(a1 != NULL);
if XNN_UNPREDICTABLE(a1 != zero) {
a1 = (const float*) ((uintptr_t) a1 + a_offset);
}
const float* restrict a2 = a[2];
assert(a2 != NULL);
if XNN_UNPREDICTABLE(a2 != zero) {
a2 = (const float*) ((uintptr_t) a2 + a_offset);
}
const float* restrict a3 = a[3];
assert(a3 != NULL);
if XNN_UNPREDICTABLE(a3 != zero) {
a3 = (const float*) ((uintptr_t) a3 + a_offset);
}
a += 4;
size_t k = kc;
do {
const __m128 vb0123 = _mm_load_ps(w);
const __m128 vb4567 = _mm_load_ps(w + 4);
w += 8;
const __m128 va0 = _mm_load1_ps(a0);
a0 += 1;
const __m128 va1 = _mm_load1_ps(a1);
a1 += 1;
const __m128 va2 = _mm_load1_ps(a2);
a2 += 1;
const __m128 va3 = _mm_load1_ps(a3);
a3 += 1;
vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123));
vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567));
vacc1x0123 = _mm_add_ps(vacc1x0123, _mm_mul_ps(va1, vb0123));
vacc1x4567 = _mm_add_ps(vacc1x4567, _mm_mul_ps(va1, vb4567));
vacc2x0123 = _mm_add_ps(vacc2x0123, _mm_mul_ps(va2, vb0123));
vacc2x4567 = _mm_add_ps(vacc2x4567, _mm_mul_ps(va2, vb4567));
vacc3x0123 = _mm_add_ps(vacc3x0123, _mm_mul_ps(va3, vb0123));
vacc3x4567 = _mm_add_ps(vacc3x4567, _mm_mul_ps(va3, vb4567));
k -= sizeof(float);
} while (k != 0);
p -= 4 * sizeof(void*);
} while (p != 0);
const __m128 vmax = _mm_load_ps(params->sse.max);
vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
vacc2x0123 = _mm_min_ps(vacc2x0123, vmax);
vacc3x0123 = _mm_min_ps(vacc3x0123, vmax);
vacc0x4567 = _mm_min_ps(vacc0x4567, vmax);
vacc1x4567 = _mm_min_ps(vacc1x4567, vmax);
vacc2x4567 = _mm_min_ps(vacc2x4567, vmax);
vacc3x4567 = _mm_min_ps(vacc3x4567, vmax);
const __m128 vmin = _mm_load_ps(params->sse.min);
vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
vacc2x0123 = _mm_max_ps(vacc2x0123, vmin);
vacc3x0123 = _mm_max_ps(vacc3x0123, vmin);
vacc0x4567 = _mm_max_ps(vacc0x4567, vmin);
vacc1x4567 = _mm_max_ps(vacc1x4567, vmin);
vacc2x4567 = _mm_max_ps(vacc2x4567, vmin);
vacc3x4567 = _mm_max_ps(vacc3x4567, vmin);
if XNN_LIKELY(nc >= 8) {
_mm_storeu_ps(c3, vacc3x0123);
_mm_storeu_ps(c3 + 4, vacc3x4567);
c3 = (float*) ((uintptr_t) c3 + cn_stride);
_mm_storeu_ps(c2, vacc2x0123);
_mm_storeu_ps(c2 + 4, vacc2x4567);
c2 = (float*) ((uintptr_t) c2 + cn_stride);
_mm_storeu_ps(c1, vacc1x0123);
_mm_storeu_ps(c1 + 4, vacc1x4567);
c1 = (float*) ((uintptr_t) c1 + cn_stride);
_mm_storeu_ps(c0, vacc0x0123);
_mm_storeu_ps(c0 + 4, vacc0x4567);
c0 = (float*) ((uintptr_t) c0 + cn_stride);
a = (const float**restrict) ((uintptr_t) a - ks);
nc -= 8;
} else {
if (nc & 4) {
_mm_storeu_ps(c3, vacc3x0123);
_mm_storeu_ps(c2, vacc2x0123);
_mm_storeu_ps(c1, vacc1x0123);
_mm_storeu_ps(c0, vacc0x0123);
vacc3x0123 = vacc3x4567;
vacc2x0123 = vacc2x4567;
vacc1x0123 = vacc1x4567;
vacc0x0123 = vacc0x4567;
c3 += 4;
c2 += 4;
c1 += 4;
c0 += 4;
}
if (nc & 2) {
_mm_storel_pi((__m64*) c3, vacc3x0123);
_mm_storel_pi((__m64*) c2, vacc2x0123);
_mm_storel_pi((__m64*) c1, vacc1x0123);
_mm_storel_pi((__m64*) c0, vacc0x0123);
vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
c3 += 2;
c2 += 2;
c1 += 2;
c0 += 2;
}
if (nc & 1) {
_mm_store_ss(c3, vacc3x0123);
_mm_store_ss(c2, vacc2x0123);
_mm_store_ss(c1, vacc1x0123);
_mm_store_ss(c0, vacc0x0123);
}
nc = 0;
}
} while (nc != 0);
}
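// Max pooling microkernel: the first pass takes the element-wise maximum of up
// to 9 pooling elements and writes it to the output row; each following pass
// folds up to 8 more elements into the values already stored there. Channels
// are processed 4 at a time with a 2/1 remainder, and results are clamped to
// [output_min, output_max].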
void xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4(
size_t output_pixels,
size_t kernel_elements,
size_t channels,
const float** input,
size_t input_offset,
float* output,
size_t input_increment,
size_t output_increment,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(output_pixels != 0);
assert(kernel_elements != 0);
assert(channels != 0);
const __m128 voutput_max = _mm_load_ps(params->sse.max);
const __m128 voutput_min = _mm_load_ps(params->sse.min);
do {
float* o = output;
{
const float* i0 = *input++;
const float* i1 = *input++;
const float* i2 = *input++;
const float* i3 = *input++;
const float* i4 = *input++;
const float* i5 = *input++;
const float* i6 = *input++;
const float* i7 = *input++;
const float* i8 = *input++;
i0 = (const float*) ((uintptr_t) i0 + input_offset);
i1 = (const float*) ((uintptr_t) i1 + input_offset);
i2 = (const float*) ((uintptr_t) i2 + input_offset);
i3 = (const float*) ((uintptr_t) i3 + input_offset);
i4 = (const float*) ((uintptr_t) i4 + input_offset);
i5 = (const float*) ((uintptr_t) i5 + input_offset);
i6 = (const float*) ((uintptr_t) i6 + input_offset);
i7 = (const float*) ((uintptr_t) i7 + input_offset);
i8 = (const float*) ((uintptr_t) i8 + input_offset);
if (kernel_elements < 2) {
i1 = i0;
}
if (kernel_elements <= 2) {
i2 = i0;
}
if (kernel_elements < 4) {
i3 = i0;
}
if (kernel_elements <= 4) {
i4 = i0;
}
if (kernel_elements < 6) {
i5 = i0;
}
if (kernel_elements <= 6) {
i6 = i0;
}
if (kernel_elements < 8) {
i7 = i0;
}
if (kernel_elements <= 8) {
i8 = i0;
}
size_t c = channels;
for (; c >= 4; c -= 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vi2 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vi3 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vi4 = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vi5 = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vi6 = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vi7 = _mm_loadu_ps(i7);
i7 += 4;
const __m128 vi8 = _mm_loadu_ps(i8);
i8 += 4;
const __m128 vmax018 = _mm_max_ps(_mm_max_ps(vi0, vi1), vi8);
const __m128 vmax23 = _mm_max_ps(vi2, vi3);
const __m128 vmax45 = _mm_max_ps(vi4, vi5);
const __m128 vmax67 = _mm_max_ps(vi6, vi7);
const __m128 vmax2345 = _mm_max_ps(vmax23, vmax45);
const __m128 vmax01678 = _mm_max_ps(vmax018, vmax67);
const __m128 vmax = _mm_max_ps(vmax2345, vmax01678);
const __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
_mm_storeu_ps(o, vout);
o += 4;
}
if (c != 0) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vi2 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vi3 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vi4 = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vi5 = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vi6 = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vi7 = _mm_loadu_ps(i7);
i7 += 4;
const __m128 vi8 = _mm_loadu_ps(i8);
i8 += 4;
const __m128 vmax018 = _mm_max_ps(_mm_max_ps(vi0, vi1), vi8);
const __m128 vmax23 = _mm_max_ps(vi2, vi3);
const __m128 vmax45 = _mm_max_ps(vi4, vi5);
const __m128 vmax67 = _mm_max_ps(vi6, vi7);
const __m128 vmax2345 = _mm_max_ps(vmax23, vmax45);
const __m128 vmax01678 = _mm_max_ps(vmax018, vmax67);
const __m128 vmax = _mm_max_ps(vmax2345, vmax01678);
__m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
if (c & 2) {
_mm_storel_pi((__m64*) o, vout);
o += 2;
vout = _mm_movehl_ps(vout, vout);
}
if (c & 1) {
_mm_store_ss(o, vout);
o += 1;
}
}
}
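// Subsequent passes: fold up to 8 more pooling elements into the partial maxima
// already stored in the output row.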
for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
const float* i0 = *input++;
const float* i1 = *input++;
const float* i2 = *input++;
const float* i3 = *input++;
const float* i4 = *input++;
const float* i5 = *input++;
const float* i6 = *input++;
const float* i7 = *input++;
i0 = (const float*) ((uintptr_t) i0 + input_offset);
i1 = (const float*) ((uintptr_t) i1 + input_offset);
i2 = (const float*) ((uintptr_t) i2 + input_offset);
i3 = (const float*) ((uintptr_t) i3 + input_offset);
i4 = (const float*) ((uintptr_t) i4 + input_offset);
i5 = (const float*) ((uintptr_t) i5 + input_offset);
i6 = (const float*) ((uintptr_t) i6 + input_offset);
i7 = (const float*) ((uintptr_t) i7 + input_offset);
if (k < 2) {
i1 = i0;
}
if (k <= 2) {
i2 = i0;
}
if (k < 4) {
i3 = i0;
}
if (k <= 4) {
i4 = i0;
}
if (k < 6) {
i5 = i0;
}
if (k <= 6) {
i6 = i0;
}
if (k < 8) {
i7 = i0;
}
o = output;
size_t c = channels;
for (; c >= 4; c -= 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vi2 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vi3 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vi4 = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vi5 = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vi6 = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vi7 = _mm_loadu_ps(i7);
i7 += 4;
const __m128 vo = _mm_loadu_ps(o);
const __m128 vmax01 = _mm_max_ps(_mm_max_ps(vi0, vi1), vo);
const __m128 vmax23 = _mm_max_ps(vi2, vi3);
const __m128 vmax45 = _mm_max_ps(vi4, vi5);
const __m128 vmax67 = _mm_max_ps(vi6, vi7);
const __m128 vmax2345 = _mm_max_ps(vmax23, vmax45);
const __m128 vmax0167 = _mm_max_ps(vmax01, vmax67);
const __m128 vmax = _mm_max_ps(vmax2345, vmax0167);
const __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
_mm_storeu_ps(o, vout);
o += 4;
}
if (c != 0) {
const __m128 vi0 = _mm_loadu_ps(i0);
const __m128 vi1 = _mm_loadu_ps(i1);
const __m128 vi2 = _mm_loadu_ps(i2);
const __m128 vi3 = _mm_loadu_ps(i3);
const __m128 vi4 = _mm_loadu_ps(i4);
const __m128 vi5 = _mm_loadu_ps(i5);
const __m128 vi6 = _mm_loadu_ps(i6);
const __m128 vi7 = _mm_loadu_ps(i7);
const __m128 vo = _mm_loadu_ps(o);
const __m128 vmax01 = _mm_max_ps(_mm_max_ps(vi0, vi1), vo);
const __m128 vmax23 = _mm_max_ps(vi2, vi3);
const __m128 vmax45 = _mm_max_ps(vi4, vi5);
const __m128 vmax67 = _mm_max_ps(vi6, vi7);
const __m128 vmax2345 = _mm_max_ps(vmax23, vmax45);
const __m128 vmax0167 = _mm_max_ps(vmax01, vmax67);
const __m128 vmax = _mm_max_ps(vmax2345, vmax0167);
__m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
if (c & 2) {
_mm_storel_pi((__m64*) o, vout);
o += 2;
vout = _mm_movehl_ps(vout, vout);
}
if (c & 1) {
_mm_store_ss(o, vout);
o += 1;
}
}
}
input = (const float**) ((uintptr_t) input + input_increment);
output = (float*) ((uintptr_t) o + output_increment);
} while (--output_pixels != 0);
}
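// Padded average pooling microkernel for windows larger than 9 elements: sums
// are accumulated into the scratch buffer over a first pass of 9 elements and
// intermediate passes of 8 elements each; the final pass adds the remaining
// elements, scales by the caller-supplied per-pixel multiplier, clamps to
// [output_min, output_max], and stores.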
void xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4(
size_t output_pixels,
size_t kernel_elements,
size_t channels,
const float** input,
size_t input_offset,
const float* zero,
const float* multiplier,
float* buffer,
float* output,
size_t input_increment,
size_t output_increment,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(output_pixels != 0);
assert(kernel_elements > 9);
assert(channels != 0);
const __m128 voutput_min = _mm_load_ps(params->sse.min);
const __m128 voutput_max = _mm_load_ps(params->sse.max);
do {
{
const float* i0 = *input++;
assert(i0 != NULL);
if XNN_UNPREDICTABLE(i0 != zero) {
i0 = (const float*) ((uintptr_t) i0 + input_offset);
}
const float* i1 = *input++;
assert(i1 != NULL);
if XNN_UNPREDICTABLE(i1 != zero) {
i1 = (const float*) ((uintptr_t) i1 + input_offset);
}
const float* i2 = *input++;
assert(i2 != NULL);
if XNN_UNPREDICTABLE(i2 != zero) {
i2 = (const float*) ((uintptr_t) i2 + input_offset);
}
const float* i3 = *input++;
assert(i3 != NULL);
if XNN_UNPREDICTABLE(i3 != zero) {
i3 = (const float*) ((uintptr_t) i3 + input_offset);
}
const float* i4 = *input++;
assert(i4 != NULL);
if XNN_UNPREDICTABLE(i4 != zero) {
i4 = (const float*) ((uintptr_t) i4 + input_offset);
}
const float* i5 = *input++;
assert(i5 != NULL);
if XNN_UNPREDICTABLE(i5 != zero) {
i5 = (const float*) ((uintptr_t) i5 + input_offset);
}
const float* i6 = *input++;
assert(i6 != NULL);
if XNN_UNPREDICTABLE(i6 != zero) {
i6 = (const float*) ((uintptr_t) i6 + input_offset);
}
const float* i7 = *input++;
assert(i7 != NULL);
if XNN_UNPREDICTABLE(i7 != zero) {
i7 = (const float*) ((uintptr_t) i7 + input_offset);
}
const float* i8 = *input++;
assert(i8 != NULL);
if XNN_UNPREDICTABLE(i8 != zero) {
i8 = (const float*) ((uintptr_t) i8 + input_offset);
}
float* b = buffer;
for (size_t c = 0; c < channels; c += 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vi2 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vi3 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vi4 = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vi5 = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vi6 = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vi7 = _mm_loadu_ps(i7);
i7 += 4;
const __m128 vi8 = _mm_loadu_ps(i8);
i8 += 4;
const __m128 vsum01 = _mm_add_ps(vi0, vi1);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum67 = _mm_add_ps(vi6, vi7);
const __m128 vsum018 = _mm_add_ps(vsum01, vi8);
const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
_mm_store_ps(b, vsum); b += 4;
}
}
size_t k = kernel_elements;
for (k -= 9; k > 8; k -= 8) {
const float* i0 = *input++;
assert(i0 != NULL);
if XNN_UNPREDICTABLE(i0 != zero) {
i0 = (const float*) ((uintptr_t) i0 + input_offset);
}
const float* i1 = *input++;
assert(i1 != NULL);
if XNN_UNPREDICTABLE(i1 != zero) {
i1 = (const float*) ((uintptr_t) i1 + input_offset);
}
const float* i2 = *input++;
assert(i2 != NULL);
if XNN_UNPREDICTABLE(i2 != zero) {
i2 = (const float*) ((uintptr_t) i2 + input_offset);
}
const float* i3 = *input++;
assert(i3 != NULL);
if XNN_UNPREDICTABLE(i3 != zero) {
i3 = (const float*) ((uintptr_t) i3 + input_offset);
}
const float* i4 = *input++;
assert(i4 != NULL);
if XNN_UNPREDICTABLE(i4 != zero) {
i4 = (const float*) ((uintptr_t) i4 + input_offset);
}
const float* i5 = *input++;
assert(i5 != NULL);
if XNN_UNPREDICTABLE(i5 != zero) {
i5 = (const float*) ((uintptr_t) i5 + input_offset);
}
const float* i6 = *input++;
assert(i6 != NULL);
if XNN_UNPREDICTABLE(i6 != zero) {
i6 = (const float*) ((uintptr_t) i6 + input_offset);
}
const float* i7 = *input++;
assert(i7 != NULL);
if XNN_UNPREDICTABLE(i7 != zero) {
i7 = (const float*) ((uintptr_t) i7 + input_offset);
}
float* b = buffer;
for (size_t c = 0; c < channels; c += 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vi2 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vi3 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vi4 = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vi5 = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vi6 = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vi7 = _mm_loadu_ps(i7);
i7 += 4;
const __m128 vacc = _mm_load_ps(b);
const __m128 vsum01 = _mm_add_ps(vi0, vi1);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum67 = _mm_add_ps(vi6, vi7);
const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
_mm_store_ps(b, vsum); b += 4;
}
}
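// Final pass: add the remaining (at most 8) pooling elements to the buffered
// sums, scale by the per-pixel multiplier, clamp, and store.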
{
const float* i0 = input[0];
assert(i0 != NULL);
const float* i1 = input[1];
const float* i2 = input[2];
const float* i3 = input[3];
const float* i4 = input[4];
const float* i5 = input[5];
const float* i6 = input[6];
const float* i7 = input[7];
input = (const float**) ((uintptr_t) input + input_increment);
if (k < 2) {
i1 = zero;
}
assert(i1 != NULL);
if (k <= 2) {
i2 = zero;
}
assert(i2 != NULL);
if (k < 4) {
i3 = zero;
}
assert(i3 != NULL);
if (k <= 4) {
i4 = zero;
}
assert(i4 != NULL);
if (k < 6) {
i5 = zero;
}
assert(i5 != NULL);
if (k <= 6) {
i6 = zero;
}
assert(i6 != NULL);
if (k < 8) {
i7 = zero;
}
assert(i7 != NULL);
if XNN_UNPREDICTABLE(i0 != zero) {
i0 = (const float*) ((uintptr_t) i0 + input_offset);
}
if XNN_UNPREDICTABLE(i1 != zero) {
i1 = (const float*) ((uintptr_t) i1 + input_offset);
}
if XNN_UNPREDICTABLE(i2 != zero) {
i2 = (const float*) ((uintptr_t) i2 + input_offset);
}
if XNN_UNPREDICTABLE(i3 != zero) {
i3 = (const float*) ((uintptr_t) i3 + input_offset);
}
if XNN_UNPREDICTABLE(i4 != zero) {
i4 = (const float*) ((uintptr_t) i4 + input_offset);
}
if XNN_UNPREDICTABLE(i5 != zero) {
i5 = (const float*) ((uintptr_t) i5 + input_offset);
}
if XNN_UNPREDICTABLE(i6 != zero) {
i6 = (const float*) ((uintptr_t) i6 + input_offset);
}
if XNN_UNPREDICTABLE(i7 != zero) {
i7 = (const float*) ((uintptr_t) i7 + input_offset);
}
const __m128 vmultiplier = _mm_load1_ps(multiplier);
multiplier += 1;
size_t c = channels;
float* b = buffer;
while (c >= 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vi2 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vi3 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vi4 = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vi5 = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vi6 = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vi7 = _mm_loadu_ps(i7);
i7 += 4;
const __m128 vacc = _mm_load_ps(b);
b += 4;
const __m128 vsum01 = _mm_add_ps(vi0, vi1);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum67 = _mm_add_ps(vi6, vi7);
const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
__m128 vout = _mm_mul_ps(vsum, vmultiplier);
vout = _mm_max_ps(vout, voutput_min);
vout = _mm_min_ps(vout, voutput_max);
_mm_storeu_ps(output, vout);
output += 4;
c -= 4;
}
if (c != 0) {
const __m128 vi0 = _mm_loadu_ps(i0);
const __m128 vi1 = _mm_loadu_ps(i1);
const __m128 vi2 = _mm_loadu_ps(i2);
const __m128 vi3 = _mm_loadu_ps(i3);
const __m128 vi4 = _mm_loadu_ps(i4);
const __m128 vi5 = _mm_loadu_ps(i5);
const __m128 vi6 = _mm_loadu_ps(i6);
const __m128 vi7 = _mm_loadu_ps(i7);
const __m128 vacc = _mm_load_ps(b);
const __m128 vsum01 = _mm_add_ps(vi0, vi1);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum67 = _mm_add_ps(vi6, vi7);
const __m128 vsum01a = _mm_add_ps(vsum01, vacc);
const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
const __m128 vsum0167a = _mm_add_ps(vsum01a, vsum67);
const __m128 vsum = _mm_add_ps(vsum2345, vsum0167a);
__m128 vout = _mm_mul_ps(vsum, vmultiplier);
vout = _mm_max_ps(vout, voutput_min);
vout = _mm_min_ps(vout, voutput_max);
if (c & 2) {
_mm_storel_pi((__m64*) output, vout);
vout = _mm_movehl_ps(vout, vout);
output += 2;
}
if (c & 1) {
_mm_store_ss(output, vout);
output += 1;
}
}
}
output = (float*) ((uintptr_t) output + output_increment);
} while (--output_pixels != 0);
}
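// Single-pass variant of the padded average pooling microkernel for windows of
// at most 9 elements: unused element pointers are redirected to the zero
// buffer, and the 9-way sum is scaled by the per-pixel multiplier, clamped, and
// stored 4 channels at a time with a 2/1 remainder.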
void xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4(
size_t output_pixels,
size_t kernel_elements,
size_t channels,
const float** input,
size_t input_offset,
const float* zero,
const float* multiplier,
float* output,
size_t input_increment,
size_t output_increment,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(output_pixels != 0);
assert(kernel_elements != 0);
assert(kernel_elements <= 9);
assert(channels != 0);
const __m128 voutput_min = _mm_load_ps(params->sse.min);
const __m128 voutput_max = _mm_load_ps(params->sse.max);
do {
const float* i0 = input[0];
assert(i0 != NULL);
const float* i1 = input[1];
const float* i2 = input[2];
const float* i3 = input[3];
const float* i4 = input[4];
const float* i5 = input[5];
const float* i6 = input[6];
const float* i7 = input[7];
const float* i8 = input[8];
input = (const float**) ((uintptr_t) input + input_increment);
if (kernel_elements < 2) {
i1 = zero;
}
assert(i1 != NULL);
if (kernel_elements <= 2) {
i2 = zero;
}
assert(i2 != NULL);
if (kernel_elements < 4) {
i3 = zero;
}
assert(i3 != NULL);
if (kernel_elements <= 4) {
i4 = zero;
}
assert(i4 != NULL);
if (kernel_elements < 6) {
i5 = zero;
}
assert(i5 != NULL);
if (kernel_elements <= 6) {
i6 = zero;
}
assert(i6 != NULL);
if (kernel_elements < 8) {
i7 = zero;
}
assert(i7 != NULL);
if (kernel_elements <= 8) {
i8 = zero;
}
assert(i8 != NULL);
if XNN_UNPREDICTABLE(i0 != zero) {
i0 = (const float*) ((uintptr_t) i0 + input_offset);
}
if XNN_UNPREDICTABLE(i1 != zero) {
i1 = (const float*) ((uintptr_t) i1 + input_offset);
}
if XNN_UNPREDICTABLE(i2 != zero) {
i2 = (const float*) ((uintptr_t) i2 + input_offset);
}
if XNN_UNPREDICTABLE(i3 != zero) {
i3 = (const float*) ((uintptr_t) i3 + input_offset);
}
if XNN_UNPREDICTABLE(i4 != zero) {
i4 = (const float*) ((uintptr_t) i4 + input_offset);
}
if XNN_UNPREDICTABLE(i5 != zero) {
i5 = (const float*) ((uintptr_t) i5 + input_offset);
}
if XNN_UNPREDICTABLE(i6 != zero) {
i6 = (const float*) ((uintptr_t) i6 + input_offset);
}
if XNN_UNPREDICTABLE(i7 != zero) {
i7 = (const float*) ((uintptr_t) i7 + input_offset);
}
if XNN_UNPREDICTABLE(i8 != zero) {
i8 = (const float*) ((uintptr_t) i8 + input_offset);
}
const __m128 vmultiplier = _mm_load1_ps(multiplier);
multiplier += 1;
size_t c = channels;
while (c >= 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
i1 += 4;
const __m128 vi2 = _mm_loadu_ps(i2);
i2 += 4;
const __m128 vi3 = _mm_loadu_ps(i3);
i3 += 4;
const __m128 vi4 = _mm_loadu_ps(i4);
i4 += 4;
const __m128 vi5 = _mm_loadu_ps(i5);
i5 += 4;
const __m128 vi6 = _mm_loadu_ps(i6);
i6 += 4;
const __m128 vi7 = _mm_loadu_ps(i7);
i7 += 4;
const __m128 vi8 = _mm_loadu_ps(i8);
i8 += 4;
const __m128 vsum018 = _mm_add_ps(_mm_add_ps(vi0, vi1), vi8);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum67 = _mm_add_ps(vi6, vi7);
const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
__m128 vout = _mm_mul_ps(vsum, vmultiplier);
vout = _mm_max_ps(vout, voutput_min);
vout = _mm_min_ps(vout, voutput_max);
_mm_storeu_ps(output, vout); output += 4;
c -= 4;
}
if (c != 0) {
const __m128 vi0 = _mm_loadu_ps(i0);
const __m128 vi1 = _mm_loadu_ps(i1);
const __m128 vi2 = _mm_loadu_ps(i2);
const __m128 vi3 = _mm_loadu_ps(i3);
const __m128 vi4 = _mm_loadu_ps(i4);
const __m128 vi5 = _mm_loadu_ps(i5);
const __m128 vi6 = _mm_loadu_ps(i6);
const __m128 vi7 = _mm_loadu_ps(i7);
const __m128 vi8 = _mm_loadu_ps(i8);
const __m128 vsum01 = _mm_add_ps(vi0, vi1);
const __m128 vsum23 = _mm_add_ps(vi2, vi3);
const __m128 vsum45 = _mm_add_ps(vi4, vi5);
const __m128 vsum67 = _mm_add_ps(vi6, vi7);
const __m128 vsum018 = _mm_add_ps(vsum01, vi8);
const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45);
const __m128 vsum01678 = _mm_add_ps(vsum018, vsum67);
const __m128 vsum = _mm_add_ps(vsum2345, vsum01678);
__m128 vout = _mm_mul_ps(vsum, vmultiplier);
vout = _mm_max_ps(vout, voutput_min);
vout = _mm_min_ps(vout, voutput_max);
if (c & 2) {
_mm_storel_pi((__m64*) output, vout);
vout = _mm_movehl_ps(vout, vout);
output += 2;
}
if (c & 1) {
_mm_store_ss(output, vout);
output += 1;
}
}
output = (float*) ((uintptr_t) output + output_increment);
} while (--output_pixels != 0);
}
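// Horizontal maximum reduction over n bytes of floats: the main loop keeps four
// vector accumulators and consumes 16 floats per iteration, then 4 at a time,
// before the lanes are folded down to a single scalar maximum; a scalar tail
// handles the last 1-3 elements.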
void xnn_f32_rmax_ukernel__sse(
size_t n,
const float* x,
float* y)
{
assert(n != 0);
assert(n % sizeof(float) == 0);
__m128 vmax0 = _mm_load_ss(x);
vmax0 = _mm_shuffle_ps(vmax0, vmax0, _MM_SHUFFLE(0, 0, 0, 0));
__m128 vmax1 = vmax0;
__m128 vmax2 = vmax0;
__m128 vmax3 = vmax0;
for (; n >= 64; n -= 64) {
const __m128 vx0 = _mm_loadu_ps(x);
const __m128 vx1 = _mm_loadu_ps(x + 4);
const __m128 vx2 = _mm_loadu_ps(x + 8);
const __m128 vx3 = _mm_loadu_ps(x + 12);
x += 16;
vmax0 = _mm_max_ps(vmax0, vx0);
vmax1 = _mm_max_ps(vmax1, vx1);
vmax2 = _mm_max_ps(vmax2, vx2);
vmax3 = _mm_max_ps(vmax3, vx3);
}
__m128 vmax = _mm_max_ps(_mm_max_ps(vmax0, vmax1), _mm_max_ps(vmax2, vmax3));
for (; n >= 16; n -= 16) {
const __m128 vx = _mm_loadu_ps(x);
vmax = _mm_max_ps(vmax, vx);
x += 4;
}
__m128 vmax_lo = _mm_max_ps(vmax, _mm_movehl_ps(vmax, vmax));
vmax_lo = _mm_max_ss(vmax_lo, _mm_shuffle_ps(vmax_lo, vmax_lo, _MM_SHUFFLE(3, 3, 1, 1)));
if XNN_UNLIKELY(n != 0) {
do {
vmax_lo = _mm_max_ss(vmax_lo, _mm_load_ss(x));
x += 1;
n -= 4;
} while (n != 0);
}
_mm_store_ss(y, vmax_lo);
}
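// Sparse GEMM (SpMM) microkernel: multiplies a sparse weight matrix by a dense
// input, processing 32 elements of the dense (mc) dimension per iteration. For
// each of the nc output channels, nidx_nnzmap gives the number of non-zero
// weights and widx_dmap gives the byte increment applied to the input pointer
// after each of them; results are clamped to [min, max]. The mc remainder is
// handled by the progressively narrower blocks (16, 8, ... elements) below.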
void xnn_f32_spmm_minmax_ukernel_32x1__sse(
size_t mc,
size_t nc,
const float*restrict input,
const float*restrict weights,
const int32_t*restrict widx_dmap,
const uint32_t*restrict nidx_nnzmap,
float*restrict output,
size_t output_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
assert(mc != 0);
assert(mc % sizeof(float) == 0);
assert(nc != 0);
const __m128 vmin = _mm_load_ps(params->sse.min);
const __m128 vmax = _mm_load_ps(params->sse.max);
size_t output_decrement = output_stride * nc - 32 * sizeof(float);
while XNN_LIKELY(mc >= 32 * sizeof(float)) {
const float*restrict w = weights;
const int32_t* dmap = widx_dmap;
const uint32_t* nnzmap = nidx_nnzmap;
size_t n = nc;
do {
uint32_t nnz = *nnzmap++;
__m128 vacc0123 = _mm_load1_ps(w); w += 1;
__m128 vacc4567 = vacc0123;
__m128 vacc89AB = vacc0123;
__m128 vaccCDEF = vacc0123;
__m128 vaccGHIJ = vacc0123;
__m128 vaccKLMN = vacc0123;
__m128 vaccOPQR = vacc0123;
__m128 vaccSTUV = vacc0123;
if XNN_LIKELY(nnz != 0) {
do {
const intptr_t diff = *dmap++;
const __m128 vi0123 = _mm_loadu_ps(input);
const __m128 vi4567 = _mm_loadu_ps(input + 4);
const __m128 vi89AB = _mm_loadu_ps(input + 8);
const __m128 viCDEF = _mm_loadu_ps(input + 12);
const __m128 viGHIJ = _mm_loadu_ps(input + 16);
const __m128 viKLMN = _mm_loadu_ps(input + 20);
const __m128 viOPQR = _mm_loadu_ps(input + 24);
const __m128 viSTUV = _mm_loadu_ps(input + 28);
input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
const __m128 vw = _mm_load1_ps(w); w += 1;
vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
vacc4567 = _mm_add_ps(vacc4567, _mm_mul_ps(vi4567, vw));
vacc89AB = _mm_add_ps(vacc89AB, _mm_mul_ps(vi89AB, vw));
vaccCDEF = _mm_add_ps(vaccCDEF, _mm_mul_ps(viCDEF, vw));
vaccGHIJ = _mm_add_ps(vaccGHIJ, _mm_mul_ps(viGHIJ, vw));
vaccKLMN = _mm_add_ps(vaccKLMN, _mm_mul_ps(viKLMN, vw));
vaccOPQR = _mm_add_ps(vaccOPQR, _mm_mul_ps(viOPQR, vw));
vaccSTUV = _mm_add_ps(vaccSTUV, _mm_mul_ps(viSTUV, vw));
} while (--nnz != 0);
}
__m128 vout0123 = _mm_min_ps(vacc0123, vmax);
__m128 vout4567 = _mm_min_ps(vacc4567, vmax);
__m128 vout89AB = _mm_min_ps(vacc89AB, vmax);
__m128 voutCDEF = _mm_min_ps(vaccCDEF, vmax);
__m128 voutGHIJ = _mm_min_ps(vaccGHIJ, vmax);
__m128 voutKLMN = _mm_min_ps(vaccKLMN, vmax);
__m128 voutOPQR = _mm_min_ps(vaccOPQR, vmax);
__m128 voutSTUV = _mm_min_ps(vaccSTUV, vmax);
vout0123 = _mm_max_ps(vout0123, vmin);
vout4567 = _mm_max_ps(vout4567, vmin);
vout89AB = _mm_max_ps(vout89AB, vmin);
voutCDEF = _mm_max_ps(voutCDEF, vmin);
voutGHIJ = _mm_max_ps(voutGHIJ, vmin);
voutKLMN = _mm_max_ps(voutKLMN, vmin);
voutOPQR = _mm_max_ps(voutOPQR, vmin);
voutSTUV = _mm_max_ps(voutSTUV, vmin);
_mm_storeu_ps(output, vout0123);
_mm_storeu_ps(output + 4, vout4567);
_mm_storeu_ps(output + 8, vout89AB);
_mm_storeu_ps(output + 12, voutCDEF);
_mm_storeu_ps(output + 16, voutGHIJ);
_mm_storeu_ps(output + 20, voutKLMN);
_mm_storeu_ps(output + 24, voutOPQR);
_mm_storeu_ps(output + 28, voutSTUV);
output = (float*restrict) ((uintptr_t) output + output_stride);
} while (--n != 0);
output = (float*restrict) ((uintptr_t) output - output_decrement);
input += 32;
mc -= 32 * sizeof(float);
}
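// Remainder path: process any leftover mc elements in progressively smaller tiles of 16, 8, 4, 2 and 1.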
if XNN_UNLIKELY(mc != 0) {
output_decrement += 16 * sizeof(float);
if (mc & (16 * sizeof(float))) {
const float*restrict w = weights;
const int32_t* dmap = widx_dmap;
const uint32_t* nnzmap = nidx_nnzmap;
size_t n = nc;
do {
uint32_t nnz = *nnzmap++;
__m128 vacc0123 = _mm_load1_ps(w); w += 1;
__m128 vacc4567 = vacc0123;
__m128 vacc89AB = vacc0123;
__m128 vaccCDEF = vacc0123;
if XNN_LIKELY(nnz != 0) {
do {
const intptr_t diff = *dmap++;
const __m128 vi0123 = _mm_loadu_ps(input);
const __m128 vi4567 = _mm_loadu_ps(input + 4);
const __m128 vi89AB = _mm_loadu_ps(input + 8);
const __m128 viCDEF = _mm_loadu_ps(input + 12);
input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
const __m128 vw = _mm_load1_ps(w); w += 1;
vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
vacc4567 = _mm_add_ps(vacc4567, _mm_mul_ps(vi4567, vw));
vacc89AB = _mm_add_ps(vacc89AB, _mm_mul_ps(vi89AB, vw));
vaccCDEF = _mm_add_ps(vaccCDEF, _mm_mul_ps(viCDEF, vw));
} while (--nnz != 0);
}
__m128 vout0123 = _mm_min_ps(vacc0123, vmax);
__m128 vout4567 = _mm_min_ps(vacc4567, vmax);
__m128 vout89AB = _mm_min_ps(vacc89AB, vmax);
__m128 voutCDEF = _mm_min_ps(vaccCDEF, vmax);
vout0123 = _mm_max_ps(vout0123, vmin);
vout4567 = _mm_max_ps(vout4567, vmin);
vout89AB = _mm_max_ps(vout89AB, vmin);
voutCDEF = _mm_max_ps(voutCDEF, vmin);
_mm_storeu_ps(output, vout0123);
_mm_storeu_ps(output + 4, vout4567);
_mm_storeu_ps(output + 8, vout89AB);
_mm_storeu_ps(output + 12, voutCDEF);
output = (float*restrict) ((uintptr_t) output + output_stride);
} while (--n != 0);
output = (float*restrict) ((uintptr_t) output - output_decrement);
input += 16;
}
output_decrement += 8 * sizeof(float);
if (mc & (8 * sizeof(float))) {
const float*restrict w = weights;
const int32_t* dmap = widx_dmap;
const uint32_t* nnzmap = nidx_nnzmap;
size_t n = nc;
do {
uint32_t nnz = *nnzmap++;
__m128 vacc0123 = _mm_load1_ps(w); w += 1;
__m128 vacc4567 = vacc0123;
if XNN_LIKELY(nnz != 0) {
do {
const intptr_t diff = *dmap++;
const __m128 vi0123 = _mm_loadu_ps(input);
const __m128 vi4567 = _mm_loadu_ps(input + 4);
input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
const __m128 vw = _mm_load1_ps(w); w += 1;
vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
vacc4567 = _mm_add_ps(vacc4567, _mm_mul_ps(vi4567, vw));
} while (--nnz != 0);
}
__m128 vout0123 = _mm_min_ps(vacc0123, vmax);
__m128 vout4567 = _mm_min_ps(vacc4567, vmax);
vout0123 = _mm_max_ps(vout0123, vmin);
vout4567 = _mm_max_ps(vout4567, vmin);
_mm_storeu_ps(output, vout0123);
_mm_storeu_ps(output + 4, vout4567);
output = (float*restrict) ((uintptr_t) output + output_stride);
} while (--n != 0);
output = (float*restrict) ((uintptr_t) output - output_decrement);
input += 8;
}
output_decrement += 4 * sizeof(float);
if (mc & (4 * sizeof(float))) {
const float*restrict w = weights;
const int32_t* dmap = widx_dmap;
const uint32_t* nnzmap = nidx_nnzmap;
size_t n = nc;
do {
uint32_t nnz = *nnzmap++;
__m128 vacc0123 = _mm_load1_ps(w); w += 1;
if XNN_LIKELY(nnz != 0) {
do {
const intptr_t diff = *dmap++;
const __m128 vi0123 = _mm_loadu_ps(input);
input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
const __m128 vw = _mm_load1_ps(w); w += 1;
vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
} while (--nnz != 0);
}
__m128 vout0123 = _mm_min_ps(vacc0123, vmax);
vout0123 = _mm_max_ps(vout0123, vmin);
_mm_storeu_ps(output, vout0123);
output = (float*restrict) ((uintptr_t) output + output_stride);
} while (--n != 0);
output = (float*restrict) ((uintptr_t) output - output_decrement);
input += 4;
}
output_decrement += 2 * sizeof(float);
if (mc & (2 * sizeof(float))) {
const float*restrict w = weights;
const int32_t* dmap = widx_dmap;
const uint32_t* nnzmap = nidx_nnzmap;
size_t n = nc;
do {
uint32_t nnz = *nnzmap++;
__m128 vacc01 = _mm_load_ss(w); w += 1;
vacc01 = _mm_unpacklo_ps(vacc01, vacc01);
if XNN_LIKELY(nnz != 0) {
do {
const intptr_t diff = *dmap++;
const __m128 vi01 = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) input);
input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
__m128 vw = _mm_load_ss(w); w += 1;
vw = _mm_unpacklo_ps(vw, vw);
vacc01 = _mm_add_ps(vacc01, _mm_mul_ps(vi01, vw));
} while (--nnz != 0);
}
__m128 vout01 = _mm_min_ps(vacc01, vmax);
vout01 = _mm_max_ps(vout01, vmin);
_mm_storel_pi((__m64*) output, vout01);
output = (float*restrict) ((uintptr_t) output + output_stride);
} while (--n != 0);
output = (float*restrict) ((uintptr_t) output - output_decrement);
input += 2;
}
output_decrement += 1 * sizeof(float);
if (mc & (1 * sizeof(float))) {
const float*restrict w = weights;
const int32_t* dmap = widx_dmap;
const uint32_t* nnzmap = nidx_nnzmap;
size_t n = nc;
do {
uint32_t nnz = *nnzmap++;
__m128 vacc0 = _mm_load_ss(w); w += 1;
if XNN_LIKELY(nnz != 0) {
do {
const intptr_t diff = *dmap++;
const __m128 vi0 = _mm_load_ss(input);
input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
const __m128 vw = _mm_load_ss(w); w += 1;
vacc0 = _mm_add_ss(vacc0, _mm_mul_ss(vi0, vw));
} while (--nnz != 0);
}
__m128 vout0 = _mm_min_ss(vacc0, vmax);
vout0 = _mm_max_ss(vout0, vmin);
_mm_store_ss(output, vout0);
output = (float*restrict) ((uintptr_t) output + output_stride);
} while (--n != 0);
output = (float*restrict) ((uintptr_t) output - output_decrement);
input += 1;
}
}
}
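// Element-wise addition y = clamp(a + b, min, max), 8 floats per iteration (n is in bytes).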
void xnn_f32_vadd_minmax_ukernel__sse_x8(
size_t n,
const float* a,
const float* b,
float* y,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(a != NULL);
assert(b != NULL);
assert(y != NULL);
const __m128 vy_min = _mm_load_ps(params->sse.min);
const __m128 vy_max = _mm_load_ps(params->sse.max);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 va4567 = _mm_loadu_ps(a + 4);
a += 8;
const __m128 vb0123 = _mm_loadu_ps(b);
const __m128 vb4567 = _mm_loadu_ps(b + 4);
b += 8;
__m128 vy0123 = _mm_add_ps(va0123, vb0123);
__m128 vy4567 = _mm_add_ps(va4567, vb4567);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy4567 = _mm_max_ps(vy4567, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
vy4567 = _mm_min_ps(vy4567, vy_max);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
a += 4;
const __m128 vb0123 = _mm_loadu_ps(b);
b += 4;
__m128 vy0123 = _mm_add_ps(va0123, vb0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
_mm_storeu_ps(y, vy0123);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 vb0123 = _mm_loadu_ps(b);
__m128 vy0123 = _mm_add_ps(va0123, vb0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy0123);
vy0123 = _mm_movehl_ps(vy0123, vy0123);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy0123);
}
}
}
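// Adds the broadcast scalar *b to each element of a and clamps the result to [min, max].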
void xnn_f32_vaddc_minmax_ukernel__sse_x8(
size_t n,
const float* a,
const float* b,
float* y,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(a != NULL);
assert(b != NULL);
assert(y != NULL);
const __m128 vy_min = _mm_load_ps(params->sse.min);
const __m128 vy_max = _mm_load_ps(params->sse.max);
const __m128 vb = _mm_load1_ps(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 va4567 = _mm_loadu_ps(a + 4);
a += 8;
__m128 vy0123 = _mm_add_ps(va0123, vb);
__m128 vy4567 = _mm_add_ps(va4567, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy4567 = _mm_max_ps(vy4567, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
vy4567 = _mm_min_ps(vy4567, vy_max);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
a += 4;
__m128 vy0123 = _mm_add_ps(va0123, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
_mm_storeu_ps(y, vy0123);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 va0123 = _mm_loadu_ps(a);
__m128 vy0123 = _mm_add_ps(va0123, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy0123);
vy0123 = _mm_movehl_ps(vy0123, vy0123);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy0123);
}
}
}
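// Element-wise division y = clamp(a / b, min, max), 8 floats per iteration.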
void xnn_f32_vdiv_minmax_ukernel__sse_x8(
size_t n,
const float* a,
const float* b,
float* y,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(a != NULL);
assert(b != NULL);
assert(y != NULL);
const __m128 vy_min = _mm_load_ps(params->sse.min);
const __m128 vy_max = _mm_load_ps(params->sse.max);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 va4567 = _mm_loadu_ps(a + 4);
a += 8;
const __m128 vb0123 = _mm_loadu_ps(b);
const __m128 vb4567 = _mm_loadu_ps(b + 4);
b += 8;
__m128 vy0123 = _mm_div_ps(va0123, vb0123);
__m128 vy4567 = _mm_div_ps(va4567, vb4567);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy4567 = _mm_max_ps(vy4567, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
vy4567 = _mm_min_ps(vy4567, vy_max);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
a += 4;
const __m128 vb0123 = _mm_loadu_ps(b);
b += 4;
__m128 vy0123 = _mm_div_ps(va0123, vb0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
_mm_storeu_ps(y, vy0123);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 vb0123 = _mm_loadu_ps(b);
__m128 vy0123 = _mm_div_ps(va0123, vb0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy0123);
vy0123 = _mm_movehl_ps(vy0123, vy0123);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy0123);
}
}
}
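// Divides each element of a by the broadcast scalar *b and clamps the result to [min, max].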
void xnn_f32_vdivc_minmax_ukernel__sse_x8(
size_t n,
const float* a,
const float* b,
float* y,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(a != NULL);
assert(b != NULL);
assert(y != NULL);
const __m128 vy_min = _mm_load_ps(params->sse.min);
const __m128 vy_max = _mm_load_ps(params->sse.max);
const __m128 vb = _mm_load1_ps(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 va4567 = _mm_loadu_ps(a + 4);
a += 8;
__m128 vy0123 = _mm_div_ps(va0123, vb);
__m128 vy4567 = _mm_div_ps(va4567, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy4567 = _mm_max_ps(vy4567, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
vy4567 = _mm_min_ps(vy4567, vy_max);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
a += 4;
__m128 vy0123 = _mm_div_ps(va0123, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
_mm_storeu_ps(y, vy0123);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 va0123 = _mm_loadu_ps(a);
__m128 vy0123 = _mm_div_ps(va0123, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy0123);
vy0123 = _mm_movehl_ps(vy0123, vy0123);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy0123);
}
}
}
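// Element-wise maximum y = max(a, b), 8 floats per iteration.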
void xnn_f32_vmax_ukernel__sse_x8(
size_t n,
const float* a,
const float* b,
float* y,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(a != NULL);
assert(b != NULL);
assert(y != NULL);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 va4567 = _mm_loadu_ps(a + 4);
a += 8;
const __m128 vb0123 = _mm_loadu_ps(b);
const __m128 vb4567 = _mm_loadu_ps(b + 4);
b += 8;
__m128 vy0123 = _mm_max_ps(va0123, vb0123);
__m128 vy4567 = _mm_max_ps(va4567, vb4567);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
a += 4;
const __m128 vb0123 = _mm_loadu_ps(b);
b += 4;
__m128 vy0123 = _mm_max_ps(va0123, vb0123);
_mm_storeu_ps(y, vy0123);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 vb0123 = _mm_loadu_ps(b);
__m128 vy0123 = _mm_max_ps(va0123, vb0123);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy0123);
vy0123 = _mm_movehl_ps(vy0123, vy0123);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy0123);
}
}
}
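// Element-wise maximum of a against the broadcast scalar *b.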
void xnn_f32_vmaxc_ukernel__sse_x8(
size_t n,
const float* a,
const float* b,
float* y,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(a != NULL);
assert(b != NULL);
assert(y != NULL);
const __m128 vb = _mm_load1_ps(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 va4567 = _mm_loadu_ps(a + 4);
a += 8;
__m128 vy0123 = _mm_max_ps(va0123, vb);
__m128 vy4567 = _mm_max_ps(va4567, vb);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
a += 4;
__m128 vy0123 = _mm_max_ps(va0123, vb);
_mm_storeu_ps(y, vy0123);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 va0123 = _mm_loadu_ps(a);
__m128 vy0123 = _mm_max_ps(va0123, vb);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy0123);
vy0123 = _mm_movehl_ps(vy0123, vy0123);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy0123);
}
}
}
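// Element-wise minimum y = min(a, b), 8 floats per iteration.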
void xnn_f32_vmin_ukernel__sse_x8(
size_t n,
const float* a,
const float* b,
float* y,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(a != NULL);
assert(b != NULL);
assert(y != NULL);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 va4567 = _mm_loadu_ps(a + 4);
a += 8;
const __m128 vb0123 = _mm_loadu_ps(b);
const __m128 vb4567 = _mm_loadu_ps(b + 4);
b += 8;
__m128 vy0123 = _mm_min_ps(va0123, vb0123);
__m128 vy4567 = _mm_min_ps(va4567, vb4567);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
a += 4;
const __m128 vb0123 = _mm_loadu_ps(b);
b += 4;
__m128 vy0123 = _mm_min_ps(va0123, vb0123);
_mm_storeu_ps(y, vy0123);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 vb0123 = _mm_loadu_ps(b);
__m128 vy0123 = _mm_min_ps(va0123, vb0123);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy0123);
vy0123 = _mm_movehl_ps(vy0123, vy0123);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy0123);
}
}
}
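// Element-wise minimum of a against the broadcast scalar *b.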
void xnn_f32_vminc_ukernel__sse_x8(
size_t n,
const float* a,
const float* b,
float* y,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(a != NULL);
assert(b != NULL);
assert(y != NULL);
const __m128 vb = _mm_load1_ps(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 va4567 = _mm_loadu_ps(a + 4);
a += 8;
__m128 vy0123 = _mm_min_ps(va0123, vb);
__m128 vy4567 = _mm_min_ps(va4567, vb);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
a += 4;
__m128 vy0123 = _mm_min_ps(va0123, vb);
_mm_storeu_ps(y, vy0123);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 va0123 = _mm_loadu_ps(a);
__m128 vy0123 = _mm_min_ps(va0123, vb);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy0123);
vy0123 = _mm_movehl_ps(vy0123, vy0123);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy0123);
}
}
}
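// Element-wise multiplication y = clamp(a * b, min, max), 8 floats per iteration.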
void xnn_f32_vmul_minmax_ukernel__sse_x8(
size_t n,
const float* a,
const float* b,
float* y,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(a != NULL);
assert(b != NULL);
assert(y != NULL);
const __m128 vy_min = _mm_load_ps(params->sse.min);
const __m128 vy_max = _mm_load_ps(params->sse.max);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 va4567 = _mm_loadu_ps(a + 4);
a += 8;
const __m128 vb0123 = _mm_loadu_ps(b);
const __m128 vb4567 = _mm_loadu_ps(b + 4);
b += 8;
__m128 vy0123 = _mm_mul_ps(va0123, vb0123);
__m128 vy4567 = _mm_mul_ps(va4567, vb4567);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy4567 = _mm_max_ps(vy4567, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
vy4567 = _mm_min_ps(vy4567, vy_max);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
a += 4;
const __m128 vb0123 = _mm_loadu_ps(b);
b += 4;
__m128 vy0123 = _mm_mul_ps(va0123, vb0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
_mm_storeu_ps(y, vy0123);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 vb0123 = _mm_loadu_ps(b);
__m128 vy0123 = _mm_mul_ps(va0123, vb0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy0123);
vy0123 = _mm_movehl_ps(vy0123, vy0123);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy0123);
}
}
}
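// Multiplies each element of a by the broadcast scalar *b and clamps the result to [min, max].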
void xnn_f32_vmulc_minmax_ukernel__sse_x8(
size_t n,
const float* a,
const float* b,
float* y,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(a != NULL);
assert(b != NULL);
assert(y != NULL);
const __m128 vy_min = _mm_load_ps(params->sse.min);
const __m128 vy_max = _mm_load_ps(params->sse.max);
const __m128 vb = _mm_load1_ps(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 va4567 = _mm_loadu_ps(a + 4);
a += 8;
__m128 vy0123 = _mm_mul_ps(va0123, vb);
__m128 vy4567 = _mm_mul_ps(va4567, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy4567 = _mm_max_ps(vy4567, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
vy4567 = _mm_min_ps(vy4567, vy_max);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
a += 4;
__m128 vy0123 = _mm_mul_ps(va0123, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
_mm_storeu_ps(y, vy0123);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 va0123 = _mm_loadu_ps(a);
__m128 vy0123 = _mm_mul_ps(va0123, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy0123);
vy0123 = _mm_movehl_ps(vy0123, vy0123);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy0123);
}
}
}
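// Reversed division with a broadcast scalar: y = clamp(*b / a, min, max).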
void xnn_f32_vrdivc_minmax_ukernel__sse_x8(
size_t n,
const float* a,
const float* b,
float* y,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(a != NULL);
assert(b != NULL);
assert(y != NULL);
const __m128 vy_min = _mm_load_ps(params->sse.min);
const __m128 vy_max = _mm_load_ps(params->sse.max);
const __m128 vb = _mm_load1_ps(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 va4567 = _mm_loadu_ps(a + 4);
a += 8;
__m128 vy0123 = _mm_div_ps(vb, va0123);
__m128 vy4567 = _mm_div_ps(vb, va4567);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy4567 = _mm_max_ps(vy4567, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
vy4567 = _mm_min_ps(vy4567, vy_max);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
a += 4;
__m128 vy0123 = _mm_div_ps(vb, va0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
_mm_storeu_ps(y, vy0123);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 va0123 = _mm_loadu_ps(a);
__m128 vy0123 = _mm_div_ps(vb, va0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy0123);
vy0123 = _mm_movehl_ps(vy0123, vy0123);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy0123);
}
}
}
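// Reversed subtraction with a broadcast scalar: y = clamp(*b - a, min, max).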
void xnn_f32_vrsubc_minmax_ukernel__sse_x8(
size_t n,
const float* a,
const float* b,
float* y,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(a != NULL);
assert(b != NULL);
assert(y != NULL);
const __m128 vy_min = _mm_load_ps(params->sse.min);
const __m128 vy_max = _mm_load_ps(params->sse.max);
const __m128 vb = _mm_load1_ps(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 va4567 = _mm_loadu_ps(a + 4);
a += 8;
__m128 vy0123 = _mm_sub_ps(vb, va0123);
__m128 vy4567 = _mm_sub_ps(vb, va4567);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy4567 = _mm_max_ps(vy4567, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
vy4567 = _mm_min_ps(vy4567, vy_max);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
a += 4;
__m128 vy0123 = _mm_sub_ps(vb, va0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
_mm_storeu_ps(y, vy0123);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 va0123 = _mm_loadu_ps(a);
__m128 vy0123 = _mm_sub_ps(vb, va0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy0123);
vy0123 = _mm_movehl_ps(vy0123, vy0123);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy0123);
}
}
}
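// Element-wise squared difference y = (a - b)^2, 8 floats per iteration.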
void xnn_f32_vsqrdiff_ukernel__sse_x8(
size_t n,
const float* a,
const float* b,
float* y,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(a != NULL);
assert(b != NULL);
assert(y != NULL);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 va4567 = _mm_loadu_ps(a + 4);
a += 8;
const __m128 vb0123 = _mm_loadu_ps(b);
const __m128 vb4567 = _mm_loadu_ps(b + 4);
b += 8;
__m128 vy0123 = _mm_sub_ps(va0123, vb0123);
__m128 vy4567 = _mm_sub_ps(va4567, vb4567);
vy0123 = _mm_mul_ps(vy0123, vy0123);
vy4567 = _mm_mul_ps(vy4567, vy4567);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
a += 4;
const __m128 vb0123 = _mm_loadu_ps(b);
b += 4;
__m128 vy0123 = _mm_sub_ps(va0123, vb0123);
vy0123 = _mm_mul_ps(vy0123, vy0123);
_mm_storeu_ps(y, vy0123);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 vb0123 = _mm_loadu_ps(b);
__m128 vy0123 = _mm_sub_ps(va0123, vb0123);
vy0123 = _mm_mul_ps(vy0123, vy0123);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy0123);
vy0123 = _mm_movehl_ps(vy0123, vy0123);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy0123);
}
}
}
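// Squared difference against the broadcast scalar *b: y = (a - *b)^2.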
void xnn_f32_vsqrdiffc_ukernel__sse_x8(
size_t n,
const float* a,
const float* b,
float* y,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(a != NULL);
assert(b != NULL);
assert(y != NULL);
const __m128 vb = _mm_load1_ps(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 va4567 = _mm_loadu_ps(a + 4);
a += 8;
__m128 vy0123 = _mm_sub_ps(va0123, vb);
__m128 vy4567 = _mm_sub_ps(va4567, vb);
vy0123 = _mm_mul_ps(vy0123, vy0123);
vy4567 = _mm_mul_ps(vy4567, vy4567);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
a += 4;
__m128 vy0123 = _mm_sub_ps(va0123, vb);
vy0123 = _mm_mul_ps(vy0123, vy0123);
_mm_storeu_ps(y, vy0123);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 va0123 = _mm_loadu_ps(a);
__m128 vy0123 = _mm_sub_ps(va0123, vb);
vy0123 = _mm_mul_ps(vy0123, vy0123);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy0123);
vy0123 = _mm_movehl_ps(vy0123, vy0123);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy0123);
}
}
}
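// Element-wise subtraction y = clamp(a - b, min, max), 8 floats per iteration.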
void xnn_f32_vsub_minmax_ukernel__sse_x8(
size_t n,
const float* a,
const float* b,
float* y,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(a != NULL);
assert(b != NULL);
assert(y != NULL);
const __m128 vy_min = _mm_load_ps(params->sse.min);
const __m128 vy_max = _mm_load_ps(params->sse.max);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 va4567 = _mm_loadu_ps(a + 4);
a += 8;
const __m128 vb0123 = _mm_loadu_ps(b);
const __m128 vb4567 = _mm_loadu_ps(b + 4);
b += 8;
__m128 vy0123 = _mm_sub_ps(va0123, vb0123);
__m128 vy4567 = _mm_sub_ps(va4567, vb4567);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy4567 = _mm_max_ps(vy4567, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
vy4567 = _mm_min_ps(vy4567, vy_max);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
a += 4;
const __m128 vb0123 = _mm_loadu_ps(b);
b += 4;
__m128 vy0123 = _mm_sub_ps(va0123, vb0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
_mm_storeu_ps(y, vy0123);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 vb0123 = _mm_loadu_ps(b);
__m128 vy0123 = _mm_sub_ps(va0123, vb0123);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy0123);
vy0123 = _mm_movehl_ps(vy0123, vy0123);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy0123);
}
}
}
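// Subtracts the broadcast scalar *b from each element of a and clamps the result to [min, max].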
void xnn_f32_vsubc_minmax_ukernel__sse_x8(
size_t n,
const float* a,
const float* b,
float* y,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(a != NULL);
assert(b != NULL);
assert(y != NULL);
const __m128 vy_min = _mm_load_ps(params->sse.min);
const __m128 vy_max = _mm_load_ps(params->sse.max);
const __m128 vb = _mm_load1_ps(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
const __m128 va4567 = _mm_loadu_ps(a + 4);
a += 8;
__m128 vy0123 = _mm_sub_ps(va0123, vb);
__m128 vy4567 = _mm_sub_ps(va4567, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy4567 = _mm_max_ps(vy4567, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
vy4567 = _mm_min_ps(vy4567, vy_max);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 va0123 = _mm_loadu_ps(a);
a += 4;
__m128 vy0123 = _mm_sub_ps(va0123, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
_mm_storeu_ps(y, vy0123);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 va0123 = _mm_loadu_ps(a);
__m128 vy0123 = _mm_sub_ps(va0123, vb);
vy0123 = _mm_max_ps(vy0123, vy_min);
vy0123 = _mm_min_ps(vy0123, vy_max);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy0123);
vy0123 = _mm_movehl_ps(vy0123, vy0123);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy0123);
}
}
}
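// Clamps each element of x to [min, max].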
void xnn_f32_vclamp_ukernel__sse_x8(
size_t n,
const float* x,
float* y,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(x != NULL);
assert(y != NULL);
const __m128 vy_min = _mm_load_ps(params->sse.min);
const __m128 vy_max = _mm_load_ps(params->sse.max);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
__m128 vacc0123 = _mm_loadu_ps(x);
__m128 vacc4567 = _mm_loadu_ps(x + 4);
x += 8;
vacc0123 = _mm_max_ps(vacc0123, vy_min);
vacc4567 = _mm_max_ps(vacc4567, vy_min);
vacc0123 = _mm_min_ps(vacc0123, vy_max);
vacc4567 = _mm_min_ps(vacc4567, vy_max);
_mm_storeu_ps(y, vacc0123);
_mm_storeu_ps(y + 4, vacc4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
__m128 vacc = _mm_loadu_ps(x);
x += 4;
vacc = _mm_max_ps(vacc, vy_min);
vacc = _mm_min_ps(vacc, vy_max);
_mm_storeu_ps(y, vacc);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
__m128 vacc = _mm_loadu_ps(x);
vacc = _mm_max_ps(vacc, vy_min);
vacc = _mm_min_ps(vacc, vy_max);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vacc);
vacc = _mm_movehl_ps(vacc, vacc);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vacc);
}
}
}
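// Hard-Swish activation: y = x * min(max(x/6 + 1/2, 0), 1), computed 8 floats per iteration.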
void xnn_f32_vhswish_ukernel__sse_x8(
size_t n,
const float* x,
float* y,
const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
const __m128 vsixth = _mm_load_ps(params->sse.sixth);
const __m128 vhalf = _mm_load_ps(params->sse.half);
const __m128 vone = _mm_load_ps(params->sse.one);
const __m128 vzero = _mm_setzero_ps();
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 vx0123 = _mm_loadu_ps(x);
const __m128 vx4567 = _mm_loadu_ps(x + 4);
x += 8;
__m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
__m128 vacc4567 = _mm_mul_ps(vx4567, vsixth);
vacc0123 = _mm_add_ps(vacc0123, vhalf);
vacc4567 = _mm_add_ps(vacc4567, vhalf);
vacc0123 = _mm_max_ps(vacc0123, vzero);
vacc4567 = _mm_max_ps(vacc4567, vzero);
vacc0123 = _mm_min_ps(vacc0123, vone);
vacc4567 = _mm_min_ps(vacc4567, vone);
vacc0123 = _mm_mul_ps(vacc0123, vx0123);
vacc4567 = _mm_mul_ps(vacc4567, vx4567);
_mm_storeu_ps(y, vacc0123);
_mm_storeu_ps(y + 4, vacc4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 vx0123 = _mm_loadu_ps(x);
x += 4;
__m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
vacc0123 = _mm_add_ps(vacc0123, vhalf);
vacc0123 = _mm_max_ps(vacc0123, vzero);
vacc0123 = _mm_min_ps(vacc0123, vone);
vacc0123 = _mm_mul_ps(vacc0123, vx0123);
_mm_storeu_ps(y, vacc0123);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 vx0123 = _mm_loadu_ps(x);
__m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
vacc0123 = _mm_add_ps(vacc0123, vhalf);
vacc0123 = _mm_max_ps(vacc0123, vzero);
vacc0123 = _mm_min_ps(vacc0123, vone);
vacc0123 = _mm_mul_ps(vacc0123, vx0123);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vacc0123);
vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vacc0123);
}
}
}
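// Leaky ReLU: y = x for x >= 0 and y = x * slope for x < 0, computed by splitting each
// vector into its positive part (max with 0) and negative part (min with 0).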
void xnn_f32_vlrelu_ukernel__sse_x8(
size_t n,
const float* x,
float* y,
const union xnn_f32_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
const __m128 vslope = _mm_load_ps(params->sse.slope);
const __m128 vzero = _mm_setzero_ps();
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
__m128 vx0123 = _mm_loadu_ps(x);
__m128 vx4567 = _mm_loadu_ps(x + 4);
x += 8;
__m128 vacc0123 = _mm_max_ps(_mm_setzero_ps(), vx0123);
vx0123 = _mm_min_ps(vx0123, vzero);
__m128 vacc4567 = _mm_max_ps(_mm_setzero_ps(), vx4567);
vx4567 = _mm_min_ps(vx4567, vzero);
vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vx0123, vslope));
vacc4567 = _mm_add_ps(vacc4567, _mm_mul_ps(vx4567, vslope));
_mm_storeu_ps(y, vacc0123);
_mm_storeu_ps(y + 4, vacc4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
__m128 vx = _mm_loadu_ps(x);
x += 4;
__m128 vacc = _mm_max_ps(_mm_setzero_ps(), vx);
vx = _mm_min_ps(vx, vzero);
vacc = _mm_add_ps(vacc, _mm_mul_ps(vx, vslope));
_mm_storeu_ps(y, vacc);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
__m128 vx = _mm_loadu_ps(x);
__m128 vacc = _mm_max_ps(_mm_setzero_ps(), vx);
vx = _mm_min_ps(vx, vzero);
vacc = _mm_add_ps(vacc, _mm_mul_ps(vx, vslope));
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vacc);
vacc = _mm_movehl_ps(vacc, vacc);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vacc);
}
}
}
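// Per-channel multiply-add y = clamp(x * scale + bias, min, max), processing 2 rows and
// 4 channels at a time; weights stores 4 scales followed by 4 biases per channel group.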
void xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x(
size_t rows,
size_t channels,
const float*restrict input,
size_t input_stride,
const float*restrict weights,
float*restrict output,
size_t output_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(rows != 0);
assert(channels != 0);
assert(channels % sizeof(float) == 0);
const float* i0 = input;
float* o0 = output;
const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
float* o1 = (float*) ((uintptr_t) o0 + output_stride);
const size_t input_increment = input_stride * 2 - channels;
const size_t output_increment = output_stride * 2 - channels;
const __m128 vmin = _mm_load_ps(params->sse.min);
const __m128 vmax = _mm_load_ps(params->sse.max);
do {
if XNN_UNPREDICTABLE(rows < 2) {
i1 = i0;
o1 = o0;
}
const float* w = weights;
size_t c = channels;
for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {
const __m128 vscale0123 = _mm_load_ps(w);
__m128 vacc0x0123 = _mm_loadu_ps(i0);
i0 += 4;
__m128 vacc1x0123 = _mm_loadu_ps(i1);
i1 += 4;
vacc0x0123 = _mm_mul_ps(vacc0x0123, vscale0123);
vacc1x0123 = _mm_mul_ps(vacc1x0123, vscale0123);
const __m128 vbias0123 = _mm_load_ps(w + 4);
vacc0x0123 = _mm_add_ps(vacc0x0123, vbias0123);
vacc1x0123 = _mm_add_ps(vacc1x0123, vbias0123);
vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
_mm_storeu_ps(o0, vacc0x0123);
o0 += 4;
_mm_storeu_ps(o1, vacc1x0123);
o1 += 4;
w += 8;
}
if XNN_UNLIKELY(c != 0) {
const __m128 vscale0123 = _mm_load_ps(w);
__m128 vacc0x0123 = _mm_loadu_ps(i0);
i0 = (const float*) ((uintptr_t) i0 + c);
__m128 vacc1x0123 = _mm_loadu_ps(i1);
i1 = (const float*) ((uintptr_t) i1 + c);
vacc0x0123 = _mm_mul_ps(vacc0x0123, vscale0123);
vacc1x0123 = _mm_mul_ps(vacc1x0123, vscale0123);
const __m128 vbias0123 = _mm_load_ps(w + 4);
vacc0x0123 = _mm_add_ps(vacc0x0123, vbias0123);
vacc1x0123 = _mm_add_ps(vacc1x0123, vbias0123);
vacc0x0123 = _mm_max_ps(vacc0x0123, vmin);
vacc1x0123 = _mm_max_ps(vacc1x0123, vmin);
vacc0x0123 = _mm_min_ps(vacc0x0123, vmax);
vacc1x0123 = _mm_min_ps(vacc1x0123, vmax);
if (c & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) o0, vacc0x0123);
_mm_storel_pi((__m64*) o1, vacc1x0123);
vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
o0 += 2;
o1 += 2;
}
if (c & (1 * sizeof(float))) {
_mm_store_ss(o0, vacc0x0123);
_mm_store_ss(o1, vacc1x0123);
o0 += 1;
o1 += 1;
}
}
i0 = (const float*) ((uintptr_t) i0 + input_increment);
o0 = (float*) ((uintptr_t) o0 + output_increment);
i1 = (const float*) ((uintptr_t) i1 + input_increment);
o1 = (float*) ((uintptr_t) o1 + output_increment);
rows = doz(rows, 2);
} while (rows != 0);
}
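// Element-wise square root, 4 floats per iteration.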
void xnn_f32_vsqrt_ukernel__sse_sqrt_x4(
size_t n,
const float* x,
float* y,
const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 vx = _mm_loadu_ps(x);
x += 4;
const __m128 vy = _mm_sqrt_ps(vx);
_mm_storeu_ps(y, vy);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 vx = _mm_loadu_ps(x);
__m128 vy = _mm_sqrt_ps(vx);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy);
vy = _mm_movehl_ps(vy, vy);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy);
}
}
}
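// Element-wise absolute value, implemented by masking off the sign bit.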
void xnn_f32_vabs_ukernel__sse_x8(
size_t n,
const float* x,
float* y,
const union xnn_f32_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(x != NULL);
assert(y != NULL);
const __m128 vnonsign_mask = _mm_load_ps(params->sse.nonsign_mask);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 vx0123 = _mm_loadu_ps(x);
const __m128 vx4567 = _mm_loadu_ps(x + 4);
x += 8;
const __m128 vy0123 = _mm_and_ps(vx0123, vnonsign_mask);
const __m128 vy4567 = _mm_and_ps(vx4567, vnonsign_mask);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 vx = _mm_loadu_ps(x);
x += 4;
const __m128 vy = _mm_and_ps(vx, vnonsign_mask);
_mm_storeu_ps(y, vy);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 vx = _mm_loadu_ps(x);
__m128 vy = _mm_and_ps(vx, vnonsign_mask);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy);
vy = _mm_movehl_ps(vy, vy);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy);
}
}
}
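// Element-wise negation, implemented by XOR-ing the sign bit.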
void xnn_f32_vneg_ukernel__sse_x8(
size_t n,
const float* x,
float* y,
const union xnn_f32_neg_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(x != NULL);
assert(y != NULL);
const __m128 vsign_mask = _mm_load_ps(params->sse.sign_mask);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 vx0123 = _mm_loadu_ps(x);
const __m128 vx4567 = _mm_loadu_ps(x + 4);
x += 8;
const __m128 vy0123 = _mm_xor_ps(vx0123, vsign_mask);
const __m128 vy4567 = _mm_xor_ps(vx4567, vsign_mask);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 vx = _mm_loadu_ps(x);
x += 4;
const __m128 vy = _mm_xor_ps(vx, vsign_mask);
_mm_storeu_ps(y, vy);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 vx = _mm_loadu_ps(x);
__m128 vy = _mm_xor_ps(vx, vsign_mask);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy);
vy = _mm_movehl_ps(vy, vy);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy);
}
}
}
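// Element-wise square y = x * x, 8 floats per iteration.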
void xnn_f32_vsqr_ukernel__sse_x8(
size_t n,
const float* x,
float* y,
const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(n != 0);
assert(n % sizeof(float) == 0);
assert(x != NULL);
assert(y != NULL);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const __m128 vx0123 = _mm_loadu_ps(x);
const __m128 vx4567 = _mm_loadu_ps(x + 4);
x += 8;
const __m128 vy0123 = _mm_mul_ps(vx0123, vx0123);
const __m128 vy4567 = _mm_mul_ps(vx4567, vx4567);
_mm_storeu_ps(y, vy0123);
_mm_storeu_ps(y + 4, vy4567);
y += 8;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const __m128 vx = _mm_loadu_ps(x);
x += 4;
const __m128 vy = _mm_mul_ps(vx, vx);
_mm_storeu_ps(y, vy);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
const __m128 vx = _mm_loadu_ps(x);
__m128 vy = _mm_mul_ps(vx, vx);
if (n & (2 * sizeof(float))) {
_mm_storel_pi((__m64*) y, vy);
vy = _mm_movehl_ps(vy, vy);
y += 2;
}
if (n & (1 * sizeof(float))) {
_mm_store_ss(y, vy);
}
}
}
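// Packs up to 4 rows of 32-bit values into interleaved panels of 4 by transposing 4x4 tiles;
// when m < 4, the missing rows are duplicated from the last valid row pointer.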
void xnn_x32_packx_ukernel_4x__sse(
size_t m,
size_t k,
const uint32_t* restrict x,
size_t x_stride,
uint32_t* restrict y)
{
assert(m != 0);
assert(k != 0);
const float* x0 = (const float*) x;
const float* x1 = (const float*) ((uintptr_t) x0 + x_stride);
if (m < 2) {
x1 = x0;
}
const float* x2 = (const float*) ((uintptr_t) x1 + x_stride);
if (m <= 2) {
x2 = x1;
}
const float* x3 = (const float*) ((uintptr_t) x2 + x_stride);
if (m != 4) {
x3 = x2;
}
float*restrict y_f32 = (float*) y;
for (; k >= 4; k -= 4) {
const __m128 vx0 = _mm_loadu_ps(x0);
x0 += 4;
const __m128 vx1 = _mm_loadu_ps(x1);
x1 += 4;
const __m128 vx2 = _mm_loadu_ps(x2);
x2 += 4;
const __m128 vx3 = _mm_loadu_ps(x3);
x3 += 4;
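// Transpose the loaded 4x4 tile so each packed group of 4 outputs holds one value from every row.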
const __m128 vt0 = _mm_unpacklo_ps(vx0, vx1);
const __m128 vt1 = _mm_unpackhi_ps(vx0, vx1);
const __m128 vt2 = _mm_unpacklo_ps(vx2, vx3);
const __m128 vt3 = _mm_unpackhi_ps(vx2, vx3);
const __m128 vy0 = _mm_movelh_ps(vt0, vt2);
_mm_store_ps(y_f32, vy0);
const __m128 vy1 = _mm_movehl_ps(vt2, vt0);
_mm_store_ps(y_f32 + 4, vy1);
const __m128 vy2 = _mm_movelh_ps(vt1, vt3);
_mm_store_ps(y_f32 + 8, vy2);
const __m128 vy3 = _mm_movehl_ps(vt3, vt1);
_mm_store_ps(y_f32 + 12, vy3);
y_f32 += 16;
}
if XNN_UNLIKELY(k != 0) {
do {
const __m128 vx0 = _mm_load_ss(x0);
x0 += 1;
const __m128 vx1 = _mm_load_ss(x1);
x1 += 1;
const __m128 vx2 = _mm_load_ss(x2);
x2 += 1;
const __m128 vx3 = _mm_load_ss(x3);
x3 += 1;
const __m128 vx01 = _mm_unpacklo_ps(vx0, vx1);
const __m128 vx23 = _mm_unpacklo_ps(vx2, vx3);
const __m128 vy = _mm_movelh_ps(vx01, vx23);
_mm_store_ps(y_f32, vy);
y_f32 += 4;
} while (--k != 0);
}
}
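// Transposes a block_height x block_width matrix of 32-bit elements in 4x4 tiles using
// _MM_TRANSPOSE4_PS, with separate handling for partial tiles at the edges.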
void xnn_x32_transposec_ukernel__4x4_sse(
const uint32_t* input,
uint32_t* output,
size_t input_stride,
size_t output_stride,
size_t block_width,
size_t block_height) XNN_OOB_READS
{
assert(output_stride >= block_height * sizeof(uint32_t));
assert(input_stride >= block_width * sizeof(uint32_t));
const size_t tile_height = 4;
const size_t tile_width = 4;
const size_t tile_wbytes = tile_width * sizeof(float);
const size_t input_vreset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
const size_t output_vreset = tile_height * output_stride - round_down_po2(block_height, 2) * sizeof(uint32_t);
const size_t input_offset = tile_height * input_stride;
const float* i0 = (const float*) input;
const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
float* o0 = (float*) output;
float* o1 = (float*) ((uintptr_t) o0 + output_stride);
float* o2 = (float*) ((uintptr_t) o1 + output_stride);
float* o3 = (float*) ((uintptr_t) o2 + output_stride);
do {
if XNN_UNPREDICTABLE(block_width < 2) {
o1 = o0;
}
if XNN_UNPREDICTABLE(block_width <= 2) {
o2 = o0;
}
if XNN_UNPREDICTABLE(block_width < 4) {
o3 = o0;
}
size_t bh = block_height;
for (; bh >= 4; bh -= 4) {
__m128 v0 = _mm_loadu_ps(i0);
i0 = (const float*) ((uintptr_t) i0 + input_offset);
__m128 v1 = _mm_loadu_ps(i1);
i1 = (const float*) ((uintptr_t) i1 + input_offset);
__m128 v2 = _mm_loadu_ps(i2);
i2 = (const float*) ((uintptr_t) i2 + input_offset);
__m128 v3 = _mm_loadu_ps(i3);
i3 = (const float*) ((uintptr_t) i3 + input_offset);
_MM_TRANSPOSE4_PS(v0, v1, v2, v3);
_mm_storeu_ps(o3, v3);
o3 = (float*) ((uintptr_t) o3 + tile_wbytes);
_mm_storeu_ps(o2, v2);
o2 = (float*) ((uintptr_t) o2 + tile_wbytes);
_mm_storeu_ps(o1, v1);
o1 = (float*) ((uintptr_t) o1 + tile_wbytes);
_mm_storeu_ps(o0, v0);
o0 = (float*) ((uintptr_t) o0 + tile_wbytes);
}
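// Handle a remainder of 1-3 rows: alias the unused input pointers to valid rows and pad the
// tile with zeros before transposing.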
if (bh != 0) {
if XNN_UNPREDICTABLE(bh <= 2) {
i2 = i0;
}
if XNN_UNPREDICTABLE(bh < 2) {
i1 = i0;
}
__m128 v0 = _mm_loadu_ps(i0);
__m128 v1 = _mm_loadu_ps(i1);
__m128 v2 = _mm_loadu_ps(i2);
__m128 v3 = _mm_setzero_ps();
_MM_TRANSPOSE4_PS(v0, v1, v2, v3);
if (bh & 2) {
_mm_storel_pi((__m64*) o3, v3);
o3 += 2;
_mm_storel_pi((__m64*) o2, v2);
o2 += 2;
_mm_storel_pi((__m64*) o1, v1);
o1 += 2;
_mm_storel_pi((__m64*) o0, v0);
o0 += 2;
v0 = _mm_movehl_ps(v0, v0);
v1 = _mm_movehl_ps(v1, v1);
v2 = _mm_movehl_ps(v2, v2);
v3 = _mm_movehl_ps(v3, v3);
}
if (bh & 1) {
_mm_store_ss(o3, v3);
_mm_store_ss(o2, v2);
_mm_store_ss(o1, v1);
_mm_store_ss(o0, v0);
}
}
i0 = (const float*) ((uintptr_t) i0 + input_vreset);
i1 = (const float*) ((uintptr_t) i0 + input_stride);
i2 = (const float*) ((uintptr_t) i1 + input_stride);
i3 = (const float*) ((uintptr_t) i2 + input_stride);
o0 = (float*) ((uintptr_t) o0 + output_vreset);
o1 = (float*) ((uintptr_t) o1 + output_vreset);
o2 = (float*) ((uintptr_t) o2 + output_vreset);
o3 = (float*) ((uintptr_t) o3 + output_vreset);
block_width = doz(block_width, tile_width);
} while (block_width != 0);
}