Refactor parameters in F32 VRND microkernels
- Move mask_table into AVX microkernel parameters to unblock amalgamation
- Move constant literals into microkernel parameters
PiperOrigin-RevId: 419529540
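
Note: the sketch below is a rough, hypothetical layout of the params union these kernels now read from, inferred only from the field accesses visible in this diff (params->avx.mask_table and params->wasmsimd.{sign_mask, magic_bias, one}); the actual XNNPACK header may differ in naming and alignment attributes.

    #include <stdint.h>

    /* Hypothetical params layout inferred from this diff, not the exact
       XNNPACK declaration. */
    union xnn_f32_rnd_params {
      struct {
        /* Seven -1 entries followed by seven 0 entries: an unaligned 32-byte
           load starting n bytes before &mask_table[7] yields -1 in exactly
           the lanes covering the n remaining bytes of input. */
        int32_t mask_table[14];
      } avx;
      struct {
        /* Two-element arrays so wasm_v128_load64_splat() can broadcast a
           64-bit pair (two identical floats) across the whole v128. */
        float sign_mask[2];   /* { -0.0f, -0.0f } */
        float magic_bias[2];  /* { 0x1.000000p+23f, 0x1.000000p+23f } */
        float one[2];         /* { 1.0f, 1.0f } */
      } wasmsimd;
    };

Keeping these constants in per-kernel parameters rather than file-scope statics removes the duplicate mask_table definitions that previously blocked amalgamating the generated sources into a single translation unit.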
diff --git a/src/f32-vrnd/avx.c.in b/src/f32-vrnd/avx.c.in
index 4ac9915..2914797 100644
--- a/src/f32-vrnd/avx.c.in
+++ b/src/f32-vrnd/avx.c.in
@@ -16,8 +16,6 @@
#include <xnnpack/vunary.h>
-static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
-
$_MM_FROUND_TO_FLAG = {
$ "RNDNE": "_MM_FROUND_TO_NEAREST_INT",
$ "RNDZ": "_MM_FROUND_TO_ZERO",
@@ -60,7 +58,7 @@
if XNN_UNLIKELY(n != 0) {
assert(n >= 1 * sizeof(float));
assert(n <= 7 * sizeof(float));
- __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - n));
+ const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
const __m256 vx = _mm256_maskload_ps(x, vmask);
const __m256 vy = _mm256_round_ps(vx, ${_MM_FROUND_TO_FLAG} | _MM_FROUND_NO_EXC);
diff --git a/src/f32-vrnd/gen/vrndd-avx-x16.c b/src/f32-vrnd/gen/vrndd-avx-x16.c
index 0306175..1f4d736 100644
--- a/src/f32-vrnd/gen/vrndd-avx-x16.c
+++ b/src/f32-vrnd/gen/vrndd-avx-x16.c
@@ -16,8 +16,6 @@
#include <xnnpack/vunary.h>
-static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
-
void xnn_f32_vrndd_ukernel__avx_x16(
size_t n,
const float* x,
@@ -51,7 +49,7 @@
if XNN_UNLIKELY(n != 0) {
assert(n >= 1 * sizeof(float));
assert(n <= 7 * sizeof(float));
- __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - n));
+ const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
const __m256 vx = _mm256_maskload_ps(x, vmask);
const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
diff --git a/src/f32-vrnd/gen/vrndd-avx-x8.c b/src/f32-vrnd/gen/vrndd-avx-x8.c
index 4e311b3..cbe0327 100644
--- a/src/f32-vrnd/gen/vrndd-avx-x8.c
+++ b/src/f32-vrnd/gen/vrndd-avx-x8.c
@@ -16,8 +16,6 @@
#include <xnnpack/vunary.h>
-static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
-
void xnn_f32_vrndd_ukernel__avx_x8(
size_t n,
const float* x,
@@ -39,7 +37,7 @@
if XNN_UNLIKELY(n != 0) {
assert(n >= 1 * sizeof(float));
assert(n <= 7 * sizeof(float));
- __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - n));
+ const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
const __m256 vx = _mm256_maskload_ps(x, vmask);
const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
diff --git a/src/f32-vrnd/gen/vrndd-wasmsimd-addsub-x4.c b/src/f32-vrnd/gen/vrndd-wasmsimd-addsub-x4.c
index 6c4ad52..093f173 100644
--- a/src/f32-vrnd/gen/vrndd-wasmsimd-addsub-x4.c
+++ b/src/f32-vrnd/gen/vrndd-wasmsimd-addsub-x4.c
@@ -25,16 +25,16 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
- const v128_t vone = wasm_f32x4_const_splat(1.0f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
+ const v128_t vone = wasm_v128_load64_splat(params->wasmsimd.one);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t vx = wasm_v128_load(x);
x += 4;
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vrndx = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
const v128_t vy = wasm_f32x4_sub(vrndx, wasm_v128_and(wasm_f32x4_lt(vx, vrndx), vone));
@@ -45,8 +45,8 @@
const v128_t vx = wasm_v128_load(x);
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vrndx = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
v128_t vy = wasm_f32x4_sub(vrndx, wasm_v128_and(wasm_f32x4_lt(vx, vrndx), vone));
diff --git a/src/f32-vrnd/gen/vrndd-wasmsimd-addsub-x8.c b/src/f32-vrnd/gen/vrndd-wasmsimd-addsub-x8.c
index de63427..9d406fc 100644
--- a/src/f32-vrnd/gen/vrndd-wasmsimd-addsub-x8.c
+++ b/src/f32-vrnd/gen/vrndd-wasmsimd-addsub-x8.c
@@ -25,9 +25,9 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
- const v128_t vone = wasm_f32x4_const_splat(1.0f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
+ const v128_t vone = wasm_v128_load64_splat(params->wasmsimd.one);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t vx0123 = wasm_v128_load(x);
const v128_t vx4567 = wasm_v128_load(x + 4);
@@ -36,11 +36,11 @@
const v128_t vabsx0123 = wasm_v128_andnot(vx0123, vsign_mask);
const v128_t vabsx4567 = wasm_v128_andnot(vx4567, vsign_mask);
- const v128_t vrndmask0123 = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx0123));
- const v128_t vrndmask4567 = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx4567));
+ const v128_t vrndmask0123 = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx0123));
+ const v128_t vrndmask4567 = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx4567));
- const v128_t vrndabsx0123 = wasm_f32x4_sub(wasm_f32x4_add(vabsx0123, vmagic_number), vmagic_number);
- const v128_t vrndabsx4567 = wasm_f32x4_sub(wasm_f32x4_add(vabsx4567, vmagic_number), vmagic_number);
+ const v128_t vrndabsx0123 = wasm_f32x4_sub(wasm_f32x4_add(vabsx0123, vmagic_bias), vmagic_bias);
+ const v128_t vrndabsx4567 = wasm_f32x4_sub(wasm_f32x4_add(vabsx4567, vmagic_bias), vmagic_bias);
const v128_t vrndx0123 = wasm_v128_bitselect(vx0123, vrndabsx0123, vrndmask0123);
const v128_t vrndx4567 = wasm_v128_bitselect(vx4567, vrndabsx4567, vrndmask4567);
@@ -57,8 +57,8 @@
x += 4;
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vrndx = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
const v128_t vy = wasm_f32x4_sub(vrndx, wasm_v128_and(wasm_f32x4_lt(vx, vrndx), vone));
@@ -69,8 +69,8 @@
const v128_t vx = wasm_v128_load(x);
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vrndx = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
v128_t vy = wasm_f32x4_sub(vrndx, wasm_v128_and(wasm_f32x4_lt(vx, vrndx), vone));
diff --git a/src/f32-vrnd/gen/vrndd-wasmsimd-cvt-x4.c b/src/f32-vrnd/gen/vrndd-wasmsimd-cvt-x4.c
index f786d4e..7c4a538 100644
--- a/src/f32-vrnd/gen/vrndd-wasmsimd-cvt-x4.c
+++ b/src/f32-vrnd/gen/vrndd-wasmsimd-cvt-x4.c
@@ -25,9 +25,9 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
- const v128_t vone = wasm_f32x4_const_splat(1.0f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
+ const v128_t vone = wasm_v128_load64_splat(params->wasmsimd.one);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t vx = wasm_v128_load(x);
x += 4;
@@ -35,7 +35,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vprerndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
const v128_t vrndx = wasm_v128_bitselect(vprerndx, vx, vrndmask);
const v128_t vadj = wasm_v128_and(wasm_f32x4_lt(vx, vrndx), vone);
const v128_t vy = wasm_f32x4_sub(vrndx, vadj);
@@ -49,7 +49,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vprerndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
const v128_t vrndx = wasm_v128_bitselect(vprerndx, vx, vrndmask);
const v128_t vadj = wasm_v128_and(wasm_f32x4_lt(vx, vrndx), vone);
v128_t vy = wasm_f32x4_sub(vrndx, vadj);
diff --git a/src/f32-vrnd/gen/vrndd-wasmsimd-cvt-x8.c b/src/f32-vrnd/gen/vrndd-wasmsimd-cvt-x8.c
index 0f30e22..04cc5f6 100644
--- a/src/f32-vrnd/gen/vrndd-wasmsimd-cvt-x8.c
+++ b/src/f32-vrnd/gen/vrndd-wasmsimd-cvt-x8.c
@@ -25,9 +25,9 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
- const v128_t vone = wasm_f32x4_const_splat(1.0f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
+ const v128_t vone = wasm_v128_load64_splat(params->wasmsimd.one);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t vx0123 = wasm_v128_load(x);
const v128_t vx4567 = wasm_v128_load(x + 4);
@@ -39,9 +39,9 @@
const v128_t vabsx4567 = wasm_f32x4_abs(vx4567);
const v128_t vprerndx0123 = wasm_f32x4_convert_i32x4(vintx0123);
- const v128_t vrndmask0123 = wasm_v128_andnot(wasm_f32x4_lt(vabsx0123, vmagic_number), vsign_mask);
+ const v128_t vrndmask0123 = wasm_v128_andnot(wasm_f32x4_lt(vabsx0123, vmagic_bias), vsign_mask);
const v128_t vprerndx4567 = wasm_f32x4_convert_i32x4(vintx4567);
- const v128_t vrndmask4567 = wasm_v128_andnot(wasm_f32x4_lt(vabsx4567, vmagic_number), vsign_mask);
+ const v128_t vrndmask4567 = wasm_v128_andnot(wasm_f32x4_lt(vabsx4567, vmagic_bias), vsign_mask);
const v128_t vrndx0123 = wasm_v128_bitselect(vprerndx0123, vx0123, vrndmask0123);
const v128_t vrndx4567 = wasm_v128_bitselect(vprerndx4567, vx4567, vrndmask4567);
@@ -63,7 +63,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vprerndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
const v128_t vrndx = wasm_v128_bitselect(vprerndx, vx, vrndmask);
const v128_t vadj = wasm_v128_and(wasm_f32x4_lt(vx, vrndx), vone);
const v128_t vy = wasm_f32x4_sub(vrndx, vadj);
@@ -77,7 +77,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vprerndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
const v128_t vrndx = wasm_v128_bitselect(vprerndx, vx, vrndmask);
const v128_t vadj = wasm_v128_and(wasm_f32x4_lt(vx, vrndx), vone);
v128_t vy = wasm_f32x4_sub(vrndx, vadj);
diff --git a/src/f32-vrnd/gen/vrndne-avx-x16.c b/src/f32-vrnd/gen/vrndne-avx-x16.c
index a90bf3c..9e61460 100644
--- a/src/f32-vrnd/gen/vrndne-avx-x16.c
+++ b/src/f32-vrnd/gen/vrndne-avx-x16.c
@@ -16,8 +16,6 @@
#include <xnnpack/vunary.h>
-static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
-
void xnn_f32_vrndne_ukernel__avx_x16(
size_t n,
const float* x,
@@ -51,7 +49,7 @@
if XNN_UNLIKELY(n != 0) {
assert(n >= 1 * sizeof(float));
assert(n <= 7 * sizeof(float));
- __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - n));
+ const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
const __m256 vx = _mm256_maskload_ps(x, vmask);
const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
diff --git a/src/f32-vrnd/gen/vrndne-avx-x8.c b/src/f32-vrnd/gen/vrndne-avx-x8.c
index 8568efc..99cff04 100644
--- a/src/f32-vrnd/gen/vrndne-avx-x8.c
+++ b/src/f32-vrnd/gen/vrndne-avx-x8.c
@@ -16,8 +16,6 @@
#include <xnnpack/vunary.h>
-static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
-
void xnn_f32_vrndne_ukernel__avx_x8(
size_t n,
const float* x,
@@ -39,7 +37,7 @@
if XNN_UNLIKELY(n != 0) {
assert(n >= 1 * sizeof(float));
assert(n <= 7 * sizeof(float));
- __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - n));
+ const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
const __m256 vx = _mm256_maskload_ps(x, vmask);
const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
diff --git a/src/f32-vrnd/gen/vrndne-wasmsimd-addsub-x4.c b/src/f32-vrnd/gen/vrndne-wasmsimd-addsub-x4.c
index c8163a5..cf57aab 100644
--- a/src/f32-vrnd/gen/vrndne-wasmsimd-addsub-x4.c
+++ b/src/f32-vrnd/gen/vrndne-wasmsimd-addsub-x4.c
@@ -25,15 +25,15 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t vx = wasm_v128_load(x);
x += 4;
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vy = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
wasm_v128_store(y, vy);
@@ -43,8 +43,8 @@
const v128_t vx = wasm_v128_load(x);
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
v128_t vy = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
if (n & (2 * sizeof(float))) {
diff --git a/src/f32-vrnd/gen/vrndne-wasmsimd-addsub-x8.c b/src/f32-vrnd/gen/vrndne-wasmsimd-addsub-x8.c
index da97437..431c08a 100644
--- a/src/f32-vrnd/gen/vrndne-wasmsimd-addsub-x8.c
+++ b/src/f32-vrnd/gen/vrndne-wasmsimd-addsub-x8.c
@@ -25,8 +25,8 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t vx0123 = wasm_v128_load(x);
const v128_t vx4567 = wasm_v128_load(x + 4);
@@ -35,11 +35,11 @@
const v128_t vabsx0123 = wasm_v128_andnot(vx0123, vsign_mask);
const v128_t vabsx4567 = wasm_v128_andnot(vx4567, vsign_mask);
- const v128_t vrndmask0123 = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx0123));
- const v128_t vrndmask4567 = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx4567));
+ const v128_t vrndmask0123 = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx0123));
+ const v128_t vrndmask4567 = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx4567));
- const v128_t vrndabsx0123 = wasm_f32x4_sub(wasm_f32x4_add(vabsx0123, vmagic_number), vmagic_number);
- const v128_t vrndabsx4567 = wasm_f32x4_sub(wasm_f32x4_add(vabsx4567, vmagic_number), vmagic_number);
+ const v128_t vrndabsx0123 = wasm_f32x4_sub(wasm_f32x4_add(vabsx0123, vmagic_bias), vmagic_bias);
+ const v128_t vrndabsx4567 = wasm_f32x4_sub(wasm_f32x4_add(vabsx4567, vmagic_bias), vmagic_bias);
const v128_t vy0123 = wasm_v128_bitselect(vx0123, vrndabsx0123, vrndmask0123);
const v128_t vy4567 = wasm_v128_bitselect(vx4567, vrndabsx4567, vrndmask4567);
@@ -53,8 +53,8 @@
x += 4;
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vy = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
wasm_v128_store(y, vy);
@@ -64,8 +64,8 @@
const v128_t vx = wasm_v128_load(x);
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
v128_t vy = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
if (n & (2 * sizeof(float))) {
diff --git a/src/f32-vrnd/gen/vrndu-avx-x16.c b/src/f32-vrnd/gen/vrndu-avx-x16.c
index 18167a0..aa0557d 100644
--- a/src/f32-vrnd/gen/vrndu-avx-x16.c
+++ b/src/f32-vrnd/gen/vrndu-avx-x16.c
@@ -16,8 +16,6 @@
#include <xnnpack/vunary.h>
-static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
-
void xnn_f32_vrndu_ukernel__avx_x16(
size_t n,
const float* x,
@@ -51,7 +49,7 @@
if XNN_UNLIKELY(n != 0) {
assert(n >= 1 * sizeof(float));
assert(n <= 7 * sizeof(float));
- __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - n));
+ const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
const __m256 vx = _mm256_maskload_ps(x, vmask);
const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
diff --git a/src/f32-vrnd/gen/vrndu-avx-x8.c b/src/f32-vrnd/gen/vrndu-avx-x8.c
index a18f175..c490fd6 100644
--- a/src/f32-vrnd/gen/vrndu-avx-x8.c
+++ b/src/f32-vrnd/gen/vrndu-avx-x8.c
@@ -16,8 +16,6 @@
#include <xnnpack/vunary.h>
-static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
-
void xnn_f32_vrndu_ukernel__avx_x8(
size_t n,
const float* x,
@@ -39,7 +37,7 @@
if XNN_UNLIKELY(n != 0) {
assert(n >= 1 * sizeof(float));
assert(n <= 7 * sizeof(float));
- __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - n));
+ const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
const __m256 vx = _mm256_maskload_ps(x, vmask);
const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
diff --git a/src/f32-vrnd/gen/vrndu-wasmsimd-addsub-x4.c b/src/f32-vrnd/gen/vrndu-wasmsimd-addsub-x4.c
index c7769b9..f6aa630 100644
--- a/src/f32-vrnd/gen/vrndu-wasmsimd-addsub-x4.c
+++ b/src/f32-vrnd/gen/vrndu-wasmsimd-addsub-x4.c
@@ -25,16 +25,16 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
- const v128_t vone = wasm_f32x4_const_splat(1.0f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
+ const v128_t vone = wasm_v128_load64_splat(params->wasmsimd.one);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t vx = wasm_v128_load(x);
x += 4;
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vrndx = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
const v128_t vadjmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vx, vrndx));
const v128_t vadjrndx = wasm_f32x4_add(vrndx, vone);
@@ -47,8 +47,8 @@
const v128_t vx = wasm_v128_load(x);
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vrndx = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
const v128_t vadjmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vx, vrndx));
const v128_t vadjrndx = wasm_f32x4_add(vrndx, vone);
diff --git a/src/f32-vrnd/gen/vrndu-wasmsimd-addsub-x8.c b/src/f32-vrnd/gen/vrndu-wasmsimd-addsub-x8.c
index 702b089..cc74a15 100644
--- a/src/f32-vrnd/gen/vrndu-wasmsimd-addsub-x8.c
+++ b/src/f32-vrnd/gen/vrndu-wasmsimd-addsub-x8.c
@@ -25,9 +25,9 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
- const v128_t vone = wasm_f32x4_const_splat(1.0f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
+ const v128_t vone = wasm_v128_load64_splat(params->wasmsimd.one);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t vx0123 = wasm_v128_load(x);
const v128_t vx4567 = wasm_v128_load(x + 4);
@@ -36,11 +36,11 @@
const v128_t vabsx0123 = wasm_v128_andnot(vx0123, vsign_mask);
const v128_t vabsx4567 = wasm_v128_andnot(vx4567, vsign_mask);
- const v128_t vrndmask0123 = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx0123));
- const v128_t vrndmask4567 = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx4567));
+ const v128_t vrndmask0123 = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx0123));
+ const v128_t vrndmask4567 = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx4567));
- const v128_t vrndabsx0123 = wasm_f32x4_sub(wasm_f32x4_add(vabsx0123, vmagic_number), vmagic_number);
- const v128_t vrndabsx4567 = wasm_f32x4_sub(wasm_f32x4_add(vabsx4567, vmagic_number), vmagic_number);
+ const v128_t vrndabsx0123 = wasm_f32x4_sub(wasm_f32x4_add(vabsx0123, vmagic_bias), vmagic_bias);
+ const v128_t vrndabsx4567 = wasm_f32x4_sub(wasm_f32x4_add(vabsx4567, vmagic_bias), vmagic_bias);
const v128_t vrndx0123 = wasm_v128_bitselect(vx0123, vrndabsx0123, vrndmask0123);
const v128_t vrndx4567 = wasm_v128_bitselect(vx4567, vrndabsx4567, vrndmask4567);
@@ -63,8 +63,8 @@
x += 4;
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vrndx = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
const v128_t vadjmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vx, vrndx));
const v128_t vadjrndx = wasm_f32x4_add(vrndx, vone);
@@ -77,8 +77,8 @@
const v128_t vx = wasm_v128_load(x);
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vrndx = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
const v128_t vadjmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vx, vrndx));
const v128_t vadjrndx = wasm_f32x4_add(vrndx, vone);
diff --git a/src/f32-vrnd/gen/vrndu-wasmsimd-cvt-x4.c b/src/f32-vrnd/gen/vrndu-wasmsimd-cvt-x4.c
index ed93237..e4572b5 100644
--- a/src/f32-vrnd/gen/vrndu-wasmsimd-cvt-x4.c
+++ b/src/f32-vrnd/gen/vrndu-wasmsimd-cvt-x4.c
@@ -25,9 +25,9 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
- const v128_t vone = wasm_f32x4_const_splat(1.0f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
+ const v128_t vone = wasm_v128_load64_splat(params->wasmsimd.one);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t vx = wasm_v128_load(x);
x += 4;
@@ -35,7 +35,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vprerndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
const v128_t vrndx = wasm_v128_bitselect(vprerndx, vx, vrndmask);
const v128_t vadjmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vx, vrndx));
const v128_t vadjrndx = wasm_f32x4_add(vrndx, vone);
@@ -50,7 +50,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vprerndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
const v128_t vrndx = wasm_v128_bitselect(vprerndx, vx, vrndmask);
const v128_t vadjmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vx, vrndx));
const v128_t vadjrndx = wasm_f32x4_add(vrndx, vone);
diff --git a/src/f32-vrnd/gen/vrndu-wasmsimd-cvt-x8.c b/src/f32-vrnd/gen/vrndu-wasmsimd-cvt-x8.c
index 36f5cd9..bf4cef7 100644
--- a/src/f32-vrnd/gen/vrndu-wasmsimd-cvt-x8.c
+++ b/src/f32-vrnd/gen/vrndu-wasmsimd-cvt-x8.c
@@ -25,9 +25,9 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
- const v128_t vone = wasm_f32x4_const_splat(1.0f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
+ const v128_t vone = wasm_v128_load64_splat(params->wasmsimd.one);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t vx0123 = wasm_v128_load(x);
const v128_t vx4567 = wasm_v128_load(x + 4);
@@ -39,9 +39,9 @@
const v128_t vabsx4567 = wasm_f32x4_abs(vx4567);
const v128_t vprerndx0123 = wasm_f32x4_convert_i32x4(vintx0123);
- const v128_t vrndmask0123 = wasm_v128_andnot(wasm_f32x4_lt(vabsx0123, vmagic_number), vsign_mask);
+ const v128_t vrndmask0123 = wasm_v128_andnot(wasm_f32x4_lt(vabsx0123, vmagic_bias), vsign_mask);
const v128_t vprerndx4567 = wasm_f32x4_convert_i32x4(vintx4567);
- const v128_t vrndmask4567 = wasm_v128_andnot(wasm_f32x4_lt(vabsx4567, vmagic_number), vsign_mask);
+ const v128_t vrndmask4567 = wasm_v128_andnot(wasm_f32x4_lt(vabsx4567, vmagic_bias), vsign_mask);
const v128_t vrndx0123 = wasm_v128_bitselect(vprerndx0123, vx0123, vrndmask0123);
const v128_t vrndx4567 = wasm_v128_bitselect(vprerndx4567, vx4567, vrndmask4567);
@@ -66,7 +66,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vprerndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
const v128_t vrndx = wasm_v128_bitselect(vprerndx, vx, vrndmask);
const v128_t vadjmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vx, vrndx));
const v128_t vadjrndx = wasm_f32x4_add(vrndx, vone);
@@ -81,7 +81,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vprerndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
const v128_t vrndx = wasm_v128_bitselect(vprerndx, vx, vrndmask);
const v128_t vadjmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vx, vrndx));
const v128_t vadjrndx = wasm_f32x4_add(vrndx, vone);
diff --git a/src/f32-vrnd/gen/vrndz-avx-x16.c b/src/f32-vrnd/gen/vrndz-avx-x16.c
index 7790580..12bb48d 100644
--- a/src/f32-vrnd/gen/vrndz-avx-x16.c
+++ b/src/f32-vrnd/gen/vrndz-avx-x16.c
@@ -16,8 +16,6 @@
#include <xnnpack/vunary.h>
-static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
-
void xnn_f32_vrndz_ukernel__avx_x16(
size_t n,
const float* x,
@@ -51,7 +49,7 @@
if XNN_UNLIKELY(n != 0) {
assert(n >= 1 * sizeof(float));
assert(n <= 7 * sizeof(float));
- __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - n));
+ const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
const __m256 vx = _mm256_maskload_ps(x, vmask);
const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
diff --git a/src/f32-vrnd/gen/vrndz-avx-x8.c b/src/f32-vrnd/gen/vrndz-avx-x8.c
index 7050492..e9efa20 100644
--- a/src/f32-vrnd/gen/vrndz-avx-x8.c
+++ b/src/f32-vrnd/gen/vrndz-avx-x8.c
@@ -16,8 +16,6 @@
#include <xnnpack/vunary.h>
-static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
-
void xnn_f32_vrndz_ukernel__avx_x8(
size_t n,
const float* x,
@@ -39,7 +37,7 @@
if XNN_UNLIKELY(n != 0) {
assert(n >= 1 * sizeof(float));
assert(n <= 7 * sizeof(float));
- __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - n));
+ const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
const __m256 vx = _mm256_maskload_ps(x, vmask);
const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
diff --git a/src/f32-vrnd/gen/vrndz-wasmsimd-addsub-x4.c b/src/f32-vrnd/gen/vrndz-wasmsimd-addsub-x4.c
index 82d4a28..6250a12 100644
--- a/src/f32-vrnd/gen/vrndz-wasmsimd-addsub-x4.c
+++ b/src/f32-vrnd/gen/vrndz-wasmsimd-addsub-x4.c
@@ -25,16 +25,16 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
- const v128_t vone = wasm_f32x4_const_splat(1.0f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
+ const v128_t vone = wasm_v128_load64_splat(params->wasmsimd.one);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t vx = wasm_v128_load(x);
x += 4;
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vadjustment = wasm_v128_and(wasm_f32x4_lt(vabsx, vrndabsx), vone);
const v128_t vflrabsx = wasm_f32x4_sub(vrndabsx, vadjustment);
const v128_t vy = wasm_v128_bitselect(vx, vflrabsx, vrndmask);
@@ -46,8 +46,8 @@
const v128_t vx = wasm_v128_load(x);
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vadjustment = wasm_v128_and(wasm_f32x4_lt(vabsx, vrndabsx), vone);
const v128_t vflrabsx = wasm_f32x4_sub(vrndabsx, vadjustment);
v128_t vy = wasm_v128_bitselect(vx, vflrabsx, vrndmask);
diff --git a/src/f32-vrnd/gen/vrndz-wasmsimd-addsub-x8.c b/src/f32-vrnd/gen/vrndz-wasmsimd-addsub-x8.c
index 4f6f093..e95cc34 100644
--- a/src/f32-vrnd/gen/vrndz-wasmsimd-addsub-x8.c
+++ b/src/f32-vrnd/gen/vrndz-wasmsimd-addsub-x8.c
@@ -25,9 +25,9 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
- const v128_t vone = wasm_f32x4_const_splat(1.0f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
+ const v128_t vone = wasm_v128_load64_splat(params->wasmsimd.one);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t vx0123 = wasm_v128_load(x);
const v128_t vx4567 = wasm_v128_load(x + 4);
@@ -36,11 +36,11 @@
const v128_t vabsx0123 = wasm_v128_andnot(vx0123, vsign_mask);
const v128_t vabsx4567 = wasm_v128_andnot(vx4567, vsign_mask);
- const v128_t vrndmask0123 = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx0123));
- const v128_t vrndmask4567 = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx4567));
+ const v128_t vrndmask0123 = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx0123));
+ const v128_t vrndmask4567 = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx4567));
- const v128_t vrndabsx0123 = wasm_f32x4_sub(wasm_f32x4_add(vabsx0123, vmagic_number), vmagic_number);
- const v128_t vrndabsx4567 = wasm_f32x4_sub(wasm_f32x4_add(vabsx4567, vmagic_number), vmagic_number);
+ const v128_t vrndabsx0123 = wasm_f32x4_sub(wasm_f32x4_add(vabsx0123, vmagic_bias), vmagic_bias);
+ const v128_t vrndabsx4567 = wasm_f32x4_sub(wasm_f32x4_add(vabsx4567, vmagic_bias), vmagic_bias);
const v128_t vadjustment0123 = wasm_v128_and(wasm_f32x4_lt(vabsx0123, vrndabsx0123), vone);
const v128_t vadjustment4567 = wasm_v128_and(wasm_f32x4_lt(vabsx4567, vrndabsx4567), vone);
@@ -60,8 +60,8 @@
x += 4;
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vadjustment = wasm_v128_and(wasm_f32x4_lt(vabsx, vrndabsx), vone);
const v128_t vflrabsx = wasm_f32x4_sub(vrndabsx, vadjustment);
const v128_t vy = wasm_v128_bitselect(vx, vflrabsx, vrndmask);
@@ -73,8 +73,8 @@
const v128_t vx = wasm_v128_load(x);
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vadjustment = wasm_v128_and(wasm_f32x4_lt(vabsx, vrndabsx), vone);
const v128_t vflrabsx = wasm_f32x4_sub(vrndabsx, vadjustment);
v128_t vy = wasm_v128_bitselect(vx, vflrabsx, vrndmask);
diff --git a/src/f32-vrnd/gen/vrndz-wasmsimd-cvt-x4.c b/src/f32-vrnd/gen/vrndz-wasmsimd-cvt-x4.c
index 43c3939..6168b6e 100644
--- a/src/f32-vrnd/gen/vrndz-wasmsimd-cvt-x4.c
+++ b/src/f32-vrnd/gen/vrndz-wasmsimd-cvt-x4.c
@@ -25,8 +25,8 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t vx = wasm_v128_load(x);
x += 4;
@@ -34,7 +34,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vrndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
const v128_t vy = wasm_v128_bitselect(vrndx, vx, vrndmask);
wasm_v128_store(y, vy);
@@ -46,7 +46,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vrndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
v128_t vy = wasm_v128_bitselect(vrndx, vx, vrndmask);
if (n & (2 * sizeof(float))) {
diff --git a/src/f32-vrnd/gen/vrndz-wasmsimd-cvt-x8.c b/src/f32-vrnd/gen/vrndz-wasmsimd-cvt-x8.c
index 5bba5d7..f592d5e 100644
--- a/src/f32-vrnd/gen/vrndz-wasmsimd-cvt-x8.c
+++ b/src/f32-vrnd/gen/vrndz-wasmsimd-cvt-x8.c
@@ -25,8 +25,8 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t vx0123 = wasm_v128_load(x);
const v128_t vx4567 = wasm_v128_load(x + 4);
@@ -38,9 +38,9 @@
const v128_t vabsx4567 = wasm_f32x4_abs(vx4567);
const v128_t vrndx0123 = wasm_f32x4_convert_i32x4(vintx0123);
- const v128_t vrndmask0123 = wasm_v128_andnot(wasm_f32x4_lt(vabsx0123, vmagic_number), vsign_mask);
+ const v128_t vrndmask0123 = wasm_v128_andnot(wasm_f32x4_lt(vabsx0123, vmagic_bias), vsign_mask);
const v128_t vrndx4567 = wasm_f32x4_convert_i32x4(vintx4567);
- const v128_t vrndmask4567 = wasm_v128_andnot(wasm_f32x4_lt(vabsx4567, vmagic_number), vsign_mask);
+ const v128_t vrndmask4567 = wasm_v128_andnot(wasm_f32x4_lt(vabsx4567, vmagic_bias), vsign_mask);
const v128_t vy0123 = wasm_v128_bitselect(vrndx0123, vx0123, vrndmask0123);
const v128_t vy4567 = wasm_v128_bitselect(vrndx4567, vx4567, vrndmask4567);
@@ -56,7 +56,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vrndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
const v128_t vy = wasm_v128_bitselect(vrndx, vx, vrndmask);
wasm_v128_store(y, vy);
@@ -68,7 +68,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vrndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
v128_t vy = wasm_v128_bitselect(vrndx, vx, vrndmask);
if (n & (2 * sizeof(float))) {
diff --git a/src/f32-vrnd/vrndd-wasmsimd-addsub.c.in b/src/f32-vrnd/vrndd-wasmsimd-addsub.c.in
index 59965aa..c4c973b 100644
--- a/src/f32-vrnd/vrndd-wasmsimd-addsub.c.in
+++ b/src/f32-vrnd/vrndd-wasmsimd-addsub.c.in
@@ -24,9 +24,9 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
- const v128_t vone = wasm_f32x4_const_splat(1.0f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
+ const v128_t vone = wasm_v128_load64_splat(params->wasmsimd.one);
$if BATCH_TILE > 4:
for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
const v128_t vx${ABC[0:4]} = wasm_v128_load(x);
@@ -38,10 +38,10 @@
const v128_t vabsx${ABC[N:N+4]} = wasm_v128_andnot(vx${ABC[N:N+4]}, vsign_mask);
$for N in range(0, BATCH_TILE, 4):
- const v128_t vrndmask${ABC[N:N+4]} = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx${ABC[N:N+4]}));
+ const v128_t vrndmask${ABC[N:N+4]} = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx${ABC[N:N+4]}));
$for N in range(0, BATCH_TILE, 4):
- const v128_t vrndabsx${ABC[N:N+4]} = wasm_f32x4_sub(wasm_f32x4_add(vabsx${ABC[N:N+4]}, vmagic_number), vmagic_number);
+ const v128_t vrndabsx${ABC[N:N+4]} = wasm_f32x4_sub(wasm_f32x4_add(vabsx${ABC[N:N+4]}, vmagic_bias), vmagic_bias);
$for N in range(0, BATCH_TILE, 4):
const v128_t vrndx${ABC[N:N+4]} = wasm_v128_bitselect(vx${ABC[N:N+4]}, vrndabsx${ABC[N:N+4]}, vrndmask${ABC[N:N+4]});
@@ -59,8 +59,8 @@
x += 4;
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vrndx = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
const v128_t vy = wasm_f32x4_sub(vrndx, wasm_v128_and(wasm_f32x4_lt(vx, vrndx), vone));
@@ -71,8 +71,8 @@
const v128_t vx = wasm_v128_load(x);
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vrndx = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
v128_t vy = wasm_f32x4_sub(vrndx, wasm_v128_and(wasm_f32x4_lt(vx, vrndx), vone));
diff --git a/src/f32-vrnd/vrndd-wasmsimd-cvt.c.in b/src/f32-vrnd/vrndd-wasmsimd-cvt.c.in
index c82e5ee..417eaba 100644
--- a/src/f32-vrnd/vrndd-wasmsimd-cvt.c.in
+++ b/src/f32-vrnd/vrndd-wasmsimd-cvt.c.in
@@ -24,9 +24,9 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
- const v128_t vone = wasm_f32x4_const_splat(1.0f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
+ const v128_t vone = wasm_v128_load64_splat(params->wasmsimd.one);
$if BATCH_TILE > 4:
for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
const v128_t vx${ABC[0:4]} = wasm_v128_load(x);
@@ -40,7 +40,7 @@
$for N in range(0, BATCH_TILE, 4):
const v128_t vprerndx${ABC[N:N+4]} = wasm_f32x4_convert_i32x4(vintx${ABC[N:N+4]});
- const v128_t vrndmask${ABC[N:N+4]} = wasm_v128_andnot(wasm_f32x4_lt(vabsx${ABC[N:N+4]}, vmagic_number), vsign_mask);
+ const v128_t vrndmask${ABC[N:N+4]} = wasm_v128_andnot(wasm_f32x4_lt(vabsx${ABC[N:N+4]}, vmagic_bias), vsign_mask);
$for N in range(0, BATCH_TILE, 4):
const v128_t vrndx${ABC[N:N+4]} = wasm_v128_bitselect(vprerndx${ABC[N:N+4]}, vx${ABC[N:N+4]}, vrndmask${ABC[N:N+4]});
@@ -63,7 +63,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vprerndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
const v128_t vrndx = wasm_v128_bitselect(vprerndx, vx, vrndmask);
const v128_t vadj = wasm_v128_and(wasm_f32x4_lt(vx, vrndx), vone);
const v128_t vy = wasm_f32x4_sub(vrndx, vadj);
@@ -77,7 +77,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vprerndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
const v128_t vrndx = wasm_v128_bitselect(vprerndx, vx, vrndmask);
const v128_t vadj = wasm_v128_and(wasm_f32x4_lt(vx, vrndx), vone);
v128_t vy = wasm_f32x4_sub(vrndx, vadj);
diff --git a/src/f32-vrnd/vrndne-wasmsimd-addsub.c.in b/src/f32-vrnd/vrndne-wasmsimd-addsub.c.in
index 238eec1..93983f6 100644
--- a/src/f32-vrnd/vrndne-wasmsimd-addsub.c.in
+++ b/src/f32-vrnd/vrndne-wasmsimd-addsub.c.in
@@ -24,8 +24,8 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
$if BATCH_TILE > 4:
for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
const v128_t vx${ABC[0:4]} = wasm_v128_load(x);
@@ -37,10 +37,10 @@
const v128_t vabsx${ABC[N:N+4]} = wasm_v128_andnot(vx${ABC[N:N+4]}, vsign_mask);
$for N in range(0, BATCH_TILE, 4):
- const v128_t vrndmask${ABC[N:N+4]} = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx${ABC[N:N+4]}));
+ const v128_t vrndmask${ABC[N:N+4]} = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx${ABC[N:N+4]}));
$for N in range(0, BATCH_TILE, 4):
- const v128_t vrndabsx${ABC[N:N+4]} = wasm_f32x4_sub(wasm_f32x4_add(vabsx${ABC[N:N+4]}, vmagic_number), vmagic_number);
+ const v128_t vrndabsx${ABC[N:N+4]} = wasm_f32x4_sub(wasm_f32x4_add(vabsx${ABC[N:N+4]}, vmagic_bias), vmagic_bias);
$for N in range(0, BATCH_TILE, 4):
const v128_t vy${ABC[N:N+4]} = wasm_v128_bitselect(vx${ABC[N:N+4]}, vrndabsx${ABC[N:N+4]}, vrndmask${ABC[N:N+4]});
@@ -55,8 +55,8 @@
x += 4;
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vy = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
wasm_v128_store(y, vy);
@@ -66,8 +66,8 @@
const v128_t vx = wasm_v128_load(x);
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
v128_t vy = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
if (n & (2 * sizeof(float))) {
diff --git a/src/f32-vrnd/vrndu-wasmsimd-addsub.c.in b/src/f32-vrnd/vrndu-wasmsimd-addsub.c.in
index 1ee68ee..231c710 100644
--- a/src/f32-vrnd/vrndu-wasmsimd-addsub.c.in
+++ b/src/f32-vrnd/vrndu-wasmsimd-addsub.c.in
@@ -24,9 +24,9 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
- const v128_t vone = wasm_f32x4_const_splat(1.0f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
+ const v128_t vone = wasm_v128_load64_splat(params->wasmsimd.one);
$if BATCH_TILE > 4:
for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
const v128_t vx${ABC[0:4]} = wasm_v128_load(x);
@@ -38,10 +38,10 @@
const v128_t vabsx${ABC[N:N+4]} = wasm_v128_andnot(vx${ABC[N:N+4]}, vsign_mask);
$for N in range(0, BATCH_TILE, 4):
- const v128_t vrndmask${ABC[N:N+4]} = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx${ABC[N:N+4]}));
+ const v128_t vrndmask${ABC[N:N+4]} = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx${ABC[N:N+4]}));
$for N in range(0, BATCH_TILE, 4):
- const v128_t vrndabsx${ABC[N:N+4]} = wasm_f32x4_sub(wasm_f32x4_add(vabsx${ABC[N:N+4]}, vmagic_number), vmagic_number);
+ const v128_t vrndabsx${ABC[N:N+4]} = wasm_f32x4_sub(wasm_f32x4_add(vabsx${ABC[N:N+4]}, vmagic_bias), vmagic_bias);
$for N in range(0, BATCH_TILE, 4):
const v128_t vrndx${ABC[N:N+4]} = wasm_v128_bitselect(vx${ABC[N:N+4]}, vrndabsx${ABC[N:N+4]}, vrndmask${ABC[N:N+4]});
@@ -65,8 +65,8 @@
x += 4;
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vrndx = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
const v128_t vadjmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vx, vrndx));
const v128_t vadjrndx = wasm_f32x4_add(vrndx, vone);
@@ -79,8 +79,8 @@
const v128_t vx = wasm_v128_load(x);
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vrndx = wasm_v128_bitselect(vx, vrndabsx, vrndmask);
const v128_t vadjmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vx, vrndx));
const v128_t vadjrndx = wasm_f32x4_add(vrndx, vone);
diff --git a/src/f32-vrnd/vrndu-wasmsimd-cvt.c.in b/src/f32-vrnd/vrndu-wasmsimd-cvt.c.in
index 24e5527..55c0feb 100644
--- a/src/f32-vrnd/vrndu-wasmsimd-cvt.c.in
+++ b/src/f32-vrnd/vrndu-wasmsimd-cvt.c.in
@@ -24,9 +24,9 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
- const v128_t vone = wasm_f32x4_const_splat(1.0f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
+ const v128_t vone = wasm_v128_load64_splat(params->wasmsimd.one);
$if BATCH_TILE > 4:
for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
const v128_t vx${ABC[0:4]} = wasm_v128_load(x);
@@ -40,7 +40,7 @@
$for N in range(0, BATCH_TILE, 4):
const v128_t vprerndx${ABC[N:N+4]} = wasm_f32x4_convert_i32x4(vintx${ABC[N:N+4]});
- const v128_t vrndmask${ABC[N:N+4]} = wasm_v128_andnot(wasm_f32x4_lt(vabsx${ABC[N:N+4]}, vmagic_number), vsign_mask);
+ const v128_t vrndmask${ABC[N:N+4]} = wasm_v128_andnot(wasm_f32x4_lt(vabsx${ABC[N:N+4]}, vmagic_bias), vsign_mask);
$for N in range(0, BATCH_TILE, 4):
const v128_t vrndx${ABC[N:N+4]} = wasm_v128_bitselect(vprerndx${ABC[N:N+4]}, vx${ABC[N:N+4]}, vrndmask${ABC[N:N+4]});
@@ -66,7 +66,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vprerndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
const v128_t vrndx = wasm_v128_bitselect(vprerndx, vx, vrndmask);
const v128_t vadjmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vx, vrndx));
const v128_t vadjrndx = wasm_f32x4_add(vrndx, vone);
@@ -81,7 +81,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vprerndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
const v128_t vrndx = wasm_v128_bitselect(vprerndx, vx, vrndmask);
const v128_t vadjmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vx, vrndx));
const v128_t vadjrndx = wasm_f32x4_add(vrndx, vone);
diff --git a/src/f32-vrnd/vrndz-wasmsimd-addsub.c.in b/src/f32-vrnd/vrndz-wasmsimd-addsub.c.in
index ec4adad..3057677 100644
--- a/src/f32-vrnd/vrndz-wasmsimd-addsub.c.in
+++ b/src/f32-vrnd/vrndz-wasmsimd-addsub.c.in
@@ -24,9 +24,9 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
- const v128_t vone = wasm_f32x4_const_splat(1.0f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
+ const v128_t vone = wasm_v128_load64_splat(params->wasmsimd.one);
$if BATCH_TILE > 4:
for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
const v128_t vx${ABC[0:4]} = wasm_v128_load(x);
@@ -38,10 +38,10 @@
const v128_t vabsx${ABC[N:N+4]} = wasm_v128_andnot(vx${ABC[N:N+4]}, vsign_mask);
$for N in range(0, BATCH_TILE, 4):
- const v128_t vrndmask${ABC[N:N+4]} = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx${ABC[N:N+4]}));
+ const v128_t vrndmask${ABC[N:N+4]} = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx${ABC[N:N+4]}));
$for N in range(0, BATCH_TILE, 4):
- const v128_t vrndabsx${ABC[N:N+4]} = wasm_f32x4_sub(wasm_f32x4_add(vabsx${ABC[N:N+4]}, vmagic_number), vmagic_number);
+ const v128_t vrndabsx${ABC[N:N+4]} = wasm_f32x4_sub(wasm_f32x4_add(vabsx${ABC[N:N+4]}, vmagic_bias), vmagic_bias);
$for N in range(0, BATCH_TILE, 4):
const v128_t vadjustment${ABC[N:N+4]} = wasm_v128_and(wasm_f32x4_lt(vabsx${ABC[N:N+4]}, vrndabsx${ABC[N:N+4]}), vone);
@@ -62,8 +62,8 @@
x += 4;
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vadjustment = wasm_v128_and(wasm_f32x4_lt(vabsx, vrndabsx), vone);
const v128_t vflrabsx = wasm_f32x4_sub(vrndabsx, vadjustment);
const v128_t vy = wasm_v128_bitselect(vx, vflrabsx, vrndmask);
@@ -75,8 +75,8 @@
const v128_t vx = wasm_v128_load(x);
const v128_t vabsx = wasm_v128_andnot(vx, vsign_mask);
- const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_number, vabsx));
- const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_number), vmagic_number);
+ const v128_t vrndmask = wasm_v128_or(vsign_mask, wasm_f32x4_le(vmagic_bias, vabsx));
+ const v128_t vrndabsx = wasm_f32x4_sub(wasm_f32x4_add(vabsx, vmagic_bias), vmagic_bias);
const v128_t vadjustment = wasm_v128_and(wasm_f32x4_lt(vabsx, vrndabsx), vone);
const v128_t vflrabsx = wasm_f32x4_sub(vrndabsx, vadjustment);
v128_t vy = wasm_v128_bitselect(vx, vflrabsx, vrndmask);
diff --git a/src/f32-vrnd/vrndz-wasmsimd-cvt.c.in b/src/f32-vrnd/vrndz-wasmsimd-cvt.c.in
index 72fb72e..4a39568 100644
--- a/src/f32-vrnd/vrndz-wasmsimd-cvt.c.in
+++ b/src/f32-vrnd/vrndz-wasmsimd-cvt.c.in
@@ -24,8 +24,8 @@
assert(n != 0);
assert(n % sizeof(float) == 0);
- const v128_t vsign_mask = wasm_f32x4_const_splat(-0.0f);
- const v128_t vmagic_number = wasm_f32x4_const_splat(0x1.000000p+23f);
+ const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd.sign_mask);
+ const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd.magic_bias);
$if BATCH_TILE > 4:
for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
const v128_t vx${ABC[0:4]} = wasm_v128_load(x);
@@ -39,7 +39,7 @@
$for N in range(0, BATCH_TILE, 4):
const v128_t vrndx${ABC[N:N+4]} = wasm_f32x4_convert_i32x4(vintx${ABC[N:N+4]});
- const v128_t vrndmask${ABC[N:N+4]} = wasm_v128_andnot(wasm_f32x4_lt(vabsx${ABC[N:N+4]}, vmagic_number), vsign_mask);
+ const v128_t vrndmask${ABC[N:N+4]} = wasm_v128_andnot(wasm_f32x4_lt(vabsx${ABC[N:N+4]}, vmagic_bias), vsign_mask);
$for N in range(0, BATCH_TILE, 4):
const v128_t vy${ABC[N:N+4]} = wasm_v128_bitselect(vrndx${ABC[N:N+4]}, vx${ABC[N:N+4]}, vrndmask${ABC[N:N+4]});
@@ -56,7 +56,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vrndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
const v128_t vy = wasm_v128_bitselect(vrndx, vx, vrndmask);
wasm_v128_store(y, vy);
@@ -68,7 +68,7 @@
const v128_t vintx = wasm_i32x4_trunc_sat_f32x4(vx);
const v128_t vabsx = wasm_f32x4_abs(vx);
const v128_t vrndx = wasm_f32x4_convert_i32x4(vintx);
- const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_number), vsign_mask);
+ const v128_t vrndmask = wasm_v128_andnot(wasm_f32x4_lt(vabsx, vmagic_bias), vsign_mask);
v128_t vy = wasm_v128_bitselect(vrndx, vx, vrndmask);
if (n & (2 * sizeof(float))) {
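Note: the wasmsimd template hunks above replace compile-time wasm_f32x4_const_splat constants with loads from the microkernel parameters, and vmagic_number is renamed vmagic_bias to match the new field name. A minimal standalone sketch of the loading pattern (not part of the diff; assumes a wasm SIMD toolchain):

    #include <wasm_simd128.h>

    /* wasm_v128_load64_splat reads 8 bytes (two floats) and repeats them in
       both 64-bit halves of the vector, producing {c, c, c, c} when the two
       stored floats are equal, i.e. the same value the old
       wasm_f32x4_const_splat(c) produced at compile time. */
    static v128_t load_rnd_constant(const float constant_pair[2]) {
      return wasm_v128_load64_splat(constant_pair);
    }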
diff --git a/src/init.c b/src/init.c
index 6ede570..e2daf71 100644
--- a/src/init.c
+++ b/src/init.c
@@ -611,15 +611,39 @@
.element_tile = 8,
};
if (cpuinfo_has_arm_neon_v8()) {
- xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8;
- xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8;
- xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8;
- xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8;
+ xnn_params.f32.rndne = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndz = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndu = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndd = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8,
+ .element_tile = 8,
+ };
} else {
- xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neon_x8;
- xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neon_x8;
- xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neon_x8;
- xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neon_x8;
+ xnn_params.f32.rndne = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neon_x8,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndz = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neon_x8,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndu = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neon_x8,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndd = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neon_x8,
+ .element_tile = 8,
+ };
}
xnn_params.f32.sigmoid = (struct vunary_parameters) {
.ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8,
@@ -1107,10 +1131,22 @@
.ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
.element_tile = 4,
};
- xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1;
- xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1;
- xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1;
- xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1;
+ xnn_params.f32.rndne = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1,
+ .element_tile = 1,
+ };
+ xnn_params.f32.rndz = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1,
+ .element_tile = 1,
+ };
+ xnn_params.f32.rndu = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1,
+ .element_tile = 1,
+ };
+ xnn_params.f32.rndd = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1,
+ .element_tile = 1,
+ };
xnn_params.f32.sigmoid = (struct vunary_parameters) {
.ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
.init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
@@ -2360,10 +2396,22 @@
.ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8,
.element_tile = 8,
};
- xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8;
- xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8;
- xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8;
- xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8;
+ xnn_params.f32.rndne = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndz = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndu = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndd = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8,
+ .element_tile = 8,
+ };
xnn_params.f32.sigmoid = (struct vunary_parameters) {
.ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16,
.init.f32_sigmoid = xnn_init_f32_sigmoid_neonfma_rr1_lut64_p2_params,
@@ -3486,25 +3534,81 @@
};
}
if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
- xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx512f_x16;
- xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx512f_x16;
- xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx512f_x16;
- xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx512f_x16;
+ xnn_params.f32.rndne = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx512f_x16,
+ .element_tile = 16,
+ };
+ xnn_params.f32.rndz = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx512f_x16,
+ .element_tile = 16,
+ };
+ xnn_params.f32.rndu = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx512f_x16,
+ .element_tile = 16,
+ };
+ xnn_params.f32.rndd = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx512f_x16,
+ .element_tile = 16,
+ };
} else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
- xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx_x16;
- xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx_x16;
- xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx_x16;
- xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx_x16;
+ xnn_params.f32.rndne = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx_x16,
+ .init.f32_rnd = xnn_init_f32_rnd_avx_params,
+ .element_tile = 16,
+ };
+ xnn_params.f32.rndz = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx_x16,
+ .init.f32_rnd = xnn_init_f32_rnd_avx_params,
+ .element_tile = 16,
+ };
+ xnn_params.f32.rndu = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx_x16,
+ .init.f32_rnd = xnn_init_f32_rnd_avx_params,
+ .element_tile = 16,
+ };
+ xnn_params.f32.rndd = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx_x16,
+ .init.f32_rnd = xnn_init_f32_rnd_avx_params,
+ .element_tile = 16,
+ };
} else if (cpuinfo_has_x86_sse4_1()) {
- xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse41_x8;
- xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse41_x8;
- xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse41_x8;
- xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse41_x8;
+ xnn_params.f32.rndne = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse41_x8,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndz = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse41_x8,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndu = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse41_x8,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndd = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse41_x8,
+ .element_tile = 8,
+ };
} else {
- xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse2_x8;
- xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse2_x8;
- xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse2_x8;
- xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse2_x8;
+ xnn_params.f32.rndne = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse2_x8,
+ .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndz = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse2_x8,
+ .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndu = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse2_x8,
+ .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndd = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse2_x8,
+ .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
+ .element_tile = 8,
+ };
}
if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
xnn_params.f32.sigmoid = (struct vunary_parameters) {
@@ -4464,19 +4568,51 @@
};
xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__wasmsimd_x16;
#if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 91)
- xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_native_x8;
- xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_native_x8;
- xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_native_x8;
- xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_native_x8;
+ xnn_params.f32.rndne = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_native_x8,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndz = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_native_x8,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndu = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_native_x8,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndd = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_native_x8,
+ .element_tile = 8,
+ };
#else // XNN_WASMSIMD_VERSION >= 91
- xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8;
+ xnn_params.f32.rndne = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8,
+ .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
+ .element_tile = 8,
+ };
if (is_wasm_x86) {
- xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8;
+ xnn_params.f32.rndz = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8,
+ .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
+ .element_tile = 8,
+ };
} else {
- xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8;
+ xnn_params.f32.rndz = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8,
+ .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
+ .element_tile = 8,
+ };
}
- xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8;
- xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8;
+ xnn_params.f32.rndu = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8,
+ .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
+ .element_tile = 8,
+ };
+ xnn_params.f32.rndd = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8,
+ .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
+ .element_tile = 8,
+ };
#endif // XNN_WASMSIMD_VERSION >= 91
xnn_params.f32.sigmoid = (struct vunary_parameters) {
.ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_x16,
@@ -5155,10 +5291,22 @@
} else {
xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__wasm_x8;
}
- xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x4;
- xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x4;
- xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x4;
- xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x4;
+ xnn_params.f32.rndne = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x4,
+ .element_tile = 4,
+ };
+ xnn_params.f32.rndz = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x4,
+ .element_tile = 4,
+ };
+ xnn_params.f32.rndu = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x4,
+ .element_tile = 4,
+ };
+ xnn_params.f32.rndd = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x4,
+ .element_tile = 4,
+ };
xnn_params.f32.sigmoid = (struct vunary_parameters) {
.ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
.init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
@@ -5600,10 +5748,22 @@
.element_tile = 4,
};
xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__scalar_x8;
- xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x4;
- xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x4;
- xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x4;
- xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x4;
+ xnn_params.f32.rndne = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x4,
+ .element_tile = 4,
+ };
+ xnn_params.f32.rndz = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x4,
+ .element_tile = 4,
+ };
+ xnn_params.f32.rndu = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x4,
+ .element_tile = 4,
+ };
+ xnn_params.f32.rndd = (struct vunary_parameters) {
+ .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x4,
+ .element_tile = 4,
+ };
xnn_params.f32.sigmoid = (struct vunary_parameters) {
.ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
.init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
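Note: in init.c the rndne/rndz/rndu/rndd entries now use struct vunary_parameters (its definition is not shown in this diff, but the fields .ukernel, .init.f32_rnd and .element_tile are used throughout). A minimal sketch of one table entry (illustration only, kernel choice arbitrary): designated initializers zero-initialize omitted members, so kernels that take no parameters (NEON, NEONv8, SSE4.1, AVX512F, wasmsimd_native, scalar) end up with .init.f32_rnd == NULL.

    /* Sketch, not part of the patch: entry for a parameterless round kernel. */
    xnn_params.f32.rndne = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse41_x8,
      /* .init.f32_rnd intentionally omitted -> zero-initialized (NULL) */
      .element_tile = 8,
    };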
diff --git a/src/operators/unary-elementwise-nc.c b/src/operators/unary-elementwise-nc.c
index ac51572..0017d32 100644
--- a/src/operators/unary-elementwise-nc.c
+++ b/src/operators/unary-elementwise-nc.c
@@ -281,12 +281,14 @@
xnn_operator_t* rounding_op_out)
{
union xnn_f32_rnd_params params;
- xnn_init_f32_rnd_params(&params);
+ if (xnn_params.f32.rndne.init.f32_rnd != NULL) {
+ xnn_params.f32.rndne.init.f32_rnd(&params);
+ }
return create_unary_elementwise_nc(
channels, input_stride, output_stride, flags,
&params, sizeof(params),
xnn_operator_type_bankers_rounding_nc_f32,
- xnn_params.f32.rndne,
+ xnn_params.f32.rndne.ukernel,
rounding_op_out);
}
@@ -298,12 +300,14 @@
xnn_operator_t* ceiling_op_out)
{
union xnn_f32_rnd_params params;
- xnn_init_f32_rnd_params(&params);
+ if (xnn_params.f32.rndu.init.f32_rnd != NULL) {
+ xnn_params.f32.rndu.init.f32_rnd(&params);
+ }
return create_unary_elementwise_nc(
channels, input_stride, output_stride, flags,
&params, sizeof(params),
xnn_operator_type_ceiling_nc_f32,
- xnn_params.f32.rndu,
+ xnn_params.f32.rndu.ukernel,
ceiling_op_out);
}
@@ -525,12 +529,14 @@
xnn_operator_t* floor_op_out)
{
union xnn_f32_rnd_params params;
- xnn_init_f32_rnd_params(&params);
+ if (xnn_params.f32.rndd.init.f32_rnd != NULL) {
+ xnn_params.f32.rndd.init.f32_rnd(&params);
+ }
return create_unary_elementwise_nc(
channels, input_stride, output_stride, flags,
&params, sizeof(params),
xnn_operator_type_floor_nc_f32,
- xnn_params.f32.rndd,
+ xnn_params.f32.rndd.ukernel,
floor_op_out);
}
@@ -696,12 +702,14 @@
xnn_operator_t* truncation_op_out)
{
union xnn_f32_rnd_params params;
- xnn_init_f32_rnd_params(&params);
+ if (xnn_params.f32.rndz.init.f32_rnd != NULL) {
+ xnn_params.f32.rndz.init.f32_rnd(&params);
+ }
return create_unary_elementwise_nc(
channels, input_stride, output_stride, flags,
&params, sizeof(params),
xnn_operator_type_truncation_nc_f32,
- xnn_params.f32.rndz,
+ xnn_params.f32.rndz.ukernel,
truncation_op_out);
}
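Note: the operator-creation changes above all follow the same pattern; the init call is guarded because the selected round microkernel may not register an init function. A sketch of that guard as a helper (hypothetical helper name, not in the patch):

    /* Params are filled in only when the dispatched round microkernel
       registered an init function; otherwise the params buffer is passed
       through uninitialized and the kernel ignores it. */
    static void maybe_init_f32_rnd_params(
        const struct vunary_parameters* config,
        union xnn_f32_rnd_params* params)
    {
      if (config->init.f32_rnd != NULL) {
        config->init.f32_rnd(params);
      }
    }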
diff --git a/src/params-init.c b/src/params-init.c
index e9a5805..9bee350 100644
--- a/src/params-init.c
+++ b/src/params-init.c
@@ -1691,23 +1691,40 @@
}
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-void xnn_init_f32_rnd_params(
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+void xnn_init_f32_rnd_sse2_params(
union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)])
{
- #if XNN_ARCH_X86 || XNN_ARCH_X86_64
- for (uint32_t i = 0; i < 4; i++) {
- params->sse2.sign_mask[i] = -0.0f;
- }
- for (uint32_t i = 0; i < 4; i++) {
- params->sse2.one[i] = 1.0f;
- }
- #endif
+ for (uint32_t i = 0; i < 4; i++) {
+ params->sse2.sign_mask[i] = -0.0f;
+ params->sse2.one[i] = 1.0f;
+ }
}
-void xnn_init_scalar_f32_rnd_params(
+void xnn_init_f32_rnd_avx_params(
union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)])
{
+ for (uint32_t i = 0; i < 7; i++) {
+ params->avx.mask_table[i] = -1;
+ }
+ for (uint32_t i = 7; i < 14; i++) {
+ params->avx.mask_table[i] = 0;
+ }
}
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+void xnn_init_f32_rnd_wasmsimd_params(
+ union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)])
+{
+ params->wasmsimd.sign_mask[0] = -0.0f;
+ params->wasmsimd.sign_mask[1] = -0.0f;
+ params->wasmsimd.magic_bias[0] = 0x1.000000p+23f;
+ params->wasmsimd.magic_bias[1] = 0x1.000000p+23f;
+ params->wasmsimd.one[0] = 1.0f;
+ params->wasmsimd.one[1] = 1.0f;
+}
+#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
void xnn_init_f32_elu_scalar_rr2_lut16_p3_params(
union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
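Note: xnn_init_f32_rnd_avx_params fills mask_table with seven -1 entries followed by seven 0 entries. The AVX remainder path (see the avx.c.in hunk at the top of this change) loads eight lanes starting at &mask_table[7] minus the leftover byte count, so for k leftover floats the first k mask lanes are -1 (load enabled) and the rest are 0. A standalone C sketch of that pattern (not part of the diff):

    #include <assert.h>
    #include <immintrin.h>
    #include <stdint.h>

    /* n is the remaining byte count, a multiple of sizeof(float) in 1..7
       elements; the mask load starts at mask_table[7 - n/4]. */
    static __m256 load_remainder(const float* x, size_t n, const int32_t mask_table[14]) {
      assert(n >= 1 * sizeof(float));
      assert(n <= 7 * sizeof(float));
      const __m256i vmask =
          _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - n));
      return _mm256_maskload_ps(x, vmask);
    }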
diff --git a/src/xnnpack/params-init.h b/src/xnnpack/params-init.h
index 5a54ed8..54bb35f 100644
--- a/src/xnnpack/params-init.h
+++ b/src/xnnpack/params-init.h
@@ -317,11 +317,18 @@
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_INTERNAL void xnn_init_f32_rnd_params(
- union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)]);
+#define DECLARE_INIT_F32_RND_PARAMS_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)]);
-XNN_INTERNAL void xnn_init_scalar_f32_rnd_params(
- union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)]);
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ DECLARE_INIT_F32_RND_PARAMS_FUNCTION(xnn_init_f32_rnd_sse2_params)
+ DECLARE_INIT_F32_RND_PARAMS_FUNCTION(xnn_init_f32_rnd_avx_params)
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+ DECLARE_INIT_F32_RND_PARAMS_FUNCTION(xnn_init_f32_rnd_wasmsimd_params)
+#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
#define DECLARE_INIT_F32_ELU_PARAMS_FUNCTION(fn_name) \
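Note: DECLARE_INIT_F32_RND_PARAMS_FUNCTION follows the existing DECLARE_* convention in this header; each use expands to a plain prototype, e.g. for the SSE2 variant (expansion shown for illustration only):

    XNN_INTERNAL void xnn_init_f32_rnd_sse2_params(
        union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)]);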
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 998f46e..2e5abc9 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -127,7 +127,17 @@
XNN_ALIGN(16) float sign_mask[4];
XNN_ALIGN(16) float one[4];
} sse2;
+ struct {
+ int32_t mask_table[14];
+ } avx;
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+ struct {
+ XNN_ALIGN(8) float sign_mask[2];
+ XNN_ALIGN(8) float magic_bias[2];
+ XNN_ALIGN(8) float one[2];
+ } wasmsimd;
+#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};
union xnn_f32_elu_params {
@@ -3195,6 +3205,9 @@
typedef void (*xnn_init_f32_neg_params_fn)(
union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)]);
+typedef void (*xnn_init_f32_rnd_params_fn)(
+ union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)]);
+
typedef void (*xnn_init_f32_sigmoid_params_fn)(
union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)]);
@@ -3308,6 +3321,7 @@
xnn_init_f32_neg_params_fn f32_neg;
xnn_init_f32_qs8_cvt_params_fn f32_qs8_cvt;
xnn_init_f32_qu8_cvt_params_fn f32_qu8_cvt;
+ xnn_init_f32_rnd_params_fn f32_rnd;
xnn_init_f32_sigmoid_params_fn f32_sigmoid;
xnn_init_f32_sqrt_params_fn f32_sqrt;
xnn_init_qs8_f32_cvt_params_fn qs8_f32_cvt;
@@ -3613,10 +3627,10 @@
struct vunary_parameters lrelu;
struct vunary_parameters neg;
xnn_univector_ukernel_function relu;
- xnn_univector_ukernel_function rndne;
- xnn_univector_ukernel_function rndz;
- xnn_univector_ukernel_function rndu;
- xnn_univector_ukernel_function rndd;
+ struct vunary_parameters rndne;
+ struct vunary_parameters rndz;
+ struct vunary_parameters rndu;
+ struct vunary_parameters rndd;
struct vunary_parameters sigmoid;
struct vunary_parameters sqr;
struct vunary_parameters sqrt;
diff --git a/test/f32-vrndd.cc b/test/f32-vrndd.cc
index 66dcd2c..84609d4 100644
--- a/test/f32-vrndd.cc
+++ b/test/f32-vrndd.cc
@@ -210,7 +210,7 @@
TEST_REQUIRES_X86_SSE2;
VUnaryMicrokernelTester()
.batch_size(4)
- .Test(xnn_f32_vrndd_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_sse2_params);
}
TEST(F32_VRNDD__SSE2_X4, batch_div_4) {
@@ -218,7 +218,7 @@
for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_sse2_params);
}
}
@@ -227,7 +227,7 @@
for (size_t batch_size = 1; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_sse2_params);
}
}
@@ -236,7 +236,7 @@
for (size_t batch_size = 5; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_sse2_params);
}
}
@@ -246,7 +246,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndd_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_sse2_params);
}
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -257,7 +257,7 @@
TEST_REQUIRES_X86_SSE2;
VUnaryMicrokernelTester()
.batch_size(8)
- .Test(xnn_f32_vrndd_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_sse2_params);
}
TEST(F32_VRNDD__SSE2_X8, batch_div_8) {
@@ -265,7 +265,7 @@
for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_sse2_params);
}
}
@@ -274,7 +274,7 @@
for (size_t batch_size = 1; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_sse2_params);
}
}
@@ -283,7 +283,7 @@
for (size_t batch_size = 9; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_sse2_params);
}
}
@@ -293,7 +293,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndd_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_sse2_params);
}
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -398,7 +398,7 @@
TEST_REQUIRES_X86_AVX;
VUnaryMicrokernelTester()
.batch_size(8)
- .Test(xnn_f32_vrndd_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_avx_params);
}
TEST(F32_VRNDD__AVX_X8, batch_div_8) {
@@ -406,7 +406,7 @@
for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_avx_params);
}
}
@@ -415,7 +415,7 @@
for (size_t batch_size = 1; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_avx_params);
}
}
@@ -424,7 +424,7 @@
for (size_t batch_size = 9; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_avx_params);
}
}
@@ -434,7 +434,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndd_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_avx_params);
}
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -445,7 +445,7 @@
TEST_REQUIRES_X86_AVX;
VUnaryMicrokernelTester()
.batch_size(16)
- .Test(xnn_f32_vrndd_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_avx_params);
}
TEST(F32_VRNDD__AVX_X16, batch_div_16) {
@@ -453,7 +453,7 @@
for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_avx_params);
}
}
@@ -462,7 +462,7 @@
for (size_t batch_size = 1; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_avx_params);
}
}
@@ -471,7 +471,7 @@
for (size_t batch_size = 17; batch_size < 32; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_avx_params);
}
}
@@ -481,7 +481,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndd_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_avx_params);
}
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -585,14 +585,14 @@
TEST(F32_VRNDD__WASMSIMD_ADDSUB_X4, batch_eq_4) {
VUnaryMicrokernelTester()
.batch_size(4)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
TEST(F32_VRNDD__WASMSIMD_ADDSUB_X4, batch_div_4) {
for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -600,7 +600,7 @@
for (size_t batch_size = 1; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -608,7 +608,7 @@
for (size_t batch_size = 5; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -617,7 +617,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
}
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
@@ -627,14 +627,14 @@
TEST(F32_VRNDD__WASMSIMD_ADDSUB_X8, batch_eq_8) {
VUnaryMicrokernelTester()
.batch_size(8)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
TEST(F32_VRNDD__WASMSIMD_ADDSUB_X8, batch_div_8) {
for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -642,7 +642,7 @@
for (size_t batch_size = 1; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -650,7 +650,7 @@
for (size_t batch_size = 9; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -659,7 +659,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
}
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
@@ -669,14 +669,14 @@
TEST(F32_VRNDD__WASMSIMD_CVT_X4, batch_eq_4) {
VUnaryMicrokernelTester()
.batch_size(4)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
TEST(F32_VRNDD__WASMSIMD_CVT_X4, batch_div_4) {
for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -684,7 +684,7 @@
for (size_t batch_size = 1; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -692,7 +692,7 @@
for (size_t batch_size = 5; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -701,7 +701,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
}
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
@@ -711,14 +711,14 @@
TEST(F32_VRNDD__WASMSIMD_CVT_X8, batch_eq_8) {
VUnaryMicrokernelTester()
.batch_size(8)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
TEST(F32_VRNDD__WASMSIMD_CVT_X8, batch_div_8) {
for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -726,7 +726,7 @@
for (size_t batch_size = 1; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -734,7 +734,7 @@
for (size_t batch_size = 9; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -743,7 +743,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundDown);
+ .Test(xnn_f32_vrndd_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundDown, xnn_init_f32_rnd_wasmsimd_params);
}
}
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
@@ -836,14 +836,14 @@
TEST(F32_VRNDD__SCALAR_LIBM_X1, batch_eq_1) {
VUnaryMicrokernelTester()
.batch_size(1)
- .Test(xnn_f32_vrndd_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundDown, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndd_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundDown);
}
TEST(F32_VRNDD__SCALAR_LIBM_X1, batch_gt_1) {
for (size_t batch_size = 2; batch_size < 10; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundDown, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndd_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundDown);
}
}
@@ -852,7 +852,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndd_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundDown, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndd_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundDown);
}
}
@@ -860,14 +860,14 @@
TEST(F32_VRNDD__SCALAR_LIBM_X2, batch_eq_2) {
VUnaryMicrokernelTester()
.batch_size(2)
- .Test(xnn_f32_vrndd_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundDown, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndd_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundDown);
}
TEST(F32_VRNDD__SCALAR_LIBM_X2, batch_div_2) {
for (size_t batch_size = 4; batch_size < 20; batch_size += 2) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundDown, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndd_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundDown);
}
}
@@ -875,7 +875,7 @@
for (size_t batch_size = 1; batch_size < 2; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundDown, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndd_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundDown);
}
}
@@ -883,7 +883,7 @@
for (size_t batch_size = 3; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundDown, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndd_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundDown);
}
}
@@ -892,7 +892,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndd_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundDown, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndd_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundDown);
}
}
@@ -900,14 +900,14 @@
TEST(F32_VRNDD__SCALAR_LIBM_X4, batch_eq_4) {
VUnaryMicrokernelTester()
.batch_size(4)
- .Test(xnn_f32_vrndd_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundDown, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndd_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundDown);
}
TEST(F32_VRNDD__SCALAR_LIBM_X4, batch_div_4) {
for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundDown, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndd_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundDown);
}
}
@@ -915,7 +915,7 @@
for (size_t batch_size = 1; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundDown, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndd_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundDown);
}
}
@@ -923,7 +923,7 @@
for (size_t batch_size = 5; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndd_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundDown, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndd_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundDown);
}
}
@@ -932,6 +932,6 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndd_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundDown, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndd_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundDown);
}
}
diff --git a/test/f32-vrndd.yaml b/test/f32-vrndd.yaml
index 9cc3751..f2b353b 100644
--- a/test/f32-vrndd.yaml
+++ b/test/f32-vrndd.yaml
@@ -7,17 +7,25 @@
- name: xnn_f32_vrndd_ukernel__neonv8_x4
- name: xnn_f32_vrndd_ukernel__neonv8_x8
- name: xnn_f32_vrndd_ukernel__sse2_x4
+ init: xnn_init_f32_rnd_sse2_params
- name: xnn_f32_vrndd_ukernel__sse2_x8
+ init: xnn_init_f32_rnd_sse2_params
- name: xnn_f32_vrndd_ukernel__sse41_x4
- name: xnn_f32_vrndd_ukernel__sse41_x8
- name: xnn_f32_vrndd_ukernel__avx_x8
+ init: xnn_init_f32_rnd_avx_params
- name: xnn_f32_vrndd_ukernel__avx_x16
+ init: xnn_init_f32_rnd_avx_params
- name: xnn_f32_vrndd_ukernel__avx512f_x16
- name: xnn_f32_vrndd_ukernel__avx512f_x32
- name: xnn_f32_vrndd_ukernel__wasmsimd_addsub_x4
+ init: xnn_init_f32_rnd_wasmsimd_params
- name: xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8
+ init: xnn_init_f32_rnd_wasmsimd_params
- name: xnn_f32_vrndd_ukernel__wasmsimd_cvt_x4
+ init: xnn_init_f32_rnd_wasmsimd_params
- name: xnn_f32_vrndd_ukernel__wasmsimd_cvt_x8
+ init: xnn_init_f32_rnd_wasmsimd_params
- name: xnn_f32_vrndd_ukernel__wasmsimd_native_x4
- name: xnn_f32_vrndd_ukernel__wasmsimd_native_x8
- name: xnn_f32_vrndd_ukernel__scalar_libm_x1
diff --git a/test/f32-vrndne.cc b/test/f32-vrndne.cc
index 3e173b1..750ad37 100644
--- a/test/f32-vrndne.cc
+++ b/test/f32-vrndne.cc
@@ -210,7 +210,7 @@
TEST_REQUIRES_X86_SSE2;
VUnaryMicrokernelTester()
.batch_size(4)
- .Test(xnn_f32_vrndne_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_sse2_params);
}
TEST(F32_VRNDNE__SSE2_X4, batch_div_4) {
@@ -218,7 +218,7 @@
for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_sse2_params);
}
}
@@ -227,7 +227,7 @@
for (size_t batch_size = 1; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_sse2_params);
}
}
@@ -236,7 +236,7 @@
for (size_t batch_size = 5; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_sse2_params);
}
}
@@ -246,7 +246,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndne_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_sse2_params);
}
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -257,7 +257,7 @@
TEST_REQUIRES_X86_SSE2;
VUnaryMicrokernelTester()
.batch_size(8)
- .Test(xnn_f32_vrndne_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_sse2_params);
}
TEST(F32_VRNDNE__SSE2_X8, batch_div_8) {
@@ -265,7 +265,7 @@
for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_sse2_params);
}
}
@@ -274,7 +274,7 @@
for (size_t batch_size = 1; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_sse2_params);
}
}
@@ -283,7 +283,7 @@
for (size_t batch_size = 9; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_sse2_params);
}
}
@@ -293,7 +293,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndne_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_sse2_params);
}
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -398,7 +398,7 @@
TEST_REQUIRES_X86_AVX;
VUnaryMicrokernelTester()
.batch_size(8)
- .Test(xnn_f32_vrndne_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_avx_params);
}
TEST(F32_VRNDNE__AVX_X8, batch_div_8) {
@@ -406,7 +406,7 @@
for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_avx_params);
}
}
@@ -415,7 +415,7 @@
for (size_t batch_size = 1; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_avx_params);
}
}
@@ -424,7 +424,7 @@
for (size_t batch_size = 9; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_avx_params);
}
}
@@ -434,7 +434,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndne_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_avx_params);
}
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -445,7 +445,7 @@
TEST_REQUIRES_X86_AVX;
VUnaryMicrokernelTester()
.batch_size(16)
- .Test(xnn_f32_vrndne_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_avx_params);
}
TEST(F32_VRNDNE__AVX_X16, batch_div_16) {
@@ -453,7 +453,7 @@
for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_avx_params);
}
}
@@ -462,7 +462,7 @@
for (size_t batch_size = 1; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_avx_params);
}
}
@@ -471,7 +471,7 @@
for (size_t batch_size = 17; batch_size < 32; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_avx_params);
}
}
@@ -481,7 +481,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndne_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_avx_params);
}
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -585,14 +585,14 @@
TEST(F32_VRNDNE__WASMSIMD_ADDSUB_X4, batch_eq_4) {
VUnaryMicrokernelTester()
.batch_size(4)
- .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_wasmsimd_params);
}
TEST(F32_VRNDNE__WASMSIMD_ADDSUB_X4, batch_div_4) {
for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -600,7 +600,7 @@
for (size_t batch_size = 1; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -608,7 +608,7 @@
for (size_t batch_size = 5; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -617,7 +617,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_wasmsimd_params);
}
}
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
@@ -627,14 +627,14 @@
TEST(F32_VRNDNE__WASMSIMD_ADDSUB_X8, batch_eq_8) {
VUnaryMicrokernelTester()
.batch_size(8)
- .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_wasmsimd_params);
}
TEST(F32_VRNDNE__WASMSIMD_ADDSUB_X8, batch_div_8) {
for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -642,7 +642,7 @@
for (size_t batch_size = 1; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -650,7 +650,7 @@
for (size_t batch_size = 9; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -659,7 +659,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
+ .Test(xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundToNearestEven, xnn_init_f32_rnd_wasmsimd_params);
}
}
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
@@ -752,14 +752,14 @@
TEST(F32_VRNDNE__SCALAR_LIBM_X1, batch_eq_1) {
VUnaryMicrokernelTester()
.batch_size(1)
- .Test(xnn_f32_vrndne_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundToNearestEven, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndne_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
}
TEST(F32_VRNDNE__SCALAR_LIBM_X1, batch_gt_1) {
for (size_t batch_size = 2; batch_size < 10; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundToNearestEven, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndne_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
}
}
@@ -768,7 +768,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndne_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundToNearestEven, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndne_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
}
}
@@ -776,14 +776,14 @@
TEST(F32_VRNDNE__SCALAR_LIBM_X2, batch_eq_2) {
VUnaryMicrokernelTester()
.batch_size(2)
- .Test(xnn_f32_vrndne_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundToNearestEven, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndne_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
}
TEST(F32_VRNDNE__SCALAR_LIBM_X2, batch_div_2) {
for (size_t batch_size = 4; batch_size < 20; batch_size += 2) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundToNearestEven, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndne_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
}
}
@@ -791,7 +791,7 @@
for (size_t batch_size = 1; batch_size < 2; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundToNearestEven, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndne_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
}
}
@@ -799,7 +799,7 @@
for (size_t batch_size = 3; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundToNearestEven, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndne_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
}
}
@@ -808,7 +808,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndne_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundToNearestEven, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndne_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
}
}
@@ -816,14 +816,14 @@
TEST(F32_VRNDNE__SCALAR_LIBM_X4, batch_eq_4) {
VUnaryMicrokernelTester()
.batch_size(4)
- .Test(xnn_f32_vrndne_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndne_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
}
TEST(F32_VRNDNE__SCALAR_LIBM_X4, batch_div_4) {
for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndne_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
}
}
@@ -831,7 +831,7 @@
for (size_t batch_size = 1; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndne_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
}
}
@@ -839,7 +839,7 @@
for (size_t batch_size = 5; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndne_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndne_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
}
}
@@ -848,6 +848,6 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndne_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndne_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundToNearestEven);
}
}
diff --git a/test/f32-vrndne.yaml b/test/f32-vrndne.yaml
index e513e4b..bcc5076 100644
--- a/test/f32-vrndne.yaml
+++ b/test/f32-vrndne.yaml
@@ -7,15 +7,21 @@
- name: xnn_f32_vrndne_ukernel__neonv8_x4
- name: xnn_f32_vrndne_ukernel__neonv8_x8
- name: xnn_f32_vrndne_ukernel__sse2_x4
+ init: xnn_init_f32_rnd_sse2_params
- name: xnn_f32_vrndne_ukernel__sse2_x8
+ init: xnn_init_f32_rnd_sse2_params
- name: xnn_f32_vrndne_ukernel__sse41_x4
- name: xnn_f32_vrndne_ukernel__sse41_x8
- name: xnn_f32_vrndne_ukernel__avx_x8
+ init: xnn_init_f32_rnd_avx_params
- name: xnn_f32_vrndne_ukernel__avx_x16
+ init: xnn_init_f32_rnd_avx_params
- name: xnn_f32_vrndne_ukernel__avx512f_x16
- name: xnn_f32_vrndne_ukernel__avx512f_x32
- name: xnn_f32_vrndne_ukernel__wasmsimd_addsub_x4
+ init: xnn_init_f32_rnd_wasmsimd_params
- name: xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8
+ init: xnn_init_f32_rnd_wasmsimd_params
- name: xnn_f32_vrndne_ukernel__wasmsimd_native_x4
- name: xnn_f32_vrndne_ukernel__wasmsimd_native_x8
- name: xnn_f32_vrndne_ukernel__scalar_libm_x1
diff --git a/test/f32-vrndu.cc b/test/f32-vrndu.cc
index 2b9dc5b..bc8f61a 100644
--- a/test/f32-vrndu.cc
+++ b/test/f32-vrndu.cc
@@ -210,7 +210,7 @@
TEST_REQUIRES_X86_SSE2;
VUnaryMicrokernelTester()
.batch_size(4)
- .Test(xnn_f32_vrndu_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_sse2_params);
}
TEST(F32_VRNDU__SSE2_X4, batch_div_4) {
@@ -218,7 +218,7 @@
for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_sse2_params);
}
}
@@ -227,7 +227,7 @@
for (size_t batch_size = 1; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_sse2_params);
}
}
@@ -236,7 +236,7 @@
for (size_t batch_size = 5; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_sse2_params);
}
}
@@ -246,7 +246,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndu_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_sse2_params);
}
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -257,7 +257,7 @@
TEST_REQUIRES_X86_SSE2;
VUnaryMicrokernelTester()
.batch_size(8)
- .Test(xnn_f32_vrndu_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_sse2_params);
}
TEST(F32_VRNDU__SSE2_X8, batch_div_8) {
@@ -265,7 +265,7 @@
for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_sse2_params);
}
}
@@ -274,7 +274,7 @@
for (size_t batch_size = 1; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_sse2_params);
}
}
@@ -283,7 +283,7 @@
for (size_t batch_size = 9; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_sse2_params);
}
}
@@ -293,7 +293,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndu_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_sse2_params);
}
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -398,7 +398,7 @@
TEST_REQUIRES_X86_AVX;
VUnaryMicrokernelTester()
.batch_size(8)
- .Test(xnn_f32_vrndu_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_avx_params);
}
TEST(F32_VRNDU__AVX_X8, batch_div_8) {
@@ -406,7 +406,7 @@
for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_avx_params);
}
}
@@ -415,7 +415,7 @@
for (size_t batch_size = 1; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_avx_params);
}
}
@@ -424,7 +424,7 @@
for (size_t batch_size = 9; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_avx_params);
}
}
@@ -434,7 +434,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndu_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_avx_params);
}
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -445,7 +445,7 @@
TEST_REQUIRES_X86_AVX;
VUnaryMicrokernelTester()
.batch_size(16)
- .Test(xnn_f32_vrndu_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_avx_params);
}
TEST(F32_VRNDU__AVX_X16, batch_div_16) {
@@ -453,7 +453,7 @@
for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_avx_params);
}
}
@@ -462,7 +462,7 @@
for (size_t batch_size = 1; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_avx_params);
}
}
@@ -471,7 +471,7 @@
for (size_t batch_size = 17; batch_size < 32; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_avx_params);
}
}
@@ -481,7 +481,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndu_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_avx_params);
}
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -585,14 +585,14 @@
TEST(F32_VRNDU__WASMSIMD_ADDSUB_X4, batch_eq_4) {
VUnaryMicrokernelTester()
.batch_size(4)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
TEST(F32_VRNDU__WASMSIMD_ADDSUB_X4, batch_div_4) {
for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -600,7 +600,7 @@
for (size_t batch_size = 1; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -608,7 +608,7 @@
for (size_t batch_size = 5; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -617,7 +617,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
}
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
@@ -627,14 +627,14 @@
TEST(F32_VRNDU__WASMSIMD_ADDSUB_X8, batch_eq_8) {
VUnaryMicrokernelTester()
.batch_size(8)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
TEST(F32_VRNDU__WASMSIMD_ADDSUB_X8, batch_div_8) {
for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -642,7 +642,7 @@
for (size_t batch_size = 1; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -650,7 +650,7 @@
for (size_t batch_size = 9; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -659,7 +659,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
}
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
@@ -669,14 +669,14 @@
TEST(F32_VRNDU__WASMSIMD_CVT_X4, batch_eq_4) {
VUnaryMicrokernelTester()
.batch_size(4)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
TEST(F32_VRNDU__WASMSIMD_CVT_X4, batch_div_4) {
for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -684,7 +684,7 @@
for (size_t batch_size = 1; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -692,7 +692,7 @@
for (size_t batch_size = 5; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -701,7 +701,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
}
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
@@ -711,14 +711,14 @@
TEST(F32_VRNDU__WASMSIMD_CVT_X8, batch_eq_8) {
VUnaryMicrokernelTester()
.batch_size(8)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
TEST(F32_VRNDU__WASMSIMD_CVT_X8, batch_div_8) {
for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -726,7 +726,7 @@
for (size_t batch_size = 1; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -734,7 +734,7 @@
for (size_t batch_size = 9; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -743,7 +743,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundUp);
+ .Test(xnn_f32_vrndu_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundUp, xnn_init_f32_rnd_wasmsimd_params);
}
}
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
@@ -836,14 +836,14 @@
TEST(F32_VRNDU__SCALAR_LIBM_X1, batch_eq_1) {
VUnaryMicrokernelTester()
.batch_size(1)
- .Test(xnn_f32_vrndu_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundUp, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndu_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundUp);
}
TEST(F32_VRNDU__SCALAR_LIBM_X1, batch_gt_1) {
for (size_t batch_size = 2; batch_size < 10; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundUp, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndu_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundUp);
}
}
@@ -852,7 +852,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndu_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundUp, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndu_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundUp);
}
}
@@ -860,14 +860,14 @@
TEST(F32_VRNDU__SCALAR_LIBM_X2, batch_eq_2) {
VUnaryMicrokernelTester()
.batch_size(2)
- .Test(xnn_f32_vrndu_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundUp, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndu_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundUp);
}
TEST(F32_VRNDU__SCALAR_LIBM_X2, batch_div_2) {
for (size_t batch_size = 4; batch_size < 20; batch_size += 2) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundUp, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndu_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundUp);
}
}
@@ -875,7 +875,7 @@
for (size_t batch_size = 1; batch_size < 2; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundUp, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndu_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundUp);
}
}
@@ -883,7 +883,7 @@
for (size_t batch_size = 3; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundUp, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndu_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundUp);
}
}
@@ -892,7 +892,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndu_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundUp, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndu_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundUp);
}
}
@@ -900,14 +900,14 @@
TEST(F32_VRNDU__SCALAR_LIBM_X4, batch_eq_4) {
VUnaryMicrokernelTester()
.batch_size(4)
- .Test(xnn_f32_vrndu_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundUp, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndu_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundUp);
}
TEST(F32_VRNDU__SCALAR_LIBM_X4, batch_div_4) {
for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundUp, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndu_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundUp);
}
}
@@ -915,7 +915,7 @@
for (size_t batch_size = 1; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundUp, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndu_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundUp);
}
}
@@ -923,7 +923,7 @@
for (size_t batch_size = 5; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndu_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundUp, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndu_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundUp);
}
}
@@ -932,6 +932,6 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndu_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundUp, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndu_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundUp);
}
}
diff --git a/test/f32-vrndu.yaml b/test/f32-vrndu.yaml
index 72426d9..6678cda 100644
--- a/test/f32-vrndu.yaml
+++ b/test/f32-vrndu.yaml
@@ -7,17 +7,25 @@
- name: xnn_f32_vrndu_ukernel__neonv8_x4
- name: xnn_f32_vrndu_ukernel__neonv8_x8
- name: xnn_f32_vrndu_ukernel__sse2_x4
+ init: xnn_init_f32_rnd_sse2_params
- name: xnn_f32_vrndu_ukernel__sse2_x8
+ init: xnn_init_f32_rnd_sse2_params
- name: xnn_f32_vrndu_ukernel__sse41_x4
- name: xnn_f32_vrndu_ukernel__sse41_x8
- name: xnn_f32_vrndu_ukernel__avx_x8
+ init: xnn_init_f32_rnd_avx_params
- name: xnn_f32_vrndu_ukernel__avx_x16
+ init: xnn_init_f32_rnd_avx_params
- name: xnn_f32_vrndu_ukernel__avx512f_x16
- name: xnn_f32_vrndu_ukernel__avx512f_x32
- name: xnn_f32_vrndu_ukernel__wasmsimd_addsub_x4
+ init: xnn_init_f32_rnd_wasmsimd_params
- name: xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8
+ init: xnn_init_f32_rnd_wasmsimd_params
- name: xnn_f32_vrndu_ukernel__wasmsimd_cvt_x4
+ init: xnn_init_f32_rnd_wasmsimd_params
- name: xnn_f32_vrndu_ukernel__wasmsimd_cvt_x8
+ init: xnn_init_f32_rnd_wasmsimd_params
- name: xnn_f32_vrndu_ukernel__wasmsimd_native_x4
- name: xnn_f32_vrndu_ukernel__wasmsimd_native_x8
- name: xnn_f32_vrndu_ukernel__scalar_libm_x1
diff --git a/test/f32-vrndz.cc b/test/f32-vrndz.cc
index ab3b571..c023d83 100644
--- a/test/f32-vrndz.cc
+++ b/test/f32-vrndz.cc
@@ -210,7 +210,7 @@
TEST_REQUIRES_X86_SSE2;
VUnaryMicrokernelTester()
.batch_size(4)
- .Test(xnn_f32_vrndz_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_sse2_params);
}
TEST(F32_VRNDZ__SSE2_X4, batch_div_4) {
@@ -218,7 +218,7 @@
for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_sse2_params);
}
}
@@ -227,7 +227,7 @@
for (size_t batch_size = 1; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_sse2_params);
}
}
@@ -236,7 +236,7 @@
for (size_t batch_size = 5; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_sse2_params);
}
}
@@ -246,7 +246,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndz_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__sse2_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_sse2_params);
}
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -257,7 +257,7 @@
TEST_REQUIRES_X86_SSE2;
VUnaryMicrokernelTester()
.batch_size(8)
- .Test(xnn_f32_vrndz_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_sse2_params);
}
TEST(F32_VRNDZ__SSE2_X8, batch_div_8) {
@@ -265,7 +265,7 @@
for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_sse2_params);
}
}
@@ -274,7 +274,7 @@
for (size_t batch_size = 1; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_sse2_params);
}
}
@@ -283,7 +283,7 @@
for (size_t batch_size = 9; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_sse2_params);
}
}
@@ -293,7 +293,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndz_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__sse2_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_sse2_params);
}
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -398,7 +398,7 @@
TEST_REQUIRES_X86_AVX;
VUnaryMicrokernelTester()
.batch_size(8)
- .Test(xnn_f32_vrndz_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_avx_params);
}
TEST(F32_VRNDZ__AVX_X8, batch_div_8) {
@@ -406,7 +406,7 @@
for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_avx_params);
}
}
@@ -415,7 +415,7 @@
for (size_t batch_size = 1; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_avx_params);
}
}
@@ -424,7 +424,7 @@
for (size_t batch_size = 9; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_avx_params);
}
}
@@ -434,7 +434,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndz_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__avx_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_avx_params);
}
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -445,7 +445,7 @@
TEST_REQUIRES_X86_AVX;
VUnaryMicrokernelTester()
.batch_size(16)
- .Test(xnn_f32_vrndz_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_avx_params);
}
TEST(F32_VRNDZ__AVX_X16, batch_div_16) {
@@ -453,7 +453,7 @@
for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_avx_params);
}
}
@@ -462,7 +462,7 @@
for (size_t batch_size = 1; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_avx_params);
}
}
@@ -471,7 +471,7 @@
for (size_t batch_size = 17; batch_size < 32; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_avx_params);
}
}
@@ -481,7 +481,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndz_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__avx_x16, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_avx_params);
}
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -585,14 +585,14 @@
TEST(F32_VRNDZ__WASMSIMD_ADDSUB_X4, batch_eq_4) {
VUnaryMicrokernelTester()
.batch_size(4)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
TEST(F32_VRNDZ__WASMSIMD_ADDSUB_X4, batch_div_4) {
for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -600,7 +600,7 @@
for (size_t batch_size = 1; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -608,7 +608,7 @@
for (size_t batch_size = 5; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -617,7 +617,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
}
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
@@ -627,14 +627,14 @@
TEST(F32_VRNDZ__WASMSIMD_ADDSUB_X8, batch_eq_8) {
VUnaryMicrokernelTester()
.batch_size(8)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
TEST(F32_VRNDZ__WASMSIMD_ADDSUB_X8, batch_div_8) {
for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -642,7 +642,7 @@
for (size_t batch_size = 1; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -650,7 +650,7 @@
for (size_t batch_size = 9; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -659,7 +659,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
}
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
@@ -669,14 +669,14 @@
TEST(F32_VRNDZ__WASMSIMD_CVT_X4, batch_eq_4) {
VUnaryMicrokernelTester()
.batch_size(4)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
TEST(F32_VRNDZ__WASMSIMD_CVT_X4, batch_div_4) {
for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -684,7 +684,7 @@
for (size_t batch_size = 1; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -692,7 +692,7 @@
for (size_t batch_size = 5; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -701,7 +701,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
}
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
@@ -711,14 +711,14 @@
TEST(F32_VRNDZ__WASMSIMD_CVT_X8, batch_eq_8) {
VUnaryMicrokernelTester()
.batch_size(8)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
TEST(F32_VRNDZ__WASMSIMD_CVT_X8, batch_div_8) {
for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -726,7 +726,7 @@
for (size_t batch_size = 1; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -734,7 +734,7 @@
for (size_t batch_size = 9; batch_size < 16; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
}
@@ -743,7 +743,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
+ .Test(xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8, VUnaryMicrokernelTester::OpType::RoundTowardsZero, xnn_init_f32_rnd_wasmsimd_params);
}
}
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
@@ -836,14 +836,14 @@
TEST(F32_VRNDZ__SCALAR_LIBM_X1, batch_eq_1) {
VUnaryMicrokernelTester()
.batch_size(1)
- .Test(xnn_f32_vrndz_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundTowardsZero, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndz_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
}
TEST(F32_VRNDZ__SCALAR_LIBM_X1, batch_gt_1) {
for (size_t batch_size = 2; batch_size < 10; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundTowardsZero, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndz_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
}
}
@@ -852,7 +852,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndz_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundTowardsZero, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndz_ukernel__scalar_libm_x1, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
}
}
@@ -860,14 +860,14 @@
TEST(F32_VRNDZ__SCALAR_LIBM_X2, batch_eq_2) {
VUnaryMicrokernelTester()
.batch_size(2)
- .Test(xnn_f32_vrndz_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundTowardsZero, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndz_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
}
TEST(F32_VRNDZ__SCALAR_LIBM_X2, batch_div_2) {
for (size_t batch_size = 4; batch_size < 20; batch_size += 2) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundTowardsZero, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndz_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
}
}
@@ -875,7 +875,7 @@
for (size_t batch_size = 1; batch_size < 2; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundTowardsZero, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndz_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
}
}
@@ -883,7 +883,7 @@
for (size_t batch_size = 3; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundTowardsZero, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndz_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
}
}
@@ -892,7 +892,7 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndz_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundTowardsZero, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndz_ukernel__scalar_libm_x2, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
}
}
@@ -900,14 +900,14 @@
TEST(F32_VRNDZ__SCALAR_LIBM_X4, batch_eq_4) {
VUnaryMicrokernelTester()
.batch_size(4)
- .Test(xnn_f32_vrndz_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndz_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
}
TEST(F32_VRNDZ__SCALAR_LIBM_X4, batch_div_4) {
for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndz_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
}
}
@@ -915,7 +915,7 @@
for (size_t batch_size = 1; batch_size < 4; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndz_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
}
}
@@ -923,7 +923,7 @@
for (size_t batch_size = 5; batch_size < 8; batch_size++) {
VUnaryMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_vrndz_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndz_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
}
}
@@ -932,6 +932,6 @@
VUnaryMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_vrndz_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero, VUnaryMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_vrndz_ukernel__scalar_libm_x4, VUnaryMicrokernelTester::OpType::RoundTowardsZero);
}
}
diff --git a/test/f32-vrndz.yaml b/test/f32-vrndz.yaml
index 22f15b4..eaa4da2 100644
--- a/test/f32-vrndz.yaml
+++ b/test/f32-vrndz.yaml
@@ -7,17 +7,25 @@
- name: xnn_f32_vrndz_ukernel__neonv8_x4
- name: xnn_f32_vrndz_ukernel__neonv8_x8
- name: xnn_f32_vrndz_ukernel__sse2_x4
+ init: xnn_init_f32_rnd_sse2_params
- name: xnn_f32_vrndz_ukernel__sse2_x8
+ init: xnn_init_f32_rnd_sse2_params
- name: xnn_f32_vrndz_ukernel__sse41_x4
- name: xnn_f32_vrndz_ukernel__sse41_x8
- name: xnn_f32_vrndz_ukernel__avx_x8
+ init: xnn_init_f32_rnd_avx_params
- name: xnn_f32_vrndz_ukernel__avx_x16
+ init: xnn_init_f32_rnd_avx_params
- name: xnn_f32_vrndz_ukernel__avx512f_x16
- name: xnn_f32_vrndz_ukernel__avx512f_x32
- name: xnn_f32_vrndz_ukernel__wasmsimd_addsub_x4
+ init: xnn_init_f32_rnd_wasmsimd_params
- name: xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8
+ init: xnn_init_f32_rnd_wasmsimd_params
- name: xnn_f32_vrndz_ukernel__wasmsimd_cvt_x4
+ init: xnn_init_f32_rnd_wasmsimd_params
- name: xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8
+ init: xnn_init_f32_rnd_wasmsimd_params
- name: xnn_f32_vrndz_ukernel__wasmsimd_native_x4
- name: xnn_f32_vrndz_ukernel__wasmsimd_native_x8
- name: xnn_f32_vrndz_ukernel__scalar_libm_x1
diff --git a/test/vunary-microkernel-tester.h b/test/vunary-microkernel-tester.h
index 50bfab7..1fa2e43 100644
--- a/test/vunary-microkernel-tester.h
+++ b/test/vunary-microkernel-tester.h
@@ -143,46 +143,14 @@
case OpType::ReLU:
y_ref[i] = std::max(x_data[i], 0.0f);
break;
- case OpType::RoundToNearestEven:
- y_ref[i] = std::nearbyint(double(x_data[i]));
- break;
- case OpType::RoundTowardsZero:
- y_ref[i] = std::trunc(double(x_data[i]));
- break;
- case OpType::RoundUp:
- y_ref[i] = std::ceil(double(x_data[i]));
- break;
- case OpType::RoundDown:
- y_ref[i] = std::floor(double(x_data[i]));
- break;
+ default:
+ GTEST_FAIL() << "Unexpected operation type";
+ return;
}
}
- // Prepare parameters.
- union {
- union xnn_f32_relu_params relu;
- union xnn_f32_rnd_params rnd;
- } params;
- switch (op_type) {
- case OpType::RoundToNearestEven:
- case OpType::RoundTowardsZero:
- case OpType::RoundUp:
- case OpType::RoundDown:
- switch (variant) {
- case Variant::Native:
-          xnn_init_f32_rnd_params(&params.rnd);
- break;
- case Variant::Scalar:
-          xnn_init_scalar_f32_rnd_params(&params.rnd);
- break;
- }
- break;
- case OpType::ReLU:
- break;
- }
-
// Call optimized micro-kernel.
-    vunary(batch_size() * sizeof(float), x_data, y.data(), &params);
+ vunary(batch_size() * sizeof(float), x_data, y.data(), nullptr);
// Verify results.
for (size_t i = 0; i < batch_size(); i++) {
@@ -418,6 +386,62 @@
}
}
+ void Test(xnn_f32_vround_ukernel_function vrnd, OpType op_type, xnn_init_f32_rnd_params_fn init_params = nullptr) const {
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto distribution = std::uniform_real_distribution<float>(-5.0f, 5.0f);
+ auto f32rng = std::bind(distribution, std::ref(rng));
+
+ std::vector<float> x(batch_size() + XNN_EXTRA_BYTES / sizeof(float));
+ std::vector<float> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(float) : 0));
+ std::vector<float> y_ref(batch_size());
+ for (size_t iteration = 0; iteration < iterations(); iteration++) {
+ if (inplace()) {
+ std::generate(y.begin(), y.end(), std::ref(f32rng));
+ } else {
+ std::generate(x.begin(), x.end(), std::ref(f32rng));
+ std::fill(y.begin(), y.end(), nanf(""));
+ }
+ const float* x_data = inplace() ? y.data() : x.data();
+
+ // Compute reference results.
+ for (size_t i = 0; i < batch_size(); i++) {
+ switch (op_type) {
+ case OpType::RoundToNearestEven:
+ y_ref[i] = std::nearbyint(double(x_data[i]));
+ break;
+ case OpType::RoundTowardsZero:
+ y_ref[i] = std::trunc(double(x_data[i]));
+ break;
+ case OpType::RoundUp:
+ y_ref[i] = std::ceil(double(x_data[i]));
+ break;
+ case OpType::RoundDown:
+ y_ref[i] = std::floor(double(x_data[i]));
+ break;
+ default:
+ GTEST_FAIL() << "Unexpected operation type";
+ return;
+ }
+ }
+
+ // Prepare parameters.
+ xnn_f32_rnd_params params;
+ if (init_params != nullptr) {
+        init_params(&params);
+ }
+
+ // Call optimized micro-kernel.
+      vrnd(batch_size() * sizeof(float), x_data, y.data(), &params);
+
+ // Verify results.
+ for (size_t i = 0; i < batch_size(); i++) {
+ ASSERT_EQ(y[i], y_ref[i])
+ << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << x[i];
+ }
+ }
+ }
+
void Test(xnn_f32_vsigmoid_ukernel_function vsigmoid, xnn_init_f32_sigmoid_params_fn init_params) const {
std::random_device random_device;
auto rng = std::mt19937(random_device());
@@ -551,10 +575,6 @@
Test(xnn_f32_vunary_ukernel_function(vunary), op_type, variant);
}
- inline void Test(xnn_f32_vround_ukernel_function vunary, OpType op_type, Variant variant = Variant::Native) const {
- Test(xnn_f32_vunary_ukernel_function(vunary), op_type, variant);
- }
-
void Test(xnn_f16_vunary_ukernel_function vunary, OpType op_type, Variant variant = Variant::Native) const {
std::random_device random_device;
auto rng = std::mt19937(random_device());
diff --git a/tools/generate-vunary-test.py b/tools/generate-vunary-test.py
index 0027a94..92aea36 100755
--- a/tools/generate-vunary-test.py
+++ b/tools/generate-vunary-test.py
@@ -204,8 +204,11 @@
_, test_name = ukernel.split("_", 1)
_, datatype, _ = ukernel.split("_", 2)
test_args = [ukernel]
- if init_fn:
- test_args.append(init_fn)
+ if init_fn or op_type.startswith("Round"):
+ if op_type.startswith("Round"):
+ test_args.append("VUnaryMicrokernelTester::OpType::" + op_type)
+ if init_fn is not None:
+ test_args.append(init_fn)
elif op_type not in ["Abs", "Negate", "Square", "SquareRoot"]:
test_args.append("VUnaryMicrokernelTester::OpType::" + op_type)
if not isa: