blob: 9edfe15e939f740549211a81f72c6e3e73738470 [file] [log] [blame] [edit]
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include <assert.h>
#include <stddef.h>
#include <xnnpack/math.h>
#include <xnnpack/math-stubs.h>
void xnn_math_u32_sqrt__scalar_tflm(
size_t n,
const uint32_t* input,
uint32_t* output)
{
assert(n % sizeof(uint32_t) == 0);
for (; n != 0; n -= sizeof(uint32_t)) {
uint32_t vx = *input++;
// Algorithm adapted from tensorflow/lite/experimental/microfrontend/lib/filterbank.c in TFLite-Micro
uint32_t vy = 0;
if (vx != 0) {
const uint32_t vn = (math_clz_nonzero_u32(vx) | 1) ^ 31;
uint32_t vb = UINT32_C(1) << vn;
uint32_t iterations = (vn >> 1) + 1;
while (iterations--) {
const uint32_t vyb = vy + vb;
if (vx >= vyb) {
vx -= vyb;
vy = (vy >> 1) + vb;
} else {
vy >>= 1;
}
vb >>= 2;
}
// vy is sqrt(.) rounded down. Do the final rounding up if needed.
if (vx > vy) {
// This condition prevents overflowing uint16_t, but produces incorrectly
// rounded result for large inputs where square root should round to 0x10000.
if (vy != UINT32_C(0xFFFF)) {
vy += 1;
}
}
}
*output++ = vy;
}
}