QS8/QU8 Tanh operator using LUT microkernels

PiperOrigin-RevId: 395835065
diff --git a/BUILD.bazel b/BUILD.bazel
index 6a68ca6..3cfe853 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -10285,6 +10285,15 @@
 )
 
 xnnpack_unit_test(
+    name = "tanh_nc_test",
+    srcs = [
+        "test/tanh-nc.cc",
+        "test/tanh-operator-tester.h",
+    ],
+    deps = OPERATOR_TEST_DEPS,
+)
+
+xnnpack_unit_test(
     name = "truncation_nc_test",
     srcs = [
         "test/truncation-nc.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 87b0128..83a6344 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5500,6 +5500,15 @@
   TARGET_LINK_LIBRARIES(subtract-nd-test PRIVATE XNNPACK fp16 gtest gtest_main)
   ADD_TEST(subtract-nd-test subtract-nd-test)
 
+  ADD_EXECUTABLE(tanh-nc-test test/tanh-nc.cc)
+  SET_TARGET_PROPERTIES(tanh-nc-test PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS NO)
+  TARGET_INCLUDE_DIRECTORIES(tanh-nc-test PRIVATE src test)
+  TARGET_LINK_LIBRARIES(tanh-nc-test PRIVATE XNNPACK gtest gtest_main)
+  ADD_TEST(tanh-nc-test tanh-nc-test)
+
   ADD_EXECUTABLE(truncation-nc-test test/truncation-nc.cc)
   SET_TARGET_PROPERTIES(truncation-nc-test PROPERTIES
     CXX_STANDARD 11
diff --git a/include/xnnpack.h b/include/xnnpack.h
index 1e8a891..9fec285 100644
--- a/include/xnnpack.h
+++ b/include/xnnpack.h
@@ -2228,6 +2228,26 @@
   int8_t* output,
   pthreadpool_t threadpool);
 
+enum xnn_status xnn_create_tanh_nc_qs8(  // Creates a Tanh operator for signed 8-bit quantized (QS8) data.
+  size_t channels,
+  size_t input_stride,
+  size_t output_stride,
+  int8_t input_zero_point,
+  float input_scale,
+  int8_t output_zero_point,  // the implementation added by this change rejects any value other than 0
+  float output_scale,  // the implementation added by this change rejects any value other than 1/128
+  int8_t output_min,
+  int8_t output_max,
+  uint32_t flags,
+  xnn_operator_t* tanh_op_out);
+
+enum xnn_status xnn_setup_tanh_nc_qs8(  // Supplies batch size and input/output buffers to a QS8 Tanh operator.
+  xnn_operator_t tanh_op,
+  size_t batch_size,
+  const int8_t* input,
+  int8_t* output,
+  pthreadpool_t threadpool);  // not referenced by the implementation added by this change
+
 #endif  // XNN_NO_QS8_OPERATORS
 
 #ifndef XNN_NO_QU8_OPERATORS
@@ -2513,6 +2533,26 @@
   uint8_t* output,
   pthreadpool_t threadpool);
 
+enum xnn_status xnn_create_tanh_nc_qu8(  // Creates a Tanh operator for unsigned 8-bit quantized (QU8) data.
+  size_t channels,
+  size_t input_stride,
+  size_t output_stride,
+  uint8_t input_zero_point,
+  float input_scale,
+  uint8_t output_zero_point,  // the implementation added by this change rejects any value other than 128
+  float output_scale,  // the implementation added by this change rejects any value other than 1/128
+  uint8_t output_min,
+  uint8_t output_max,
+  uint32_t flags,
+  xnn_operator_t* tanh_op_out);
+
+enum xnn_status xnn_setup_tanh_nc_qu8(  // Supplies batch size and input/output buffers to a QU8 Tanh operator.
+  xnn_operator_t tanh_op,
+  size_t batch_size,
+  const uint8_t* input,
+  uint8_t* output,
+  pthreadpool_t threadpool);  // not referenced by the implementation added by this change
+
 #endif  // XNN_NO_QU8_OPERATORS
 
 #ifndef XNN_NO_S8_OPERATORS
diff --git a/src/operator-strings.c b/src/operator-strings.c
index 5c7e8c9..0fc2ae3 100644
--- a/src/operator-strings.c
+++ b/src/operator-strings.c
@@ -154,6 +154,10 @@
       return "Subtract (ND, QS8)";
     case xnn_operator_type_subtract_nd_qu8:
       return "Subtract (ND, QU8)";
+    case xnn_operator_type_tanh_nc_qs8:
+      return "Tanh (NC, QS8)";
+    case xnn_operator_type_tanh_nc_qu8:
+      return "Tanh (NC, QU8)";
     case xnn_operator_type_truncation_nc_f32:
       return "Truncation (NC, F32)";
     case xnn_operator_type_unpooling_nhwc_x32:
diff --git a/src/operators/lut-elementwise-nc.c b/src/operators/lut-elementwise-nc.c
index b49daa7..eb085cd 100644
--- a/src/operators/lut-elementwise-nc.c
+++ b/src/operators/lut-elementwise-nc.c
@@ -265,6 +265,84 @@
     xnn_operator_type_sigmoid_nc_qu8, sigmoid_op_out);
 }
 
+static float calculate_tanh(float x, const void* params) {  // `params` is unused; both callers in this change pass NULL
+  return tanhf(x);  // single-precision hyperbolic tangent of x
+}
+
+enum xnn_status xnn_create_tanh_nc_qs8(  // QS8 Tanh: validates the output quantization, then delegates to create_lut_elementwise_nc
+    size_t channels,
+    size_t input_stride,
+    size_t output_stride,
+    int8_t input_zero_point,
+    float input_scale,
+    int8_t output_zero_point,
+    float output_scale,
+    int8_t output_min,
+    int8_t output_max,
+    uint32_t flags,
+    xnn_operator_t* tanh_op_out)
+{
+  if (output_scale != 0x1.0p-7f) {  // exact comparison: 2^-7 (1/128) is the only accepted output scale
+    xnn_log_error(
+      "failed to create %s operator with %.7g output scale: only output scale of 1/128 is supported",
+      xnn_operator_type_to_string(xnn_operator_type_tanh_nc_qs8), output_scale);
+    return xnn_status_unsupported_parameter;
+  }
+
+  if (output_zero_point != 0) {  // 0 is the only accepted signed output zero point
+    xnn_log_error(
+      "failed to create %s operator with %" PRId8 " output zero point: only output zero point of 0 is supported",  // PRId8: the value is a signed int8_t
+      xnn_operator_type_to_string(xnn_operator_type_tanh_nc_qs8), output_zero_point);
+    return xnn_status_unsupported_parameter;
+  }
+
+  return create_lut_elementwise_nc(
+    channels, input_stride, output_stride,
+    (int32_t) input_zero_point, input_scale, INT8_MIN /* input min */,
+    (long) output_zero_point, output_scale,
+    (long) output_min, (long) output_max,
+    flags,
+    (xnn_lut_init_fn) &calculate_tanh, NULL,  // NULL: calculate_tanh ignores its params argument
+    xnn_operator_type_tanh_nc_qs8, tanh_op_out);
+}
+
+enum xnn_status xnn_create_tanh_nc_qu8(  // QU8 Tanh: validates the output quantization, then delegates to create_lut_elementwise_nc
+    size_t channels,
+    size_t input_stride,
+    size_t output_stride,
+    uint8_t input_zero_point,
+    float input_scale,
+    uint8_t output_zero_point,
+    float output_scale,
+    uint8_t output_min,
+    uint8_t output_max,
+    uint32_t flags,
+    xnn_operator_t* tanh_op_out)
+{
+  if (output_scale != 0x1.0p-7f) {  // exact comparison: 2^-7 (1/128) is the only accepted output scale
+    xnn_log_error(
+      "failed to create %s operator with %.7g output scale: only output scale of 1/128 is supported",
+      xnn_operator_type_to_string(xnn_operator_type_tanh_nc_qu8), output_scale);
+    return xnn_status_unsupported_parameter;
+  }
+
+  if (output_zero_point != 128) {  // 128 is the only accepted unsigned output zero point
+    xnn_log_error(
+      "failed to create %s operator with %" PRIu8 " output zero point: only output zero point of 128 is supported",
+      xnn_operator_type_to_string(xnn_operator_type_tanh_nc_qu8), output_zero_point);
+    return xnn_status_unsupported_parameter;
+  }
+
+  return create_lut_elementwise_nc(
+    channels, input_stride, output_stride,
+    (int32_t) (uint32_t) input_zero_point, input_scale, 0 /* input min */,
+    (long) (unsigned long) output_zero_point, output_scale,
+    (long) (unsigned long) output_min, (long) (unsigned long) output_max,
+    flags,
+    (xnn_lut_init_fn) &calculate_tanh, NULL,  // NULL: calculate_tanh ignores its params argument
+    xnn_operator_type_tanh_nc_qu8, tanh_op_out);
+}
+
 static enum xnn_status setup_lut_elementwise_nc(
     xnn_operator_t lut_elementwise_op,
     enum xnn_operator_type expected_operator_type,
@@ -364,3 +442,27 @@
     sigmoid_op, xnn_operator_type_sigmoid_nc_qu8,
     batch_size, input, output);
 }
+
+enum xnn_status xnn_setup_tanh_nc_qs8(  // Thin wrapper: forwards to setup_lut_elementwise_nc with the QS8 Tanh operator type
+    xnn_operator_t tanh_op,
+    size_t batch_size,
+    const int8_t* input,
+    int8_t* output,
+    pthreadpool_t threadpool)  // not referenced in this function body
+{
+  return setup_lut_elementwise_nc(
+    tanh_op, xnn_operator_type_tanh_nc_qs8,  // second argument is the operator type the shared setup expects
+    batch_size, input, output);
+}
+
+enum xnn_status xnn_setup_tanh_nc_qu8(  // Thin wrapper: forwards to setup_lut_elementwise_nc with the QU8 Tanh operator type
+    xnn_operator_t tanh_op,
+    size_t batch_size,
+    const uint8_t* input,
+    uint8_t* output,
+    pthreadpool_t threadpool)  // not referenced in this function body
+{
+  return setup_lut_elementwise_nc(
+    tanh_op, xnn_operator_type_tanh_nc_qu8,  // second argument is the operator type the shared setup expects
+    batch_size, input, output);
+}
diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h
index d73b6fb..6a8a554 100644
--- a/src/xnnpack/operator.h
+++ b/src/xnnpack/operator.h
@@ -100,6 +100,8 @@
   xnn_operator_type_subtract_nd_f32,
   xnn_operator_type_subtract_nd_qs8,
   xnn_operator_type_subtract_nd_qu8,
+  xnn_operator_type_tanh_nc_qs8,
+  xnn_operator_type_tanh_nc_qu8,
   xnn_operator_type_truncation_nc_f32,
   xnn_operator_type_unpooling_nhwc_x32,
 };
diff --git a/test/tanh-nc.cc b/test/tanh-nc.cc
new file mode 100644
index 0000000..8bca334
--- /dev/null
+++ b/test/tanh-nc.cc
@@ -0,0 +1,421 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <gtest/gtest.h>
+
+#include "tanh-operator-tester.h"
+
+
+TEST(TANH_NC_QS8, unit_batch) {  // QS8, batch of 1, otherwise default tester settings
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(1)
+      .channels(channels)
+      .iterations(3)
+      .TestQS8();
+  }
+}
+
+TEST(TANH_NC_QS8, unit_batch_with_qmin) {  // QS8, batch of 1, lower output clamp set via qmin(128)
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(1)
+      .channels(channels)
+      .qmin(128)
+      .iterations(3)
+      .TestQS8();
+  }
+}
+
+TEST(TANH_NC_QS8, unit_batch_with_qmax) {  // QS8, batch of 1, upper output clamp set via qmax(128)
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(1)
+      .channels(channels)
+      .qmax(128)
+      .iterations(3)
+      .TestQS8();
+  }
+}
+
+TEST(TANH_NC_QS8, unit_batch_with_input_scale) {  // QS8, batch of 1, sweeping the input scale
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    for (float input_scale = 1.0e-2f; input_scale < 1.0e+2f; input_scale *= 10.0f) {  // starts at 1e-2, multiplied by 10 while below 1e+2
+      TanhOperatorTester()
+        .batch_size(1)
+        .channels(channels)
+        .input_scale(input_scale)
+        .iterations(1)
+        .TestQS8();
+    }
+  }
+}
+
+TEST(TANH_NC_QS8, unit_batch_with_input_zero_point) {  // QS8, batch of 1, sweeping the input zero point
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    for (int32_t input_zero_point = 0; input_zero_point <= 255; input_zero_point += 51) {  // 0, 51, 102, 153, 204, 255
+      TanhOperatorTester()
+        .batch_size(1)
+        .channels(channels)
+        .input_zero_point(uint8_t(input_zero_point))
+        .iterations(1)
+        .TestQS8();
+    }
+  }
+}
+
+TEST(TANH_NC_QS8, small_batch) {  // QS8, batch of 3, default (dense) strides
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .iterations(3)
+      .TestQS8();
+  }
+}
+
+TEST(TANH_NC_QS8, small_batch_with_input_stride) {  // QS8, batch of 3, input rows 129 elements apart
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .input_stride(129)
+      .iterations(3)
+      .TestQS8();
+  }
+}
+
+TEST(TANH_NC_QS8, small_batch_with_output_stride) {  // QS8, batch of 3, output rows 117 elements apart
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .output_stride(117)
+      .iterations(3)
+      .TestQS8();
+  }
+}
+
+TEST(TANH_NC_QS8, small_batch_with_qmin) {  // QS8, batch of 3, lower output clamp set via qmin(128)
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .qmin(128)
+      .iterations(3)
+      .TestQS8();
+  }
+}
+
+TEST(TANH_NC_QS8, small_batch_with_qmax) {  // QS8, batch of 3, upper output clamp set via qmax(128)
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .qmax(128)
+      .iterations(3)
+      .TestQS8();
+  }
+}
+
+TEST(TANH_NC_QS8, small_batch_with_input_scale) {  // QS8, batch of 3, sweeping the input scale
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    for (float input_scale = 1.0e-2f; input_scale < 1.0e+2f; input_scale *= 10.0f) {  // starts at 1e-2, multiplied by 10 while below 1e+2
+      TanhOperatorTester()
+        .batch_size(3)
+        .channels(channels)
+        .input_scale(input_scale)
+        .iterations(1)
+        .TestQS8();
+    }
+  }
+}
+
+TEST(TANH_NC_QS8, small_batch_with_input_zero_point) {  // QS8, batch of 3, sweeping the input zero point
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    for (int32_t input_zero_point = 0; input_zero_point <= 255; input_zero_point += 51) {  // 0, 51, 102, 153, 204, 255
+      TanhOperatorTester()
+        .batch_size(3)
+        .channels(channels)
+        .input_zero_point(uint8_t(input_zero_point))
+        .iterations(1)
+        .TestQS8();
+    }
+  }
+}
+
+TEST(TANH_NC_QS8, strided_batch) {  // QS8, batch of 3, input stride 129 and output stride 117
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .input_stride(129)
+      .output_stride(117)
+      .iterations(3)
+      .TestQS8();
+  }
+}
+
+TEST(TANH_NC_QS8, strided_batch_with_qmin) {  // QS8, batch of 3, strided input/output, lower clamp via qmin(128)
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .input_stride(129)
+      .output_stride(117)
+      .qmin(128)
+      .iterations(3)
+      .TestQS8();
+  }
+}
+
+TEST(TANH_NC_QS8, strided_batch_with_qmax) {  // QS8, batch of 3, strided input/output, upper clamp via qmax(128)
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .input_stride(129)
+      .output_stride(117)
+      .qmax(128)
+      .iterations(3)
+      .TestQS8();
+  }
+}
+
+TEST(TANH_NC_QS8, strided_batch_with_input_scale) {  // QS8, batch of 3, strided input/output, sweeping the input scale
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    for (float input_scale = 1.0e-2f; input_scale < 1.0e+2f; input_scale *= 10.0f) {  // starts at 1e-2, multiplied by 10 while below 1e+2
+      TanhOperatorTester()
+        .batch_size(3)
+        .channels(channels)
+        .input_stride(129)
+        .output_stride(117)
+        .input_scale(input_scale)
+        .iterations(1)
+        .TestQS8();
+    }
+  }
+}
+
+TEST(TANH_NC_QS8, strided_batch_with_input_zero_point) {  // QS8, batch of 3, strided input/output, sweeping the input zero point
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    for (int32_t input_zero_point = 0; input_zero_point <= 255; input_zero_point += 51) {  // 0, 51, 102, 153, 204, 255
+      TanhOperatorTester()
+        .batch_size(3)
+        .channels(channels)
+        .input_stride(129)
+        .output_stride(117)
+        .input_zero_point(uint8_t(input_zero_point))
+        .iterations(1)
+        .TestQS8();
+    }
+  }
+}
+
+TEST(TANH_NC_QU8, unit_batch) {  // QU8, batch of 1, otherwise default tester settings
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(1)
+      .channels(channels)
+      .iterations(3)
+      .TestQU8();
+  }
+}
+
+TEST(TANH_NC_QU8, unit_batch_with_qmin) {  // QU8, batch of 1, lower output clamp qmin = 128
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(1)
+      .channels(channels)
+      .qmin(128)
+      .iterations(3)
+      .TestQU8();
+  }
+}
+
+TEST(TANH_NC_QU8, unit_batch_with_qmax) {  // QU8, batch of 1, upper output clamp qmax = 128
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(1)
+      .channels(channels)
+      .qmax(128)
+      .iterations(3)
+      .TestQU8();
+  }
+}
+
+TEST(TANH_NC_QU8, unit_batch_with_input_scale) {  // QU8, batch of 1, sweeping the input scale
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    for (float input_scale = 1.0e-2f; input_scale < 1.0e+2f; input_scale *= 10.0f) {  // starts at 1e-2, multiplied by 10 while below 1e+2
+      TanhOperatorTester()
+        .batch_size(1)
+        .channels(channels)
+        .input_scale(input_scale)
+        .iterations(1)
+        .TestQU8();
+    }
+  }
+}
+
+TEST(TANH_NC_QU8, unit_batch_with_input_zero_point) {  // QU8, batch of 1, sweeping the input zero point
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    for (int32_t input_zero_point = 0; input_zero_point <= 255; input_zero_point += 51) {  // 0, 51, 102, 153, 204, 255
+      TanhOperatorTester()
+        .batch_size(1)
+        .channels(channels)
+        .input_zero_point(uint8_t(input_zero_point))
+        .iterations(1)
+        .TestQU8();
+    }
+  }
+}
+
+TEST(TANH_NC_QU8, small_batch) {  // QU8, batch of 3, default (dense) strides
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .iterations(3)
+      .TestQU8();
+  }
+}
+
+TEST(TANH_NC_QU8, small_batch_with_input_stride) {  // QU8, batch of 3, input rows 129 elements apart
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .input_stride(129)
+      .iterations(3)
+      .TestQU8();
+  }
+}
+
+TEST(TANH_NC_QU8, small_batch_with_output_stride) {  // QU8, batch of 3, output rows 117 elements apart
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .output_stride(117)
+      .iterations(3)
+      .TestQU8();
+  }
+}
+
+TEST(TANH_NC_QU8, small_batch_with_qmin) {  // QU8, batch of 3, lower output clamp qmin = 128
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .qmin(128)
+      .iterations(3)
+      .TestQU8();
+  }
+}
+
+TEST(TANH_NC_QU8, small_batch_with_qmax) {  // QU8, batch of 3, upper output clamp qmax = 128
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .qmax(128)
+      .iterations(3)
+      .TestQU8();
+  }
+}
+
+TEST(TANH_NC_QU8, small_batch_with_input_scale) {  // QU8, batch of 3, sweeping the input scale
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    for (float input_scale = 1.0e-2f; input_scale < 1.0e+2f; input_scale *= 10.0f) {  // starts at 1e-2, multiplied by 10 while below 1e+2
+      TanhOperatorTester()
+        .batch_size(3)
+        .channels(channels)
+        .input_scale(input_scale)
+        .iterations(1)
+        .TestQU8();
+    }
+  }
+}
+
+TEST(TANH_NC_QU8, small_batch_with_input_zero_point) {  // QU8, batch of 3, sweeping the input zero point
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    for (int32_t input_zero_point = 0; input_zero_point <= 255; input_zero_point += 51) {  // 0, 51, 102, 153, 204, 255
+      TanhOperatorTester()
+        .batch_size(3)
+        .channels(channels)
+        .input_zero_point(uint8_t(input_zero_point))
+        .iterations(1)
+        .TestQU8();
+    }
+  }
+}
+
+TEST(TANH_NC_QU8, strided_batch) {  // QU8, batch of 3, input stride 129 and output stride 117
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .input_stride(129)
+      .output_stride(117)
+      .iterations(3)
+      .TestQU8();
+  }
+}
+
+TEST(TANH_NC_QU8, strided_batch_with_qmin) {  // QU8, batch of 3, strided input/output, lower clamp qmin = 128
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .input_stride(129)
+      .output_stride(117)
+      .qmin(128)
+      .iterations(3)
+      .TestQU8();
+  }
+}
+
+TEST(TANH_NC_QU8, strided_batch_with_qmax) {  // QU8, batch of 3, strided input/output, upper clamp qmax = 128
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    TanhOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .input_stride(129)
+      .output_stride(117)
+      .qmax(128)
+      .iterations(3)
+      .TestQU8();
+  }
+}
+
+TEST(TANH_NC_QU8, strided_batch_with_input_scale) {  // QU8, batch of 3, strided input/output, sweeping the input scale
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    for (float input_scale = 1.0e-2f; input_scale < 1.0e+2f; input_scale *= 10.0f) {  // starts at 1e-2, multiplied by 10 while below 1e+2
+      TanhOperatorTester()
+        .batch_size(3)
+        .channels(channels)
+        .input_stride(129)
+        .output_stride(117)
+        .input_scale(input_scale)
+        .iterations(1)
+        .TestQU8();
+    }
+  }
+}
+
+TEST(TANH_NC_QU8, strided_batch_with_input_zero_point) {  // QU8, batch of 3, strided input/output, sweeping the input zero point
+  for (size_t channels = 1; channels < 100; channels += 15) {  // channel counts 1, 16, ..., 91
+    for (int32_t input_zero_point = 0; input_zero_point <= 255; input_zero_point += 51) {  // 0, 51, 102, 153, 204, 255
+      TanhOperatorTester()
+        .batch_size(3)
+        .channels(channels)
+        .input_stride(129)
+        .output_stride(117)
+        .input_zero_point(uint8_t(input_zero_point))
+        .iterations(1)
+        .TestQU8();
+    }
+  }
+}
diff --git a/test/tanh-operator-tester.h b/test/tanh-operator-tester.h
new file mode 100644
index 0000000..4a23391
--- /dev/null
+++ b/test/tanh-operator-tester.h
@@ -0,0 +1,264 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdlib>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <random>
+#include <vector>
+
+#include <xnnpack.h>
+
+class TanhOperatorTester {  // Builder-style test fixture: setters return *this; TestQS8()/TestQU8() compare the operator against a float reference.
+ public:
+  inline TanhOperatorTester& channels(size_t channels) {
+    assert(channels != 0);
+    this->channels_ = channels;
+    return *this;
+  }
+
+  inline size_t channels() const {
+    return this->channels_;
+  }
+
+  inline TanhOperatorTester& input_stride(size_t input_stride) {
+    assert(input_stride != 0);
+    this->input_stride_ = input_stride;
+    return *this;
+  }
+
+  inline size_t input_stride() const {
+    if (this->input_stride_ == 0) {  // 0 means "not set": fall back to a dense row (stride == channels)
+      return this->channels_;
+    } else {
+      assert(this->input_stride_ >= this->channels_);
+      return this->input_stride_;
+    }
+  }
+
+  inline TanhOperatorTester& output_stride(size_t output_stride) {
+    assert(output_stride != 0);
+    this->output_stride_ = output_stride;
+    return *this;
+  }
+
+  inline size_t output_stride() const {
+    if (this->output_stride_ == 0) {  // 0 means "not set": fall back to a dense row (stride == channels)
+      return this->channels_;
+    } else {
+      assert(this->output_stride_ >= this->channels_);
+      return this->output_stride_;
+    }
+  }
+
+  inline TanhOperatorTester& batch_size(size_t batch_size) {
+    assert(batch_size != 0);
+    this->batch_size_ = batch_size;
+    return *this;
+  }
+
+  inline size_t batch_size() const {
+    return this->batch_size_;
+  }
+
+  inline TanhOperatorTester& input_scale(float input_scale) {
+    assert(input_scale > 0.0f);
+    assert(std::isnormal(input_scale));
+    this->input_scale_ = input_scale;
+    return *this;
+  }
+
+  inline float input_scale() const {
+    return this->input_scale_;
+  }
+
+  inline TanhOperatorTester& input_zero_point(uint8_t input_zero_point) {  // stored as unsigned; TestQS8 subtracts 0x80 before use
+    this->input_zero_point_ = input_zero_point;
+    return *this;
+  }
+
+  inline uint8_t input_zero_point() const {
+    return this->input_zero_point_;
+  }
+
+  inline float output_scale() const {  // fixed at 1/128; this tester offers no setter for it
+    return 1.0f / 128.0f;
+  }
+
+  inline uint8_t output_zero_point() const {  // fixed at 128 (unsigned); TestQS8 subtracts 0x80, giving 0
+    return 128;
+  }
+
+  inline TanhOperatorTester& qmin(uint8_t qmin) {  // stored as unsigned; TestQS8 subtracts 0x80 before use
+    this->qmin_ = qmin;
+    return *this;
+  }
+
+  inline uint8_t qmin() const {
+    return this->qmin_;
+  }
+
+  inline TanhOperatorTester& qmax(uint8_t qmax) {  // stored as unsigned; TestQS8 subtracts 0x80 before use
+    this->qmax_ = qmax;
+    return *this;
+  }
+
+  inline uint8_t qmax() const {
+    return this->qmax_;
+  }
+
+  inline TanhOperatorTester& iterations(size_t iterations) {
+    this->iterations_ = iterations;
+    return *this;
+  }
+
+  inline size_t iterations() const {
+    return this->iterations_;
+  }
+
+  void TestQS8() const {  // Runs the QS8 operator on random data and checks each output against a float reference.
+    std::random_device random_device;
+    auto rng = std::mt19937(random_device());
+    auto i8rng = std::bind(
+      std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
+      std::ref(rng));
+
+    std::vector<int8_t> input((batch_size() - 1) * input_stride() + channels() + XNN_EXTRA_BYTES / sizeof(int8_t));
+    std::vector<int8_t> output((batch_size() - 1) * output_stride() + channels());
+    std::vector<float> output_ref(batch_size() * channels());  // reference is stored densely, without strides
+    for (size_t iteration = 0; iteration < iterations(); iteration++) {
+      std::generate(input.begin(), input.end(), std::ref(i8rng));
+      std::fill(output.begin(), output.end(), 0xA5);
+
+      // Compute reference results.
+      for (size_t i = 0; i < batch_size(); i++) {
+        for (size_t c = 0; c < channels(); c++) {
+          const float x = input_scale() *
+            (int32_t(input[i * input_stride() + c]) - int32_t(input_zero_point() - 0x80));
+          const float tanh_x = std::tanh(x);
+          const float scaled_tanh_x = tanh_x / output_scale();
+          float y = scaled_tanh_x;
+          y = std::min<float>(y, int32_t(qmax() - 0x80) - int32_t(output_zero_point() - 0x80));
+          y = std::max<float>(y, int32_t(qmin() - 0x80) - int32_t(output_zero_point() - 0x80));
+          output_ref[i * channels() + c] = y + int32_t(output_zero_point() - 0x80);
+        }
+      }
+
+      // Create, setup, run, and destroy Tanh operator.
+      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+      xnn_operator_t tanh_op = nullptr;
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_create_tanh_nc_qs8(
+          channels(), input_stride(), output_stride(),
+          int8_t(input_zero_point() - 0x80), input_scale(),
+          int8_t(output_zero_point() - 0x80), output_scale(),
+          int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
+          0, &tanh_op));
+      ASSERT_NE(nullptr, tanh_op);
+
+      // Smart pointer to automatically delete tanh_op.
+      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_tanh_op(tanh_op, xnn_delete_operator);
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_setup_tanh_nc_qs8(
+          tanh_op,
+          batch_size(),
+          input.data(), output.data(),
+          nullptr /* thread pool */));
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_run_operator(tanh_op, nullptr /* thread pool */));
+
+      // Verify results.
+      for (size_t i = 0; i < batch_size(); i++) {
+        for (size_t c = 0; c < channels(); c++) {
+          ASSERT_NEAR(float(int32_t(output[i * output_stride() + c])), output_ref[i * channels() + c], 0.6f);
+        }
+      }
+    }
+  }
+
+  void TestQU8() const {  // Runs the QU8 operator on random data and checks each output against a float reference.
+    std::random_device random_device;
+    auto rng = std::mt19937(random_device());
+    auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
+
+    std::vector<uint8_t> input((batch_size() - 1) * input_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
+    std::vector<uint8_t> output((batch_size() - 1) * output_stride() + channels());
+    std::vector<float> output_ref(batch_size() * channels());  // reference is stored densely, without strides
+    for (size_t iteration = 0; iteration < iterations(); iteration++) {
+      std::generate(input.begin(), input.end(), std::ref(u8rng));
+      std::fill(output.begin(), output.end(), 0xA5);
+
+      // Compute reference results.
+      for (size_t i = 0; i < batch_size(); i++) {
+        for (size_t c = 0; c < channels(); c++) {
+          const float x = input_scale() *
+            (int32_t(input[i * input_stride() + c]) - int32_t(input_zero_point()));
+          const float tanh_x = std::tanh(x);
+          const float scaled_tanh_x = tanh_x / output_scale();
+          float y = scaled_tanh_x;
+          y = std::min<float>(y, int32_t(qmax()) - int32_t(output_zero_point()));
+          y = std::max<float>(y, int32_t(qmin()) - int32_t(output_zero_point()));
+          output_ref[i * channels() + c] = y + int32_t(output_zero_point());
+        }
+      }
+
+      // Create, setup, run, and destroy Tanh operator.
+      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+      xnn_operator_t tanh_op = nullptr;
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_create_tanh_nc_qu8(
+          channels(), input_stride(), output_stride(),
+          input_zero_point(), input_scale(),
+          output_zero_point(), output_scale(),
+          qmin(), qmax(),
+          0, &tanh_op));
+      ASSERT_NE(nullptr, tanh_op);
+
+      // Smart pointer to automatically delete tanh_op.
+      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_tanh_op(tanh_op, xnn_delete_operator);
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_setup_tanh_nc_qu8(
+          tanh_op,
+          batch_size(),
+          input.data(), output.data(),
+          nullptr /* thread pool */));
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_run_operator(tanh_op, nullptr /* thread pool */));
+
+      // Verify results.
+      for (size_t i = 0; i < batch_size(); i++) {
+        for (size_t c = 0; c < channels(); c++) {
+          ASSERT_NEAR(float(int32_t(output[i * output_stride() + c])), output_ref[i * channels() + c], 0.6f);
+        }
+      }
+    }
+  }
+
+ private:
+  size_t batch_size_{1};
+  size_t channels_{1};
+  size_t input_stride_{0};  // 0 = unset; the getter then returns channels_
+  size_t output_stride_{0};  // 0 = unset; the getter then returns channels_
+  float input_scale_{0.75f};
+  uint8_t input_zero_point_{121};
+  uint8_t qmin_{0};
+  uint8_t qmax_{255};
+  size_t iterations_{15};
+};