Initial implementation of the following quantized ops.

  - CONV_QUANT8
  - DEPTHWISE_CONV_QUANT8
  - AVERAGE_POOL_QUANT8
  - MAX_POOL_QUANT8
  - LOGISTIC_QUANT8

  Additionally, add functions that plumb the quantization parameters
  (scale and zero-point offset) through to the operands.

Bug: 63905942
Test: mm
Test: end-to-end MobileNet quantized test passes

Change-Id: Ib2753c68bf2c51467ae1c158b45541bcfdf10789
diff --git a/common/CpuExecutor.cpp b/common/CpuExecutor.cpp
index 5e36da9..897105e 100644
--- a/common/CpuExecutor.cpp
+++ b/common/CpuExecutor.cpp
@@ -28,6 +28,8 @@
 static bool allocateIfNeeded(RunTimeOperandInfo* info, const Shape& shape) {
     info->type = shape.type;
     info->dimensions = shape.dimensions;
+    info->scale = shape.scale;
+    info->offset = shape.offset;
     if (info->buffer == nullptr) {
         uint32_t length = sizeOfData(info->type, info->dimensions);
         info->buffer = new uint8_t[length];
@@ -78,6 +80,8 @@
             return false;
         }
         mOperands[i].type = from.type;
+        mOperands[i].scale = from.scale;
+        mOperands[i].offset = from.zeroPoint;
     }
 
     nnAssert(mModel->inputIndexes.size() == mRequest->inputs.size());
@@ -159,16 +163,18 @@
     auto parameterCountIs = [&ins, &outs, &operation](size_t expectedIns,
                                                       size_t expectedOuts) -> bool {
         if (ins.size() != expectedIns || outs.size() != expectedOuts) {
-            LOG(ERROR) << getOperationName(operation.type) << ": Invalid number of ins "
-                       << ins.size() << " / " << expectedIns << " and outs " << outs.size() << " / "
+            LOG(ERROR) << getOperationName(operation.opTuple.operationType)
+                       << ": Invalid number of ins "
+                       << ins.size() << " / " << expectedIns
+                       << " and outs " << outs.size() << " / "
                        << expectedOuts;
             return false;
         }
         return true;
     };
 
-    switch (operation.type) { // static_cast<OperationType>(operation.type)) {
-        case OperationType::ADD_FLOAT32: {
+    switch (operation.opTuple.operationType) {
+        case OperationType::ADD: {
             if (!parameterCountIs(2, 1)) {
                 return ANEURALNETWORKS_BAD_DATA;
             }
@@ -177,13 +183,16 @@
             RunTimeOperandInfo& out = mOperands[outs[0]];
             Shape outShape = out.shape();
 
-            success = addTensorsFloat32Prepare(in1.shape(), in2.shape(), &outShape) &&
-                    allocateIfNeeded(&out, outShape) &&
-                    addTensorsFloat32(reinterpret_cast<const float*>(in1.buffer),
-                                      reinterpret_cast<const float*>(in2.buffer),
-                                      reinterpret_cast<float*>(out.buffer), outShape);
+            if (operation.opTuple.operandType == OperandType::TENSOR_FLOAT32) {
+                success = addTensorsPrepare(in1.shape(), in2.shape(), &outShape) &&
+                          allocateIfNeeded(&out, outShape) &&
+                          addTensorsFloat32(reinterpret_cast<const float*>(in1.buffer),
+                                            reinterpret_cast<const float*>(in2.buffer),
+                                            reinterpret_cast<float*>(out.buffer),
+                                            outShape);
+            }
         } break;
-        case OperationType::DEPTHWISE_CONV_FLOAT32: {
+        case OperationType::DEPTHWISE_CONV: {
             if (!parameterCountIs(8, 1)) {
                 return ANEURALNETWORKS_BAD_DATA;
             }
@@ -200,23 +209,40 @@
             RunTimeOperandInfo& output = mOperands[outs[0]];
             Shape outShape = output.shape();
 
-            success = depthwiseConvFloat32Prepare(input.shape(), filter.shape(), bias.shape(),
-                                                  padding, stride_width, stride_height,
-                                                  &outShape) &&
-                      allocateIfNeeded(&output, outShape) &&
-                      depthwiseConvFloat32(reinterpret_cast<const float*>(input.buffer),
-                                           input.shape(),
-                                           reinterpret_cast<const float*>(filter.buffer),
-                                           filter.shape(),
-                                           reinterpret_cast<const float*>(bias.buffer),
-                                           bias.shape(),
-                                           padding, stride_width, stride_height,
-                                           depth_multiplier, activation,
-                                           reinterpret_cast<float*>(output.buffer),
-                                           outShape);
+            if (operation.opTuple.operandType == OperandType::TENSOR_FLOAT32) {
+                success = depthwiseConvPrepare(input.shape(), filter.shape(), bias.shape(),
+                                               padding, stride_width, stride_height,
+                                               &outShape) &&
+                          allocateIfNeeded(&output, outShape) &&
+                          depthwiseConvFloat32(reinterpret_cast<const float*>(input.buffer),
+                                               input.shape(),
+                                               reinterpret_cast<const float*>(filter.buffer),
+                                               filter.shape(),
+                                               reinterpret_cast<const float*>(bias.buffer),
+                                               bias.shape(),
+                                               padding, stride_width, stride_height,
+                                               depth_multiplier, activation,
+                                               reinterpret_cast<float*>(output.buffer),
+                                               outShape);
+            } else if (operation.opTuple.operandType == OperandType::TENSOR_QUANT8_ASYMM) {
+                success = depthwiseConvPrepare(input.shape(), filter.shape(), bias.shape(),
+                                               padding, stride_width, stride_height,
+                                               &outShape) &&
+                          allocateIfNeeded(&output, outShape) &&
+                          depthwiseConvQuant8(reinterpret_cast<const uint8_t*>(input.buffer),
+                                              input.shape(),
+                                              reinterpret_cast<const uint8_t*>(filter.buffer),
+                                              filter.shape(),
+                                              reinterpret_cast<const int32_t*>(bias.buffer),
+                                              bias.shape(),
+                                              padding, stride_width, stride_height,
+                                              depth_multiplier, activation,
+                                              reinterpret_cast<uint8_t*>(output.buffer),
+                                              outShape);
+            }
 
         } break;
-        case OperationType::CONV_FLOAT32: {
+        case OperationType::CONV: {
             if (!parameterCountIs(7, 1)) {
                 return ANEURALNETWORKS_BAD_DATA;
             }
@@ -232,22 +258,37 @@
             RunTimeOperandInfo& output = mOperands[outs[0]];
             Shape outShape = output.shape();
 
-            success = convFloat32Prepare(input.shape(), filter.shape(), bias.shape(),
-                                         padding, stride_width, stride_height,
-                                         &outShape) &&
-                      allocateIfNeeded(&output, outShape) &&
-                      convFloat32(reinterpret_cast<const float*>(input.buffer), input.shape(),
-                                  reinterpret_cast<const float*>(filter.buffer), filter.shape(),
-                                  reinterpret_cast<const float*>(bias.buffer), bias.shape(),
-                                  padding, stride_width, stride_height, activation,
-                                  reinterpret_cast<float*>(output.buffer), outShape);
-
+            if (operation.opTuple.operandType == OperandType::TENSOR_FLOAT32) {
+                success = convPrepare(input.shape(), filter.shape(), bias.shape(),
+                                      padding, stride_width, stride_height,
+                                      &outShape) &&
+                          allocateIfNeeded(&output, outShape) &&
+                          convFloat32(reinterpret_cast<const float*>(input.buffer), input.shape(),
+                                      reinterpret_cast<const float*>(filter.buffer), filter.shape(),
+                                      reinterpret_cast<const float*>(bias.buffer), bias.shape(),
+                                      padding, stride_width, stride_height, activation,
+                                      reinterpret_cast<float*>(output.buffer), outShape);
+            } else if (operation.opTuple.operandType == OperandType::TENSOR_QUANT8_ASYMM) {
+                success = convPrepare(input.shape(), filter.shape(), bias.shape(),
+                                      padding, stride_width, stride_height,
+                                      &outShape) &&
+                          allocateIfNeeded(&output, outShape) &&
+                          convQuant8(reinterpret_cast<const uint8_t*>(input.buffer),
+                                     input.shape(),
+                                     reinterpret_cast<const uint8_t*>(filter.buffer),
+                                     filter.shape(),
+                                     reinterpret_cast<const int32_t*>(bias.buffer),
+                                     bias.shape(),
+                                     padding, stride_width, stride_height, activation,
+                                     reinterpret_cast<uint8_t*>(output.buffer),
+                                     outShape);
+            }
         } break;
-        case OperationType::AVERAGE_POOL_FLOAT32: {
+        case OperationType::AVERAGE_POOL: {
             if (!parameterCountIs(7, 1)) {
                 return ANEURALNETWORKS_BAD_DATA;
             }
-            const RunTimeOperandInfo& input  = mOperands[ins[0]];
+            const RunTimeOperandInfo& input = mOperands[ins[0]];
 
             int32_t padding          = getInt32ScalarData(mOperands[ins[1]]);
             int32_t stride_width     = getInt32ScalarData(mOperands[ins[2]]);
@@ -259,129 +300,186 @@
             RunTimeOperandInfo& output = mOperands[outs[0]];
             Shape outShape = output.shape();
 
-            success = genericPoolingFloat32Prepare(input.shape(),
-                                                   padding, stride_width, stride_height,
-                                                   filter_width, filter_height,
-                                                   &outShape) &&
-                      allocateIfNeeded(&output, outShape) &&
-                      averagePoolFloat32(reinterpret_cast<const float*>(input.buffer), input.shape(),
+            if (operation.opTuple.operandType == OperandType::TENSOR_FLOAT32) {
+                success = genericPoolingPrepare(input.shape(),
+                                                padding, stride_width, stride_height,
+                                                filter_width, filter_height,
+                                                &outShape) &&
+                          allocateIfNeeded(&output, outShape) &&
+                          averagePoolFloat32(reinterpret_cast<const float*>(input.buffer),
+                                             input.shape(),
+                                             padding, stride_width, stride_height,
+                                             filter_width, filter_height, activation,
+                                             reinterpret_cast<float*>(output.buffer),
+                                             outShape);
+            } else if (operation.opTuple.operandType == OperandType::TENSOR_QUANT8_ASYMM) {
+                success = genericPoolingPrepare(input.shape(),
+                                                padding, stride_width, stride_height,
+                                                filter_width, filter_height,
+                                                &outShape) &&
+                          allocateIfNeeded(&output, outShape) &&
+                          averagePoolQuant8(reinterpret_cast<const uint8_t*>(input.buffer),
+                                            input.shape(),
+                                            padding, stride_width, stride_height,
+                                            filter_width, filter_height, activation,
+                                            reinterpret_cast<uint8_t*>(output.buffer),
+                                            outShape);
+            }
+        } break;
+        case OperationType::L2_POOL: {
+            if (!parameterCountIs(7, 1)) {
+                return ANEURALNETWORKS_BAD_DATA;
+            }
+            const RunTimeOperandInfo& input = mOperands[ins[0]];
+
+            int32_t padding          = getInt32ScalarData(mOperands[ins[1]]);
+            int32_t stride_width     = getInt32ScalarData(mOperands[ins[2]]);
+            int32_t stride_height    = getInt32ScalarData(mOperands[ins[3]]);
+            int32_t filter_width     = getInt32ScalarData(mOperands[ins[4]]);
+            int32_t filter_height    = getInt32ScalarData(mOperands[ins[5]]);
+            int32_t activation       = getInt32ScalarData(mOperands[ins[6]]);
+
+            RunTimeOperandInfo& output = mOperands[outs[0]];
+            Shape outShape = output.shape();
+
+            if (operation.opTuple.operandType == OperandType::TENSOR_FLOAT32) {
+                success = genericPoolingPrepare(input.shape(),
+                                                padding, stride_width, stride_height,
+                                                filter_width, filter_height,
+                                                &outShape) &&
+                          allocateIfNeeded(&output, outShape) &&
+                          l2PoolFloat32(reinterpret_cast<const float*>(input.buffer),
+                                        input.shape(),
+                                        padding, stride_width, stride_height,
+                                        filter_width, filter_height, activation,
+                                        reinterpret_cast<float*>(output.buffer),
+                                        outShape);
+            }
+        } break;
+        case OperationType::MAX_POOL: {
+            if (!parameterCountIs(7, 1)) {
+                return ANEURALNETWORKS_BAD_DATA;
+            }
+            const RunTimeOperandInfo& input = mOperands[ins[0]];
+
+            int32_t padding          = getInt32ScalarData(mOperands[ins[1]]);
+            int32_t stride_width     = getInt32ScalarData(mOperands[ins[2]]);
+            int32_t stride_height    = getInt32ScalarData(mOperands[ins[3]]);
+            int32_t filter_width     = getInt32ScalarData(mOperands[ins[4]]);
+            int32_t filter_height    = getInt32ScalarData(mOperands[ins[5]]);
+            int32_t activation       = getInt32ScalarData(mOperands[ins[6]]);
+
+            RunTimeOperandInfo& output = mOperands[outs[0]];
+            Shape outShape = output.shape();
+
+            if (operation.opTuple.operandType == OperandType::TENSOR_FLOAT32) {
+                success = genericPoolingPrepare(input.shape(),
+                                                padding, stride_width, stride_height,
+                                                filter_width, filter_height,
+                                                &outShape) &&
+                          allocateIfNeeded(&output, outShape) &&
+                          maxPoolFloat32(reinterpret_cast<const float*>(input.buffer),
+                                         input.shape(),
                                          padding, stride_width, stride_height,
                                          filter_width, filter_height, activation,
-                                         reinterpret_cast<float*>(output.buffer), outShape);
-
-        } break;
-        case OperationType::L2_POOL_FLOAT32: {
-            if (!parameterCountIs(7, 1)) {
-                return ANEURALNETWORKS_BAD_DATA;
+                                         reinterpret_cast<float*>(output.buffer),
+                                         outShape);
+            } else if (operation.opTuple.operandType == OperandType::TENSOR_QUANT8_ASYMM) {
+                success = genericPoolingPrepare(input.shape(),
+                                                padding, stride_width, stride_height,
+                                                filter_width, filter_height,
+                                                &outShape) &&
+                          allocateIfNeeded(&output, outShape) &&
+                          maxPoolQuant8(reinterpret_cast<const uint8_t*>(input.buffer),
+                                        input.shape(),
+                                        padding, stride_width, stride_height,
+                                        filter_width, filter_height, activation,
+                                        reinterpret_cast<uint8_t*>(output.buffer),
+                                        outShape);
             }
-            const RunTimeOperandInfo& input  = mOperands[ins[0]];
-
-            int32_t padding          = getInt32ScalarData(mOperands[ins[1]]);
-            int32_t stride_width     = getInt32ScalarData(mOperands[ins[2]]);
-            int32_t stride_height    = getInt32ScalarData(mOperands[ins[3]]);
-            int32_t filter_width     = getInt32ScalarData(mOperands[ins[4]]);
-            int32_t filter_height    = getInt32ScalarData(mOperands[ins[5]]);
-            int32_t activation       = getInt32ScalarData(mOperands[ins[6]]);
-
-            RunTimeOperandInfo& output = mOperands[outs[0]];
-            Shape outShape = output.shape();
-
-            success = genericPoolingFloat32Prepare(input.shape(),
-                                                   padding, stride_width, stride_height,
-                                                   filter_width, filter_height,
-                                                   &outShape) &&
-                      allocateIfNeeded(&output, outShape) &&
-                      l2PoolFloat32(reinterpret_cast<const float*>(input.buffer), input.shape(),
-                                    padding, stride_width, stride_height,
-                                    filter_width, filter_height, activation,
-                                    reinterpret_cast<float*>(output.buffer), outShape);
 
         } break;
-        case OperationType::MAX_POOL_FLOAT32: {
-            if (!parameterCountIs(7, 1)) {
-                return ANEURALNETWORKS_BAD_DATA;
-            }
-            const RunTimeOperandInfo& input  = mOperands[ins[0]];
-
-            int32_t padding          = getInt32ScalarData(mOperands[ins[1]]);
-            int32_t stride_width     = getInt32ScalarData(mOperands[ins[2]]);
-            int32_t stride_height    = getInt32ScalarData(mOperands[ins[3]]);
-            int32_t filter_width     = getInt32ScalarData(mOperands[ins[4]]);
-            int32_t filter_height    = getInt32ScalarData(mOperands[ins[5]]);
-            int32_t activation       = getInt32ScalarData(mOperands[ins[6]]);
-
-            RunTimeOperandInfo& output = mOperands[outs[0]];
-            Shape outShape = output.shape();
-
-            success = genericPoolingFloat32Prepare(input.shape(),
-                                                   padding, stride_width, stride_height,
-                                                   filter_width, filter_height,
-                                                   &outShape) &&
-                      allocateIfNeeded(&output, outShape) &&
-                      maxPoolFloat32(reinterpret_cast<const float*>(input.buffer), input.shape(),
-                                     padding, stride_width, stride_height,
-                                     filter_width, filter_height, activation,
-                                     reinterpret_cast<float*>(output.buffer), outShape);
-
-        } break;
-        case OperationType::RELU_FLOAT32: {
+        case OperationType::RELU: {
             if (!parameterCountIs(1, 1)) {
                 return ANEURALNETWORKS_BAD_DATA;
             }
-            const RunTimeOperandInfo& input  = mOperands[ins[0]];
+            const RunTimeOperandInfo& input = mOperands[ins[0]];
             RunTimeOperandInfo& output = mOperands[outs[0]];
             Shape outShape = output.shape();
 
-            success = genericActivationFloat32Prepare(input.shape(), &outShape) &&
-                      allocateIfNeeded(&output, outShape) &&
-                      reluFloat32(reinterpret_cast<const float*>(input.buffer), input.shape(),
-                                  reinterpret_cast<float*>(output.buffer), outShape);
+            if (operation.opTuple.operandType == OperandType::TENSOR_FLOAT32) {
+                success = genericActivationPrepare(input.shape(), &outShape) &&
+                          allocateIfNeeded(&output, outShape) &&
+                          reluFloat32(reinterpret_cast<const float*>(input.buffer),
+                                      input.shape(),
+                                      reinterpret_cast<float*>(output.buffer),
+                                      outShape);
+            }
         } break;
-        case OperationType::RELU6_FLOAT32: {
+        case OperationType::RELU6: {
             if (!parameterCountIs(1, 1)) {
                 return ANEURALNETWORKS_BAD_DATA;
             }
-            const RunTimeOperandInfo& input  = mOperands[ins[0]];
+            const RunTimeOperandInfo& input = mOperands[ins[0]];
             RunTimeOperandInfo& output = mOperands[outs[0]];
             Shape outShape = output.shape();
 
-            success = genericActivationFloat32Prepare(input.shape(), &outShape) &&
-                      allocateIfNeeded(&output, outShape) &&
-                      relu6Float32(reinterpret_cast<const float*>(input.buffer), input.shape(),
-                                   reinterpret_cast<float*>(output.buffer), outShape);
+            if (operation.opTuple.operandType == OperandType::TENSOR_FLOAT32) {
+                success = genericActivationPrepare(input.shape(), &outShape) &&
+                          allocateIfNeeded(&output, outShape) &&
+                          relu6Float32(reinterpret_cast<const float*>(input.buffer),
+                                       input.shape(),
+                                       reinterpret_cast<float*>(output.buffer),
+                                       outShape);
+            }
         } break;
-        case OperationType::TANH_FLOAT32: {
+        case OperationType::TANH: {
             if (!parameterCountIs(1, 1)) {
                 return ANEURALNETWORKS_BAD_DATA;
             }
-            const RunTimeOperandInfo& input  = mOperands[ins[0]];
+            const RunTimeOperandInfo& input = mOperands[ins[0]];
             RunTimeOperandInfo& output = mOperands[outs[0]];
             Shape outShape = output.shape();
 
-            success = genericActivationFloat32Prepare(input.shape(), &outShape) &&
-                      allocateIfNeeded(&output, outShape) &&
-                      tanhFloat32(reinterpret_cast<const float*>(input.buffer), input.shape(),
-                                  reinterpret_cast<float*>(output.buffer), outShape);
+            if (operation.opTuple.operandType == OperandType::TENSOR_FLOAT32) {
+                success = genericActivationPrepare(input.shape(), &outShape) &&
+                          allocateIfNeeded(&output, outShape) &&
+                          tanhFloat32(reinterpret_cast<const float*>(input.buffer),
+                                      input.shape(),
+                                      reinterpret_cast<float*>(output.buffer),
+                                      outShape);
+            }
         } break;
-        case OperationType::LOGISTIC_FLOAT32: {
+        case OperationType::LOGISTIC: {
             if (!parameterCountIs(1, 1)) {
                 return ANEURALNETWORKS_BAD_DATA;
             }
-            const RunTimeOperandInfo& input  = mOperands[ins[0]];
+            const RunTimeOperandInfo& input = mOperands[ins[0]];
             RunTimeOperandInfo& output = mOperands[outs[0]];
             Shape outShape = output.shape();
 
-            success = genericActivationFloat32Prepare(input.shape(), &outShape) &&
-                      allocateIfNeeded(&output, outShape) &&
-                      logisticFloat32(reinterpret_cast<const float*>(input.buffer), input.shape(),
-                                      reinterpret_cast<float*>(output.buffer), outShape);
+            if (operation.opTuple.operandType == OperandType::TENSOR_FLOAT32) {
+                success = genericActivationPrepare(input.shape(), &outShape) &&
+                          allocateIfNeeded(&output, outShape) &&
+                          logisticFloat32(reinterpret_cast<const float*>(input.buffer),
+                                          input.shape(),
+                                          reinterpret_cast<float*>(output.buffer),
+                                          outShape);
+            } else if (operation.opTuple.operandType == OperandType::TENSOR_QUANT8_ASYMM) {
+                success = genericActivationPrepare(input.shape(), &outShape) &&
+                          allocateIfNeeded(&output, outShape) &&
+                          logisticQuant8(reinterpret_cast<const uint8_t*>(input.buffer),
+                                         input.shape(),
+                                         reinterpret_cast<uint8_t*>(output.buffer),
+                                         outShape);
+            }
         } break;
         default:
             nnAssert(false);
             break;
     }
     if (!success) {
-        LOG(ERROR) << getOperationName(operation.type) << " failed.";
+        LOG(ERROR) << getOperationName(operation.opTuple.operationType) << " failed.";
         return ANEURALNETWORKS_OP_FAILED;
     }
 
diff --git a/common/OperationsUtils.cpp b/common/OperationsUtils.cpp
index 401f7e6..b2ce63a 100644
--- a/common/OperationsUtils.cpp
+++ b/common/OperationsUtils.cpp
@@ -17,8 +17,11 @@
 #define LOG_TAG "OperationsUtils"
 
 #include "OperationsUtils.h"
+#include "Operations.h"
 #include "Utils.h"
 
+#include <cmath>
+
 namespace android {
 namespace nn {
 
@@ -62,5 +65,102 @@
     return shape.dimensions[dimensionIdx];
 }
 
+
+void QuantizeMultiplierSmallerThanOne(double double_multiplier,
+                                      int32_t* quantized_multiplier,
+                                      int32_t* right_shift) {
+    CHECK(double_multiplier >= 0.);
+    CHECK(double_multiplier < 1.);
+    if (double_multiplier == 0.) {
+        *quantized_multiplier = 0;
+        *right_shift = 0;
+        return;
+    }
+    CHECK(double_multiplier > 0.);
+    const double q = std::frexp(double_multiplier, right_shift);
+    *right_shift *= -1;
+    int64_t q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
+    CHECK(q_fixed <= (1ll << 31));
+    if (q_fixed == (1ll << 31)) {
+        q_fixed /= 2;
+        --*right_shift;
+    }
+    CHECK_GE(*right_shift, 0);
+    CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
+    *quantized_multiplier = static_cast<int32_t>(q_fixed);
+}
+
+void QuantizeMultiplierGreaterThanOne(double double_multiplier,
+                                      int32_t* quantized_multiplier,
+                                      int* left_shift) {
+    CHECK(double_multiplier > 1.);
+    const double q = std::frexp(double_multiplier, left_shift);
+    int64_t q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
+    CHECK(q_fixed <= (1ll << 31));
+    if (q_fixed == (1ll << 31)) {
+        q_fixed /= 2;
+        ++*left_shift;
+    }
+    CHECK_GE(*left_shift, 0);
+    CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
+    *quantized_multiplier = static_cast<int32_t>(q_fixed);
+}
+
+void GetQuantizedConvolutionMultipler(const Shape& inputShape,
+                                      const Shape& filterShape,
+                                      const Shape& biasShape,
+                                      const Shape& outputShape,
+                                      float* multiplier) {
+    const float input_product_scale = inputShape.scale * filterShape.scale;
+    const float bias_scale = biasShape.scale;
+    const float output_scale = outputShape.scale;
+
+    // The following conditions must be guaranteed by the training pipeline.
+    CHECK(std::abs(input_product_scale - bias_scale) <=
+              1e-6 * std::min(input_product_scale, bias_scale));
+    CHECK(input_product_scale >= 0);
+    CHECK(input_product_scale < output_scale);
+    *multiplier = input_product_scale / output_scale;
+}
+
+void CalculateActivationRangeUint8(int32_t activation,
+                                   const Shape& outputShape,
+                                   int32_t* act_min,
+                                   int32_t* act_max) {
+    const int32_t qmin = std::numeric_limits<uint8_t>::min();
+    const int32_t qmax = std::numeric_limits<uint8_t>::max();
+
+    const auto scale = outputShape.scale;
+    const auto zero_point = outputShape.offset;
+
+    auto quantize = [scale, zero_point](float f) {
+        return zero_point + static_cast<int32_t>(std::round(f / scale));
+    };
+
+    if (activation == kActivationRelu) {
+        *act_min = std::max(qmin, quantize(0.0));
+        *act_max = qmax;
+    } else if (activation == kActivationRelu6) {
+        *act_min = std::max(qmin, quantize(0.0));
+        *act_max = std::min(qmax, quantize(6.0));
+    } else if (activation == kActivationRelu1) {
+        *act_min = std::max(qmin, quantize(-1.0));
+        *act_max = std::min(qmax, quantize(1.0));
+    } else {
+        *act_min = qmin;
+        *act_max = qmax;
+    }
+}
+
+int32_t CalculateInputRadius(int input_integer_bits, int input_left_shift) {
+    const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) *
+                                      (1ll << (31 - input_integer_bits)) /
+                                      (1ll << input_left_shift);
+    // Tighten bound using floor.  Suppose that we could use the exact value.
+    // After scaling the difference, the result would be at the maximum.  Thus we
+    // must ensure that our value has lower magnitude.
+    return static_cast<int32_t>(std::floor(max_input_rescaled));
+}
+
 } // namespace nn
 } // namespace android
diff --git a/common/Utils.cpp b/common/Utils.cpp
index a050dfd..5a4bf9a 100644
--- a/common/Utils.cpp
+++ b/common/Utils.cpp
@@ -37,7 +37,7 @@
         "UINT32",
         "TENSOR_FLOAT16",
         "TENSOR_FLOAT32",
-        "TENSOR_SIMMETRICAL_QUANT8",
+        "TENSOR_QUANT8_ASYMM",
 };
 
 // TODO Check if this useful
@@ -47,39 +47,39 @@
 };
 
 const char* kOperationNames[ANEURALNETWORKS_NUMBER_OPERATION_TYPES] = {
-        "AVERAGE_POOL_FLOAT32",
-        "CONCATENATION_FLOAT32",
-        "CONV_FLOAT32",
-        "DEPTHWISE_CONV_FLOAT32",
-        "MAX_POOL_FLOAT32",
-        "L2_POOL_FLOAT32",
-        "DEPTH_TO_SPACE_FLOAT32",
-        "SPACE_TO_DEPTH_FLOAT32",
-        "LOCAL_RESPONSE_NORMALIZATION_FLOAT32",
-        "SOFTMAX_FLOAT32",
-        "RESHAPE_FLOAT32",
-        "SPLIT_FLOAT32",
-        "FAKE_QUANT_FLOAT32",
-        "ADD_FLOAT32",
-        "FULLY_CONNECTED_FLOAT32",
-        "CAST_FLOAT32",
-        "MUL_FLOAT32",
-        "L2_NORMALIZATION_FLOAT32",
-        "LOGISTIC_FLOAT32",
-        "RELU_FLOAT32",
-        "RELU6_FLOAT32",
-        "RELU1_FLOAT32",
-        "TANH_FLOAT32",
-        "DEQUANTIZE_FLOAT32",
-        "FLOOR_FLOAT32",
-        "GATHER_FLOAT32",
-        "RESIZE_BILINEAR_FLOAT32",
-        "LSH_PROJECTION_FLOAT32",
-        "LSTM_FLOAT32",
-        "SVDF_FLOAT32",
-        "RNN_FLOAT32",
-        "N_GRAM_FLOAT32",
-        "LOOKUP_FLOAT32",
+        "AVERAGE_POOL",
+        "CONCATENATION",
+        "CONV",
+        "DEPTHWISE_CONV",
+        "MAX_POOL",
+        "L2_POOL",
+        "DEPTH_TO_SPACE",
+        "SPACE_TO_DEPTH",
+        "LOCAL_RESPONSE_NORMALIZATION",
+        "SOFTMAX",
+        "RESHAPE",
+        "SPLIT",
+        "FAKE_QUANT",
+        "ADD",
+        "FULLY_CONNECTED",
+        "CAST",
+        "MUL",
+        "L2_NORMALIZATION",
+        "LOGISTIC",
+        "RELU",
+        "RELU6",
+        "RELU1",
+        "TANH",
+        "DEQUANTIZE",
+        "FLOOR",
+        "GATHER",
+        "RESIZE_BILINEAR",
+        "LSH_PROJECTION",
+        "LSTM",
+        "SVDF",
+        "RNN",
+        "N_GRAM",
+        "LOOKUP",
 };
 
 const char* getOperationName(OperationType type) {
@@ -191,8 +191,8 @@
 
 static bool validOperations(const hidl_vec<Operation>& operations, size_t operandCount) {
     for (auto& op : operations) {
-        if (static_cast<uint32_t>(op.type) >= HAL_NUM_OPERATION_TYPES) {
-            LOG(ERROR) << "Invalid operation type " << toString(op.type);
+        if (static_cast<uint32_t>(op.opTuple.operationType) >= HAL_NUM_OPERATION_TYPES) {
+            LOG(ERROR) << "Invalid operation type " << toString(op.opTuple.operationType);
             return false;
         }
         if (!validOperandIndexes(op.inputs, operandCount) ||
diff --git a/common/include/CpuExecutor.h b/common/include/CpuExecutor.h
index ae49406..8065e00 100644
--- a/common/include/CpuExecutor.h
+++ b/common/include/CpuExecutor.h
@@ -36,6 +36,9 @@
     // to pass together with the dimension to the functions implementing
     // the operators.
     std::vector<uint32_t> dimensions;
+
+    float scale;
+    int32_t offset;
     // Where the operand's data is stored.  Check the corresponding
     // location information in the model to figure out if this points
     // to memory we have allocated for an temporary operand.
@@ -50,7 +53,9 @@
 
     Shape shape() const {
         return Shape{.type = type,
-                     .dimensions = dimensions};
+                     .dimensions = dimensions,
+                     .scale = scale,
+                     .offset = offset};
     }
 };
 
diff --git a/common/include/HalInterfaces.h b/common/include/HalInterfaces.h
index 8e341ab..670d826 100644
--- a/common/include/HalInterfaces.h
+++ b/common/include/HalInterfaces.h
@@ -40,6 +40,7 @@
 using ::android::hardware::neuralnetworks::V1_0::Operand;
 using ::android::hardware::neuralnetworks::V1_0::OperandType;
 using ::android::hardware::neuralnetworks::V1_0::Operation;
+using ::android::hardware::neuralnetworks::V1_0::OperationTuple;
 using ::android::hardware::neuralnetworks::V1_0::OperationType;
 using ::android::hardware::neuralnetworks::V1_0::PerformanceInfo;
 using ::android::hardware::neuralnetworks::V1_0::Request;
diff --git a/common/include/Operations.h b/common/include/Operations.h
index 3774355..bfe5b06 100644
--- a/common/include/Operations.h
+++ b/common/include/Operations.h
@@ -33,13 +33,14 @@
 enum ActivationFn {
     kActivationNone = 0,
     kActivationRelu = 1,
+    kActivationRelu1 = 2,
     kActivationRelu6 = 3,
 };
 
-bool addTensorsFloat32Prepare(const Shape& in1, const Shape& in2, Shape* out1);
+bool addTensorsPrepare(const Shape& in1, const Shape& in2, Shape* out1);
 bool addTensorsFloat32(const float* in1, const float* in2, float* out, const Shape& shape);
 
-bool depthwiseConvFloat32Prepare(const Shape& input,
+bool depthwiseConvPrepare(const Shape& input,
                                  const Shape& filter,
                                  const Shape& bias,
                                  int32_t padding,
@@ -51,28 +52,45 @@
                           int32_t padding, int32_t stride_width, int32_t stride_height,
                           int32_t depth_multiplier, int32_t activation,
                           float* outputData, const Shape& outputShape);
+bool depthwiseConvQuant8(const uint8_t* inputData, const Shape& inputShape,
+                         const uint8_t* filterData, const Shape& filterShape,
+                         const int32_t* biasData, const Shape& biasShape,
+                         int32_t padding, int32_t stride_width, int32_t stride_height,
+                         int32_t depth_multiplier, int32_t activation,
+                         uint8_t* outputData, const Shape& outputShape);
 
-bool convFloat32Prepare(const Shape& input,
-                        const Shape& filter,
-                        const Shape& bias,
-                        int32_t padding,
-                        int32_t stride_width, int32_t stride_height,
-                        Shape* output);
+bool convPrepare(const Shape& input,
+                 const Shape& filter,
+                 const Shape& bias,
+                 int32_t padding,
+                 int32_t stride_width, int32_t stride_height,
+                 Shape* output);
 bool convFloat32(const float* inputData, const Shape& inputShape,
                  const float* filterData, const Shape& filterShape,
                  const float* biasData, const Shape& biasShape,
-                 int32_t padding, int32_t stride_width, int32_t stride_height, int32_t activation,
+                 int32_t padding, int32_t stride_width, int32_t stride_height,
+                 int32_t activation,
                  float* outputData, const Shape& outputShape);
+bool convQuant8(const uint8_t* inputData, const Shape& inputShape,
+                const uint8_t* filterData, const Shape& filterShape,
+                const int32_t* biasData, const Shape& biasShape,
+                int32_t padding, int32_t stride_width, int32_t stride_height,
+                int32_t activation,
+                uint8_t* outputData, const Shape& outputShape);
 
-bool genericPoolingFloat32Prepare(const Shape& input,
-                                  int32_t padding,
-                                  int32_t stride_width, int32_t stride_height,
-                                  int32_t filter_width, int32_t filter_height,
-                                  Shape* output);
+bool genericPoolingPrepare(const Shape& input,
+                           int32_t padding,
+                           int32_t stride_width, int32_t stride_height,
+                           int32_t filter_width, int32_t filter_height,
+                           Shape* output);
 bool averagePoolFloat32(const float* inputData, const Shape& inputShape,
                         int32_t padding, int32_t stride_width, int32_t stride_height,
                         int32_t filter_width, int32_t filter_height, int32_t activation,
                         float* outputData, const Shape& outputShape);
+bool averagePoolQuant8(const uint8_t* inputData, const Shape& inputShape,
+                       int32_t padding, int32_t stride_width, int32_t stride_height,
+                       int32_t filter_width, int32_t filter_height, int32_t activation,
+                       uint8_t* outputData, const Shape& outputShape);
 bool l2PoolFloat32(const float* inputData, const Shape& inputShape,
                    int32_t padding, int32_t stride_width, int32_t stride_height,
                    int32_t filter_width, int32_t filter_height, int32_t activation,
@@ -81,8 +99,12 @@
                     int32_t padding, int32_t stride_width, int32_t stride_height,
                     int32_t filter_width, int32_t filter_height, int32_t activation,
                     float* outputData, const Shape& outputShape);
+bool maxPoolQuant8(const uint8_t* inputData, const Shape& inputShape,
+                   int32_t padding, int32_t stride_width, int32_t stride_height,
+                   int32_t filter_width, int32_t filter_height, int32_t activation,
+                   uint8_t* outputData, const Shape& outputShape);
 
-bool genericActivationFloat32Prepare(const Shape& input, Shape* output);
+bool genericActivationPrepare(const Shape& input, Shape* output);
 bool reluFloat32(const float* inputData, const Shape& inputShape,
                  float* outputData, const Shape& outputShape);
 bool relu6Float32(const float* inputData, const Shape& inputShape,
@@ -91,7 +113,8 @@
                  float* outputData, const Shape& outputShape);
 bool logisticFloat32(const float* inputData, const Shape& inputShape,
                      float* outputData, const Shape& outputShape);
-
+bool logisticQuant8(const uint8_t* inputData, const Shape& inputShape,
+                    uint8_t* outputData, const Shape& outputShape);
 } // namespace nn
 } // namespace android
 
diff --git a/common/include/OperationsUtils.h b/common/include/OperationsUtils.h
index 586046a..3be7479 100644
--- a/common/include/OperationsUtils.h
+++ b/common/include/OperationsUtils.h
@@ -29,6 +29,8 @@
 struct Shape {
     OperandType type;
     std::vector<uint32_t> dimensions;
+    float scale;
+    int32_t offset;
 };
 
 // Verifies that the two shapes are the same.
@@ -47,9 +49,35 @@
 
 inline uint32_t ComputePadding(uint32_t stride, uint32_t in_size, uint32_t filter_size,
                                uint32_t out_size) {
-  return ((out_size - 1) * stride + filter_size - in_size) / 2;
+    uint32_t tmp = (out_size - 1) * stride + filter_size;
+    if (tmp > in_size) {
+        return (tmp - in_size) / 2;
+    } else {
+        return 0;
+    }
 }
 
+void QuantizeMultiplierSmallerThanOne(double double_multiplier,
+                                      int32_t* quantized_multiplier,
+                                      int32_t* right_shift);
+
+void QuantizeMultiplierGreaterThanOne(double double_multiplier,
+                                      int32_t* quantized_multiplier,
+                                      int* left_shift);
+
+void GetQuantizedConvolutionMultipler(const Shape& inputShape,
+                                      const Shape& filterShape,
+                                      const Shape& biasShape,
+                                      const Shape& outputShape,
+                                      float* multiplier);
+
+void CalculateActivationRangeUint8(int32_t activation,
+                                   const Shape& outputShape,
+                                   int32_t* act_min,
+                                   int32_t* act_max);
+
+int32_t CalculateInputRadius(int input_integer_bits, int input_left_shift);
+
 } // namespace nn
 } // namespace android
 
diff --git a/common/operations/Activation.cpp b/common/operations/Activation.cpp
index c96584b..224b6b7 100644
--- a/common/operations/Activation.cpp
+++ b/common/operations/Activation.cpp
@@ -18,13 +18,12 @@
 #include "OperationsUtils.h"
 
 #include "internal/optimized/optimized_ops.h"
-#include "internal/reference/reference_ops.h"
 
 namespace android {
 namespace nn {
 
-bool genericActivationFloat32Prepare(const Shape& input,
-                                     Shape* output) {
+bool genericActivationPrepare(const Shape& input,
+                              Shape* output) {
     DCHECK_EQ(getNumberOfDimensions(input), 4);
     return SetShape(input, output);
 }
@@ -65,5 +64,31 @@
     return true;
 }
 
+bool logisticQuant8(const uint8_t* inputData, const Shape& inputShape,
+                    uint8_t* outputData, const Shape& outputShape) {
+    // Applies the logistic function to a quant8 tensor via optimized_ops::Logistic.
+    static constexpr int kInputIntegerBits = 4;
+    // Input scale multiplied by 2^(31 - kInputIntegerBits).
+    const double input_real_multiplier =
+            inputShape.scale *
+            static_cast<double>(1 << (31 - kInputIntegerBits));
+
+    int32_t input_multiplier = 0;
+    int input_left_shift = 0;  // int, matching the int* left_shift parameter below
+    QuantizeMultiplierGreaterThanOne(input_real_multiplier,
+                                     &input_multiplier,
+                                     &input_left_shift);
+    const int32_t input_range_radius =
+            CalculateInputRadius(kInputIntegerBits, input_left_shift);
+    // NOTE(review): outputShape.scale/offset are not forwarded to the kernel -- confirm.
+    optimized_ops::Logistic(
+            inputData, convertShapeToDims(inputShape),
+            inputShape.offset, input_range_radius,
+            input_multiplier, input_left_shift,
+            outputData, convertShapeToDims(outputShape));
+
+    return true;
+}
+
 }  // namespace nn
 }  // namespace android
diff --git a/common/operations/Conv2D.cpp b/common/operations/Conv2D.cpp
index aff2dfb..c68cc93 100644
--- a/common/operations/Conv2D.cpp
+++ b/common/operations/Conv2D.cpp
@@ -18,7 +18,6 @@
 #include "OperationsUtils.h"
 
 #include "internal/optimized/optimized_ops.h"
-#include "internal/reference/reference_ops.h"
 
 namespace android {
 namespace nn {
@@ -27,12 +26,12 @@
 static constexpr int kStaticBufferSize = 1605632;
 static char static_scratch_buffer[kStaticBufferSize];
 
-bool convFloat32Prepare(const Shape& input,
-                        const Shape& filter,
-                        const Shape& bias,
-                        int32_t padding,
-                        int32_t stride_width, int32_t stride_height,
-                        Shape* output) {
+bool convPrepare(const Shape& input,
+                 const Shape& filter,
+                 const Shape& bias,
+                 int32_t padding,
+                 int32_t stride_width, int32_t stride_height,
+                 Shape* output) {
     DCHECK_EQ(getNumberOfDimensions(input), 4);
     DCHECK_EQ(getNumberOfDimensions(filter), 4);
     DCHECK_EQ(getNumberOfDimensions(bias), 1);
@@ -66,45 +65,50 @@
     return true;
 }
 
+#define ANDROID_NN_CONV_PARAMETERS(Type)                                        \
+    uint32_t height       = getSizeOfDimension(inputShape, 1);                  \
+    uint32_t width        = getSizeOfDimension(inputShape, 2);                  \
+    uint32_t filterHeight = getSizeOfDimension(filterShape, 1);                 \
+    uint32_t filterWidth  = getSizeOfDimension(filterShape, 2);                 \
+    uint32_t outHeight    = getSizeOfDimension(outputShape, 1);                 \
+    uint32_t outWidth     = getSizeOfDimension(outputShape, 2);                 \
+    uint32_t inDepth      = getSizeOfDimension(inputShape, 3);                  \
+    /* Conv locals; enclosing function returns false if allocation fails. */    \
+    uint32_t paddingHeight =                                                    \
+            ComputePadding(stride_height, height, filterHeight, outHeight);     \
+    uint32_t paddingWidth =                                                     \
+            ComputePadding(stride_width, width, filterWidth, outWidth);         \
+    /* Dimensions of the im2col scratch tensor. */                              \
+    Dims<4> im2colDim;                                                          \
+    im2colDim.sizes[3] = (int)getSizeOfDimension(outputShape, 0);               \
+    im2colDim.sizes[2] = (int)getSizeOfDimension(outputShape, 1);               \
+    im2colDim.sizes[1] = (int)getSizeOfDimension(outputShape, 2);               \
+    im2colDim.sizes[0] = (int)inDepth * filterHeight * filterWidth;             \
+    /* Packed strides: each one is the product of all lower sizes. */           \
+    im2colDim.strides[0] = 1;                                                   \
+    for (int i=1; i<4; i++) {                                                   \
+        im2colDim.strides[i] = im2colDim.strides[i-1] * im2colDim.sizes[i-1];   \
+    }                                                                           \
+    /* Scratch size in bytes; heap is used only above kStaticBufferSize. */     \
+    int im2colByteSize = sizeof(Type);                                          \
+    for (int i=0; i<4; i++) {                                                   \
+        im2colByteSize *= im2colDim.sizes[i];                                   \
+    }                                                                           \
+    Type* im2colData = (im2colByteSize <= kStaticBufferSize)                    \
+            ? reinterpret_cast<Type *>(static_scratch_buffer)                   \
+            : new (std::nothrow) Type[im2colByteSize / sizeof(Type)];           \
+    if (im2colData == nullptr) {                                                \
+        return false;                                                           \
+    }
+
+
 bool convFloat32(const float* inputData, const Shape& inputShape,
                  const float* filterData, const Shape& filterShape,
                  const float* biasData, const Shape& biasShape,
                  int32_t padding, int32_t stride_width, int32_t stride_height, int32_t activation,
                  float* outputData, const Shape& outputShape) {
-    uint32_t height       = getSizeOfDimension(inputShape, 1);
-    uint32_t width        = getSizeOfDimension(inputShape, 2);
-    uint32_t filterHeight = getSizeOfDimension(filterShape, 1);
-    uint32_t filterWidth  = getSizeOfDimension(filterShape, 2);
-    uint32_t outHeight    = getSizeOfDimension(outputShape, 1);
-    uint32_t outWidth     = getSizeOfDimension(outputShape, 2);
-    uint32_t inDepth      = getSizeOfDimension(inputShape, 3);
 
-    uint32_t paddingHeight =
-            ComputePadding(stride_height, height, filterHeight, outHeight);
-    uint32_t paddingWidth =
-            ComputePadding(stride_width, width, filterWidth, outWidth);
-
-    Dims<4> im2colDim;
-    im2colDim.sizes[3] = (int)getSizeOfDimension(outputShape, 0);
-    im2colDim.sizes[2] = (int)getSizeOfDimension(outputShape, 1);
-    im2colDim.sizes[1] = (int)getSizeOfDimension(outputShape, 2);
-    im2colDim.sizes[0] = (int)inDepth * filterHeight * filterWidth;
-
-    im2colDim.strides[0] = 1;
-    for (int i=1; i<4; i++) {
-        im2colDim.strides[i] = im2colDim.strides[i-1] * im2colDim.sizes[i-1];
-    }
-
-    float* im2colData = nullptr;
-    int im2colByteSize = sizeof(float);
-    for (int i=0; i<4; i++) {
-        im2colByteSize *= im2colDim.sizes[i];
-    }
-    if (im2colByteSize <= kStaticBufferSize) {
-        im2colData = reinterpret_cast<float *>(static_scratch_buffer);
-    } else {
-        im2colData = new (std::nothrow) float[im2colByteSize / sizeof(float)];
-    }
+    ANDROID_NN_CONV_PARAMETERS(float)
 
     #define ANDROID_NN_CONV(activation)                                        \
         optimized_ops::Conv<FusedActivationFunctionType::activation>(          \
@@ -133,5 +137,62 @@
     return true;
 }
 
+bool convQuant8(const uint8_t* inputData, const Shape& inputShape,
+                const uint8_t* filterData, const Shape& filterShape,
+                const int32_t* biasData, const Shape& biasShape,
+                int32_t padding, int32_t stride_width, int32_t stride_height, int32_t activation,
+                uint8_t* outputData, const Shape& outputShape) {
+    // Quantized 2-D convolution; returns false when `activation` has no kernel here.
+    ANDROID_NN_CONV_PARAMETERS(uint8_t)
+
+    float real_multiplier = 0.0;
+    int32_t output_multiplier = 0;
+    int32_t output_shift = 0;
+    int32_t output_activation_min = 0;
+    int32_t output_activation_max = 0;
+
+    GetQuantizedConvolutionMultipler(inputShape, filterShape, biasShape,
+                                     outputShape, &real_multiplier);
+    QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier,
+                                     &output_shift);
+    CalculateActivationRangeUint8(activation, outputShape,
+                                  &output_activation_min,
+                                  &output_activation_max);
+
+    static gemmlowp::GemmContext gemm_context;
+    // Input and filter zero points are passed negated; the output one is passed as-is.
+    int32_t inputOffset = -inputShape.offset;
+    int32_t filterOffset = -filterShape.offset;
+    int32_t outputOffset = outputShape.offset;
+    #define ANDROID_NN_CONV(activation)                                        \
+        optimized_ops::Conv<FusedActivationFunctionType::activation>(          \
+            inputData, convertShapeToDims(inputShape), inputOffset,            \
+            filterData, convertShapeToDims(filterShape), filterOffset,         \
+            biasData, convertShapeToDims(biasShape),                           \
+            stride_width, paddingWidth, paddingHeight,                         \
+            outputOffset, output_multiplier, output_shift,                     \
+            output_activation_min, output_activation_max,                      \
+            outputData, convertShapeToDims(outputShape),                       \
+            im2colData, im2colDim, &gemm_context)
+
+    bool supported = true;  // cleared when no branch below writes the output
+    if (activation == kActivationNone) {
+        ANDROID_NN_CONV(kNone);
+    } else if (activation == kActivationRelu) {
+        ANDROID_NN_CONV(kRelu);
+    } else if (activation == kActivationRelu6) {
+        ANDROID_NN_CONV(kRelu6);
+    } else {
+        supported = false;  // e.g. kActivationRelu1: report it, don't fake success
+    }
+    #undef ANDROID_NN_CONV
+
+    if (im2colByteSize > kStaticBufferSize) {
+        delete[] im2colData;
+    }
+    return supported;
+}
+
+#undef ANDROID_NN_CONV_PARAMETERS
 }  // namespace nn
 }  // namespace android
diff --git a/common/operations/DepthwiseConv2D.cpp b/common/operations/DepthwiseConv2D.cpp
index 2ab4c15..68012f2 100644
--- a/common/operations/DepthwiseConv2D.cpp
+++ b/common/operations/DepthwiseConv2D.cpp
@@ -18,17 +18,17 @@
 #include "OperationsUtils.h"
 
 #include "internal/optimized/depthwiseconv_float.h"
-#include "internal/reference/depthwiseconv_float.h"
+#include "internal/optimized/depthwiseconv_uint8.h"
 
 namespace android {
 namespace nn {
 
-bool depthwiseConvFloat32Prepare(const Shape& input,
-                                 const Shape& filter,
-                                 const Shape& bias,
-                                 int32_t padding,
-                                 int32_t stride_width, int32_t stride_height,
-                                 Shape* output) {
+bool depthwiseConvPrepare(const Shape& input,
+                          const Shape& filter,
+                          const Shape& bias,
+                          int32_t padding,
+                          int32_t stride_width, int32_t stride_height,
+                          Shape* output) {
     DCHECK_EQ(getNumberOfDimensions(input), 4);
     DCHECK_EQ(getNumberOfDimensions(filter), 4);
     DCHECK_EQ(getNumberOfDimensions(bias), 1);
@@ -61,23 +61,28 @@
     return true;
 }
 
+
+#define ANDROID_NN_DEPTHWISE_CONV_PARAMETERS                                    \
+    uint32_t height       = getSizeOfDimension(inputShape, 1);                  \
+    uint32_t width        = getSizeOfDimension(inputShape, 2);                  \
+    uint32_t filterHeight = getSizeOfDimension(filterShape, 1);                 \
+    uint32_t filterWidth  = getSizeOfDimension(filterShape, 2);                 \
+    uint32_t outHeight    = getSizeOfDimension(outputShape, 1);                 \
+    uint32_t outWidth     = getSizeOfDimension(outputShape, 2);                 \
+    /* Extents come from the shapes; padding is computed per axis below. */     \
+    uint32_t paddingHeight =                                                    \
+            ComputePadding(stride_height, height, filterHeight, outHeight);     \
+    uint32_t paddingWidth =                                                     \
+            ComputePadding(stride_width, width, filterWidth, outWidth);
+
 bool depthwiseConvFloat32(const float* inputData, const Shape& inputShape,
                           const float* filterData, const Shape& filterShape,
                           const float* biasData, const Shape& biasShape,
                           int32_t padding, int32_t stride_width, int32_t stride_height,
                           int32_t depth_multiplier, int32_t activation,
                           float* outputData, const Shape& outputShape) {
-    uint32_t height       = getSizeOfDimension(inputShape, 1);
-    uint32_t width        = getSizeOfDimension(inputShape, 2);
-    uint32_t filterHeight = getSizeOfDimension(filterShape, 1);
-    uint32_t filterWidth  = getSizeOfDimension(filterShape, 2);
-    uint32_t outHeight    = getSizeOfDimension(outputShape, 1);
-    uint32_t outWidth     = getSizeOfDimension(outputShape, 2);
 
-    uint32_t paddingHeight =
-            ComputePadding(stride_height, height, filterHeight, outHeight);
-    uint32_t paddingWidth =
-            ComputePadding(stride_width, width, filterWidth, outWidth);
+    ANDROID_NN_DEPTHWISE_CONV_PARAMETERS
 
     #define ANDROID_NN_DEPTHWISE_CONV(activation)                              \
         optimized_ops::DepthwiseConv<FusedActivationFunctionType::activation>( \
@@ -102,5 +107,58 @@
     return true;
 }
 
+
+bool depthwiseConvQuant8(const uint8_t* inputData, const Shape& inputShape,
+                         const uint8_t* filterData, const Shape& filterShape,
+                         const int32_t* biasData, const Shape& biasShape,
+                         int32_t padding, int32_t stride_width, int32_t stride_height,
+                         int32_t depth_multiplier, int32_t activation,
+                         uint8_t* outputData, const Shape& outputShape) {
+    // Quantized depthwise convolution; returns false when `activation` has no kernel here.
+    ANDROID_NN_DEPTHWISE_CONV_PARAMETERS
+
+    float real_multiplier = 0.0;
+    int32_t output_multiplier = 0;
+    int32_t output_shift = 0;
+    int32_t output_activation_min = 0;
+    int32_t output_activation_max = 0;
+
+    GetQuantizedConvolutionMultipler(inputShape, filterShape, biasShape,
+                                     outputShape, &real_multiplier);
+    QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier,
+                                     &output_shift);
+    CalculateActivationRangeUint8(activation, outputShape,
+                                  &output_activation_min,
+                                  &output_activation_max);
+    // Signed on purpose: the input and filter zero points are negated before use.
+    int32_t inputOffset = -inputShape.offset;
+    int32_t filterOffset = -filterShape.offset;
+    int32_t outputOffset = outputShape.offset;
+    #define ANDROID_NN_DEPTHWISE_CONV(activation)                              \
+        optimized_ops::DepthwiseConv<FusedActivationFunctionType::activation>( \
+            inputData, convertShapeToDims(inputShape), inputOffset,            \
+            filterData, convertShapeToDims(filterShape), filterOffset,         \
+            biasData, convertShapeToDims(biasShape),                           \
+            stride_width, paddingWidth, paddingHeight, depth_multiplier,       \
+            outputOffset, output_multiplier, output_shift,                     \
+            output_activation_min, output_activation_max,                      \
+            outputData, convertShapeToDims(outputShape))
+
+    if (activation == kActivationNone) {
+        ANDROID_NN_DEPTHWISE_CONV(kNone);
+    } else if (activation == kActivationRelu) {
+        ANDROID_NN_DEPTHWISE_CONV(kRelu);
+    } else if (activation == kActivationRelu6) {
+        ANDROID_NN_DEPTHWISE_CONV(kRelu6);
+    } else {
+        return false;  // output was not written; do not report success
+    }
+
+    #undef ANDROID_NN_DEPTHWISE_CONV
+
+    return true;
+}
+
+#undef ANDROID_NN_DEPTHWISE_CONV_PARAMETERS
 }  // namespace nn
 }  // namespace android
diff --git a/common/operations/Pooling.cpp b/common/operations/Pooling.cpp
index 0a328b3..bb20be5 100644
--- a/common/operations/Pooling.cpp
+++ b/common/operations/Pooling.cpp
@@ -18,16 +18,15 @@
 #include "OperationsUtils.h"
 
 #include "internal/optimized/optimized_ops.h"
-#include "internal/reference/reference_ops.h"
 
 namespace android {
 namespace nn {
 
-bool genericPoolingFloat32Prepare(const Shape& input,
-                                  int32_t padding,
-                                  int32_t stride_width, int32_t stride_height,
-                                  int32_t filter_width, int32_t filter_height,
-                                  Shape* output) {
+bool genericPoolingPrepare(const Shape& input,
+                           int32_t padding,
+                           int32_t stride_width, int32_t stride_height,
+                           int32_t filter_width, int32_t filter_height,
+                           Shape* output) {
     DCHECK_EQ(getNumberOfDimensions(input), 4);
     DCHECK_EQ(stride_width, stride_height);
 
@@ -54,19 +53,23 @@
     return true;
 }
 
+#define ANDROID_NN_POOLING_PARAMETERS                                           \
+    uint32_t height       = getSizeOfDimension(inputShape, 1);                  \
+    uint32_t width        = getSizeOfDimension(inputShape, 2);                  \
+    uint32_t outHeight    = getSizeOfDimension(outputShape, 1);                 \
+    uint32_t outWidth     = getSizeOfDimension(outputShape, 2);                 \
+    /* Per-axis padding from ComputePadding(stride, in, filter, out). */        \
+    uint32_t paddingHeight =                                                    \
+            ComputePadding(stride_height, height, filter_height, outHeight);    \
+    uint32_t paddingWidth =                                                     \
+            ComputePadding(stride_width, width, filter_width, outWidth);
+
 bool averagePoolFloat32(const float* inputData, const Shape& inputShape,
                         int32_t padding, int32_t stride_width, int32_t stride_height,
                         int32_t filter_width, int32_t filter_height, int32_t activation,
                         float* outputData, const Shape& outputShape) {
-    uint32_t height       = getSizeOfDimension(inputShape, 1);
-    uint32_t width        = getSizeOfDimension(inputShape, 2);
-    uint32_t outHeight    = getSizeOfDimension(outputShape, 1);
-    uint32_t outWidth     = getSizeOfDimension(outputShape, 2);
 
-    uint32_t paddingHeight =
-            ComputePadding(stride_height, height, filter_height, outHeight);
-    uint32_t paddingWidth =
-            ComputePadding(stride_width, width, filter_width, outWidth);
+    ANDROID_NN_POOLING_PARAMETERS
 
     #define ANDROID_NN_AVERAGE_POOL(activation)                                \
         optimized_ops::AveragePool<FusedActivationFunctionType::activation>(   \
@@ -90,19 +93,49 @@
     return true;
 }
 
+// Quantized (uint8) average pooling with a fused activation; |padding| is not
+// read, the padding comes from ANDROID_NN_POOLING_PARAMETERS. Returns false
+// without running a kernel when |activation| is not None, Relu or Relu6.
+bool averagePoolQuant8(const uint8_t* inputData, const Shape& inputShape,
+                       int32_t padding, int32_t stride_width, int32_t stride_height,
+                       int32_t filter_width, int32_t filter_height, int32_t activation,
+                       uint8_t* outputData, const Shape& outputShape) {
+
+    ANDROID_NN_POOLING_PARAMETERS
+
+    int32_t output_activation_min = 0;
+    int32_t output_activation_max = 0;
+    CalculateActivationRangeUint8(activation, outputShape,
+                                  &output_activation_min,
+                                  &output_activation_max);
+
+    #define ANDROID_NN_AVERAGE_POOL(activation)                                \
+        optimized_ops::AveragePool<FusedActivationFunctionType::activation>(   \
+            inputData, convertShapeToDims(inputShape),                         \
+            stride_width, paddingWidth, paddingHeight,                         \
+            filter_width, filter_height,                                       \
+            output_activation_min, output_activation_max,                      \
+            outputData, convertShapeToDims(outputShape))
+
+    if (activation == kActivationNone) {
+        ANDROID_NN_AVERAGE_POOL(kNone);
+    } else if (activation == kActivationRelu) {
+        ANDROID_NN_AVERAGE_POOL(kRelu);
+    } else if (activation == kActivationRelu6) {
+        ANDROID_NN_AVERAGE_POOL(kRelu6);
+    } else {
+        return false;  // No kernel matches this activation; output not written.
+    }
+    #undef ANDROID_NN_AVERAGE_POOL
+    return true;
+}
+
 bool l2PoolFloat32(const float* inputData, const Shape& inputShape,
                    int32_t padding, int32_t stride_width, int32_t stride_height,
                    int32_t filter_width, int32_t filter_height, int32_t activation,
                    float* outputData, const Shape& outputShape) {
-    uint32_t height       = getSizeOfDimension(inputShape, 1);
-    uint32_t width        = getSizeOfDimension(inputShape, 2);
-    uint32_t outHeight    = getSizeOfDimension(outputShape, 1);
-    uint32_t outWidth     = getSizeOfDimension(outputShape, 2);
 
-    uint32_t paddingHeight =
-            ComputePadding(stride_height, height, filter_height, outHeight);
-    uint32_t paddingWidth =
-            ComputePadding(stride_width, width, filter_width, outWidth);
+    ANDROID_NN_POOLING_PARAMETERS
 
     #define ANDROID_NN_L2_POOL(activation)                                     \
         optimized_ops::L2Pool<FusedActivationFunctionType::activation>(        \
@@ -130,15 +163,8 @@
                     int32_t padding, int32_t stride_width, int32_t stride_height,
                     int32_t filter_width, int32_t filter_height, int32_t activation,
                     float* outputData, const Shape& outputShape) {
-    uint32_t height       = getSizeOfDimension(inputShape, 1);
-    uint32_t width        = getSizeOfDimension(inputShape, 2);
-    uint32_t outHeight    = getSizeOfDimension(outputShape, 1);
-    uint32_t outWidth     = getSizeOfDimension(outputShape, 2);
 
-    uint32_t paddingHeight =
-            ComputePadding(stride_height, height, filter_height, outHeight);
-    uint32_t paddingWidth =
-            ComputePadding(stride_width, width, filter_width, outWidth);
+    ANDROID_NN_POOLING_PARAMETERS
 
     #define ANDROID_NN_MAX_POOL(activation)                                    \
         optimized_ops::MaxPool<FusedActivationFunctionType::activation>(       \
@@ -162,7 +188,43 @@
     return true;
 }
 
+// Quantized (uint8) max pooling with a fused activation; |padding| is not
+// read, the padding comes from ANDROID_NN_POOLING_PARAMETERS. Returns false
+// without running a kernel when |activation| is not None, Relu or Relu6.
+bool maxPoolQuant8(const uint8_t* inputData, const Shape& inputShape,
+                   int32_t padding, int32_t stride_width, int32_t stride_height,
+                   int32_t filter_width, int32_t filter_height, int32_t activation,
+                   uint8_t* outputData, const Shape& outputShape) {
 
+    ANDROID_NN_POOLING_PARAMETERS
 
+    int32_t output_activation_min = 0;
+    int32_t output_activation_max = 0;
+    CalculateActivationRangeUint8(activation, outputShape,
+                                  &output_activation_min,
+                                  &output_activation_max);
+
+    #define ANDROID_NN_MAX_POOL(activation)                                    \
+        optimized_ops::MaxPool<FusedActivationFunctionType::activation>(       \
+            inputData, convertShapeToDims(inputShape),                         \
+            stride_width, paddingWidth, paddingHeight,                         \
+            filter_width, filter_height,                                       \
+            output_activation_min, output_activation_max,                      \
+            outputData, convertShapeToDims(outputShape))
+
+    if (activation == kActivationNone) {
+        ANDROID_NN_MAX_POOL(kNone);
+    } else if (activation == kActivationRelu) {
+        ANDROID_NN_MAX_POOL(kRelu);
+    } else if (activation == kActivationRelu6) {
+        ANDROID_NN_MAX_POOL(kRelu6);
+    } else {
+        return false;  // No kernel matches this activation; output not written.
+    }
+    #undef ANDROID_NN_MAX_POOL
+    return true;
+}
+
+#undef ANDROID_NN_POOLING_PARAMETERS
 }  // namespace nn
 }  // namespace android
diff --git a/common/operations/SimpleMath.cpp b/common/operations/SimpleMath.cpp
index 36b1cb0..882304a 100644
--- a/common/operations/SimpleMath.cpp
+++ b/common/operations/SimpleMath.cpp
@@ -24,7 +24,7 @@
 namespace android {
 namespace nn {
 
-bool addTensorsFloat32Prepare(const Shape& in1, const Shape& in2, Shape* out) {
+bool addTensorsPrepare(const Shape& in1, const Shape& in2, Shape* out) {
     return SameShape(in1, in2) && SetShape(in1, out);
 }