NNAPI systrace for timing statistics

This adds systrace tracing to NNAPI through NNTRACE_* macros from a new
Tracing.h header; a sketch of how the macros could work follows below.
The tracing will be helpful for:
- getting numbers on where time is currently spent
- diagnosing and improving performance
- benchmarking
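
The following is an illustration only: the actual Tracing.h is not shown in
this diff, and the ScopedTrace helper and macro bodies below are hypothetical,
assuming the NDK ATrace API from <android/trace.h>.

    // Hypothetical sketch, not the actual Tracing.h (which is not part of
    // this excerpt). Assumes the NDK ATrace API from <android/trace.h>.
    #include <android/trace.h>

    namespace android {
    namespace nn {

    // Opens a named systrace section on construction and closes it when the
    // enclosing scope exits, so each traced function appears as one slice.
    class ScopedTrace {
     public:
        explicit ScopedTrace(const char* name) { ATrace_beginSection(name); }
        ~ScopedTrace() { ATrace_endSection(); }
    };

    }  // namespace nn
    }  // namespace android

    // NNTRACE_COMP labels a computation, NNTRACE_TRANS a translation/setup
    // phase. NNTRACE_COMP_SWITCH, which ends the current section and starts
    // a computation-phase section mid-function, is omitted here.
    #define NNTRACE_COMP(name)  ::android::nn::ScopedTrace _nnTrace(name)
    #define NNTRACE_TRANS(name) ::android::nn::ScopedTrace _nnTrace(name)

Judging from the usage in the diff below, NNTRACE_COMP marks a function whose
body is the computation itself, NNTRACE_TRANS marks a function that first
translates parameters and shapes, and NNTRACE_COMP_SWITCH relabels the rest of
the scope as computation right before the tflite kernel call.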

TODOs:
- Write analysis tools for traces

Change-Id: I9026f1043428cb715b577901bec3a2e1e39a82e3
Merged-In: I9026f1043428cb715b577901bec3a2e1e39a82e3
Bug: 78137932
Test: manually run systrace.py against unit tests
Test: manually run systrace.py against benchmarking app
(cherry picked from commit e9e637ab73b68b5982281a3f7c621f6a75d51743)
diff --git a/common/operations/Activation.cpp b/common/operations/Activation.cpp
index b80984d..5da58f9 100644
--- a/common/operations/Activation.cpp
+++ b/common/operations/Activation.cpp
@@ -19,11 +19,14 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
 bool reluFloat32(const float* inputData, const Shape& inputShape,
                  float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("reluFloat32");
     int numElements = getNumberOfElements(inputShape);
     for (int i=0; i<numElements; i++, inputData++, outputData++) {
         *outputData = std::max(0.f, *inputData);
@@ -33,6 +36,7 @@
 
 bool relu1Float32(const float* inputData, const Shape& inputShape,
                   float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("relu1Float32");
     int numElements = getNumberOfElements(inputShape);
     for (int i=0; i<numElements; i++, inputData++, outputData++) {
         *outputData = std::min(std::max(-1.f, *inputData), 1.f);
@@ -42,6 +46,7 @@
 
 bool relu6Float32(const float* inputData, const Shape& inputShape,
                   float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("relu6Float32");
     int numElements = getNumberOfElements(inputShape);
     for (int i=0; i<numElements; i++, inputData++, outputData++) {
         *outputData = std::min(std::max(0.f, *inputData), 6.f);
@@ -51,6 +56,7 @@
 
 bool tanhFloat32(const float* inputData, const Shape& inputShape,
                  float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("tanhFloat32");
     int numElements = getNumberOfElements(inputShape);
     for (int i=0; i<numElements; i++, inputData++, outputData++) {
         *outputData = std::tanh(*inputData);
@@ -60,6 +66,7 @@
 
 bool logisticFloat32(const float* inputData, const Shape& inputShape,
                      float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("logisticFloat32");
     int numElements = getNumberOfElements(inputShape);
     for (int i=0; i<numElements; i++, inputData++, outputData++) {
         *outputData = 1.f / (1.f + std::exp(-*inputData));
@@ -70,6 +77,7 @@
 bool softmaxFloat32(const float* inputData, const Shape& inputShape,
                     const float beta,
                     float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("softmaxFloat32");
     tflite::Dims<4> dim;
     if (getNumberOfDimensions(inputShape) == 2) {
         uint32_t batch_size = getSizeOfDimension(inputShape, 0);
@@ -85,6 +93,7 @@
         return false;
     }
 
+    NNTRACE_COMP_SWITCH("optimized_ops::Softmax");
     tflite::optimized_ops::Softmax(inputData, dim, beta,
                                    outputData, dim);
     return true;
@@ -107,18 +116,21 @@
 
 bool reluQuant8(const uint8_t* inputData, const Shape& inputShape,
                 uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("reluQuant8");
     ANDROID_NN_RELUX_QUANT8(kActivationRelu)
     return true;
 }
 
 bool relu1Quant8(const uint8_t* inputData, const Shape& inputShape,
                  uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("relu1Quant8");
     ANDROID_NN_RELUX_QUANT8(kActivationRelu1)
     return true;
 }
 
 bool relu6Quant8(const uint8_t* inputData, const Shape& inputShape,
                  uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("relu6Quant8");
     ANDROID_NN_RELUX_QUANT8(kActivationRelu6)
     return true;
 }
@@ -127,6 +139,7 @@
 
 bool logisticQuant8(const uint8_t* inputData, const Shape& inputShape,
                     uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("logisticQuant8");
     if (outputShape.offset != 0 || outputShape.scale != 1.f / 256) {
         LOG(ERROR) << "incorrect scale / offset for output";
         return false;
@@ -149,6 +162,7 @@
     int32_t input_range_radius =
             CalculateInputRadius(kInputIntegerBits, input_left_shift);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::Logistic");
     tflite::optimized_ops::Logistic(
             inputData, convertShapeToDims(inputShape),
             inputShape.offset, input_range_radius,
@@ -161,6 +175,7 @@
 bool softmaxQuant8(const uint8_t* inputData, const Shape& inputShape,
                    const float beta,
                    uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("softmaxQuant8");
     tflite::Dims<4> dim;
     if (getNumberOfDimensions(inputShape) == 2) {
         uint32_t batch_size = getSizeOfDimension(inputShape, 0);
@@ -196,6 +211,7 @@
     float diff_min = -1.0f * CalculateInputRadius(kScaledDiffIntegerBits,
                                                   input_left_shift);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::Softmax");
     tflite::optimized_ops::Softmax(inputData, dim, input_multiplier,
                                    input_left_shift, diff_min,
                                    outputData, dim);
diff --git a/common/operations/Concatenation.cpp b/common/operations/Concatenation.cpp
index fc65e43..cc1fa5a 100644
--- a/common/operations/Concatenation.cpp
+++ b/common/operations/Concatenation.cpp
@@ -19,12 +19,15 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
 bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs,
                           const std::vector<Shape>& inputShapes, int32_t axis,
                           float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("concatenationFloat32");
     int num_inputs = inputShapes.size();
     std::vector<tflite::Dims<4>*> inputDimsPtr(num_inputs);
     std::vector<tflite::Dims<4> > inputDims(num_inputs);
@@ -33,6 +36,7 @@
         inputDimsPtr[i] = &inputDims[i];
     }
 
+    NNTRACE_COMP_SWITCH("optimized_ops::Concatenation");
     tflite::optimized_ops::Concatenation<tflite::FusedActivationFunctionType::kNone, float>(
             getNumberOfDimensions(outputShape) - axis - 1,
             inputDataPtrs.data(), inputDimsPtr.data(), num_inputs,
@@ -44,6 +48,7 @@
 bool concatenationQuant8(const std::vector<const uint8_t*>& inputDataPtrs,
                          const std::vector<Shape>& inputShapes, int32_t axis,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("concatenationQuant8");
     int num_inputs = inputShapes.size();
     std::vector<tflite::Dims<4>*> inputDimsPtr(num_inputs);
     std::vector<tflite::Dims<4> > inputDims(num_inputs);
@@ -52,6 +57,7 @@
         inputDimsPtr[i] = &inputDims[i];
     }
 
+    NNTRACE_COMP_SWITCH("optimized_ops::Concatenation");
     tflite::optimized_ops::Concatenation<tflite::FusedActivationFunctionType::kNone, uint8_t>(
             getNumberOfDimensions(outputShape) - axis - 1,
             inputDataPtrs.data(), inputDimsPtr.data(), num_inputs,
diff --git a/common/operations/Conv2D.cpp b/common/operations/Conv2D.cpp
index c16426c..60f9132 100644
--- a/common/operations/Conv2D.cpp
+++ b/common/operations/Conv2D.cpp
@@ -19,6 +19,8 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -84,6 +86,7 @@
                  int32_t stride_width, int32_t stride_height,
                  int32_t activation,
                  float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("convFloat32");
 
     ANDROID_NN_CONV_PARAMETERS(float)
 
@@ -95,6 +98,7 @@
 
     // Prevent concurrent executions that may access the scratch buffer.
     std::unique_lock<std::mutex> lock(executionMutex);
+    NNTRACE_COMP_SWITCH("optimized_ops::Conv");
     tflite::optimized_ops::Conv(
             inputData, convertShapeToDims(inputShape),
             filterData, convertShapeToDims(filterShape),
@@ -116,6 +120,7 @@
                 int32_t stride_width, int32_t stride_height,
                 int32_t activation,
                 uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("convQuant8");
 
     ANDROID_NN_CONV_PARAMETERS(uint8_t)
 
@@ -146,6 +151,8 @@
     std::unique_lock<std::mutex> lock(executionMutex);
     // Allow gemmlowp to automatically decide how many threads to use.
     gemm_context.set_max_num_threads(0);
+
+    NNTRACE_COMP_SWITCH("optimized_ops::Conv");
     tflite::optimized_ops::Conv(
             inputData, convertShapeToDims(inputShape), inputOffset,
             filterData, convertShapeToDims(filterShape), filterOffset,
diff --git a/common/operations/DepthwiseConv2D.cpp b/common/operations/DepthwiseConv2D.cpp
index 5dd67e2..60c5e41 100644
--- a/common/operations/DepthwiseConv2D.cpp
+++ b/common/operations/DepthwiseConv2D.cpp
@@ -20,6 +20,8 @@
 #include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -42,6 +44,7 @@
                           int32_t stride_width, int32_t stride_height,
                           int32_t depth_multiplier, int32_t activation,
                           float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("depthwiseConvFloat32");
 
     ANDROID_NN_DEPTHWISE_CONV_PARAMETERS
 
@@ -49,6 +52,7 @@
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::DepthwiseConv");
     tflite::optimized_ops::DepthwiseConv(
             inputData, convertShapeToDims(inputShape),
             filterData, convertShapeToDims(filterShape),
@@ -70,6 +74,7 @@
                          int32_t stride_width, int32_t stride_height,
                          int32_t depth_multiplier, int32_t activation,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("depthwiseConvQuant8");
 
     ANDROID_NN_DEPTHWISE_CONV_PARAMETERS
 
@@ -94,6 +99,7 @@
     uint32_t filterOffset = -filterShape.offset;
     uint32_t outputOffset = outputShape.offset;
 
+    NNTRACE_COMP_SWITCH("optimized_ops::DepthwiseConv");
     tflite::optimized_ops::DepthwiseConv(
             inputData, convertShapeToDims(inputShape), inputOffset,
             filterData, convertShapeToDims(filterShape), filterOffset,
diff --git a/common/operations/EmbeddingLookup.cpp b/common/operations/EmbeddingLookup.cpp
index 504c684..9f231ed 100644
--- a/common/operations/EmbeddingLookup.cpp
+++ b/common/operations/EmbeddingLookup.cpp
@@ -20,6 +20,8 @@
 #include "HalInterfaces.h"
 #include "Operations.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -32,6 +34,7 @@
 }
 
 bool EmbeddingLookup::Eval() {
+  NNTRACE_COMP("EmbeddingLookup::Eval");
   const int row_size = value_->shape().dimensions[0];
   const int total_bytes = sizeOfData(value_->type, value_->dimensions);
   const int row_bytes = total_bytes/row_size;
diff --git a/common/operations/FullyConnected.cpp b/common/operations/FullyConnected.cpp
index 4e2deff..12da51b 100644
--- a/common/operations/FullyConnected.cpp
+++ b/common/operations/FullyConnected.cpp
@@ -20,6 +20,8 @@
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -33,6 +35,7 @@
                            const float* biasData, const Shape& biasShape,
                            int32_t activation,
                            float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("fullyConnectedFloat32");
     float output_activation_min, output_activation_max;
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
@@ -42,6 +45,7 @@
     uint32_t batch_size = getSizeOfDimension(outputShape, 0);
     uint32_t input_n_elements = getNumberOfElements(inputShape);
     if (batch_size * batch_size == input_n_elements) {
+        NNTRACE_COMP_SWITCH("reference_ops::FullyConnected");
         tflite::reference_ops::FullyConnected(
                 inputData, convertShapeToDims(inputShape),
                 weightsData, convertShapeToDims(weightsShape),
@@ -49,6 +53,7 @@
                 output_activation_min, output_activation_max,
                 outputData, convertShapeToDims(outputShape));
     } else {
+        NNTRACE_COMP_SWITCH("optimized_ops::FullyConnected");
         tflite::optimized_ops::FullyConnected(
                 inputData, convertShapeToDims(inputShape),
                 weightsData, convertShapeToDims(weightsShape),
@@ -64,6 +69,7 @@
                           const int32_t* biasData, const Shape& biasShape,
                           int32_t activation,
                           uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("fullyConnectedQuant8");
     int32_t inputOffset = -inputShape.offset;
     int32_t weightsOffset = -weightsShape.offset;
     int32_t outputOffset = outputShape.offset;
@@ -91,6 +97,7 @@
     // Allow gemmlowp to automatically decide how many threads to use.
     gemm_context.set_max_num_threads(0);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::FullyConnected");
     tflite::optimized_ops::FullyConnected(
             inputData, convertShapeToDims(inputShape), inputOffset,
             weightsData, convertShapeToDims(weightsShape), weightsOffset,
diff --git a/common/operations/HashtableLookup.cpp b/common/operations/HashtableLookup.cpp
index 1c8d802..e864b3d 100644
--- a/common/operations/HashtableLookup.cpp
+++ b/common/operations/HashtableLookup.cpp
@@ -20,6 +20,8 @@
 #include "HalInterfaces.h"
 #include "Operations.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -42,6 +44,7 @@
 }
 
 bool HashtableLookup::Eval() {
+  NNTRACE_COMP("HashtableLookup::Eval");
   const int num_rows = value_->shape().dimensions[0];
   const int row_bytes = sizeOfData(value_->type, value_->dimensions) / num_rows;
   void* pointer = nullptr;
diff --git a/common/operations/LSHProjection.cpp b/common/operations/LSHProjection.cpp
index 57d1475..97183e2 100644
--- a/common/operations/LSHProjection.cpp
+++ b/common/operations/LSHProjection.cpp
@@ -20,6 +20,8 @@
 #include "HalInterfaces.h"
 #include "util/hash/farmhash.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -141,6 +143,8 @@
 }
 
 bool LSHProjection::Eval() {
+  NNTRACE_COMP("LSHProjection::Eval");
+
   int32_t* out_buf = reinterpret_cast<int32_t*>(output_->buffer);
 
   switch (type_) {
diff --git a/common/operations/LSTM.cpp b/common/operations/LSTM.cpp
index 80b0eb9..7667c66 100644
--- a/common/operations/LSTM.cpp
+++ b/common/operations/LSTM.cpp
@@ -19,6 +19,8 @@
 #include "CpuExecutor.h"
 #include "HalInterfaces.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -300,6 +302,8 @@
 }
 
 bool LSTMCell::Eval() {
+  NNTRACE_COMP("LSTMCell::Eval");
+
   const uint32_t n_batch = input_->shape().dimensions[0];
   const uint32_t n_input = input_->shape().dimensions[1];
   // n_cell and n_output will be the same size when there is no projection.
diff --git a/common/operations/Normalization.cpp b/common/operations/Normalization.cpp
index eccb3bd..7f36dbf 100644
--- a/common/operations/Normalization.cpp
+++ b/common/operations/Normalization.cpp
@@ -19,11 +19,14 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
 bool l2normFloat32(const float* inputData, const Shape& inputShape,
                    float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("optimized_ops::L2Normalization::float");
     tflite::optimized_ops::L2Normalization<tflite::FusedActivationFunctionType::kNone>(
             inputData, convertShapeToDims(inputShape),
             outputData, convertShapeToDims(outputShape));
@@ -33,6 +36,7 @@
 
 bool l2normQuant8(const uint8_t* inputData, const Shape& inputShape,
                   uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("optimized_ops::L2Normalization::uint8");
     tflite::optimized_ops::L2Normalization(
             inputData, convertShapeToDims(inputShape),
             inputShape.offset,
@@ -44,6 +48,7 @@
 bool localResponseNormFloat32(const float* inputData, const Shape& inputShape,
                               int32_t radius, float bias, float alpha, float beta,
                               float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("optimized_ops::LocalResponseNormalization::float");
     tflite::optimized_ops::LocalResponseNormalization(
             inputData, convertShapeToDims(inputShape),
             radius, bias, alpha, beta,
diff --git a/common/operations/Pooling.cpp b/common/operations/Pooling.cpp
index db4497a..7f60323 100644
--- a/common/operations/Pooling.cpp
+++ b/common/operations/Pooling.cpp
@@ -19,6 +19,8 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -37,6 +39,7 @@
                         int32_t stride_width, int32_t stride_height,
                         int32_t filter_width, int32_t filter_height, int32_t activation,
                         float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("averagePoolFloat32");
 
     ANDROID_NN_POOLING_PARAMETERS
 
@@ -44,6 +47,7 @@
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::AveragePool");
     tflite::optimized_ops::AveragePool(
             inputData, convertShapeToDims(inputShape),
             stride_width, stride_height, paddingWidth, paddingHeight,
@@ -60,6 +64,7 @@
                        int32_t stride_width, int32_t stride_height,
                        int32_t filter_width, int32_t filter_height, int32_t activation,
                        uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("averagePoolQuant8");
 
     ANDROID_NN_POOLING_PARAMETERS
 
@@ -70,6 +75,7 @@
                                   &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::AveragePool");
     tflite::optimized_ops::AveragePool(
             inputData, convertShapeToDims(inputShape),
             stride_width, stride_height, paddingWidth, paddingHeight,
@@ -86,6 +92,7 @@
                    int32_t stride_width, int32_t stride_height,
                    int32_t filter_width, int32_t filter_height, int32_t activation,
                    float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("l2PoolFloat32");
 
     ANDROID_NN_POOLING_PARAMETERS
 
@@ -93,6 +100,7 @@
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::L2Pool");
     tflite::optimized_ops::L2Pool(
             inputData, convertShapeToDims(inputShape),
             stride_width, stride_height, paddingWidth, paddingHeight,
@@ -109,6 +117,7 @@
                     int32_t stride_width, int32_t stride_height,
                     int32_t filter_width, int32_t filter_height, int32_t activation,
                     float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("maxPoolFloat32");
 
     ANDROID_NN_POOLING_PARAMETERS
 
@@ -116,6 +125,7 @@
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::MaxPool");
     tflite::optimized_ops::MaxPool(
             inputData, convertShapeToDims(inputShape),
             stride_width, stride_height, paddingWidth, paddingHeight,
@@ -132,6 +142,7 @@
                    int32_t stride_width, int32_t stride_height,
                    int32_t filter_width, int32_t filter_height, int32_t activation,
                    uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("maxPoolQuant8");
 
     ANDROID_NN_POOLING_PARAMETERS
 
@@ -142,6 +153,7 @@
                                   &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::MaxPool");
     tflite::optimized_ops::MaxPool(
             inputData, convertShapeToDims(inputShape),
             stride_width, stride_height, paddingWidth, paddingHeight,
diff --git a/common/operations/RNN.cpp b/common/operations/RNN.cpp
index 8a00734..4d7a4c9 100644
--- a/common/operations/RNN.cpp
+++ b/common/operations/RNN.cpp
@@ -19,11 +19,14 @@
 #include "CpuExecutor.h"
 #include "HalInterfaces.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
 RNN::RNN(const Operation& operation,
          std::vector<RunTimeOperandInfo>& operands) {
+  NNTRACE_TRANS("RNN::RNN");
   input_ = GetInput(operation, operands, kInputTensor);
   weights_ = GetInput(operation, operands, kWeightsTensor);
   recurrent_weights_ = GetInput(operation, operands, kRecurrentWeightsTensor);
@@ -41,6 +44,7 @@
                   std::vector<RunTimeOperandInfo> &operands,
                   Shape *hiddenStateShape,
                   Shape *outputShape) {
+  NNTRACE_TRANS("RNN::Prepare");
   // Check we have all the inputs and outputs we need.
   const int num_inputs = NumInputsWithValues(operation, operands);
   NN_CHECK(num_inputs == 5 || num_inputs == 6);
@@ -78,6 +82,8 @@
 }
 
 bool RNN::Eval() {
+  NNTRACE_COMP("RNN::Eval");
+
   const float* bias_ptr = reinterpret_cast<float*>(bias_->buffer);
 
   const uint32_t batch_size = input_->shape().dimensions[0];
diff --git a/common/operations/Reshape.cpp b/common/operations/Reshape.cpp
index 5803968..5a05aed 100644
--- a/common/operations/Reshape.cpp
+++ b/common/operations/Reshape.cpp
@@ -24,11 +24,14 @@
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
 bool reshapeGeneric(const void* inputData, const Shape& inputShape,
                     void* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("reshapeGeneric");
     size_t count = sizeOfData(inputShape.type, inputShape.dimensions);
     memcpy(outputData, inputData, count);
     return true;
@@ -36,6 +39,7 @@
 
 bool resizeBilinearFloat32(const float* inputData, const Shape& inputShape,
                            float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("resizeBilinearFloat32");
     int32_t height = (int32_t) getSizeOfDimension(outputShape, 1);
     int32_t width  = (int32_t) getSizeOfDimension(outputShape, 2);
 
@@ -44,6 +48,7 @@
     Shape outDimShape;
     outDimShape.dimensions = {1, 1, 1, 2};
 
+    NNTRACE_COMP_SWITCH("optimized_ops::ResizeBilinear");
     tflite::optimized_ops::ResizeBilinear(
             inputData, convertShapeToDims(inputShape),
             outDimData, convertShapeToDims(outDimShape),
@@ -54,14 +59,17 @@
 bool depthToSpaceGeneric(const uint8_t* inputData, const Shape& inputShape,
                          int32_t blockSize,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("depthToSpaceGeneric");
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
-       tflite::optimized_ops::DepthToSpace(
-                reinterpret_cast<const float*>(inputData),
-                convertShapeToDims(inputShape),
-                blockSize,
-                reinterpret_cast<float*>(outputData),
-                convertShapeToDims(outputShape));
+        NNTRACE_COMP_SWITCH("optimized_ops::DepthToSpace::float");
+        tflite::optimized_ops::DepthToSpace(
+                 reinterpret_cast<const float*>(inputData),
+                 convertShapeToDims(inputShape),
+                 blockSize,
+                 reinterpret_cast<float*>(outputData),
+                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::DepthToSpace::uint8");
         tflite::optimized_ops::DepthToSpace(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
@@ -78,7 +86,9 @@
 bool spaceToDepthGeneric(const uint8_t* inputData, const Shape& inputShape,
                          int32_t blockSize,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("spaceToDepthGeneric");
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+        NNTRACE_COMP_SWITCH("optimized_ops::SpaceToDepth::float");
         tflite::optimized_ops::SpaceToDepth(
                 reinterpret_cast<const float*>(inputData),
                 convertShapeToDims(inputShape),
@@ -86,6 +96,7 @@
                 reinterpret_cast<float*>(outputData),
                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::SpaceToDepth::uint8");
         tflite::optimized_ops::SpaceToDepth(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
@@ -102,6 +113,7 @@
 bool padGeneric(const uint8_t* inputData, const Shape& inputShape,
                 const int32_t* paddings,
                 uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("padGeneric");
     int32_t numInputDims = static_cast<int32_t>(getNumberOfDimensions(inputShape));
 
     std::vector<int> beforePadding;
@@ -113,6 +125,7 @@
     }
 
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+        NNTRACE_COMP_SWITCH("optimized_ops::Pad::float");
         tflite::optimized_ops::Pad(
                 reinterpret_cast<const float*>(inputData),
                 convertShapeToDims(inputShape),
@@ -120,6 +133,7 @@
                 reinterpret_cast<float*>(outputData),
                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::Pad::uint8");
         tflite::optimized_ops::Pad(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
@@ -136,18 +150,21 @@
 bool batchToSpaceGeneric(const uint8_t* inputData, const Shape& inputShape,
                          const int32_t* blockSize,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("batchToSpaceGeneric");
     // Needed by low level implementation, but not really used.
     tflite::Dims<4> blockSizeDim, cropsDim;
     const int32 crops[4] = {0, 0, 0, 0};
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
-       tflite::optimized_ops::BatchToSpaceND(
-                reinterpret_cast<const float*>(inputData),
-                convertShapeToDims(inputShape),
-                blockSize, blockSizeDim,
-                crops, cropsDim,
-                reinterpret_cast<float*>(outputData),
-                convertShapeToDims(outputShape));
+        NNTRACE_COMP_SWITCH("optimized_ops::BatchToSpaceND::float");
+        tflite::optimized_ops::BatchToSpaceND(
+                 reinterpret_cast<const float*>(inputData),
+                 convertShapeToDims(inputShape),
+                 blockSize, blockSizeDim,
+                 crops, cropsDim,
+                 reinterpret_cast<float*>(outputData),
+                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::BatchToSpaceND::uint8");
         tflite::optimized_ops::BatchToSpaceND(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
@@ -166,9 +183,11 @@
                          const int32_t* blockSize,
                          const int32_t* padding, const Shape& paddingShape,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("spaceToBatchGeneric");
     // Needed by low level implementation, but not really used.
     tflite::Dims<4> blockSizeDim;
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+        NNTRACE_COMP_SWITCH("optimized_ops::SpaceToBatchND::float");
         tflite::optimized_ops::SpaceToBatchND(
                 reinterpret_cast<const float*>(inputData),
                 convertShapeToDims(inputShape),
@@ -177,6 +196,7 @@
                 reinterpret_cast<float*>(outputData),
                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::SpaceToBatchND::uint8");
         tflite::optimized_ops::SpaceToBatchND(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
@@ -193,6 +213,7 @@
 
 bool squeezeGeneric(const void* inputData, const Shape& inputShape,
                     void* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("squeezeGeneric");
     size_t count = sizeOfData(inputShape.type, inputShape.dimensions);
     memcpy(outputData, inputData, count);
     return true;
@@ -201,6 +222,7 @@
 bool transposeGeneric(const uint8_t* inputData, const Shape& inputShape,
                       const int32_t* perm, const Shape& permShape,
                       uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("transposeGeneric");
     // Reverse the permuted axes and convert to 4D due to the way Dims are
     // constructed.
     const int32_t kOutputDimensionNum = 4;
@@ -215,6 +237,7 @@
         reversed_perm[k] = k;
     }
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+        NNTRACE_COMP_SWITCH("optimized_ops::Transpose::float");
         tflite::reference_ops::Transpose(
                 reinterpret_cast<const float*>(inputData),
                 convertShapeToDims(inputShape),
@@ -222,6 +245,7 @@
                 convertShapeToDims(outputShape),
                 reversed_perm);
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::Transpose::uint8");
         tflite::reference_ops::Transpose(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
diff --git a/common/operations/SVDF.cpp b/common/operations/SVDF.cpp
index 38224a6..cdba351 100644
--- a/common/operations/SVDF.cpp
+++ b/common/operations/SVDF.cpp
@@ -19,6 +19,8 @@
 #include "CpuExecutor.h"
 #include "HalInterfaces.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -38,6 +40,7 @@
 
 SVDF::SVDF(const Operation& operation,
            std::vector<RunTimeOperandInfo>& operands) {
+    NNTRACE_TRANS("SVDF::SVDF");
     input_ = GetInput(operation, operands, kInputTensor);
     weights_feature_ = GetInput(operation, operands, kWeightsFeatureTensor);
     weights_time_ = GetInput(operation, operands, kWeightsTimeTensor);
@@ -56,6 +59,7 @@
                    std::vector<RunTimeOperandInfo> &operands,
                    Shape *stateShape,
                    Shape *outputShape) {
+  NNTRACE_TRANS("SVDF::Prepare");
   // Check we have all the inputs and outputs we need.
   const int num_inputs = NumInputsWithValues(operation, operands);
 
@@ -103,6 +107,8 @@
 }
 
 bool SVDF::Eval() {
+    NNTRACE_COMP("SVDF::Eval");
+
     const int rank = params_.rank_;
     const int batch_size = SizeOfDimension(input_, 0);
     const int input_size = SizeOfDimension(input_, 1);
diff --git a/common/operations/SimpleMath.cpp b/common/operations/SimpleMath.cpp
index 045ac97..e842439 100644
--- a/common/operations/SimpleMath.cpp
+++ b/common/operations/SimpleMath.cpp
@@ -24,6 +24,8 @@
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -50,9 +52,11 @@
                 const float* in2, const Shape& shape2,
                 int32_t activation,
                 float* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("addFloat32");
     bool needBroadcast = !SameShape(shape1, shape2);
 
     if (needBroadcast) {
+        NNTRACE_COMP_SWITCH("optimized_ops::BroadcastAdd");
         #define ANDROID_NN_BROADCAST_ADD(activation)                                              \
             tflite::optimized_ops::BroadcastAdd<tflite::FusedActivationFunctionType::activation>( \
                     in1, convertShapeToDims(shape1),                                              \
@@ -66,6 +70,7 @@
         CalculateActivationRangeFloat(activation, &output_activation_min,
                                       &output_activation_max);
 
+        NNTRACE_COMP_SWITCH("optimized_ops::Add");
         tflite::optimized_ops::Add(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
@@ -80,6 +85,7 @@
                const uint8_t* in2, const Shape& shape2,
                int32_t activation,
                uint8_t* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("addQuant8");
     bool needBroadcast = !SameShape(shape1, shape2);
 
     const int32_t input1_offset = -shape1.offset;
@@ -118,6 +124,7 @@
                                   &output_activation_max);
 
     if (needBroadcast) {
+        NNTRACE_COMP_SWITCH("optimized_ops::BroadcastAdd");
         tflite::optimized_ops::BroadcastAdd(
                 left_shift,
                 in1, convertShapeToDims(shape1),
@@ -128,6 +135,7 @@
                 output_activation_min, output_activation_max,
                 out, convertShapeToDims(shapeOut));
     } else {
+        NNTRACE_COMP_SWITCH("optimized_ops::Add");
         #define ANDROID_NN_NORMAL_ADD(activation)                                        \
             tflite::optimized_ops::Add<tflite::FusedActivationFunctionType::activation>( \
                     left_shift,                                                          \
@@ -150,10 +158,12 @@
                 const float* in2, const Shape& shape2,
                 int32_t activation,
                 float* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("mulFloat32");
     bool needBroadcast = !SameShape(shape1, shape2);
 
     if (needBroadcast) {
-    #define ANDROID_NN_BROADCAST_MUL(activation)                                              \
+        NNTRACE_COMP_SWITCH("optimized_ops::BroadcastMul");
+        #define ANDROID_NN_BROADCAST_MUL(activation)                                          \
         tflite::optimized_ops::BroadcastMul<tflite::FusedActivationFunctionType::activation>( \
                 in1, convertShapeToDims(shape1),                                              \
                 in2, convertShapeToDims(shape2),                                              \
@@ -166,6 +176,7 @@
         CalculateActivationRangeFloat(activation, &output_activation_min,
                                       &output_activation_max);
 
+        NNTRACE_COMP_SWITCH("optimized_ops::Mul");
         tflite::optimized_ops::Mul(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
@@ -180,6 +191,7 @@
                const uint8_t* in2, const Shape& shape2,
                int32_t activation,
                uint8_t* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("mulQuant8");
     const int32_t input1_offset = -shape1.offset;
     const int32_t input2_offset = -shape2.offset;
     const int32_t output_offset = shapeOut.offset;
@@ -198,6 +210,7 @@
                                   &output_activation_max);
 
     // Use BROADCAST version to handle the normal case.
+    NNTRACE_COMP_SWITCH("optimized_ops::BroadcastMul");
     tflite::optimized_ops::BroadcastMul(
                 in1, convertShapeToDims(shape1), input1_offset,
                 in2, convertShapeToDims(shape2), input2_offset,
@@ -211,7 +224,9 @@
 bool floorFloat32(const float* inputData,
                   float* outputData,
                   const Shape& shape) {
+    NNTRACE_TRANS("floorFloat32");
     tflite::Dims<4> dim = convertShapeToDims(shape);
+    NNTRACE_COMP_SWITCH("optimized_ops::Floor");
     tflite::optimized_ops::Floor(inputData, dim, outputData, dim);
     return true;
 }
@@ -219,7 +234,9 @@
 bool dequantizeQuant8ToFloat32(const uint8_t* inputData,
                                float* outputData,
                                const Shape& shape) {
+    NNTRACE_TRANS("dequantizeQuant8ToFloat32");
     tflite::Dims<4> dim = convertShapeToDims(shape);
+    NNTRACE_COMP_SWITCH("optimized_ops::Dequantize");
     tflite::optimized_ops::Dequantize(inputData, dim,
                                       shape.offset, shape.scale,
                                       outputData, dim);
@@ -230,18 +247,21 @@
                 const float* in2, const Shape& shape2,
                 int32_t activation,
                 float* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("subFloat32");
     float output_activation_min, output_activation_max;
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
     bool needBroadcast = !SameShape(shape1, shape2);
     if (needBroadcast) {
+        NNTRACE_COMP_SWITCH("optimized_ops::BroadcastSub");
         tflite::optimized_ops::BroadcastSub(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
                 output_activation_min, output_activation_max,
                 out, convertShapeToDims(shapeOut));
     } else {
+        NNTRACE_COMP_SWITCH("optimized_ops::Sub");
         tflite::optimized_ops::Sub(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
@@ -255,18 +275,21 @@
                 const float* in2, const Shape& shape2,
                 int32_t activation,
                 float* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("divFloat32");
     float output_activation_min, output_activation_max;
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
     bool needBroadcast = !SameShape(shape1, shape2);
     if (needBroadcast) {
+        NNTRACE_COMP_SWITCH("optimized_ops::BroadcastDiv");
         tflite::optimized_ops::BroadcastDiv(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
                 output_activation_min, output_activation_max,
                 out, convertShapeToDims(shapeOut));
     } else {
+        NNTRACE_COMP_SWITCH("optimized_ops::Div");
         tflite::optimized_ops::Div(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
@@ -279,6 +302,7 @@
 bool meanGeneric(const uint8_t* inputData, const Shape& inputShape,
                  const int32_t* axis, const Shape& axisShape, bool keepDims,
                  uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("meanGeneric");
     // Creates a temp index to iterate through input data.
     int32_t* scratchBuffer = new int32_t[getNumberOfDimensions(inputShape)];
 
@@ -293,6 +317,7 @@
             LOG(ERROR) << "Failed to allocate tempSumBuffer for MEAN";
             result = false;
         } else {
+            NNTRACE_COMP_SWITCH("optimized_ops::Mean");
             tflite::reference_ops::Mean<float, float>(
                     const_cast<float*>(reinterpret_cast<const float*>(inputData)),
                     reinterpret_cast<const int*>(inputShape.dimensions.data()),
@@ -310,6 +335,7 @@
             LOG(ERROR) << "Failed to allocate tempSumBuffer for MEAN";
             result = false;
         } else {
+            NNTRACE_COMP_SWITCH("optimized_ops::Mean");
             tflite::reference_ops::Mean<uint8_t, int32_t>(
                     const_cast<uint8_t*>(inputData),
                     reinterpret_cast<const int*>(inputShape.dimensions.data()),
diff --git a/common/operations/StridedSlice.cpp b/common/operations/StridedSlice.cpp
index 9db9523..222c48d 100644
--- a/common/operations/StridedSlice.cpp
+++ b/common/operations/StridedSlice.cpp
@@ -23,6 +23,8 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -31,6 +33,7 @@
                          const int32_t* stridesData,
                          int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("stridedSliceGeneric");
     // This Op only supports 1-4D cases and since we use the reference 4D
     // implementation, the 1-3D tensors are mapped to 4D.
     const int kMaxDim = 4;
@@ -56,6 +59,7 @@
     endMask = ReverseMaskBits(endMask, numInputDims);
 
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+        NNTRACE_COMP_SWITCH("reference_ops::StridedSlice::float");
         tflite::reference_ops::StridedSlice(
                 reinterpret_cast<const float*>(inputData),
                 convertShapeToDims(inputShape),
@@ -64,6 +68,7 @@
                 reinterpret_cast<float*>(outputData),
                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("reference_ops::StridedSlice::uint8");
         tflite::reference_ops::StridedSlice(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),