NNAPI systrace for timing statistics

This adds systrace tracing to NNAPI. The tracing
will be helpful for:
- measuring where time is currently spent
- diagnosing and improving performance
- benchmarking

TODOs:
- Write analysis tools for traces

Change-Id: I9026f1043428cb715b577901bec3a2e1e39a82e3
Merged-In: I9026f1043428cb715b577901bec3a2e1e39a82e3
Bug: 78137932
Test: manually run systrace.py against unit tests
Test: manually run systrace.py against benchmarking app
(cherry picked from commit e9e637ab73b68b5982281a3f7c621f6a75d51743)
diff --git a/common/CpuExecutor.cpp b/common/CpuExecutor.cpp
index 8f87067..be79fa5 100644
--- a/common/CpuExecutor.cpp
+++ b/common/CpuExecutor.cpp
@@ -20,6 +20,7 @@
 
 #include "NeuralNetworks.h"
 #include "Operations.h"
+#include "Tracing.h"
 
 #include "Eigen/Core"
 #include <omp.h>
@@ -189,12 +190,14 @@
 int CpuExecutor::run(const V1_0::Model& model, const Request& request,
                      const std::vector<RunTimePoolInfo>& modelPoolInfos,
                      const std::vector<RunTimePoolInfo>& requestPoolInfos) {
+    NNTRACE_CPU(NNTRACE_PHASE_EXECUTION, "run::V1_0");
     return run(convertToV1_1(model), request, modelPoolInfos, requestPoolInfos);
 }
 
 int CpuExecutor::run(const V1_1::Model& model, const Request& request,
                      const std::vector<RunTimePoolInfo>& modelPoolInfos,
                      const std::vector<RunTimePoolInfo>& requestPoolInfos) {
+    NNTRACE_CPU(NNTRACE_PHASE_EXECUTION, "run::V1_1");
     VLOG(CPUEXE) << "CpuExecutor::run() with request("
                  << SHOW_IF_DEBUG(toString(request)) << ")";
 
diff --git a/common/ValidateHal.cpp b/common/ValidateHal.cpp
index 0955178..8016b1f 100644
--- a/common/ValidateHal.cpp
+++ b/common/ValidateHal.cpp
@@ -18,6 +18,7 @@
 
 #include "ValidateHal.h"
 #include "NeuralNetworks.h"
+#include "Tracing.h"
 #include "Utils.h"
 
 #include <android-base/logging.h>
@@ -405,6 +406,8 @@
 
 template<typename VersionedModel>
 static bool validateModelVersioned(const VersionedModel& model) {
+    NNTRACE_FULL(NNTRACE_LAYER_UTILITY, NNTRACE_PHASE_UNSPECIFIED,
+                 "validateModelVersioned");
     return (validateOperands(model.operands, model.operandValues, model.pools) &&
             validateOperations(model.operations, model.operands) &&
             validateModelInputOutputs(model.inputIndexes, model.operands,
diff --git a/common/include/Tracing.h b/common/include/Tracing.h
new file mode 100644
index 0000000..c2eae7a
--- /dev/null
+++ b/common/include/Tracing.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_ML_NN_COMMON_TRACING_H
+#define ANDROID_ML_NN_COMMON_TRACING_H
+
+#define ATRACE_TAG ATRACE_TAG_NNAPI
+#include "utils/Trace.h"
+
+// Neural Networks API (NNAPI) systracing
+//
+// The primary goal of the tracing is to capture and present timings for NNAPI.
+// (Other uses include providing visibility into the split of execution between
+// drivers and the CPU fallback, and the ability to visualize call sequences.)
+//
+// The tracing has three parts:
+//  1 Trace macros defined in this file and used throughout the codebase,
+//    modelled after and using atrace. These implement a naming convention for
+//    the tracepoints, interpreted by the systrace parser.
+//  2 Android systrace (atrace) on-device capture and host-based analysis.
+//  3 A systrace parser (TODO) to summarize the timings.
+//
+// For an overview and introduction, please refer to the "NNAPI Systrace design
+// and HOWTO" (internal Docs for now). This header doesn't try to replicate all
+// the information in that document.
+//
+// Glossary:
+// - Phase: stage in processing (e.g., Preparation, Compilation, Execution).
+//   The Overall phase nests all the others, and Execution nests Input/Output,
+//   Transformation, Computation and Results; otherwise phases are not nested
+//   (Initialization-phase functions may occur inside other phases but are
+//   counted out during analysis). Nested phases (other than Initialization)
+//   are analysed as a breakdown of the parent phase.
+// - Layer: component in the stack (from top to bottom: App, Runtime, IPC,
+//   Driver/CPU). Calls to lower layers are typically nested within calls to upper
+//   layers.
+// - Bucket: unit of timing analysis, the combination of Phase and Layer (and
+//   thus also typically nested).
+// - Detail: specific unit being executed, typically a function.
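+//
+// For example, NNTRACE_COMP("reluFloat32") (defined below) yields the trace
+// point name "[NN_LC_PCO]reluFloat32": Layer CPU ("LC"), Phase Computation
+// ("PCO"), detail "reluFloat32". The systrace parser decodes these names back
+// into buckets.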
+
+// Convenience macros to be used in the code (phases are defined below).
+// (These are macros so that the string concatenation is done at compile time.)
+//
+// These exist in three variants:
+// - Simple (NNTRACE_<layer and potentially phase>) - to be used when only one
+//   Phase is active within a scope.
+// - "Switch" (NNTRACE_<...>_SWITCH) - to be used when multiple Phases
+//   share a scope (e.g., transformation of data and computation in the same
+//   function).
+// - "Subtract" (NNTRACE_<...>_SUBTRACT) - to be used when nesting is violated
+//   and the time should be subtracted from the parent scope.
+// Arguments:
+// - phase: one of the NNTRACE_PHASE_* macros defined below.
+// - detail: free-form string constant, typically function name.
+// Example usage:
+//   // Simple
+//   int ANeuralNetworksMemory_createFromFd(...) {
+//     NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksMemory_createFromFd");
+//   }
+//   // Switch
+//   bool concatenationFloat32(...) {
+//     NNTRACE_TRANS("concatenationFloat32");  // Transformation of data begins
+//     ...
+//     NNTRACE_COMP_SWITCH("optimized_ops::Concatenation"); // Transformation
+//                                                          // ends and computation
+//                                                          // begins
+//   }
+//   // Subtract
+//   static int compile(...) {
+//     NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "prepareModel");
+//     device->getInterface()->prepareModel(..., preparedModelCallback);
+//     preparedModelCallback->wait();
+//   }
+//   ErrorStatus VersionedIDevice::prepareModel(...) {
+//     ... IPC work ...
+//     {
+//       NNTRACE_FULL_SUBTRACT(NNTRACE_LAYER_RUNTIME, NNTRACE_PHASE_COMPILATION,
+//                             "VersionedIDevice::prepareModel");
+//       ... Runtime work ...
+//     }
+//     ... IPC work ...
+//   }
+//
+// Layer Application - For native applications (e.g., unit tests)
+#define NNTRACE_APP(phase, detail) NNTRACE_FULL(NNTRACE_LAYER_APPLICATION, phase, detail)
+#define NNTRACE_APP_SWITCH(phase, detail) \
+        NNTRACE_FULL_SWITCH(NNTRACE_LAYER_APPLICATION, phase, detail)
+// Layer Runtime - For the NNAPI runtime
+#define NNTRACE_RT(phase, detail) NNTRACE_FULL(NNTRACE_LAYER_RUNTIME, phase, detail)
+#define NNTRACE_RT_SWITCH(phase, detail) NNTRACE_FULL_SWITCH(NNTRACE_LAYER_RUNTIME, phase, detail)
+// Layer CPU - CPU executor
+#define NNTRACE_CPU(phase, detail) NNTRACE_FULL(NNTRACE_LAYER_CPU, phase, detail)
+#define NNTRACE_COMP(detail) NNTRACE_FULL(NNTRACE_LAYER_CPU, \
+                                          NNTRACE_PHASE_COMPUTATION, detail)
+#define NNTRACE_COMP_SWITCH(detail) NNTRACE_FULL_SWITCH(NNTRACE_LAYER_CPU, \
+                                                        NNTRACE_PHASE_COMPUTATION, detail)
+#define NNTRACE_TRANS(detail) NNTRACE_FULL(NNTRACE_LAYER_CPU, \
+                                           NNTRACE_PHASE_TRANSFORMATION, detail)
+
+// Fully specified macros to be used when no convenience wrapper exists for your
+// need.
+#define NNTRACE_FULL(layer, phase, detail) NNTRACE_NAME_1(("[NN_" layer "_" phase "]" detail))
+#define NNTRACE_FULL_SWITCH(layer, phase, detail) \
+        NNTRACE_NAME_SWITCH(("[SW][NN_" layer "_" phase "]" detail))
+#define NNTRACE_FULL_SUBTRACT(layer, phase, detail) \
+        NNTRACE_NAME_1(("[SUB][NN_" layer "_" phase "]" detail))
+// Raw macro without scoping requirements, for special cases
+#define NNTRACE_FULL_RAW(layer, phase, detail) android::ScopedTrace PASTE(___tracer, __LINE__) \
+        (ATRACE_TAG, ("[NN_" layer "_" phase "]" detail))
+
+// Tracing buckets - the units over which timing summaries are calculated.
+//
+// Phases
+#define NNTRACE_PHASE_OVERALL "PO"      // Overall program, e.g., one benchmark case
+#define NNTRACE_PHASE_INITIALIZATION "PI" // Initialization - not related to a model
+#define NNTRACE_PHASE_PREPARATION "PP"  // Model construction
+#define NNTRACE_PHASE_COMPILATION "PC"  // Model compilation
+#define NNTRACE_PHASE_EXECUTION "PE"    // Executing the model
+#define NNTRACE_PHASE_TERMINATION "PT"  // Tearing down
+#define NNTRACE_PHASE_UNSPECIFIED "PU"  // Helper code called from multiple phases
+// Subphases of execution
+#define NNTRACE_PHASE_INPUTS_AND_OUTPUTS "PIO"  // Setting inputs/outputs and allocating buffers
+#define NNTRACE_PHASE_TRANSFORMATION "PTR"      // Transforming data for computation
+#define NNTRACE_PHASE_COMPUTATION "PCO"         // Computing operations' outputs
+#define NNTRACE_PHASE_RESULTS "PR"              // Reading out results
+// Layers
+#define NNTRACE_LAYER_APPLICATION "LA"
+#define NNTRACE_LAYER_RUNTIME "LR"
+#define NNTRACE_LAYER_IPC "LI"
+#define NNTRACE_LAYER_DRIVER "LD"
+#define NNTRACE_LAYER_CPU "LC"
+#define NNTRACE_LAYER_OTHER "LO"
+#define NNTRACE_LAYER_UTILITY "LU"              // Code used from multiple layers
+
+
+// Implementation
+//
+// Almost the same as ATRACE_NAME, but enforces an explicit distinction between
+// phase-per-scope and switching phases.
+//
+// Basic trace; only one is allowed per scope, to enforce disjointness.
+#define NNTRACE_NAME_1(name) android::ScopedTrace ___tracer_1(ATRACE_TAG, name)
+// Switching trace, more than one per scope allowed, translated by
+// systrace_parser.py. This is mainly useful for tracing multiple phases through
+// one function / scope.
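+// Using a switch trace with no basic trace in scope is a compile-time error:
+// the switch macro references ___tracer_1, which only the basic trace
+// declares.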
+#define NNTRACE_NAME_SWITCH(name) android::ScopedTrace PASTE(___tracer, __LINE__) \
+        (ATRACE_TAG, name); \
+        (void)___tracer_1  // ensure switch is only used after a basic trace
+
+
+// Disallow use of raw ATRACE macros
+#undef ATRACE_NAME
+#undef ATRACE_CALL
+
+#endif // ANDROID_ML_NN_COMMON_TRACING_H
diff --git a/common/operations/Activation.cpp b/common/operations/Activation.cpp
index b80984d..5da58f9 100644
--- a/common/operations/Activation.cpp
+++ b/common/operations/Activation.cpp
@@ -19,11 +19,14 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
 bool reluFloat32(const float* inputData, const Shape& inputShape,
                  float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("reluFloat32");
     int numElements = getNumberOfElements(inputShape);
     for (int i=0; i<numElements; i++, inputData++, outputData++) {
         *outputData = std::max(0.f, *inputData);
@@ -33,6 +36,7 @@
 
 bool relu1Float32(const float* inputData, const Shape& inputShape,
                   float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("relu1Float32");
     int numElements = getNumberOfElements(inputShape);
     for (int i=0; i<numElements; i++, inputData++, outputData++) {
         *outputData = std::min(std::max(-1.f, *inputData), 1.f);
@@ -42,6 +46,7 @@
 
 bool relu6Float32(const float* inputData, const Shape& inputShape,
                   float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("relu6Float32");
     int numElements = getNumberOfElements(inputShape);
     for (int i=0; i<numElements; i++, inputData++, outputData++) {
         *outputData = std::min(std::max(0.f, *inputData), 6.f);
@@ -51,6 +56,7 @@
 
 bool tanhFloat32(const float* inputData, const Shape& inputShape,
                  float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("tanhFloat32");
     int numElements = getNumberOfElements(inputShape);
     for (int i=0; i<numElements; i++, inputData++, outputData++) {
         *outputData = std::tanh(*inputData);
@@ -60,6 +66,7 @@
 
 bool logisticFloat32(const float* inputData, const Shape& inputShape,
                      float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("logisticFloat32");
     int numElements = getNumberOfElements(inputShape);
     for (int i=0; i<numElements; i++, inputData++, outputData++) {
         *outputData = 1.f / (1.f + std::exp(-*inputData));
@@ -70,6 +77,7 @@
 bool softmaxFloat32(const float* inputData, const Shape& inputShape,
                     const float beta,
                     float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("softmaxFloat32");
     tflite::Dims<4> dim;
     if (getNumberOfDimensions(inputShape) == 2) {
         uint32_t batch_size = getSizeOfDimension(inputShape, 0);
@@ -85,6 +93,7 @@
         return false;
     }
 
+    NNTRACE_COMP_SWITCH("optimized_ops::Softmax");
     tflite::optimized_ops::Softmax(inputData, dim, beta,
                                    outputData, dim);
     return true;
@@ -107,18 +116,21 @@
 
 bool reluQuant8(const uint8_t* inputData, const Shape& inputShape,
                 uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("reluQuant8");
     ANDROID_NN_RELUX_QUANT8(kActivationRelu)
     return true;
 }
 
 bool relu1Quant8(const uint8_t* inputData, const Shape& inputShape,
                  uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("relu1Quant8");
     ANDROID_NN_RELUX_QUANT8(kActivationRelu1)
     return true;
 }
 
 bool relu6Quant8(const uint8_t* inputData, const Shape& inputShape,
                  uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("relu6Quant8");
     ANDROID_NN_RELUX_QUANT8(kActivationRelu6)
     return true;
 }
@@ -127,6 +139,7 @@
 
 bool logisticQuant8(const uint8_t* inputData, const Shape& inputShape,
                     uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("logisticQuant8");
     if (outputShape.offset != 0 || outputShape.scale != 1.f / 256) {
         LOG(ERROR) << "incorrect scale / offset for output";
         return false;
@@ -149,6 +162,7 @@
     int32_t input_range_radius =
             CalculateInputRadius(kInputIntegerBits, input_left_shift);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::Logistic");
     tflite::optimized_ops::Logistic(
             inputData, convertShapeToDims(inputShape),
             inputShape.offset, input_range_radius,
@@ -161,6 +175,7 @@
 bool softmaxQuant8(const uint8_t* inputData, const Shape& inputShape,
                    const float beta,
                    uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("softmaxQuant8");
     tflite::Dims<4> dim;
     if (getNumberOfDimensions(inputShape) == 2) {
         uint32_t batch_size = getSizeOfDimension(inputShape, 0);
@@ -196,6 +211,7 @@
     float diff_min = -1.0f * CalculateInputRadius(kScaledDiffIntegerBits,
                                                   input_left_shift);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::Softmax");
     tflite::optimized_ops::Softmax(inputData, dim, input_multiplier,
                                    input_left_shift, diff_min,
                                    outputData, dim);
diff --git a/common/operations/Concatenation.cpp b/common/operations/Concatenation.cpp
index fc65e43..cc1fa5a 100644
--- a/common/operations/Concatenation.cpp
+++ b/common/operations/Concatenation.cpp
@@ -19,12 +19,15 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
 bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs,
                           const std::vector<Shape>& inputShapes, int32_t axis,
                           float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("concatenationFloat32");
     int num_inputs = inputShapes.size();
     std::vector<tflite::Dims<4>*> inputDimsPtr(num_inputs);
     std::vector<tflite::Dims<4> > inputDims(num_inputs);
@@ -33,6 +36,7 @@
         inputDimsPtr[i] = &inputDims[i];
     }
 
+    NNTRACE_COMP_SWITCH("optimized_ops::Concatenation");
     tflite::optimized_ops::Concatenation<tflite::FusedActivationFunctionType::kNone, float>(
             getNumberOfDimensions(outputShape) - axis - 1,
             inputDataPtrs.data(), inputDimsPtr.data(), num_inputs,
@@ -44,6 +48,7 @@
 bool concatenationQuant8(const std::vector<const uint8_t*>& inputDataPtrs,
                          const std::vector<Shape>& inputShapes, int32_t axis,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("concatenationQuant8");
     int num_inputs = inputShapes.size();
     std::vector<tflite::Dims<4>*> inputDimsPtr(num_inputs);
     std::vector<tflite::Dims<4> > inputDims(num_inputs);
@@ -52,6 +57,7 @@
         inputDimsPtr[i] = &inputDims[i];
     }
 
+    NNTRACE_COMP_SWITCH("optimized_ops::Concatenation");
     tflite::optimized_ops::Concatenation<tflite::FusedActivationFunctionType::kNone, uint8_t>(
             getNumberOfDimensions(outputShape) - axis - 1,
             inputDataPtrs.data(), inputDimsPtr.data(), num_inputs,
diff --git a/common/operations/Conv2D.cpp b/common/operations/Conv2D.cpp
index c16426c..60f9132 100644
--- a/common/operations/Conv2D.cpp
+++ b/common/operations/Conv2D.cpp
@@ -19,6 +19,8 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -84,6 +86,7 @@
                  int32_t stride_width, int32_t stride_height,
                  int32_t activation,
                  float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("convFloat32");
 
     ANDROID_NN_CONV_PARAMETERS(float)
 
@@ -95,6 +98,7 @@
 
     // Prevent concurrent executions that may access the scratch buffer.
     std::unique_lock<std::mutex> lock(executionMutex);
+    NNTRACE_COMP_SWITCH("optimized_ops::Conv");
     tflite::optimized_ops::Conv(
             inputData, convertShapeToDims(inputShape),
             filterData, convertShapeToDims(filterShape),
@@ -116,6 +120,7 @@
                 int32_t stride_width, int32_t stride_height,
                 int32_t activation,
                 uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("convQuant8");
 
     ANDROID_NN_CONV_PARAMETERS(uint8_t)
 
@@ -146,6 +151,8 @@
     std::unique_lock<std::mutex> lock(executionMutex);
     // Alow gemmlowp automatically decide how many threads to use.
     gemm_context.set_max_num_threads(0);
+
+    NNTRACE_COMP_SWITCH("optimized_ops::Conv");
     tflite::optimized_ops::Conv(
             inputData, convertShapeToDims(inputShape), inputOffset,
             filterData, convertShapeToDims(filterShape), filterOffset,
diff --git a/common/operations/DepthwiseConv2D.cpp b/common/operations/DepthwiseConv2D.cpp
index 5dd67e2..60c5e41 100644
--- a/common/operations/DepthwiseConv2D.cpp
+++ b/common/operations/DepthwiseConv2D.cpp
@@ -20,6 +20,8 @@
 #include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -42,6 +44,7 @@
                           int32_t stride_width, int32_t stride_height,
                           int32_t depth_multiplier, int32_t activation,
                           float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("depthwiseConvFloat32");
 
     ANDROID_NN_DEPTHWISE_CONV_PARAMETERS
 
@@ -49,6 +52,7 @@
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::DepthwiseConv");
     tflite::optimized_ops::DepthwiseConv(
             inputData, convertShapeToDims(inputShape),
             filterData, convertShapeToDims(filterShape),
@@ -70,6 +74,7 @@
                          int32_t stride_width, int32_t stride_height,
                          int32_t depth_multiplier, int32_t activation,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("depthwiseConvQuant8");
 
     ANDROID_NN_DEPTHWISE_CONV_PARAMETERS
 
@@ -94,6 +99,7 @@
     uint32_t filterOffset = -filterShape.offset;
     uint32_t outputOffset = outputShape.offset;
 
+    NNTRACE_COMP_SWITCH("optimized_ops::DepthwiseConv");
     tflite::optimized_ops::DepthwiseConv(
             inputData, convertShapeToDims(inputShape), inputOffset,
             filterData, convertShapeToDims(filterShape), filterOffset,
diff --git a/common/operations/EmbeddingLookup.cpp b/common/operations/EmbeddingLookup.cpp
index 504c684..9f231ed 100644
--- a/common/operations/EmbeddingLookup.cpp
+++ b/common/operations/EmbeddingLookup.cpp
@@ -20,6 +20,8 @@
 #include "HalInterfaces.h"
 #include "Operations.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -32,6 +34,7 @@
 }
 
 bool EmbeddingLookup::Eval() {
+  NNTRACE_COMP("EmbeddingLookup::Eval");
   const int row_size = value_->shape().dimensions[0];
   const int total_bytes = sizeOfData(value_->type, value_->dimensions);
   const int row_bytes = total_bytes/row_size;
diff --git a/common/operations/FullyConnected.cpp b/common/operations/FullyConnected.cpp
index 4e2deff..12da51b 100644
--- a/common/operations/FullyConnected.cpp
+++ b/common/operations/FullyConnected.cpp
@@ -20,6 +20,8 @@
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -33,6 +35,7 @@
                            const float* biasData, const Shape& biasShape,
                            int32_t activation,
                            float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("fullyConnectedFloat32");
     float output_activation_min, output_activation_max;
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
@@ -42,6 +45,7 @@
     uint32_t batch_size = getSizeOfDimension(outputShape, 0);
     uint32_t input_n_elements = getNumberOfElements(inputShape);
     if (batch_size * batch_size == input_n_elements) {
+        NNTRACE_COMP_SWITCH("reference_ops::FullyConnected");
         tflite::reference_ops::FullyConnected(
                 inputData, convertShapeToDims(inputShape),
                 weightsData, convertShapeToDims(weightsShape),
@@ -49,6 +53,7 @@
                 output_activation_min, output_activation_max,
                 outputData, convertShapeToDims(outputShape));
     } else {
+        NNTRACE_COMP_SWITCH("optimized_ops::FullyConnected");
         tflite::optimized_ops::FullyConnected(
                 inputData, convertShapeToDims(inputShape),
                 weightsData, convertShapeToDims(weightsShape),
@@ -64,6 +69,7 @@
                           const int32_t* biasData, const Shape& biasShape,
                           int32_t activation,
                           uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("fullyConnectedQuant8");
     int32_t inputOffset = -inputShape.offset;
     int32_t weightsOffset = -weightsShape.offset;
     int32_t outputOffset = outputShape.offset;
@@ -91,6 +97,7 @@
     // Alow gemmlowp automatically decide how many threads to use.
     gemm_context.set_max_num_threads(0);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::FullyConnected");
     tflite::optimized_ops::FullyConnected(
             inputData, convertShapeToDims(inputShape), inputOffset,
             weightsData, convertShapeToDims(weightsShape), weightsOffset,
diff --git a/common/operations/HashtableLookup.cpp b/common/operations/HashtableLookup.cpp
index 1c8d802..e864b3d 100644
--- a/common/operations/HashtableLookup.cpp
+++ b/common/operations/HashtableLookup.cpp
@@ -20,6 +20,8 @@
 #include "HalInterfaces.h"
 #include "Operations.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -42,6 +44,7 @@
 }
 
 bool HashtableLookup::Eval() {
+  NNTRACE_COMP("HashtableLookup::Eval");
   const int num_rows = value_->shape().dimensions[0];
   const int row_bytes = sizeOfData(value_->type, value_->dimensions) / num_rows;
   void* pointer = nullptr;
diff --git a/common/operations/LSHProjection.cpp b/common/operations/LSHProjection.cpp
index 57d1475..97183e2 100644
--- a/common/operations/LSHProjection.cpp
+++ b/common/operations/LSHProjection.cpp
@@ -20,6 +20,8 @@
 #include "HalInterfaces.h"
 #include "util/hash/farmhash.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -141,6 +143,8 @@
 }
 
 bool LSHProjection::Eval() {
+  NNTRACE_COMP("LSHProjection::Eval");
+
   int32_t* out_buf = reinterpret_cast<int32_t*>(output_->buffer);
 
   switch (type_) {
diff --git a/common/operations/LSTM.cpp b/common/operations/LSTM.cpp
index 80b0eb9..7667c66 100644
--- a/common/operations/LSTM.cpp
+++ b/common/operations/LSTM.cpp
@@ -19,6 +19,8 @@
 #include "CpuExecutor.h"
 #include "HalInterfaces.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -300,6 +302,8 @@
 }
 
 bool LSTMCell::Eval() {
+  NNTRACE_COMP("LSTMCell::Eval");
+
   const uint32_t n_batch = input_->shape().dimensions[0];
   const uint32_t n_input = input_->shape().dimensions[1];
   // n_cell and n_output will be the same size when there is no projection.
diff --git a/common/operations/Normalization.cpp b/common/operations/Normalization.cpp
index eccb3bd..7f36dbf 100644
--- a/common/operations/Normalization.cpp
+++ b/common/operations/Normalization.cpp
@@ -19,11 +19,14 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
 bool l2normFloat32(const float* inputData, const Shape& inputShape,
                    float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("optimized_ops::L2Normalization::float");
     tflite::optimized_ops::L2Normalization<tflite::FusedActivationFunctionType::kNone>(
             inputData, convertShapeToDims(inputShape),
             outputData, convertShapeToDims(outputShape));
@@ -33,6 +36,7 @@
 
 bool l2normQuant8(const uint8_t* inputData, const Shape& inputShape,
                   uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("optimized_ops::L2Normalization::uint8");
     tflite::optimized_ops::L2Normalization(
             inputData, convertShapeToDims(inputShape),
             inputShape.offset,
@@ -44,6 +48,7 @@
 bool localResponseNormFloat32(const float* inputData, const Shape& inputShape,
                               int32_t radius, float bias, float alpha, float beta,
                               float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("optimized_ops::LocalResponseNormalization::float");
     tflite::optimized_ops::LocalResponseNormalization(
             inputData, convertShapeToDims(inputShape),
             radius, bias, alpha, beta,
diff --git a/common/operations/Pooling.cpp b/common/operations/Pooling.cpp
index db4497a..7f60323 100644
--- a/common/operations/Pooling.cpp
+++ b/common/operations/Pooling.cpp
@@ -19,6 +19,8 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -37,6 +39,7 @@
                         int32_t stride_width, int32_t stride_height,
                         int32_t filter_width, int32_t filter_height, int32_t activation,
                         float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("averagePoolFloat32");
 
     ANDROID_NN_POOLING_PARAMETERS
 
@@ -44,6 +47,7 @@
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::AveragePool");
     tflite::optimized_ops::AveragePool(
             inputData, convertShapeToDims(inputShape),
             stride_width, stride_height, paddingWidth, paddingHeight,
@@ -60,6 +64,7 @@
                        int32_t stride_width, int32_t stride_height,
                        int32_t filter_width, int32_t filter_height, int32_t activation,
                        uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("averagePoolQuant8");
 
     ANDROID_NN_POOLING_PARAMETERS
 
@@ -70,6 +75,7 @@
                                   &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::AveragePool");
     tflite::optimized_ops::AveragePool(
             inputData, convertShapeToDims(inputShape),
             stride_width, stride_height, paddingWidth, paddingHeight,
@@ -86,6 +92,7 @@
                    int32_t stride_width, int32_t stride_height,
                    int32_t filter_width, int32_t filter_height, int32_t activation,
                    float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("l2PoolFloat32");
 
     ANDROID_NN_POOLING_PARAMETERS
 
@@ -93,6 +100,7 @@
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::L2Pool");
     tflite::optimized_ops::L2Pool(
             inputData, convertShapeToDims(inputShape),
             stride_width, stride_height, paddingWidth, paddingHeight,
@@ -109,6 +117,7 @@
                     int32_t stride_width, int32_t stride_height,
                     int32_t filter_width, int32_t filter_height, int32_t activation,
                     float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("maxPoolFloat32");
 
     ANDROID_NN_POOLING_PARAMETERS
 
@@ -116,6 +125,7 @@
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::MaxPool");
     tflite::optimized_ops::MaxPool(
             inputData, convertShapeToDims(inputShape),
             stride_width, stride_height, paddingWidth, paddingHeight,
@@ -132,6 +142,7 @@
                    int32_t stride_width, int32_t stride_height,
                    int32_t filter_width, int32_t filter_height, int32_t activation,
                    uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("maxPoolQuant8");
 
     ANDROID_NN_POOLING_PARAMETERS
 
@@ -142,6 +153,7 @@
                                   &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::MaxPool");
     tflite::optimized_ops::MaxPool(
             inputData, convertShapeToDims(inputShape),
             stride_width, stride_height, paddingWidth, paddingHeight,
diff --git a/common/operations/RNN.cpp b/common/operations/RNN.cpp
index 8a00734..4d7a4c9 100644
--- a/common/operations/RNN.cpp
+++ b/common/operations/RNN.cpp
@@ -19,11 +19,14 @@
 #include "CpuExecutor.h"
 #include "HalInterfaces.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
 RNN::RNN(const Operation& operation,
          std::vector<RunTimeOperandInfo>& operands) {
+  NNTRACE_TRANS("RNN::RNN");
   input_ = GetInput(operation, operands, kInputTensor);
   weights_ = GetInput(operation, operands, kWeightsTensor);
   recurrent_weights_ = GetInput(operation, operands, kRecurrentWeightsTensor);
@@ -41,6 +44,7 @@
                   std::vector<RunTimeOperandInfo> &operands,
                   Shape *hiddenStateShape,
                   Shape *outputShape) {
+  NNTRACE_TRANS("RNN::Prepare");
   // Check we have all the inputs and outputs we need.
   const int num_inputs = NumInputsWithValues(operation, operands);
   NN_CHECK(num_inputs == 5 || num_inputs == 6);
@@ -78,6 +82,8 @@
 }
 
 bool RNN::Eval() {
+  NNTRACE_COMP("RNN::Eval");
+
   const float* bias_ptr = reinterpret_cast<float*>(bias_->buffer);
 
   const uint32_t batch_size = input_->shape().dimensions[0];
diff --git a/common/operations/Reshape.cpp b/common/operations/Reshape.cpp
index 5803968..5a05aed 100644
--- a/common/operations/Reshape.cpp
+++ b/common/operations/Reshape.cpp
@@ -24,11 +24,14 @@
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
 bool reshapeGeneric(const void* inputData, const Shape& inputShape,
                     void* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("reshapeGeneric");
     size_t count = sizeOfData(inputShape.type, inputShape.dimensions);
     memcpy(outputData, inputData, count);
     return true;
@@ -36,6 +39,7 @@
 
 bool resizeBilinearFloat32(const float* inputData, const Shape& inputShape,
                            float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("resizeBilinearFloat32");
     int32_t height = (int32_t) getSizeOfDimension(outputShape, 1);
     int32_t width  = (int32_t) getSizeOfDimension(outputShape, 2);
 
@@ -44,6 +48,7 @@
     Shape outDimShape;
     outDimShape.dimensions = {1, 1, 1, 2};
 
+    NNTRACE_COMP_SWITCH("optimized_ops::ResizeBilinear");
     tflite::optimized_ops::ResizeBilinear(
             inputData, convertShapeToDims(inputShape),
             outDimData, convertShapeToDims(outDimShape),
@@ -54,14 +59,17 @@
 bool depthToSpaceGeneric(const uint8_t* inputData, const Shape& inputShape,
                          int32_t blockSize,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("depthToSpaceGeneric");
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
-       tflite::optimized_ops::DepthToSpace(
-                reinterpret_cast<const float*>(inputData),
-                convertShapeToDims(inputShape),
-                blockSize,
-                reinterpret_cast<float*>(outputData),
-                convertShapeToDims(outputShape));
+        NNTRACE_COMP_SWITCH("optimized_ops::DepthToSpace::float");
+        tflite::optimized_ops::DepthToSpace(
+                 reinterpret_cast<const float*>(inputData),
+                 convertShapeToDims(inputShape),
+                 blockSize,
+                 reinterpret_cast<float*>(outputData),
+                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::DepthToSpace::uint8");
         tflite::optimized_ops::DepthToSpace(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
@@ -78,7 +86,9 @@
 bool spaceToDepthGeneric(const uint8_t* inputData, const Shape& inputShape,
                          int32_t blockSize,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("spaceToDepthGeneric");
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+        NNTRACE_COMP_SWITCH("optimized_ops::SpaceToDepth::float");
         tflite::optimized_ops::SpaceToDepth(
                 reinterpret_cast<const float*>(inputData),
                 convertShapeToDims(inputShape),
@@ -86,6 +96,7 @@
                 reinterpret_cast<float*>(outputData),
                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::SpaceToDepth::uint8");
         tflite::optimized_ops::SpaceToDepth(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
@@ -102,6 +113,7 @@
 bool padGeneric(const uint8_t* inputData, const Shape& inputShape,
                 const int32_t* paddings,
                 uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("padGeneric");
     int32_t numInputDims = static_cast<int32_t>(getNumberOfDimensions(inputShape));
 
     std::vector<int> beforePadding;
@@ -113,6 +125,7 @@
     }
 
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+        NNTRACE_COMP_SWITCH("optimized_ops::Pad::float");
         tflite::optimized_ops::Pad(
                 reinterpret_cast<const float*>(inputData),
                 convertShapeToDims(inputShape),
@@ -120,6 +133,7 @@
                 reinterpret_cast<float*>(outputData),
                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::Pad::uint8");
         tflite::optimized_ops::Pad(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
@@ -136,18 +150,21 @@
 bool batchToSpaceGeneric(const uint8_t* inputData, const Shape& inputShape,
                          const int32_t* blockSize,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("batchToSpaceGeneric");
     // Needed by low level implementation, but not really used.
     tflite::Dims<4> blockSizeDim, cropsDim;
     const int32 crops[4] = {0, 0, 0, 0};
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
-       tflite::optimized_ops::BatchToSpaceND(
-                reinterpret_cast<const float*>(inputData),
-                convertShapeToDims(inputShape),
-                blockSize, blockSizeDim,
-                crops, cropsDim,
-                reinterpret_cast<float*>(outputData),
-                convertShapeToDims(outputShape));
+        NNTRACE_COMP_SWITCH("optimized_ops::BatchToSpaceND::float");
+        tflite::optimized_ops::BatchToSpaceND(
+                 reinterpret_cast<const float*>(inputData),
+                 convertShapeToDims(inputShape),
+                 blockSize, blockSizeDim,
+                 crops, cropsDim,
+                 reinterpret_cast<float*>(outputData),
+                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::BatchToSpaceND::uint8");
         tflite::optimized_ops::BatchToSpaceND(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
@@ -166,9 +183,11 @@
                          const int32_t* blockSize,
                          const int32_t* padding, const Shape& paddingShape,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("spaceToBatchGeneric");
     // Needed by low level implementation, but not really used.
     tflite::Dims<4> blockSizeDim;
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+        NNTRACE_COMP_SWITCH("optimized_ops::SpaceToBatchND::float");
         tflite::optimized_ops::SpaceToBatchND(
                 reinterpret_cast<const float*>(inputData),
                 convertShapeToDims(inputShape),
@@ -177,6 +196,7 @@
                 reinterpret_cast<float*>(outputData),
                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::SpaceToBatchND::uint8");
         tflite::optimized_ops::SpaceToBatchND(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
@@ -193,6 +213,7 @@
 
 bool squeezeGeneric(const void* inputData, const Shape& inputShape,
                     void* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("squeezeGeneric");
     size_t count = sizeOfData(inputShape.type, inputShape.dimensions);
     memcpy(outputData, inputData, count);
     return true;
@@ -201,6 +222,7 @@
 bool transposeGeneric(const uint8_t* inputData, const Shape& inputShape,
                       const int32_t* perm, const Shape& permShape,
                       uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("transposeGeneric");
     // Reverse the permuted axes and convert to 4D due to the way Dims are
     // constructed.
     const int32_t kOutputDimensionNum = 4;
@@ -215,6 +237,7 @@
         reversed_perm[k] = k;
     }
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+        NNTRACE_COMP_SWITCH("optimized_ops::Transpose::float");
         tflite::reference_ops::Transpose(
                 reinterpret_cast<const float*>(inputData),
                 convertShapeToDims(inputShape),
@@ -222,6 +245,7 @@
                 convertShapeToDims(outputShape),
                 reversed_perm);
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::Transpose::uint8");
         tflite::reference_ops::Transpose(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
diff --git a/common/operations/SVDF.cpp b/common/operations/SVDF.cpp
index 38224a6..cdba351 100644
--- a/common/operations/SVDF.cpp
+++ b/common/operations/SVDF.cpp
@@ -19,6 +19,8 @@
 #include "CpuExecutor.h"
 #include "HalInterfaces.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -38,6 +40,7 @@
 
 SVDF::SVDF(const Operation& operation,
            std::vector<RunTimeOperandInfo>& operands) {
+    NNTRACE_TRANS("SVDF::SVDF");
     input_ = GetInput(operation, operands, kInputTensor);
     weights_feature_ = GetInput(operation, operands, kWeightsFeatureTensor);
     weights_time_ = GetInput(operation, operands, kWeightsTimeTensor);
@@ -56,6 +59,7 @@
                    std::vector<RunTimeOperandInfo> &operands,
                    Shape *stateShape,
                    Shape *outputShape) {
+  NNTRACE_TRANS("SVDF::Prepare");
   // Check we have all the inputs and outputs we need.
   const int num_inputs = NumInputsWithValues(operation, operands);
 
@@ -103,6 +107,8 @@
 }
 
 bool SVDF::Eval() {
+    NNTRACE_COMP("SVDF::Eval");
+
     const int rank = params_.rank_;
     const int batch_size = SizeOfDimension(input_, 0);
     const int input_size = SizeOfDimension(input_, 1);
diff --git a/common/operations/SimpleMath.cpp b/common/operations/SimpleMath.cpp
index 045ac97..e842439 100644
--- a/common/operations/SimpleMath.cpp
+++ b/common/operations/SimpleMath.cpp
@@ -24,6 +24,8 @@
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -50,9 +52,11 @@
                 const float* in2, const Shape& shape2,
                 int32_t activation,
                 float* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("addFloat32");
     bool needBroadcast = !SameShape(shape1, shape2);
 
     if (needBroadcast) {
+        NNTRACE_COMP_SWITCH("optimized_ops::BroadcastAdd");
         #define ANDROID_NN_BROADCAST_ADD(activation)                                              \
             tflite::optimized_ops::BroadcastAdd<tflite::FusedActivationFunctionType::activation>( \
                     in1, convertShapeToDims(shape1),                                              \
@@ -66,6 +70,7 @@
         CalculateActivationRangeFloat(activation, &output_activation_min,
                                       &output_activation_max);
 
+        NNTRACE_COMP_SWITCH("optimized_ops::Add");
         tflite::optimized_ops::Add(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
@@ -80,6 +85,7 @@
                const uint8_t* in2, const Shape& shape2,
                int32_t activation,
                uint8_t* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("addQuant8");
     bool needBroadcast = !SameShape(shape1, shape2);
 
     const int32_t input1_offset = -shape1.offset;
@@ -118,6 +124,7 @@
                                   &output_activation_max);
 
     if (needBroadcast) {
+        NNTRACE_COMP_SWITCH("optimized_ops::BroadcastAdd");
         tflite::optimized_ops::BroadcastAdd(
                 left_shift,
                 in1, convertShapeToDims(shape1),
@@ -128,6 +135,7 @@
                 output_activation_min, output_activation_max,
                 out, convertShapeToDims(shapeOut));
     } else {
+        NNTRACE_COMP_SWITCH("optimized_ops::Add");
         #define ANDROID_NN_NORMAL_ADD(activation)                                        \
             tflite::optimized_ops::Add<tflite::FusedActivationFunctionType::activation>( \
                     left_shift,                                                          \
@@ -150,10 +158,12 @@
                 const float* in2, const Shape& shape2,
                 int32_t activation,
                 float* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("mulFloat32");
     bool needBroadcast = !SameShape(shape1, shape2);
 
     if (needBroadcast) {
-    #define ANDROID_NN_BROADCAST_MUL(activation)                                              \
+        NNTRACE_COMP_SWITCH("optimized_ops::BroadcastMul");
+        #define ANDROID_NN_BROADCAST_MUL(activation)                                          \
         tflite::optimized_ops::BroadcastMul<tflite::FusedActivationFunctionType::activation>( \
                 in1, convertShapeToDims(shape1),                                              \
                 in2, convertShapeToDims(shape2),                                              \
@@ -166,6 +176,7 @@
         CalculateActivationRangeFloat(activation, &output_activation_min,
                                       &output_activation_max);
 
+        NNTRACE_COMP_SWITCH("optimized_ops::Mul");
         tflite::optimized_ops::Mul(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
@@ -180,6 +191,7 @@
                const uint8_t* in2, const Shape& shape2,
                int32_t activation,
                uint8_t* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("mulQuant8");
     const int32_t input1_offset = -shape1.offset;
     const int32_t input2_offset = -shape2.offset;
     const int32_t output_offset = shapeOut.offset;
@@ -198,6 +210,7 @@
                                   &output_activation_max);
 
     // Use BROADCAST version to handle the normal case.
+    NNTRACE_COMP_SWITCH("optimized_ops::BroadcastMul");
     tflite::optimized_ops::BroadcastMul(
                 in1, convertShapeToDims(shape1), input1_offset,
                 in2, convertShapeToDims(shape2), input2_offset,
@@ -211,7 +224,9 @@
 bool floorFloat32(const float* inputData,
                   float* outputData,
                   const Shape& shape) {
+    NNTRACE_TRANS("floorFloat32");
     tflite::Dims<4> dim = convertShapeToDims(shape);
+    NNTRACE_COMP_SWITCH("optimized_ops::Floor");
     tflite::optimized_ops::Floor(inputData, dim, outputData, dim);
     return true;
 }
@@ -219,7 +234,9 @@
 bool dequantizeQuant8ToFloat32(const uint8_t* inputData,
                                float* outputData,
                                const Shape& shape) {
+    NNTRACE_TRANS("dequantizeQuant8ToFloat32");
     tflite::Dims<4> dim = convertShapeToDims(shape);
+    NNTRACE_COMP_SWITCH("optimized_ops::Dequantize");
     tflite::optimized_ops::Dequantize(inputData, dim,
                                       shape.offset, shape.scale,
                                       outputData, dim);
@@ -230,18 +247,21 @@
                 const float* in2, const Shape& shape2,
                 int32_t activation,
                 float* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("subFloat32");
     float output_activation_min, output_activation_max;
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
     bool needBroadcast = !SameShape(shape1, shape2);
     if (needBroadcast) {
+        NNTRACE_COMP_SWITCH("optimized_ops::BroadcastSub");
         tflite::optimized_ops::BroadcastSub(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
                 output_activation_min, output_activation_max,
                 out, convertShapeToDims(shapeOut));
     } else {
+        NNTRACE_COMP_SWITCH("optimized_ops::Sub");
         tflite::optimized_ops::Sub(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
@@ -255,18 +275,21 @@
                 const float* in2, const Shape& shape2,
                 int32_t activation,
                 float* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("divFloat32");
     float output_activation_min, output_activation_max;
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
     bool needBroadcast = !SameShape(shape1, shape2);
     if (needBroadcast) {
+        NNTRACE_COMP_SWITCH("optimized_ops::BroadcastDiv");
         tflite::optimized_ops::BroadcastDiv(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
                 output_activation_min, output_activation_max,
                 out, convertShapeToDims(shapeOut));
     } else {
+        NNTRACE_COMP_SWITCH("optimized_ops::Div");
         tflite::optimized_ops::Div(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
@@ -279,6 +302,7 @@
 bool meanGeneric(const uint8_t* inputData, const Shape& inputShape,
                  const int32_t* axis, const Shape& axisShape, bool keepDims,
                  uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("meanGeneric");
     // Creates a temp index to iterate through input data.
     int32_t* scratchBuffer = new int32_t[getNumberOfDimensions(inputShape)];
 
@@ -293,6 +317,7 @@
             LOG(ERROR) << "Failed to allocate tempSumBuffer for MEAN";
             result = false;
         } else {
+            NNTRACE_COMP_SWITCH("optimized_ops::Mean");
             tflite::reference_ops::Mean<float, float>(
                     const_cast<float*>(reinterpret_cast<const float*>(inputData)),
                     reinterpret_cast<const int*>(inputShape.dimensions.data()),
@@ -310,6 +335,7 @@
             LOG(ERROR) << "Failed to allocate tempSumBuffer for MEAN";
             result = false;
         } else {
+            NNTRACE_COMP_SWITCH("optimized_ops::Mean");
             tflite::reference_ops::Mean<uint8_t, int32_t>(
                     const_cast<uint8_t*>(inputData),
                     reinterpret_cast<const int*>(inputShape.dimensions.data()),
diff --git a/common/operations/StridedSlice.cpp b/common/operations/StridedSlice.cpp
index 9db9523..222c48d 100644
--- a/common/operations/StridedSlice.cpp
+++ b/common/operations/StridedSlice.cpp
@@ -23,6 +23,8 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -31,6 +33,7 @@
                          const int32_t* stridesData,
                          int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("stridedSliceGeneric");
     // This Op only supports 1-4D cases and since we use the reference 4D
     // implementation, the 1-3D tensors are mapped to 4D.
     const int kMaxDim = 4;
@@ -56,6 +59,7 @@
     endMask = ReverseMaskBits(endMask, numInputDims);
 
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+        NNTRACE_COMP_SWITCH("reference_ops::StridedSlice::float");
         tflite::reference_ops::StridedSlice(
                 reinterpret_cast<const float*>(inputData),
                 convertShapeToDims(inputShape),
@@ -64,6 +68,7 @@
                 reinterpret_cast<float*>(outputData),
                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("reference_ops::StridedSlice::uint8");
         tflite::reference_ops::StridedSlice(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),