NNAPI systrace for timing statistics

This adds systrace tracing to NNAPI. The tracing
will be helpful for:
- getting numbers on where time is currently spent
- diagnosing and improving performance
- benchmarking

TODOs:
- Write analysis tools for traces

Change-Id: I9026f1043428cb715b577901bec3a2e1e39a82e3
Merged-In: I9026f1043428cb715b577901bec3a2e1e39a82e3
Bug: 78137932
Test: manually run systrace.py against unit tests
Test: manually run systrace.py against benchmarking app
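Example capture (a sketch; assumes systrace.py from the Android SDK tools and
that the 'nnapi' atrace category is available on the device):
  python systrace.py -t 10 -o nnapi_trace.html nnapi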
(cherry picked from commit e9e637ab73b68b5982281a3f7c621f6a75d51743)
diff --git a/common/CpuExecutor.cpp b/common/CpuExecutor.cpp
index 8f87067..be79fa5 100644
--- a/common/CpuExecutor.cpp
+++ b/common/CpuExecutor.cpp
@@ -20,6 +20,7 @@
 
 #include "NeuralNetworks.h"
 #include "Operations.h"
+#include "Tracing.h"
 
 #include "Eigen/Core"
 #include <omp.h>
@@ -189,12 +190,14 @@
 int CpuExecutor::run(const V1_0::Model& model, const Request& request,
                      const std::vector<RunTimePoolInfo>& modelPoolInfos,
                      const std::vector<RunTimePoolInfo>& requestPoolInfos) {
+    NNTRACE_CPU(NNTRACE_PHASE_EXECUTION, "run::V1_0");
     return run(convertToV1_1(model), request, modelPoolInfos, requestPoolInfos);
 }
 
 int CpuExecutor::run(const V1_1::Model& model, const Request& request,
                      const std::vector<RunTimePoolInfo>& modelPoolInfos,
                      const std::vector<RunTimePoolInfo>& requestPoolInfos) {
+    NNTRACE_CPU(NNTRACE_PHASE_EXECUTION, "run::V1_1");
     VLOG(CPUEXE) << "CpuExecutor::run() with request("
                  << SHOW_IF_DEBUG(toString(request)) << ")";
 
diff --git a/common/ValidateHal.cpp b/common/ValidateHal.cpp
index 0955178..8016b1f 100644
--- a/common/ValidateHal.cpp
+++ b/common/ValidateHal.cpp
@@ -18,6 +18,7 @@
 
 #include "ValidateHal.h"
 #include "NeuralNetworks.h"
+#include "Tracing.h"
 #include "Utils.h"
 
 #include <android-base/logging.h>
@@ -405,6 +406,8 @@
 
 template<typename VersionedModel>
 static bool validateModelVersioned(const VersionedModel& model) {
+    NNTRACE_FULL(NNTRACE_LAYER_UTILITY, NNTRACE_PHASE_UNSPECIFIED,
+                 "validateModelVersioned");
     return (validateOperands(model.operands, model.operandValues, model.pools) &&
             validateOperations(model.operations, model.operands) &&
             validateModelInputOutputs(model.inputIndexes, model.operands,
diff --git a/common/include/Tracing.h b/common/include/Tracing.h
new file mode 100644
index 0000000..c2eae7a
--- /dev/null
+++ b/common/include/Tracing.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_ML_NN_COMMON_TRACING_H
+#define ANDROID_ML_NN_COMMON_TRACING_H
+
+#define ATRACE_TAG ATRACE_TAG_NNAPI
+#include "utils/Trace.h"
+
+// Neural Networks API (NNAPI) systracing
+//
+// The primary goal of the tracing is to capture and present timings for NNAPI.
+// (Other uses include providing visibility into the split of execution between
+// drivers and the CPU fallback, and the ability to visualize call sequences.)
+//
+// The tracing has three parts:
+//  1 Trace macros defined in this file and used throughout the codebase,
+//    modelled after and using atrace. These implement a naming convention for
+//    the tracepoints, interpreted by the systrace parser.
+//  2 Android systrace (atrace) on-device capture and host-based analysis.
+//  3 A systrace parser (TODO) to summarize the timings.
+//
+// For an overview and introduction, please refer to the "NNAPI Systrace design
+// and HOWTO" (internal Docs for now). This header doesn't try to replicate all
+// the information in that document.
+//
+// Glossary:
+// - Phase: stage in processing (e.g., Preparation, Compilation, Execution).
+//   The Overall phase nests all other phases, and Execution nests Input/Output,
+//   Transformation, Computation and Results; phases are otherwise not nested
+//   (Initialization phase functions may occur inside other phases but are
+//   counted out during analysis). Nested phases (other than Initialization)
+//   are analysed as a breakdown of the parent phase.
+// - Layer: component in the stack (from top to bottom: App, Runtime, IPC,
+//   Driver/CPU). Calls to lower layers are typically nested within calls to upper
+//   layers.
+// - Bucket: unit of timing analysis, the combination of Phase and Layer (and
+//   thus also typically nested).
+// - Detail: specific unit being executed, typically a function.
+
+// Convenience macros to be used in the code (phases defined below).
+// (Macros so that string concatenation is done at compile time).
+//
+// These exist in three variants:
+// - Simple (NNTRACE_<layer and potentially phase>) - to be used when only one
+//   Phase is active within a scope.
+// - "Switch" (NNTRACE_<...>_SWITCH) - to be used when multiple Phases
+//   share a scope (e.g., transformation of data and computation in the same
+//   function).
+// - "Subtract" (NNTRACE_<...>_SUBTRACT) - to be used when nesting is violated
+//   and the time should be subtracted from the parent scope.
+// Arguments:
+// - phase: one of the NNTRACE_PHASE_* macros defined below.
+// - detail: free-form string constant, typically function name.
+// Example usage:
+//   // Simple
+//   int ANeuralNetworksMemory_createFromFd(...) {
+//     NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksMemory_createFromFd");
+//   }
+//   // Switch
+//   bool concatenationFloat32(...) {
+//     NNTRACE_TRANS("concatenationFloat32");  // Transformation of data begins
+//     ...
+//     NNTRACE_COMP_SWITCH("optimized_ops::Concatenation"); // Transformation
+//                                                          // ends and computation
+//                                                          // begins
+//   }
+//   // Subtract
+//   static int compile(...) {
+//     NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "prepareModel");
+//     device->getInterface()->prepareModel(..., preparedModelCallback);
+//     preparedModelCallback->wait();
+//   }
+//   ErrorStatus VersionedIDevice::prepareModel(...) {
+//     ... IPC work ...
+//     {
+//       NNTRACE_FULL_SUBTRACT(NNTRACE_LAYER_RUNTIME, NNTRACE_PHASE_COMPILATION,
+//                             "VersionedIDevice::prepareModel");
+//       ... Runtime work ...
+//     }
+//     ... IPC work ...
+//   }
+//
+// Layer Application - For native applications (e.g., unit tests)
+#define NNTRACE_APP(phase, detail) NNTRACE_FULL(NNTRACE_LAYER_APPLICATION, phase, detail)
+#define NNTRACE_APP_SWITCH(phase, detail) \
+        NNTRACE_FULL_SWITCH(NNTRACE_LAYER_APPLICATION, phase, detail)
+// Layer Runtime - For the NNAPI runtime
+#define NNTRACE_RT(phase, detail) NNTRACE_FULL(NNTRACE_LAYER_RUNTIME, phase, detail)
+#define NNTRACE_RT_SWITCH(phase, detail) NNTRACE_FULL_SWITCH(NNTRACE_LAYER_RUNTIME, phase, detail)
+// Layer CPU - CPU executor
+#define NNTRACE_CPU(phase, detail) NNTRACE_FULL(NNTRACE_LAYER_CPU, phase, detail)
+#define NNTRACE_COMP(detail) NNTRACE_FULL(NNTRACE_LAYER_CPU, \
+                                          NNTRACE_PHASE_COMPUTATION, detail)
+#define NNTRACE_COMP_SWITCH(detail) NNTRACE_FULL_SWITCH(NNTRACE_LAYER_CPU, \
+                                                        NNTRACE_PHASE_COMPUTATION, detail)
+#define NNTRACE_TRANS(detail) NNTRACE_FULL(NNTRACE_LAYER_CPU, \
+                                           NNTRACE_PHASE_TRANSFORMATION, detail)
+
+// Fully specified macros to be used when no convenience wrapper exists for
+// your needs.
+#define NNTRACE_FULL(layer, phase, detail) NNTRACE_NAME_1(("[NN_" layer "_" phase "]" detail))
+#define NNTRACE_FULL_SWITCH(layer, phase, detail) \
+        NNTRACE_NAME_SWITCH(("[SW][NN_" layer "_" phase "]" detail))
+#define NNTRACE_FULL_SUBTRACT(layer, phase, detail) \
+        NNTRACE_NAME_1(("[SUB][NN_" layer "_" phase "]" detail))
+// Raw macro without scoping requirements, for special cases
+#define NNTRACE_FULL_RAW(layer, phase, detail) android::ScopedTrace PASTE(___tracer, __LINE__) \
+        (ATRACE_TAG, ("[NN_" layer "_" phase "]" detail))
+
+// Tracing buckets - the units that timing summaries are calculated over.
+//
+// Phases
+#define NNTRACE_PHASE_OVERALL "PO"      // Overall program, e.g., one benchmark case
+#define NNTRACE_PHASE_INITIALIZATION "PI" // Initialization - not related to a model
+#define NNTRACE_PHASE_PREPARATION "PP"  // Model construction
+#define NNTRACE_PHASE_COMPILATION "PC"  // Model compilation
+#define NNTRACE_PHASE_EXECUTION "PE"    // Executing the model
+#define NNTRACE_PHASE_TERMINATION "PT"  // Tearing down
+#define NNTRACE_PHASE_UNSPECIFIED "PU"  // Helper code called from multiple phases
+// Subphases of execution
+#define NNTRACE_PHASE_INPUTS_AND_OUTPUTS "PIO"  // Setting inputs/outputs and allocating buffers
+#define NNTRACE_PHASE_TRANSFORMATION "PTR"      // Transforming data for computation
+#define NNTRACE_PHASE_COMPUTATION "PCO"         // Computing operations' outputs
+#define NNTRACE_PHASE_RESULTS "PR"              // Reading out results
+// Layers
+#define NNTRACE_LAYER_APPLICATION "LA"
+#define NNTRACE_LAYER_RUNTIME "LR"
+#define NNTRACE_LAYER_IPC "LI"
+#define NNTRACE_LAYER_DRIVER "LD"
+#define NNTRACE_LAYER_CPU "LC"
+#define NNTRACE_LAYER_OTHER "LO"
+#define NNTRACE_LAYER_UTILITY "LU"              // Code used from multiple layers
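+//
+// For illustration (derived from the macros and constants above), a call like
+//   NNTRACE_FULL(NNTRACE_LAYER_CPU, NNTRACE_PHASE_COMPUTATION, "reluFloat32")
+// produces a trace section named "[NN_LC_PCO]reluFloat32", which the systrace
+// parser can attribute to the (Layer CPU, Phase Computation) bucket.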
+
+
+// Implementation
+//
+// Almost the same as ATRACE_NAME, but enforcing an explicit distinction between
+// phase-per-scope and switching phases.
+//
+// Basic trace, one per scope allowed to enforce disjointness
+#define NNTRACE_NAME_1(name) android::ScopedTrace ___tracer_1(ATRACE_TAG, name)
+// Switching trace, more than one per scope allowed, translated by
+// systrace_parser.py. This is mainly useful for tracing multiple phases through
+// one function / scope.
+#define NNTRACE_NAME_SWITCH(name) android::ScopedTrace PASTE(___tracer, __LINE__) \
+        (ATRACE_TAG, name); \
+        (void)___tracer_1  // ensure switch is only used after a basic trace
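+// For example (a sketch), using a switch trace in a scope with no preceding
+// basic trace fails to compile because ___tracer_1 is not declared:
+//   void f() {
+//     NNTRACE_COMP_SWITCH("detail");  // error: '___tracer_1' was not declared
+//   }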
+
+
+// Disallow use of raw ATRACE macros
+#undef ATRACE_NAME
+#undef ATRACE_CALL
+
+#endif // ANDROID_ML_NN_COMMON_TRACING_H
diff --git a/common/operations/Activation.cpp b/common/operations/Activation.cpp
index b80984d..5da58f9 100644
--- a/common/operations/Activation.cpp
+++ b/common/operations/Activation.cpp
@@ -19,11 +19,14 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
 bool reluFloat32(const float* inputData, const Shape& inputShape,
                  float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("reluFloat32");
     int numElements = getNumberOfElements(inputShape);
     for (int i=0; i<numElements; i++, inputData++, outputData++) {
         *outputData = std::max(0.f, *inputData);
@@ -33,6 +36,7 @@
 
 bool relu1Float32(const float* inputData, const Shape& inputShape,
                   float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("relu1Float32");
     int numElements = getNumberOfElements(inputShape);
     for (int i=0; i<numElements; i++, inputData++, outputData++) {
         *outputData = std::min(std::max(-1.f, *inputData), 1.f);
@@ -42,6 +46,7 @@
 
 bool relu6Float32(const float* inputData, const Shape& inputShape,
                   float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("relu6Float32");
     int numElements = getNumberOfElements(inputShape);
     for (int i=0; i<numElements; i++, inputData++, outputData++) {
         *outputData = std::min(std::max(0.f, *inputData), 6.f);
@@ -51,6 +56,7 @@
 
 bool tanhFloat32(const float* inputData, const Shape& inputShape,
                  float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("tanhFloat32");
     int numElements = getNumberOfElements(inputShape);
     for (int i=0; i<numElements; i++, inputData++, outputData++) {
         *outputData = std::tanh(*inputData);
@@ -60,6 +66,7 @@
 
 bool logisticFloat32(const float* inputData, const Shape& inputShape,
                      float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("logisticFloat32");
     int numElements = getNumberOfElements(inputShape);
     for (int i=0; i<numElements; i++, inputData++, outputData++) {
         *outputData = 1.f / (1.f + std::exp(-*inputData));
@@ -70,6 +77,7 @@
 bool softmaxFloat32(const float* inputData, const Shape& inputShape,
                     const float beta,
                     float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("softmaxFloat32");
     tflite::Dims<4> dim;
     if (getNumberOfDimensions(inputShape) == 2) {
         uint32_t batch_size = getSizeOfDimension(inputShape, 0);
@@ -85,6 +93,7 @@
         return false;
     }
 
+    NNTRACE_COMP_SWITCH("optimized_ops::Softmax");
     tflite::optimized_ops::Softmax(inputData, dim, beta,
                                    outputData, dim);
     return true;
@@ -107,18 +116,21 @@
 
 bool reluQuant8(const uint8_t* inputData, const Shape& inputShape,
                 uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("reluQuant8");
     ANDROID_NN_RELUX_QUANT8(kActivationRelu)
     return true;
 }
 
 bool relu1Quant8(const uint8_t* inputData, const Shape& inputShape,
                  uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("relu1Quant8");
     ANDROID_NN_RELUX_QUANT8(kActivationRelu1)
     return true;
 }
 
 bool relu6Quant8(const uint8_t* inputData, const Shape& inputShape,
                  uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("relu6Quant8");
     ANDROID_NN_RELUX_QUANT8(kActivationRelu6)
     return true;
 }
@@ -127,6 +139,7 @@
 
 bool logisticQuant8(const uint8_t* inputData, const Shape& inputShape,
                     uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("logisticQuant8");
     if (outputShape.offset != 0 || outputShape.scale != 1.f / 256) {
         LOG(ERROR) << "incorrect scale / offset for output";
         return false;
@@ -149,6 +162,7 @@
     int32_t input_range_radius =
             CalculateInputRadius(kInputIntegerBits, input_left_shift);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::Logistic");
     tflite::optimized_ops::Logistic(
             inputData, convertShapeToDims(inputShape),
             inputShape.offset, input_range_radius,
@@ -161,6 +175,7 @@
 bool softmaxQuant8(const uint8_t* inputData, const Shape& inputShape,
                    const float beta,
                    uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("softmaxQuant8");
     tflite::Dims<4> dim;
     if (getNumberOfDimensions(inputShape) == 2) {
         uint32_t batch_size = getSizeOfDimension(inputShape, 0);
@@ -196,6 +211,7 @@
     float diff_min = -1.0f * CalculateInputRadius(kScaledDiffIntegerBits,
                                                   input_left_shift);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::Softmax");
     tflite::optimized_ops::Softmax(inputData, dim, input_multiplier,
                                    input_left_shift, diff_min,
                                    outputData, dim);
diff --git a/common/operations/Concatenation.cpp b/common/operations/Concatenation.cpp
index fc65e43..cc1fa5a 100644
--- a/common/operations/Concatenation.cpp
+++ b/common/operations/Concatenation.cpp
@@ -19,12 +19,15 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
 bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs,
                           const std::vector<Shape>& inputShapes, int32_t axis,
                           float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("concatenationFloat32");
     int num_inputs = inputShapes.size();
     std::vector<tflite::Dims<4>*> inputDimsPtr(num_inputs);
     std::vector<tflite::Dims<4> > inputDims(num_inputs);
@@ -33,6 +36,7 @@
         inputDimsPtr[i] = &inputDims[i];
     }
 
+    NNTRACE_COMP_SWITCH("optimized_ops::Concatenation");
     tflite::optimized_ops::Concatenation<tflite::FusedActivationFunctionType::kNone, float>(
             getNumberOfDimensions(outputShape) - axis - 1,
             inputDataPtrs.data(), inputDimsPtr.data(), num_inputs,
@@ -44,6 +48,7 @@
 bool concatenationQuant8(const std::vector<const uint8_t*>& inputDataPtrs,
                          const std::vector<Shape>& inputShapes, int32_t axis,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("concatenationQuant8");
     int num_inputs = inputShapes.size();
     std::vector<tflite::Dims<4>*> inputDimsPtr(num_inputs);
     std::vector<tflite::Dims<4> > inputDims(num_inputs);
@@ -52,6 +57,7 @@
         inputDimsPtr[i] = &inputDims[i];
     }
 
+    NNTRACE_COMP_SWITCH("optimized_ops::Concatenation");
     tflite::optimized_ops::Concatenation<tflite::FusedActivationFunctionType::kNone, uint8_t>(
             getNumberOfDimensions(outputShape) - axis - 1,
             inputDataPtrs.data(), inputDimsPtr.data(), num_inputs,
diff --git a/common/operations/Conv2D.cpp b/common/operations/Conv2D.cpp
index c16426c..60f9132 100644
--- a/common/operations/Conv2D.cpp
+++ b/common/operations/Conv2D.cpp
@@ -19,6 +19,8 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -84,6 +86,7 @@
                  int32_t stride_width, int32_t stride_height,
                  int32_t activation,
                  float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("convFloat32");
 
     ANDROID_NN_CONV_PARAMETERS(float)
 
@@ -95,6 +98,7 @@
 
     // Prevent concurrent executions that may access the scratch buffer.
     std::unique_lock<std::mutex> lock(executionMutex);
+    NNTRACE_COMP_SWITCH("optimized_ops::Conv");
     tflite::optimized_ops::Conv(
             inputData, convertShapeToDims(inputShape),
             filterData, convertShapeToDims(filterShape),
@@ -116,6 +120,7 @@
                 int32_t stride_width, int32_t stride_height,
                 int32_t activation,
                 uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("convQuant8");
 
     ANDROID_NN_CONV_PARAMETERS(uint8_t)
 
@@ -146,6 +151,8 @@
     std::unique_lock<std::mutex> lock(executionMutex);
     // Alow gemmlowp automatically decide how many threads to use.
     gemm_context.set_max_num_threads(0);
+
+    NNTRACE_COMP_SWITCH("optimized_ops::Conv");
     tflite::optimized_ops::Conv(
             inputData, convertShapeToDims(inputShape), inputOffset,
             filterData, convertShapeToDims(filterShape), filterOffset,
diff --git a/common/operations/DepthwiseConv2D.cpp b/common/operations/DepthwiseConv2D.cpp
index 5dd67e2..60c5e41 100644
--- a/common/operations/DepthwiseConv2D.cpp
+++ b/common/operations/DepthwiseConv2D.cpp
@@ -20,6 +20,8 @@
 #include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -42,6 +44,7 @@
                           int32_t stride_width, int32_t stride_height,
                           int32_t depth_multiplier, int32_t activation,
                           float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("depthwiseConvFloat32");
 
     ANDROID_NN_DEPTHWISE_CONV_PARAMETERS
 
@@ -49,6 +52,7 @@
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::DepthwiseConv");
     tflite::optimized_ops::DepthwiseConv(
             inputData, convertShapeToDims(inputShape),
             filterData, convertShapeToDims(filterShape),
@@ -70,6 +74,7 @@
                          int32_t stride_width, int32_t stride_height,
                          int32_t depth_multiplier, int32_t activation,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("depthwiseConvQuant8");
 
     ANDROID_NN_DEPTHWISE_CONV_PARAMETERS
 
@@ -94,6 +99,7 @@
     uint32_t filterOffset = -filterShape.offset;
     uint32_t outputOffset = outputShape.offset;
 
+    NNTRACE_COMP_SWITCH("optimized_ops::DepthwiseConv");
     tflite::optimized_ops::DepthwiseConv(
             inputData, convertShapeToDims(inputShape), inputOffset,
             filterData, convertShapeToDims(filterShape), filterOffset,
diff --git a/common/operations/EmbeddingLookup.cpp b/common/operations/EmbeddingLookup.cpp
index 504c684..9f231ed 100644
--- a/common/operations/EmbeddingLookup.cpp
+++ b/common/operations/EmbeddingLookup.cpp
@@ -20,6 +20,8 @@
 #include "HalInterfaces.h"
 #include "Operations.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -32,6 +34,7 @@
 }
 
 bool EmbeddingLookup::Eval() {
+  NNTRACE_COMP("EmbeddingLookup::Eval");
   const int row_size = value_->shape().dimensions[0];
   const int total_bytes = sizeOfData(value_->type, value_->dimensions);
   const int row_bytes = total_bytes/row_size;
diff --git a/common/operations/FullyConnected.cpp b/common/operations/FullyConnected.cpp
index 4e2deff..12da51b 100644
--- a/common/operations/FullyConnected.cpp
+++ b/common/operations/FullyConnected.cpp
@@ -20,6 +20,8 @@
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -33,6 +35,7 @@
                            const float* biasData, const Shape& biasShape,
                            int32_t activation,
                            float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("fullyConnectedFloat32");
     float output_activation_min, output_activation_max;
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
@@ -42,6 +45,7 @@
     uint32_t batch_size = getSizeOfDimension(outputShape, 0);
     uint32_t input_n_elements = getNumberOfElements(inputShape);
     if (batch_size * batch_size == input_n_elements) {
+        NNTRACE_COMP_SWITCH("reference_ops::FullyConnected");
         tflite::reference_ops::FullyConnected(
                 inputData, convertShapeToDims(inputShape),
                 weightsData, convertShapeToDims(weightsShape),
@@ -49,6 +53,7 @@
                 output_activation_min, output_activation_max,
                 outputData, convertShapeToDims(outputShape));
     } else {
+        NNTRACE_COMP_SWITCH("optimized_ops::FullyConnected");
         tflite::optimized_ops::FullyConnected(
                 inputData, convertShapeToDims(inputShape),
                 weightsData, convertShapeToDims(weightsShape),
@@ -64,6 +69,7 @@
                           const int32_t* biasData, const Shape& biasShape,
                           int32_t activation,
                           uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("fullyConnectedQuant8");
     int32_t inputOffset = -inputShape.offset;
     int32_t weightsOffset = -weightsShape.offset;
     int32_t outputOffset = outputShape.offset;
@@ -91,6 +97,7 @@
     // Alow gemmlowp automatically decide how many threads to use.
     gemm_context.set_max_num_threads(0);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::FullyConnected");
     tflite::optimized_ops::FullyConnected(
             inputData, convertShapeToDims(inputShape), inputOffset,
             weightsData, convertShapeToDims(weightsShape), weightsOffset,
diff --git a/common/operations/HashtableLookup.cpp b/common/operations/HashtableLookup.cpp
index 1c8d802..e864b3d 100644
--- a/common/operations/HashtableLookup.cpp
+++ b/common/operations/HashtableLookup.cpp
@@ -20,6 +20,8 @@
 #include "HalInterfaces.h"
 #include "Operations.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -42,6 +44,7 @@
 }
 
 bool HashtableLookup::Eval() {
+  NNTRACE_COMP("HashtableLookup::Eval");
   const int num_rows = value_->shape().dimensions[0];
   const int row_bytes = sizeOfData(value_->type, value_->dimensions) / num_rows;
   void* pointer = nullptr;
diff --git a/common/operations/LSHProjection.cpp b/common/operations/LSHProjection.cpp
index 57d1475..97183e2 100644
--- a/common/operations/LSHProjection.cpp
+++ b/common/operations/LSHProjection.cpp
@@ -20,6 +20,8 @@
 #include "HalInterfaces.h"
 #include "util/hash/farmhash.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -141,6 +143,8 @@
 }
 
 bool LSHProjection::Eval() {
+  NNTRACE_COMP("LSHProjection::Eval");
+
   int32_t* out_buf = reinterpret_cast<int32_t*>(output_->buffer);
 
   switch (type_) {
diff --git a/common/operations/LSTM.cpp b/common/operations/LSTM.cpp
index 80b0eb9..7667c66 100644
--- a/common/operations/LSTM.cpp
+++ b/common/operations/LSTM.cpp
@@ -19,6 +19,8 @@
 #include "CpuExecutor.h"
 #include "HalInterfaces.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -300,6 +302,8 @@
 }
 
 bool LSTMCell::Eval() {
+  NNTRACE_COMP("LSTMCell::Eval");
+
   const uint32_t n_batch = input_->shape().dimensions[0];
   const uint32_t n_input = input_->shape().dimensions[1];
   // n_cell and n_output will be the same size when there is no projection.
diff --git a/common/operations/Normalization.cpp b/common/operations/Normalization.cpp
index eccb3bd..7f36dbf 100644
--- a/common/operations/Normalization.cpp
+++ b/common/operations/Normalization.cpp
@@ -19,11 +19,14 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
 bool l2normFloat32(const float* inputData, const Shape& inputShape,
                    float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("optimized_ops::L2Normalization::float");
     tflite::optimized_ops::L2Normalization<tflite::FusedActivationFunctionType::kNone>(
             inputData, convertShapeToDims(inputShape),
             outputData, convertShapeToDims(outputShape));
@@ -33,6 +36,7 @@
 
 bool l2normQuant8(const uint8_t* inputData, const Shape& inputShape,
                   uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("optimized_ops::L2Normalization::uint8");
     tflite::optimized_ops::L2Normalization(
             inputData, convertShapeToDims(inputShape),
             inputShape.offset,
@@ -44,6 +48,7 @@
 bool localResponseNormFloat32(const float* inputData, const Shape& inputShape,
                               int32_t radius, float bias, float alpha, float beta,
                               float* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("optimized_ops::LocalResponseNormalization::float");
     tflite::optimized_ops::LocalResponseNormalization(
             inputData, convertShapeToDims(inputShape),
             radius, bias, alpha, beta,
diff --git a/common/operations/Pooling.cpp b/common/operations/Pooling.cpp
index db4497a..7f60323 100644
--- a/common/operations/Pooling.cpp
+++ b/common/operations/Pooling.cpp
@@ -19,6 +19,8 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -37,6 +39,7 @@
                         int32_t stride_width, int32_t stride_height,
                         int32_t filter_width, int32_t filter_height, int32_t activation,
                         float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("averagePoolFloat32");
 
     ANDROID_NN_POOLING_PARAMETERS
 
@@ -44,6 +47,7 @@
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::AveragePool");
     tflite::optimized_ops::AveragePool(
             inputData, convertShapeToDims(inputShape),
             stride_width, stride_height, paddingWidth, paddingHeight,
@@ -60,6 +64,7 @@
                        int32_t stride_width, int32_t stride_height,
                        int32_t filter_width, int32_t filter_height, int32_t activation,
                        uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("averagePoolQuant8");
 
     ANDROID_NN_POOLING_PARAMETERS
 
@@ -70,6 +75,7 @@
                                   &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::AveragePool");
     tflite::optimized_ops::AveragePool(
             inputData, convertShapeToDims(inputShape),
             stride_width, stride_height, paddingWidth, paddingHeight,
@@ -86,6 +92,7 @@
                    int32_t stride_width, int32_t stride_height,
                    int32_t filter_width, int32_t filter_height, int32_t activation,
                    float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("l2PoolFloat32");
 
     ANDROID_NN_POOLING_PARAMETERS
 
@@ -93,6 +100,7 @@
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::L2Pool");
     tflite::optimized_ops::L2Pool(
             inputData, convertShapeToDims(inputShape),
             stride_width, stride_height, paddingWidth, paddingHeight,
@@ -109,6 +117,7 @@
                     int32_t stride_width, int32_t stride_height,
                     int32_t filter_width, int32_t filter_height, int32_t activation,
                     float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("maxPoolFloat32");
 
     ANDROID_NN_POOLING_PARAMETERS
 
@@ -116,6 +125,7 @@
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::MaxPool");
     tflite::optimized_ops::MaxPool(
             inputData, convertShapeToDims(inputShape),
             stride_width, stride_height, paddingWidth, paddingHeight,
@@ -132,6 +142,7 @@
                    int32_t stride_width, int32_t stride_height,
                    int32_t filter_width, int32_t filter_height, int32_t activation,
                    uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("maxPoolQuant8");
 
     ANDROID_NN_POOLING_PARAMETERS
 
@@ -142,6 +153,7 @@
                                   &output_activation_min,
                                   &output_activation_max);
 
+    NNTRACE_COMP_SWITCH("optimized_ops::MaxPool");
     tflite::optimized_ops::MaxPool(
             inputData, convertShapeToDims(inputShape),
             stride_width, stride_height, paddingWidth, paddingHeight,
diff --git a/common/operations/RNN.cpp b/common/operations/RNN.cpp
index 8a00734..4d7a4c9 100644
--- a/common/operations/RNN.cpp
+++ b/common/operations/RNN.cpp
@@ -19,11 +19,14 @@
 #include "CpuExecutor.h"
 #include "HalInterfaces.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
 RNN::RNN(const Operation& operation,
          std::vector<RunTimeOperandInfo>& operands) {
+  NNTRACE_TRANS("RNN::RNN");
   input_ = GetInput(operation, operands, kInputTensor);
   weights_ = GetInput(operation, operands, kWeightsTensor);
   recurrent_weights_ = GetInput(operation, operands, kRecurrentWeightsTensor);
@@ -41,6 +44,7 @@
                   std::vector<RunTimeOperandInfo> &operands,
                   Shape *hiddenStateShape,
                   Shape *outputShape) {
+  NNTRACE_TRANS("RNN::Prepare");
   // Check we have all the inputs and outputs we need.
   const int num_inputs = NumInputsWithValues(operation, operands);
   NN_CHECK(num_inputs == 5 || num_inputs == 6);
@@ -78,6 +82,8 @@
 }
 
 bool RNN::Eval() {
+  NNTRACE_COMP("RNN::Eval");
+
   const float* bias_ptr = reinterpret_cast<float*>(bias_->buffer);
 
   const uint32_t batch_size = input_->shape().dimensions[0];
diff --git a/common/operations/Reshape.cpp b/common/operations/Reshape.cpp
index 5803968..5a05aed 100644
--- a/common/operations/Reshape.cpp
+++ b/common/operations/Reshape.cpp
@@ -24,11 +24,14 @@
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
 bool reshapeGeneric(const void* inputData, const Shape& inputShape,
                     void* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("reshapeGeneric");
     size_t count = sizeOfData(inputShape.type, inputShape.dimensions);
     memcpy(outputData, inputData, count);
     return true;
@@ -36,6 +39,7 @@
 
 bool resizeBilinearFloat32(const float* inputData, const Shape& inputShape,
                            float* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("resizeBilinearFloat32");
     int32_t height = (int32_t) getSizeOfDimension(outputShape, 1);
     int32_t width  = (int32_t) getSizeOfDimension(outputShape, 2);
 
@@ -44,6 +48,7 @@
     Shape outDimShape;
     outDimShape.dimensions = {1, 1, 1, 2};
 
+    NNTRACE_COMP_SWITCH("optimized_ops::ResizeBilinear");
     tflite::optimized_ops::ResizeBilinear(
             inputData, convertShapeToDims(inputShape),
             outDimData, convertShapeToDims(outDimShape),
@@ -54,14 +59,17 @@
 bool depthToSpaceGeneric(const uint8_t* inputData, const Shape& inputShape,
                          int32_t blockSize,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("depthToSpaceGeneric");
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
-       tflite::optimized_ops::DepthToSpace(
-                reinterpret_cast<const float*>(inputData),
-                convertShapeToDims(inputShape),
-                blockSize,
-                reinterpret_cast<float*>(outputData),
-                convertShapeToDims(outputShape));
+        NNTRACE_COMP_SWITCH("optimized_ops::DepthToSpace::float");
+        tflite::optimized_ops::DepthToSpace(
+                 reinterpret_cast<const float*>(inputData),
+                 convertShapeToDims(inputShape),
+                 blockSize,
+                 reinterpret_cast<float*>(outputData),
+                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::DepthToSpace::uint8");
         tflite::optimized_ops::DepthToSpace(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
@@ -78,7 +86,9 @@
 bool spaceToDepthGeneric(const uint8_t* inputData, const Shape& inputShape,
                          int32_t blockSize,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("spaceToDepthGeneric");
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+        NNTRACE_COMP_SWITCH("optimized_ops::SpaceToDepth::float");
         tflite::optimized_ops::SpaceToDepth(
                 reinterpret_cast<const float*>(inputData),
                 convertShapeToDims(inputShape),
@@ -86,6 +96,7 @@
                 reinterpret_cast<float*>(outputData),
                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::SpaceToDepth::uint8");
         tflite::optimized_ops::SpaceToDepth(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
@@ -102,6 +113,7 @@
 bool padGeneric(const uint8_t* inputData, const Shape& inputShape,
                 const int32_t* paddings,
                 uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("padGeneric");
     int32_t numInputDims = static_cast<int32_t>(getNumberOfDimensions(inputShape));
 
     std::vector<int> beforePadding;
@@ -113,6 +125,7 @@
     }
 
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+        NNTRACE_COMP_SWITCH("optimized_ops::Pad::float");
         tflite::optimized_ops::Pad(
                 reinterpret_cast<const float*>(inputData),
                 convertShapeToDims(inputShape),
@@ -120,6 +133,7 @@
                 reinterpret_cast<float*>(outputData),
                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::Pad::uint8");
         tflite::optimized_ops::Pad(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
@@ -136,18 +150,21 @@
 bool batchToSpaceGeneric(const uint8_t* inputData, const Shape& inputShape,
                          const int32_t* blockSize,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("batchToSpaceGeneric");
     // Needed by low level implementation, but not really used.
     tflite::Dims<4> blockSizeDim, cropsDim;
     const int32 crops[4] = {0, 0, 0, 0};
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
-       tflite::optimized_ops::BatchToSpaceND(
-                reinterpret_cast<const float*>(inputData),
-                convertShapeToDims(inputShape),
-                blockSize, blockSizeDim,
-                crops, cropsDim,
-                reinterpret_cast<float*>(outputData),
-                convertShapeToDims(outputShape));
+        NNTRACE_COMP_SWITCH("optimized_ops::BatchToSpaceND::float");
+        tflite::optimized_ops::BatchToSpaceND(
+                 reinterpret_cast<const float*>(inputData),
+                 convertShapeToDims(inputShape),
+                 blockSize, blockSizeDim,
+                 crops, cropsDim,
+                 reinterpret_cast<float*>(outputData),
+                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::BatchToSpaceND::uint8");
         tflite::optimized_ops::BatchToSpaceND(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
@@ -166,9 +183,11 @@
                          const int32_t* blockSize,
                          const int32_t* padding, const Shape& paddingShape,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("spaceToBatchGeneric");
     // Needed by low level implementation, but not really used.
     tflite::Dims<4> blockSizeDim;
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+        NNTRACE_COMP_SWITCH("optimized_ops::SpaceToBatchND::float");
         tflite::optimized_ops::SpaceToBatchND(
                 reinterpret_cast<const float*>(inputData),
                 convertShapeToDims(inputShape),
@@ -177,6 +196,7 @@
                 reinterpret_cast<float*>(outputData),
                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::SpaceToBatchND::uint8");
         tflite::optimized_ops::SpaceToBatchND(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
@@ -193,6 +213,7 @@
 
 bool squeezeGeneric(const void* inputData, const Shape& inputShape,
                     void* outputData, const Shape& outputShape) {
+    NNTRACE_COMP("squeezeGeneric");
     size_t count = sizeOfData(inputShape.type, inputShape.dimensions);
     memcpy(outputData, inputData, count);
     return true;
@@ -201,6 +222,7 @@
 bool transposeGeneric(const uint8_t* inputData, const Shape& inputShape,
                       const int32_t* perm, const Shape& permShape,
                       uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("transposeGeneric");
     // Reverse the permuted axes and convert to 4D due to the way Dims are
     // constructed.
     const int32_t kOutputDimensionNum = 4;
@@ -215,6 +237,7 @@
         reversed_perm[k] = k;
     }
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+        NNTRACE_COMP_SWITCH("optimized_ops::Transpose::float");
         tflite::reference_ops::Transpose(
                 reinterpret_cast<const float*>(inputData),
                 convertShapeToDims(inputShape),
@@ -222,6 +245,7 @@
                 convertShapeToDims(outputShape),
                 reversed_perm);
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("optimized_ops::Transpose::uint8");
         tflite::reference_ops::Transpose(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
diff --git a/common/operations/SVDF.cpp b/common/operations/SVDF.cpp
index 38224a6..cdba351 100644
--- a/common/operations/SVDF.cpp
+++ b/common/operations/SVDF.cpp
@@ -19,6 +19,8 @@
 #include "CpuExecutor.h"
 #include "HalInterfaces.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -38,6 +40,7 @@
 
 SVDF::SVDF(const Operation& operation,
            std::vector<RunTimeOperandInfo>& operands) {
+    NNTRACE_TRANS("SVDF::SVDF");
     input_ = GetInput(operation, operands, kInputTensor);
     weights_feature_ = GetInput(operation, operands, kWeightsFeatureTensor);
     weights_time_ = GetInput(operation, operands, kWeightsTimeTensor);
@@ -56,6 +59,7 @@
                    std::vector<RunTimeOperandInfo> &operands,
                    Shape *stateShape,
                    Shape *outputShape) {
+  NNTRACE_TRANS("SVDF::Prepare");
   // Check we have all the inputs and outputs we need.
   const int num_inputs = NumInputsWithValues(operation, operands);
 
@@ -103,6 +107,8 @@
 }
 
 bool SVDF::Eval() {
+    NNTRACE_COMP("SVDF::Eval");
+
     const int rank = params_.rank_;
     const int batch_size = SizeOfDimension(input_, 0);
     const int input_size = SizeOfDimension(input_, 1);
diff --git a/common/operations/SimpleMath.cpp b/common/operations/SimpleMath.cpp
index 045ac97..e842439 100644
--- a/common/operations/SimpleMath.cpp
+++ b/common/operations/SimpleMath.cpp
@@ -24,6 +24,8 @@
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -50,9 +52,11 @@
                 const float* in2, const Shape& shape2,
                 int32_t activation,
                 float* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("addFloat32");
     bool needBroadcast = !SameShape(shape1, shape2);
 
     if (needBroadcast) {
+        NNTRACE_COMP_SWITCH("optimized_ops::BroadcastAdd");
         #define ANDROID_NN_BROADCAST_ADD(activation)                                              \
             tflite::optimized_ops::BroadcastAdd<tflite::FusedActivationFunctionType::activation>( \
                     in1, convertShapeToDims(shape1),                                              \
@@ -66,6 +70,7 @@
         CalculateActivationRangeFloat(activation, &output_activation_min,
                                       &output_activation_max);
 
+        NNTRACE_COMP_SWITCH("optimized_ops::Add");
         tflite::optimized_ops::Add(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
@@ -80,6 +85,7 @@
                const uint8_t* in2, const Shape& shape2,
                int32_t activation,
                uint8_t* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("addQuant8");
     bool needBroadcast = !SameShape(shape1, shape2);
 
     const int32_t input1_offset = -shape1.offset;
@@ -118,6 +124,7 @@
                                   &output_activation_max);
 
     if (needBroadcast) {
+        NNTRACE_COMP_SWITCH("optimized_ops::BroadcastAdd");
         tflite::optimized_ops::BroadcastAdd(
                 left_shift,
                 in1, convertShapeToDims(shape1),
@@ -128,6 +135,7 @@
                 output_activation_min, output_activation_max,
                 out, convertShapeToDims(shapeOut));
     } else {
+        NNTRACE_COMP_SWITCH("optimized_ops::Add");
         #define ANDROID_NN_NORMAL_ADD(activation)                                        \
             tflite::optimized_ops::Add<tflite::FusedActivationFunctionType::activation>( \
                     left_shift,                                                          \
@@ -150,10 +158,12 @@
                 const float* in2, const Shape& shape2,
                 int32_t activation,
                 float* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("mulFloat32");
     bool needBroadcast = !SameShape(shape1, shape2);
 
     if (needBroadcast) {
-    #define ANDROID_NN_BROADCAST_MUL(activation)                                              \
+        NNTRACE_COMP_SWITCH("optimized_ops::BroadcastMul");
+        #define ANDROID_NN_BROADCAST_MUL(activation)                                          \
         tflite::optimized_ops::BroadcastMul<tflite::FusedActivationFunctionType::activation>( \
                 in1, convertShapeToDims(shape1),                                              \
                 in2, convertShapeToDims(shape2),                                              \
@@ -166,6 +176,7 @@
         CalculateActivationRangeFloat(activation, &output_activation_min,
                                       &output_activation_max);
 
+        NNTRACE_COMP_SWITCH("optimized_ops::Mul");
         tflite::optimized_ops::Mul(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
@@ -180,6 +191,7 @@
                const uint8_t* in2, const Shape& shape2,
                int32_t activation,
                uint8_t* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("mulQuant8");
     const int32_t input1_offset = -shape1.offset;
     const int32_t input2_offset = -shape2.offset;
     const int32_t output_offset = shapeOut.offset;
@@ -198,6 +210,7 @@
                                   &output_activation_max);
 
     // Use BROADCAST version to handle the normal case.
+    NNTRACE_COMP_SWITCH("optimized_ops::BroadcastMul");
     tflite::optimized_ops::BroadcastMul(
                 in1, convertShapeToDims(shape1), input1_offset,
                 in2, convertShapeToDims(shape2), input2_offset,
@@ -211,7 +224,9 @@
 bool floorFloat32(const float* inputData,
                   float* outputData,
                   const Shape& shape) {
+    NNTRACE_TRANS("floorFloat32");
     tflite::Dims<4> dim = convertShapeToDims(shape);
+    NNTRACE_COMP_SWITCH("optimized_ops::Floor");
     tflite::optimized_ops::Floor(inputData, dim, outputData, dim);
     return true;
 }
@@ -219,7 +234,9 @@
 bool dequantizeQuant8ToFloat32(const uint8_t* inputData,
                                float* outputData,
                                const Shape& shape) {
+    NNTRACE_TRANS("dequantizeQuant8ToFloat32");
     tflite::Dims<4> dim = convertShapeToDims(shape);
+    NNTRACE_COMP_SWITCH("optimized_ops::Dequantize");
     tflite::optimized_ops::Dequantize(inputData, dim,
                                       shape.offset, shape.scale,
                                       outputData, dim);
@@ -230,18 +247,21 @@
                 const float* in2, const Shape& shape2,
                 int32_t activation,
                 float* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("subFloat32");
     float output_activation_min, output_activation_max;
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
     bool needBroadcast = !SameShape(shape1, shape2);
     if (needBroadcast) {
+        NNTRACE_COMP_SWITCH("optimized_ops::BroadcastSub");
         tflite::optimized_ops::BroadcastSub(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
                 output_activation_min, output_activation_max,
                 out, convertShapeToDims(shapeOut));
     } else {
+        NNTRACE_COMP_SWITCH("optimized_ops::Sub");
         tflite::optimized_ops::Sub(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
@@ -255,18 +275,21 @@
                 const float* in2, const Shape& shape2,
                 int32_t activation,
                 float* out, const Shape& shapeOut) {
+    NNTRACE_TRANS("divFloat32");
     float output_activation_min, output_activation_max;
     CalculateActivationRangeFloat(activation, &output_activation_min,
                                   &output_activation_max);
 
     bool needBroadcast = !SameShape(shape1, shape2);
     if (needBroadcast) {
+        NNTRACE_COMP_SWITCH("optimized_ops::BroadcastDiv");
         tflite::optimized_ops::BroadcastDiv(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
                 output_activation_min, output_activation_max,
                 out, convertShapeToDims(shapeOut));
     } else {
+        NNTRACE_COMP_SWITCH("optimized_ops::Div");
         tflite::optimized_ops::Div(
                 in1, convertShapeToDims(shape1),
                 in2, convertShapeToDims(shape2),
@@ -279,6 +302,7 @@
 bool meanGeneric(const uint8_t* inputData, const Shape& inputShape,
                  const int32_t* axis, const Shape& axisShape, bool keepDims,
                  uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("meanGeneric");
     // Creates a temp index to iterate through input data.
     int32_t* scratchBuffer = new int32_t[getNumberOfDimensions(inputShape)];
 
@@ -293,6 +317,7 @@
             LOG(ERROR) << "Failed to allocate tempSumBuffer for MEAN";
             result = false;
         } else {
+            NNTRACE_COMP_SWITCH("optimized_ops::Mean");
             tflite::reference_ops::Mean<float, float>(
                     const_cast<float*>(reinterpret_cast<const float*>(inputData)),
                     reinterpret_cast<const int*>(inputShape.dimensions.data()),
@@ -310,6 +335,7 @@
             LOG(ERROR) << "Failed to allocate tempSumBuffer for MEAN";
             result = false;
         } else {
+            NNTRACE_COMP_SWITCH("optimized_ops::Mean");
             tflite::reference_ops::Mean<uint8_t, int32_t>(
                     const_cast<uint8_t*>(inputData),
                     reinterpret_cast<const int*>(inputShape.dimensions.data()),
diff --git a/common/operations/StridedSlice.cpp b/common/operations/StridedSlice.cpp
index 9db9523..222c48d 100644
--- a/common/operations/StridedSlice.cpp
+++ b/common/operations/StridedSlice.cpp
@@ -23,6 +23,8 @@
 
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 
+#include "Tracing.h"
+
 namespace android {
 namespace nn {
 
@@ -31,6 +33,7 @@
                          const int32_t* stridesData,
                          int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask,
                          uint8_t* outputData, const Shape& outputShape) {
+    NNTRACE_TRANS("stridedSliceGeneric");
     // This Op only supports 1-4D cases and since we use the reference 4D
     // implementation, the 1-3D tensors are mapped to 4D.
     const int kMaxDim = 4;
@@ -56,6 +59,7 @@
     endMask = ReverseMaskBits(endMask, numInputDims);
 
     if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+        NNTRACE_COMP_SWITCH("reference_ops::StridedSlice::float");
         tflite::reference_ops::StridedSlice(
                 reinterpret_cast<const float*>(inputData),
                 convertShapeToDims(inputShape),
@@ -64,6 +68,7 @@
                 reinterpret_cast<float*>(outputData),
                 convertShapeToDims(outputShape));
     } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+        NNTRACE_COMP_SWITCH("reference_ops::StridedSlice::uint8");
         tflite::reference_ops::StridedSlice(
                 reinterpret_cast<const uint8_t*>(inputData),
                 convertShapeToDims(inputShape),
diff --git a/driver/sample/Android.bp b/driver/sample/Android.bp
index 51a9df2..96a73de 100644
--- a/driver/sample/Android.bp
+++ b/driver/sample/Android.bp
@@ -27,6 +27,7 @@
     ],
     shared_libs: [
         "libbase",
+        "libcutils",
         "libdl",
         "libhardware",
         "libhidlbase",
diff --git a/driver/sample/SampleDriver.cpp b/driver/sample/SampleDriver.cpp
index a608852..d43f597 100644
--- a/driver/sample/SampleDriver.cpp
+++ b/driver/sample/SampleDriver.cpp
@@ -20,6 +20,7 @@
 
 #include "CpuExecutor.h"
 #include "HalInterfaces.h"
+#include "Tracing.h"
 #include "ValidateHal.h"
 
 #include <android-base/logging.h>
@@ -31,6 +32,8 @@
 namespace sample_driver {
 
 Return<void> SampleDriver::getCapabilities(getCapabilities_cb cb) {
+    NNTRACE_FULL(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_INITIALIZATION,
+                 "SampleDriver::getCapabilities");
     return getCapabilities_1_1(
         [&](ErrorStatus error, const V1_1::Capabilities& capabilities) {
             // TODO(dgross): Do we need to check compliantWithV1_0(capabilities)?
@@ -40,6 +43,8 @@
 
 Return<void> SampleDriver::getSupportedOperations(const V1_0::Model& model,
                                                   getSupportedOperations_cb cb) {
+    NNTRACE_FULL(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_COMPILATION,
+                 "SampleDriver::getSupportedOperations");
     if (!validateModel(model)) {
         VLOG(DRIVER) << "getSupportedOperations";
         std::vector<bool> supported;
@@ -51,6 +56,8 @@
 
 Return<ErrorStatus> SampleDriver::prepareModel(const V1_0::Model& model,
                                                const sp<IPreparedModelCallback>& callback) {
+    NNTRACE_FULL(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_COMPILATION,
+                 "SampleDriver::prepareModel");
     if (callback.get() == nullptr) {
         VLOG(DRIVER) << "prepareModel";
         LOG(ERROR) << "invalid callback passed to prepareModel";
@@ -68,6 +75,8 @@
 Return<ErrorStatus> SampleDriver::prepareModel_1_1(const V1_1::Model& model,
                                                    ExecutionPreference preference,
                                                    const sp<IPreparedModelCallback>& callback) {
+    NNTRACE_FULL(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_COMPILATION,
+                 "SampleDriver::prepareModel_1_1");
     if (VLOG_IS_ON(DRIVER)) {
         VLOG(DRIVER) << "prepareModel_1_1";
         logModelToInfo(model);
@@ -92,6 +101,8 @@
 }
 
 Return<DeviceStatus> SampleDriver::getStatus() {
+    NNTRACE_FULL(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_UNSPECIFIED,
+                 "SampleDriver::getStatus");
     VLOG(DRIVER) << "getStatus()";
     return DeviceStatus::AVAILABLE;
 }
@@ -113,12 +124,16 @@
 
 void SamplePreparedModel::asyncExecute(const Request& request,
                                        const sp<IExecutionCallback>& callback) {
+    NNTRACE_FULL(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_INPUTS_AND_OUTPUTS,
+                 "SampleDriver::asyncExecute");
     std::vector<RunTimePoolInfo> requestPoolInfos;
     if (!setRunTimePoolInfosFromHidlMemories(&requestPoolInfos, request.pools)) {
         callback->notify(ErrorStatus::GENERAL_FAILURE);
         return;
     }
 
+    NNTRACE_FULL_SWITCH(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_EXECUTION,
+                        "SampleDriver::asyncExecute");
     CpuExecutor executor;
     int n = executor.run(mModel, request, mPoolInfos, requestPoolInfos);
     VLOG(DRIVER) << "executor.run returned " << n;
@@ -132,6 +147,8 @@
 
 Return<ErrorStatus> SamplePreparedModel::execute(const Request& request,
                                                  const sp<IExecutionCallback>& callback) {
+    NNTRACE_FULL(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_EXECUTION,
+                 "SampleDriver::execute");
     VLOG(DRIVER) << "execute(" << SHOW_IF_DEBUG(toString(request)) << ")";
     if (callback.get() == nullptr) {
         LOG(ERROR) << "invalid callback passed to execute";
diff --git a/runtime/ExecutionBuilder.cpp b/runtime/ExecutionBuilder.cpp
index b5a472b..76d16d9 100644
--- a/runtime/ExecutionBuilder.cpp
+++ b/runtime/ExecutionBuilder.cpp
@@ -23,6 +23,7 @@
 #include "HalInterfaces.h"
 #include "Manager.h"
 #include "ModelBuilder.h"
+#include "Tracing.h"
 #include "Utils.h"
 
 #include <mutex>
@@ -219,6 +220,7 @@
 // Ensure that executionCallback->notify() is called.
 static void cpuFallbackFull(const ExecutionBuilder* executionBuilder,
                             const sp<ExecutionCallback>& executionCallback) {
+    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackFull");
     VLOG(EXECUTION) << "cpuFallbackFull";
     StepExecutor executor(executionBuilder, executionBuilder->getModel(),
                           nullptr /* no VersionedIDevice, so CPU */,
@@ -244,6 +246,7 @@
                                const ExecutionPlan* plan,
                                std::shared_ptr<ExecutionPlan::Controller> controller,
                                const sp<ExecutionCallback>& executionCallback) {
+    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackPartial");
     VLOG(EXECUTION) << "cpuFallbackPartial";
     std::shared_ptr<StepExecutor> executor;
     int n = plan->fallback(controller, &executor);
@@ -578,6 +581,7 @@
         }
     }
 
+    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "StepExecutor::startComputeOnDevice");
     // We separate the input & output pools so that we reduce the copying done if we
     // do an eventual remoting (hidl_memory->update()).  We could also use it to set
     // protection on read only memory but that's not currently done.
@@ -618,6 +622,9 @@
         request.pools[i] = mMemories[i]->getHidlMemory();
     }
 
+    NNTRACE_FULL_SWITCH(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
+                        "StepExecutor::startComputeOnDevice::execute");
+
     // Prepare the callback for asynchronous execution. sp<ExecutionCallback>
     // object is returned when the execution has been successfully launched,
     // otherwise a nullptr is returned. The executionCallback is abstracted in
@@ -649,6 +656,8 @@
     // TODO: Remove this synchronization point when the block of code below is
     // removed.
     executionCallback->wait();
+    NNTRACE_FULL_SWITCH(NNTRACE_LAYER_RUNTIME, NNTRACE_PHASE_EXECUTION,
+                        "StepExecutor::startComputeOnDevice::waited");
     Return<ErrorStatus> callbackStatus = executionCallback->getStatus();
     if (!callbackStatus.isOk() || callbackStatus != ErrorStatus::NONE) {
         VLOG(EXECUTION) << "**Execute async failed**";
@@ -661,6 +670,7 @@
     // TODO: Move this block of code somewhere else. It should not be in the
     // startCompute function.
     // TODO: outputMemory->update(); outputMemory->commit()
+    NNTRACE_RT_SWITCH(NNTRACE_PHASE_RESULTS, "StepExecutor::startComputeOnDevice");
     for (auto& info : mOutputs) {
         if (info.state == ModelArgumentInfo::POINTER) {
             DataLocation& loc = info.locationAndLength;
@@ -682,6 +692,7 @@
                                    const std::vector<RunTimePoolInfo>& modelPoolInfos,
                                    const std::vector<RunTimePoolInfo>& requestPoolInfos,
                                    const sp<IExecutionCallback>& executionCallback) {
+    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "asyncStartComputeOnCpu");
     CpuExecutor executor;
     int err = executor.run(model, request, modelPoolInfos, requestPoolInfos);
     executionCallback->notify(convertResultCodeToErrorStatus(err));
@@ -689,6 +700,8 @@
 
 int StepExecutor::startComputeOnCpu(sp<ExecutionCallback>* synchronizationCallback) {
     // TODO: use a thread pool
+    // TODO(mikie): this could have an NNTRACE so that we could measure the
+    //              overhead of spinning up a new thread.
 
     Model model;
     mModel->setHidlModel(&model);
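
In StepExecutor::startComputeOnDevice the runtime first marks argument marshalling as
RUNTIME/INPUTS_AND_OUTPUTS, switches to the IPC layer around the asynchronous execute() call,
switches back to RUNTIME once the callback has been waited on, and finally moves to the RESULTS
phase while copying outputs back to user buffers. A condensed sketch of that hand-off, under the
same assumption that *_SWITCH re-labels the remainder of the enclosing scope (helper names are
hypothetical):

    int tracedComputeOnDevice() {
        NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "tracedComputeOnDevice");
        buildHidlRequest();             // hypothetical helper: marshal inputs/outputs
        NNTRACE_FULL_SWITCH(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
                            "tracedComputeOnDevice::execute");
        launchOnDriverAndWait();        // hypothetical helper: HIDL call + callback wait
        NNTRACE_FULL_SWITCH(NNTRACE_LAYER_RUNTIME, NNTRACE_PHASE_EXECUTION,
                            "tracedComputeOnDevice::waited");
        checkCallbackStatus();          // hypothetical helper
        NNTRACE_RT_SWITCH(NNTRACE_PHASE_RESULTS, "tracedComputeOnDevice");
        copyOutputsToUserBuffers();     // hypothetical helper
        return ANEURALNETWORKS_NO_ERROR;
    }
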
diff --git a/runtime/ExecutionPlan.cpp b/runtime/ExecutionPlan.cpp
index c8f7550..5fd370d 100644
--- a/runtime/ExecutionPlan.cpp
+++ b/runtime/ExecutionPlan.cpp
@@ -23,6 +23,7 @@
 #include "ExecutionBuilder.h"
 #include "Manager.h"
 #include "ModelBuilder.h"
+#include "Tracing.h"
 #include "Utils.h"
 
 #include <functional>
@@ -46,6 +47,10 @@
     model->setHidlModel(&hidlModel);
 
     sp<PreparedModelCallback> preparedModelCallback = new PreparedModelCallback();
+
+    // Note that some of the work within VersionedIDevice will be subtracted
+    // from the IPC layer.
+    NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "prepareModel");
     Return<ErrorStatus> prepareLaunchStatus = device->getInterface()->prepareModel(
         hidlModel, static_cast<ExecutionPreference>(executionPreference), preparedModelCallback);
     if (!prepareLaunchStatus.isOk()) {
diff --git a/runtime/NeuralNetworks.cpp b/runtime/NeuralNetworks.cpp
index 6f1dcca..e384f3d 100644
--- a/runtime/NeuralNetworks.cpp
+++ b/runtime/NeuralNetworks.cpp
@@ -29,6 +29,7 @@
 #include "Memory.h"
 #include "NeuralNetworksOEM.h"
 #include "ModelBuilder.h"
+#include "Tracing.h"
 #include "Utils.h"
 
 #include <memory>
@@ -251,6 +252,7 @@
 
 int ANeuralNetworksMemory_createFromFd(size_t size, int prot, int fd, size_t offset,
                                        ANeuralNetworksMemory** memory) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksMemory_createFromFd");
     *memory = nullptr;
     std::unique_ptr<MemoryFd> m = std::make_unique<MemoryFd>();
     if (m == nullptr) {
@@ -265,12 +267,14 @@
 }
 
 void ANeuralNetworksMemory_free(ANeuralNetworksMemory* memory) {
+    NNTRACE_RT(NNTRACE_PHASE_TERMINATION, "ANeuralNetworksMemory_free");
     // No validation.  Free of nullptr is valid.
     Memory* m = reinterpret_cast<Memory*>(memory);
     delete m;
 }
 
 int ANeuralNetworksModel_create(ANeuralNetworksModel** model) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_create");
     initVLogMask();
     if (!model) {
         LOG(ERROR) << "ANeuralNetworksModel_create passed a nullptr";
@@ -286,12 +290,14 @@
 }
 
 void ANeuralNetworksModel_free(ANeuralNetworksModel* model) {
+    NNTRACE_RT(NNTRACE_PHASE_TERMINATION, "ANeuralNetworksModel_free");
     // No validation.  Free of nullptr is valid.
     ModelBuilder* m = reinterpret_cast<ModelBuilder*>(model);
     delete m;
 }
 
 int ANeuralNetworksModel_finish(ANeuralNetworksModel* model) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_finish");
     if (!model) {
         LOG(ERROR) << "ANeuralNetworksModel_finish passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -302,6 +308,7 @@
 
 int ANeuralNetworksModel_addOperand(ANeuralNetworksModel* model,
                                     const ANeuralNetworksOperandType* type) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_addOperand");
     if (!model || !type) {
         LOG(ERROR) << "ANeuralNetworksModel_addOperand passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -312,6 +319,7 @@
 
 int ANeuralNetworksModel_setOperandValue(ANeuralNetworksModel* model, int32_t index,
                                          const void* buffer, size_t length) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_setOperandValue");
     if (!model || (!buffer && length != 0)) {
         LOG(ERROR) << "ANeuralNetworksModel_setOperandValue passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -323,6 +331,7 @@
 int ANeuralNetworksModel_setOperandValueFromMemory(ANeuralNetworksModel* model, int32_t index,
                                                    const ANeuralNetworksMemory* memory,
                                                    size_t offset, size_t length) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_setOperandValueFromMemory");
     if (!model || !memory) {
         LOG(ERROR) << "ANeuralNetworksModel_setOperandValue passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -336,6 +345,7 @@
                                       ANeuralNetworksOperationType type, uint32_t inputCount,
                                       const uint32_t* inputs, uint32_t outputCount,
                                       const uint32_t* outputs) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_addOperation");
     if (!model || !inputs || !outputs) {
         LOG(ERROR) << "ANeuralNetworksModel_addOperation passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -347,6 +357,7 @@
 int ANeuralNetworksModel_identifyInputsAndOutputs(ANeuralNetworksModel* model, uint32_t inputCount,
                                                   const uint32_t* inputs, uint32_t outputCount,
                                                   const uint32_t* outputs) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_identifyInputsAndOutputs");
     if (!model || !inputs || !outputs) {
         LOG(ERROR) << ("ANeuralNetworksModel_identifyInputsAndOutputs passed a nullptr");
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -357,6 +368,7 @@
 
 int ANeuralNetworksModel_relaxComputationFloat32toFloat16(ANeuralNetworksModel* model,
                                                           bool allow) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_relaxComputationFloat32toFloat16");
     if (!model) {
         LOG(ERROR) << ("ANeuralNetworksModel_relaxComputationFloat32toFloat16 passed a nullptr");
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -367,6 +379,7 @@
 
 int ANeuralNetworksCompilation_create(ANeuralNetworksModel* model,
                                       ANeuralNetworksCompilation** compilation) {
+    NNTRACE_RT(NNTRACE_PHASE_COMPILATION, "ANeuralNetworksCompilation_create");
     if (!model || !compilation) {
         LOG(ERROR) << "ANeuralNetworksCompilation_create passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -380,6 +393,7 @@
 }
 
 void ANeuralNetworksCompilation_free(ANeuralNetworksCompilation* compilation) {
+    NNTRACE_RT(NNTRACE_PHASE_TERMINATION, "ANeuralNetworksCompilation_free");
     // No validation.  Free of nullptr is valid.
     // TODO specification says that a compilation-in-flight can be deleted
     CompilationBuilder* c = reinterpret_cast<CompilationBuilder*>(compilation);
@@ -388,6 +402,7 @@
 
 int ANeuralNetworksCompilation_setPreference(ANeuralNetworksCompilation* compilation,
                                              int32_t preference) {
+    NNTRACE_RT(NNTRACE_PHASE_COMPILATION, "ANeuralNetworksCompilation_setPreference");
     if (!compilation) {
         LOG(ERROR) << "ANeuralNetworksCompilation_setPreference passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -397,6 +412,7 @@
 }
 
 int ANeuralNetworksCompilation_finish(ANeuralNetworksCompilation* compilation) {
+    NNTRACE_RT(NNTRACE_PHASE_COMPILATION, "ANeuralNetworksCompilation_finish");
     if (!compilation) {
         LOG(ERROR) << "ANeuralNetworksCompilation_finish passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -407,6 +423,7 @@
 
 int ANeuralNetworksExecution_create(ANeuralNetworksCompilation* compilation,
                                     ANeuralNetworksExecution** execution) {
+    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "ANeuralNetworksExecution_create");
     if (!compilation || !execution) {
         LOG(ERROR) << "ANeuralNetworksExecution_create passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -420,6 +437,7 @@
 }
 
 void ANeuralNetworksExecution_free(ANeuralNetworksExecution* execution) {
+    NNTRACE_RT(NNTRACE_PHASE_TERMINATION, "ANeuralNetworksExecution_free");
     // TODO specification says that an execution-in-flight can be deleted
     // No validation.  Free of nullptr is valid.
     ExecutionBuilder* r = reinterpret_cast<ExecutionBuilder*>(execution);
@@ -429,6 +447,7 @@
 int ANeuralNetworksExecution_setInput(ANeuralNetworksExecution* execution, int32_t index,
                                       const ANeuralNetworksOperandType* type, const void* buffer,
                                       size_t length) {
+    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "ANeuralNetworksExecution_setInput");
     if (!execution || (!buffer && length != 0)) {
         LOG(ERROR) << "ANeuralNetworksExecution_setInput passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -441,6 +460,7 @@
                                                 const ANeuralNetworksOperandType* type,
                                                 const ANeuralNetworksMemory* memory, size_t offset,
                                                 size_t length) {
+    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "ANeuralNetworksExecution_setInputFromMemory");
     if (!execution || !memory) {
         LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -454,6 +474,7 @@
 int ANeuralNetworksExecution_setOutput(ANeuralNetworksExecution* execution, int32_t index,
                                        const ANeuralNetworksOperandType* type, void* buffer,
                                        size_t length) {
+    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "ANeuralNetworksExecution_setOutput");
     if (!execution || (!buffer && length != 0)) {
         LOG(ERROR) << "ANeuralNetworksExecution_setOutput passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -466,6 +487,7 @@
                                                  const ANeuralNetworksOperandType* type,
                                                  const ANeuralNetworksMemory* memory, size_t offset,
                                                  size_t length) {
+    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "ANeuralNetworksExecution_setOutputFromMemory");
     if (!execution || !memory) {
         LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -478,6 +500,7 @@
 
 int ANeuralNetworksExecution_startCompute(ANeuralNetworksExecution* execution,
                                           ANeuralNetworksEvent** event) {
+    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "ANeuralNetworksExecution_startCompute");
     if (!execution || !event) {
         LOG(ERROR) << "ANeuralNetworksExecution_startCompute passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -504,6 +527,7 @@
 }
 
 int ANeuralNetworksEvent_wait(ANeuralNetworksEvent* event) {
+    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "ANeuralNetworksEvent_wait");
     if (event == nullptr) {
         LOG(ERROR) << "ANeuralNetworksEvent_wait passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -515,6 +539,7 @@
 }
 
 void ANeuralNetworksEvent_free(ANeuralNetworksEvent* event) {
+    NNTRACE_RT(NNTRACE_PHASE_TERMINATION, "ANeuralNetworksEvent_free");
     // No validation.  Free of nullptr is valid.
     if (event) {
         sp<ExecutionCallback>* e = reinterpret_cast<sp<ExecutionCallback>*>(event);
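
With each public NDK entry point traced at the runtime layer, a typical caller's sequence maps
onto the phases as annotated below. This is a sketch of ordinary application code (operand setup,
real buffers, and error checking are omitted), not code added by this change:

    ANeuralNetworksModel* model = nullptr;
    ANeuralNetworksCompilation* compilation = nullptr;
    ANeuralNetworksExecution* execution = nullptr;
    ANeuralNetworksEvent* event = nullptr;

    ANeuralNetworksModel_create(&model);                        // PREPARATION
    // ... add operands and operations, identify inputs/outputs ...
    ANeuralNetworksModel_finish(model);                         // PREPARATION
    ANeuralNetworksCompilation_create(model, &compilation);     // COMPILATION
    ANeuralNetworksCompilation_finish(compilation);             // COMPILATION
    ANeuralNetworksExecution_create(compilation, &execution);   // EXECUTION
    // ... ANeuralNetworksExecution_setInput/setOutput calls ...// INPUTS_AND_OUTPUTS
    ANeuralNetworksExecution_startCompute(execution, &event);   // EXECUTION
    ANeuralNetworksEvent_wait(event);                           // EXECUTION
    ANeuralNetworksEvent_free(event);                           // TERMINATION
    ANeuralNetworksExecution_free(execution);                   // TERMINATION
    ANeuralNetworksCompilation_free(compilation);               // TERMINATION
    ANeuralNetworksModel_free(model);                           // TERMINATION
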
diff --git a/runtime/VersionedIDevice.cpp b/runtime/VersionedIDevice.cpp
index 110eecc..1226552 100644
--- a/runtime/VersionedIDevice.cpp
+++ b/runtime/VersionedIDevice.cpp
@@ -16,6 +16,7 @@
 
 #include "VersionedIDevice.h"
 
+#include "Tracing.h"
 #include "Utils.h"
 
 #include <android-base/logging.h>
@@ -32,6 +33,7 @@
     std::pair<ErrorStatus, Capabilities> result;
 
     if (mDeviceV1_1 != nullptr) {
+        NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_INITIALIZATION, "getCapabilities_1_1");
         Return<void> ret = mDeviceV1_1->getCapabilities_1_1(
             [&result](ErrorStatus error, const Capabilities& capabilities) {
                 result = std::make_pair(error, capabilities);
@@ -41,8 +43,10 @@
             return {ErrorStatus::GENERAL_FAILURE, {}};
         }
     } else if (mDeviceV1_0 != nullptr) {
+        NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_INITIALIZATION, "getCapabilities");
         Return<void> ret = mDeviceV1_0->getCapabilities(
             [&result](ErrorStatus error, const V1_0::Capabilities& capabilities) {
+                // The time taken to convert the capabilities is trivial.
                 result = std::make_pair(error, convertToV1_1(capabilities));
             });
         if (!ret.isOk()) {
@@ -62,6 +66,7 @@
     std::pair<ErrorStatus, hidl_vec<bool>> result;
 
     if (mDeviceV1_1 != nullptr) {
+        NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "getSupportedOperations_1_1");
         Return<void> ret = mDeviceV1_1->getSupportedOperations_1_1(
             model, [&result](ErrorStatus error, const hidl_vec<bool>& supported) {
                 result = std::make_pair(error, supported);
@@ -71,8 +76,10 @@
             return {ErrorStatus::GENERAL_FAILURE, {}};
         }
     } else if (mDeviceV1_0 != nullptr && compliantWithV1_0(model)) {
+        V1_0::Model model10 = convertToV1_0(model);
+        NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "getSupportedOperations_1_0");
         Return<void> ret = mDeviceV1_0->getSupportedOperations(
-            convertToV1_0(model), [&result](ErrorStatus error, const hidl_vec<bool>& supported) {
+            model10, [&result](ErrorStatus error, const hidl_vec<bool>& supported) {
                 result = std::make_pair(error, supported);
             });
         if (!ret.isOk()) {
@@ -98,17 +105,35 @@
             return ErrorStatus::GENERAL_FAILURE;
         }
         return static_cast<ErrorStatus>(ret);
-    } else if (mDeviceV1_0 != nullptr && compliantWithV1_0(model)) {
-        Return<ErrorStatus> ret = mDeviceV1_0->prepareModel(convertToV1_0(model), callback);
-        if (!ret.isOk()) {
-            LOG(ERROR) << "prepareModel failure: " << ret.description();
+    } else if (mDeviceV1_0 != nullptr) {
+        bool compliant = false;
+        V1_0::Model model10;
+        {
+            // Attribute time spent in model inspection and conversion to
+            // Runtime, as the time may be substantial (0.03ms for mobilenet,
+            // but could be larger for other models).
+            NNTRACE_FULL_SUBTRACT(NNTRACE_LAYER_RUNTIME, NNTRACE_PHASE_COMPILATION,
+                                  "VersionedIDevice::prepareModel");
+            compliant = compliantWithV1_0(model);
+            if (compliant) {
+                model10 = convertToV1_0(model);  // copy is elided
+            }
+        }
+        if (compliant) {
+            Return<ErrorStatus> ret = mDeviceV1_0->prepareModel(model10, callback);
+            if (!ret.isOk()) {
+                LOG(ERROR) << "prepareModel failure: " << ret.description();
+                return ErrorStatus::GENERAL_FAILURE;
+            }
+            return static_cast<ErrorStatus>(ret);
+        } else {
+            // TODO: partition the model such that v1.1 ops are not passed to v1.0
+            // device
+            LOG(ERROR) << "Could not handle prepareModel!";
             return ErrorStatus::GENERAL_FAILURE;
         }
-        return static_cast<ErrorStatus>(ret);
     } else {
-        // TODO: partition the model such that v1.1 ops are not passed to v1.0
-        // device
-        LOG(ERROR) << "Could not handle prepareModel!";
+        LOG(ERROR) << "prepareModel called with no device";
         return ErrorStatus::GENERAL_FAILURE;
     }
 }
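
Here the V1_0 compliance check and model conversion run under NNTRACE_FULL_SUBTRACT so that this
runtime-side work can be deducted from the enclosing IPC-layer "prepareModel" slice opened in
ExecutionPlan.cpp. A schematic of the nesting, assuming the analysis tooling subtracts
SUBTRACT-marked child slices from their parent (simplified, not the exact code above):

    // Caller side (runtime): the whole call is initially billed to the IPC layer.
    NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "prepareModel");
    device->getInterface()->prepareModel(hidlModel, preference, callback);

    // Inside VersionedIDevice::prepareModel: the conversion is runtime overhead,
    // so its time is marked for subtraction from the enclosing IPC slice.
    V1_0::Model model10;
    {
        NNTRACE_FULL_SUBTRACT(NNTRACE_LAYER_RUNTIME, NNTRACE_PHASE_COMPILATION,
                              "VersionedIDevice::prepareModel");
        model10 = convertToV1_0(model);
    }
    mDeviceV1_0->prepareModel(model10, callback);  // remaining time stays on the IPC layer
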
diff --git a/runtime/test/Android.bp b/runtime/test/Android.bp
index 7dbb28a..addc5c0 100644
--- a/runtime/test/Android.bp
+++ b/runtime/test/Android.bp
@@ -22,6 +22,7 @@
     shared_libs: [
         "libandroid",
         "libbase",
+        "libcutils",
         "libhidlbase",
         "libhidltransport",
         "libhidlmemory",
diff --git a/runtime/test/GeneratedUtils.cpp b/runtime/test/GeneratedUtils.cpp
index fc5671f..a9a33fa 100644
--- a/runtime/test/GeneratedUtils.cpp
+++ b/runtime/test/GeneratedUtils.cpp
@@ -31,6 +31,17 @@
 #include <map>
 #include <thread>
 
+// Systrace is not available from CTS tests due to platform layering
+// constraints. We reuse the NNTEST_ONLY_PUBLIC_API flag, as that should also be
+// the case for CTS (public APIs only).
+#ifndef NNTEST_ONLY_PUBLIC_API
+#include "Tracing.h"
+#else
+#define NNTRACE_FULL_RAW(...)
+#define NNTRACE_APP(...)
+#define NNTRACE_APP_SWITCH(...)
+#endif
+
 namespace generated_tests {
 using namespace android::nn::wrapper;
 using namespace test_helper;
@@ -62,9 +73,13 @@
 }
 
 Compilation createAndCompileModel(Model* model, std::function<void(Model*)> createModel) {
+    NNTRACE_APP(NNTRACE_PHASE_PREPARATION, "createAndCompileModel");
+
     createModel(model);
     model->finish();
     graphDump("", *model);
+
+    NNTRACE_APP_SWITCH(NNTRACE_PHASE_COMPILATION, "createAndCompileModel");
     Compilation compilation(model);
     compilation.finish();
 
@@ -86,6 +101,7 @@
     // If in relaxed mode, set the error range to be 5ULP of FP16.
     float fpRange = !model->isRelaxed() ? 1e-5f : 5.0f * 0.0009765625f;
     for (auto& example : examples) {
+        NNTRACE_APP(NNTRACE_PHASE_EXECUTION, "executeWithCompilation example");
         SCOPED_TRACE(exampleNo);
         // TODO: We leave it as a copy here.
         // Should verify if the input gets modified by the test later.
@@ -94,6 +110,7 @@
 
         Execution execution(compilation);
 
+        NNTRACE_APP_SWITCH(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "executeWithCompilation example");
         // Set all inputs
         for_all(inputs, [&execution](int idx, const void* p, size_t s) {
             const void* buffer = s == 0 ? nullptr : p;
@@ -108,9 +125,11 @@
             ASSERT_EQ(Result::NO_ERROR, execution.setOutput(idx, buffer, s));
         });
 
+        NNTRACE_APP_SWITCH(NNTRACE_PHASE_EXECUTION, "executeWithCompilation example");
         Result r = execution.compute();
         ASSERT_EQ(Result::NO_ERROR, r);
 
+        NNTRACE_APP_SWITCH(NNTRACE_PHASE_RESULTS, "executeWithCompilation example");
         // Dump all outputs for the slicing tool
         if (dumpToFile) {
             s << "output" << exampleNo << " = {\n";
@@ -133,6 +152,7 @@
                  std::function<bool(int)> isIgnored,
                  std::vector<MixedTypedExample>& examples,
                  std::string dumpFile) {
+    NNTRACE_APP(NNTRACE_PHASE_OVERALL, "executeOnce");
     Model model;
     Compilation compilation = createAndCompileModel(&model, createModel);
     executeWithCompilation(&model, &compilation, isIgnored, examples, dumpFile);
@@ -142,6 +162,7 @@
 void executeMultithreadedOwnCompilation(std::function<void(Model*)> createModel,
                                         std::function<bool(int)> isIgnored,
                                         std::vector<MixedTypedExample>& examples) {
+    NNTRACE_APP(NNTRACE_PHASE_OVERALL, "executeMultithreadedOwnCompilation");
     SCOPED_TRACE("MultithreadedOwnCompilation");
     std::vector<std::thread> threads;
     for (int i = 0; i < 10; i++) {
@@ -157,6 +178,7 @@
 void executeMultithreadedSharedCompilation(std::function<void(Model*)> createModel,
                                            std::function<bool(int)> isIgnored,
                                            std::vector<MixedTypedExample>& examples) {
+    NNTRACE_APP(NNTRACE_PHASE_OVERALL, "executeMultithreadedSharedCompilation");
     SCOPED_TRACE("MultithreadedSharedCompilation");
     Model model;
     Compilation compilation = createAndCompileModel(&model, createModel);