NNAPI systrace for timing statistics

This adds systrace tracing to NNAPI. The tracing
will be helpful for:
- measuring where time is currently spent
- diagnosing and improving performance
- benchmarking

TODOs:
- Write analysis tools for traces

Change-Id: I9026f1043428cb715b577901bec3a2e1e39a82e3
Merged-In: I9026f1043428cb715b577901bec3a2e1e39a82e3
Bug: 78137932
Test: manually run systrace.py against unit tests
Test: manually run systrace.py against benchmarking app
(cherry picked from commit e9e637ab73b68b5982281a3f7c621f6a75d51743)
diff --git a/runtime/ExecutionBuilder.cpp b/runtime/ExecutionBuilder.cpp
index b5a472b..76d16d9 100644
--- a/runtime/ExecutionBuilder.cpp
+++ b/runtime/ExecutionBuilder.cpp
@@ -23,6 +23,7 @@
 #include "HalInterfaces.h"
 #include "Manager.h"
 #include "ModelBuilder.h"
+#include "Tracing.h"
 #include "Utils.h"
 
 #include <mutex>
@@ -219,6 +220,7 @@
 // Ensure that executionCallback->notify() is called.
 static void cpuFallbackFull(const ExecutionBuilder* executionBuilder,
                             const sp<ExecutionCallback>& executionCallback) {
+    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackFull");
     VLOG(EXECUTION) << "cpuFallbackFull";
     StepExecutor executor(executionBuilder, executionBuilder->getModel(),
                           nullptr /* no VersionedIDevice, so CPU */,
@@ -244,6 +246,7 @@
                                const ExecutionPlan* plan,
                                std::shared_ptr<ExecutionPlan::Controller> controller,
                                const sp<ExecutionCallback>& executionCallback) {
+    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackPartial");
     VLOG(EXECUTION) << "cpuFallbackPartial";
     std::shared_ptr<StepExecutor> executor;
     int n = plan->fallback(controller, &executor);
@@ -578,6 +581,7 @@
         }
     }
 
+    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "StepExecutor::startComputeOnDevice");
     // We separate the input & output pools so that we reduce the copying done if we
     // do an eventual remoting (hidl_memory->update()).  We could also use it to set
     // protection on read only memory but that's not currently done.
@@ -618,6 +622,9 @@
         request.pools[i] = mMemories[i]->getHidlMemory();
     }
 
+    NNTRACE_FULL_SWITCH(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
+                        "StepExecutor::startComputeOnDevice::execute");
+
     // Prepare the callback for asynchronous execution. sp<ExecutionCallback>
     // object is returned when the execution has been successfully launched,
     // otherwise a nullptr is returned. The executionCallback is abstracted in
@@ -649,6 +656,8 @@
     // TODO: Remove this synchronization point when the block of code below is
     // removed.
     executionCallback->wait();
+    NNTRACE_FULL_SWITCH(NNTRACE_LAYER_RUNTIME, NNTRACE_PHASE_EXECUTION,
+                        "StepExecutor::startComputeOnDevice::waited");
     Return<ErrorStatus> callbackStatus = executionCallback->getStatus();
     if (!callbackStatus.isOk() || callbackStatus != ErrorStatus::NONE) {
         VLOG(EXECUTION) << "**Execute async failed**";
@@ -661,6 +670,7 @@
     // TODO: Move this block of code somewhere else. It should not be in the
     // startCompute function.
     // TODO: outputMemory->update(); outputMemory->commit()
+    NNTRACE_RT_SWITCH(NNTRACE_PHASE_RESULTS, "StepExecutor::startComputeOnDevice");
     for (auto& info : mOutputs) {
         if (info.state == ModelArgumentInfo::POINTER) {
             DataLocation& loc = info.locationAndLength;
@@ -682,6 +692,7 @@
                                    const std::vector<RunTimePoolInfo>& modelPoolInfos,
                                    const std::vector<RunTimePoolInfo>& requestPoolInfos,
                                    const sp<IExecutionCallback>& executionCallback) {
+    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "asyncStartComputeOnCpu");
     CpuExecutor executor;
     int err = executor.run(model, request, modelPoolInfos, requestPoolInfos);
     executionCallback->notify(convertResultCodeToErrorStatus(err));
@@ -689,6 +700,8 @@
 
 int StepExecutor::startComputeOnCpu(sp<ExecutionCallback>* synchronizationCallback) {
     // TODO: use a thread pool
+    // TODO(mikie): this could have NNTRACE so we could measure the overhead of
+    //              spinning up a new thread.
 
     Model model;
     mModel->setHidlModel(&model);
diff --git a/runtime/ExecutionPlan.cpp b/runtime/ExecutionPlan.cpp
index c8f7550..5fd370d 100644
--- a/runtime/ExecutionPlan.cpp
+++ b/runtime/ExecutionPlan.cpp
@@ -23,6 +23,7 @@
 #include "ExecutionBuilder.h"
 #include "Manager.h"
 #include "ModelBuilder.h"
+#include "Tracing.h"
 #include "Utils.h"
 
 #include <functional>
@@ -46,6 +47,10 @@
     model->setHidlModel(&hidlModel);
 
     sp<PreparedModelCallback> preparedModelCallback = new PreparedModelCallback();
+
+    // Note that some work within VersionedIDevice will be subtracted from the
+    // IPC layer
+    NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "prepareModel");
     Return<ErrorStatus> prepareLaunchStatus = device->getInterface()->prepareModel(
         hidlModel, static_cast<ExecutionPreference>(executionPreference), preparedModelCallback);
     if (!prepareLaunchStatus.isOk()) {
diff --git a/runtime/NeuralNetworks.cpp b/runtime/NeuralNetworks.cpp
index 6f1dcca..e384f3d 100644
--- a/runtime/NeuralNetworks.cpp
+++ b/runtime/NeuralNetworks.cpp
@@ -29,6 +29,7 @@
 #include "Memory.h"
 #include "NeuralNetworksOEM.h"
 #include "ModelBuilder.h"
+#include "Tracing.h"
 #include "Utils.h"
 
 #include <memory>
@@ -251,6 +252,7 @@
 
 int ANeuralNetworksMemory_createFromFd(size_t size, int prot, int fd, size_t offset,
                                        ANeuralNetworksMemory** memory) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksMemory_createFromFd");
     *memory = nullptr;
     std::unique_ptr<MemoryFd> m = std::make_unique<MemoryFd>();
     if (m == nullptr) {
@@ -265,12 +267,14 @@
 }
 
 void ANeuralNetworksMemory_free(ANeuralNetworksMemory* memory) {
+    NNTRACE_RT(NNTRACE_PHASE_TERMINATION, "ANeuralNetworksMemory_free");
     // No validation.  Free of nullptr is valid.
     Memory* m = reinterpret_cast<Memory*>(memory);
     delete m;
 }
 
 int ANeuralNetworksModel_create(ANeuralNetworksModel** model) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_create");
     initVLogMask();
     if (!model) {
         LOG(ERROR) << "ANeuralNetworksModel_create passed a nullptr";
@@ -286,12 +290,14 @@
 }
 
 void ANeuralNetworksModel_free(ANeuralNetworksModel* model) {
+    NNTRACE_RT(NNTRACE_PHASE_TERMINATION, "ANeuralNetworksModel_free");
     // No validation.  Free of nullptr is valid.
     ModelBuilder* m = reinterpret_cast<ModelBuilder*>(model);
     delete m;
 }
 
 int ANeuralNetworksModel_finish(ANeuralNetworksModel* model) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_finish");
     if (!model) {
         LOG(ERROR) << "ANeuralNetworksModel_finish passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -302,6 +308,7 @@
 
 int ANeuralNetworksModel_addOperand(ANeuralNetworksModel* model,
                                     const ANeuralNetworksOperandType* type) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_addOperand");
     if (!model || !type) {
         LOG(ERROR) << "ANeuralNetworksModel_addOperand passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -312,6 +319,7 @@
 
 int ANeuralNetworksModel_setOperandValue(ANeuralNetworksModel* model, int32_t index,
                                          const void* buffer, size_t length) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_setOperandValue");
     if (!model || (!buffer && length != 0)) {
         LOG(ERROR) << "ANeuralNetworksModel_setOperandValue passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -323,6 +331,7 @@
 int ANeuralNetworksModel_setOperandValueFromMemory(ANeuralNetworksModel* model, int32_t index,
                                                    const ANeuralNetworksMemory* memory,
                                                    size_t offset, size_t length) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_setOperandValueFromMemory");
     if (!model || !memory) {
         LOG(ERROR) << "ANeuralNetworksModel_setOperandValue passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -336,6 +345,7 @@
                                       ANeuralNetworksOperationType type, uint32_t inputCount,
                                       const uint32_t* inputs, uint32_t outputCount,
                                       const uint32_t* outputs) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_addOperation");
     if (!model || !inputs || !outputs) {
         LOG(ERROR) << "ANeuralNetworksModel_addOperation passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -347,6 +357,7 @@
 int ANeuralNetworksModel_identifyInputsAndOutputs(ANeuralNetworksModel* model, uint32_t inputCount,
                                                   const uint32_t* inputs, uint32_t outputCount,
                                                   const uint32_t* outputs) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_identifyInputsAndOutputs");
     if (!model || !inputs || !outputs) {
         LOG(ERROR) << ("ANeuralNetworksModel_identifyInputsAndOutputs passed a nullptr");
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -357,6 +368,7 @@
 
 int ANeuralNetworksModel_relaxComputationFloat32toFloat16(ANeuralNetworksModel* model,
                                                           bool allow) {
+    NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_relaxComputationFloat32toFloat16");
     if (!model) {
         LOG(ERROR) << ("ANeuralNetworksModel_relaxComputationFloat32toFloat16 passed a nullptr");
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -367,6 +379,7 @@
 
 int ANeuralNetworksCompilation_create(ANeuralNetworksModel* model,
                                       ANeuralNetworksCompilation** compilation) {
+    NNTRACE_RT(NNTRACE_PHASE_COMPILATION, "ANeuralNetworksCompilation_create");
     if (!model || !compilation) {
         LOG(ERROR) << "ANeuralNetworksCompilation_create passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -380,6 +393,7 @@
 }
 
 void ANeuralNetworksCompilation_free(ANeuralNetworksCompilation* compilation) {
+    NNTRACE_RT(NNTRACE_PHASE_TERMINATION, "ANeuralNetworksCompilation_free");
     // No validation.  Free of nullptr is valid.
     // TODO specification says that a compilation-in-flight can be deleted
     CompilationBuilder* c = reinterpret_cast<CompilationBuilder*>(compilation);
@@ -388,6 +402,7 @@
 
 int ANeuralNetworksCompilation_setPreference(ANeuralNetworksCompilation* compilation,
                                              int32_t preference) {
+    NNTRACE_RT(NNTRACE_PHASE_COMPILATION, "ANeuralNetworksCompilation_setPreference");
     if (!compilation) {
         LOG(ERROR) << "ANeuralNetworksCompilation_setPreference passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -397,6 +412,7 @@
 }
 
 int ANeuralNetworksCompilation_finish(ANeuralNetworksCompilation* compilation) {
+    NNTRACE_RT(NNTRACE_PHASE_COMPILATION, "ANeuralNetworksCompilation_finish");
     if (!compilation) {
         LOG(ERROR) << "ANeuralNetworksCompilation_finish passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -407,6 +423,7 @@
 
 int ANeuralNetworksExecution_create(ANeuralNetworksCompilation* compilation,
                                     ANeuralNetworksExecution** execution) {
+    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "ANeuralNetworksExecution_create");
     if (!compilation || !execution) {
         LOG(ERROR) << "ANeuralNetworksExecution_create passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -420,6 +437,7 @@
 }
 
 void ANeuralNetworksExecution_free(ANeuralNetworksExecution* execution) {
+    NNTRACE_RT(NNTRACE_PHASE_TERMINATION, "ANeuralNetworksExecution_free");
     // TODO specification says that an execution-in-flight can be deleted
     // No validation.  Free of nullptr is valid.
     ExecutionBuilder* r = reinterpret_cast<ExecutionBuilder*>(execution);
@@ -429,6 +447,7 @@
 int ANeuralNetworksExecution_setInput(ANeuralNetworksExecution* execution, int32_t index,
                                       const ANeuralNetworksOperandType* type, const void* buffer,
                                       size_t length) {
+    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "ANeuralNetworksExecution_setInput");
     if (!execution || (!buffer && length != 0)) {
         LOG(ERROR) << "ANeuralNetworksExecution_setInput passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -441,6 +460,7 @@
                                                 const ANeuralNetworksOperandType* type,
                                                 const ANeuralNetworksMemory* memory, size_t offset,
                                                 size_t length) {
+    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "ANeuralNetworksExecution_setInputFromMemory");
     if (!execution || !memory) {
         LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -454,6 +474,7 @@
 int ANeuralNetworksExecution_setOutput(ANeuralNetworksExecution* execution, int32_t index,
                                        const ANeuralNetworksOperandType* type, void* buffer,
                                        size_t length) {
+    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "ANeuralNetworksExecution_setOutput");
     if (!execution || (!buffer && length != 0)) {
         LOG(ERROR) << "ANeuralNetworksExecution_setOutput passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -466,6 +487,7 @@
                                                  const ANeuralNetworksOperandType* type,
                                                  const ANeuralNetworksMemory* memory, size_t offset,
                                                  size_t length) {
+    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "ANeuralNetworksExecution_setOutputFromMemory");
     if (!execution || !memory) {
         LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -478,6 +500,7 @@
 
 int ANeuralNetworksExecution_startCompute(ANeuralNetworksExecution* execution,
                                           ANeuralNetworksEvent** event) {
+    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "ANeuralNetworksExecution_startCompute");
     if (!execution || !event) {
         LOG(ERROR) << "ANeuralNetworksExecution_startCompute passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -504,6 +527,7 @@
 }
 
 int ANeuralNetworksEvent_wait(ANeuralNetworksEvent* event) {
+    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "ANeuralNetworksEvent_wait");
     if (event == nullptr) {
         LOG(ERROR) << "ANeuralNetworksEvent_wait passed a nullptr";
         return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -515,6 +539,7 @@
 }
 
 void ANeuralNetworksEvent_free(ANeuralNetworksEvent* event) {
+    NNTRACE_RT(NNTRACE_PHASE_TERMINATION, "ANeuralNetworksEvent_free");
     // No validation.  Free of nullptr is valid.
     if (event) {
         sp<ExecutionCallback>* e = reinterpret_cast<sp<ExecutionCallback>*>(event);
diff --git a/runtime/VersionedIDevice.cpp b/runtime/VersionedIDevice.cpp
index 110eecc..1226552 100644
--- a/runtime/VersionedIDevice.cpp
+++ b/runtime/VersionedIDevice.cpp
@@ -16,6 +16,7 @@
 
 #include "VersionedIDevice.h"
 
+#include "Tracing.h"
 #include "Utils.h"
 
 #include <android-base/logging.h>
@@ -32,6 +33,7 @@
     std::pair<ErrorStatus, Capabilities> result;
 
     if (mDeviceV1_1 != nullptr) {
+        NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_INITIALIZATION, "getCapabilities_1_1");
         Return<void> ret = mDeviceV1_1->getCapabilities_1_1(
             [&result](ErrorStatus error, const Capabilities& capabilities) {
                 result = std::make_pair(error, capabilities);
@@ -41,8 +43,10 @@
             return {ErrorStatus::GENERAL_FAILURE, {}};
         }
     } else if (mDeviceV1_0 != nullptr) {
+        NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_INITIALIZATION, "getCapabilities");
         Return<void> ret = mDeviceV1_0->getCapabilities(
             [&result](ErrorStatus error, const V1_0::Capabilities& capabilities) {
+                // Time taken to convert capabilities is trivial
                 result = std::make_pair(error, convertToV1_1(capabilities));
             });
         if (!ret.isOk()) {
@@ -62,6 +66,7 @@
     std::pair<ErrorStatus, hidl_vec<bool>> result;
 
     if (mDeviceV1_1 != nullptr) {
+        NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "getSupportedOperations_1_1");
         Return<void> ret = mDeviceV1_1->getSupportedOperations_1_1(
             model, [&result](ErrorStatus error, const hidl_vec<bool>& supported) {
                 result = std::make_pair(error, supported);
@@ -71,8 +76,10 @@
             return {ErrorStatus::GENERAL_FAILURE, {}};
         }
     } else if (mDeviceV1_0 != nullptr && compliantWithV1_0(model)) {
+        V1_0::Model model10 = convertToV1_0(model);
+        NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "getSupportedOperations_1_0");
         Return<void> ret = mDeviceV1_0->getSupportedOperations(
-            convertToV1_0(model), [&result](ErrorStatus error, const hidl_vec<bool>& supported) {
+            model10, [&result](ErrorStatus error, const hidl_vec<bool>& supported) {
                 result = std::make_pair(error, supported);
             });
         if (!ret.isOk()) {
@@ -98,17 +105,35 @@
             return ErrorStatus::GENERAL_FAILURE;
         }
         return static_cast<ErrorStatus>(ret);
-    } else if (mDeviceV1_0 != nullptr && compliantWithV1_0(model)) {
-        Return<ErrorStatus> ret = mDeviceV1_0->prepareModel(convertToV1_0(model), callback);
-        if (!ret.isOk()) {
-            LOG(ERROR) << "prepareModel failure: " << ret.description();
+    } else if (mDeviceV1_0 != nullptr) {
+        bool compliant = false;
+        V1_0::Model model10;
+        {
+            // Attribute time spent in model inspection and conversion to
+            // Runtime, as the time may be substantial (0.03ms for mobilenet,
+            // but could be larger for other models).
+            NNTRACE_FULL_SUBTRACT(NNTRACE_LAYER_RUNTIME, NNTRACE_PHASE_COMPILATION,
+                                  "VersionedIDevice::prepareModel");
+            compliant = compliantWithV1_0(model);
+            if (compliant) {
+                model10 = convertToV1_0(model);  // copy is elided
+            }
+        }
+        if (compliant) {
+            Return<ErrorStatus> ret = mDeviceV1_0->prepareModel(model10, callback);
+            if (!ret.isOk()) {
+                LOG(ERROR) << "prepareModel failure: " << ret.description();
+                return ErrorStatus::GENERAL_FAILURE;
+            }
+            return static_cast<ErrorStatus>(ret);
+        } else {
+            // TODO: partition the model such that v1.1 ops are not passed to v1.0
+            // device
+            LOG(ERROR) << "Could not handle prepareModel!";
             return ErrorStatus::GENERAL_FAILURE;
         }
-        return static_cast<ErrorStatus>(ret);
     } else {
-        // TODO: partition the model such that v1.1 ops are not passed to v1.0
-        // device
-        LOG(ERROR) << "Could not handle prepareModel!";
+        LOG(ERROR) << "prepareModel called with no device";
         return ErrorStatus::GENERAL_FAILURE;
     }
 }
diff --git a/runtime/test/Android.bp b/runtime/test/Android.bp
index 7dbb28a..addc5c0 100644
--- a/runtime/test/Android.bp
+++ b/runtime/test/Android.bp
@@ -22,6 +22,7 @@
     shared_libs: [
         "libandroid",
         "libbase",
+        "libcutils",
         "libhidlbase",
         "libhidltransport",
         "libhidlmemory",
diff --git a/runtime/test/GeneratedUtils.cpp b/runtime/test/GeneratedUtils.cpp
index fc5671f..a9a33fa 100644
--- a/runtime/test/GeneratedUtils.cpp
+++ b/runtime/test/GeneratedUtils.cpp
@@ -31,6 +31,17 @@
 #include <map>
 #include <thread>
 
+// Systrace is not available from CTS tests due to platform layering
+// constraints. We reuse the NNTEST_ONLY_PUBLIC_API flag, as that should also be
+// the case for CTS (public APIs only).
+#ifndef NNTEST_ONLY_PUBLIC_API
+#include "Tracing.h"
+#else
+#define NNTRACE_FULL_RAW(...)
+#define NNTRACE_APP(...)
+#define NNTRACE_APP_SWITCH(...)
+#endif
+
 namespace generated_tests {
 using namespace android::nn::wrapper;
 using namespace test_helper;
@@ -62,9 +73,13 @@
 }
 
 Compilation createAndCompileModel(Model* model, std::function<void(Model*)> createModel) {
+    NNTRACE_APP(NNTRACE_PHASE_PREPARATION, "createAndCompileModel");
+
     createModel(model);
     model->finish();
     graphDump("", *model);
+
+    NNTRACE_APP_SWITCH(NNTRACE_PHASE_COMPILATION, "createAndCompileModel");
     Compilation compilation(model);
     compilation.finish();
 
@@ -86,6 +101,7 @@
     // If in relaxed mode, set the error range to be 5ULP of FP16.
     float fpRange = !model->isRelaxed() ? 1e-5f : 5.0f * 0.0009765625f;
     for (auto& example : examples) {
+        NNTRACE_APP(NNTRACE_PHASE_EXECUTION, "executeWithCompilation example");
         SCOPED_TRACE(exampleNo);
         // TODO: We leave it as a copy here.
         // Should verify if the input gets modified by the test later.
@@ -94,6 +110,7 @@
 
         Execution execution(compilation);
 
+        NNTRACE_APP_SWITCH(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "executeWithCompilation example");
         // Set all inputs
         for_all(inputs, [&execution](int idx, const void* p, size_t s) {
             const void* buffer = s == 0 ? nullptr : p;
@@ -108,9 +125,11 @@
             ASSERT_EQ(Result::NO_ERROR, execution.setOutput(idx, buffer, s));
         });
 
+        NNTRACE_APP_SWITCH(NNTRACE_PHASE_EXECUTION, "executeWithCompilation example");
         Result r = execution.compute();
         ASSERT_EQ(Result::NO_ERROR, r);
 
+        NNTRACE_APP_SWITCH(NNTRACE_PHASE_RESULTS, "executeWithCompilation example");
         // Dump all outputs for the slicing tool
         if (dumpToFile) {
             s << "output" << exampleNo << " = {\n";
@@ -133,6 +152,7 @@
                  std::function<bool(int)> isIgnored,
                  std::vector<MixedTypedExample>& examples,
                  std::string dumpFile) {
+    NNTRACE_APP(NNTRACE_PHASE_OVERALL, "executeOnce");
     Model model;
     Compilation compilation = createAndCompileModel(&model, createModel);
     executeWithCompilation(&model, &compilation, isIgnored, examples, dumpFile);
@@ -142,6 +162,7 @@
 void executeMultithreadedOwnCompilation(std::function<void(Model*)> createModel,
                                         std::function<bool(int)> isIgnored,
                                         std::vector<MixedTypedExample>& examples) {
+    NNTRACE_APP(NNTRACE_PHASE_OVERALL, "executeMultithreadedOwnCompilation");
     SCOPED_TRACE("MultithreadedOwnCompilation");
     std::vector<std::thread> threads;
     for (int i = 0; i < 10; i++) {
@@ -157,6 +178,7 @@
 void executeMultithreadedSharedCompilation(std::function<void(Model*)> createModel,
                                            std::function<bool(int)> isIgnored,
                                            std::vector<MixedTypedExample>& examples) {
+    NNTRACE_APP(NNTRACE_PHASE_OVERALL, "executeMultithreadedSharedCompilation");
     SCOPED_TRACE("MultithreadedSharedCompilation");
     Model model;
     Compilation compilation = createAndCompileModel(&model, createModel);