NNAPI systrace for timing statistics
This adds systrace tracing to NNAPI. The tracing
will be helpful for:
- getting numbers on where time is spent currently
- diagnosing and improving performance
- benchmarking
TODOs:
- Write analysis tools for traces
Change-Id: I9026f1043428cb715b577901bec3a2e1e39a82e3
Merged-In: I9026f1043428cb715b577901bec3a2e1e39a82e3
Bug: 78137932
Test: manually run systrace.py against unit tests
Test: manually run systrace.py against benchmarking app
(cherry picked from commit e9e637ab73b68b5982281a3f7c621f6a75d51743)
diff --git a/common/CpuExecutor.cpp b/common/CpuExecutor.cpp
index 8f87067..be79fa5 100644
--- a/common/CpuExecutor.cpp
+++ b/common/CpuExecutor.cpp
@@ -20,6 +20,7 @@
#include "NeuralNetworks.h"
#include "Operations.h"
+#include "Tracing.h"
#include "Eigen/Core"
#include <omp.h>
@@ -189,12 +190,14 @@
int CpuExecutor::run(const V1_0::Model& model, const Request& request,
const std::vector<RunTimePoolInfo>& modelPoolInfos,
const std::vector<RunTimePoolInfo>& requestPoolInfos) {
+ NNTRACE_CPU(NNTRACE_PHASE_EXECUTION, "run::V1_0");
return run(convertToV1_1(model), request, modelPoolInfos, requestPoolInfos);
}
int CpuExecutor::run(const V1_1::Model& model, const Request& request,
const std::vector<RunTimePoolInfo>& modelPoolInfos,
const std::vector<RunTimePoolInfo>& requestPoolInfos) {
+ NNTRACE_CPU(NNTRACE_PHASE_EXECUTION, "run::V1_1");
VLOG(CPUEXE) << "CpuExecutor::run() with request("
<< SHOW_IF_DEBUG(toString(request)) << ")";
diff --git a/common/ValidateHal.cpp b/common/ValidateHal.cpp
index 0955178..8016b1f 100644
--- a/common/ValidateHal.cpp
+++ b/common/ValidateHal.cpp
@@ -18,6 +18,7 @@
#include "ValidateHal.h"
#include "NeuralNetworks.h"
+#include "Tracing.h"
#include "Utils.h"
#include <android-base/logging.h>
@@ -405,6 +406,8 @@
template<typename VersionedModel>
static bool validateModelVersioned(const VersionedModel& model) {
+ NNTRACE_FULL(NNTRACE_LAYER_UTILITY, NNTRACE_PHASE_UNSPECIFIED,
+ "validateModelVersioned");
return (validateOperands(model.operands, model.operandValues, model.pools) &&
validateOperations(model.operations, model.operands) &&
validateModelInputOutputs(model.inputIndexes, model.operands,
diff --git a/common/include/Tracing.h b/common/include/Tracing.h
new file mode 100644
index 0000000..c2eae7a
--- /dev/null
+++ b/common/include/Tracing.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_ML_NN_COMMON_TRACING_H
+#define ANDROID_ML_NN_COMMON_TRACING_H
+
+#define ATRACE_TAG ATRACE_TAG_NNAPI
+#include "utils/Trace.h"
+
+// Neural Networks API (NNAPI) systracing
+//
+// The primary goal of the tracing is to capture and present timings for NNAPI.
+// (Other uses include providing visibility into the split of execution between
+// drivers and the CPU fallback, and the ability to visualize call sequences).
+//
+// The tracing has three parts:
+// 1 Trace macros defined in this file and used throughout the codebase,
+// modelled after and using atrace. These implement a naming convention for
+// the tracepoints, interpreted by the systrace parser.
+// 2 Android systrace (atrace) on-device capture and host-based analysis.
+// 3 A systrace parser (TODO) to summarize the timings.
+//
+// For an overview and introduction, please refer to the "NNAPI Systrace design
+// and HOWTO" (internal Docs for now). This header doesn't try to replicate all
+// the information in that document.
+//
+// Glossary:
+// - Phase: a stage in processing (e.g., Preparation, Compilation, Execution).
+//   The Overall phase nests all the others, and Execution nests the
+//   Input/Output, Transformation, Computation and Results phases; otherwise
+//   phases are not nested (Initialization-phase functions may occur inside
+//   other phases but are counted out during analysis). Nested phases (other
+//   than Initialization) are analysed as a breakdown of their parent phase.
+// - Layer: component in the stack (from top to bottom: App, Runtime, IPC,
+// Driver/CPU). Calls to lower layers are typically nested within calls to upper
+// layers.
+// - Bucket: unit of timing analysis, the combination of Phase and Layer (and
+// thus also typically nested).
+// - Detail: specific unit being executed, typically a function.
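+//
+// For illustration, the macros below combine these pieces into tracepoint
+// names of the form "[NN_<layer>_<phase>]<detail>" (prefixed with "[SW]" or
+// "[SUB]" for the switch and subtract variants), e.g.:
+//   "[NN_LR_PE]ANeuralNetworksExecution_startCompute"  // Runtime, Execution
+//   "[SW][NN_LC_PCO]optimized_ops::Softmax"            // CPU, Computation (switch)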
+
+// Convenience macros to be used in the code (phases defined below).
+// (Macros so that string concatenation is done at compile time).
+//
+// These exist in three variants:
+// - Simple (NNTRACE_<layer and potentially phase>) - to be used when only one
+// Phase is active within a scope
+// - "Switch" (NNTRACE_<...>_SWITCH) - to be used when multiple Phases
+// share a scope (e.g., transformation of data and computation in same
+// function).
+// - "Subtract" (NNTRACE_<...>_SUBTRACT) - to be used when nesting is violated
+// and the time should be subtracted from the parent scope
+// Arguments:
+// - phase: one of the NNTRACE_PHASE_* macros defined below.
+// - detail: free-form string constant, typically function name.
+// Example usage:
+// // Simple
+// int ANeuralNetworksMemory_createFromFd(...) {
+// NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksMemory_createFromFd");
+// }
+// // Switch
+// bool concatenationFloat32(...) {
+// NNTRACE_TRANS("concatenationFloat32"); // Transformation of data begins
+// ...
+// NNTRACE_COMP_SWITCH("optimized_ops::Concatenation"); // Transformation
+// // ends and computation
+// // begins
+// }
+// // Subtract
+// static int compile(...) {
+// NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "prepareModel");
+// device->getInterface()->prepareModel(..., preparedModelCallback);
+// preparedModelCallback->wait();
+// }
+// ErrorStatus VersionedIDevice::prepareModel(...) {
+// ... IPC work ...
+// {
+// NNTRACE_FULL_SUBTRACT(NNTRACE_LAYER_RUNTIME, NNTRACE_PHASE_COMPILATION,
+// "VersionedIDevice::prepareModel");
+// ... Runtime work ...
+// }
+// ... IPC work ...
+// }
+//
+// Layer Application - For native applications (e.g., unit tests)
+#define NNTRACE_APP(phase, detail) NNTRACE_FULL(NNTRACE_LAYER_APPLICATION, phase, detail)
+#define NNTRACE_APP_SWITCH(phase, detail) \
+ NNTRACE_FULL_SWITCH(NNTRACE_LAYER_APPLICATION, phase, detail)
+// Layer Runtime - For the NNAPI runtime
+#define NNTRACE_RT(phase, detail) NNTRACE_FULL(NNTRACE_LAYER_RUNTIME, phase, detail)
+#define NNTRACE_RT_SWITCH(phase, detail) NNTRACE_FULL_SWITCH(NNTRACE_LAYER_RUNTIME, phase, detail)
+// Layer CPU - CPU executor
+#define NNTRACE_CPU(phase, detail) NNTRACE_FULL(NNTRACE_LAYER_CPU, phase, detail)
+#define NNTRACE_COMP(detail) NNTRACE_FULL(NNTRACE_LAYER_CPU, \
+ NNTRACE_PHASE_COMPUTATION, detail)
+#define NNTRACE_COMP_SWITCH(detail) NNTRACE_FULL_SWITCH(NNTRACE_LAYER_CPU, \
+ NNTRACE_PHASE_COMPUTATION, detail)
+#define NNTRACE_TRANS(detail) NNTRACE_FULL(NNTRACE_LAYER_CPU, \
+ NNTRACE_PHASE_TRANSFORMATION, detail)
+
+// Fully specified macros to be used when no convenience wrapper exists for your
+// need.
+#define NNTRACE_FULL(layer, phase, detail) NNTRACE_NAME_1(("[NN_" layer "_" phase "]" detail))
+#define NNTRACE_FULL_SWITCH(layer, phase, detail) \
+ NNTRACE_NAME_SWITCH(("[SW][NN_" layer "_" phase "]" detail))
+#define NNTRACE_FULL_SUBTRACT(layer, phase, detail) \
+ NNTRACE_NAME_1(("[SUB][NN_" layer "_" phase "]" detail))
+// Raw macro without scoping requirements, for special cases
+#define NNTRACE_FULL_RAW(layer, phase, detail) android::ScopedTrace PASTE(___tracer, __LINE__) \
+ (ATRACE_TAG, ("[NN_" layer "_" phase "]" detail))
+
+// Tracing buckets - the units over which timing summaries are calculated.
+//
+// Phases
+#define NNTRACE_PHASE_OVERALL "PO" // Overall program, e.g., one benchmark case
+#define NNTRACE_PHASE_INITIALIZATION "PI" // Initialization - not related to a model
+#define NNTRACE_PHASE_PREPARATION "PP" // Model construction
+#define NNTRACE_PHASE_COMPILATION "PC" // Model compilation
+#define NNTRACE_PHASE_EXECUTION "PE" // Executing the model
+#define NNTRACE_PHASE_TERMINATION "PT" // Tearing down
+#define NNTRACE_PHASE_UNSPECIFIED "PU" // Helper code called from multiple phases
+// Subphases of execution
+#define NNTRACE_PHASE_INPUTS_AND_OUTPUTS "PIO" // Setting inputs/outputs and allocating buffers
+#define NNTRACE_PHASE_TRANSFORMATION "PTR" // Transforming data for computation
+#define NNTRACE_PHASE_COMPUTATION "PCO" // Computing operations' outputs
+#define NNTRACE_PHASE_RESULTS "PR" // Reading out results
+// Layers
+#define NNTRACE_LAYER_APPLICATION "LA"
+#define NNTRACE_LAYER_RUNTIME "LR"
+#define NNTRACE_LAYER_IPC "LI"
+#define NNTRACE_LAYER_DRIVER "LD"
+#define NNTRACE_LAYER_CPU "LC"
+#define NNTRACE_LAYER_OTHER "LO"
+#define NNTRACE_LAYER_UTILITY "LU" // Code used from multiple layers
+
+
+// Implementation
+//
+// Almost the same as ATRACE_NAME, but enforces an explicit distinction between
+// phase-per-scope and switching phases.
+//
+// Basic trace; only one is allowed per scope, to enforce disjointness.
+#define NNTRACE_NAME_1(name) android::ScopedTrace ___tracer_1(ATRACE_TAG, name)
+// Switching trace, more than one per scope allowed, translated by
+// systrace_parser.py. This is mainly useful for tracing multiple phases through
+// one function / scope.
+#define NNTRACE_NAME_SWITCH(name) android::ScopedTrace PASTE(___tracer, __LINE__) \
+ (ATRACE_TAG, name); \
+ (void)___tracer_1 // ensure switch is only used after a basic trace
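+//
+// As a sketch of how the (void)___tracer_1 reference enforces ordering: using
+// a _SWITCH macro in a scope without a preceding basic trace fails to compile,
+// because ___tracer_1 is not declared there, e.g.:
+//   bool f() {
+//     NNTRACE_COMP_SWITCH("detail");  // error: ___tracer_1 undeclared
+//   }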
+
+
+// Disallow use of raw ATRACE macros
+#undef ATRACE_NAME
+#undef ATRACE_CALL
+
+#endif // ANDROID_ML_NN_COMMON_TRACING_H
diff --git a/common/operations/Activation.cpp b/common/operations/Activation.cpp
index b80984d..5da58f9 100644
--- a/common/operations/Activation.cpp
+++ b/common/operations/Activation.cpp
@@ -19,11 +19,14 @@
#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "Tracing.h"
+
namespace android {
namespace nn {
bool reluFloat32(const float* inputData, const Shape& inputShape,
float* outputData, const Shape& outputShape) {
+ NNTRACE_COMP("reluFloat32");
int numElements = getNumberOfElements(inputShape);
for (int i=0; i<numElements; i++, inputData++, outputData++) {
*outputData = std::max(0.f, *inputData);
@@ -33,6 +36,7 @@
bool relu1Float32(const float* inputData, const Shape& inputShape,
float* outputData, const Shape& outputShape) {
+ NNTRACE_COMP("relu1Float32");
int numElements = getNumberOfElements(inputShape);
for (int i=0; i<numElements; i++, inputData++, outputData++) {
*outputData = std::min(std::max(-1.f, *inputData), 1.f);
@@ -42,6 +46,7 @@
bool relu6Float32(const float* inputData, const Shape& inputShape,
float* outputData, const Shape& outputShape) {
+ NNTRACE_COMP("relu6Float32");
int numElements = getNumberOfElements(inputShape);
for (int i=0; i<numElements; i++, inputData++, outputData++) {
*outputData = std::min(std::max(0.f, *inputData), 6.f);
@@ -51,6 +56,7 @@
bool tanhFloat32(const float* inputData, const Shape& inputShape,
float* outputData, const Shape& outputShape) {
+ NNTRACE_COMP("tanhFloat32");
int numElements = getNumberOfElements(inputShape);
for (int i=0; i<numElements; i++, inputData++, outputData++) {
*outputData = std::tanh(*inputData);
@@ -60,6 +66,7 @@
bool logisticFloat32(const float* inputData, const Shape& inputShape,
float* outputData, const Shape& outputShape) {
+ NNTRACE_COMP("logisticFloat32");
int numElements = getNumberOfElements(inputShape);
for (int i=0; i<numElements; i++, inputData++, outputData++) {
*outputData = 1.f / (1.f + std::exp(-*inputData));
@@ -70,6 +77,7 @@
bool softmaxFloat32(const float* inputData, const Shape& inputShape,
const float beta,
float* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("softmaxFloat32");
tflite::Dims<4> dim;
if (getNumberOfDimensions(inputShape) == 2) {
uint32_t batch_size = getSizeOfDimension(inputShape, 0);
@@ -85,6 +93,7 @@
return false;
}
+ NNTRACE_COMP_SWITCH("optimized_ops::Softmax");
tflite::optimized_ops::Softmax(inputData, dim, beta,
outputData, dim);
return true;
@@ -107,18 +116,21 @@
bool reluQuant8(const uint8_t* inputData, const Shape& inputShape,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_COMP("reluQuant8");
ANDROID_NN_RELUX_QUANT8(kActivationRelu)
return true;
}
bool relu1Quant8(const uint8_t* inputData, const Shape& inputShape,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_COMP("relu1Quant8");
ANDROID_NN_RELUX_QUANT8(kActivationRelu1)
return true;
}
bool relu6Quant8(const uint8_t* inputData, const Shape& inputShape,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_COMP("relu6Quant8");
ANDROID_NN_RELUX_QUANT8(kActivationRelu6)
return true;
}
@@ -127,6 +139,7 @@
bool logisticQuant8(const uint8_t* inputData, const Shape& inputShape,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("logisticQuant8");
if (outputShape.offset != 0 || outputShape.scale != 1.f / 256) {
LOG(ERROR) << "incorrect scale / offset for output";
return false;
@@ -149,6 +162,7 @@
int32_t input_range_radius =
CalculateInputRadius(kInputIntegerBits, input_left_shift);
+ NNTRACE_COMP_SWITCH("optimized_ops::Logistic");
tflite::optimized_ops::Logistic(
inputData, convertShapeToDims(inputShape),
inputShape.offset, input_range_radius,
@@ -161,6 +175,7 @@
bool softmaxQuant8(const uint8_t* inputData, const Shape& inputShape,
const float beta,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("softmaxQuant8");
tflite::Dims<4> dim;
if (getNumberOfDimensions(inputShape) == 2) {
uint32_t batch_size = getSizeOfDimension(inputShape, 0);
@@ -196,6 +211,7 @@
float diff_min = -1.0f * CalculateInputRadius(kScaledDiffIntegerBits,
input_left_shift);
+ NNTRACE_COMP_SWITCH("optimized_ops::Softmax");
tflite::optimized_ops::Softmax(inputData, dim, input_multiplier,
input_left_shift, diff_min,
outputData, dim);
diff --git a/common/operations/Concatenation.cpp b/common/operations/Concatenation.cpp
index fc65e43..cc1fa5a 100644
--- a/common/operations/Concatenation.cpp
+++ b/common/operations/Concatenation.cpp
@@ -19,12 +19,15 @@
#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "Tracing.h"
+
namespace android {
namespace nn {
bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs,
const std::vector<Shape>& inputShapes, int32_t axis,
float* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("concatenationFloat32");
int num_inputs = inputShapes.size();
std::vector<tflite::Dims<4>*> inputDimsPtr(num_inputs);
std::vector<tflite::Dims<4> > inputDims(num_inputs);
@@ -33,6 +36,7 @@
inputDimsPtr[i] = &inputDims[i];
}
+ NNTRACE_COMP_SWITCH("optimized_ops::Concatenation");
tflite::optimized_ops::Concatenation<tflite::FusedActivationFunctionType::kNone, float>(
getNumberOfDimensions(outputShape) - axis - 1,
inputDataPtrs.data(), inputDimsPtr.data(), num_inputs,
@@ -44,6 +48,7 @@
bool concatenationQuant8(const std::vector<const uint8_t*>& inputDataPtrs,
const std::vector<Shape>& inputShapes, int32_t axis,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("concatenationQuant8");
int num_inputs = inputShapes.size();
std::vector<tflite::Dims<4>*> inputDimsPtr(num_inputs);
std::vector<tflite::Dims<4> > inputDims(num_inputs);
@@ -52,6 +57,7 @@
inputDimsPtr[i] = &inputDims[i];
}
+ NNTRACE_COMP_SWITCH("optimized_ops::Concatenation");
tflite::optimized_ops::Concatenation<tflite::FusedActivationFunctionType::kNone, uint8_t>(
getNumberOfDimensions(outputShape) - axis - 1,
inputDataPtrs.data(), inputDimsPtr.data(), num_inputs,
diff --git a/common/operations/Conv2D.cpp b/common/operations/Conv2D.cpp
index c16426c..60f9132 100644
--- a/common/operations/Conv2D.cpp
+++ b/common/operations/Conv2D.cpp
@@ -19,6 +19,8 @@
#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "Tracing.h"
+
namespace android {
namespace nn {
@@ -84,6 +86,7 @@
int32_t stride_width, int32_t stride_height,
int32_t activation,
float* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("convFloat32");
ANDROID_NN_CONV_PARAMETERS(float)
@@ -95,6 +98,7 @@
// Prevent concurrent executions that may access the scratch buffer.
std::unique_lock<std::mutex> lock(executionMutex);
+ NNTRACE_COMP_SWITCH("optimized_ops::Conv");
tflite::optimized_ops::Conv(
inputData, convertShapeToDims(inputShape),
filterData, convertShapeToDims(filterShape),
@@ -116,6 +120,7 @@
int32_t stride_width, int32_t stride_height,
int32_t activation,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("convQuant8");
ANDROID_NN_CONV_PARAMETERS(uint8_t)
@@ -146,6 +151,8 @@
std::unique_lock<std::mutex> lock(executionMutex);
// Alow gemmlowp automatically decide how many threads to use.
gemm_context.set_max_num_threads(0);
+
+ NNTRACE_COMP_SWITCH("optimized_ops::Conv");
tflite::optimized_ops::Conv(
inputData, convertShapeToDims(inputShape), inputOffset,
filterData, convertShapeToDims(filterShape), filterOffset,
diff --git a/common/operations/DepthwiseConv2D.cpp b/common/operations/DepthwiseConv2D.cpp
index 5dd67e2..60c5e41 100644
--- a/common/operations/DepthwiseConv2D.cpp
+++ b/common/operations/DepthwiseConv2D.cpp
@@ -20,6 +20,8 @@
#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h"
#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
+#include "Tracing.h"
+
namespace android {
namespace nn {
@@ -42,6 +44,7 @@
int32_t stride_width, int32_t stride_height,
int32_t depth_multiplier, int32_t activation,
float* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("depthwiseConvFloat32");
ANDROID_NN_DEPTHWISE_CONV_PARAMETERS
@@ -49,6 +52,7 @@
CalculateActivationRangeFloat(activation, &output_activation_min,
&output_activation_max);
+ NNTRACE_COMP_SWITCH("optimized_ops::DepthwiseConv");
tflite::optimized_ops::DepthwiseConv(
inputData, convertShapeToDims(inputShape),
filterData, convertShapeToDims(filterShape),
@@ -70,6 +74,7 @@
int32_t stride_width, int32_t stride_height,
int32_t depth_multiplier, int32_t activation,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("depthwiseConvQuant8");
ANDROID_NN_DEPTHWISE_CONV_PARAMETERS
@@ -94,6 +99,7 @@
uint32_t filterOffset = -filterShape.offset;
uint32_t outputOffset = outputShape.offset;
+ NNTRACE_COMP_SWITCH("optimized_ops::DepthwiseConv");
tflite::optimized_ops::DepthwiseConv(
inputData, convertShapeToDims(inputShape), inputOffset,
filterData, convertShapeToDims(filterShape), filterOffset,
diff --git a/common/operations/EmbeddingLookup.cpp b/common/operations/EmbeddingLookup.cpp
index 504c684..9f231ed 100644
--- a/common/operations/EmbeddingLookup.cpp
+++ b/common/operations/EmbeddingLookup.cpp
@@ -20,6 +20,8 @@
#include "HalInterfaces.h"
#include "Operations.h"
+#include "Tracing.h"
+
namespace android {
namespace nn {
@@ -32,6 +34,7 @@
}
bool EmbeddingLookup::Eval() {
+ NNTRACE_COMP("EmbeddingLookup::Eval");
const int row_size = value_->shape().dimensions[0];
const int total_bytes = sizeOfData(value_->type, value_->dimensions);
const int row_bytes = total_bytes/row_size;
diff --git a/common/operations/FullyConnected.cpp b/common/operations/FullyConnected.cpp
index 4e2deff..12da51b 100644
--- a/common/operations/FullyConnected.cpp
+++ b/common/operations/FullyConnected.cpp
@@ -20,6 +20,8 @@
#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "Tracing.h"
+
namespace android {
namespace nn {
@@ -33,6 +35,7 @@
const float* biasData, const Shape& biasShape,
int32_t activation,
float* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("fullyConnectedFloat32");
float output_activation_min, output_activation_max;
CalculateActivationRangeFloat(activation, &output_activation_min,
&output_activation_max);
@@ -42,6 +45,7 @@
uint32_t batch_size = getSizeOfDimension(outputShape, 0);
uint32_t input_n_elements = getNumberOfElements(inputShape);
if (batch_size * batch_size == input_n_elements) {
+ NNTRACE_COMP_SWITCH("reference_ops::FullyConnected");
tflite::reference_ops::FullyConnected(
inputData, convertShapeToDims(inputShape),
weightsData, convertShapeToDims(weightsShape),
@@ -49,6 +53,7 @@
output_activation_min, output_activation_max,
outputData, convertShapeToDims(outputShape));
} else {
+ NNTRACE_COMP_SWITCH("optimized_ops::FullyConnected");
tflite::optimized_ops::FullyConnected(
inputData, convertShapeToDims(inputShape),
weightsData, convertShapeToDims(weightsShape),
@@ -64,6 +69,7 @@
const int32_t* biasData, const Shape& biasShape,
int32_t activation,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("fullyConnectedQuant8");
int32_t inputOffset = -inputShape.offset;
int32_t weightsOffset = -weightsShape.offset;
int32_t outputOffset = outputShape.offset;
@@ -91,6 +97,7 @@
// Alow gemmlowp automatically decide how many threads to use.
gemm_context.set_max_num_threads(0);
+ NNTRACE_COMP_SWITCH("optimized_ops::FullyConnected");
tflite::optimized_ops::FullyConnected(
inputData, convertShapeToDims(inputShape), inputOffset,
weightsData, convertShapeToDims(weightsShape), weightsOffset,
diff --git a/common/operations/HashtableLookup.cpp b/common/operations/HashtableLookup.cpp
index 1c8d802..e864b3d 100644
--- a/common/operations/HashtableLookup.cpp
+++ b/common/operations/HashtableLookup.cpp
@@ -20,6 +20,8 @@
#include "HalInterfaces.h"
#include "Operations.h"
+#include "Tracing.h"
+
namespace android {
namespace nn {
@@ -42,6 +44,7 @@
}
bool HashtableLookup::Eval() {
+ NNTRACE_COMP("HashtableLookup::Eval");
const int num_rows = value_->shape().dimensions[0];
const int row_bytes = sizeOfData(value_->type, value_->dimensions) / num_rows;
void* pointer = nullptr;
diff --git a/common/operations/LSHProjection.cpp b/common/operations/LSHProjection.cpp
index 57d1475..97183e2 100644
--- a/common/operations/LSHProjection.cpp
+++ b/common/operations/LSHProjection.cpp
@@ -20,6 +20,8 @@
#include "HalInterfaces.h"
#include "util/hash/farmhash.h"
+#include "Tracing.h"
+
namespace android {
namespace nn {
@@ -141,6 +143,8 @@
}
bool LSHProjection::Eval() {
+ NNTRACE_COMP("LSHProjection::Eval");
+
int32_t* out_buf = reinterpret_cast<int32_t*>(output_->buffer);
switch (type_) {
diff --git a/common/operations/LSTM.cpp b/common/operations/LSTM.cpp
index 80b0eb9..7667c66 100644
--- a/common/operations/LSTM.cpp
+++ b/common/operations/LSTM.cpp
@@ -19,6 +19,8 @@
#include "CpuExecutor.h"
#include "HalInterfaces.h"
+#include "Tracing.h"
+
namespace android {
namespace nn {
@@ -300,6 +302,8 @@
}
bool LSTMCell::Eval() {
+ NNTRACE_COMP("LSTMCell::Eval");
+
const uint32_t n_batch = input_->shape().dimensions[0];
const uint32_t n_input = input_->shape().dimensions[1];
// n_cell and n_output will be the same size when there is no projection.
diff --git a/common/operations/Normalization.cpp b/common/operations/Normalization.cpp
index eccb3bd..7f36dbf 100644
--- a/common/operations/Normalization.cpp
+++ b/common/operations/Normalization.cpp
@@ -19,11 +19,14 @@
#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "Tracing.h"
+
namespace android {
namespace nn {
bool l2normFloat32(const float* inputData, const Shape& inputShape,
float* outputData, const Shape& outputShape) {
+ NNTRACE_COMP("optimized_ops::L2Normalization::float");
tflite::optimized_ops::L2Normalization<tflite::FusedActivationFunctionType::kNone>(
inputData, convertShapeToDims(inputShape),
outputData, convertShapeToDims(outputShape));
@@ -33,6 +36,7 @@
bool l2normQuant8(const uint8_t* inputData, const Shape& inputShape,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_COMP("optimized_ops::L2Normalization::uint8");
tflite::optimized_ops::L2Normalization(
inputData, convertShapeToDims(inputShape),
inputShape.offset,
@@ -44,6 +48,7 @@
bool localResponseNormFloat32(const float* inputData, const Shape& inputShape,
int32_t radius, float bias, float alpha, float beta,
float* outputData, const Shape& outputShape) {
+ NNTRACE_COMP("optimized_ops::LocalResponseNormalization::float");
tflite::optimized_ops::LocalResponseNormalization(
inputData, convertShapeToDims(inputShape),
radius, bias, alpha, beta,
diff --git a/common/operations/Pooling.cpp b/common/operations/Pooling.cpp
index db4497a..7f60323 100644
--- a/common/operations/Pooling.cpp
+++ b/common/operations/Pooling.cpp
@@ -19,6 +19,8 @@
#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "Tracing.h"
+
namespace android {
namespace nn {
@@ -37,6 +39,7 @@
int32_t stride_width, int32_t stride_height,
int32_t filter_width, int32_t filter_height, int32_t activation,
float* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("averagePoolFloat32");
ANDROID_NN_POOLING_PARAMETERS
@@ -44,6 +47,7 @@
CalculateActivationRangeFloat(activation, &output_activation_min,
&output_activation_max);
+ NNTRACE_COMP_SWITCH("optimized_ops::AveragePool");
tflite::optimized_ops::AveragePool(
inputData, convertShapeToDims(inputShape),
stride_width, stride_height, paddingWidth, paddingHeight,
@@ -60,6 +64,7 @@
int32_t stride_width, int32_t stride_height,
int32_t filter_width, int32_t filter_height, int32_t activation,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("averagePoolQuant8");
ANDROID_NN_POOLING_PARAMETERS
@@ -70,6 +75,7 @@
&output_activation_min,
&output_activation_max);
+ NNTRACE_COMP_SWITCH("optimized_ops::AveragePool");
tflite::optimized_ops::AveragePool(
inputData, convertShapeToDims(inputShape),
stride_width, stride_height, paddingWidth, paddingHeight,
@@ -86,6 +92,7 @@
int32_t stride_width, int32_t stride_height,
int32_t filter_width, int32_t filter_height, int32_t activation,
float* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("l2PoolFloat32");
ANDROID_NN_POOLING_PARAMETERS
@@ -93,6 +100,7 @@
CalculateActivationRangeFloat(activation, &output_activation_min,
&output_activation_max);
+ NNTRACE_COMP_SWITCH("optimized_ops::L2Pool");
tflite::optimized_ops::L2Pool(
inputData, convertShapeToDims(inputShape),
stride_width, stride_height, paddingWidth, paddingHeight,
@@ -109,6 +117,7 @@
int32_t stride_width, int32_t stride_height,
int32_t filter_width, int32_t filter_height, int32_t activation,
float* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("maxPoolFloat32");
ANDROID_NN_POOLING_PARAMETERS
@@ -116,6 +125,7 @@
CalculateActivationRangeFloat(activation, &output_activation_min,
&output_activation_max);
+ NNTRACE_COMP_SWITCH("optimized_ops::MaxPool");
tflite::optimized_ops::MaxPool(
inputData, convertShapeToDims(inputShape),
stride_width, stride_height, paddingWidth, paddingHeight,
@@ -132,6 +142,7 @@
int32_t stride_width, int32_t stride_height,
int32_t filter_width, int32_t filter_height, int32_t activation,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("maxPoolQuant8");
ANDROID_NN_POOLING_PARAMETERS
@@ -142,6 +153,7 @@
&output_activation_min,
&output_activation_max);
+ NNTRACE_COMP_SWITCH("optimized_ops::MaxPool");
tflite::optimized_ops::MaxPool(
inputData, convertShapeToDims(inputShape),
stride_width, stride_height, paddingWidth, paddingHeight,
diff --git a/common/operations/RNN.cpp b/common/operations/RNN.cpp
index 8a00734..4d7a4c9 100644
--- a/common/operations/RNN.cpp
+++ b/common/operations/RNN.cpp
@@ -19,11 +19,14 @@
#include "CpuExecutor.h"
#include "HalInterfaces.h"
+#include "Tracing.h"
+
namespace android {
namespace nn {
RNN::RNN(const Operation& operation,
std::vector<RunTimeOperandInfo>& operands) {
+ NNTRACE_TRANS("RNN::RNN");
input_ = GetInput(operation, operands, kInputTensor);
weights_ = GetInput(operation, operands, kWeightsTensor);
recurrent_weights_ = GetInput(operation, operands, kRecurrentWeightsTensor);
@@ -41,6 +44,7 @@
std::vector<RunTimeOperandInfo> &operands,
Shape *hiddenStateShape,
Shape *outputShape) {
+ NNTRACE_TRANS("RNN::Prepare");
// Check we have all the inputs and outputs we need.
const int num_inputs = NumInputsWithValues(operation, operands);
NN_CHECK(num_inputs == 5 || num_inputs == 6);
@@ -78,6 +82,8 @@
}
bool RNN::Eval() {
+ NNTRACE_COMP("RNN::Eval");
+
const float* bias_ptr = reinterpret_cast<float*>(bias_->buffer);
const uint32_t batch_size = input_->shape().dimensions[0];
diff --git a/common/operations/Reshape.cpp b/common/operations/Reshape.cpp
index 5803968..5a05aed 100644
--- a/common/operations/Reshape.cpp
+++ b/common/operations/Reshape.cpp
@@ -24,11 +24,14 @@
#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "Tracing.h"
+
namespace android {
namespace nn {
bool reshapeGeneric(const void* inputData, const Shape& inputShape,
void* outputData, const Shape& outputShape) {
+ NNTRACE_COMP("reshapeGeneric");
size_t count = sizeOfData(inputShape.type, inputShape.dimensions);
memcpy(outputData, inputData, count);
return true;
@@ -36,6 +39,7 @@
bool resizeBilinearFloat32(const float* inputData, const Shape& inputShape,
float* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("resizeBilinearFloat32");
int32_t height = (int32_t) getSizeOfDimension(outputShape, 1);
int32_t width = (int32_t) getSizeOfDimension(outputShape, 2);
@@ -44,6 +48,7 @@
Shape outDimShape;
outDimShape.dimensions = {1, 1, 1, 2};
+ NNTRACE_COMP_SWITCH("optimized_ops::ResizeBilinear");
tflite::optimized_ops::ResizeBilinear(
inputData, convertShapeToDims(inputShape),
outDimData, convertShapeToDims(outDimShape),
@@ -54,14 +59,17 @@
bool depthToSpaceGeneric(const uint8_t* inputData, const Shape& inputShape,
int32_t blockSize,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("depthToSpaceGeneric");
if (inputShape.type == OperandType::TENSOR_FLOAT32) {
- tflite::optimized_ops::DepthToSpace(
- reinterpret_cast<const float*>(inputData),
- convertShapeToDims(inputShape),
- blockSize,
- reinterpret_cast<float*>(outputData),
- convertShapeToDims(outputShape));
+ NNTRACE_COMP_SWITCH("optimized_ops::DepthToSpace::float");
+ tflite::optimized_ops::DepthToSpace(
+ reinterpret_cast<const float*>(inputData),
+ convertShapeToDims(inputShape),
+ blockSize,
+ reinterpret_cast<float*>(outputData),
+ convertShapeToDims(outputShape));
} else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+ NNTRACE_COMP_SWITCH("optimized_ops::DepthToSpace::uint8");
tflite::optimized_ops::DepthToSpace(
reinterpret_cast<const uint8_t*>(inputData),
convertShapeToDims(inputShape),
@@ -78,7 +86,9 @@
bool spaceToDepthGeneric(const uint8_t* inputData, const Shape& inputShape,
int32_t blockSize,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("spaceToDepthGeneric");
if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+ NNTRACE_COMP_SWITCH("optimized_ops::SpaceToDepth::float");
tflite::optimized_ops::SpaceToDepth(
reinterpret_cast<const float*>(inputData),
convertShapeToDims(inputShape),
@@ -86,6 +96,7 @@
reinterpret_cast<float*>(outputData),
convertShapeToDims(outputShape));
} else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+ NNTRACE_COMP_SWITCH("optimized_ops::SpaceToDepth::uint8");
tflite::optimized_ops::SpaceToDepth(
reinterpret_cast<const uint8_t*>(inputData),
convertShapeToDims(inputShape),
@@ -102,6 +113,7 @@
bool padGeneric(const uint8_t* inputData, const Shape& inputShape,
const int32_t* paddings,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("padGeneric");
int32_t numInputDims = static_cast<int32_t>(getNumberOfDimensions(inputShape));
std::vector<int> beforePadding;
@@ -113,6 +125,7 @@
}
if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+ NNTRACE_COMP_SWITCH("optimized_ops::Pad::float");
tflite::optimized_ops::Pad(
reinterpret_cast<const float*>(inputData),
convertShapeToDims(inputShape),
@@ -120,6 +133,7 @@
reinterpret_cast<float*>(outputData),
convertShapeToDims(outputShape));
} else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+ NNTRACE_COMP_SWITCH("optimized_ops::Pad::uint8");
tflite::optimized_ops::Pad(
reinterpret_cast<const uint8_t*>(inputData),
convertShapeToDims(inputShape),
@@ -136,18 +150,21 @@
bool batchToSpaceGeneric(const uint8_t* inputData, const Shape& inputShape,
const int32_t* blockSize,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("batchToSpaceGeneric");
// Needed by low level implementation, but not really used.
tflite::Dims<4> blockSizeDim, cropsDim;
const int32 crops[4] = {0, 0, 0, 0};
if (inputShape.type == OperandType::TENSOR_FLOAT32) {
- tflite::optimized_ops::BatchToSpaceND(
- reinterpret_cast<const float*>(inputData),
- convertShapeToDims(inputShape),
- blockSize, blockSizeDim,
- crops, cropsDim,
- reinterpret_cast<float*>(outputData),
- convertShapeToDims(outputShape));
+ NNTRACE_COMP_SWITCH("optimized_ops::BatchToSpaceND::float");
+ tflite::optimized_ops::BatchToSpaceND(
+ reinterpret_cast<const float*>(inputData),
+ convertShapeToDims(inputShape),
+ blockSize, blockSizeDim,
+ crops, cropsDim,
+ reinterpret_cast<float*>(outputData),
+ convertShapeToDims(outputShape));
} else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+ NNTRACE_COMP_SWITCH("optimized_ops::BatchToSpaceND::uint8");
tflite::optimized_ops::BatchToSpaceND(
reinterpret_cast<const uint8_t*>(inputData),
convertShapeToDims(inputShape),
@@ -166,9 +183,11 @@
const int32_t* blockSize,
const int32_t* padding, const Shape& paddingShape,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("spaceToBatchGeneric");
// Needed by low level implementation, but not really used.
tflite::Dims<4> blockSizeDim;
if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+ NNTRACE_COMP_SWITCH("optimized_ops::SpaceToBatchND::float");
tflite::optimized_ops::SpaceToBatchND(
reinterpret_cast<const float*>(inputData),
convertShapeToDims(inputShape),
@@ -177,6 +196,7 @@
reinterpret_cast<float*>(outputData),
convertShapeToDims(outputShape));
} else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+ NNTRACE_COMP_SWITCH("optimized_ops::SpaceToBatchND::uint8");
tflite::optimized_ops::SpaceToBatchND(
reinterpret_cast<const uint8_t*>(inputData),
convertShapeToDims(inputShape),
@@ -193,6 +213,7 @@
bool squeezeGeneric(const void* inputData, const Shape& inputShape,
void* outputData, const Shape& outputShape) {
+ NNTRACE_COMP("squeezeGeneric");
size_t count = sizeOfData(inputShape.type, inputShape.dimensions);
memcpy(outputData, inputData, count);
return true;
@@ -201,6 +222,7 @@
bool transposeGeneric(const uint8_t* inputData, const Shape& inputShape,
const int32_t* perm, const Shape& permShape,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("transposeGeneric");
// Reverse the permuted axes and convert to 4D due to the way Dims are
// constructed.
const int32_t kOutputDimensionNum = 4;
@@ -215,6 +237,7 @@
reversed_perm[k] = k;
}
if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+ NNTRACE_COMP_SWITCH("optimized_ops::Transpose::float");
tflite::reference_ops::Transpose(
reinterpret_cast<const float*>(inputData),
convertShapeToDims(inputShape),
@@ -222,6 +245,7 @@
convertShapeToDims(outputShape),
reversed_perm);
} else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+ NNTRACE_COMP_SWITCH("optimized_ops::Transpose::uint8");
tflite::reference_ops::Transpose(
reinterpret_cast<const uint8_t*>(inputData),
convertShapeToDims(inputShape),
diff --git a/common/operations/SVDF.cpp b/common/operations/SVDF.cpp
index 38224a6..cdba351 100644
--- a/common/operations/SVDF.cpp
+++ b/common/operations/SVDF.cpp
@@ -19,6 +19,8 @@
#include "CpuExecutor.h"
#include "HalInterfaces.h"
+#include "Tracing.h"
+
namespace android {
namespace nn {
@@ -38,6 +40,7 @@
SVDF::SVDF(const Operation& operation,
std::vector<RunTimeOperandInfo>& operands) {
+ NNTRACE_TRANS("SVDF::SVDF");
input_ = GetInput(operation, operands, kInputTensor);
weights_feature_ = GetInput(operation, operands, kWeightsFeatureTensor);
weights_time_ = GetInput(operation, operands, kWeightsTimeTensor);
@@ -56,6 +59,7 @@
std::vector<RunTimeOperandInfo> &operands,
Shape *stateShape,
Shape *outputShape) {
+ NNTRACE_TRANS("SVDF::Prepare");
// Check we have all the inputs and outputs we need.
const int num_inputs = NumInputsWithValues(operation, operands);
@@ -103,6 +107,8 @@
}
bool SVDF::Eval() {
+ NNTRACE_COMP("SVDF::Eval");
+
const int rank = params_.rank_;
const int batch_size = SizeOfDimension(input_, 0);
const int input_size = SizeOfDimension(input_, 1);
diff --git a/common/operations/SimpleMath.cpp b/common/operations/SimpleMath.cpp
index 045ac97..e842439 100644
--- a/common/operations/SimpleMath.cpp
+++ b/common/operations/SimpleMath.cpp
@@ -24,6 +24,8 @@
#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "Tracing.h"
+
namespace android {
namespace nn {
@@ -50,9 +52,11 @@
const float* in2, const Shape& shape2,
int32_t activation,
float* out, const Shape& shapeOut) {
+ NNTRACE_TRANS("addFloat32");
bool needBroadcast = !SameShape(shape1, shape2);
if (needBroadcast) {
+ NNTRACE_COMP_SWITCH("optimized_ops::BroadcastAdd");
#define ANDROID_NN_BROADCAST_ADD(activation) \
tflite::optimized_ops::BroadcastAdd<tflite::FusedActivationFunctionType::activation>( \
in1, convertShapeToDims(shape1), \
@@ -66,6 +70,7 @@
CalculateActivationRangeFloat(activation, &output_activation_min,
&output_activation_max);
+ NNTRACE_COMP_SWITCH("optimized_ops::Add");
tflite::optimized_ops::Add(
in1, convertShapeToDims(shape1),
in2, convertShapeToDims(shape2),
@@ -80,6 +85,7 @@
const uint8_t* in2, const Shape& shape2,
int32_t activation,
uint8_t* out, const Shape& shapeOut) {
+ NNTRACE_TRANS("addQuant8");
bool needBroadcast = !SameShape(shape1, shape2);
const int32_t input1_offset = -shape1.offset;
@@ -118,6 +124,7 @@
&output_activation_max);
if (needBroadcast) {
+ NNTRACE_COMP_SWITCH("optimized_ops::BroadcastAdd");
tflite::optimized_ops::BroadcastAdd(
left_shift,
in1, convertShapeToDims(shape1),
@@ -128,6 +135,7 @@
output_activation_min, output_activation_max,
out, convertShapeToDims(shapeOut));
} else {
+ NNTRACE_COMP_SWITCH("optimized_ops::Add");
#define ANDROID_NN_NORMAL_ADD(activation) \
tflite::optimized_ops::Add<tflite::FusedActivationFunctionType::activation>( \
left_shift, \
@@ -150,10 +158,12 @@
const float* in2, const Shape& shape2,
int32_t activation,
float* out, const Shape& shapeOut) {
+ NNTRACE_TRANS("mulFloat32");
bool needBroadcast = !SameShape(shape1, shape2);
if (needBroadcast) {
- #define ANDROID_NN_BROADCAST_MUL(activation) \
+ NNTRACE_COMP_SWITCH("optimized_ops::BroadcastMul");
+ #define ANDROID_NN_BROADCAST_MUL(activation) \
tflite::optimized_ops::BroadcastMul<tflite::FusedActivationFunctionType::activation>( \
in1, convertShapeToDims(shape1), \
in2, convertShapeToDims(shape2), \
@@ -166,6 +176,7 @@
CalculateActivationRangeFloat(activation, &output_activation_min,
&output_activation_max);
+ NNTRACE_COMP_SWITCH("optimized_ops::Mul");
tflite::optimized_ops::Mul(
in1, convertShapeToDims(shape1),
in2, convertShapeToDims(shape2),
@@ -180,6 +191,7 @@
const uint8_t* in2, const Shape& shape2,
int32_t activation,
uint8_t* out, const Shape& shapeOut) {
+ NNTRACE_TRANS("mulQuant8");
const int32_t input1_offset = -shape1.offset;
const int32_t input2_offset = -shape2.offset;
const int32_t output_offset = shapeOut.offset;
@@ -198,6 +210,7 @@
&output_activation_max);
// Use BROADCAST version to handle the normal case.
+ NNTRACE_COMP_SWITCH("optimized_ops::BroadcastMul");
tflite::optimized_ops::BroadcastMul(
in1, convertShapeToDims(shape1), input1_offset,
in2, convertShapeToDims(shape2), input2_offset,
@@ -211,7 +224,9 @@
bool floorFloat32(const float* inputData,
float* outputData,
const Shape& shape) {
+ NNTRACE_TRANS("floorFloat32");
tflite::Dims<4> dim = convertShapeToDims(shape);
+ NNTRACE_COMP_SWITCH("optimized_ops::Floor");
tflite::optimized_ops::Floor(inputData, dim, outputData, dim);
return true;
}
@@ -219,7 +234,9 @@
bool dequantizeQuant8ToFloat32(const uint8_t* inputData,
float* outputData,
const Shape& shape) {
+ NNTRACE_TRANS("dequantizeQuant8ToFloat32");
tflite::Dims<4> dim = convertShapeToDims(shape);
+ NNTRACE_COMP_SWITCH("optimized_ops::Dequantize");
tflite::optimized_ops::Dequantize(inputData, dim,
shape.offset, shape.scale,
outputData, dim);
@@ -230,18 +247,21 @@
const float* in2, const Shape& shape2,
int32_t activation,
float* out, const Shape& shapeOut) {
+ NNTRACE_TRANS("subFloat32");
float output_activation_min, output_activation_max;
CalculateActivationRangeFloat(activation, &output_activation_min,
&output_activation_max);
bool needBroadcast = !SameShape(shape1, shape2);
if (needBroadcast) {
+ NNTRACE_COMP_SWITCH("optimized_ops::BroadcastSub");
tflite::optimized_ops::BroadcastSub(
in1, convertShapeToDims(shape1),
in2, convertShapeToDims(shape2),
output_activation_min, output_activation_max,
out, convertShapeToDims(shapeOut));
} else {
+ NNTRACE_COMP_SWITCH("optimized_ops::Sub");
tflite::optimized_ops::Sub(
in1, convertShapeToDims(shape1),
in2, convertShapeToDims(shape2),
@@ -255,18 +275,21 @@
const float* in2, const Shape& shape2,
int32_t activation,
float* out, const Shape& shapeOut) {
+ NNTRACE_TRANS("divFloat32");
float output_activation_min, output_activation_max;
CalculateActivationRangeFloat(activation, &output_activation_min,
&output_activation_max);
bool needBroadcast = !SameShape(shape1, shape2);
if (needBroadcast) {
+ NNTRACE_COMP_SWITCH("optimized_ops::BroadcastDiv");
tflite::optimized_ops::BroadcastDiv(
in1, convertShapeToDims(shape1),
in2, convertShapeToDims(shape2),
output_activation_min, output_activation_max,
out, convertShapeToDims(shapeOut));
} else {
+ NNTRACE_COMP_SWITCH("optimized_ops::Div");
tflite::optimized_ops::Div(
in1, convertShapeToDims(shape1),
in2, convertShapeToDims(shape2),
@@ -279,6 +302,7 @@
bool meanGeneric(const uint8_t* inputData, const Shape& inputShape,
const int32_t* axis, const Shape& axisShape, bool keepDims,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("meanGeneric");
// Creates a temp index to iterate through input data.
int32_t* scratchBuffer = new int32_t[getNumberOfDimensions(inputShape)];
@@ -293,6 +317,7 @@
LOG(ERROR) << "Failed to allocate tempSumBuffer for MEAN";
result = false;
} else {
+ NNTRACE_COMP_SWITCH("optimized_ops::Mean");
tflite::reference_ops::Mean<float, float>(
const_cast<float*>(reinterpret_cast<const float*>(inputData)),
reinterpret_cast<const int*>(inputShape.dimensions.data()),
@@ -310,6 +335,7 @@
LOG(ERROR) << "Failed to allocate tempSumBuffer for MEAN";
result = false;
} else {
+ NNTRACE_COMP_SWITCH("optimized_ops::Mean");
tflite::reference_ops::Mean<uint8_t, int32_t>(
const_cast<uint8_t*>(inputData),
reinterpret_cast<const int*>(inputShape.dimensions.data()),
diff --git a/common/operations/StridedSlice.cpp b/common/operations/StridedSlice.cpp
index 9db9523..222c48d 100644
--- a/common/operations/StridedSlice.cpp
+++ b/common/operations/StridedSlice.cpp
@@ -23,6 +23,8 @@
#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "Tracing.h"
+
namespace android {
namespace nn {
@@ -31,6 +33,7 @@
const int32_t* stridesData,
int32_t beginMask, int32_t endMask, int32_t shrinkAxisMask,
uint8_t* outputData, const Shape& outputShape) {
+ NNTRACE_TRANS("stridedSliceGeneric");
// This Op only supports 1-4D cases and since we use the reference 4D
// implementation, the 1-3D tensors are mapped to 4D.
const int kMaxDim = 4;
@@ -56,6 +59,7 @@
endMask = ReverseMaskBits(endMask, numInputDims);
if (inputShape.type == OperandType::TENSOR_FLOAT32) {
+ NNTRACE_COMP_SWITCH("reference_ops::StridedSlice::float");
tflite::reference_ops::StridedSlice(
reinterpret_cast<const float*>(inputData),
convertShapeToDims(inputShape),
@@ -64,6 +68,7 @@
reinterpret_cast<float*>(outputData),
convertShapeToDims(outputShape));
} else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
+ NNTRACE_COMP_SWITCH("reference_ops::StridedSlice::uint8");
tflite::reference_ops::StridedSlice(
reinterpret_cast<const uint8_t*>(inputData),
convertShapeToDims(inputShape),
diff --git a/driver/sample/Android.bp b/driver/sample/Android.bp
index 51a9df2..96a73de 100644
--- a/driver/sample/Android.bp
+++ b/driver/sample/Android.bp
@@ -27,6 +27,7 @@
],
shared_libs: [
"libbase",
+ "libcutils",
"libdl",
"libhardware",
"libhidlbase",
diff --git a/driver/sample/SampleDriver.cpp b/driver/sample/SampleDriver.cpp
index a608852..d43f597 100644
--- a/driver/sample/SampleDriver.cpp
+++ b/driver/sample/SampleDriver.cpp
@@ -20,6 +20,7 @@
#include "CpuExecutor.h"
#include "HalInterfaces.h"
+#include "Tracing.h"
#include "ValidateHal.h"
#include <android-base/logging.h>
@@ -31,6 +32,8 @@
namespace sample_driver {
Return<void> SampleDriver::getCapabilities(getCapabilities_cb cb) {
+ NNTRACE_FULL(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_INITIALIZATION,
+ "SampleDriver::getCapabilities");
return getCapabilities_1_1(
[&](ErrorStatus error, const V1_1::Capabilities& capabilities) {
// TODO(dgross): Do we need to check compliantWithV1_0(capabilities)?
@@ -40,6 +43,8 @@
Return<void> SampleDriver::getSupportedOperations(const V1_0::Model& model,
getSupportedOperations_cb cb) {
+ NNTRACE_FULL(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_COMPILATION,
+ "SampleDriver::getSupportedOperations");
if (!validateModel(model)) {
VLOG(DRIVER) << "getSupportedOperations";
std::vector<bool> supported;
@@ -51,6 +56,8 @@
Return<ErrorStatus> SampleDriver::prepareModel(const V1_0::Model& model,
const sp<IPreparedModelCallback>& callback) {
+ NNTRACE_FULL(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_COMPILATION,
+ "SampleDriver::prepareModel");
if (callback.get() == nullptr) {
VLOG(DRIVER) << "prepareModel";
LOG(ERROR) << "invalid callback passed to prepareModel";
@@ -68,6 +75,8 @@
Return<ErrorStatus> SampleDriver::prepareModel_1_1(const V1_1::Model& model,
ExecutionPreference preference,
const sp<IPreparedModelCallback>& callback) {
+ NNTRACE_FULL(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_COMPILATION,
+ "SampleDriver::prepareModel_1_1");
if (VLOG_IS_ON(DRIVER)) {
VLOG(DRIVER) << "prepareModel_1_1";
logModelToInfo(model);
@@ -92,6 +101,8 @@
}
Return<DeviceStatus> SampleDriver::getStatus() {
+ NNTRACE_FULL(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_UNSPECIFIED,
+ "SampleDriver::getStatus");
VLOG(DRIVER) << "getStatus()";
return DeviceStatus::AVAILABLE;
}
@@ -113,12 +124,16 @@
void SamplePreparedModel::asyncExecute(const Request& request,
const sp<IExecutionCallback>& callback) {
+ NNTRACE_FULL(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_INPUTS_AND_OUTPUTS,
+ "SampleDriver::asyncExecute");
std::vector<RunTimePoolInfo> requestPoolInfos;
if (!setRunTimePoolInfosFromHidlMemories(&requestPoolInfos, request.pools)) {
callback->notify(ErrorStatus::GENERAL_FAILURE);
return;
}
+ NNTRACE_FULL_SWITCH(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_EXECUTION,
+ "SampleDriver::asyncExecute");
CpuExecutor executor;
int n = executor.run(mModel, request, mPoolInfos, requestPoolInfos);
VLOG(DRIVER) << "executor.run returned " << n;
@@ -132,6 +147,8 @@
Return<ErrorStatus> SamplePreparedModel::execute(const Request& request,
const sp<IExecutionCallback>& callback) {
+ NNTRACE_FULL(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_EXECUTION,
+ "SampleDriver::execute");
VLOG(DRIVER) << "execute(" << SHOW_IF_DEBUG(toString(request)) << ")";
if (callback.get() == nullptr) {
LOG(ERROR) << "invalid callback passed to execute";
diff --git a/runtime/ExecutionBuilder.cpp b/runtime/ExecutionBuilder.cpp
index b5a472b..76d16d9 100644
--- a/runtime/ExecutionBuilder.cpp
+++ b/runtime/ExecutionBuilder.cpp
@@ -23,6 +23,7 @@
#include "HalInterfaces.h"
#include "Manager.h"
#include "ModelBuilder.h"
+#include "Tracing.h"
#include "Utils.h"
#include <mutex>
@@ -219,6 +220,7 @@
// Ensure that executionCallback->notify() is called.
static void cpuFallbackFull(const ExecutionBuilder* executionBuilder,
const sp<ExecutionCallback>& executionCallback) {
+ NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackFull");
VLOG(EXECUTION) << "cpuFallbackFull";
StepExecutor executor(executionBuilder, executionBuilder->getModel(),
nullptr /* no VersionedIDevice, so CPU */,
@@ -244,6 +246,7 @@
const ExecutionPlan* plan,
std::shared_ptr<ExecutionPlan::Controller> controller,
const sp<ExecutionCallback>& executionCallback) {
+ NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackPartial");
VLOG(EXECUTION) << "cpuFallbackPartial";
std::shared_ptr<StepExecutor> executor;
int n = plan->fallback(controller, &executor);
@@ -578,6 +581,7 @@
}
}
+ NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "StepExecutor::startComputeOnDevice");
// We separate the input & output pools so that we reduce the copying done if we
// do an eventual remoting (hidl_memory->update()). We could also use it to set
// protection on read only memory but that's not currently done.
@@ -618,6 +622,9 @@
request.pools[i] = mMemories[i]->getHidlMemory();
}
+ NNTRACE_FULL_SWITCH(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
+ "StepExecutor::startComputeOnDevice::execute");
+
// Prepare the callback for asynchronous execution. sp<ExecutionCallback>
// object is returned when the execution has been successfully launched,
// otherwise a nullptr is returned. The executionCallback is abstracted in
@@ -649,6 +656,8 @@
// TODO: Remove this synchronization point when the block of code below is
// removed.
executionCallback->wait();
+ NNTRACE_FULL_SWITCH(NNTRACE_LAYER_RUNTIME, NNTRACE_PHASE_EXECUTION,
+ "StepExecutor::startComputeOnDevice::waited");
Return<ErrorStatus> callbackStatus = executionCallback->getStatus();
if (!callbackStatus.isOk() || callbackStatus != ErrorStatus::NONE) {
VLOG(EXECUTION) << "**Execute async failed**";
@@ -661,6 +670,7 @@
// TODO: Move this block of code somewhere else. It should not be in the
// startCompute function.
// TODO: outputMemory->update(); outputMemory->commit()
+ NNTRACE_RT_SWITCH(NNTRACE_PHASE_RESULTS, "StepExecutor::startComputeOnDevice");
for (auto& info : mOutputs) {
if (info.state == ModelArgumentInfo::POINTER) {
DataLocation& loc = info.locationAndLength;
@@ -682,6 +692,7 @@
const std::vector<RunTimePoolInfo>& modelPoolInfos,
const std::vector<RunTimePoolInfo>& requestPoolInfos,
const sp<IExecutionCallback>& executionCallback) {
+ NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "asyncStartComputeOnCpu");
CpuExecutor executor;
int err = executor.run(model, request, modelPoolInfos, requestPoolInfos);
executionCallback->notify(convertResultCodeToErrorStatus(err));
@@ -689,6 +700,8 @@
int StepExecutor::startComputeOnCpu(sp<ExecutionCallback>* synchronizationCallback) {
// TODO: use a thread pool
+ // TODO(mikie): this could have NNTRACE so we could measure the overhead of
+ // spinning up a new thread.
Model model;
mModel->setHidlModel(&model);
diff --git a/runtime/ExecutionPlan.cpp b/runtime/ExecutionPlan.cpp
index c8f7550..5fd370d 100644
--- a/runtime/ExecutionPlan.cpp
+++ b/runtime/ExecutionPlan.cpp
@@ -23,6 +23,7 @@
#include "ExecutionBuilder.h"
#include "Manager.h"
#include "ModelBuilder.h"
+#include "Tracing.h"
#include "Utils.h"
#include <functional>
@@ -46,6 +47,10 @@
model->setHidlModel(&hidlModel);
sp<PreparedModelCallback> preparedModelCallback = new PreparedModelCallback();
+
+ // Note that some work within VersionedIDevice will be subtracted from the
+ // IPC layer.
+ NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "prepareModel");
Return<ErrorStatus> prepareLaunchStatus = device->getInterface()->prepareModel(
hidlModel, static_cast<ExecutionPreference>(executionPreference), preparedModelCallback);
if (!prepareLaunchStatus.isOk()) {
diff --git a/runtime/NeuralNetworks.cpp b/runtime/NeuralNetworks.cpp
index 6f1dcca..e384f3d 100644
--- a/runtime/NeuralNetworks.cpp
+++ b/runtime/NeuralNetworks.cpp
@@ -29,6 +29,7 @@
#include "Memory.h"
#include "NeuralNetworksOEM.h"
#include "ModelBuilder.h"
+#include "Tracing.h"
#include "Utils.h"
#include <memory>
@@ -251,6 +252,7 @@
int ANeuralNetworksMemory_createFromFd(size_t size, int prot, int fd, size_t offset,
ANeuralNetworksMemory** memory) {
+ NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksMemory_createFromFd");
*memory = nullptr;
std::unique_ptr<MemoryFd> m = std::make_unique<MemoryFd>();
if (m == nullptr) {
@@ -265,12 +267,14 @@
}
void ANeuralNetworksMemory_free(ANeuralNetworksMemory* memory) {
+ NNTRACE_RT(NNTRACE_PHASE_TERMINATION, "ANeuralNetworksMemory_free");
// No validation. Free of nullptr is valid.
Memory* m = reinterpret_cast<Memory*>(memory);
delete m;
}
int ANeuralNetworksModel_create(ANeuralNetworksModel** model) {
+ NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_create");
initVLogMask();
if (!model) {
LOG(ERROR) << "ANeuralNetworksModel_create passed a nullptr";
@@ -286,12 +290,14 @@
}
void ANeuralNetworksModel_free(ANeuralNetworksModel* model) {
+ NNTRACE_RT(NNTRACE_PHASE_TERMINATION, "ANeuralNetworksModel_free");
// No validation. Free of nullptr is valid.
ModelBuilder* m = reinterpret_cast<ModelBuilder*>(model);
delete m;
}
int ANeuralNetworksModel_finish(ANeuralNetworksModel* model) {
+ NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_finish");
if (!model) {
LOG(ERROR) << "ANeuralNetworksModel_finish passed a nullptr";
return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -302,6 +308,7 @@
int ANeuralNetworksModel_addOperand(ANeuralNetworksModel* model,
const ANeuralNetworksOperandType* type) {
+ NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_addOperand");
if (!model || !type) {
LOG(ERROR) << "ANeuralNetworksModel_addOperand passed a nullptr";
return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -312,6 +319,7 @@
int ANeuralNetworksModel_setOperandValue(ANeuralNetworksModel* model, int32_t index,
const void* buffer, size_t length) {
+ NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_setOperandValue");
if (!model || (!buffer && length != 0)) {
LOG(ERROR) << "ANeuralNetworksModel_setOperandValue passed a nullptr";
return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -323,6 +331,7 @@
int ANeuralNetworksModel_setOperandValueFromMemory(ANeuralNetworksModel* model, int32_t index,
const ANeuralNetworksMemory* memory,
size_t offset, size_t length) {
+ NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_setOperandValueFromMemory");
if (!model || !memory) {
LOG(ERROR) << "ANeuralNetworksModel_setOperandValue passed a nullptr";
return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -336,6 +345,7 @@
ANeuralNetworksOperationType type, uint32_t inputCount,
const uint32_t* inputs, uint32_t outputCount,
const uint32_t* outputs) {
+ NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_addOperation");
if (!model || !inputs || !outputs) {
LOG(ERROR) << "ANeuralNetworksModel_addOperation passed a nullptr";
return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -347,6 +357,7 @@
int ANeuralNetworksModel_identifyInputsAndOutputs(ANeuralNetworksModel* model, uint32_t inputCount,
const uint32_t* inputs, uint32_t outputCount,
const uint32_t* outputs) {
+ NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_identifyInputsAndOutputs");
if (!model || !inputs || !outputs) {
LOG(ERROR) << ("ANeuralNetworksModel_identifyInputsAndOutputs passed a nullptr");
return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -357,6 +368,7 @@
int ANeuralNetworksModel_relaxComputationFloat32toFloat16(ANeuralNetworksModel* model,
bool allow) {
+ NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksModel_relaxComputationFloat32toFloat16");
if (!model) {
LOG(ERROR) << ("ANeuralNetworksModel_relaxComputationFloat32toFloat16 passed a nullptr");
return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -367,6 +379,7 @@
int ANeuralNetworksCompilation_create(ANeuralNetworksModel* model,
ANeuralNetworksCompilation** compilation) {
+ NNTRACE_RT(NNTRACE_PHASE_COMPILATION, "ANeuralNetworksCompilation_create");
if (!model || !compilation) {
LOG(ERROR) << "ANeuralNetworksCompilation_create passed a nullptr";
return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -380,6 +393,7 @@
}
void ANeuralNetworksCompilation_free(ANeuralNetworksCompilation* compilation) {
+ NNTRACE_RT(NNTRACE_PHASE_TERMINATION, "ANeuralNetworksCompilation_free");
// No validation. Free of nullptr is valid.
// TODO specification says that a compilation-in-flight can be deleted
CompilationBuilder* c = reinterpret_cast<CompilationBuilder*>(compilation);
@@ -388,6 +402,7 @@
int ANeuralNetworksCompilation_setPreference(ANeuralNetworksCompilation* compilation,
int32_t preference) {
+ NNTRACE_RT(NNTRACE_PHASE_COMPILATION, "ANeuralNetworksCompilation_setPreference");
if (!compilation) {
LOG(ERROR) << "ANeuralNetworksCompilation_setPreference passed a nullptr";
return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -397,6 +412,7 @@
}
int ANeuralNetworksCompilation_finish(ANeuralNetworksCompilation* compilation) {
+ NNTRACE_RT(NNTRACE_PHASE_COMPILATION, "ANeuralNetworksCompilation_finish");
if (!compilation) {
LOG(ERROR) << "ANeuralNetworksCompilation_finish passed a nullptr";
return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -407,6 +423,7 @@
int ANeuralNetworksExecution_create(ANeuralNetworksCompilation* compilation,
ANeuralNetworksExecution** execution) {
+ NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "ANeuralNetworksExecution_create");
if (!compilation || !execution) {
LOG(ERROR) << "ANeuralNetworksExecution_create passed a nullptr";
return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -420,6 +437,7 @@
}
void ANeuralNetworksExecution_free(ANeuralNetworksExecution* execution) {
+ NNTRACE_RT(NNTRACE_PHASE_TERMINATION, "ANeuralNetworksExecution_free");
// TODO specification says that an execution-in-flight can be deleted
// No validation. Free of nullptr is valid.
ExecutionBuilder* r = reinterpret_cast<ExecutionBuilder*>(execution);
@@ -429,6 +447,7 @@
int ANeuralNetworksExecution_setInput(ANeuralNetworksExecution* execution, int32_t index,
const ANeuralNetworksOperandType* type, const void* buffer,
size_t length) {
+ NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "ANeuralNetworksExecution_setInput");
if (!execution || (!buffer && length != 0)) {
LOG(ERROR) << "ANeuralNetworksExecution_setInput passed a nullptr";
return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -441,6 +460,7 @@
const ANeuralNetworksOperandType* type,
const ANeuralNetworksMemory* memory, size_t offset,
size_t length) {
+ NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "ANeuralNetworksExecution_setInputFromMemory");
if (!execution || !memory) {
LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory passed a nullptr";
return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -454,6 +474,7 @@
int ANeuralNetworksExecution_setOutput(ANeuralNetworksExecution* execution, int32_t index,
const ANeuralNetworksOperandType* type, void* buffer,
size_t length) {
+ NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "ANeuralNetworksExecution_setOutput");
if (!execution || (!buffer && length != 0)) {
LOG(ERROR) << "ANeuralNetworksExecution_setOutput passed a nullptr";
return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -466,6 +487,7 @@
const ANeuralNetworksOperandType* type,
const ANeuralNetworksMemory* memory, size_t offset,
size_t length) {
+ NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "ANeuralNetworksExecution_setOutputFromMemory");
if (!execution || !memory) {
LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory passed a nullptr";
return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -478,6 +500,7 @@
int ANeuralNetworksExecution_startCompute(ANeuralNetworksExecution* execution,
ANeuralNetworksEvent** event) {
+ NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "ANeuralNetworksExecution_startCompute");
if (!execution || !event) {
LOG(ERROR) << "ANeuralNetworksExecution_startCompute passed a nullptr";
return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -504,6 +527,7 @@
}
int ANeuralNetworksEvent_wait(ANeuralNetworksEvent* event) {
+ NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "ANeuralNetworksEvent_wait");
if (event == nullptr) {
LOG(ERROR) << "ANeuralNetworksEvent_wait passed a nullptr";
return ANEURALNETWORKS_UNEXPECTED_NULL;
@@ -515,6 +539,7 @@
}
void ANeuralNetworksEvent_free(ANeuralNetworksEvent* event) {
+ NNTRACE_RT(NNTRACE_PHASE_TERMINATION, "ANeuralNetworksEvent_free");
// No validation. Free of nullptr is valid.
if (event) {
sp<ExecutionCallback>* e = reinterpret_cast<sp<ExecutionCallback>*>(event);
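
The NeuralNetworks.cpp changes above all follow one pattern: a single scoped runtime-layer trace at the top of each public NDK entry point, tagged with the phase the call belongs to (preparation, compilation, inputs/outputs, execution, termination). A minimal sketch of that pattern, not part of the patch; the entry point name is made up for illustration:

    // Hypothetical entry point, for illustration only.
    int ANeuralNetworksFoo_bar(ANeuralNetworksModel* model) {
        // Scoped trace: the section covers the whole call, including validation.
        NNTRACE_RT(NNTRACE_PHASE_PREPARATION, "ANeuralNetworksFoo_bar");
        if (!model) {
            LOG(ERROR) << "ANeuralNetworksFoo_bar passed a nullptr";
            return ANEURALNETWORKS_UNEXPECTED_NULL;
        }
        // ... real work happens inside the traced scope ...
        return ANEURALNETWORKS_NO_ERROR;
    }
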
diff --git a/runtime/VersionedIDevice.cpp b/runtime/VersionedIDevice.cpp
index 110eecc..1226552 100644
--- a/runtime/VersionedIDevice.cpp
+++ b/runtime/VersionedIDevice.cpp
@@ -16,6 +16,7 @@
#include "VersionedIDevice.h"
+#include "Tracing.h"
#include "Utils.h"
#include <android-base/logging.h>
@@ -32,6 +33,7 @@
std::pair<ErrorStatus, Capabilities> result;
if (mDeviceV1_1 != nullptr) {
+ NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_INITIALIZATION, "getCapabilities_1_1");
Return<void> ret = mDeviceV1_1->getCapabilities_1_1(
[&result](ErrorStatus error, const Capabilities& capabilities) {
result = std::make_pair(error, capabilities);
@@ -41,8 +43,10 @@
return {ErrorStatus::GENERAL_FAILURE, {}};
}
} else if (mDeviceV1_0 != nullptr) {
+ NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_INITIALIZATION, "getCapabilities");
Return<void> ret = mDeviceV1_0->getCapabilities(
[&result](ErrorStatus error, const V1_0::Capabilities& capabilities) {
+ // Time taken to convert capabilities is trivial, so it is not traced separately.
result = std::make_pair(error, convertToV1_1(capabilities));
});
if (!ret.isOk()) {
@@ -62,6 +66,7 @@
std::pair<ErrorStatus, hidl_vec<bool>> result;
if (mDeviceV1_1 != nullptr) {
+ NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "getSupportedOperations_1_1");
Return<void> ret = mDeviceV1_1->getSupportedOperations_1_1(
model, [&result](ErrorStatus error, const hidl_vec<bool>& supported) {
result = std::make_pair(error, supported);
@@ -71,8 +76,10 @@
return {ErrorStatus::GENERAL_FAILURE, {}};
}
} else if (mDeviceV1_0 != nullptr && compliantWithV1_0(model)) {
+ V1_0::Model model10 = convertToV1_0(model);
+ NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "getSupportedOperations_1_0");
Return<void> ret = mDeviceV1_0->getSupportedOperations(
- convertToV1_0(model), [&result](ErrorStatus error, const hidl_vec<bool>& supported) {
+ model10, [&result](ErrorStatus error, const hidl_vec<bool>& supported) {
result = std::make_pair(error, supported);
});
if (!ret.isOk()) {
@@ -98,17 +105,35 @@
return ErrorStatus::GENERAL_FAILURE;
}
return static_cast<ErrorStatus>(ret);
- } else if (mDeviceV1_0 != nullptr && compliantWithV1_0(model)) {
- Return<ErrorStatus> ret = mDeviceV1_0->prepareModel(convertToV1_0(model), callback);
- if (!ret.isOk()) {
- LOG(ERROR) << "prepareModel failure: " << ret.description();
+ } else if (mDeviceV1_0 != nullptr) {
+ bool compliant = false;
+ V1_0::Model model10;
+ {
+ // Attribute the time spent on model inspection and conversion to the
+ // runtime layer, as it may be substantial (0.03 ms for mobilenet,
+ // but could be larger for other models).
+ NNTRACE_FULL_SUBTRACT(NNTRACE_LAYER_RUNTIME, NNTRACE_PHASE_COMPILATION,
+ "VersionedIDevice::prepareModel");
+ compliant = compliantWithV1_0(model);
+ if (compliant) {
+ model10 = convertToV1_0(model); // copy is elided
+ }
+ }
+ if (compliant) {
+ Return<ErrorStatus> ret = mDeviceV1_0->prepareModel(model10, callback);
+ if (!ret.isOk()) {
+ LOG(ERROR) << "prepareModel failure: " << ret.description();
+ return ErrorStatus::GENERAL_FAILURE;
+ }
+ return static_cast<ErrorStatus>(ret);
+ } else {
+ // TODO: partition the model such that v1.1 ops are not passed to v1.0
+ // device
+ LOG(ERROR) << "Could not handle prepareModel!";
return ErrorStatus::GENERAL_FAILURE;
}
- return static_cast<ErrorStatus>(ret);
} else {
- // TODO: partition the model such that v1.1 ops are not passed to v1.0
- // device
- LOG(ERROR) << "Could not handle prepareModel!";
+ LOG(ERROR) << "prepareModel called with no device";
return ErrorStatus::GENERAL_FAILURE;
}
}
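
In VersionedIDevice.cpp the tracing uses two flavors: NNTRACE_FULL marks a span charged to the IPC/driver layer, while NNTRACE_FULL_SUBTRACT re-attributes a nested span back to the runtime layer, which the comment above uses to keep model inspection and conversion out of the driver-side numbers. A sketch of that shape, not part of the patch; exampleIpcWrapper, convertModel and someIpcCall are hypothetical names:

    void exampleIpcWrapper(const Model& model) {
        // The wrapper as a whole is charged to the IPC layer.
        NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "exampleIpcWrapper");
        V1_0::Model model10;
        {
            // Conversion cost is re-attributed to the runtime layer so the
            // enclosing IPC span reflects only the HAL call itself.
            NNTRACE_FULL_SUBTRACT(NNTRACE_LAYER_RUNTIME, NNTRACE_PHASE_COMPILATION,
                                  "exampleIpcWrapper");
            model10 = convertModel(model);
        }
        someIpcCall(model10);
    }
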
diff --git a/runtime/test/Android.bp b/runtime/test/Android.bp
index 7dbb28a..addc5c0 100644
--- a/runtime/test/Android.bp
+++ b/runtime/test/Android.bp
@@ -22,6 +22,7 @@
shared_libs: [
"libandroid",
"libbase",
+ "libcutils",
"libhidlbase",
"libhidltransport",
"libhidlmemory",
diff --git a/runtime/test/GeneratedUtils.cpp b/runtime/test/GeneratedUtils.cpp
index fc5671f..a9a33fa 100644
--- a/runtime/test/GeneratedUtils.cpp
+++ b/runtime/test/GeneratedUtils.cpp
@@ -31,6 +31,17 @@
#include <map>
#include <thread>
+// Systrace is not available to CTS tests due to platform layering
+// constraints. We reuse the NNTEST_ONLY_PUBLIC_API flag, since CTS is
+// likewise restricted to public APIs only.
+#ifndef NNTEST_ONLY_PUBLIC_API
+#include "Tracing.h"
+#else
+#define NNTRACE_FULL_RAW(...)
+#define NNTRACE_APP(...)
+#define NNTRACE_APP_SWITCH(...)
+#endif
+
namespace generated_tests {
using namespace android::nn::wrapper;
using namespace test_helper;
@@ -62,9 +73,13 @@
}
Compilation createAndCompileModel(Model* model, std::function<void(Model*)> createModel) {
+ NNTRACE_APP(NNTRACE_PHASE_PREPARATION, "createAndCompileModel");
+
createModel(model);
model->finish();
graphDump("", *model);
+
+ NNTRACE_APP_SWITCH(NNTRACE_PHASE_COMPILATION, "createAndCompileModel");
Compilation compilation(model);
compilation.finish();
@@ -86,6 +101,7 @@
// If in relaxed mode, set the error range to be 5ULP of FP16.
float fpRange = !model->isRelaxed() ? 1e-5f : 5.0f * 0.0009765625f;
for (auto& example : examples) {
+ NNTRACE_APP(NNTRACE_PHASE_EXECUTION, "executeWithCompilation example");
SCOPED_TRACE(exampleNo);
// TODO: We leave it as a copy here.
// Should verify if the input gets modified by the test later.
@@ -94,6 +110,7 @@
Execution execution(compilation);
+ NNTRACE_APP_SWITCH(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "executeWithCompilation example");
// Set all inputs
for_all(inputs, [&execution](int idx, const void* p, size_t s) {
const void* buffer = s == 0 ? nullptr : p;
@@ -108,9 +125,11 @@
ASSERT_EQ(Result::NO_ERROR, execution.setOutput(idx, buffer, s));
});
+ NNTRACE_APP_SWITCH(NNTRACE_PHASE_EXECUTION, "executeWithCompilation example");
Result r = execution.compute();
ASSERT_EQ(Result::NO_ERROR, r);
+ NNTRACE_APP_SWITCH(NNTRACE_PHASE_RESULTS, "executeWithCompilation example");
// Dump all outputs for the slicing tool
if (dumpToFile) {
s << "output" << exampleNo << " = {\n";
@@ -133,6 +152,7 @@
std::function<bool(int)> isIgnored,
std::vector<MixedTypedExample>& examples,
std::string dumpFile) {
+ NNTRACE_APP(NNTRACE_PHASE_OVERALL, "executeOnce");
Model model;
Compilation compilation = createAndCompileModel(&model, createModel);
executeWithCompilation(&model, &compilation, isIgnored, examples, dumpFile);
@@ -142,6 +162,7 @@
void executeMultithreadedOwnCompilation(std::function<void(Model*)> createModel,
std::function<bool(int)> isIgnored,
std::vector<MixedTypedExample>& examples) {
+ NNTRACE_APP(NNTRACE_PHASE_OVERALL, "executeMultithreadedOwnCompilation");
SCOPED_TRACE("MultithreadedOwnCompilation");
std::vector<std::thread> threads;
for (int i = 0; i < 10; i++) {
@@ -157,6 +178,7 @@
void executeMultithreadedSharedCompilation(std::function<void(Model*)> createModel,
std::function<bool(int)> isIgnored,
std::vector<MixedTypedExample>& examples) {
+ NNTRACE_APP(NNTRACE_PHASE_OVERALL, "executeMultithreadedSharedCompilation");
SCOPED_TRACE("MultithreadedSharedCompilation");
Model model;
Compilation compilation = createAndCompileModel(&model, createModel);
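
The test changes use the application-layer macros: NNTRACE_APP opens a trace for the enclosing scope, and NNTRACE_APP_SWITCH (judging from its use above) marks that the remainder of the scope belongs to a new phase, so a single test function can contribute timings to preparation, compilation, inputs/outputs, execution and results separately. Under NNTEST_ONLY_PUBLIC_API the stub macros defined earlier expand to nothing, so the same sources still build for CTS. A condensed sketch, not part of the patch; buildModel is a hypothetical helper:

    void exampleTracedTest(Model* model) {
        NNTRACE_APP(NNTRACE_PHASE_PREPARATION, "exampleTracedTest");
        buildModel(model);               // attributed to PREPARATION
        model->finish();

        NNTRACE_APP_SWITCH(NNTRACE_PHASE_COMPILATION, "exampleTracedTest");
        Compilation compilation(model);  // attributed to COMPILATION from here on
        compilation.finish();
    }
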