| /* |
| * Copyright (C) 2017 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #define LOG_TAG "ExecutionPlan" |
| |
| #include "ExecutionPlan.h" |
| |
| #include <cutils/native_handle.h> |
| #include <fcntl.h> |
| #include <openssl/sha.h> |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| |
| #include <algorithm> |
| #include <functional> |
| #include <map> |
| #include <memory> |
| #include <mutex> |
| #include <queue> |
| #include <set> |
| #include <string> |
| #include <type_traits> |
| #include <unordered_set> |
| #include <utility> |
| #include <vector> |
| |
| #include "BurstBuilder.h" |
| #include "Callbacks.h" |
| #include "CompilationBuilder.h" |
| #include "ControlFlow.h" |
| #include "CpuExecutor.h" |
| #include "ExecutionBuilder.h" |
| #include "ExecutionBurstController.h" |
| #include "GraphDump.h" |
| #include "Manager.h" |
| #include "MetaModel.h" |
| #include "ModelBuilder.h" |
| #include "OperationsUtils.h" |
| #include "TokenHasher.h" |
| #include "Tracing.h" |
| #include "TypeManager.h" |
| #include "Utils.h" |
| |
| namespace android { |
| namespace nn { |
| |
| namespace { |
| |
| using namespace hal; |
| |
| // The index of the main model in SourceModels. |
| constexpr uint32_t kMainModelInSourceModels = 0; |
| |
// Compiles the model on device.
// If compilation caching is available, then depending on ExecutionPlan::mState, the token may
// have been initialized only from the user-provided token (SIMPLE body), or may already have
// been re-hashed with the indices of the operations to be executed (COMPOUND body). This
// function re-hashes the token further with the device name, the device version string, the
// execution preference, and the compilation priority.
| int compile(const Device& device, const ModelBuilder& model, int executionPreference, |
| int compilationPriority, const std::optional<Deadline>& deadline, |
| const std::string& cacheDir, TokenHasher* token, |
| std::shared_ptr<PreparedModel>* preparedModel) { |
| CHECK(token != nullptr); |
| CHECK(preparedModel != nullptr); |
| *preparedModel = nullptr; |
| |
| std::optional<CacheToken> cacheToken; |
| if (device.isCachingSupported() && token->ok() && |
| token->updateFromString(device.getName().c_str()) && |
| token->updateFromString(device.getVersionString().c_str()) && |
| token->update(&executionPreference, sizeof(executionPreference)) && |
| token->update(&compilationPriority, sizeof(compilationPriority)) && token->finish()) { |
| cacheToken.emplace(token->getCacheToken()); |
| } |
| |
| const ModelFactory makeModel = [&model] { return model.makeHidlModel(); }; |
| const ExecutionPreference preference = static_cast<ExecutionPreference>(executionPreference); |
| const Priority priority = convertToHalPriority(compilationPriority); |
| const auto [n, returnedPreparedModel] = |
| device.prepareModel(makeModel, preference, priority, deadline, cacheDir, cacheToken); |
| *preparedModel = returnedPreparedModel; |
| return n; |
| } |
| |
using OperationReadyCallback = std::function<void(uint32_t)>;
| |
| int copyOperandExtraParams(ModelBuilder& model, uint32_t toOperandIndex, |
| const Operand& fromOperand) { |
| if (fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL && |
| fromOperand.extraParams.getDiscriminator() == |
| OperandExtraParams::hidl_discriminator::channelQuant) { |
| auto& fromChannelQuant = fromOperand.extraParams.channelQuant(); |
| ANeuralNetworksSymmPerChannelQuantParams toChannelQuant = { |
| .channelDim = fromChannelQuant.channelDim, |
| .scaleCount = static_cast<uint32_t>(fromChannelQuant.scales.size()), |
| .scales = fromChannelQuant.scales.data(), |
| }; |
| return model.setOperandSymmPerChannelQuantParams(toOperandIndex, toChannelQuant); |
| } else if (isExtensionOperandType(fromOperand.type) && |
| fromOperand.extraParams.getDiscriminator() == |
| OperandExtraParams::hidl_discriminator::extension) { |
        const hidl_vec<uint8_t>& extensionData = fromOperand.extraParams.extension();
| return model.setOperandExtensionData(toOperandIndex, extensionData.data(), |
| extensionData.size()); |
| } else if (fromOperand.extraParams.getDiscriminator() != |
| OperandExtraParams::hidl_discriminator::none || |
| fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) { |
| LOG(ERROR) << "Type " << toString(fromOperand.type) |
| << " has an unexpected extraParams discriminator: " |
| << static_cast<int>(fromOperand.extraParams.getDiscriminator()); |
| return ANEURALNETWORKS_BAD_DATA; |
| } else { |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| } |
| |
| // This class tracks whether we know the value of an operand as operations |
| // are processed. |
| class OperandTracker { |
| public: |
    // Creates the tracker for this model. Figures out which operations can be
    // executed right away and calls cb for each one of them.
| OperandTracker(const ModelBuilder* model, OperationReadyCallback cb); |
    // Marks the specified operation as having been processed. Now that the
    // outputs of the operation are known, other operations may become ready
    // to run. Calls cb for each one of them.
| void markProcessed(uint32_t operationIndex, OperationReadyCallback cb); |
| |
| private: |
| const ModelBuilder* mModel; |
| std::multimap<uint32_t, uint32_t> mOperandToOperations; |
| std::vector<uint32_t> mUnknownInputCount; // For each operation |
| }; |
| |
| OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb) |
| : mModel(model) { |
| const auto& operations = mModel->getOperations(); |
| mUnknownInputCount.resize(operations.size()); |
| for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) { |
| const Operation& operation = operations[operationIndex]; |
| uint32_t count = 0; |
| for (uint32_t operandIndex : operation.inputs) { |
| auto lifetime = mModel->getOperand(operandIndex).lifetime; |
| if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE || |
| lifetime == OperandLifeTime::SUBGRAPH_OUTPUT) { |
| count++; |
| mOperandToOperations.emplace(operandIndex, operationIndex); |
| } |
| } |
| if (count == 0) { |
| cb(operationIndex); |
| } |
| mUnknownInputCount[operationIndex] = count; |
| } |
| } |
| |
| void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) { |
| // Mark all its outputs as known. |
| const Operation& operation = mModel->getOperations()[operationIndex]; |
| for (uint32_t operandIndex : operation.outputs) { |
| auto range = mOperandToOperations.equal_range(operandIndex); |
| for (auto i = range.first; i != range.second; i++) { |
| uint32_t& count = mUnknownInputCount[i->second]; |
| if (--count == 0) { |
| cb(i->second); |
| } |
| } |
| } |
| } |
| |
| } // namespace |
| |
| ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex, uint32_t sourceModelIndex, |
| std::shared_ptr<Device> device) |
| : mPlan(plan), |
| mIndex(stepIndex), |
| mSourceModelIndex(sourceModelIndex), |
| mStepModel(), |
| mDevice(device), |
| mToken(plan->getCacheToken()) {} |
| |
| // Adds an operand if it has not been added already. |
| // Sets the index in the step model for the corresponding operand. |
| int ExecutionStep::addOperand(uint32_t sourceOperandIndex, uint32_t* stepOperandIndex, |
| OperandKind kind) { |
| // Have we added this operand already? |
| auto i = mOperandMap.find(sourceOperandIndex); |
| if (i != mOperandMap.end()) { |
| CHECK(kind == INPUT); |
| *stepOperandIndex = i->second; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| // First time we add this operand. |
| *stepOperandIndex = mStepModel.operandCount(); |
| mOperandMap.emplace(sourceOperandIndex, *stepOperandIndex); |
| |
| // Add the operand to the step model. |
| const ModelBuilder& sourceModel = *getSourceModel(); |
| const Operand& operand = sourceModel.getOperand(sourceOperandIndex); |
| ANeuralNetworksOperandType type = { |
| .type = static_cast<int32_t>(operand.type), |
| .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()), |
| .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr, |
| .scale = operand.scale, |
| .zeroPoint = operand.zeroPoint, |
| }; |
| |
| int n = mStepModel.addOperand(type); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| LOG(ERROR) << "Previous error occurred when partitioning the graph"; |
| return n; |
| } |
| |
| n = copyOperandExtraParams(mStepModel, *stepOperandIndex, operand); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| LOG(ERROR) << "Error when copying extra parameters to the operand"; |
| return n; |
| } |
| |
| // Sets its value. |
| switch (operand.lifetime) { |
| case OperandLifeTime::CONSTANT_COPY: { |
| const uint8_t* data = sourceModel.getPointerToOperandValue(operand.location.offset); |
| n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| LOG(ERROR) << "Previous error occurred when partitioning the graph"; |
| return n; |
| } |
| } break; |
| case OperandLifeTime::CONSTANT_REFERENCE: { |
| const Memory* memory = sourceModel.getMemories()[operand.location.poolIndex]; |
| n = mStepModel.setOperandValueFromMemory( |
| *stepOperandIndex, memory, operand.location.offset, operand.location.length); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| LOG(ERROR) << "Previous error occurred when partitioning the graph"; |
| return n; |
| } |
| } break; |
| case OperandLifeTime::NO_VALUE: { |
| n = mStepModel.setOperandValue(*stepOperandIndex, nullptr, 0); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| LOG(ERROR) << "Previous error occurred when partitioning the graph"; |
| return n; |
| } |
| } break; |
| case OperandLifeTime::TEMPORARY_VARIABLE: { // handled similarly to SUBGRAPH_OUTPUT |
| if (kind == INPUT) { |
| // The first time we've seen this operand is as an |
| // input. That means it must be defined by a |
| // different partition, and is an input to this one. |
| mTempsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex); |
| } else { |
| // The first time we've seen this operand is as an |
| // output. It may be an input to a different |
| // partition, so keep track of it. |
| mPlan->recordTemporaryDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex), |
| mIndex); |
| } |
| } break; |
| case OperandLifeTime::SUBGRAPH_INPUT: { |
| mModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex); |
| } break; |
| case OperandLifeTime::SUBGRAPH_OUTPUT: { // handled similarly to TEMPORARY_VARIABLE |
| if (kind == INPUT) { |
| // The first time we've seen this operand is as an |
| // input. That means it must be defined by a |
| // different partition, and is an input to this one. |
| mOutputsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex); |
| } else { |
| // The first time we've seen this operand is as an |
| // output. |
| mModelOutputs.emplace_back(sourceOperandIndex, *stepOperandIndex); |
| } |
| } break; |
| case OperandLifeTime::SUBGRAPH: { |
| const ModelBuilder* model = sourceModel.getReferencedModel(operand); |
| n = mStepModel.setOperandValueFromModel(*stepOperandIndex, model); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| LOG(ERROR) << "Previous error occurred when partitioning the graph"; |
| return n; |
| } |
| } break; |
| default: { |
| CHECK(!"unexpected"); |
| } break; |
| } |
| |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| int ExecutionStep::addOperation(int operationIndex) { |
| const Operation& operation = getSourceModel()->getOperation(operationIndex); |
| if (mToken.ok()) { |
| mToken.update(&operationIndex, sizeof(operationIndex)); |
| } |
| |
| // Convert the input and output operand indexes. |
| // |
| // We expect operations to be added in topological order. Therefore: |
| // |
| // - We may not have seen an input if it is a model input, a |
| // constant, or an operand written by a different partition. |
| // |
| // - We should not have seen any outputs. |
| auto addOperands = [this](const hidl_vec<uint32_t>& sourceModelOperands, |
| std::vector<uint32_t>* stepModelOperands, OperandKind kind) -> int { |
| const uint32_t operandCount = static_cast<uint32_t>(sourceModelOperands.size()); |
| for (uint32_t i = 0; i < operandCount; i++) { |
| NN_RETURN_IF_ERROR(addOperand(sourceModelOperands[i], &stepModelOperands->at(i), kind)); |
| } |
| return ANEURALNETWORKS_NO_ERROR; |
| }; |
| |
| const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size()); |
| const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size()); |
| std::vector<uint32_t> inputs(inputCount); |
| std::vector<uint32_t> outputs(outputCount); |
| NN_RETURN_IF_ERROR(addOperands(operation.inputs, &inputs, INPUT)); |
| NN_RETURN_IF_ERROR(addOperands(operation.outputs, &outputs, OUTPUT)); |
| return mStepModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(), |
| outputCount, outputs.data()); |
| } |
| |
| void ExecutionStep::mapInputsAndOutputs( |
| std::shared_ptr<StepExecutor> executor, const Memory* temporaryMemory, |
| const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOffsetOfTemporary, |
| const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex, |
| const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex, |
| const std::map<SourceOperandIndex, ConstantReferenceLocation>& |
| sourceOperandToConstantReference) const { |
| auto mapInput = [&](uint32_t stepModelOperandIndex, uint32_t stepInputIndex) { |
| SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex); |
| if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex); |
| it != sourceOperandToOffsetOfTemporary.end()) { |
| executor->setInputFromMemory(stepInputIndex, temporaryMemory, it->second); |
| } else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex); |
| it != sourceOperandToInputIndex.end()) { |
| executor->mapInput(it->second, stepInputIndex); |
| } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex); |
| it != sourceOperandToOutputIndex.end()) { |
| executor->mapOutputToInput(it->second, stepInputIndex); |
| } else if (auto it = sourceOperandToConstantReference.find(sourceOperandIndex); |
| it != sourceOperandToConstantReference.end()) { |
| // Constant partition boundary operand. This could be an IF branch |
| // model input or a WHILE variable initializer. |
| executor->setInputFromMemory(stepInputIndex, it->second.memory, it->second.offset); |
| } else { |
| CHECK(false) << "Cannot map step input " << stepInputIndex << " from operand " |
| << toString(sourceOperandIndex); |
| } |
| }; |
| auto mapOutput = [&](uint32_t stepModelOperandIndex, uint32_t stepOutputIndex) { |
| SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex); |
| if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex); |
| it != sourceOperandToOffsetOfTemporary.end()) { |
| executor->setOutputFromMemory(stepOutputIndex, temporaryMemory, it->second); |
| } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex); |
| it != sourceOperandToOutputIndex.end()) { |
| executor->mapOutput(it->second, stepOutputIndex); |
| } else { |
| CHECK(false) << "Cannot map step output " << stepOutputIndex << " from operand " |
| << toString(sourceOperandIndex); |
| } |
| }; |
| for (uint32_t i = 0, n = mStepModelInputs.size(); i < n; ++i) { |
| mapInput(mStepModelInputs[i].first, i); |
| } |
| for (uint32_t i = 0, n = mStepModelOutputs.size(); i < n; ++i) { |
| mapOutput(mStepModelOutputs[i].first, i); |
| } |
| } |
| |
| void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() { |
| auto recordAsOutputIfTemporary = [this](const SourceOperandIndex& sourceOperandIndex) { |
| const auto it = mTemporaryToDefiningExecutionStep.find(sourceOperandIndex); |
| if (it == mTemporaryToDefiningExecutionStep.end()) { |
| // The operand is not a temporary or is not defined by an |
| // ExecutionStep (i.e. it's an output of an IF or a WHILE). |
| // The latter case is handled by ExecutionPlan::makeController(). |
| return; |
| } |
| uint32_t stepIndex = it->second; |
| CHECK_LT(stepIndex, mSteps.size()); |
| mSteps[stepIndex]->executionStep()->recordTempAsStepModelOutput(sourceOperandIndex.second); |
| }; |
| for (const auto& logicalStep : mSteps) { |
| if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { |
| for (const auto& input : step->getTempsAsStepModelInputs()) { |
| SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), input.first); |
| recordAsOutputIfTemporary(sourceOperandIndex); |
| } |
| } else if (const IfStep* step = logicalStep->tryIfStep()) { |
| recordAsOutputIfTemporary(step->conditionOperandIndex); |
| for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) { |
| recordAsOutputIfTemporary(sourceOperandIndex); |
| } |
| } else if (const WhileStep* step = logicalStep->tryWhileStep()) { |
| for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) { |
| recordAsOutputIfTemporary(sourceOperandIndex); |
| } |
| } else { |
| CHECK(logicalStep->isGoto()); |
| } |
| } |
| } |
| |
void ExecutionStep::recordTempAsStepModelOutput(uint32_t sourceOperandIndex) {
    const auto it = mOperandMap.find(sourceOperandIndex);
    CHECK(it != mOperandMap.end());
    mTempsAsStepModelOutputs.emplace(sourceOperandIndex, it->second);
}
| |
| const ModelBuilder* ExecutionStep::getSourceModel() const { |
| return mPlan->getSourceModels().getModel(mSourceModelIndex); |
| } |
| |
| void ExecutionStep::logStepModel() const { |
| VLOG(COMPILATION) << "ExecutionStep::finishStepModel, step " << mIndex; |
| |
| auto logRemapEntry = [](std::string& toLog, const std::pair<uint32_t, uint32_t>& e) { |
| if (!toLog.empty()) { |
| toLog += ", "; |
| } |
| toLog += toString(e.first); |
| toLog += "->"; |
| toLog += toString(e.second); |
| }; |
| |
| auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) { |
| std::string toLog; |
| for (const auto& e : map) { |
| logRemapEntry(toLog, e); |
| } |
| VLOG(COMPILATION) << name << ": " << toLog; |
| }; |
| auto logRemapSet = [&logRemapEntry](const char* name, const StepModelOutputSetType& set) { |
| std::string toLog; |
| for (const auto& e : set) { |
| logRemapEntry(toLog, e); |
| } |
| VLOG(COMPILATION) << name << ": " << toLog; |
| }; |
| |
| logRemapVector("step model inputs", mStepModelInputs); |
| logRemapVector("step model outputs", mStepModelOutputs); |
| logRemapVector("model inputs", mModelInputs); |
| logRemapVector("model outputs", mModelOutputs); |
| logRemapVector("temps as step model inputs", mTempsAsStepModelInputs); |
| logRemapSet("temps as step model outputs", mTempsAsStepModelOutputs); |
| logRemapVector("outputs as step model inputs", mOutputsAsStepModelInputs); |
| } |
| |
| static bool hasUnknownSize(const Operand& operand) { |
| if (operand.dimensions.size() == 0) { |
| return TypeManager::get()->isTensorType(operand.type); |
| } |
| for (uint32_t dimension : operand.dimensions) { |
| if (dimension == 0) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| int ExecutionStep::finishStepModel(const ModelBuilder* mainModel, bool* hasOutputOfUnknownSize, |
| int32_t executionPreference, int32_t priority) { |
| CHECK(mDevice != nullptr); |
| |
| for (const auto& stepModelOutput : mTempsAsStepModelOutputs) { |
| const Operand& operand = mStepModel.getOperand(stepModelOutput.second); |
| if (hasUnknownSize(operand)) { |
| *hasOutputOfUnknownSize = true; |
| VLOG(COMPILATION) << "StepModelOutput (operand#" << toString(stepModelOutput.first) |
| << " of source graph) has unknown size: " << toString(operand); |
| } |
| } |
| |
| mStepModel.relaxComputationFloat32toFloat16(mainModel->isComputationFloat32RelaxedToFloat16()); |
| |
| mStepModelInputs.insert(mStepModelInputs.end(), mModelInputs.begin(), mModelInputs.end()); |
| mStepModelInputs.insert(mStepModelInputs.end(), mTempsAsStepModelInputs.begin(), |
| mTempsAsStepModelInputs.end()); |
| mStepModelInputs.insert(mStepModelInputs.end(), mOutputsAsStepModelInputs.begin(), |
| mOutputsAsStepModelInputs.end()); |
| |
| mStepModelOutputs.insert(mStepModelOutputs.end(), mModelOutputs.begin(), mModelOutputs.end()); |
| mStepModelOutputs.insert(mStepModelOutputs.end(), mTempsAsStepModelOutputs.begin(), |
| mTempsAsStepModelOutputs.end()); |
| |
| if (mSourceModelIndex == kMainModelInSourceModels) { |
| std::map<uint32_t, uint32_t> mainModelOperandToInputIndex; |
| for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) { |
| mainModelOperandToInputIndex[mainModel->getInputOperandIndex(i)] = i; |
| } |
| std::map<uint32_t, uint32_t> mainModelOperandToOutputIndex; |
| for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) { |
| mainModelOperandToOutputIndex[mainModel->getOutputOperandIndex(i)] = i; |
| } |
| |
| // mInputIndexStepModelToMainModel is ordered by step model input index and relies on |
| // mModelInputs being the first inputs, as specified by mStepModelInputs. |
| mInputIndexStepModelToMainModel.resize(mModelInputs.size()); |
| std::transform(mModelInputs.begin(), mModelInputs.end(), |
| mInputIndexStepModelToMainModel.begin(), |
| [&mainModelOperandToInputIndex](auto& e) { |
| uint32_t sourceOperandIndex = e.first; |
| return mainModelOperandToInputIndex[sourceOperandIndex]; |
| }); |
| |
| // mOutputIndexStepModelToMainModel is ordered by step model output index and relies on |
| // mModelOutputs being the first outputs, as specified by mStepModelOutputs. |
| mOutputIndexStepModelToMainModel.resize(mModelOutputs.size()); |
| std::transform(mModelOutputs.begin(), mModelOutputs.end(), |
| mOutputIndexStepModelToMainModel.begin(), |
| [&mainModelOperandToOutputIndex](auto& e) { |
| uint32_t sourceOperandIndex = e.first; |
| return mainModelOperandToOutputIndex[sourceOperandIndex]; |
| }); |
| |
        // mOutputsAsStepModelInputsIndexToMainModel is indexed by position within
        // mOutputsAsStepModelInputs, which are the last inputs, as specified by
        // mStepModelInputs.
| mOutputsAsStepModelInputsIndexToMainModel.resize(mOutputsAsStepModelInputs.size()); |
| std::transform(mOutputsAsStepModelInputs.begin(), mOutputsAsStepModelInputs.end(), |
| mOutputsAsStepModelInputsIndexToMainModel.begin(), |
| [&mainModelOperandToOutputIndex](auto& e) { |
| uint32_t sourceOperandIndex = e.first; |
| return mainModelOperandToOutputIndex[sourceOperandIndex]; |
| }); |
| } |
| |
| if (VLOG_IS_ON(COMPILATION)) { |
| logStepModel(); |
| } |
| |
| std::vector<uint32_t> inputs(mStepModelInputs.size()); |
| std::vector<uint32_t> outputs(mStepModelOutputs.size()); |
| std::transform(mStepModelInputs.begin(), mStepModelInputs.end(), inputs.begin(), |
| [](auto& e) { return e.second; }); |
| std::transform(mStepModelOutputs.begin(), mStepModelOutputs.end(), outputs.begin(), |
| [](auto& e) { return e.second; }); |
| NN_RETURN_IF_ERROR(mStepModel.identifyInputsAndOutputs(inputs.size(), inputs.data(), |
| outputs.size(), outputs.data())); |
| NN_RETURN_IF_ERROR(mStepModel.finish()); |
| |
| // TODO: Move compilation elsewhere? |
| VLOG(COMPILATION) << "ExecutionStep::finishStepModel, compilation on " << mDevice->getName(); |
| return compile(*mDevice, mStepModel, executionPreference, priority, {}, *mPlan->getCacheDir(), |
| &mToken, &mPreparedStepModel); |
| } |
| |
| void ExecutionStep::dump() const { |
| if (VLOG_IS_ON(COMPILATION)) { |
| VLOG(COMPILATION) << "Step#" << mIndex << ": execute on " << mDevice->getName(); |
| logModelToInfo(mStepModel.makeHidlModel()); |
| } |
| } |
| |
| std::string toString(const IfStep& step) { |
| std::ostringstream oss; |
| oss << "Step#" << step.index << ": if " << toString(step.conditionOperandIndex) |
| << " then=" << step.thenStepIndex << " else=" << step.elseStepIndex; |
| return oss.str(); |
| } |
| |
| std::string toString(const WhileStep& step) { |
| std::ostringstream oss; |
| oss << "Step#" << step.index << ": while cond=" << step.condStepIndex |
| << " body=" << step.bodyStepIndex << " exit=" << step.exitStepIndex; |
| return oss.str(); |
| } |
| |
| std::string toString(const GotoStep& step) { |
| std::ostringstream oss; |
| oss << "Step#" << step.index << ": goto " << step.gotoStepIndex; |
| return oss.str(); |
| } |
| |
| void LogicalStep::dump() const { |
| if (VLOG_IS_ON(COMPILATION)) { |
| if (const IfStep* step = tryIfStep()) { |
| VLOG(COMPILATION) << toString(*step); |
| } else if (const WhileStep* step = tryWhileStep()) { |
| VLOG(COMPILATION) << toString(*step); |
| } else if (const GotoStep* step = tryGotoStep()) { |
| VLOG(COMPILATION) << toString(*step); |
| } else { |
| executionStep()->dump(); |
| } |
| } |
| } |
| |
| int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels, |
| int32_t executionPreference, int32_t priority, |
| const std::optional<Deadline>& deadline) { |
| CHECK(!deadline.has_value()); |
| const ModelBuilder* mainModel = sourceModels->getModel(kMainModelInSourceModels); |
| |
| auto containsUnknownSize = [sourceModels](const std::vector<SourceOperandIndex>& operands) { |
| for (const auto& sourceOperandIndex : operands) { |
| const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first); |
| const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second); |
| if (hasUnknownSize(operand)) { |
| return true; |
| } |
| } |
| return false; |
| }; |
| |
| findTempsAsStepModelOutputs(); |
| for (const auto& logicalStep : mSteps) { |
| if (ExecutionStep* step = logicalStep->tryExecutionStep()) { |
| int n = step->finishStepModel(mainModel, &mHasStepModelOutputOfUnknownSize, |
| executionPreference, priority); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| VLOG(COMPILATION) |
| << "ExecutionPlan::CompoundBody::finish -- finishStepModel failed"; |
| return n; |
| } |
| } else if (IfStep* step = logicalStep->tryIfStep()) { |
| if (containsUnknownSize(step->outerOutputOperands)) { |
| mHasStepModelOutputOfUnknownSize = true; |
| } |
| } else if (WhileStep* step = logicalStep->tryWhileStep()) { |
| if (containsUnknownSize(step->outerOutputOperands)) { |
| mHasStepModelOutputOfUnknownSize = true; |
| } |
| } else { |
| CHECK(logicalStep->isGoto()); |
| } |
| } |
| if (mHasStepModelOutputOfUnknownSize) { |
| VLOG(COMPILATION) |
| << "ExecutionPlan::CompoundBody::finish -- mHasStepModelOutputOfUnknownSize"; |
| return ANEURALNETWORKS_OP_FAILED; |
| } |
| |
| for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) { |
| SourceOperandIndex index(kMainModelInSourceModels, mainModel->getInputOperandIndex(i)); |
| mSourceOperandToInputIndex[index] = i; |
| } |
| for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) { |
| SourceOperandIndex index(kMainModelInSourceModels, mainModel->getOutputOperandIndex(i)); |
| mSourceOperandToOutputIndex[index] = i; |
| } |
| |
| findControlFlowBoundaryConstants(sourceModels); |
| |
| mSuccessfulFinish = true; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| void ExecutionPlan::CompoundBody::findControlFlowBoundaryConstants( |
| const SourceModels* sourceModels) { |
| auto handleBoundaryConstants = [this, |
| sourceModels](const SourceOperandIndex& sourceOperandIndex) { |
| const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first); |
| const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second); |
| const DataLocation& location = operand.location; |
| if (operand.lifetime == OperandLifeTime::CONSTANT_COPY) { |
| mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = { |
| .buffer = sourceModel->getPointerToOperandValue(location.offset), |
| .length = location.length, |
| }; |
| } else if (operand.lifetime == OperandLifeTime::CONSTANT_REFERENCE) { |
| mSourceOperandToBoundaryConstantReference[sourceOperandIndex] = { |
| .memory = sourceModel->getMemories()[location.poolIndex], |
| .offset = location.offset, |
| .length = location.length, |
| }; |
| } |
| }; |
| for (const auto& logicalStep : mSteps) { |
| if (const IfStep* step = logicalStep->tryIfStep()) { |
| handleBoundaryConstants(step->conditionOperandIndex); |
| for (const auto& sourceOperandIndex : step->outerInputOperands) { |
| handleBoundaryConstants(sourceOperandIndex); |
| } |
| } else if (const WhileStep* step = logicalStep->tryWhileStep()) { |
| for (const auto& sourceOperandIndex : step->outerInputOperands) { |
| handleBoundaryConstants(sourceOperandIndex); |
| } |
| } |
| } |
| } |
| |
| int ExecutionPlan::SimpleBody::finish(const SourceModels*, int32_t executionPreference, |
| int32_t priority, const std::optional<Deadline>& deadline) { |
| CHECK(mDevice != nullptr); |
| VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation"; |
| const int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheDir, |
| &mToken, &mPreparedModel); |
| mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR); |
| return n; |
| } |
| |
| int ExecutionPlan::finish(int32_t executionPreference, int32_t priority, |
| const std::optional<Deadline>& deadline) { |
| CHECK(mBody != nullptr); |
| return mBody->finish(&getSourceModels(), executionPreference, priority, deadline); |
| } |
| |
| ExecutionPlan::Controller::Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder, |
| const BurstBuilder* burstBuilder) |
| : Controller(plan, executionBuilder, burstBuilder, 0, {}, {}, {}, {}, {}, {}) {} |
| |
| ExecutionPlan::Controller::Controller( |
| const ExecutionPlan* plan, ExecutionBuilder* executionBuilder, |
| const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries, |
| std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary, |
| std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2, |
| std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex, |
| std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex, |
| const std::map<SourceOperandIndex, ConstantCopyLocation>& sourceOperandToConstantCopy, |
| std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference) |
| : mPlan(plan), |
| mExecutionBuilder(executionBuilder), |
| mBurstBuilder(burstBuilder), |
| mSourceOperandToOffsetOfTemporary(std::move(sourceOperandToOffsetOfTemporary)), |
| mSourceOperandToOffsetOfTemporary2(std::move(sourceOperandToOffsetOfTemporary2)), |
| mSourceOperandToInputIndex(std::move(sourceOperandToInputIndex)), |
| mSourceOperandToOutputIndex(std::move(sourceOperandToOutputIndex)), |
| mSourceOperandToConstantReference(std::move(sourceOperandToConstantReference)), |
| mNextStepIndex(0), |
| mLastStepIndex(kBadStepIndex) { |
| if (totalSizeOfTemporaries == 0) { |
| return; |
| } |
| int n; |
| std::tie(n, mTemporaries) = MemoryAshmem::create(totalSizeOfTemporaries); |
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
        mNextStepIndex = kBadStepIndex;
        // Return early: mTemporaries is null, so the constant copies below cannot be made.
        return;
    }
| for (const auto& [sourceOperandIndex, location] : sourceOperandToConstantCopy) { |
| memcpy(mTemporaries->getPointer() + mSourceOperandToOffsetOfTemporary[sourceOperandIndex], |
| location.buffer, location.length); |
| } |
| } |
| |
// Attempt to create a burst object for each PreparedModel/Partition. If a burst
// controller object cannot be made, return a nullptr in its place to indicate that
// the regular execution path should be used. This can occur either because the
// PreparedModel was nullptr (the CPU was the best choice), or because the
// IPreparedModel was of insufficient version or failed to configure the burst.
| std::vector<std::shared_ptr<ExecutionBurstController>> ExecutionPlan::makeBursts( |
| int preference) const { |
| switch (mState) { |
| // burst object for each partition in the compound case |
| case COMPOUND: { |
| std::vector<std::shared_ptr<ExecutionBurstController>> bursts; |
| bursts.reserve(compound()->mSteps.size()); |
| for (const auto& logicalStep : compound()->mSteps) { |
| if (!logicalStep->isExecution()) { |
| bursts.push_back(nullptr); |
| continue; |
| } |
| if (const auto preparedModel = |
| logicalStep->executionStep()->getPreparedStepModel()) { |
| const bool preferPowerOverLatency = |
| (preference == ANEURALNETWORKS_PREFER_LOW_POWER); |
| bursts.push_back( |
| preparedModel->configureExecutionBurst(preferPowerOverLatency)); |
| } else { |
| bursts.push_back(nullptr); |
| } |
| } |
| return bursts; |
| } |
| // single burst object for the simple case |
| case SIMPLE: { |
| std::vector<std::shared_ptr<ExecutionBurstController>> burst; |
| auto simpleBody = simple(); |
| if (const auto preparedModel = simpleBody->mPreparedModel) { |
| const bool preferPowerOverLatency = |
| (preference == ANEURALNETWORKS_PREFER_LOW_POWER); |
| burst.push_back(preparedModel->configureExecutionBurst(preferPowerOverLatency)); |
| } else { |
| burst.push_back(nullptr); |
| } |
| return burst; |
| } |
| // no burst objects made |
| default: |
| return {}; |
| } |
| } |
| |
| std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController( |
| ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const { |
| CHECK(isValid()); |
| if (mState == SIMPLE) { |
| return std::shared_ptr<Controller>(new Controller(this, executionBuilder, burstBuilder)); |
| } |
| // Create the layout for a Memory object big enough to hold |
| // - every partition boundary TEMPORARY operand and |
| // - buffers required by the control flow implementation. |
| // |
| // TODO: Rethink this approach for managing temporaries. Some |
| // alternatives: |
| // |
| // 1) Adopt a memory layout scheme analogous to stack allocation, |
| // where objects of non-overlapping lifetime can occupy the same |
| // storage. We would still have a single Memory object in this |
| // case. |
| // |
| // 2) Do something like what CpuExecutor does, and do allocations |
| // and deallocations on the fly (during execution) before first |
| // reference and after last reference, respectively. This would |
| // mean having one Memory object per TEMPORARY; or, in a more |
| // complicated implementation, one Memory object per set of |
| // temporaries that have the same lifetime. Note that the Android |
| // system limits the number of shared memory objects, which are |
| // what our Memory objects represent. |
| // |
| uint32_t totalSizeOfTemporaries = 0; |
| auto addTemporaryOfSize = [&totalSizeOfTemporaries](uint32_t size) { |
| totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size); |
| const uint32_t offset = totalSizeOfTemporaries; |
| totalSizeOfTemporaries += size; |
| return offset; |
| }; |
| // This function has two modes of operation: |
| // 1. When lifetime is TEMPORARY_VARIABLE, we allocate memory for |
| // TEMPORARY_VARIABLE source operands, skip SUBGRAPH_OUTPUT source |
| // operands, and panic if we see a source operand of another lifetime. |
| // 2. When lifetime is SUBGRAPH_OUTPUT, we allocate memory for |
| // SUBGRAPH_OUTPUT source operands and panic if we see a source operand |
| // of another lifetime. |
| auto mapTemporary = |
| [executionBuilder, addTemporaryOfSize]( |
| const SourceOperandIndex& sourceOperandIndex, |
| std::map<SourceOperandIndex, uint32_t>* sourceOperandToOffsetOfTemporary, |
| OperandLifeTime lifetime = OperandLifeTime::TEMPORARY_VARIABLE) { |
| CHECK(lifetime == OperandLifeTime::TEMPORARY_VARIABLE || |
| lifetime == OperandLifeTime::SUBGRAPH_OUTPUT); |
| const Operand& sourceOperand = |
| executionBuilder->getSourceOperand(sourceOperandIndex); |
| if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE && |
| sourceOperand.lifetime == OperandLifeTime::SUBGRAPH_OUTPUT) { |
| // See the caller for explanation. |
| return; |
| } |
| CHECK(sourceOperand.lifetime == lifetime); |
| const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand); |
| CHECK_NE(size, 0u); |
| const uint32_t offset = addTemporaryOfSize(size); |
| auto [_, isNew] = |
| sourceOperandToOffsetOfTemporary->emplace(sourceOperandIndex, offset); |
| CHECK(isNew); |
| VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex) |
| << " offset = " << offset; |
| }; |
| std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary; |
| std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2; |
| for (const auto& logicalStep : compound()->mSteps) { |
| if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { |
| // Allocate memory for ExecutionStep temporary outputs that are |
| // inputs to other steps, as determined by |
| // ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs(). |
| // |
| // We don't allocate memory for step model output operands with |
| // source operand lifetime SUBGRAPH_OUTPUT because they will be |
| // - managed by the client (main model outputs), |
| // - assigned a location of another operand (when this step model |
| // output is a branch model output of an IF; see |
| // ExecutionPlan::nextCompound(const IfStep*, ...)), or |
| // - allocated by a WHILE (when this step model output |
| // is a condition or body model output of a WHILE; see the |
| // step->bodyOutputOperands and step->condOutputOperand handling |
| // below). |
| for (const auto& output : step->getTempsAsStepModelOutputs()) { |
| mapTemporary(SourceOperandIndex(step->getSourceModelIndex(), output.first), |
| &sourceOperandToOffsetOfTemporary); |
| } |
| } else if (const IfStep* step = logicalStep->tryIfStep()) { |
            // Allocate memory for all temporary outputs of an IfStep because
            // they are going to be written to by a branch model. We don't
            // optimize away unused output operands of referenced models.
| // |
| // We don't allocate memory for branch output operands because they |
| // use the same location as the corresponding outer output operands, |
| // as established in ExecutionPlan::nextCompound(const IfStep*, ...) |
| // |
| // We don't allocate memory for outer output operands with source |
| // operand lifetime SUBGRAPH_OUTPUT because they will be |
| // - managed by the client (main model outputs), |
| // - assigned a location of another operand (when this IF outer |
| // output is a branch model output of another IF; see |
| // ExecutionPlan::nextCompound(const IfStep*, ...)), or |
| // - allocated by a WHILE (when this IF outer output |
| // is a condition or body model output of a WHILE; see the |
| // step->bodyOutputOperands and step->condOutputOperand handling |
| // below). |
| for (const auto& sourceOperandIndex : step->outerOutputOperands) { |
| mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary); |
| } |
| } else if (const WhileStep* step = logicalStep->tryWhileStep()) { |
            // Allocate memory for all temporary outputs of a WhileStep because
| // they are going to be written to by the WHILE loop. |
| // |
| // We don't allocate memory for outer output operands with source |
| // operand lifetime SUBGRAPH_OUTPUT because they will be |
| // - managed by the client (main model outputs), |
| // - assigned a location of another operand (when this WHILE outer |
| // output is a branch model output of an IF; see |
| // ExecutionPlan::nextCompound(const IfStep*, ...)), or |
| // - allocated by another WHILE (when this WHILE outer output |
| // is a condition or body model output of another WHILE; see the |
| // step->bodyOutputOperands and step->condOutputOperand handling |
| // below). |
| for (const auto& sourceOperandIndex : step->outerOutputOperands) { |
| mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary); |
| } |
| // Allocate memory for body model outputs. Note that we could use |
| // the outer output operand memory instead but we currently don't do |
| // so (b/148206073). |
| for (const auto& sourceOperandIndex : step->bodyOutputOperands) { |
| mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary, |
| OperandLifeTime::SUBGRAPH_OUTPUT); |
| // Allocate another set of temporaries for double buffering. |
| mapTemporary(sourceOperandIndex, &sourceOperandToOffsetOfTemporary2, |
| OperandLifeTime::SUBGRAPH_OUTPUT); |
| } |
| // Allocate memory for condition model output. |
| // TODO: Share one condition output memory region between all loops. |
| mapTemporary(step->condOutputOperand, &sourceOperandToOffsetOfTemporary, |
| OperandLifeTime::SUBGRAPH_OUTPUT); |
| } else { |
| CHECK(logicalStep->isGoto()); |
| } |
| } |
| // Allocate temporary memory for boundary CONSTANT_COPY operands. |
| for (const auto& [sourceOperandIndex, location] : |
| compound()->mSourceOperandToBoundaryConstantCopy) { |
| const uint32_t offset = addTemporaryOfSize(location.length); |
| sourceOperandToOffsetOfTemporary.emplace(sourceOperandIndex, offset); |
| VLOG(EXECUTION) << "temp (boundary constant): operand " << toString(sourceOperandIndex) |
| << " offset = " << offset; |
| } |
| return std::shared_ptr<Controller>(new Controller( |
| this, executionBuilder, burstBuilder, totalSizeOfTemporaries, |
| std::move(sourceOperandToOffsetOfTemporary), |
| std::move(sourceOperandToOffsetOfTemporary2), compound()->mSourceOperandToInputIndex, |
| compound()->mSourceOperandToOutputIndex, |
| compound()->mSourceOperandToBoundaryConstantCopy, |
| compound()->mSourceOperandToBoundaryConstantReference)); |
| } |
| |
| // TODO: Find a better way to provide this functionality. |
| int ExecutionPlan::fallback(std::shared_ptr<Controller> controller, |
| std::shared_ptr<StepExecutor>* executor) const { |
| *executor = nullptr; |
| |
| VLOG(EXECUTION) << "ExecutionPlan::fallback(" << SHOW_IF_DEBUG(controller << ", " << executor) |
| << "): mNextStepIndex = " << controller->mNextStepIndex; |
| |
| if (controller->mLastStepIndex == Controller::kBadStepIndex) { |
| // We haven't called next(). |
| return ANEURALNETWORKS_OP_FAILED; |
| } |
| |
| if (controller->mNextStepIndex == Controller::kBadStepIndex) { |
| // The last call to next() did not produce an executor. |
| return ANEURALNETWORKS_OP_FAILED; |
| } |
| |
| controller->mNextStepIndex = controller->mLastStepIndex; |
| return next(controller, executor); |
| } |
| |
| ExecutionPlan::Buffer::Buffer(void* pointer, uint32_t size) |
    : mInfo(RunTimePoolInfo::createFromExistingBuffer(static_cast<uint8_t*>(pointer), size)),
| mOffset(0) {} |
| |
| ExecutionPlan::Buffer::Buffer(RunTimePoolInfo info, uint32_t offset) |
| : mInfo(std::move(info)), mOffset(offset) {} |
| |
| void* ExecutionPlan::Buffer::getPointer() const { |
| return mInfo.getBuffer() + mOffset; |
| } |
| |
| uint32_t ExecutionPlan::Buffer::getSize() const { |
| return mInfo.getSize() - mOffset; |
| } |
| |
| void ExecutionPlan::Buffer::flush() const { |
| mInfo.flush(); |
| } |
| |
| std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBufferFromModelArgumentInfo( |
| const ModelArgumentInfo& info, const ExecutionBuilder* executionBuilder) const { |
| switch (info.state()) { |
| case ModelArgumentInfo::POINTER: { |
| return Buffer(info.buffer(), info.length()); |
| } break; |
| case ModelArgumentInfo::MEMORY: { |
| if (std::optional<RunTimePoolInfo> poolInfo = |
| executionBuilder->getRunTimePoolInfo(info.locationAndLength().poolIndex)) { |
| return Buffer(*poolInfo, info.locationAndLength().offset); |
| } else { |
| LOG(ERROR) << "Unable to map operand memory pool"; |
| return std::nullopt; |
| } |
| } break; |
| case ModelArgumentInfo::HAS_NO_VALUE: { |
| LOG(ERROR) << "Attempting to read an operand that has no value"; |
| return std::nullopt; |
| } break; |
| default: { |
| LOG(ERROR) << "Unexpected operand memory state: " << static_cast<int>(info.state()); |
| return std::nullopt; |
| } break; |
| } |
| } |
| |
| std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBuffer( |
| std::shared_ptr<Controller> controller, SourceOperandIndex operandIndex) const { |
| const auto& sourceOperandToOffsetOfTemporary = controller->mSourceOperandToOffsetOfTemporary; |
| const auto& sourceOperandToInputIndex = controller->mSourceOperandToInputIndex; |
| const auto& sourceOperandToOutputIndex = controller->mSourceOperandToOutputIndex; |
| if (auto it = sourceOperandToOffsetOfTemporary.find(operandIndex); |
| it != sourceOperandToOffsetOfTemporary.end()) { |
| const uint32_t offset = it->second; |
| const std::unique_ptr<MemoryAshmem>& memory = controller->mTemporaries; |
| return Buffer(memory->getPointer() + offset, memory->getSize() - offset); |
| } else if (auto it = sourceOperandToInputIndex.find(operandIndex); |
| it != sourceOperandToInputIndex.end()) { |
| const ModelArgumentInfo& info = controller->mExecutionBuilder->getInputInfo(it->second); |
| return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder); |
| } else if (auto it = sourceOperandToOutputIndex.find(operandIndex); |
| it != sourceOperandToOutputIndex.end()) { |
| const ModelArgumentInfo& info = controller->mExecutionBuilder->getOutputInfo(it->second); |
| return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder); |
| } |
| return std::nullopt; |
| } |
| |
| bool ExecutionPlan::readConditionValue(std::shared_ptr<Controller> controller, |
| SourceOperandIndex operandIndex) const { |
| std::optional<ExecutionPlan::Buffer> buffer = getBuffer(controller, operandIndex); |
| CHECK(buffer != std::nullopt) << "Unable to read operand " << toString(operandIndex); |
| bool8 value; |
| CHECK_GE(buffer->getSize(), sizeof(value)); |
| std::memcpy(&value, buffer->getPointer(), sizeof(value)); |
| VLOG(EXECUTION) << "readConditionValue: " << static_cast<int>(value); |
| return value; |
| } |
| |
| int ExecutionPlan::next(std::shared_ptr<Controller> controller, |
| std::shared_ptr<StepExecutor>* executor, |
| std::shared_ptr<ExecutionBurstController>* burstController) const { |
| controller->mLastStepIndex = controller->mNextStepIndex; |
| *executor = nullptr; |
| if (burstController != nullptr) { |
| *burstController = nullptr; |
| } |
| |
| VLOG(EXECUTION) << "ExecutionPlan::next(" << SHOW_IF_DEBUG(controller << ", " << executor) |
| << "): mNextStepIndex = " << controller->mNextStepIndex; |
| |
| if (controller->mNextStepIndex == Controller::kBadStepIndex) { |
| return ANEURALNETWORKS_OP_FAILED; |
| } |
| |
| if (mState == EMPTY) { |
| CHECK_EQ(controller->mNextStepIndex, 0u); // end |
| controller->mNextStepIndex = Controller::kBadStepIndex; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| if (mState == SIMPLE) { |
| if (controller->mNextStepIndex == 0) { |
| // First (and only) step. |
| auto simpleBody = simple(); |
| *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, |
| simpleBody->mModel, simpleBody->mDevice, |
| simpleBody->mPreparedModel); |
| (*executor)->mapInputsAndOutputsTrivially(); |
| if (burstController != nullptr && controller->mBurstBuilder != nullptr) { |
| *burstController = controller->mBurstBuilder->getControllerAt(0); |
| } |
| controller->mNextStepIndex = 1; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| CHECK_EQ(controller->mNextStepIndex, 1u); // end |
| controller->mNextStepIndex = Controller::kBadStepIndex; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| return nextCompound(controller, executor, burstController); |
| } |
| |
| int ExecutionPlan::nextCompound(std::shared_ptr<Controller> controller, |
| std::shared_ptr<StepExecutor>* executor, |
| std::shared_ptr<ExecutionBurstController>* burstController) const { |
| if (controller->mNextStepIndex == Controller::kBadStepIndex) { |
| return ANEURALNETWORKS_OP_FAILED; |
| } |
| |
| auto compoundBody = compound(); |
| if (controller->mNextStepIndex == compoundBody->mSteps.size()) { |
| controller->mNextStepIndex = Controller::kBadStepIndex; // end |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| const auto& logicalStep = compoundBody->mSteps[controller->mNextStepIndex]; |
| if (const IfStep* step = logicalStep->tryIfStep()) { |
| return nextCompound(step, controller, executor, burstController); |
| } else if (const WhileStep* step = logicalStep->tryWhileStep()) { |
| return nextCompound(step, controller, executor, burstController); |
| } else if (const GotoStep* step = logicalStep->tryGotoStep()) { |
| return nextCompound(step, controller, executor, burstController); |
| } else if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { |
| return nextCompound(step, controller, executor, burstController); |
| } else { |
| CHECK(false) << "Unknown step variant"; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| } |
| |
| int ExecutionPlan::nextCompound(const ExecutionStep* step, std::shared_ptr<Controller> controller, |
| std::shared_ptr<StepExecutor>* executor, |
| std::shared_ptr<ExecutionBurstController>* burstController) const { |
| VLOG(EXECUTION) << "next: Step#" << controller->mNextStepIndex << ": execute on " |
| << step->getDevice()->getName(); |
| *executor = |
| std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(), |
| step->getDevice(), step->getPreparedStepModel(), step); |
| step->mapInputsAndOutputs( |
| *executor, controller->mTemporaries.get(), |
| controller->mSourceOperandToOffsetOfTemporary, controller->mSourceOperandToInputIndex, |
| controller->mSourceOperandToOutputIndex, controller->mSourceOperandToConstantReference); |
| if (burstController != nullptr && controller->mBurstBuilder != nullptr) { |
| *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex); |
| } |
| |
| controller->mNextStepIndex++; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
// The first argument is the "source" operand; the second is the "destination".
| void ExecutionPlan::Controller::setInput(const SourceOperandIndex& outerOperand, |
| const SourceOperandIndex& innerOperand) { |
| VLOG(EXECUTION) << "mapping input " << toString(innerOperand) << " from " |
| << toString(outerOperand); |
| #ifdef NN_DEBUGGABLE |
| CHECK_LE(mSourceOperandToOffsetOfTemporary.count(innerOperand) + |
| mSourceOperandToInputIndex.count(innerOperand) + |
| mSourceOperandToOutputIndex.count(innerOperand) + |
| mSourceOperandToConstantReference.count(innerOperand), |
| 1u); |
| #endif |
| mSourceOperandToOffsetOfTemporary.erase(innerOperand); |
| mSourceOperandToInputIndex.erase(innerOperand); |
| mSourceOperandToOutputIndex.erase(innerOperand); |
| mSourceOperandToConstantReference.erase(innerOperand); |
| if (auto it = mSourceOperandToOffsetOfTemporary.find(outerOperand); |
| it != mSourceOperandToOffsetOfTemporary.end()) { |
| mSourceOperandToOffsetOfTemporary.emplace(innerOperand, it->second); |
| } else if (auto it = mSourceOperandToInputIndex.find(outerOperand); |
| it != mSourceOperandToInputIndex.end()) { |
| mSourceOperandToInputIndex.emplace(innerOperand, it->second); |
| } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand); |
| it != mSourceOperandToOutputIndex.end()) { |
| mSourceOperandToOutputIndex.emplace(innerOperand, it->second); |
| } else if (auto it = mSourceOperandToConstantReference.find(outerOperand); |
| it != mSourceOperandToConstantReference.end()) { |
| mSourceOperandToConstantReference.emplace(innerOperand, it->second); |
| } else { |
| CHECK(false) << "Cannot set step model input operand " << toString(innerOperand) |
| << " from operand " << toString(outerOperand); |
| } |
| } |
| |
// The first argument is the "source" operand; the second is the "destination".
| void ExecutionPlan::Controller::setOutput(const SourceOperandIndex& outerOperand, |
| const SourceOperandIndex& innerOperand) { |
| VLOG(EXECUTION) << "mapping output " << toString(innerOperand) << " from " |
| << toString(outerOperand); |
| #ifdef NN_DEBUGGABLE |
| CHECK_LE(mSourceOperandToOffsetOfTemporary.count(innerOperand) + |
| mSourceOperandToOutputIndex.count(innerOperand), |
| 1u); |
| #endif |
| mSourceOperandToOffsetOfTemporary.erase(innerOperand); |
| mSourceOperandToOutputIndex.erase(innerOperand); |
| if (auto it = mSourceOperandToOffsetOfTemporary.find(outerOperand); |
| it != mSourceOperandToOffsetOfTemporary.end()) { |
| mSourceOperandToOffsetOfTemporary.emplace(innerOperand, it->second); |
| } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand); |
| it != mSourceOperandToOutputIndex.end()) { |
| mSourceOperandToOutputIndex.emplace(innerOperand, it->second); |
| } else { |
| CHECK(false) << "Cannot set step model output operand " << toString(innerOperand) |
| << " from operand " << toString(outerOperand); |
| } |
| } |
| |
| int ExecutionPlan::nextCompound(const IfStep* step, std::shared_ptr<Controller> controller, |
| std::shared_ptr<StepExecutor>* executor, |
| std::shared_ptr<ExecutionBurstController>* burstController) const { |
| VLOG(EXECUTION) << "next: " << toString(*step); |
| bool condValue = readConditionValue(controller, step->conditionOperandIndex); |
| controller->mNextStepIndex = condValue ? step->thenStepIndex : step->elseStepIndex; |
| const std::vector<SourceOperandIndex>& branchInputOperands = |
| condValue ? step->thenBranchInputOperands : step->elseBranchInputOperands; |
| const std::vector<SourceOperandIndex>& branchOutputOperands = |
| condValue ? step->thenBranchOutputOperands : step->elseBranchOutputOperands; |
| CHECK_EQ(branchInputOperands.size(), step->outerInputOperands.size()); |
| CHECK_EQ(branchOutputOperands.size(), step->outerOutputOperands.size()); |
| for (uint32_t i = 0, n = step->outerInputOperands.size(); i < n; ++i) { |
| // We have to do this assignment just before executing this step to |
| // accommodate cases when the IF resides within a WHILE condition or |
| // body model and for some j the i-th input of the IF branch model is |
| // - an input of the WHILE condition model (whileStep->condInputOperands[j]), |
| // - an input of the WHILE body model (whileStep->bodyInputOperands[j]), or |
| // - an output of the WHILE body model (whileStep->bodyOutputOperands[j]). |
| // In such cases, the WhileStep modifies the location of |
| // step->outerInputOperands[i] to implement double buffering. |
| controller->setInput(step->outerInputOperands[i], branchInputOperands[i]); |
| } |
| for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) { |
| // We have to do this assignment just before executing this step to |
| // accommodate the case when the IF resides within a WHILE body |
| // model and the i-th output of the IF branch model is an |
| // output of the WHILE body model (whileStep->bodyOutputOperands[j] for |
| // some j). In that case, the WhileStep modifies the location of |
| // step->outerOutputOperands[i] to implement double buffering. |
| controller->setOutput(step->outerOutputOperands[i], branchOutputOperands[i]); |
| } |
| return nextCompound(controller, executor, burstController); |
| } |
| |
| int ExecutionPlan::nextCompound(const WhileStep* step, std::shared_ptr<Controller> controller, |
| std::shared_ptr<StepExecutor>* executor, |
| std::shared_ptr<ExecutionBurstController>* burstController) const { |
| WhileState& state = controller->mWhileState[controller->mNextStepIndex]; |
| if (state.stage == WhileState::EVALUATE_CONDITION) { |
| state.iteration = state.iteration == WhileState::kOutsideLoop ? 0 : state.iteration + 1; |
| VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration |
| << ": evaluating condition"; |
| controller->mNextStepIndex = step->condStepIndex; |
| |
| if (state.iteration == 0) { |
| state.startTime = std::chrono::steady_clock::now(); |
| } |
| |
| // iteration = 0 cond inputs = outer inputs |
| // iteration = 1 cond inputs = body outputs |
| // iteration = 2 cond inputs = body outputs |
| // iteration = 3 cond inputs = ... |
| uint32_t loopBodyOutputCount = step->bodyOutputOperands.size(); |
| CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size()); |
| CHECK_GE(step->condInputOperands.size(), loopBodyOutputCount); |
| for (uint32_t i = 0, n = step->condInputOperands.size(); i < n; ++i) { |
| bool operandIsInputOnly = i >= loopBodyOutputCount; |
| controller->setInput((state.iteration == 0 || operandIsInputOnly) |
| ? step->outerInputOperands[i] |
| : step->bodyOutputOperands[i], |
| step->condInputOperands[i]); |
| } |
| |
| state.stage = WhileState::EVALUATE_BODY; |
| return nextCompound(controller, executor, burstController); |
| } |
| |
| CHECK(state.stage == WhileState::EVALUATE_BODY); |
| bool condValue = readConditionValue(controller, step->condOutputOperand); |
| |
| std::chrono::nanoseconds timeoutDuration( |
| controller->mExecutionBuilder->getLoopTimeoutDuration()); |
| auto duration = std::chrono::steady_clock::now() - state.startTime; |
| if (duration > timeoutDuration) { |
| LOG(ERROR) << "WHILE loop timed out after " |
| << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count() |
| << " ms"; |
| return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT; |
| } |
| |
| if (condValue) { |
| VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration |
| << ": evaluating body"; |
| controller->mNextStepIndex = step->bodyStepIndex; |
| |
| // iteration = 0 body inputs = cond inputs = outer inputs body outputs = tmp1 |
| // iteration = 1 body inputs = cond inputs = tmp1 body outputs = tmp2 |
| // iteration = 2 body inputs = cond inputs = tmp2 body outputs = tmp1 |
| // iteration = 3 body inputs = cond inputs = ... body outputs = ... |
| #ifdef NN_DEBUGGABLE |
| CHECK_GE(step->bodyInputOperands.size(), step->bodyOutputOperands.size()); |
| CHECK_EQ(step->bodyInputOperands.size(), step->outerInputOperands.size()); |
| CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size()); |
| CHECK_GE(step->bodyOutputOperands.size(), step->outerOutputOperands.size()); |
| #endif |
| for (uint32_t i = 0, n = step->bodyInputOperands.size(); i < n; ++i) { |
| controller->setInput(step->condInputOperands[i], step->bodyInputOperands[i]); |
| } |
| if (state.iteration != 0) { |
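| // The body is about to overwrite its output operands, but this iteration's |
| // inputs still alias the outputs of the previous iteration. Swap the primary |
| // and secondary temporary offsets of each body output operand so that the |
| // body writes into the other buffer, implementing the double buffering |
| // shown in the table above. |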
| for (const SourceOperandIndex& outputOperand : step->bodyOutputOperands) { |
| #ifdef NN_DEBUGGABLE |
| CHECK_EQ(controller->mSourceOperandToInputIndex.count(outputOperand), 0u); |
| CHECK_EQ(controller->mSourceOperandToOutputIndex.count(outputOperand), 0u); |
| CHECK_EQ(controller->mSourceOperandToOffsetOfTemporary.count(outputOperand), 1u); |
| CHECK_EQ(controller->mSourceOperandToOffsetOfTemporary2.count(outputOperand), 1u); |
| #endif |
| std::swap(controller->mSourceOperandToOffsetOfTemporary[outputOperand], |
| controller->mSourceOperandToOffsetOfTemporary2[outputOperand]); |
| } |
| } |
| } else { |
| VLOG(EXECUTION) << "next: " << toString(*step) << ": iteration " << state.iteration |
| << ": exiting loop"; |
| controller->mNextStepIndex = step->exitStepIndex; |
| |
| // Copy body outputs to outer outputs. |
| // TODO: Use outer outputs instead of tmp2 to avoid copying? |
| CHECK_LE(step->outerOutputOperands.size(), step->bodyOutputOperands.size()); |
| for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) { |
| // condInputOperands[i] points to a body output operand from the |
| // last iteration if we've executed at least one iteration and to a |
| // WHILE operation input operand otherwise. |
| const SourceOperandIndex& innerOperand = step->condInputOperands[i]; |
| const SourceOperandIndex& outerOperand = step->outerOutputOperands[i]; |
| std::optional<Buffer> outerBuffer = getBuffer(controller, outerOperand); |
| if (outerBuffer == std::nullopt) { |
| return ANEURALNETWORKS_OP_FAILED; |
| } |
| const Operand& sourceOperand = |
| controller->mExecutionBuilder->getSourceOperand(outerOperand); |
| const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand); |
| CHECK_NE(size, 0u); |
| std::optional<Buffer> innerBuffer = getBuffer(controller, innerOperand); |
| if (innerBuffer == std::nullopt) { |
| return ANEURALNETWORKS_OP_FAILED; |
| } |
| CHECK_LE(size, innerBuffer->getSize()); |
| CHECK_LE(size, outerBuffer->getSize()); |
| memcpy(outerBuffer->getPointer(), innerBuffer->getPointer(), size); |
| outerBuffer->flush(); |
| } |
| state.iteration = WhileState::kOutsideLoop; |
| } |
| |
| state.stage = WhileState::EVALUATE_CONDITION; |
| return nextCompound(controller, executor, burstController); |
| } |
| |
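| // A GotoStep performs no computation; it simply redirects the controller to |
| // another step index. It is used to stitch together the step sequences |
| // emitted for IF and WHILE operations. |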
| int ExecutionPlan::nextCompound(const GotoStep* step, std::shared_ptr<Controller> controller, |
| std::shared_ptr<StepExecutor>* executor, |
| std::shared_ptr<ExecutionBurstController>* burstController) const { |
| VLOG(EXECUTION) << "next: " << toString(*step); |
| controller->mNextStepIndex = step->gotoStepIndex; |
| return nextCompound(controller, executor, burstController); |
| } |
| |
| void ExecutionPlan::becomeCompoundIfEmpty() { |
| CHECK(mState != SIMPLE); |
| if (mState == EMPTY) { |
| mBody = new CompoundBody(); |
| mState = COMPOUND; |
| } |
| } |
| |
| ExecutionStep* ExecutionPlan::createNewExecutionStep(uint32_t sourceModelIndex, |
| const std::shared_ptr<Device> device) { |
| becomeCompoundIfEmpty(); |
| auto step = std::make_shared<LogicalStep>(std::in_place_type<ExecutionStep>, this, |
| compound()->mSteps.size(), sourceModelIndex, device); |
| compound()->mSteps.push_back(step); |
| return step->executionStep(); |
| } |
| |
| IfStep* ExecutionPlan::createNewIfStep() { |
| becomeCompoundIfEmpty(); |
| auto step = std::make_shared<LogicalStep>(std::in_place_type<IfStep>); |
| step->ifStep()->index = compound()->mSteps.size(); |
| compound()->mSteps.push_back(step); |
| return step->ifStep(); |
| } |
| |
| WhileStep* ExecutionPlan::createNewWhileStep() { |
| becomeCompoundIfEmpty(); |
| auto step = std::make_shared<LogicalStep>(std::in_place_type<WhileStep>); |
| step->whileStep()->index = compound()->mSteps.size(); |
| compound()->mSteps.push_back(step); |
| return step->whileStep(); |
| } |
| |
| GotoStep* ExecutionPlan::createNewGotoStep() { |
| becomeCompoundIfEmpty(); |
| auto step = std::make_shared<LogicalStep>(std::in_place_type<GotoStep>); |
| step->gotoStep()->index = compound()->mSteps.size(); |
| compound()->mSteps.push_back(step); |
| return step->gotoStep(); |
| } |
| |
| void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device, |
| const ModelBuilder* model) { |
| CHECK(mState == EMPTY); |
| mBody = new SimpleBody(device, model, mCacheDir, mToken); |
| mState = SIMPLE; |
| } |
| |
| void ExecutionPlan::recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) { |
| auto [it, isNew] = |
| compound()->mTemporaryToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex); |
| CHECK(isNew) << "Step " << stepIndex << " redefines temporary operand " |
| << toString(sourceOperandIndex) << " already defined by step " << it->second; |
| } |
| |
| void ExecutionPlan::dump() const { |
| if (mBody) { |
| mBody->dump(); |
| } else { |
| VLOG(COMPILATION) << "EMPTY"; |
| } |
| } |
| |
| void ExecutionPlan::reset() { |
| if (mBody) { |
| delete mBody; |
| mBody = nullptr; |
| } |
| mState = EMPTY; |
| } |
| |
| bool ExecutionPlan::isSimpleCpu() const { |
| return isSimple() && simple()->mDevice == DeviceManager::getCpuDevice(); |
| } |
| |
| ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const { |
| switch (mState) { |
| case EMPTY: |
| return Kind::EMPTY; |
| case SIMPLE: |
| nnAssert(mBody); |
| return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR; |
| case COMPOUND: |
| nnAssert(mBody); |
| return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR; |
| default: |
| nnAssert(!"unexpected state"); |
| return Kind::ERROR; |
| } |
| } |
| |
| std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const { |
| return simple()->mDevice; |
| } |
| |
| const std::vector<std::shared_ptr<LogicalStep>>& ExecutionPlan::forTest_compoundGetSteps() const { |
| return compound()->mSteps; |
| } |
| |
| bool ExecutionPlan::forTest_hasStepModelOutputsOfUnknownSize() const { |
| return mBody->hasStepModelOutputsOfUnknownSize(); |
| } |
| |
| const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const { |
| return simple()->mToken.getCacheToken(); |
| } |
| |
| void ExecutionPlan::SimpleBody::dump() const { |
| VLOG(COMPILATION) << "SIMPLE for " << mDevice->getName(); |
| } |
| |
| void ExecutionPlan::CompoundBody::dump() const { |
| for (const auto& step : mSteps) { |
| step->dump(); |
| } |
| } |
| |
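| // In the SIMPLE case there is a single step model identical to the main |
| // model, so a main model input or output maps directly to the same role in |
| // the one prepared model. |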
| void ExecutionPlan::SimpleBody::forEachStepRoleOfInput(uint32_t index, |
| const StepRoleCallback& callback) const { |
| callback(mPreparedModel.get(), IOType::INPUT, index); |
| } |
| |
| void ExecutionPlan::SimpleBody::forEachStepRoleOfOutput(uint32_t index, |
| const StepRoleCallback& callback) const { |
| callback(mPreparedModel.get(), IOType::OUTPUT, index); |
| } |
| |
| // Map an input role of the main model to the input/output roles in the step models: |
| // - An input role of the main model may be used as an input of multiple step models. |
| // - An input role of the main model should not be used as an output of any step model. |
| void ExecutionPlan::CompoundBody::forEachStepRoleOfInput(uint32_t index, |
| const StepRoleCallback& callback) const { |
| for (const auto& logicalStep : mSteps) { |
| if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { |
| // Model input as step model input. |
| const auto& inputMapping = step->getInputIndexStepModelToMainModel(); |
| for (uint32_t i = 0; i < inputMapping.size(); i++) { |
| if (inputMapping[i] == index) { |
| callback(step->getPreparedStepModel().get(), IOType::INPUT, i); |
| } |
| } |
| } |
| } |
| } |
| |
| // Map an output role of the main model to the input/output roles in the step models: |
| // - An output role of the main model may be used as the output of at most one step model. |
| // - An output role of the main model may be used as an input of multiple step models. |
| void ExecutionPlan::CompoundBody::forEachStepRoleOfOutput(uint32_t index, |
| const StepRoleCallback& callback) const { |
| bool found = false; |
| for (const auto& logicalStep : mSteps) { |
| if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { |
| // Model output as step model output. |
| if (!found) { |
| const auto& outputMapping = step->getOutputIndexStepModelToMainModel(); |
| for (uint32_t i = 0; i < outputMapping.size(); i++) { |
| if (outputMapping[i] == index) { |
| callback(step->getPreparedStepModel().get(), IOType::OUTPUT, i); |
| found = true; |
| break; |
| } |
| } |
| } |
| // Model output as step model input. |
| const auto& inputToOutputMapping = step->getOutputsAsStepModelInputsIndexToMainModel(); |
| for (uint32_t i = 0; i < inputToOutputMapping.size(); i++) { |
| if (inputToOutputMapping[i] == index) { |
| callback(step->getPreparedStepModel().get(), IOType::INPUT, i); |
| } |
| } |
| } |
| } |
| } |
| |
| int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices, |
| uint32_t preference, uint32_t priority, |
| const std::optional<Deadline>& deadline, |
| ExecutionPlan* plan) const { |
| uint32_t sourceModelIndex = plan->getSourceModels().addModel(this); |
| NN_RETURN_IF_ERROR(partitionTheWorkInternal(sourceModelIndex, devices, preference, priority, |
| deadline, plan)); |
| int n = plan->finish(preference, priority, deadline); |
| if (VLOG_IS_ON(COMPILATION)) { |
| VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: "; |
| logModelToInfo(makeHidlModel()); |
| plan->dump(); |
| } |
| return n; |
| } |
| |
| int ModelBuilder::partitionTheWorkInternal(uint32_t sourceModelIndex, |
| const std::vector<std::shared_ptr<Device>>& devices, |
| uint32_t preference, uint32_t priority, |
| const std::optional<Deadline>& deadline, |
| ExecutionPlan* plan) const { |
| // This function uses a heuristic approach to partitioning the graph. |
| // It should be good enough for the first release. |
| |
| SourceModels* sourceModels = &plan->getSourceModels(); |
| const size_t deviceCount = devices.size(); |
| const size_t operationCount = mOperations.size(); |
| |
| VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: " |
| << "sourceModelIndex = " << sourceModelIndex << ", " |
| << "deviceCount = " << deviceCount << ", " |
| << "operationCount = " << operationCount; |
| |
| // Figure out where each operation will best execute. |
| // The value of the vector is the index in the devices vector. |
| std::vector<int> bestDeviceForOperation(operationCount); |
| NN_RETURN_IF_ERROR( |
| findBestDeviceForEachOperation(preference, devices, &bestDeviceForOperation)); |
| |
| // A special value produced by findBestDeviceForEachOperation meaning that |
| // this is a control flow operation scheduled for interpreted execution |
| // (see LogicalStep). |
| const int kControlFlow = deviceCount; |
| |
| // If one device will run all the operations, we don't need to split the work. |
| if (sourceModelIndex == kMainModelInSourceModels && |
| std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(), |
| std::not_equal_to<int>()) == bestDeviceForOperation.end()) { |
| const int bestDeviceIndex = bestDeviceForOperation[0]; |
| if (bestDeviceIndex != kControlFlow) { // The model is not a single control flow operation. |
| VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: " |
| << bestDeviceIndex << " = " << devices[bestDeviceIndex]->getName(); |
| plan->becomeSingleStep(devices[bestDeviceIndex], this); |
| return plan->finish(preference, priority, deadline); |
| } |
| } |
| |
| // No easy solution, we need to split the work. |
| |
| // We keep track of the operations that are ready to run for each device. |
| // perDeviceQueue[deviceCount] is for interpreted execution of control flow |
| // (see LogicalStep). |
| std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount + 1); |
| |
| // This helper function enqueues the operation on the appropriate queue. |
| auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) { |
| int deviceIndex = bestDeviceForOperation[operationIndex]; |
| perDeviceQueue[deviceIndex].push(operationIndex); |
| VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto " |
| << deviceIndex; |
| }; |
| |
| // This helper function finds a device that has operations ready to process. |
| // We start by looking at the control flow queue, and then look at the |
| // devices in reverse order (i.e., starting at the end of the devices |
| // vector). Earlier devices have a chance to prepare more of the inputs |
| // required by other devices. This function returns -1 if all queues are |
| // empty. |
| auto findNextDeviceToProcess = [&]() -> int { |
| for (int i = perDeviceQueue.size() - 1; i >= 0; i--) { |
| if (!perDeviceQueue[i].empty()) { |
| return i; |
| } |
| } |
| return -1; |
| }; |
| |
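| // The tracker invokes enqueueOnAppropriateDevice for an operation once all |
| // of that operation's inputs have been computed, so operations enter the |
| // per-device queues in topological order. |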
| OperandTracker tracker(this, enqueueOnAppropriateDevice); |
| // For each iteration of this loop, we'll create an execution step. |
| while (true) { |
| // Find the device we'll do this step for. |
| int deviceIndex = findNextDeviceToProcess(); |
| VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex; |
| if (deviceIndex < 0) { |
| break; |
| } |
| |
| // Assign as much as possible to this device. |
| auto& queue = perDeviceQueue[deviceIndex]; |
| if (deviceIndex != kControlFlow) { |
| ExecutionStep* step = |
| plan->createNewExecutionStep(sourceModelIndex, devices[deviceIndex]); |
| while (!queue.empty()) { |
| uint32_t operationIndex = queue.front(); |
| queue.pop(); |
| int n = step->addOperation(operationIndex); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| LOG(ERROR) << "failed to add operation " << operationIndex << " to step"; |
| return n; |
| } |
| tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice); |
| } |
| } else { |
| while (!queue.empty()) { |
| uint32_t operationIndex = queue.front(); |
| queue.pop(); |
| const Operation& operation = getOperation(operationIndex); |
| if (operation.type == OperationType::IF) { |
| namespace op = operation_if; |
| const Operand& thenOperand = |
| getOperand(operation.inputs[op::kThenModelOperand]); |
| const Operand& elseOperand = |
| getOperand(operation.inputs[op::kElseModelOperand]); |
| const ModelBuilder* thenModel = getReferencedModel(thenOperand); |
| const ModelBuilder* elseModel = getReferencedModel(elseOperand); |
| uint32_t thenModelIndex = sourceModels->addModel(thenModel); |
| uint32_t elseModelIndex = sourceModels->addModel(elseModel); |
| |
| // Emits the following: |
| // Index Step |
| // i if then=(i + 1) else=(j + 1) |
| // ... (then model steps) |
| // j goto k |
| // ... (else model steps) |
| // k (steps after the IF) |
| IfStep* ifStep = plan->createNewIfStep(); |
| ifStep->conditionOperandIndex = SourceOperandIndex( |
| sourceModelIndex, operation.inputs[op::kCondBoolOperand]); |
| ifStep->thenStepIndex = plan->getNextStepIndex(); |
| NN_RETURN_IF_ERROR(thenModel->partitionTheWorkInternal( |
| thenModelIndex, devices, preference, priority, deadline, plan)); |
| GotoStep* afterThenBranch = plan->createNewGotoStep(); |
| ifStep->elseStepIndex = plan->getNextStepIndex(); |
| NN_RETURN_IF_ERROR(elseModel->partitionTheWorkInternal( |
| elseModelIndex, devices, preference, priority, deadline, plan)); |
| afterThenBranch->gotoStepIndex = plan->getNextStepIndex(); |
| |
| // Outer model operands. |
| for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) { |
| ifStep->outerInputOperands.emplace_back(sourceModelIndex, |
| operation.inputs[i]); |
| } |
| for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) { |
| ifStep->outerOutputOperands.emplace_back(sourceModelIndex, |
| operation.outputs[i]); |
| } |
| // Then model operands. |
| for (uint32_t i = 0, n = thenModel->inputCount(); i < n; ++i) { |
| ifStep->thenBranchInputOperands.emplace_back( |
| thenModelIndex, thenModel->getInputOperandIndex(i)); |
| } |
| for (uint32_t i = 0, n = thenModel->outputCount(); i < n; ++i) { |
| ifStep->thenBranchOutputOperands.emplace_back( |
| thenModelIndex, thenModel->getOutputOperandIndex(i)); |
| } |
| // Else model operands. |
| for (uint32_t i = 0, n = elseModel->inputCount(); i < n; ++i) { |
| ifStep->elseBranchInputOperands.emplace_back( |
| elseModelIndex, elseModel->getInputOperandIndex(i)); |
| } |
| for (uint32_t i = 0, n = elseModel->outputCount(); i < n; ++i) { |
| ifStep->elseBranchOutputOperands.emplace_back( |
| elseModelIndex, elseModel->getOutputOperandIndex(i)); |
| } |
| } else if (operation.type == OperationType::WHILE) { |
| namespace op = operation_while; |
| const Operand& condOperand = |
| getOperand(operation.inputs[op::kCondModelOperand]); |
| const Operand& bodyOperand = |
| getOperand(operation.inputs[op::kBodyModelOperand]); |
| const ModelBuilder* condModel = getReferencedModel(condOperand); |
| const ModelBuilder* bodyModel = getReferencedModel(bodyOperand); |
| uint32_t condModelIndex = sourceModels->addModel(condModel); |
| uint32_t bodyModelIndex = sourceModels->addModel(bodyModel); |
| |
| // Emits the following: |
| // Index Step |
| // i while cond=(i + 1) body=(j + 1) exit=(k + 1) |
| // ... (cond model steps) |
| // j goto i |
| // ... (body model steps) |
| // k goto i |
| // ... (steps after the WHILE) |
| // |
| // Note that WhileStep has WhileState associated with it. |
| WhileStep* whileStep = plan->createNewWhileStep(); |
| whileStep->condStepIndex = plan->getNextStepIndex(); |
| NN_RETURN_IF_ERROR(condModel->partitionTheWorkInternal( |
| condModelIndex, devices, preference, priority, deadline, plan)); |
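| // The GOTO emitted after the cond (and, below, the body) model steps |
| // returns control to the WhileStep itself, which consults its WhileState |
| // to decide whether to evaluate the body, re-evaluate the condition, or |
| // exit. |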
| GotoStep* afterCond = plan->createNewGotoStep(); |
| afterCond->gotoStepIndex = whileStep->index; |
| whileStep->bodyStepIndex = plan->getNextStepIndex(); |
| NN_RETURN_IF_ERROR(bodyModel->partitionTheWorkInternal( |
| bodyModelIndex, devices, preference, priority, deadline, plan)); |
| GotoStep* afterBody = plan->createNewGotoStep(); |
| afterBody->gotoStepIndex = whileStep->index; |
| whileStep->exitStepIndex = plan->getNextStepIndex(); |
| |
| // Outer model operands. |
| for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) { |
| whileStep->outerInputOperands.emplace_back(sourceModelIndex, |
| operation.inputs[i]); |
| } |
| for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) { |
| whileStep->outerOutputOperands.emplace_back(sourceModelIndex, |
| operation.outputs[i]); |
| } |
| // Cond model operands. |
| for (uint32_t i = 0, n = condModel->inputCount(); i < n; ++i) { |
| whileStep->condInputOperands.emplace_back( |
| condModelIndex, condModel->getInputOperandIndex(i)); |
| } |
| whileStep->condOutputOperand = |
| SourceOperandIndex(condModelIndex, condModel->getOutputOperandIndex(0)); |
| // Body model operands. |
| for (uint32_t i = 0, n = bodyModel->inputCount(); i < n; ++i) { |
| whileStep->bodyInputOperands.emplace_back( |
| bodyModelIndex, bodyModel->getInputOperandIndex(i)); |
| } |
| for (uint32_t i = 0, n = bodyModel->outputCount(); i < n; ++i) { |
| whileStep->bodyOutputOperands.emplace_back( |
| bodyModelIndex, bodyModel->getOutputOperandIndex(i)); |
| } |
| } else { |
| CHECK(false) << toString(operation.type) << " is not a control flow operation"; |
| } |
| tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice); |
| } |
| } |
| } |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| float ModelBuilder::getPerformance(uint32_t preference, |
| const std::shared_ptr<Device> device) const { |
| // Note that we will call this method multiple times per compilation with |
| // the same arguments if there are nested control flow operations and we |
| // decide to execute the outer operation on the ExecutionPlan::next() |
| // interpreter. |
| // |
| // This is a potential compilation performance problem. To work around it, |
| // the performance value could be cached for the duration of a compilation. |
| float perf = 0; |
| const size_t operationCount = mOperations.size(); |
| for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) { |
| perf += getPerformance(preference, device, operationIndex); |
| } |
| return perf; |
| } |
| |
| float ModelBuilder::getPerformance(uint32_t preference, const std::shared_ptr<Device> device, |
| uint32_t operationIndex) const { |
| auto applyPreference = [preference](const PerformanceInfo& perf) { |
| return preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage : perf.execTime; |
| }; |
| |
| const Operation& operation = getOperation(operationIndex); |
| |
| if (operation.type == OperationType::IF) { |
| namespace op = operation_if; |
| const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]); |
| const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]); |
| const ModelBuilder* thenModel = getReferencedModel(thenOperand); |
| const ModelBuilder* elseModel = getReferencedModel(elseOperand); |
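| // Estimate: exactly one branch runs per IF execution; with no profile |
| // information, assume each branch is equally likely and charge half the |
| // cost of each on top of the IF overhead. |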
| return applyPreference(device->getIfPerformance()) + |
| 0.5 * (thenModel->getPerformance(preference, device) + |
| elseModel->getPerformance(preference, device)); |
| } |
| |
| if (operation.type == OperationType::WHILE) { |
| namespace op = operation_while; |
| const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]); |
| const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]); |
| const ModelBuilder* condModel = getReferencedModel(condOperand); |
| const ModelBuilder* bodyModel = getReferencedModel(bodyOperand); |
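| // Estimate: the iteration count is unknown at compilation time, so charge |
| // the WHILE overhead plus a single evaluation of the condition and body |
| // models. |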
| return applyPreference(device->getWhilePerformance()) + |
| condModel->getPerformance(preference, device) + |
| bodyModel->getPerformance(preference, device); |
| } |
| |
| // TODO: This assumes that the type is dictated by the first operand. This is |
| // currently the case but is not a safe assumption to make in the long term. |
| const uint32_t operandIndex = operation.inputs[0]; |
| const OperandType operandType = mOperands[operandIndex].type; |
| switch (operandType) { |
| case OperandType::FLOAT32: |
| if (mRelaxComputationFloat32toFloat16) { |
| return applyPreference(device->getRelaxedFloat32toFloat16PerformanceScalar()); |
| } |
| break; |
| case OperandType::TENSOR_FLOAT32: |
| if (mRelaxComputationFloat32toFloat16) { |
| return applyPreference(device->getRelaxedFloat32toFloat16PerformanceTensor()); |
| } |
| break; |
| default: |
| break; |
| } |
| |
| return applyPreference(device->getPerformance(operandType)); |
| } |
| |
| namespace { |
| |
| // This class determines whether a given device can execute a given operation. |
| class CanDo { |
| public: |
| CanDo() {} |
| |
| void initialize(const MetaModel& metaModel, std::shared_ptr<Device> device) { |
| mSupportsOperationByIndex = device->getSupportedOperations(metaModel); |
| } |
| |
| bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; } |
| |
| private: |
| std::vector<bool> mSupportsOperationByIndex; |
| }; |
| |
| } // anonymous namespace |
| |
| int ModelBuilder::findBestDeviceForEachOperation( |
| uint32_t preference, const std::vector<std::shared_ptr<Device>>& devices, |
| std::vector<int>* bestDeviceForOperation) const { |
| const MetaModel metaModel(makeHidlModel(), DeviceManager::get()->strictSlicing()); |
| |
| const size_t deviceCount = devices.size(); |
| std::vector<CanDo> canDo(deviceCount); |
| for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) { |
| canDo[deviceIndex].initialize(metaModel, devices[deviceIndex]); |
| } |
| |
| // Figure out the best driver for each operation. |
| const size_t operationCount = mOperations.size(); |
| for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) { |
| const Operation& operation = getOperation(operationIndex); |
| // Find which device, including CPU fallback, gives the best performance for this operation. |
| int bestChoice = -1; |
| float bestPerfVal = 0.0; // Do not check bestPerfVal if bestChoice < 0. |
| for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) { |
| const auto& device = devices[deviceIndex]; |
| if (canDo[deviceIndex].check(operationIndex)) { |
| const float perfVal = getPerformance(preference, device, operationIndex); |
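| // Prefer the device with the lowest estimated cost; on an exact tie, |
| // fall back to preferring the CPU device. |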
| if (bestChoice < 0 || perfVal < bestPerfVal || |
| (perfVal == bestPerfVal && device == DeviceManager::getCpuDevice())) { |
| bestChoice = deviceIndex; |
| bestPerfVal = perfVal; |
| } |
| } else { |
| // Somewhat noisy logging, but this is the only place where a user of |
| // NNAPI can get feedback on why an operation was not run on a specific |
| // device. |
| // |
| // Logs O(operationCount * deviceCount) times, but typically deviceCount is |
| // very small. |
| VLOG(COMPILATION) << "Device " << device->getName() << " can't do operation " |
| << toString(operation.type); |
| } |
| } |
| if (bestChoice < 0) { |
| LOG(ERROR) << "No driver can do operation " << toString(operation.type); |
| return ANEURALNETWORKS_BAD_DATA; |
| } else if (devices[bestChoice] == DeviceManager::getCpuDevice() && |
| (operation.type == OperationType::IF || |
| operation.type == OperationType::WHILE)) { |
| // Run control flow on the ExecutionPlan::next() interpreter and try |
| // to delegate referenced models. |
| const int kControlFlow = deviceCount; |
| (*bestDeviceForOperation)[operationIndex] = kControlFlow; |
| VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" |
| << toString(operation.type) << ") = " << kControlFlow |
| << " (NNAPI interpreter)"; |
| } else { |
| (*bestDeviceForOperation)[operationIndex] = bestChoice; |
| VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" |
| << toString(operation.type) << ") = " << bestChoice << " (" |
| << devices[bestChoice]->getName() << ")"; |
| } |
| } |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| } // namespace nn |
| } // namespace android |