| /* |
| * Copyright (C) 2017 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #define LOG_TAG "ExecutionBuilder" |
| |
| #include "ExecutionBuilder.h" |
| |
| #include <ControlFlow.h> |
| #include <CpuExecutor.h> |
| #include <LegacyUtils.h> |
| #include <Tracing.h> |
| #include <android-base/logging.h> |
| #include <nnapi/IBurst.h> |
| #include <nnapi/IPreparedModel.h> |
| #include <nnapi/Types.h> |
| |
| #include <algorithm> |
| #include <limits> |
| #include <map> |
| #include <memory> |
| #include <mutex> |
| #include <optional> |
| #include <string> |
| #include <thread> |
| #include <tuple> |
| #include <utility> |
| #include <vector> |
| |
| #include "BurstBuilder.h" |
| #include "CompilationBuilder.h" |
| #include "Manager.h" |
| #include "ModelArgumentInfo.h" |
| #include "ModelBuilder.h" |
| #include "Telemetry.h" |
| #include "TypeManager.h" |
| |
| namespace android { |
| namespace nn { |
| |
| // Partial validation of output shapes returned from the driver, to ensure they |
| // conform to a very specific set of rules. |
| static bool validateOutputShapesFromDriver(ErrorStatus executionStatus, const ModelBuilder* model, |
| const std::vector<OutputShape>& shapes) { |
| // Enforces the following rules (some of which are from b/154054474): |
| // - shapes vector is empty except in the case of NONE or OUTPUT_INSUFFICIENT_SIZE. |
| // If the vector is not empty, it must have as many entries as the step model has outputs. |
| // - If NONE, then either shapes vector is empty, or every shape is |
| // marked isSufficient and, if a tensor, has known rank. |
| // - If OUTPUT_INSUFFICIENT_SIZE, then the vector is not empty. At least one entry |
| // is marked !isSufficient. |
| switch (executionStatus) { |
| case ErrorStatus::NONE: { |
| NN_RET_CHECK(shapes.size() == 0 || shapes.size() == model->outputCount()) |
| << "With execution ErrorStatus " << executionStatus |
| << " output shapes vector must be empty or of length " << model->outputCount() |
| << " but has length " << shapes.size(); |
| NN_RET_CHECK(std::all_of(shapes.begin(), shapes.end(), |
| [](const OutputShape& shape) { return shape.isSufficient; })) |
| << "With execution ErrorStatus " << executionStatus |
| << " at least one output shape is unexpectedly marked !isSufficient"; |
| |
| const TypeManager* tm = TypeManager::get(); |
| for (uint32_t outputIndex = 0, outputCount = shapes.size(); outputIndex < outputCount; |
| ++outputIndex) { |
| const Operand& outputOperand = model->getOutputOperand(outputIndex); |
| NN_RET_CHECK(!tm->isTensorType(outputOperand.type) || |
| (shapes[outputIndex].dimensions.size() != 0)) |
| << "With execution ErrorStatus " << executionStatus << " output#" |
| << outputIndex << " shape unexpectedly has zero rank"; |
| } |
| |
| break; |
| } |
| case ErrorStatus::OUTPUT_INSUFFICIENT_SIZE: { |
| NN_RET_CHECK(shapes.size() == model->outputCount()) |
| << "With execution ErrorStatus " << executionStatus |
| << " output shapes vector must be of length " << model->outputCount() |
| << " but has length " << shapes.size(); |
| NN_RET_CHECK(std::any_of(shapes.begin(), shapes.end(), |
| [](const OutputShape& shape) { return !shape.isSufficient; })) |
| << "With execution ErrorStatus " << executionStatus |
| << " at least one output shape must have been marked !isSufficient"; |
| break; |
| } |
| default: { |
| NN_RET_CHECK(shapes.size() == 0) |
| << "With execution ErrorStatus " << executionStatus |
| << " output shapes vector must be empty but has length " << shapes.size(); |
| break; |
| } |
| } |
| return true; |
| } |
| static bool validateOutputShapesFromDriver(int executionResultCode, const ModelBuilder* model, |
| const std::vector<OutputShape>& shapes) { |
| return validateOutputShapesFromDriver(convertResultCodeToErrorStatus(executionResultCode), |
| model, shapes); |
| } |
| |
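| // Converts the execution's timing-measurement setting into the MeasureTiming enum that |
| // is passed down to drivers. |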
| static MeasureTiming measureTiming(const ExecutionBuilder* execution) { |
| return execution->measureTiming() ? MeasureTiming::YES : MeasureTiming::NO; |
| } |
| |
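| // Validates the optional ANeuralNetworksOperandType override passed to setInput*() or |
| // setOutput*(). A non-null newType may only fill in dimensions that the model left |
| // unspecified (0); overriding a fully specified dimension is rejected. With a null |
| // newType, the model operand itself must be fully specified unless allowUnspecified is |
| // true (as it is for outputs and for inputs that are explicitly set to have no value). |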
| static bool checkDimensionInfo(const Operand& operand, const ANeuralNetworksOperandType* newType, |
| const char* tag, bool allowUnspecified) { |
| if (newType != nullptr) { |
| const Extension::OperandTypeInformation* info = nullptr; |
| if (isExtension(operand.type)) { |
| NN_RET_CHECK(TypeManager::get()->getExtensionOperandTypeInfo(operand.type, &info)); |
| } |
| if (validateOperandType(*newType, info, tag, allowUnspecified) != |
| ANEURALNETWORKS_NO_ERROR) { |
| LOG(ERROR) << tag << ": Invalid newType"; |
| return false; |
| } |
| if (operand.dimensions.size() == 0) { |
| return true; |
| } |
| if (operand.dimensions.size() != newType->dimensionCount) { |
| LOG(ERROR) << tag << ": Setting with incompatible dimension count (existing = " |
| << operand.dimensions.size() << ", new = " << newType->dimensionCount << ")"; |
| return false; |
| } |
| for (uint32_t i = 0; i < newType->dimensionCount; i++) { |
| if (operand.dimensions[i] != newType->dimensions[i] && operand.dimensions[i] != 0) { |
| LOG(ERROR) << tag << ": Overriding a fully specified dimension is disallowed"; |
| return false; |
| } |
| } |
| } else { |
| if (!allowUnspecified && TypeManager::get()->isTensorType(operand.type) && |
| tensorHasUnspecifiedDimensions(operand)) { |
| LOG(ERROR) << tag << ": Setting with operand type that is not fully specified"; |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| ExecutionBuilder::ExecutionBuilder(const CompilationBuilder* compilation) |
| : mCompilation(compilation), |
| mModel(compilation->mModel), |
| mPlan(&compilation->mPlan), |
| mAllowCpuFallback(DeviceManager::partitioningAllowsFallback(compilation->mPartitioning)), |
| mInputs(mModel->inputCount()), |
| mOutputs(mModel->outputCount()) { |
| VLOG(EXECUTION) << "ExecutionBuilder::ExecutionBuilder with " << mInputs.size() |
| << " inputs and " << mOutputs.size() << " outputs"; |
| } |
| |
| SimpleExecutionBuilder::SimpleExecutionBuilder(const CompilationBuilder* compilation) |
| : ExecutionBuilder(compilation) { |
| CHECK(mPlan->isSimple()); |
| } |
| |
| CompoundExecutionBuilder::CompoundExecutionBuilder(const CompilationBuilder* compilation) |
| : ExecutionBuilder(compilation) { |
| CHECK(mPlan->isCompound()); |
| } |
| |
| const ModelBuilder* ExecutionBuilder::getSourceModel(uint32_t index) const { |
| return mPlan->getSourceModels().getModel(index); |
| } |
| |
| int ExecutionBuilder::setInput(uint32_t index, const ANeuralNetworksOperandType* type, |
| const void* buffer, size_t length) { |
| if (computationStarted()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setInput called after the " |
| "execution has started."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| uint32_t count = static_cast<uint32_t>(mInputs.size()); |
| if (index >= count) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setInput bad index " << index << " " << count; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (!checkDimensionInfo(mModel->getInputOperand(index), type, |
| "ANeuralNetworksExecution_setInput", buffer == nullptr)) { |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
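| // Lengths are carried through the runtime as uint32_t (see the cast below), so sizes |
| // that do not fit in 32 bits are rejected. |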
| if (length > 0xFFFFFFFF) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setInput input exceeds max length " << length; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| uint32_t l = static_cast<uint32_t>(length); |
| if (!mInputs[index].unspecified()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setInput called when an input has already been " |
| "provided"; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| int n; |
| std::tie(n, mInputs[index]) = ModelArgumentInfo::createFromPointer( |
| mModel->getInputOperand(index), type, const_cast<void*>(buffer), l, |
| mInputAndOutputPaddingEnabled); |
| mHasCalledSetInputOutput = true; |
| return n; |
| } |
| |
| int ExecutionBuilder::setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type, |
| const RuntimeMemory* memory, size_t offset, |
| size_t length) { |
| // Should be similar to StepExecutor::setInputOrOutputFromMemory() |
| |
| if (computationStarted()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory called after the " |
| "execution has started."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| uint32_t count = static_cast<uint32_t>(mInputs.size()); |
| if (index >= count) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory bad index " << index << " " |
| << count; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (!checkDimensionInfo(mModel->getInputOperand(index), type, |
| "ANeuralNetworksExecution_setInputFromMemory", false)) { |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (!memory->getValidator().validate(mCompilation, IOType::INPUT, index, type, offset, |
| length)) { |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc, we |
| // allow the client to specify offset == 0 && length == 0 indicating that the entire memory |
| // region is used. We update the length here because the drivers are still expecting a real |
| // length. For other memories that do not allow this semantic, it is checked in |
| // MemoryValidatorBase::validate before reaching here. |
| if (validate(memory->getMemory()).ok() && offset == 0 && length == 0) { |
| length = memory->getSize(); |
| } |
| // TODO validate the rest |
| uint32_t poolIndex = mMemories.add(memory); |
| if (!mInputs[index].unspecified()) { |
| LOG(ERROR) |
| << "ANeuralNetworksExecution_setInputFromMemory called when an input has already " |
| "been provided"; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| int n; |
| std::tie(n, mInputs[index]) = |
| ModelArgumentInfo::createFromMemory(mModel->getInputOperand(index), type, poolIndex, |
| offset, length, mInputAndOutputPaddingEnabled); |
| mHasCalledSetInputOutput = true; |
| return n; |
| } |
| |
| int ExecutionBuilder::setOutput(uint32_t index, const ANeuralNetworksOperandType* type, |
| void* buffer, size_t length) { |
| if (computationStarted()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setOutput called after the " |
| "execution has started."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| uint32_t count = static_cast<uint32_t>(mOutputs.size()); |
| if (index >= count) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setOutput bad index " << index << " " << count; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (!checkDimensionInfo(mModel->getOutputOperand(index), type, |
| "ANeuralNetworksExecution_setOutput", true)) { |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (length > 0xFFFFFFFF) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setOutput input exceeds max length " << length; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| uint32_t l = static_cast<uint32_t>(length); |
| if (!mOutputs[index].unspecified()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setOutput called when an output has already been " |
| "provided"; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| int n; |
| std::tie(n, mOutputs[index]) = ModelArgumentInfo::createFromPointer( |
| mModel->getOutputOperand(index), type, buffer, l, mInputAndOutputPaddingEnabled); |
| mHasCalledSetInputOutput = true; |
| return n; |
| } |
| |
| int ExecutionBuilder::setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type, |
| const RuntimeMemory* memory, size_t offset, |
| size_t length) { |
| // Should be similar to StepExecutor::setInputOrOutputFromMemory() |
| |
| if (computationStarted()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called after the " |
| "execution has started."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| uint32_t count = static_cast<uint32_t>(mOutputs.size()); |
| if (index >= count) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory bad index " << index << " " |
| << count; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (!checkDimensionInfo(mModel->getOutputOperand(index), type, |
| "ANeuralNetworksExecution_setOutputFromMemory", true)) { |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (!memory->getValidator().validate(mCompilation, IOType::OUTPUT, index, type, offset, |
| length)) { |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc, we |
| // allow the client to specify offset == 0 && length == 0 indicating that the entire memory |
| // region is used. We update the length here because the drivers are still expecting a real |
| // length. For other memories that do not allow this semantic, it is checked in |
| // MemoryValidatorBase::validate before reaching here. |
| if (validate(memory->getMemory()).ok() && offset == 0 && length == 0) { |
| length = memory->getSize(); |
| } |
| // TODO validate the rest |
| uint32_t poolIndex = mMemories.add(memory); |
| if (!mOutputs[index].unspecified()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called when an output has " |
| "already been provided"; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| int n; |
| std::tie(n, mOutputs[index]) = |
| ModelArgumentInfo::createFromMemory(mModel->getOutputOperand(index), type, poolIndex, |
| offset, length, mInputAndOutputPaddingEnabled); |
| mHasCalledSetInputOutput = true; |
| return n; |
| } |
| |
| int ExecutionBuilder::setMeasureTiming(bool measure) { |
| if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called on " |
| << "an ANeuralNetworksExecution created from an ANeuralNetworksCompilation " |
| << "that was not created by ANeuralNetworksCompilation_createForDevices " |
| << "with numDevices = 1"; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (computationStarted()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called after the " |
| "execution has started."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| mMeasureTiming = measure; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| int ExecutionBuilder::getDuration(int32_t durationCode, uint64_t* duration) const { |
| if (!completed()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getDuration called before the " |
| "execution has finished."; |
| *duration = UINT64_MAX; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| if (completedWith() != Completion::NO_ERROR) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getDuration called on an execution " |
| "that has encountered an error."; |
| *duration = UINT64_MAX; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| |
| if (!mMeasureTiming) { |
| *duration = UINT64_MAX; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| |
| Timing timingLaunched = mTimingWithoutFencedExecutionCallback; |
| Timing timingFenced = timingLaunched; |
| if (mFencedExecutionCallback != nullptr) { |
| auto result = mFencedExecutionCallback(); |
| if (!result.has_value()) { |
| LOG(ERROR) << "Fenced execution callback failed: " << result.error().message; |
| *duration = UINT64_MAX; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| std::tie(timingLaunched, timingFenced) = std::move(result).value(); |
| } |
| const OptionalDuration selectedDuration = [durationCode, &timingLaunched, |
| &timingFenced]() -> OptionalDuration { |
| switch (durationCode) { |
| case ANEURALNETWORKS_DURATION_ON_HARDWARE: |
| return timingLaunched.timeOnDevice; |
| case ANEURALNETWORKS_DURATION_IN_DRIVER: |
| return timingLaunched.timeInDriver; |
| case ANEURALNETWORKS_FENCED_DURATION_ON_HARDWARE: |
| return timingFenced.timeOnDevice; |
| case ANEURALNETWORKS_FENCED_DURATION_IN_DRIVER: |
| return timingFenced.timeInDriver; |
| default: |
| LOG(FATAL) << "unexpected durationCode " << durationCode; |
| return std::nullopt; |
| } |
| }(); |
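| // UINT64_MAX is reserved to mean "no timing information is available", so a measured |
| // duration is clamped to UINT64_MAX - 1. |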
| if (selectedDuration.has_value()) { |
| constexpr uint64_t kMaxTiming = std::numeric_limits<uint64_t>::max() - 1; |
| using CommonType = std::common_type_t<Duration::rep, uint64_t>; |
| const auto count = std::min<CommonType>(selectedDuration.value().count(), kMaxTiming); |
| *duration = static_cast<uint64_t>(count); |
| } else { |
| constexpr uint64_t kNoTiming = std::numeric_limits<uint64_t>::max(); |
| *duration = kNoTiming; |
| } |
| |
| VLOG(EXECUTION) << "getDuration(" << durationCode << "): " << *duration; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| int ExecutionBuilder::setTimeoutDuration(uint64_t duration) { |
| if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called on an ANeuralNetworksExecution " |
| "created from an ANeuralNetworksCompilation that was not created by " |
| "ANeuralNetworksCompilation_createForDevices with numDevices = 1"; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (computationStarted()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called after the execution has started."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| if (duration > 0) { |
| mTimeoutDuration = duration; |
| } else { |
| mTimeoutDuration.reset(); |
| } |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| std::optional<uint64_t> ExecutionBuilder::getTimeoutDuration() const { |
| return mTimeoutDuration; |
| } |
| |
| TimePoint ExecutionBuilder::getComputeStartTimePoint() const { |
| CHECK(computationStarted()) << "getComputeStartTimePoint called before " |
| << "execution has started."; |
| return mComputeStartTimePoint; |
| } |
| |
| int ExecutionBuilder::setLoopTimeout(uint64_t duration) { |
| if (computationStarted()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setLoopTimeout called after the " |
| "execution has started."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| if (duration > operation_while::kTimeoutNsMaximum) { |
| LOG(WARNING) << "ANeuralNetworksExecution_setLoopTimeout input exceeds the maximum allowed " |
| << "duration: " << duration << " > " << operation_while::kTimeoutNsMaximum; |
| duration = operation_while::kTimeoutNsMaximum; |
| } |
| mLoopTimeoutDuration = duration; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| int ExecutionBuilder::enableInputAndOutputPadding(bool enable) { |
| if (computationStarted()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_enableInputAndOutputPadding called after the " |
| "execution has started."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| if (mHasCalledSetInputOutput) { |
| LOG(ERROR) << "ANeuralNetworksExecution_enableInputAndOutputPadding called after an input " |
| "or output is set."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| mInputAndOutputPaddingEnabled = enable; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| int ExecutionBuilder::setReusable(bool reusable) { |
| if (computationStarted()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_setReusable called after the " |
| "execution has started."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| mReusable = reusable; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| int ExecutionBuilder::addExtensionAttribute(const char* extensionName, |
| uint16_t attributeCodeWithinExtension, const void* data, |
| size_t length) { |
| if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) { |
| LOG(ERROR) << "ANeuralNetworksExecution_addExtensionAttribute called on an " |
| "ANeuralNetworksExecution created from an ANeuralNetworksCompilation that " |
| "was not created by ANeuralNetworksCompilation_createForDevices with " |
| "numDevices = 1"; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (computationStarted()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_addExtensionAttribute called after the execution " |
| "has started."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| int32_t attributeToken = 0; |
| if (!TypeManager::get()->getExtensionType(extensionName, attributeCodeWithinExtension, |
| &attributeToken)) { |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| if (std::find_if(mMetadata.begin(), mMetadata.end(), [attributeToken](const auto& entry) { |
| return attributeToken == entry.token; |
| }) != mMetadata.end()) { |
| LOG(ERROR) << "ANeuralNetworksCompilation_addExtensionAttribute called more than once for " |
| "the same attribute"; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| const uint8_t* dataPtr = reinterpret_cast<const uint8_t*>(data); |
| mMetadata.push_back({attributeToken, std::vector<uint8_t>(dataPtr, dataPtr + length)}); |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| int ExecutionBuilder::getOutputOperandDimensions(uint32_t index, uint32_t* dimensions) { |
| if (!completed()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called before the " |
| "execution has finished."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| if (completedWith() == Completion::OTHER_ERROR) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called on an execution " |
| "that has encountered an error."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| |
| uint32_t count = static_cast<uint32_t>(mOutputs.size()); |
| if (index >= count) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions bad index " << index |
| << " " << count; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| const auto& dims = mOutputs[index].dimensions(); |
| if (dims.empty()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions can not query " |
| "dimensions of a scalar"; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| std::copy(dims.begin(), dims.end(), dimensions); |
| return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR |
| : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE; |
| } |
| |
| int ExecutionBuilder::getOutputOperandRank(uint32_t index, uint32_t* rank) { |
| if (!completed()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called before the " |
| "execution has finished."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| if (completedWith() == Completion::OTHER_ERROR) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called on an execution " |
| "that has encountered an error."; |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| uint32_t count = static_cast<uint32_t>(mOutputs.size()); |
| if (index >= count) { |
| LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank bad index " << index << " " |
| << count; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| *rank = static_cast<uint32_t>(mOutputs[index].dimensions().size()); |
| return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR |
| : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE; |
| } |
| |
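| // Atomically transitions the execution into the COMPUTATION state. This enforces the |
| // single-use contract of non-reusable executions and rejects starting a computation |
| // that is already in flight. |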
| bool ExecutionBuilder::checkAndSetComputationState(const char* name) { |
| std::lock_guard<std::mutex> lock(mStateMutex); |
| if (!mReusable && mState == State::COMPLETED) { |
| LOG(ERROR) << "ANeuralNetworksExecution_" << name |
| << " called on a non-reusable execution that has already completed"; |
| return false; |
| } |
| if (mState == State::COMPUTATION) { |
| LOG(ERROR) << "ANeuralNetworksExecution_" << name |
| << " called on an execution that has already started"; |
| return false; |
| } |
| mState = State::COMPUTATION; |
| return true; |
| } |
| |
| // TODO(b/132321855): validate that we have full types for all inputs and outputs, |
| // that the graph is not cyclic, |
| static int validateRequest(const std::vector<ModelArgumentInfo>& inputs, |
| const std::vector<ModelArgumentInfo>& outputs) { |
| for (auto& p : inputs) { |
| if (p.state() == ModelArgumentInfo::UNSPECIFIED) { |
| LOG(ERROR) << "ANeuralNetworksExecution starts compute when not all inputs specified"; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| } |
| for (auto& p : outputs) { |
| if (p.state() == ModelArgumentInfo::UNSPECIFIED) { |
| LOG(ERROR) << "ANeuralNetworksExecution starts compute when not all outputs specified"; |
| return ANEURALNETWORKS_BAD_DATA; |
| } |
| } |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| int ExecutionBuilder::getValidationResultCode() { |
| if (!mValidationResultCode.has_value()) { |
| mValidationResultCode = validateRequest(mInputs, mOutputs); |
| } |
| return mValidationResultCode.value(); |
| } |
| |
| bool ExecutionBuilder::areOutputsFullySpecified() { |
| if (!mOutputsFullySpecified.has_value()) { |
| mOutputsFullySpecified = true; |
| for (uint32_t i = 0; i < mOutputs.size(); i++) { |
| if (mOutputs[i].state() != ModelArgumentInfo::HAS_NO_VALUE && |
| TypeManager::get()->isTensorType(mModel->getOutputOperand(i).type) && |
| tensorHasUnspecifiedDimensions(mModel->getOutputOperand(i).type, |
| mOutputs[i].initialDimensions())) { |
| mOutputsFullySpecified = false; |
| break; |
| } |
| } |
| } |
| return mOutputsFullySpecified.value(); |
| } |
| |
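| // Common entry point for all compute variants: transitions the state machine and |
| // verifies that every input and output has been set. On validation failure, the |
| // computation is immediately marked as finished with the failing result code. |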
| int ExecutionBuilder::prepareForCompute(const char* name, ExecutionMode mode) { |
| if (!checkAndSetComputationState(name)) { |
| return ANEURALNETWORKS_BAD_STATE; |
| } |
| if (int n = getValidationResultCode(); n != ANEURALNETWORKS_NO_ERROR) { |
| return finishComputation(n, {}, mode); |
| } |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| |
| // Attempt synchronous execution of full model on CPU. |
| // TODO: How should we handle timing in this case? |
| // For Q this is irrelevant: We only support timing in conjunction |
| // with an explicit device list; and we do not support CPU fallback |
| // with an explicit device list. See CompilationBuilder::mExplicitDeviceList. |
| static std::tuple<int, std::vector<OutputShape>, Timing> cpuFallbackFull( |
| ExecutionBuilder* executionBuilder) { |
| CHECK(executionBuilder != nullptr); |
| NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackFull"); |
| VLOG(EXECUTION) << "cpuFallbackFull"; |
| |
| // Get fallback executor. |
| StepExecutor executor(executionBuilder, executionBuilder->getModel(), |
| DeviceManager::getCpuDevice(), /*preparedModel=*/nullptr, |
| /*reusable=*/false); |
| executor.mapInputsAndOutputsTrivially(); |
| |
| // Attempt fallback execution. |
| return executor.computeOnCpuFallback(); |
| } |
| |
| // Attempt synchronous execution on CPU. |
| // TODO: How should we handle timing in this case? |
| // For Q this is irrelevant: We only support timing in conjunction |
| // with an explicit device list; and we do not support CPU fallback |
| // with an explicit device list. See CompilationBuilder::mExplicitDeviceList. |
| static std::tuple<int, std::vector<OutputShape>, Timing, std::shared_ptr<StepExecutor>> |
| cpuFallbackPartial(const ExecutionPlan& plan, |
| std::shared_ptr<ExecutionPlan::Controller> controller) { |
| NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackPartial"); |
| VLOG(EXECUTION) << "cpuFallbackPartial"; |
| |
| // Get fallback executor. |
| std::shared_ptr<StepExecutor> executor; |
| int n1 = plan.fallback(controller, &executor, nullptr, nullptr); |
| if (n1 != ANEURALNETWORKS_NO_ERROR) { |
| return {n1, {}, {}, nullptr}; |
| } |
| CHECK(executor != nullptr); |
| |
| // Attempt fallback execution. |
| auto [n2, outputShapes, timing] = executor->computeOnCpuFallback(); |
| return {n2, std::move(outputShapes), timing, executor}; |
| } |
| |
| std::tuple<int, std::vector<OutputShape>, Timing> SimpleExecutionBuilder::computeInternal( |
| const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) { |
| NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "SimpleExecutionBuilder::computeInternal"); |
| VLOG(EXECUTION) << "SimpleExecutionBuilder::computeInternal"; |
| |
| if (mExecutor == nullptr) { |
| mExecutor = mPlan->makeStepExecutor(mReusable, this); |
| } |
| |
| auto burstController = burstBuilder ? burstBuilder->getControllerAt(0) : nullptr; |
| auto [n, outputShapes, timing] = mExecutor->compute(deadline, burstController); |
| |
| if (n == ANEURALNETWORKS_NO_ERROR) { |
| return {n, std::move(outputShapes), timing}; |
| } |
| |
| // ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE is not recoverable. |
| if (n == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) { |
| return {n, std::move(outputShapes), {}}; |
| } |
| |
| // If CPU fallback is not allowed and there was an error, end execution. |
| if (!mAllowCpuFallback) { |
| return {n, {}, {}}; |
| } |
| |
| // If CPU execution was already attempted, do not perform CPU fallback. |
| if (mExecutor->isCpu()) { |
| return {n, {}, {}}; |
| } |
| |
| // If the code has reached this point, a potentially recoverable error |
| // occurred during the execution. Do an execution fallback on the CPU. |
| return cpuFallbackFull(this); |
| } |
| |
| std::tuple<int, std::vector<OutputShape>, Timing> CompoundExecutionBuilder::computeInternal( |
| const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) { |
| NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "CompoundExecutionBuilder::computeInternal"); |
| VLOG(EXECUTION) << "CompoundExecutionBuilder::computeInternal (from plan, iteratively)"; |
| |
| auto controller = mPlan->makeController(this, burstBuilder); |
| std::vector<OutputShape> outputShapes = getInitialOutputShapes(); |
| |
| // On this iteration, do I need to repeat the previous step because it |
| // reported insufficient size? |
| bool doInsufficientSizeFallback = false; |
| |
| while (true) { |
| VLOG(EXECUTION) << "looking for next StepExecutor"; |
| |
| // Get the current step of the execution. |
| std::shared_ptr<StepExecutor> executor; |
| SharedBurst burstController; |
| int n = doInsufficientSizeFallback |
| ? mPlan->fallback(controller, &executor, &burstController, &outputShapes) |
| : mPlan->next(controller, &executor, &burstController, &outputShapes); |
| doInsufficientSizeFallback = false; |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| // During the interpreted execution of control flow, a loop timeout |
| // might occur in ExecutionPlan::next(). |
| bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT || |
| n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT; |
| if (mAllowCpuFallback && !missedDeadline) break; |
| return {n, {}, {}}; |
| } |
| |
| // If the code reached the end of the plan without error, then return |
| // with no error. |
| if (executor == nullptr) { |
| return {ANEURALNETWORKS_NO_ERROR, outputShapes, {}}; |
| } |
| const bool executorIsCpu = executor->isCpu(); |
| |
| // Attempt to execute a single step of the execution. |
| auto [stepN, stepOutputShapes, _] = executor->compute(deadline, burstController); |
| |
| // Update global outputs and dynamic temporaries. |
| StepExecutor::UpdateOutputShapes updateOutputShapes = {}; |
| if (!executor->updateOutputShapes(stepN, stepOutputShapes, &outputShapes, |
| &updateOutputShapes)) { |
| stepN = ANEURALNETWORKS_OP_FAILED; |
| } |
| |
| // If execution was successful, continue to next step. |
| if (stepN == ANEURALNETWORKS_NO_ERROR) { |
| if (updateOutputShapes.zeroSizedInput) { |
| // We'll need to do full model CPU fallback |
| VLOG(EXECUTION) << "updateOutputShapes.zeroSizedInput"; |
| stepN = ANEURALNETWORKS_OP_FAILED; |
| } else { |
| CHECK(executor->areDynamicTemporariesAllocated()); |
| continue; |
| } |
| } |
| |
| if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) { |
| VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: " << toString(updateOutputShapes); |
| if (updateOutputShapes.mainOutputInsufficient || |
| !updateOutputShapes.updatedDynamicTemporary) { |
| // Either: |
| // - At least one main model output is not of sufficient size; or |
| // - we didn't learn anything new about dynamic temporaries. |
| // Neither of these is recoverable, so end execution. |
| return {stepN, outputShapes, {}}; |
| } |
| // Every main model output is of sufficient size. This implies that |
| // at least one dynamic temporary is not of sufficient size. This |
| // is recoverable. |
| doInsufficientSizeFallback = true; |
| continue; |
| } |
| |
| // If CPU fallback is not allowed and there was an error, end execution. |
| if (!mAllowCpuFallback) { |
| return {stepN, {}, {}}; |
| } |
| |
| // If CPU execution was already attempted, perform a full CPU fallback. |
| if (executorIsCpu) { |
| break; |
| } |
| |
| // If the code reaches this point, attempt a partial fallback to CPU. |
| CHECK(mAllowCpuFallback); |
| if (updateOutputShapes.zeroSizedInput) { |
| // Do not attempt a partial fallback. |
| break; |
| } |
| while (true) { |
| auto [fallbackN, fallbackOutputShapes, _, fallbackExecutor] = |
| cpuFallbackPartial(*mPlan, controller); |
| |
| // Update global outputs and dynamic temporaries. |
| StepExecutor::UpdateOutputShapes fallbackUpdateOutputShapes = {}; |
| if (fallbackExecutor != nullptr && |
| !fallbackExecutor->updateOutputShapes(fallbackN, fallbackOutputShapes, |
| &outputShapes, &fallbackUpdateOutputShapes)) { |
| fallbackN = ANEURALNETWORKS_OP_FAILED; |
| } |
| |
| // If execution was successful, continue to next step. |
| if (fallbackN == ANEURALNETWORKS_NO_ERROR) { |
| if (fallbackUpdateOutputShapes.zeroSizedInput) { |
| // We'll need to do full model CPU fallback |
| VLOG(EXECUTION) << "fallbackUpdateOutputShapes.zeroSizedInput"; |
| fallbackN = ANEURALNETWORKS_OP_FAILED; |
| break; |
| } |
| CHECK(fallbackExecutor->areDynamicTemporariesAllocated()); |
| goto nextStep; |
| } |
| |
| if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) { |
| VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: " |
| << toString(fallbackUpdateOutputShapes); |
| if (fallbackUpdateOutputShapes.mainOutputInsufficient || |
| !fallbackUpdateOutputShapes.updatedDynamicTemporary) { |
| // Either: |
| // - At least one main model output is not of sufficient size; or |
| // - we didn't learn anything new about dynamic temporaries. |
| // Neither of these is recoverable, so end execution. |
| return {fallbackN, outputShapes, {}}; |
| } |
| // Every main model output is of sufficient size. This implies |
| // that at least one dynamic temporary is not of sufficient |
| // size. This is recoverable. |
| continue; |
| } |
| |
| // If the code reaches this point, then there was an error with the |
| // fallback. In this case, attempt full fallback. |
| break; |
| } |
| |
| // If the code reaches this point, then there was an error with the |
| // fallback. In this case, attempt full fallback. |
| break; |
| |
| nextStep: |
| // Bottom of the outer loop |
| continue; |
| } |
| |
| // If the code has reached this point, a potentially recoverable error |
| // occurred during the step executions. Instead, do a full execution |
| // fallback on the CPU. |
| return cpuFallbackFull(this); |
| } |
| |
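| // Blocks until every valid sync fence fd in waitFor has signaled. The -1 timeout passed |
| // to syncWait means there is no deadline; any result other than SIGNALED (an error or an |
| // unsignaled fence) is treated as failure. |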
| static bool waitForSyncFences(const std::vector<int>& waitFor) { |
| for (int syncFd : waitFor) { |
| if (syncFd > 0) { |
| auto r = syncWait(syncFd, -1); |
| if (r != FenceState::SIGNALED) { |
| VLOG(EXECUTION) << "syncWait failed, fd: " << syncFd; |
| return false; |
| } |
| } |
| } |
| return true; |
| } |
| |
| std::tuple<int, int, ExecuteFencedInfoCallback> SimpleExecutionBuilder::computeFencedInternal( |
| const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence, |
| const OptionalTimePoint& deadline) { |
| NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "SimpleExecutionBuilder::computeFencedInternal"); |
| VLOG(EXECUTION) << "SimpleExecutionBuilder::computeFencedInternal"; |
| |
| if (mExecutor == nullptr) { |
| mExecutor = mPlan->makeStepExecutor(mReusable, this); |
| } |
| |
| auto [n, syncFd, callback] = |
| mExecutor->computeFenced(waitFor, timeoutDurationAfterFence, deadline); |
| |
| if (n == ANEURALNETWORKS_NO_ERROR) { |
| return {ANEURALNETWORKS_NO_ERROR, syncFd, callback}; |
| } |
| |
| // If CPU fallback is not allowed and there was an error, end execution. |
| if (!mAllowCpuFallback) { |
| return {n, -1, nullptr}; |
| } |
| |
| // If CPU execution was already attempted, return from the function with an error. |
| if (mExecutor->isCpu()) { |
| return {n, -1, nullptr}; |
| } |
| |
| // If the code has reached this point, a potentially recoverable error |
| // occurred during the step executions. Instead, do a full execution |
| // fallback on the CPU. |
| VLOG(EXECUTION) << "Performing full fallback on the CPU."; |
| if (!waitForSyncFences(waitFor)) { |
| return {ANEURALNETWORKS_OP_FAILED, -1, nullptr}; |
| } |
| auto [fallbackN, fallbackOutputShapes, fallbackTiming] = cpuFallbackFull(this); |
| reportTimingWithoutFencedExecutionCallback(fallbackTiming); |
| return {fallbackN, -1, nullptr}; |
| } |
| |
| // In case of partitioned execution, the computeFencedInternal call returns the sync |
| // fence and the fenced compute callback of the last partition. |
| // Any failed partition causes the whole execution to fall back to CPU if |
| // mAllowCpuFallback is set to true. |
| std::tuple<int, int, ExecuteFencedInfoCallback> CompoundExecutionBuilder::computeFencedInternal( |
| const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence, |
| const OptionalTimePoint& deadline) { |
| NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "CompoundExecutionBuilder::computeFencedInternal"); |
| VLOG(EXECUTION) << "CompoundExecutionBuilder::computeFencedInternal (from plan, iteratively)"; |
| |
| // We should have detected this earlier in the call chain and fallen back to |
| // non-fenced execution. This is an implementation limitation: In order to |
| // support dynamic temporaries in this code, we'd need to implement |
| // something like the following: |
| // - If a partition has outputs of unknown size, compute that partition in a |
| // non-fenced fashion, just as if it were scheduled on a driver that does |
| // not support fenced execution. |
| // - Implement something similar to the code in CompoundExecutionBuilder::computeInternal() |
| // that handles a step execution that fails with |
| // ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE. |
| CHECK(!mCompilation->hasDynamicTemporaries()); |
| |
| // Initialize waitForFds and syncFence for the first step. |
| std::vector<int> waitForFds = waitFor; |
| base::unique_fd syncFence; |
| ExecuteFencedInfoCallback executeFencedInfoCallback; |
| |
| std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr); |
| while (true) { |
| VLOG(EXECUTION) << "looking for next StepExecutor"; |
| |
| // Get the current step of the execution. |
| std::shared_ptr<StepExecutor> executor; |
| int n = mPlan->next(controller, &executor, nullptr, nullptr, syncFence.get()); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| // During the interpreted execution of control flow, a loop timeout |
| // might occur in ExecutionPlan::next(). |
| bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT || |
| n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT; |
| if (mAllowCpuFallback && !missedDeadline) break; |
| // Return -1 for the sync fence fd, and nullptr for the callback. |
| return {n, -1, nullptr}; |
| } |
| |
| // If the code reached the end of the plan without error, then return |
| // with no error. |
| if (executor == nullptr) { |
| return {ANEURALNETWORKS_NO_ERROR, syncFence.release(), executeFencedInfoCallback}; |
| } |
| |
| // Attempt to compute a single step of the execution. |
| auto [stepN, syncFd, callback] = |
| executor->computeFenced(waitForFds, timeoutDurationAfterFence, deadline); |
| |
| // Update waitForFds, syncFence for the next step. |
| syncFence.reset(syncFd); |
| executeFencedInfoCallback = callback; |
| waitForFds.clear(); |
| if (syncFd >= 0) { |
| waitForFds = {syncFd}; |
| } |
| |
| // If execution was successful, continue to next step. |
| if (stepN == ANEURALNETWORKS_NO_ERROR) { |
| continue; |
| } |
| // If CPU fallback is not allowed and there was an error, end execution. |
| if (!mAllowCpuFallback) { |
| return {stepN, -1, nullptr}; |
| } |
| |
| // If the code reaches this point, then there was an error with a step |
| // execution. In this case, attempt a full CPU fallback. |
| break; |
| } |
| |
| // If the code has reached this point, a potentially recoverable error |
| // occurred during the step executions. Instead, do a full execution |
| // fallback on the CPU. |
| VLOG(EXECUTION) << "Performing full fallback on the CPU."; |
| if (!waitForSyncFences(waitFor)) { |
| return {ANEURALNETWORKS_OP_FAILED, -1, nullptr}; |
| } |
| auto [fullN, fullOutputShapes, _] = cpuFallbackFull(this); |
| return {fullN, -1, nullptr}; |
| } |
| |
| int ExecutionBuilder::computeFenced(const std::vector<int>& waitFor, |
| uint64_t timeoutDurationAfterFence, int* syncFence) { |
| CHECK(syncFence != nullptr); |
| NN_RETURN_IF_ERROR( |
| prepareForCompute("startComputeWithDependencies", ExecutionMode::ASYNC_WITH_DEPS)); |
| if (timeoutDurationAfterFence > 0) { |
| if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) { |
| LOG(ERROR) |
| << "ANeuralNetworksExecution_startComputeWithDependencies called with non-zero " |
| "duration on an ANeuralNetworksExecution " |
| "created from an ANeuralNetworksCompilation that was not created by " |
| "ANeuralNetworksCompilation_createForDevices with numDevices = 1"; |
| return finishComputation(ANEURALNETWORKS_BAD_DATA, {}, ExecutionMode::ASYNC_WITH_DEPS); |
| } |
| } |
| if (!areOutputsFullySpecified()) { |
| LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies" |
| " not all outputs have fully specified dimensions"; |
| return finishComputation(ANEURALNETWORKS_BAD_DATA, {}, ExecutionMode::ASYNC_WITH_DEPS); |
| } |
| |
| // Unlike ExecutionBuilder::compute, we do not need to reset output dimensions here because |
| // fenced executions do not support dynamic output shape. |
| |
| mComputeStartTimePoint = Clock::now(); |
| VLOG(EXECUTION) << "ExecutionBuilder::computeFenced"; |
| int result; |
| const auto deadline = makeDeadline(mTimeoutDuration); |
| std::tie(result, *syncFence, mFencedExecutionCallback) = |
| computeFencedInternal(waitFor, timeoutDurationAfterFence, deadline); |
| // If there is an error, call finishComputation to mark the computation as completed. |
| // Otherwise, we will call finishComputation in SyncFenceEvent::wait(). |
| if (result != ANEURALNETWORKS_NO_ERROR) { |
| // TODO(miaowang): support dynamic output shape only with memory domain. |
| // For now just return empty output shapes. |
| result = finishComputation(result, {}, ExecutionMode::ASYNC_WITH_DEPS); |
| } |
| return result; |
| } |
| |
| int ExecutionBuilder::compute(std::shared_ptr<ExecutionCallback>* synchronizationCallback, |
| BurstBuilder* burstBuilder) { |
| CHECK(synchronizationCallback == nullptr || burstBuilder == nullptr) |
| << "synchronizationCallback and burstBuilder cannot simultaneously be used"; |
| |
| const bool synchronous = (synchronizationCallback == nullptr); |
| if (!synchronous) { |
| *synchronizationCallback = nullptr; |
| } |
| |
| const char* name = burstBuilder ? "burstCompute" : synchronous ? "compute" : "startCompute"; |
| const ExecutionMode mode = burstBuilder |
| ? ExecutionMode::BURST |
| : synchronous ? ExecutionMode::SYNC : ExecutionMode::ASYNC; |
| NN_RETURN_IF_ERROR(prepareForCompute(name, mode)); |
| |
| // Validate input memory dimensions. We need to do the validation in every computation because |
| // the memory dimensions may change between computations. |
| for (auto& p : mInputs) { |
| if (p.state() == ModelArgumentInfo::MEMORY) { |
| const RuntimeMemory* memory = mMemories[p.locationAndLength().poolIndex]; |
| if (!memory->getValidator().validateInputDimensions(p.dimensions())) { |
| return finishComputation(ANEURALNETWORKS_OP_FAILED, {}, mode); |
| } |
| } |
| } |
| |
| // Reset output dimensions. |
| if (!areOutputsFullySpecified()) { |
| for (auto& output : mOutputs) { |
| output.reset(); |
| } |
| } |
| |
| const auto deadline = makeDeadline(mTimeoutDuration); |
| mComputeStartTimePoint = Clock::now(); |
| if (synchronous) { |
| if (burstBuilder) { |
| VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API, burst)"; |
| } else { |
| VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)"; |
| } |
| const auto [n, outputShapes, timing] = computeInternal(deadline, burstBuilder); |
| if (mMeasureTiming) { |
| mTimingWithoutFencedExecutionCallback = timing; |
| } |
| return finishComputation(n, outputShapes, mode); |
| } else /* asynchronous */ { |
| // TODO: For asynchronous execution, entire plan-based-path should run in an |
| // asynchronous thread -- take the asynchronous thread logic out of |
| // CpuExecution::compute() and use it to wrap the plan-based-path. |
| |
| // TODO: use a thread pool |
| // TODO(mikie): this could have NNTRACE so we could measure the overhead |
| // of spinning up a new thread. |
| |
| // Prepare the callback for asynchronous execution. |
| // std::shared_ptr<ExecutionCallback> object is returned when the |
| // execution has been successfully launched, otherwise a |
| // nullptr is returned. The executionCallback is |
| // abstracted in the NN API as an "event". |
| auto executionCallback = std::make_shared<ExecutionCallback>(); |
| executionCallback->setOnFinish( |
| [this, mode](ErrorStatus error, const std::vector<OutputShape>& outputShapes) { |
| return finishComputation(error, outputShapes, mode); |
| }); |
| const auto asyncStartCompute = [this, deadline, executionCallback] { |
| const auto [n, outputShapes, timing] = computeInternal(deadline, nullptr); |
| const auto status = convertResultCodeToErrorStatus(n); |
| executionCallback->notify(status, outputShapes, timing); |
| }; |
| if (DeviceManager::get()->syncExecRuntime()) { |
| VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)"; |
| asyncStartCompute(); |
| } else { |
| VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)"; |
| std::thread asyncExecution(asyncStartCompute); |
| executionCallback->bindThread(std::move(asyncExecution)); |
| } |
| *synchronizationCallback = executionCallback; |
| return ANEURALNETWORKS_NO_ERROR; |
| } |
| } |
| |
| std::vector<OutputShape> ExecutionBuilder::getInitialOutputShapes() const { |
| std::vector<OutputShape> outputShapes(mOutputs.size()); |
| std::transform(mOutputs.begin(), mOutputs.end(), outputShapes.begin(), |
| [](const auto& x) -> OutputShape { |
| std::vector<uint32_t> dimensions; |
| if (x.state() != ModelArgumentInfo::HAS_NO_VALUE) { |
| dimensions = x.dimensions(); |
| } |
| return {.dimensions = std::move(dimensions), .isSufficient = true}; |
| }); |
| return outputShapes; |
| } |
| |
| // Check if the dimensions "to" is updatable by dimensions "from", where "from" must |
| // have no lower a specification level. |
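| // For example, (2, 0, 3) is updatable by (2, 4, 3) because the unspecified middle |
| // dimension gets filled in, but it is not updatable by (2, 4, 4) or by a vector of a |
| // different rank; an empty "to" (unknown rank) is updatable by anything. |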
| static bool isUpdatable(const std::vector<uint32_t>& to, const std::vector<uint32_t>& from) { |
| if (to.size() == 0) return true; |
| NN_RET_CHECK_EQ(to.size(), from.size()); |
| for (uint32_t i = 0; i < to.size(); i++) { |
| NN_RET_CHECK(to[i] == from[i] || to[i] == 0); |
| } |
| return true; |
| } |
| |
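| // An output is treated as a zero-sized tensor only if the step succeeded, its shape is |
| // sufficient and of known rank, and at least one dimension is 0. |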
| static bool isZeroSizedTensor(int executionResultCode, const OutputShape& outputShape) { |
| return (executionResultCode == ANEURALNETWORKS_NO_ERROR) && outputShape.isSufficient && |
| outputShape.dimensions.size() && |
| (std::find(outputShape.dimensions.begin(), outputShape.dimensions.end(), uint32_t(0)) != |
| outputShape.dimensions.end()); |
| } |
| |
| bool ExecutionBuilder::updateOutputShapes(ErrorStatus status, |
| const std::vector<OutputShape>& outputShapes) { |
| NN_RET_CHECK(validateOutputShapesFromDriver(status, mModel, outputShapes)); |
| |
| if (outputShapes.size() == 0) { |
| return true; |
| } |
| NN_RET_CHECK_EQ(outputShapes.size(), mOutputs.size()); |
| for (uint32_t i = 0; i < outputShapes.size(); i++) { |
| // Check if only unspecified dimensions or rank are overwritten. |
| NN_RET_CHECK(isUpdatable(mOutputs[i].dimensions(), outputShapes[i].dimensions)); |
| const OperandType operandType = mModel->getOutputOperand(i).type; |
| NN_RET_CHECK(!TypeManager::get()->sizeOfDataOverflowsUInt32(operandType, |
| outputShapes[i].dimensions)); |
| } |
| for (uint32_t i = 0; i < outputShapes.size(); i++) { |
| mOutputs[i].dimensions() = outputShapes[i].dimensions; |
| mOutputs[i].isSufficient() = outputShapes[i].isSufficient; |
| } |
| return true; |
| } |
| |
| bool ExecutionBuilder::updateMemories() { |
| for (const auto& output : mOutputs) { |
| if (output.state() != ModelArgumentInfo::MEMORY) continue; |
| const RuntimeMemory* memory = mMemories[output.locationAndLength().poolIndex]; |
| NN_RET_CHECK(memory->getValidator().updateMetadata({.dimensions = output.dimensions()})); |
| } |
| return true; |
| } |
| |
| int ExecutionBuilder::finishComputation(int result, const std::vector<OutputShape>& outputShapes, |
| ExecutionMode mode) { |
| const auto status = convertResultCodeToErrorStatus(result); |
| if (!updateOutputShapes(status, outputShapes) || !updateMemories()) { |
| result = ANEURALNETWORKS_OP_FAILED; |
| } |
| bool success = result == ANEURALNETWORKS_NO_ERROR; |
| for (const auto& output : mOutputs) { |
| if (output.state() != ModelArgumentInfo::MEMORY) continue; |
| const RuntimeMemory* memory = mMemories[output.locationAndLength().poolIndex]; |
| memory->getValidator().setInitialized(success); |
| } |
| switch (result) { |
| case ANEURALNETWORKS_NO_ERROR: |
| mCompletion = Completion::NO_ERROR; |
| break; |
| case ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE: |
| mCompletion = Completion::OUTPUT_INSUFFICIENT_SIZE; |
| break; |
| default: |
| mCompletion = Completion::OTHER_ERROR; |
| break; |
| } |
| { |
| std::lock_guard<std::mutex> lock(mStateMutex); |
| CHECK(mState != State::PREPARATION) |
| << "ExecutionBuilder::finishComputation is called in the preparation state"; |
| CHECK(mState != State::COMPLETED) << "ExecutionBuilder::finishComputation is called twice"; |
| mState = State::COMPLETED; |
| } |
| telemetry::onExecutionFinish(this, mode, result); |
| return result; |
| } |
| |
| std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes) { |
| return "{ .updatedDynamicTemporary = " + |
| std::to_string(updateOutputShapes.updatedDynamicTemporary) + |
| ", .mainOutputInsufficient = " + |
| std::to_string(updateOutputShapes.mainOutputInsufficient) + "}"; |
| } |
| |
| bool StepExecutor::updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from, |
| std::vector<OutputShape>* to, UpdateOutputShapes* update) { |
| CHECK(update != nullptr); |
| *update = {.updatedDynamicTemporary = false, |
| .mainOutputInsufficient = false, |
| .zeroSizedInput = false}; |
| |
| NN_RET_CHECK(validateOutputShapesFromDriver(executionResultCode, mModel, from)); |
| |
| if (from.size() == 0) { |
| return true; |
| } |
| |
| if (VLOG_IS_ON(EXECUTION)) { |
| for (const auto& shape : from) { |
| VLOG(EXECUTION) << "updateOutputShapes: " << shape; |
| } |
| } |
| |
| if (mExecutionStep != nullptr) { |
| const auto& indexMapping = mExecutionStep->getOutputIndexStepModelToMainModel(); |
| NN_RET_CHECK_LE(indexMapping.size(), from.size()); |
| for (uint32_t i = 0, e = indexMapping.size(); i < e; i++) { |
| const uint32_t toIndex = indexMapping[i]; |
| NN_RET_CHECK_GT(to->size(), toIndex); |
| NN_RET_CHECK(isUpdatable(to->at(toIndex).dimensions, from[i].dimensions)); |
| (*to)[toIndex] = from[i]; |
| update->mainOutputInsufficient |= !(*to)[toIndex].isSufficient; |
| if (mExecutionStep->getModelOutputsThatAreDownstreamInputs().count(toIndex) && |
| isZeroSizedTensor(executionResultCode, from[i])) { |
| update->zeroSizedInput = true; |
| } |
| } |
| |
| if (!mDynamicTemporaries->empty()) { |
| // TODO(b/157236079): Instead of computing this here, precompute it in ExecutionStep? |
| std::map<uint32_t, uint32_t> operandIndexStepModelOutputToSourceModelTemp; |
| for (const auto& entry : mExecutionStep->getTempsAsStepModelOutputs()) { |
| operandIndexStepModelOutputToSourceModelTemp.emplace(entry.second, entry.first); |
| } |
| |
| const uint32_t sourceModelIndex = mExecutionStep->getSourceModelIndex(); |
| for (uint32_t i = 0, e = mModel->outputCount(); i < e; i++) { |
| const uint32_t stepModelOperandIndex = mModel->getOutputOperandIndex(i); |
| const auto it = |
| operandIndexStepModelOutputToSourceModelTemp.find(stepModelOperandIndex); |
| if (it == operandIndexStepModelOutputToSourceModelTemp.end()) { |
| continue; |
| } |
| const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, it->second); |
| VLOG(EXECUTION) << "updateOutputShapes checking to see if output#" << i |
| << " sourceOperandIndex = (" << sourceOperandIndex.first << ", " |
| << sourceOperandIndex.second << ") is a dynamic temporary"; |
| // This is a temporary, but it might not be a dynamic temporary. |
| const auto loc = mDynamicTemporaries->lookup(sourceOperandIndex, false); |
| if (loc == std::nullopt) { |
| continue; |
| } |
| NN_RET_CHECK(isUpdatable(*loc->dimensions, from[i].dimensions)); |
| bool changedShape = false; |
| const uint32_t actualSize = TypeManager::get()->getSizeOfData( |
| mModel->getOperand(stepModelOperandIndex).type, from[i].dimensions); |
| if (actualSize > 0) { |
| changedShape = mDynamicTemporaries->redeclare(sourceOperandIndex, |
| from[i].dimensions, actualSize); |
| } else if (!from[i].isSufficient) { |
| NN_RET_CHECK(loc->paddedLength < UINT32_MAX / 2) |
| << "output#" << i << " paddedLength overflow"; |
| changedShape = mDynamicTemporaries->redeclare( |
| sourceOperandIndex, from[i].dimensions, 2 * loc->paddedLength); |
| } else { |
| // The combination of not-fully-specified dimensions |
| // and isSufficient means that we have no |
| // information about whether the size of the dynamic |
| // temporary is adequate. |
| VLOG(EXECUTION) << "updateOutputShapes skipping redeclaration for output#" << i; |
| if (executionResultCode == ANEURALNETWORKS_NO_ERROR) { |
| NN_RET_CHECK(isZeroSizedTensor(executionResultCode, from[i])); |
| // This is a zero-sized tensor, and by |
| // definition, any dynamic temporary is an input |
| // to an execution step. |
| update->zeroSizedInput = true; |
| } |
| } |
| if (changedShape) { |
| // TODO: find a better place for this comment. |
| // |
| // isUpdatable(a, b) imposes a partial ordering a <= |
| // b. Every fully specified dimensions vector is an |
| // upper bound of that ordering. Therefore, any |
| // change in dimensions moves towards an upper |
| // bound, and hence there are a finite number of |
| // such changes possible. |
| // |
| // actualSize can only be computed from dimensions |
| // that are an upper bound. Therefore, once |
| // actualSize is computed, it will not change. |
| // |
| // If dimensions are not fully specified, and |
| // estimated size changes, it increases. There is |
| // an upper bound on estimated size to avoid |
| // overflow. |
| // |
| // Therefore, if we retry only when dimensions or |
| // size change, and we stop retrying if we would |
| // otherwise overflow, we should only retry a finite |
| // number of times. |
| update->updatedDynamicTemporary = true; |
| } |
| } |
| mDynamicTemporaries->vlogDump("finished updateOutputShapes"); |
| } |
| } else { |
| NN_RET_CHECK_EQ(from.size(), to->size()); |
| for (uint32_t i = 0, e = from.size(); i < e; i++) { |
| NN_RET_CHECK(isUpdatable(to->at(i).dimensions, from[i].dimensions)); |
| (*to)[i] = from[i]; |
| } |
| } |
| return true; |
| } |
| |
| StepExecutor::StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model, |
| std::shared_ptr<Device> device, |
| std::shared_ptr<RuntimePreparedModel> preparedModel, bool reusable, |
| const ExecutionStep* step, DynamicTemporaries* dynamicTemporaries) |
| : mExecutionBuilder(executionBuilder), |
| mExecutionStep(step), |
| mDynamicTemporaries(dynamicTemporaries), |
| mModel(model), |
| mDevice(device), |
| mPreparedModel(preparedModel), |
| mInputs(model->inputCount()), |
| mOutputs(model->outputCount()), |
| mReusable(reusable) { |
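| // Either both or neither of step and dynamicTemporaries must be provided: they are |
| // non-null exactly when this executor runs one step of a compound plan. Reusable |
| // executions never carry dynamic temporaries. |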
| CHECK(mDevice != nullptr); |
| CHECK_EQ(step == nullptr, dynamicTemporaries == nullptr); |
| CHECK(!(reusable && dynamicTemporaries != nullptr)); |
| VLOG(EXECUTION) << "StepExecutor::StepExecutor with " << mInputs.size() << " inputs and " |
| << mOutputs.size() << " outputs"; |
| } |
| |
| bool StepExecutor::areDynamicTemporariesAllocated() const { |
| return !mDynamicTemporaries || mDynamicTemporaries->allocated(mExecutionStep->getIndex()); |
| } |
| |
| void StepExecutor::mapInputsAndOutputsTrivially() { |
| mInputs = mExecutionBuilder->mInputs; |
| mOutputs = mExecutionBuilder->mOutputs; |
| mMemories = mExecutionBuilder->mMemories; |
| } |
| |
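| // Copies an input/output binding from the ExecutionBuilder into this StepExecutor, |
| // remapping its memory pool index into this executor's pool list and, when |
| // builderDimensions is supplied, overriding the binding's dimensions with it. |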
| void StepExecutor::mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput, |
| ModelArgumentInfo* executorInputOrOutput, |
| const Dimensions* builderDimensions) { |
| auto updateDimensions = [executorInputOrOutput, builderDimensions] { |
| if (!builderDimensions) { |
| return; |
| } |
| executorInputOrOutput->dimensions() = *builderDimensions; |
| }; |
| |
| *executorInputOrOutput = builderInputOrOutput; |
| switch (executorInputOrOutput->state()) { |
| default: |
| CHECK(false) << "unexpected ModelArgumentInfo::state"; |
| break; |
| case ModelArgumentInfo::HAS_NO_VALUE: |
| case ModelArgumentInfo::UNSPECIFIED: |
| break; |
| case ModelArgumentInfo::POINTER: |
| updateDimensions(); |
| break; |
| case ModelArgumentInfo::MEMORY: { |
| updateDimensions(); |
| const uint32_t builderPoolIndex = builderInputOrOutput.locationAndLength().poolIndex; |
| const RuntimeMemory* memory = mExecutionBuilder->mMemories[builderPoolIndex]; |
| const uint32_t executorPoolIndex = mMemories.add(memory); |
| executorInputOrOutput->locationAndLength().poolIndex = executorPoolIndex; |
| break; |
| } |
| } |
| } |
| |
| int StepExecutor::setInputOrOutputFromMemory(const Operand& inputOrOutputOperand, |
| const RuntimeMemory* memory, uint32_t offset, |
| uint32_t length, const Dimensions& dimensions, |
| ModelArgumentInfo* inputOrOutputInfo) { |
| // Should be similar to |
| // ExecutionBuilder::setInputFromMemory() |
| // ExecutionBuilder::setOutputFromMemory() |
| |
| uint32_t poolIndex = mMemories.add(memory); |
| CHECK(inputOrOutputInfo->unspecified()); |
| int n; |
| std::tie(n, *inputOrOutputInfo) = |
| ModelArgumentInfo::createFromMemory(inputOrOutputOperand, |
| /*type=*/nullptr, poolIndex, offset, length); |
| if (n == ANEURALNETWORKS_NO_ERROR && dimensions.size()) { |
| CHECK(isUpdatable(inputOrOutputInfo->dimensions(), dimensions)); |
| inputOrOutputInfo->dimensions() = dimensions; |
| } |
| return n; |
| } |
| |
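| // Formats a dimensions vector as a parenthesized, comma-separated list; for |
| // example, {2, 3, 4} becomes "(2, 3, 4)". |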
| static std::string toString(const std::vector<uint32_t>& dimensions) { |
|     std::string ret = "("; |
|     bool wroteOne = false; |
|     for (uint32_t dimension : dimensions) { |
|         if (wroteOne) { |
|             ret += ", "; |
|         } else { |
|             wroteOne = true; |
|         } |
|         ret += std::to_string(dimension); |
|     } |
|     ret += ")"; |
|     return ret; |
| } |
| |
| static void logArguments(const char* kind, const std::vector<ModelArgumentInfo>& args) { |
| for (unsigned i = 0; i < args.size(); i++) { |
| const auto& arg = args[i]; |
| std::string prefix = kind + std::string("[") + std::to_string(i) + "] = "; |
| switch (arg.state()) { |
| case ModelArgumentInfo::POINTER: |
| VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ") dim" |
| << toString(arg.dimensions()); |
| break; |
| case ModelArgumentInfo::MEMORY: |
| VLOG(EXECUTION) << prefix << "MEMORY(" |
| << "pool=" << arg.locationAndLength().poolIndex << ", " |
| << "off=" << arg.locationAndLength().offset << ") dim" |
| << toString(arg.dimensions()); |
| break; |
| case ModelArgumentInfo::HAS_NO_VALUE: |
| VLOG(EXECUTION) << prefix << "HAS_NO_VALUE"; |
| break; |
| case ModelArgumentInfo::UNSPECIFIED: |
| VLOG(EXECUTION) << prefix << "UNSPECIFIED"; |
| break; |
| default: |
| VLOG(EXECUTION) << prefix << "state(" << arg.state() << ")"; |
| break; |
| } |
| } |
| } |
| |
| bool StepExecutor::isCpu() const { |
| return mDevice == DeviceManager::getCpuDevice(); |
| } |
| |
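| // Returns the reusable execution, creating and caching it on the first call |
| // so that the creation cost is paid only once across repeated computations. |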
| std::pair<int, std::shared_ptr<RuntimeExecution>> StepExecutor::getReusableExecution() { |
| CHECK(mReusable); |
| if (mExecution == nullptr) { |
| CHECK(mPreparedModel != nullptr); |
| const MeasureTiming measure = measureTiming(mExecutionBuilder); |
| const OptionalDuration loopTimeoutDuration = |
| makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration()); |
| auto [n, execution] = mPreparedModel->createReusableExecution( |
| mInputs, mOutputs, mMemories.getObjects(), measure, loopTimeoutDuration, |
| mExecutionBuilder->getMetadata()); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| return {n, nullptr}; |
| } |
| mExecution = std::move(execution); |
| } |
| return {ANEURALNETWORKS_NO_ERROR, mExecution}; |
| } |
| |
| std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::compute( |
| const OptionalTimePoint& deadline, const SharedBurst& burstController) { |
| if (VLOG_IS_ON(EXECUTION)) { |
| logArguments("input", mInputs); |
| logArguments("output", mOutputs); |
| } |
| |
| int n; |
| std::vector<OutputShape> outputShapes; |
| Timing timing; |
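|     // A reusable executor computes through its cached RuntimeExecution; a |
|     // one-shot executor calls into the prepared model directly. |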
| if (mReusable) { |
| auto [nCreate, execution] = getReusableExecution(); |
| if (nCreate != ANEURALNETWORKS_NO_ERROR) { |
| return {nCreate, {}, {}}; |
| } |
| std::tie(n, outputShapes, timing) = execution->compute(burstController, deadline); |
| } else { |
| CHECK(mPreparedModel != nullptr); |
| const MeasureTiming measure = measureTiming(mExecutionBuilder); |
| const OptionalDuration loopTimeoutDuration = |
| makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration()); |
| std::tie(n, outputShapes, timing) = mPreparedModel->execute( |
| mInputs, mOutputs, mMemories.getObjects(), burstController, measure, deadline, |
| loopTimeoutDuration, mExecutionBuilder->getMetadata()); |
| } |
| mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing); |
| return {n, std::move(outputShapes), std::move(timing)}; |
| } |
| |
| std::tuple<int, int, ExecuteFencedInfoCallback> StepExecutor::computeFenced( |
| const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence, |
| const OptionalTimePoint& deadline) { |
| if (VLOG_IS_ON(EXECUTION)) { |
| logArguments("input", mInputs); |
| logArguments("output", mOutputs); |
| } |
| |
| OptionalDuration optionalTimeoutDurationAfterFence; |
| if (timeoutDurationAfterFence > 0) { |
| optionalTimeoutDurationAfterFence = makeTimeoutDuration(timeoutDurationAfterFence); |
| } |
| |
| int n; |
| int syncFenceFd; |
| ExecuteFencedInfoCallback executeFencedInfoCallback; |
| Timing timing; |
| if (mReusable) { |
| auto [nCreate, execution] = getReusableExecution(); |
| if (nCreate != ANEURALNETWORKS_NO_ERROR) { |
| return {nCreate, -1, nullptr}; |
| } |
| std::tie(n, syncFenceFd, executeFencedInfoCallback, timing) = |
| execution->computeFenced(waitFor, deadline, optionalTimeoutDurationAfterFence); |
| } else { |
| CHECK(mPreparedModel != nullptr); |
| const MeasureTiming measure = measureTiming(mExecutionBuilder); |
| const OptionalDuration loopTimeoutDuration = |
| makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration()); |
| std::tie(n, syncFenceFd, executeFencedInfoCallback, timing) = mPreparedModel->executeFenced( |
| mInputs, mOutputs, mMemories.getObjects(), waitFor, measure, deadline, |
| loopTimeoutDuration, optionalTimeoutDurationAfterFence, |
| mExecutionBuilder->getMetadata()); |
| } |
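|     // With no sync fence and no fenced-execution callback, the execution has |
|     // already completed (successfully or not), so any timing information is |
|     // final and can be reported now; otherwise timing is retrieved later |
|     // through the callback. |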
| if (syncFenceFd < 0 && executeFencedInfoCallback == nullptr) { |
| mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing); |
| } |
| return {n, syncFenceFd, executeFencedInfoCallback}; |
| } |
| |
| // For cpuFallback{Partial,Full}, recompile the model on the CPU and then run |
| // the computation there. |
| std::tuple<int, std::vector<OutputShape>, Timing> StepExecutor::computeOnCpuFallback() { |
| NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "StepExecutor::computeOnCpuFallback"); |
| VLOG(EXECUTION) << "Re-compile the model on CPU"; |
| const ModelFactory makeModel = [this] { return mModel->makeModel(); }; |
| // TODO: Propagate user preference and compilation priority to this point instead of using |
| // default values of ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER and |
|     // ANEURALNETWORKS_PRIORITY_DEFAULT (which equals |
|     // ANEURALNETWORKS_PRIORITY_MEDIUM). |
| const ExecutionPreference preference = |
| static_cast<ExecutionPreference>(ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER); |
| const Priority priority = convertToCanonicalPriority(ANEURALNETWORKS_PRIORITY_DEFAULT); |
| auto [n, preparedModel] = DeviceManager::getCpuDevice()->prepareModel( |
| makeModel, preference, priority, {}, {}, {}, {}, {}); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| return {n, {}, {}}; |
| } |
| |
| // Prepare device memories for CPU fallback. |
| std::vector<const RuntimeMemory*> memories = mMemories.getObjects(); |
| std::vector<bool> isUsedAsInput(memories.size(), false); |
| std::vector<bool> isUsedAsOutput(memories.size(), false); |
| std::vector<std::unique_ptr<RuntimeMemory>> blobAhwbs; |
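|     // The BLOB mode buffers allocated below stand in for driver-allocated |
|     // device memories in `memories` during the fallback run; blobAhwbs keeps |
|     // them alive until the run (and any output write-back) completes. |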
| |
| // Mark the input and output usages. |
| for (auto& input : mInputs) { |
| if (input.state() == ModelArgumentInfo::MEMORY) { |
| const uint32_t poolIndex = input.locationAndLength().poolIndex; |
| isUsedAsInput[poolIndex] = true; |
| } |
| } |
| for (auto& output : mOutputs) { |
| if (output.state() == ModelArgumentInfo::MEMORY) { |
| const uint32_t poolIndex = output.locationAndLength().poolIndex; |
| // Cannot allocate output buffers with unknown shapes. |
| if (mMemories[poolIndex]->getValidator().createdWithUnknownShape()) { |
| LOG(ERROR) << "Cannot fallback to CPU because at least one of the output operands " |
| "has unknown shape."; |
| return {ANEURALNETWORKS_OP_FAILED, {}, {}}; |
| } |
| isUsedAsOutput[poolIndex] = true; |
| } |
| } |
| |
| // Allocate BLOB mode AHardwareBuffers and read the data from input device memories. |
| for (uint32_t i = 0; i < memories.size(); i++) { |
| const RuntimeMemory* memory = mMemories[i]; |
| if (memory->getIBuffer() != nullptr) { |
| const uint32_t size = memory->getValidator().getMetadata().logicalSize; |
| auto [nAhwb, blobAhwb] = MemoryRuntimeAHWB::create(size); |
| if (nAhwb != ANEURALNETWORKS_NO_ERROR) { |
| return {nAhwb, {}, {}}; |
| } |
| if (isUsedAsInput[i]) { |
| n = copyIBufferToMemory(memory->getIBuffer(), blobAhwb->getMemory()); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| return {n, {}, {}}; |
| } |
| } |
| memories[i] = blobAhwb.get(); |
| blobAhwbs.push_back(std::move(blobAhwb)); |
| } |
| } |
| |
| const MeasureTiming measure = measureTiming(mExecutionBuilder); |
| const OptionalDuration loopTimeoutDuration = |
| makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration()); |
| auto [nExecute, outputShapes, timing] = preparedModel->execute( |
| mInputs, mOutputs, memories, nullptr, measure, {}, loopTimeoutDuration, {}); |
| mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing); |
| if (nExecute != ANEURALNETWORKS_NO_ERROR) { |
| return {nExecute, std::move(outputShapes), timing}; |
| } |
| |
| // Write back to output device memories. |
| for (uint32_t i = 0; i < memories.size(); i++) { |
| const RuntimeMemory* memory = mMemories[i]; |
| if (memory->getIBuffer() != nullptr && isUsedAsOutput[i]) { |
| n = copyMemoryToIBuffer(memories[i]->getMemory(), memory->getIBuffer(), {}); |
| if (n != ANEURALNETWORKS_NO_ERROR) { |
| return {n, {}, {}}; |
| } |
| } |
| } |
| return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing}; |
| } |
| |
| } // namespace nn |
| } // namespace android |