1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef ANDROID_ML_NN_RUNTIME_EXECUTION_BUILDER_H 18 #define ANDROID_ML_NN_RUNTIME_EXECUTION_BUILDER_H 19 20 #include "Callbacks.h" 21 #include "HalInterfaces.h" 22 #include "Memory.h" 23 #include "ModelBuilder.h" 24 #include "NeuralNetworks.h" 25 #include "VersionedInterfaces.h" 26 27 #include <atomic> 28 #include <unordered_map> 29 #include <vector> 30 31 using ::android::hardware::neuralnetworks::V1_2::implementation::ExecutionCallback; 32 using ::android::hardware::neuralnetworks::V1_2::implementation::PreparedModelCallback; 33 34 namespace android { 35 namespace nn { 36 37 class BurstBuilder; 38 class CompilationBuilder; 39 class ExecutionPlan; 40 class ExecutionBurstController; 41 class ExecutionStep; 42 class Memory; 43 class ModelBuilder; 44 class StepExecutor; 45 class Device; 46 47 // TODO move length out of DataLocation 48 struct ModelArgumentInfo { 49 // Whether the argument was specified as being in a Memory, as a pointer, 50 // has no value, or has not been specified. 51 // If POINTER then: 52 // locationAndLength.length is valid. 53 // dimensions is valid. 54 // buffer is valid 55 // If MEMORY then: 56 // locationAndLength.{poolIndex, offset, length} is valid. 57 // dimensions is valid. 58 enum { POINTER, MEMORY, HAS_NO_VALUE, UNSPECIFIED } state = UNSPECIFIED; 59 DataLocation locationAndLength; 60 std::vector<uint32_t> dimensions; 61 void* buffer; 62 bool isSufficient = true; 63 64 int setFromPointer(const Operand& operand, const ANeuralNetworksOperandType* type, void* buffer, 65 uint32_t length); 66 int setFromMemory(const Operand& operand, const ANeuralNetworksOperandType* type, 67 uint32_t poolIndex, uint32_t offset, uint32_t length); 68 int setFromTemporaryMemory(const Operand& operand, uint32_t poolIndex, uint32_t offset, 69 uint32_t length); 70 int updateDimensionInfo(const Operand& operand, const ANeuralNetworksOperandType* newType); 71 }; 72 73 class ExecutionBuilder { 74 friend class StepExecutor; 75 public: 76 ExecutionBuilder(const CompilationBuilder* compilation); 77 78 int setInput(uint32_t index, const ANeuralNetworksOperandType* type, const void* buffer, 79 size_t length); 80 int setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type, 81 const Memory* memory, size_t offset, size_t length); 82 int setOutput(uint32_t index, const ANeuralNetworksOperandType* type, void* buffer, 83 size_t length); 84 int setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type, 85 const Memory* memory, size_t offset, size_t length); 86 87 int setMeasureTiming(bool measure); 88 89 int getDuration(int32_t durationCode, uint64_t* duration) const; 90 computeAsynchronously(sp<ExecutionCallback> * synchronizationCallback)91 int computeAsynchronously(sp<ExecutionCallback>* synchronizationCallback) { 92 CHECK(synchronizationCallback != nullptr); 93 return compute(synchronizationCallback); 94 } computeSynchronously()95 int computeSynchronously() { return compute(nullptr); } burstCompute(BurstBuilder * burst)96 int burstCompute(BurstBuilder* burst) { return compute(nullptr, burst); } 97 98 // Initialize output dimensional information from ModelArgumentInfo. 99 void initializeOutputShapes(std::vector<OutputShape>* outputShapes) const; 100 101 int getOutputOperandDimensions(uint32_t index, uint32_t* dimensions); 102 int getOutputOperandRank(uint32_t index, uint32_t* rank); 103 104 // Handshake with lower-level execution support measureTiming()105 bool measureTiming() const { return mMeasureTiming; } reportTiming(Timing timing)106 void reportTiming(Timing timing) { mTiming = timing; } 107 getCompilation()108 const CompilationBuilder* getCompilation() const { return mCompilation; } getModel()109 const ModelBuilder* getModel() const { return mModel; } 110 111 ErrorStatus finish(ErrorStatus error, const std::vector<OutputShape>& outputShapes); 112 113 private: 114 // If a callback is provided, then this is asynchronous. If a callback is 115 // not provided (i.e., is nullptr), then this is synchronous. 116 // 117 // If burst is provided, then the burst path will be used. If a burst is not 118 // provided (i.e., is nullptr), then a synchronous execution will occur. 119 // 120 // Providing both synchronizationCallback and burstBuilder is an error. 121 int compute(sp<ExecutionCallback>* synchronizationCallback, 122 BurstBuilder* burstBuilder = nullptr); 123 124 const CompilationBuilder* mCompilation; 125 126 // Update output dimensional information from OutputShape to ModelArgumentInfo. 127 bool updateOutputShapes(const std::vector<OutputShape>& outputShapes); 128 129 const ModelBuilder* mModel; 130 const ExecutionPlan* mPlan; 131 132 // This is a DeviceManager::kPartitioning* value captured from 133 // CompilationBuilder when the ExecutionBuilder is constructed. 134 uint32_t mPartitioning; 135 136 // The information we'll send to the driver about the inputs and outputs. 137 // Note that we build this in two steps: 138 // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element. 139 // If set from a pointer, don't set the location in the RequestArgument but store it 140 // instead in mInputBuffers or mOutputBuffers. 141 // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for 142 // the m*Buffers entries. Copy the input values into the shared memory. 143 // We do this to avoid creating a lot of shared memory objects if we have a lot of 144 // parameters specified via pointers. We also avoid copying in the case where 145 // some of the nodes will interpreted on the CPU anyway. 146 std::vector<ModelArgumentInfo> mInputs; 147 std::vector<ModelArgumentInfo> mOutputs; 148 MemoryTracker mMemories; 149 150 // Do we ask the driver to measure timing? 151 bool mMeasureTiming = false; 152 153 // Timing reported from the driver 154 Timing mTiming = {}; 155 156 // Properties cannot be set once the execution has started. 157 std::atomic_bool mStarted = false; 158 159 // Timing and output shapes can only be queried after the execution is 160 // finished. 161 std::atomic_bool mFinished = false; 162 }; 163 164 // class StepExecutor is used to execute a single "step" in a 165 // potentially multiple step execution process. The graph associated 166 // with that step is executed in its entirety on a single device (or 167 // on the CPU). 168 class StepExecutor { 169 public: 170 // executionBuilder 171 // Describes the full (possibly multiple-"step") execution. 172 // model 173 // The model to be executed by the executor. Possibly a 174 // submodel of the model from executionBuilder. 175 // driver, preparedModel 176 // The device on which to execute the "step", and the prepared 177 // model to execute on that device. (Both are nullptr in the 178 // case of CPU.) 179 StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model, 180 std::shared_ptr<Device> device, 181 std::shared_ptr<VersionedIPreparedModel> preparedModel); 182 183 // Map inputs and outputs from ExecutionBuilder to StepExecutor, 184 // in the case where we have a single-"step" execution (i.e., the executor 185 // is executing the entire model from the ExecutionBuilder). 186 void mapInputsAndOutputsTrivially(); 187 188 // Update output shapes returned from ExecutionCallback to ExecutionBuilder. 189 bool updateOutputShapes(const std::vector<OutputShape>& from, std::vector<OutputShape>* to); 190 191 // Map inputs and outputs from ExecutionBuilder to StepExecutor, 192 // one at a time. Note that these are input/output indexes, not 193 // operand indexes. mapInput(uint32_t builderIndex,uint32_t executorIndex)194 void mapInput(uint32_t builderIndex, uint32_t executorIndex) { 195 mapInputOrOutput(mExecutionBuilder->mInputs[builderIndex], &mInputs[executorIndex]); 196 } mapOutput(uint32_t builderIndex,uint32_t executorIndex)197 void mapOutput(uint32_t builderIndex, uint32_t executorIndex) { 198 mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mOutputs[executorIndex]); 199 } mapOutputToInput(uint32_t builderIndex,uint32_t executorIndex)200 void mapOutputToInput(uint32_t builderIndex, uint32_t executorIndex) { 201 mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], 202 &mInputs[executorIndex]); 203 } 204 205 // The input or output is assumed to have the size of the 206 // corresponding operand. setInputFromTemporaryMemory(uint32_t inputIndex,const Memory * memory,uint32_t offset)207 int setInputFromTemporaryMemory(uint32_t inputIndex, const Memory* memory, uint32_t offset) { 208 return setInputOrOutputFromTemporaryMemory(mModel->getInputOperand(inputIndex), 209 memory, offset, 210 &mInputs.at(inputIndex)); 211 } setOutputFromTemporaryMemory(uint32_t outputIndex,const Memory * memory,uint32_t offset)212 int setOutputFromTemporaryMemory(uint32_t outputIndex, const Memory* memory, uint32_t offset) { 213 return setInputOrOutputFromTemporaryMemory(mModel->getOutputOperand(outputIndex), 214 memory, offset, 215 &mOutputs.at(outputIndex)); 216 } 217 218 // Executes using the (driver, preparedModel) specified at construction time. 219 int startCompute(sp<ExecutionCallback>* synchronizationCallback, 220 const std::shared_ptr<ExecutionBurstController>& burstController = nullptr); 221 222 // Executes using the CPU, regardless of the (driver, 223 // preparedModel) specified at construction time. 224 int startComputeOnCpu(sp<ExecutionCallback>* synchronizationCallback); 225 226 bool isCpu() const; 227 228 // ExecutionStep has the index mapping between ExecutionBuilder and StepExecutor. setExecutionStep(const std::shared_ptr<const ExecutionStep> & step)229 void setExecutionStep(const std::shared_ptr<const ExecutionStep>& step) { 230 mExecutionStep = step; 231 } 232 233 private: 234 int allocatePointerArgumentsToPool(std::vector<ModelArgumentInfo>* args, Memory* memory); 235 int startComputeOnDevice(sp<ExecutionCallback>* synchronizationCallback, 236 const std::shared_ptr<ExecutionBurstController>& burstController); 237 238 void mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput, 239 ModelArgumentInfo* executorInputOrOutput); 240 241 int setInputOrOutputFromTemporaryMemory(const Operand& inputOrOutputOperand, 242 const Memory* memory, uint32_t offset, 243 ModelArgumentInfo* inputOrOutputInfo); 244 245 // describes the full (possibly multiple-"step") execution 246 ExecutionBuilder* mExecutionBuilder; 247 248 // describes the single execution step 249 std::shared_ptr<const ExecutionStep> mExecutionStep = nullptr; 250 251 // model to be executed on the executor, in both original and 252 // compiled forms; and device on which to execute it 253 const ModelBuilder* mModel; 254 std::shared_ptr<Device> mDevice; 255 std::shared_ptr<VersionedIPreparedModel> 256 mPreparedModel; // nullptr if CPU execution or if bypassing ExecutionPlan 257 258 // The information we'll send to the driver about the inputs and outputs. 259 // Note that we build this in two steps: 260 // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element. 261 // If set from a pointer, don't set the location in the RequestArgument but store it 262 // instead in mInputBuffers or mOutputBuffers. 263 // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for 264 // the m*Buffers entries. Copy the input values into the shared memory. 265 // We do this to avoid creating a lot of shared memory objects if we have a lot of 266 // parameters specified via pointers. We also avoid copying in the case where 267 // some of the nodes will interpreted on the CPU anyway. 268 std::vector<ModelArgumentInfo> mInputs; 269 std::vector<ModelArgumentInfo> mOutputs; 270 MemoryTracker mMemories; 271 }; 272 273 } // namespace nn 274 } // namespace android 275 276 #endif // ANDROID_ML_NN_RUNTIME_EXECUTION_BUILDER_H 277