/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_PACKAGES_MODULES_NEURALNETWORKS_RUNTIME_EXECUTION_BUILDER_H
#define ANDROID_PACKAGES_MODULES_NEURALNETWORKS_RUNTIME_EXECUTION_BUILDER_H

#include <ControlFlow.h>
#include <CpuExecutor.h>
#include <android-base/thread_annotations.h>
#include <nnapi/IBurst.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/Types.h>
#include <nnapi/Validation.h>

#include <memory>
#include <mutex>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "ExecutionCallback.h"
#include "Memory.h"
#include "ModelArgumentInfo.h"
#include "ModelBuilder.h"
#include "NeuralNetworks.h"

namespace android {
namespace nn {

class BurstBuilder;
class CompilationBuilder;
class Device;
class DynamicTemporaries;
class ExecutionPlan;
class ExecutionStep;
class ModelBuilder;
class RuntimeMemory;
class RuntimePreparedModel;
class RuntimeExecution;
class StepExecutor;

// Execution modes
enum class ExecutionMode { ASYNC, SYNC, BURST, ASYNC_WITH_DEPS };

class ExecutionBuilder {
    friend class StepExecutor;

   public:
    explicit ExecutionBuilder(const CompilationBuilder* compilation);
    virtual ~ExecutionBuilder() = default;

    int setInput(uint32_t index, const ANeuralNetworksOperandType* type, const void* buffer,
                 size_t length);
    int setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                           const RuntimeMemory* memory, size_t offset, size_t length);
    int setOutput(uint32_t index, const ANeuralNetworksOperandType* type, void* buffer,
                  size_t length);
    int setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                            const RuntimeMemory* memory, size_t offset, size_t length);

    int setMeasureTiming(bool measure);

    int getDuration(int32_t durationCode, uint64_t* duration) const;

    int setTimeoutDuration(uint64_t duration);

    std::optional<uint64_t> getTimeoutDuration() const;

    int setLoopTimeout(uint64_t duration);

    uint64_t getLoopTimeoutDuration() const { return mLoopTimeoutDuration; }

    int enableInputAndOutputPadding(bool enable);

    int setReusable(bool reusable);

    int addExtensionAttribute(const char* extensionName, uint16_t attributeCodeWithinExtension,
                              const void* data, size_t length);

    int computeFenced(const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
                      int* sync_fence);

    int computeAsynchronously(std::shared_ptr<ExecutionCallback>* synchronizationCallback) {
        CHECK(synchronizationCallback != nullptr);
        return compute(synchronizationCallback);
    }
    int computeSynchronously() { return compute(nullptr); }
    int burstCompute(BurstBuilder* burst) { return compute(nullptr, burst); }
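
    // Illustrative client-side sketch (not part of this class): the public
    // NNAPI C functions are expected to funnel into the entry points above.
    // For the synchronous path, a caller would roughly do the following
    // (error handling omitted; buffer sizes are made-up placeholders):
    //
    //     float input[64], output[16];
    //     ANeuralNetworksExecution_setInput(execution, 0, nullptr, input, sizeof(input));
    //     ANeuralNetworksExecution_setOutput(execution, 0, nullptr, output, sizeof(output));
    //     ANeuralNetworksExecution_setMeasureTiming(execution, true);
    //     ANeuralNetworksExecution_compute(execution);  // -> computeSynchronously()
    //     uint64_t durationNs;
    //     ANeuralNetworksExecution_getDuration(execution, ANEURALNETWORKS_DURATION_ON_HARDWARE,
    //                                          &durationNs);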

    // Initialize output dimensional information from ModelArgumentInfo.
    std::vector<OutputShape> getInitialOutputShapes() const;

    int getOutputOperandDimensions(uint32_t index, uint32_t* dimensions);
    int getOutputOperandRank(uint32_t index, uint32_t* rank);

    // Handshake with lower-level execution support
    bool measureTiming() const { return mMeasureTiming; }
    void reportTimingWithoutFencedExecutionCallback(Timing timing) {
        mTimingWithoutFencedExecutionCallback = timing;
    }

    const CompilationBuilder* getCompilation() const { return mCompilation; }
    const ModelBuilder* getModel() const { return mModel; }
    const ModelBuilder* getSourceModel(uint32_t index) const;
    const Operand& getSourceOperand(
            const std::pair<uint32_t, uint32_t>& sourceOperandIndex) const {
        return getSourceModel(sourceOperandIndex.first)->getOperand(sourceOperandIndex.second);
    }

    // This method will be called at the end of all computation paths to change the state
    // of the execution object and update output shapes / memories.
    int finishComputation(int result, const std::vector<OutputShape>& outputShapes,
                          ExecutionMode mode);
    ErrorStatus finishComputation(ErrorStatus error, const std::vector<OutputShape>& outputShapes,
                                  ExecutionMode mode) {
        const int result =
                finishComputation(convertErrorStatusToResultCode(error), outputShapes, mode);
        return convertResultCodeToErrorStatus(result);
    }

    const ExecuteFencedInfoCallback& getExecuteFencedInfoCallback() {
        return mFencedExecutionCallback;
    }

    bool inFlight() const {
        std::lock_guard<std::mutex> lock(mStateMutex);
        return mState == State::COMPUTATION;
    }
    bool completed() const {
        std::lock_guard<std::mutex> lock(mStateMutex);
        return mState == State::COMPLETED;
    }

    // Retrieve a computation start point
    TimePoint getComputeStartTimePoint() const;

    const ModelArgumentInfo& getInputInfo(uint32_t index) const { return mInputs[index]; }
    const ModelArgumentInfo& getOutputInfo(uint32_t index) const { return mOutputs[index]; }

    std::optional<RunTimePoolInfo> getRunTimePoolInfo(uint32_t poolIndex) const {
        return mMemories[poolIndex]->getRunTimePoolInfo();
    }

    const std::vector<TokenValuePair>& getMetadata() const { return mMetadata; }

   protected:
    // If a callback is provided, then this is asynchronous. If a callback is
    // not provided (i.e., is nullptr), then this is synchronous.
    //
    // If burst is provided, then the burst path will be used. If a burst is not
    // provided (i.e., is nullptr), then a synchronous execution will occur.
    //
    // Providing both synchronizationCallback and burstBuilder is an error.
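    //
    // In other words (a summary of the entry points declared above):
    //     compute(&callback)        - asynchronous execution (computeAsynchronously)
    //     compute(nullptr)          - synchronous execution (computeSynchronously)
    //     compute(nullptr, burst)   - burst execution (burstCompute)
    //     compute(&callback, burst) - error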
    int compute(std::shared_ptr<ExecutionCallback>* synchronizationCallback,
                BurstBuilder* burstBuilder = nullptr);

    virtual std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) = 0;

    virtual std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline) = 0;

    // This method handles the common preparation and validation logic of compute and
    // computeFenced. It will be called at the start of every computation.
    int prepareForCompute(const char* name, ExecutionMode mode);

    const CompilationBuilder* mCompilation;

    // Update output dimensional information from OutputShape to ModelArgumentInfo.
    bool updateOutputShapes(ErrorStatus status, const std::vector<OutputShape>& outputShapes);

    bool updateMemories();

    const ModelBuilder* mModel;
    const ExecutionPlan* mPlan;

    // Whether CPU fallback is allowed based on the value of DeviceManager::kPartitioning*
    // captured from CompilationBuilder when the ExecutionBuilder is constructed.
    bool mAllowCpuFallback;

    // The information we'll send to the driver about the inputs and outputs.
    // Note that we build this in two steps:
    // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
    //    If set from a pointer, don't set the location in the Request::Argument but store it
    //    instead in mInputBuffers or mOutputBuffers.
    // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
    //    the m*Buffers entries. Copy the input values into the shared memory.
    // We do this to avoid creating a lot of shared memory objects if we have a lot of
    // parameters specified via pointers. We also avoid copying in the case where
    // some of the nodes will be interpreted on the CPU anyway.
    std::vector<ModelArgumentInfo> mInputs;
    std::vector<ModelArgumentInfo> mOutputs;
    MemoryTracker mMemories;

    // Do we ask the driver to measure timing?
    bool mMeasureTiming = false;

    // Timepoint of computation start, used to evaluate timing
    // from the runtime's perspective.
    TimePoint mComputeStartTimePoint;

    // Timing reported from the driver. This field is only used if
    // mFencedExecutionCallback is nullptr.
    Timing mTimingWithoutFencedExecutionCallback = {};

    // Amount of time to complete or abort the execution.
    std::optional<uint64_t> mTimeoutDuration;

    // Amount of time to complete or abort a loop.
    uint64_t mLoopTimeoutDuration = operation_while::kTimeoutNsDefault;

    // The state of the execution.
    // Properties can only be set when the execution is in the state State::PREPARATION.
    // Timing and output shapes can only be queried when the execution is in the state
    // State::COMPLETED.
    enum class State { PREPARATION, COMPUTATION, COMPLETED };
    State mState GUARDED_BY(mStateMutex) = State::PREPARATION;
    bool computationStarted() const {
        std::lock_guard<std::mutex> lock(mStateMutex);
        return mState != State::PREPARATION;
    }

    // Mutex to guard mState. Note that this is not strictly needed because we provide
    // no thread-safety guarantee to the ANeuralNetworksExecution object.
    mutable std::mutex mStateMutex;
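
    // Rough lifecycle sketch relating the state members above to the compute
    // paths (inferred from the comments in this header rather than a literal
    // transcription of the implementation):
    //
    //     State::PREPARATION  - set{Input,Output}[FromMemory], setMeasureTiming, etc. are legal.
    //         |   prepareForCompute() validates the request and, presumably via
    //         |   checkAndSetComputationState() below, advances to
    //         v
    //     State::COMPUTATION  - computeInternal() / computeFencedInternal() runs.
    //         |   finishComputation() records mCompletion, updates output shapes
    //         |   and memories, and advances to
    //         v
    //     State::COMPLETED    - getDuration() and getOutputOperandDimensions()/Rank() are legal.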

    // Return false if the execution is in a bad state for starting computation.
    // Otherwise, return true and set the state to State::COMPUTATION.
    bool checkAndSetComputationState(const char* name);

    // With what error status has execution completed?
    enum class Completion { NO_ERROR, OUTPUT_INSUFFICIENT_SIZE, OTHER_ERROR };
    Completion mCompletion = Completion::OTHER_ERROR;
    Completion completedWith() const {
        CHECK(completed());
        return mCompletion;
    }

    // The result code of request validation.
    // It is evaluated only once, the first time it is needed.
    std::optional<int> mValidationResultCode;
    int getValidationResultCode();

    // Does every tensor output operand of the model have a fully specified shape?
    // It is evaluated only once, the first time it is needed.
    std::optional<bool> mOutputsFullySpecified;
    bool areOutputsFullySpecified();

    // The callback used to query execution related info in the case of fenced
    // execution; otherwise, nullptr. If the execution plan has multiple steps,
    // this is the callback associated with the last step. If the last step
    // doesn't support fenced execution (e.g., the driver is too old), or if the
    // launch of execution on the driver fails, then this callback will be
    // nullptr.
    ExecuteFencedInfoCallback mFencedExecutionCallback;

    // Whether set{Input,Output}[FromMemory] can accept padded length or not.
    bool mInputAndOutputPaddingEnabled = false;

    // enableInputAndOutputPadding may only be called before any call of
    // set{Input,Output}[FromMemory].
    bool mHasCalledSetInputOutput = false;

    // Can compute APIs be invoked multiple times on the execution object?
    bool mReusable = false;

    // Vendor-specific metadata
    std::vector<TokenValuePair> mMetadata;
};

// For an execution plan with a SIMPLE body, i.e., the whole model is executed on a single device.
class SimpleExecutionBuilder : public ExecutionBuilder {
   public:
    SimpleExecutionBuilder(const CompilationBuilder* compilation);

    std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;

    std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline) override;

   private:
    std::shared_ptr<StepExecutor> mExecutor;
};

// For an execution plan with a COMPOUND body, i.e., a partitioned execution with multiple steps.
class CompoundExecutionBuilder : public ExecutionBuilder {
   public:
    CompoundExecutionBuilder(const CompilationBuilder* compilation);

    std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;

    std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline) override;
};
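
// For example (illustrative): a model that a single accelerator can run in its
// entirety compiles to a SIMPLE plan and is executed by a SimpleExecutionBuilder
// driving one StepExecutor, whereas a model partitioned between an accelerator
// and the CPU compiles to a COMPOUND plan and is executed by a
// CompoundExecutionBuilder driving one StepExecutor per partition ("step").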

// class StepExecutor is used to execute a single "step" in a
// potentially multiple-step execution process. The graph associated
// with that step is executed in its entirety on a single device (or
// on the CPU).
class StepExecutor {
   public:
    // executionBuilder
    //     Describes the full (possibly multiple-"step") execution.
    // model
    //     The model to be executed by the executor. Possibly a single
    //     "step" model of a multiple-"step" executionBuilder.
    // driver, preparedModel
    //     The device on which to execute the "step", and the prepared
    //     model to execute on that device. For a non-fallback StepExecutor,
    //     neither is nullptr; for a fallback StepExecutor, both are ignored in
    //     StepExecutor::computeOnCpuFallback and may be nullptr.
    // reusable
    //     If true, multiple StepExecutor::compute/computeFenced may be called on this
    //     object; otherwise, only one StepExecutor::compute/computeFenced may be called.
    //     reusable must be false if mDynamicTemporaries != nullptr.
    // step
    //     Contains the output index mapping from the excerpted "step" model to the
    //     main model if the execution has multiple "steps". Must be nullptr otherwise.
    //     (step == nullptr) == (dynamicTemporaries == nullptr)
    // dynamicTemporaries
    //     If the execution has multiple "steps", describes the temporaries
    //     of source models that do not have fully specified types and are outputs
    //     of "step" models. Must be nullptr otherwise.
    //     (step == nullptr) == (dynamicTemporaries == nullptr)
    StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
                 std::shared_ptr<Device> device,
                 std::shared_ptr<RuntimePreparedModel> preparedModel, bool reusable,
                 const ExecutionStep* step = nullptr,
                 DynamicTemporaries* dynamicTemporaries = nullptr);

    // Map inputs and outputs from ExecutionBuilder to StepExecutor,
    // in the case where we have a single-"step" execution (i.e., the executor
    // is executing the entire model from the ExecutionBuilder).
    void mapInputsAndOutputsTrivially();

    // Update output shapes with shapes returned from execution.
    struct UpdateOutputShapes {
        // These fields are meaningless unless updateOutputShapes() returns true.
        bool updatedDynamicTemporary;  // did shape (dimensions, size) information change for at
                                       // least one dynamic temporary?
        bool mainOutputInsufficient;   // is at least one main model output written by this
                                       // execution marked !isSufficient?
        bool zeroSizedInput;  // is at least one output of this execution step a zero-sized
                              // tensor that needs to be read by some other step of the same
                              // execution?
    };
    bool updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from,
                            std::vector<OutputShape>* to, UpdateOutputShapes* update);

    // Map inputs and outputs from ExecutionBuilder to StepExecutor,
    // one at a time. Note that these are input/output indexes, not
    // operand indexes.
    //
    // For mapOutputToInput(), outputDimensions may be nullptr if the input
    // operand has fully specified dimensions.
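    //
    // For example (illustrative): mapInput(1, 0) makes input 1 of the full
    // execution (the argument set via ExecutionBuilder::setInput with index 1)
    // become input 0 of this step's model; the operand indexes within the two
    // models are unrelated to these values.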
    void mapInput(uint32_t builderIndex, uint32_t executorIndex) {
        mapInputOrOutput(mExecutionBuilder->mInputs[builderIndex], &mInputs[executorIndex]);
    }
    void mapOutput(uint32_t builderIndex, uint32_t executorIndex) {
        mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mOutputs[executorIndex]);
    }
    void mapOutputToInput(uint32_t builderIndex, uint32_t executorIndex,
                          const Dimensions* outputDimensions) {
        mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mInputs[executorIndex],
                         outputDimensions);
    }

    // dimensions must either have zero rank or must be
    // consistent with and at least as well specified as operand dimensions
    // (i.e., either rank must match, or operand rank must be zero; and for each
    // individual dimension, either dimension must match, or operand dimension
    // must be zero).
    int setInputFromMemory(uint32_t inputIndex, const RuntimeMemory* memory, uint32_t offset,
                           uint32_t length, const Dimensions& dimensions = {}) {
        return setInputOrOutputFromMemory(mModel->getInputOperand(inputIndex), memory, offset,
                                          length, dimensions, &mInputs.at(inputIndex));
    }
    int setOutputFromMemory(uint32_t outputIndex, const RuntimeMemory* memory, uint32_t offset,
                            uint32_t length, const Dimensions& dimensions = {}) {
        return setInputOrOutputFromMemory(mModel->getOutputOperand(outputIndex), memory, offset,
                                          length, dimensions, &mOutputs.at(outputIndex));
    }

    // Executes using the (driver, preparedModel) specified at construction time.
    std::tuple<int, std::vector<OutputShape>, Timing> compute(
            const OptionalTimePoint& deadline, const SharedBurst& burstController = nullptr);

    // Re-compiles and executes using the CPU, regardless of the (driver,
    // preparedModel) specified at construction time.
    std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpuFallback();

    bool isCpu() const;

    // Perform fenced execution and return error_code, sync_fence_fd and a
    // callback.
    std::tuple<int, int, ExecuteFencedInfoCallback> computeFenced(
            const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline);

    // Do the dynamic temporaries defined by this step have valid allocations?
    // (true if there are no dynamic temporaries defined by this step.)
    bool areDynamicTemporariesAllocated() const;

   private:
    // builderDimensions may be nullptr if executorInputOrOutput has fully
    // specified dimensions.
    void mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
                          ModelArgumentInfo* executorInputOrOutput,
                          const Dimensions* builderDimensions = nullptr);

    // dimensions must either have zero rank or
    // must be consistent with and at least as well specified as operand
    // dimensions (i.e., either rank must match, or operand rank must be zero;
    // and for each individual dimension, either dimension must match, or
    // operand dimension must be zero).
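    //
    // For example (illustrative): for an operand declared with dimensions
    // {4, 0, 2} (rank 3, second dimension unspecified), passing {} or {4, 3, 2}
    // is acceptable, whereas {4, 3} (rank mismatch) and {5, 3, 2} (contradicts
    // the specified dimension 4) are not.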
    int setInputOrOutputFromMemory(const Operand& inputOrOutputOperand, const RuntimeMemory* memory,
                                   uint32_t offset, uint32_t length, const Dimensions& dimensions,
                                   ModelArgumentInfo* inputOrOutputInfo);

    // describes the full (possibly multiple-"step") execution
    ExecutionBuilder* mExecutionBuilder;

    // describes the single execution step
    const ExecutionStep* mExecutionStep;

    // describes the dynamic temporaries
    DynamicTemporaries* mDynamicTemporaries;

    // model to be executed on the executor, in both original and
    // compiled forms; and device on which to execute it
    const ModelBuilder* mModel;
    std::shared_ptr<Device> mDevice;
    std::shared_ptr<RuntimePreparedModel> mPreparedModel;

    // The reusable execution used to launch multiple computations.
    // It is created only once, the first time it is needed.
    std::shared_ptr<RuntimeExecution> mExecution;
    // Returns {NO_ERROR, execution} on success, or {result_code, nullptr} on failure.
    std::pair<int, std::shared_ptr<RuntimeExecution>> getReusableExecution();

    // The information we'll send to the driver about the inputs and outputs.
    // Note that we build this in two steps:
    // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
    //    If set from a pointer, don't set the location in the Request::Argument but store it
    //    instead in mInputBuffers or mOutputBuffers.
    // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
    //    the m*Buffers entries. Copy the input values into the shared memory.
    // We do this to avoid creating a lot of shared memory objects if we have a lot of
    // parameters specified via pointers. We also avoid copying in the case where
    // some of the nodes will be interpreted on the CPU anyway.
    std::vector<ModelArgumentInfo> mInputs;
    std::vector<ModelArgumentInfo> mOutputs;
    MemoryTracker mMemories;

    // Whether compute/computeFenced may be invoked multiple times.
    bool mReusable = false;
};

std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes);

}  // namespace nn
}  // namespace android

#endif  // ANDROID_PACKAGES_MODULES_NEURALNETWORKS_RUNTIME_EXECUTION_BUILDER_H