1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ANDROID_PACKAGES_MODULES_NEURALNETWORKS_RUNTIME_EXECUTION_BUILDER_H
18 #define ANDROID_PACKAGES_MODULES_NEURALNETWORKS_RUNTIME_EXECUTION_BUILDER_H
19 
#include <ControlFlow.h>
#include <CpuExecutor.h>
#include <android-base/thread_annotations.h>
#include <nnapi/IBurst.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/Types.h>
#include <nnapi/Validation.h>

#include <memory>
#include <mutex>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "ExecutionCallback.h"
#include "Memory.h"
#include "ModelArgumentInfo.h"
#include "ModelBuilder.h"
#include "NeuralNetworks.h"
40 
41 namespace android {
42 namespace nn {
43 
// Forward declarations of types that this header only refers to through
// pointers, references, or smart pointers.
class BurstBuilder;
class CompilationBuilder;
class Device;
class DynamicTemporaries;
class ExecutionPlan;
class ExecutionStep;
class ModelBuilder;
class RuntimeExecution;
class RuntimeMemory;
class RuntimePreparedModel;
class StepExecutor;
55 
// Execution modes.  One of these is passed to prepareForCompute() and
// finishComputation() to identify the computation path in use.
enum class ExecutionMode {
    ASYNC,
    SYNC,
    BURST,
    ASYNC_WITH_DEPS,
};
58 
59 class ExecutionBuilder {
60     friend class StepExecutor;
61 
62    public:
63     explicit ExecutionBuilder(const CompilationBuilder* compilation);
64     virtual ~ExecutionBuilder() = default;
65 
66     int setInput(uint32_t index, const ANeuralNetworksOperandType* type, const void* buffer,
67                  size_t length);
68     int setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
69                            const RuntimeMemory* memory, size_t offset, size_t length);
70     int setOutput(uint32_t index, const ANeuralNetworksOperandType* type, void* buffer,
71                   size_t length);
72     int setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
73                             const RuntimeMemory* memory, size_t offset, size_t length);
74 
75     int setMeasureTiming(bool measure);
76 
77     int getDuration(int32_t durationCode, uint64_t* duration) const;
78 
79     int setTimeoutDuration(uint64_t duration);
80 
81     std::optional<uint64_t> getTimeoutDuration() const;
82 
83     int setLoopTimeout(uint64_t duration);
84 
getLoopTimeoutDuration()85     uint64_t getLoopTimeoutDuration() const { return mLoopTimeoutDuration; }
86 
87     int enableInputAndOutputPadding(bool enable);
88 
89     int setReusable(bool reusable);
90 
91     int addExtensionAttribute(const char* extensionName, uint16_t attributeCodeWithinExtension,
92                               const void* data, size_t length);
93 
94     int computeFenced(const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
95                       int* sync_fence);
96 
computeAsynchronously(std::shared_ptr<ExecutionCallback> * synchronizationCallback)97     int computeAsynchronously(std::shared_ptr<ExecutionCallback>* synchronizationCallback) {
98         CHECK(synchronizationCallback != nullptr);
99         return compute(synchronizationCallback);
100     }
computeSynchronously()101     int computeSynchronously() { return compute(nullptr); }
burstCompute(BurstBuilder * burst)102     int burstCompute(BurstBuilder* burst) { return compute(nullptr, burst); }
103 
104     // Initialize output dimensional information from ModelArgumentInfo.
105     std::vector<OutputShape> getInitialOutputShapes() const;
106 
107     int getOutputOperandDimensions(uint32_t index, uint32_t* dimensions);
108     int getOutputOperandRank(uint32_t index, uint32_t* rank);
109 
110     // Handshake with lower-level execution support
measureTiming()111     bool measureTiming() const { return mMeasureTiming; }
reportTimingWithoutFencedExecutionCallback(Timing timing)112     void reportTimingWithoutFencedExecutionCallback(Timing timing) {
113         mTimingWithoutFencedExecutionCallback = timing;
114     }
115 
getCompilation()116     const CompilationBuilder* getCompilation() const { return mCompilation; }
getModel()117     const ModelBuilder* getModel() const { return mModel; }
118     const ModelBuilder* getSourceModel(uint32_t index) const;
getSourceOperand(const std::pair<uint32_t,uint32_t> & sourceOperandIndex)119     const Operand& getSourceOperand(const std::pair<uint32_t, uint32_t>& sourceOperandIndex) const {
120         return getSourceModel(sourceOperandIndex.first)->getOperand(sourceOperandIndex.second);
121     }
122 
123     // This method will be called at the end of all computation paths to change the state
124     // of the execution object and update output shapes / memories.
125     int finishComputation(int result, const std::vector<OutputShape>& outputShapes,
126                           ExecutionMode mode);
finishComputation(ErrorStatus error,const std::vector<OutputShape> & outputShapes,ExecutionMode mode)127     ErrorStatus finishComputation(ErrorStatus error, const std::vector<OutputShape>& outputShapes,
128                                   ExecutionMode mode) {
129         const int result =
130                 finishComputation(convertErrorStatusToResultCode(error), outputShapes, mode);
131         return convertResultCodeToErrorStatus(result);
132     }
133 
getExecuteFencedInfoCallback()134     const ExecuteFencedInfoCallback& getExecuteFencedInfoCallback() {
135         return mFencedExecutionCallback;
136     }
137 
inFlight()138     bool inFlight() const {
139         std::lock_guard<std::mutex> lock(mStateMutex);
140         return mState == State::COMPUTATION;
141     }
completed()142     bool completed() const {
143         std::lock_guard<std::mutex> lock(mStateMutex);
144         return mState == State::COMPLETED;
145     }
146 
147     // Retrieve a computation start point
148     TimePoint getComputeStartTimePoint() const;
149 
getInputInfo(uint32_t index)150     const ModelArgumentInfo& getInputInfo(uint32_t index) const { return mInputs[index]; }
getOutputInfo(uint32_t index)151     const ModelArgumentInfo& getOutputInfo(uint32_t index) const { return mOutputs[index]; }
152 
getRunTimePoolInfo(uint32_t poolIndex)153     std::optional<RunTimePoolInfo> getRunTimePoolInfo(uint32_t poolIndex) const {
154         return mMemories[poolIndex]->getRunTimePoolInfo();
155     }
156 
getMetadata()157     const std::vector<TokenValuePair>& getMetadata() const { return mMetadata; }
158 
159    protected:
160     // If a callback is provided, then this is asynchronous. If a callback is
161     // not provided (i.e., is nullptr), then this is synchronous.
162     //
163     // If burst is provided, then the burst path will be used. If a burst is not
164     // provided (i.e., is nullptr), then a synchronous execution will occur.
165     //
166     // Providing both synchronizationCallback and burstBuilder is an error.
167     int compute(std::shared_ptr<ExecutionCallback>* synchronizationCallback,
168                 BurstBuilder* burstBuilder = nullptr);
169 
170     virtual std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
171             const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) = 0;
172 
173     virtual std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
174             const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
175             const OptionalTimePoint& deadline) = 0;
176 
177     // This method handles the common preparation and validation logic of compute and computeFenced.
178     // It will be called at the start of every computation.
179     int prepareForCompute(const char* name, ExecutionMode mode);
180 
181     const CompilationBuilder* mCompilation;
182 
183     // Update output dimensional information from OutputShape to ModelArgumentInfo.
184     bool updateOutputShapes(ErrorStatus status, const std::vector<OutputShape>& outputShapes);
185 
186     bool updateMemories();
187 
188     const ModelBuilder* mModel;
189     const ExecutionPlan* mPlan;
190 
191     // Whether CPU fallback is allowed based on the value of DeviceManager::kPartitioning* captured
192     // from CompilationBuilder when the ExecutionBuilder is constructed.
193     bool mAllowCpuFallback;
194 
195     // The information we'll send to the driver about the inputs and outputs.
196     // Note that we build this in two steps:
197     // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
198     //    If set from a pointer, don't set the location in the Request::Argument but store it
199     //    instead in mInputBuffers or mOutputBuffers.
200     // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
201     //    the m*Buffers entries.  Copy the input values into the shared memory.
202     // We do this to avoid creating a lot of shared memory objects if we have a lot of
203     // parameters specified via pointers.  We also avoid copying in the case where
204     // some of the nodes will interpreted on the CPU anyway.
205     std::vector<ModelArgumentInfo> mInputs;
206     std::vector<ModelArgumentInfo> mOutputs;
207     MemoryTracker mMemories;
208 
209     // Do we ask the driver to measure timing?
210     bool mMeasureTiming = false;
211 
212     // Timepoint of computation start, used to evaluate timing
213     // from runtime perspective
214     TimePoint mComputeStartTimePoint;
215 
216     // Timing reported from the driver.  This field is only used if
217     // mFencedExecutionCallback is nullptr.
218     Timing mTimingWithoutFencedExecutionCallback = {};
219 
220     // Amount of time to complete or abort the execution.
221     std::optional<uint64_t> mTimeoutDuration;
222 
223     // Amount of time to complete or abort a loop.
224     uint64_t mLoopTimeoutDuration = operation_while::kTimeoutNsDefault;
225 
226     // The state of the execution.
227     // Properties can only been set when the execution is in the state State::PREPARATION.
228     // Timing and output shapes can only be queried when the execution is in the state
229     // State::COMPLETED.
230     enum class State { PREPARATION, COMPUTATION, COMPLETED };
231     State mState GUARDED_BY(mStateMutex) = State::PREPARATION;
computationStarted()232     bool computationStarted() const {
233         std::lock_guard<std::mutex> lock(mStateMutex);
234         return mState != State::PREPARATION;
235     }
236 
237     // Mutex to guard mState. Note that this not strictly needed because we provide
238     // no thread-safety guarantee to the ANeuralNetworksExecution object.
239     mutable std::mutex mStateMutex;
240 
241     // Return false if the execution is in a bad state for starting computation.
242     // Otherwise, return true and set the state to State::COMPUTATION.
243     bool checkAndSetComputationState(const char* name);
244 
245     // With what error status has execution completed?
246     enum class Completion { NO_ERROR, OUTPUT_INSUFFICIENT_SIZE, OTHER_ERROR };
247     Completion mCompletion = Completion::OTHER_ERROR;
completedWith()248     Completion completedWith() const {
249         CHECK(completed());
250         return mCompletion;
251     }
252 
253     // The result code of request validation.
254     // It is only evaluated once at the first time it's needed.
255     std::optional<int> mValidationResultCode;
256     int getValidationResultCode();
257 
258     // Does every tensor output operand of the model have a fully specified shape?
259     // It is only evaluated once at the first time it's needed.
260     std::optional<bool> mOutputsFullySpecified;
261     bool areOutputsFullySpecified();
262 
263     // The callback used to query execution related info in the case of fenced
264     // execution; otherwise, nullptr.  If the execution plan has multiple steps,
265     // this is the callback associated with the last step.  If the last step
266     // doesn't support fenced execution (e.g., the driver is too old), or if the
267     // launch of execution on the driver fails, then this callback will be
268     // nullptr.
269     ExecuteFencedInfoCallback mFencedExecutionCallback;
270 
271     // Whether set{Input,Output}[FromMemory] can accept padded length or not.
272     bool mInputAndOutputPaddingEnabled = false;
273 
274     // enableInputAndOutputPadding may only be called before any call of
275     // set{Input,Output}[FromMemory]
276     bool mHasCalledSetInputOutput = false;
277 
278     // Can compute APIs be invoked multiple times on the execution object?
279     bool mReusable = false;
280 
281     // Vendor specific metadata
282     std::vector<TokenValuePair> mMetadata;
283 };
284 
285 // For execution plan with a SIMPLE body, i.e. the whole model will be executed on a single device.
286 class SimpleExecutionBuilder : public ExecutionBuilder {
287    public:
288     SimpleExecutionBuilder(const CompilationBuilder* compilation);
289 
290     std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
291             const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;
292 
293     std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
294             const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
295             const OptionalTimePoint& deadline) override;
296 
297    private:
298     std::shared_ptr<StepExecutor> mExecutor;
299 };
300 
301 // For execution plan with a COMPOUND body, i.e. partitioned execution with multiple steps.
302 class CompoundExecutionBuilder : public ExecutionBuilder {
303    public:
304     CompoundExecutionBuilder(const CompilationBuilder* compilation);
305 
306     std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
307             const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;
308 
309     std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
310             const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
311             const OptionalTimePoint& deadline) override;
312 };
313 
314 // class StepExecutor is used to execute a single "step" in a
315 // potentially multiple step execution process.  The graph associated
316 // with that step is executed in its entirety on a single device (or
317 // on the CPU).
318 class StepExecutor {
319    public:
320     // executionBuilder
321     //     Describes the full (possibly multiple-"step") execution.
322     // model
323     //     The model to be executed by the executor.  Possibly a single
324     //     "step" model of a multiple-"step" executionBuilder.
325     // driver, preparedModel
326     //     The device on which to execute the "step", and the prepared
327     //     model to execute on that device. For non-fallback StepExecutor,
328     //     neither is nullptr; for fallback StepExecutor, both are ignored in
329     //     StepExecutor::computeOnCpuFallback and may be nullptr.
330     // reusable
331     //     If true, multiple StepExecutor::compute/computeFenced may be called on this
332     //     object; otherwise, only one StepExecutor::compute/computeFenced may be called.
333     //     reusable must be false if mDynamicTemporaries != nullptr.
334     // step
335     //     Contains the output index mapping from the excerpted "step" model to
336     //     main model if the execution has multiple "steps". Must be nullptr
337     //     otherwise.
338     //     (step == nullptr) == (dynamicTemporaries == nullptr)
339     // dynamicTemporaries
340     //     If the execution has multiple "steps", describes the temporaries
341     //     of source models that do not have fully specified types and are outputs
342     //     of "step" models. Must be nullptr otherwise.
343     //     (step == nullptr) == (dynamicTemporaries == nullptr)
344     StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
345                  std::shared_ptr<Device> device,
346                  std::shared_ptr<RuntimePreparedModel> preparedModel, bool reusable,
347                  const ExecutionStep* step = nullptr,
348                  DynamicTemporaries* dynamicTemporaries = nullptr);
349 
350     // Map inputs and outputs from ExecutionBuilder to StepExecutor,
351     // in the case where we have a single-"step" execution (i.e., the executor
352     // is executing the entire model from the ExecutionBuilder).
353     void mapInputsAndOutputsTrivially();
354 
355     // Update output shapes with shapes returned from execution.
356     struct UpdateOutputShapes {
357         // These fields are meaningless unless updateOutputShapes() returns true
358         bool updatedDynamicTemporary;  // did shape (dimensions, size) information change for at
359                                        // least one dynamic temporary?
360         bool mainOutputInsufficient;  // is at least one main model output written by this execution
361                                       // marked !isSufficient?
362         bool zeroSizedInput;  // is at least one output of this execution step a zero-sized tensor
363                               // that needs to be read by some other step of the same execution?
364     };
365     bool updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from,
366                             std::vector<OutputShape>* to, UpdateOutputShapes* update);
367 
368     // Map inputs and outputs from ExecutionBuilder to StepExecutor,
369     // one at a time.  Note that these are input/output indexes, not
370     // operand indexes.
371     //
372     // For mapOutputToInput(), outputDimensions may be nullptr if the input
373     // operand has fully specified dimensions.
mapInput(uint32_t builderIndex,uint32_t executorIndex)374     void mapInput(uint32_t builderIndex, uint32_t executorIndex) {
375         mapInputOrOutput(mExecutionBuilder->mInputs[builderIndex], &mInputs[executorIndex]);
376     }
mapOutput(uint32_t builderIndex,uint32_t executorIndex)377     void mapOutput(uint32_t builderIndex, uint32_t executorIndex) {
378         mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mOutputs[executorIndex]);
379     }
mapOutputToInput(uint32_t builderIndex,uint32_t executorIndex,const Dimensions * outputDimensions)380     void mapOutputToInput(uint32_t builderIndex, uint32_t executorIndex,
381                           const Dimensions* outputDimensions) {
382         mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mInputs[executorIndex],
383                          outputDimensions);
384     }
385 
386     // dimensions must either have zero rank or must be
387     // consistent with and at least as well specified as operand dimensions
388     // (i.e., either rank must match, or operand rank must be zero; and for each
389     // individual dimension, either dimension must match, or operand dimension
390     // must be zero).
391     int setInputFromMemory(uint32_t inputIndex, const RuntimeMemory* memory, uint32_t offset,
392                            uint32_t length, const Dimensions& dimensions = {}) {
393         return setInputOrOutputFromMemory(mModel->getInputOperand(inputIndex), memory, offset,
394                                           length, dimensions, &mInputs.at(inputIndex));
395     }
396     int setOutputFromMemory(uint32_t outputIndex, const RuntimeMemory* memory, uint32_t offset,
397                             uint32_t length, const Dimensions& dimensions = {}) {
398         return setInputOrOutputFromMemory(mModel->getOutputOperand(outputIndex), memory, offset,
399                                           length, dimensions, &mOutputs.at(outputIndex));
400     }
401 
402     // Executes using the (driver, preparedModel) specified at construction time.
403     std::tuple<int, std::vector<OutputShape>, Timing> compute(
404             const OptionalTimePoint& deadline, const SharedBurst& burstController = nullptr);
405 
406     // Re-compiles and executes using the CPU, regardless of the (driver,
407     // preparedModel) specified at construction time.
408     std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpuFallback();
409 
410     bool isCpu() const;
411 
412     // Perform fenced execution and return error_code, sync_fence_fd and a
413     // callback.
414     std::tuple<int, int, ExecuteFencedInfoCallback> computeFenced(
415             const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
416             const OptionalTimePoint& deadline);
417 
418     // Do the dynamic temporaries defined by this step have valid allocations?
419     // (true if there are no dynamic temporaries defined by this step.)
420     bool areDynamicTemporariesAllocated() const;
421 
422    private:
423     // builderDimensions may be nullptr if executorInputOrOutput has fully
424     // specified dimensions.
425     void mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
426                           ModelArgumentInfo* executorInputOrOutput,
427                           const Dimensions* builderDimensions = nullptr);
428 
429     // dimensions must either have zero rank or
430     // must be consistent with and at least as well specified as operand
431     // dimensions (i.e., either rank must match, or operand rank must be zero;
432     // and for each individual dimension, either dimension must match, or
433     // operand dimension must be zero).
434     int setInputOrOutputFromMemory(const Operand& inputOrOutputOperand, const RuntimeMemory* memory,
435                                    uint32_t offset, uint32_t length, const Dimensions& dimensions,
436                                    ModelArgumentInfo* inputOrOutputInfo);
437 
438     // describes the full (possibly multiple-"step") execution
439     ExecutionBuilder* mExecutionBuilder;
440 
441     // describes the single execution step
442     const ExecutionStep* mExecutionStep;
443 
444     // describes the dynamic temporaries
445     DynamicTemporaries* mDynamicTemporaries;
446 
447     // model to be executed on the executor, in both original and
448     // compiled forms; and device on which to execute it
449     const ModelBuilder* mModel;
450     std::shared_ptr<Device> mDevice;
451     std::shared_ptr<RuntimePreparedModel> mPreparedModel;
452 
453     // The reusable execution to launch multiple computations.
454     // It is only created once at the first time it's needed.
455     std::shared_ptr<RuntimeExecution> mExecution;
456     // Returns {NO_ERROR, execution} on success, or {result_code, nullptr} on failure.
457     std::pair<int, std::shared_ptr<RuntimeExecution>> getReusableExecution();
458 
459     // The information we'll send to the driver about the inputs and outputs.
460     // Note that we build this in two steps:
461     // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
462     //    If set from a pointer, don't set the location in the Request::Argument but store it
463     //    instead in mInputBuffers or mOutputBuffers.
464     // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
465     //    the m*Buffers entries.  Copy the input values into the shared memory.
466     // We do this to avoid creating a lot of shared memory objects if we have a lot of
467     // parameters specified via pointers.  We also avoid copying in the case where
468     // some of the nodes will interpreted on the CPU anyway.
469     std::vector<ModelArgumentInfo> mInputs;
470     std::vector<ModelArgumentInfo> mOutputs;
471     MemoryTracker mMemories;
472 
473     // Whether compute/computeFenced may be invoked multiple times.
474     bool mReusable = false;
475 };
476 
477 std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes);
478 
479 }  // namespace nn
480 }  // namespace android
481 
482 #endif  // ANDROID_PACKAGES_MODULES_NEURALNETWORKS_RUNTIME_EXECUTION_BUILDER_H
483