1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ANDROID_ML_NN_RUNTIME_EXECUTION_BUILDER_H
18 #define ANDROID_ML_NN_RUNTIME_EXECUTION_BUILDER_H
19 
20 #include "Callbacks.h"
21 #include "HalInterfaces.h"
22 #include "Memory.h"
23 #include "ModelBuilder.h"
24 #include "NeuralNetworks.h"
25 #include "VersionedInterfaces.h"
26 
27 #include <atomic>
28 #include <unordered_map>
29 #include <vector>
30 
31 using ::android::hardware::neuralnetworks::V1_2::implementation::ExecutionCallback;
32 using ::android::hardware::neuralnetworks::V1_2::implementation::PreparedModelCallback;
33 
34 namespace android {
35 namespace nn {
36 
// Forward declarations of runtime types that this header refers to only by
// pointer or reference; their full definitions are not required here.
class BurstBuilder;
class CompilationBuilder;
class Device;
class ExecutionBurstController;
class ExecutionPlan;
class ExecutionStep;
class Memory;
class ModelBuilder;
class StepExecutor;
46 
47 // TODO move length out of DataLocation
48 struct ModelArgumentInfo {
49     // Whether the argument was specified as being in a Memory, as a pointer,
50     // has no value, or has not been specified.
51     // If POINTER then:
52     //   locationAndLength.length is valid.
53     //   dimensions is valid.
54     //   buffer is valid
55     // If MEMORY then:
56     //   locationAndLength.{poolIndex, offset, length} is valid.
57     //   dimensions is valid.
58     enum { POINTER, MEMORY, HAS_NO_VALUE, UNSPECIFIED } state = UNSPECIFIED;
59     DataLocation locationAndLength;
60     std::vector<uint32_t> dimensions;
61     void* buffer;
62     bool isSufficient = true;
63 
64     int setFromPointer(const Operand& operand, const ANeuralNetworksOperandType* type, void* buffer,
65                        uint32_t length);
66     int setFromMemory(const Operand& operand, const ANeuralNetworksOperandType* type,
67                       uint32_t poolIndex, uint32_t offset, uint32_t length);
68     int setFromTemporaryMemory(const Operand& operand, uint32_t poolIndex, uint32_t offset,
69                                uint32_t length);
70     int updateDimensionInfo(const Operand& operand, const ANeuralNetworksOperandType* newType);
71 };
72 
// Represents a single execution of a compiled model: gathers the caller's
// input/output bindings and timing preference, then computes either
// asynchronously, synchronously, or through a burst object.
class ExecutionBuilder {
    // StepExecutor maps arguments directly from mInputs/mOutputs when
    // executing individual partitioned steps.
    friend class StepExecutor;
public:
    ExecutionBuilder(const CompilationBuilder* compilation);

    // Bind input/output operand `index` either to a client pointer or to a
    // region of a Memory object.  `type` may override the operand's
    // dimensions (nullptr to use the model's own type).
    int setInput(uint32_t index, const ANeuralNetworksOperandType* type, const void* buffer,
                 size_t length);
    int setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                           const Memory* memory, size_t offset, size_t length);
    int setOutput(uint32_t index, const ANeuralNetworksOperandType* type, void* buffer,
                  size_t length);
    int setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                            const Memory* memory, size_t offset, size_t length);

    // Request that the driver measure execution timing (must be set before
    // the execution starts).
    int setMeasureTiming(bool measure);

    // Retrieve a measured duration after the execution has finished.
    int getDuration(int32_t durationCode, uint64_t* duration) const;

    // Asynchronous path: the callback is signaled when execution completes.
    int computeAsynchronously(sp<ExecutionCallback>* synchronizationCallback) {
        CHECK(synchronizationCallback != nullptr);
        return compute(synchronizationCallback);
    }
    // Synchronous path: blocks until execution completes.
    int computeSynchronously() { return compute(nullptr); }
    // Burst path: synchronous execution reusing the burst's driver state.
    int burstCompute(BurstBuilder* burst) { return compute(nullptr, burst); }

    // Initialize output dimensional information from ModelArgumentInfo.
    void initializeOutputShapes(std::vector<OutputShape>* outputShapes) const;

    // Query the dimensions/rank of output `index`; only valid once the
    // execution has finished.
    int getOutputOperandDimensions(uint32_t index, uint32_t* dimensions);
    int getOutputOperandRank(uint32_t index, uint32_t* rank);

    // Handshake with lower-level execution support
    bool measureTiming() const { return mMeasureTiming; }
    void reportTiming(Timing timing) { mTiming = timing; }

    const CompilationBuilder* getCompilation() const { return mCompilation; }
    const ModelBuilder* getModel() const { return mModel; }

    // Record completion (status and final output shapes); returns the status
    // that should be propagated to the caller.
    ErrorStatus finish(ErrorStatus error, const std::vector<OutputShape>& outputShapes);

   private:
    // If a callback is provided, then this is asynchronous. If a callback is
    // not provided (i.e., is nullptr), then this is synchronous.
    //
    // If burst is provided, then the burst path will be used. If a burst is not
    // provided (i.e., is nullptr), then a synchronous execution will occur.
    //
    // Providing both synchronizationCallback and burstBuilder is an error.
    int compute(sp<ExecutionCallback>* synchronizationCallback,
                BurstBuilder* burstBuilder = nullptr);

    const CompilationBuilder* mCompilation;

    // Update output dimensional information from OutputShape to ModelArgumentInfo.
    bool updateOutputShapes(const std::vector<OutputShape>& outputShapes);

    const ModelBuilder* mModel;
    const ExecutionPlan* mPlan;

    // This is a DeviceManager::kPartitioning* value captured from
    // CompilationBuilder when the ExecutionBuilder is constructed.
    uint32_t mPartitioning;

    // The information we'll send to the driver about the inputs and outputs.
    // Note that we build this in two steps:
    // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
    //    If set from a pointer, don't set the location in the RequestArgument but store it
    //    instead in mInputBuffers or mOutputBuffers.
    // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
    //    the m*Buffers entries.  Copy the input values into the shared memory.
    // We do this to avoid creating a lot of shared memory objects if we have a lot of
    // parameters specified via pointers.  We also avoid copying in the case where
    // some of the nodes will be interpreted on the CPU anyway.
    std::vector<ModelArgumentInfo> mInputs;
    std::vector<ModelArgumentInfo> mOutputs;
    MemoryTracker mMemories;

    // Do we ask the driver to measure timing?
    bool mMeasureTiming = false;

    // Timing reported from the driver
    Timing mTiming = {};

    // Properties cannot be set once the execution has started.
    std::atomic_bool mStarted = false;

    // Timing and output shapes can only be queried after the execution is
    // finished.
    std::atomic_bool mFinished = false;
};
163 
164 // class StepExecutor is used to execute a single "step" in a
165 // potentially multiple step execution process.  The graph associated
166 // with that step is executed in its entirety on a single device (or
167 // on the CPU).
class StepExecutor {
   public:
    // executionBuilder
    //     Describes the full (possibly multiple-"step") execution.
    // model
    //     The model to be executed by the executor.  Possibly a
    //     submodel of the model from executionBuilder.
    // driver, preparedModel
    //     The device on which to execute the "step", and the prepared
    //     model to execute on that device.  (Both are nullptr in the
    //     case of CPU.)
    StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
                 std::shared_ptr<Device> device,
                 std::shared_ptr<VersionedIPreparedModel> preparedModel);

    // Map inputs and outputs from ExecutionBuilder to StepExecutor,
    // in the case where we have a single-"step" execution (i.e., the executor
    // is executing the entire model from the ExecutionBuilder).
    void mapInputsAndOutputsTrivially();

    // Update output shapes returned from ExecutionCallback to ExecutionBuilder.
    bool updateOutputShapes(const std::vector<OutputShape>& from, std::vector<OutputShape>* to);

    // Map inputs and outputs from ExecutionBuilder to StepExecutor,
    // one at a time.  Note that these are input/output indexes, not
    // operand indexes.
    void mapInput(uint32_t builderIndex, uint32_t executorIndex) {
        mapInputOrOutput(mExecutionBuilder->mInputs[builderIndex], &mInputs[executorIndex]);
    }
    void mapOutput(uint32_t builderIndex, uint32_t executorIndex) {
        mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mOutputs[executorIndex]);
    }
    // Feed a full-execution output as an input of this step (used when a
    // step consumes a value another step produced into a builder output).
    void mapOutputToInput(uint32_t builderIndex, uint32_t executorIndex) {
        mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex],
                         &mInputs[executorIndex]);
    }

    // The input or output is assumed to have the size of the
    // corresponding operand.
    int setInputFromTemporaryMemory(uint32_t inputIndex, const Memory* memory, uint32_t offset) {
        return setInputOrOutputFromTemporaryMemory(mModel->getInputOperand(inputIndex),
                                                   memory, offset,
                                                   &mInputs.at(inputIndex));
    }
    int setOutputFromTemporaryMemory(uint32_t outputIndex, const Memory* memory, uint32_t offset) {
        return setInputOrOutputFromTemporaryMemory(mModel->getOutputOperand(outputIndex),
                                                   memory, offset,
                                                   &mOutputs.at(outputIndex));
    }

    // Executes using the (driver, preparedModel) specified at construction time.
    int startCompute(sp<ExecutionCallback>* synchronizationCallback,
                     const std::shared_ptr<ExecutionBurstController>& burstController = nullptr);

    // Executes using the CPU, regardless of the (driver,
    // preparedModel) specified at construction time.
    int startComputeOnCpu(sp<ExecutionCallback>* synchronizationCallback);

    bool isCpu() const;

    // ExecutionStep has the index mapping between ExecutionBuilder and StepExecutor.
    void setExecutionStep(const std::shared_ptr<const ExecutionStep>& step) {
        mExecutionStep = step;
    }

   private:
    // Copies pointer-specified arguments in *args into a freshly allocated
    // shared `memory` so they can be sent to a driver.
    int allocatePointerArgumentsToPool(std::vector<ModelArgumentInfo>* args, Memory* memory);
    int startComputeOnDevice(sp<ExecutionCallback>* synchronizationCallback,
                             const std::shared_ptr<ExecutionBurstController>& burstController);

    void mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
                          ModelArgumentInfo* executorInputOrOutput);

    int setInputOrOutputFromTemporaryMemory(const Operand& inputOrOutputOperand,
                                            const Memory* memory, uint32_t offset,
                                            ModelArgumentInfo* inputOrOutputInfo);

    // describes the full (possibly multiple-"step") execution
    ExecutionBuilder* mExecutionBuilder;

    // describes the single execution step
    std::shared_ptr<const ExecutionStep> mExecutionStep = nullptr;

    // model to be executed on the executor, in both original and
    // compiled forms; and device on which to execute it
    const ModelBuilder* mModel;
    std::shared_ptr<Device> mDevice;
    std::shared_ptr<VersionedIPreparedModel>
            mPreparedModel;  // nullptr if CPU execution or if bypassing ExecutionPlan

    // The information we'll send to the driver about the inputs and outputs.
    // Note that we build this in two steps:
    // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
    //    If set from a pointer, don't set the location in the RequestArgument but store it
    //    instead in mInputBuffers or mOutputBuffers.
    // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
    //    the m*Buffers entries.  Copy the input values into the shared memory.
    // We do this to avoid creating a lot of shared memory objects if we have a lot of
    // parameters specified via pointers.  We also avoid copying in the case where
    // some of the nodes will be interpreted on the CPU anyway.
    std::vector<ModelArgumentInfo> mInputs;
    std::vector<ModelArgumentInfo> mOutputs;
    MemoryTracker mMemories;
};
272 
273 } // namespace nn
274 } // namespace android
275 
276 #endif // ANDROID_ML_NN_RUNTIME_EXECUTION_BUILDER_H
277