1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "CpuExecutor"
18 
19 #include "CpuExecutor.h"
20 
21 #include <android-base/scopeguard.h>
22 #include <nnapi/SharedMemory.h>
23 #include <nnapi/TypeUtils.h>
24 
#include <limits>
#include <memory>
#include <new>
#include <utility>
#include <vector>
29 
30 #include "ControlFlow.h"
31 #include "NeuralNetworks.h"
32 #include "OperationResolver.h"
33 #include "Operations.h"
34 #include "OperationsExecutionUtils.h"
35 #include "Tracing.h"
36 
37 // b/109953668, disable OpenMP
38 #ifdef NNAPI_OPENMP
39 #include <omp.h>
40 
41 #include <Eigen/Core>
42 #endif  // NNAPI_OPENMP
43 
44 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
45 #include "BidirectionalSequenceLSTM.h"
46 #include "Cast.h"
47 #include "EmbeddingLookup.h"
48 #include "ExpandDims.h"
49 #include "HashtableLookup.h"
50 #include "LSHProjection.h"
51 #include "LSTM.h"
52 #include "MaximumMinimum.h"
53 #include "Multinomial.h"
54 #include "Pow.h"
55 #include "QuantizedLSTM.h"
56 #include "RNN.h"
57 #include "SVDF.h"
58 #include "Tile.h"
59 #endif  // NN_INCLUDE_CPU_IMPLEMENTATION
60 
61 namespace android {
62 namespace nn {
63 namespace {
64 
65 class OperationExecutionContext : public IOperationExecutionContext {
66     DISALLOW_IMPLICIT_CONSTRUCTORS(OperationExecutionContext);
67 
68    public:
OperationExecutionContext(const Operation * operation,RunTimeOperandInfo * operands)69     OperationExecutionContext(const Operation* operation, RunTimeOperandInfo* operands)
70         : operation(operation), operands(operands) {}
71 
72     uint32_t getNumInputs() const override;
73     OperandType getInputType(uint32_t index) const override;
74     Shape getInputShape(uint32_t index) const override;
75     const void* getInputBuffer(uint32_t index) const override;
76     const Operand::ExtraParams& getInputExtraParams(uint32_t index) const override;
77 
78     uint32_t getNumOutputs() const override;
79     OperandType getOutputType(uint32_t index) const override;
80     Shape getOutputShape(uint32_t index) const override;
81     void* getOutputBuffer(uint32_t index) override;
82 
83     // Return false on failure and store the result code.
84     // Use getResultCode() to retrieve it at the end of the operation execution.
85     bool setOutputShape(uint32_t index, const Shape& shape) override;
86     int getResultCode() const;
87 
88     bool isOmittedInput(uint32_t index) const override;
89     bool isOmittedOutput(uint32_t index) const override;
90 
91     // Return false if any of inputs or outputs is omitted, i.e. has lifetime of NO_VALUE.
92     bool checkNoOmittedOperand() const;
93     // Return false if any of inputs has dimension 0.
94     bool checkNoZeroSizedInput() const;
95 
96    private:
97     const RunTimeOperandInfo* getInputInfo(uint32_t index) const;
98     const RunTimeOperandInfo* getOutputInfo(uint32_t index) const;
99     RunTimeOperandInfo* getOutputInfo(uint32_t index);
100 
101     const Operation* operation;
102     RunTimeOperandInfo* operands;
103 
104     int result = ANEURALNETWORKS_NO_ERROR;
105 };
106 
getInputInfo(uint32_t index) const107 const RunTimeOperandInfo* OperationExecutionContext::getInputInfo(uint32_t index) const {
108     CHECK(index < operation->inputs.size());
109     return &operands[operation->inputs[index]];
110 }
111 
getOutputInfo(uint32_t index) const112 const RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) const {
113     CHECK(index < operation->outputs.size());
114     return &operands[operation->outputs[index]];
115 }
116 
getOutputInfo(uint32_t index)117 RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) {
118     CHECK(index < operation->outputs.size());
119     return &operands[operation->outputs[index]];
120 }
121 
getInputType(uint32_t index) const122 OperandType OperationExecutionContext::getInputType(uint32_t index) const {
123     return getInputInfo(index)->type;
124 }
125 
getInputShape(uint32_t index) const126 Shape OperationExecutionContext::getInputShape(uint32_t index) const {
127     return getInputInfo(index)->shape();
128 }
129 
getInputBuffer(uint32_t index) const130 const void* OperationExecutionContext::getInputBuffer(uint32_t index) const {
131     return getInputInfo(index)->buffer;
132 }
133 
getInputExtraParams(uint32_t index) const134 const Operand::ExtraParams& OperationExecutionContext::getInputExtraParams(uint32_t index) const {
135     return getInputInfo(index)->extraParams;
136 }
137 
getOutputType(uint32_t index) const138 OperandType OperationExecutionContext::getOutputType(uint32_t index) const {
139     return getOutputInfo(index)->type;
140 }
141 
getOutputShape(uint32_t index) const142 Shape OperationExecutionContext::getOutputShape(uint32_t index) const {
143     return getOutputInfo(index)->shape();
144 }
145 
getOutputBuffer(uint32_t index)146 void* OperationExecutionContext::getOutputBuffer(uint32_t index) {
147     return getOutputInfo(index)->buffer;
148 }
149 
getNumInputs() const150 uint32_t OperationExecutionContext::getNumInputs() const {
151     return operation->inputs.size();
152 }
153 
getNumOutputs() const154 uint32_t OperationExecutionContext::getNumOutputs() const {
155     return operation->outputs.size();
156 }
157 
getResultCode() const158 int OperationExecutionContext::getResultCode() const {
159     return result;
160 }
161 
162 // TODO: Return error code directly once we've fully integrated OperationResolver with all ops.
163 // Updates the RunTimeOperandInfo with the newly calculated shape.
164 // Allocate the buffer if we need to.
165 //
166 // TODO(b/153081229): This function currently cannot handle extension operands well. We need to
167 //                    propagate the extension type info into this function.
setInfoAndAllocateIfNeeded(RunTimeOperandInfo * info,const Shape & shape,int * result)168 bool setInfoAndAllocateIfNeeded(RunTimeOperandInfo* info, const Shape& shape, int* result) {
169     // For user-provided model output operands, the parameters must match the Shape
170     // calculated from the preparation step.
171     if (info->lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) {
172         if (info->type != shape.type) {
173             LOG(ERROR) << "Invalid type for model output";
174             *result = ANEURALNETWORKS_OP_FAILED;
175             return false;
176         }
177         if (info->scale != shape.scale) {
178             LOG(ERROR) << "Invalid scale for model output";
179             *result = ANEURALNETWORKS_OP_FAILED;
180             return false;
181         }
182         if (info->zeroPoint != shape.offset) {
183             LOG(ERROR) << "Invalid zeroPoint for model output";
184             *result = ANEURALNETWORKS_OP_FAILED;
185             return false;
186         }
187         if (info->extraParams != shape.extraParams) {
188             LOG(ERROR) << "Invalid extraParams for model output";
189             *result = ANEURALNETWORKS_OP_FAILED;
190             return false;
191         }
192     }
193 
194     auto combined = combineDimensions(shape.dimensions, info->dimensions);
195     if (!combined.has_value()) {
196         LOG(ERROR) << "Invalid dimensions for model operand: " << combined.error();
197         *result = ANEURALNETWORKS_OP_FAILED;
198         return false;
199     }
200     info->dimensions = std::move(combined.value());
201     info->type = shape.type;
202     info->scale = shape.scale;
203     info->zeroPoint = shape.offset;
204     info->extraParams = shape.extraParams;
205 
206     // TODO(b/153081229): We bypass the overflow check on extension operands because we do not know
207     //                    the sizes of extension types.
208     if (!isExtension(info->type) &&
209         nonExtensionOperandSizeOfDataOverflowsUInt32(info->type, info->dimensions)) {
210         LOG(ERROR) << "Operand data size overflows uint32_t";
211         *result = ANEURALNETWORKS_OP_FAILED;
212         return false;
213     }
214 
215     // Allocate the buffer only if the combined dimension is fully specified
216     if (info->buffer == nullptr && (info->lifetime == Operand::LifeTime::TEMPORARY_VARIABLE ||
217                                     info->lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT)) {
218         if (isExtension(info->type)) {
219             LOG(ERROR) << "Cannot allocate a variable of an extension type";
220             *result = ANEURALNETWORKS_OP_FAILED;
221             return false;
222         }
223         uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
224         if (length > 0) {
225             info->buffer = new uint8_t[length];
226             if (info->buffer == nullptr) {
227                 *result = ANEURALNETWORKS_OUT_OF_MEMORY;
228                 return false;
229             }
230             info->length = length;
231         }
232     }
233     if (!info->isSufficient()) {
234         uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
235         LOG(ERROR) << "Insufficient size for model operand: require = " << length
236                    << ", provided = " << info->length;
237         *result = ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
238         return false;
239     }
240     *result = ANEURALNETWORKS_NO_ERROR;
241     return true;
242 }
243 
setOutputShape(uint32_t index,const Shape & shape)244 bool OperationExecutionContext::setOutputShape(uint32_t index, const Shape& shape) {
245     return setInfoAndAllocateIfNeeded(getOutputInfo(index), shape, &result);
246 }
247 
isOmittedInput(uint32_t index) const248 bool OperationExecutionContext::isOmittedInput(uint32_t index) const {
249     return getInputInfo(index)->lifetime == Operand::LifeTime::NO_VALUE;
250 }
251 
isOmittedOutput(uint32_t index) const252 bool OperationExecutionContext::isOmittedOutput(uint32_t index) const {
253     return getOutputInfo(index)->lifetime == Operand::LifeTime::NO_VALUE;
254 }
255 
checkNoOmittedOperand() const256 bool OperationExecutionContext::checkNoOmittedOperand() const {
257     for (uint32_t i = 0; i < operation->inputs.size(); i++) {
258         NN_RET_CHECK(!isOmittedInput(i))
259                 << operation->type << " input operand " << i << " is required but missing.";
260     }
261     for (uint32_t i = 0; i < operation->outputs.size(); i++) {
262         NN_RET_CHECK(!isOmittedOutput(i))
263                 << operation->type << " output operand " << i << " is required but missing.";
264     }
265     return true;
266 }
267 
checkNoZeroSizedInput() const268 bool OperationExecutionContext::checkNoZeroSizedInput() const {
269     for (uint32_t i = 0; i < operation->inputs.size(); i++) {
270         if (isOmittedInput(i)) continue;
271         for (uint32_t j = 0; j < getInputInfo(i)->dimensions.size(); j++) {
272             NN_RET_CHECK_NE(getInputInfo(i)->dimensions[j], 0u)
273                     << operation->type << " does not support zero-sized tensor, but input " << i
274                     << " dimension " << j << " is 0.";
275         }
276     }
277     return true;
278 }
279 
280 }  // namespace
281 
282 // Used to keep a pointer to a memory pool.
283 //
284 // In the case of an "mmap_fd" pool, owns the mmap region
285 // returned by getBuffer() -- i.e., that region goes away
286 // when the RunTimePoolInfo is destroyed or is assigned to.
287 class RunTimePoolInfo::RunTimePoolInfoImpl {
288    public:
289     RunTimePoolInfoImpl(SharedMemory memory, Mapping mapping);
290 
291     uint8_t* getBuffer() const;
292     uint32_t getSize() const;
293 
294     bool flush() const;
295 
getMemory() const296     const SharedMemory& getMemory() const { return mMemory; }
297 
298    private:
299     const SharedMemory mMemory;
300     const Mapping mMapping;
301 };
302 
RunTimePoolInfoImpl(SharedMemory memory,Mapping mapping)303 RunTimePoolInfo::RunTimePoolInfoImpl::RunTimePoolInfoImpl(SharedMemory memory, Mapping mapping)
304     : mMemory(std::move(memory)), mMapping(std::move(mapping)) {}
305 
getBuffer() const306 uint8_t* RunTimePoolInfo::RunTimePoolInfoImpl::getBuffer() const {
307     return std::visit(
308             [](auto* pointer) {
309                 // Writing to a const buffer may lead to undefined behavior.
310                 // TODO: Refactor the code to avoid the const_cast.
311                 return static_cast<uint8_t*>(const_cast<void*>(pointer));
312             },
313             mMapping.pointer);
314 }
315 
getSize() const316 uint32_t RunTimePoolInfo::RunTimePoolInfoImpl::getSize() const {
317     CHECK_LE(mMapping.size, std::numeric_limits<uint32_t>::max());
318     return static_cast<uint32_t>(mMapping.size);
319 }
320 
321 // Making sure the output data are correctly updated after execution.
flush() const322 bool RunTimePoolInfo::RunTimePoolInfoImpl::flush() const {
323     return nn::flush(mMapping);
324 }
325 
326 // TODO: short term, make share memory mapping and updating a utility function.
327 // TODO: long term, implement mmap_fd as a hidl IMemory service.
createFromMemory(const SharedMemory & memory)328 std::optional<RunTimePoolInfo> RunTimePoolInfo::createFromMemory(const SharedMemory& memory) {
329     auto mapping = map(memory);
330     if (!mapping.has_value()) {
331         LOG(ERROR) << "Can't map shared memory: " << mapping.error().message;
332         return std::nullopt;
333     }
334     const auto impl =
335             std::make_shared<const RunTimePoolInfoImpl>(memory, std::move(mapping).value());
336     return RunTimePoolInfo(impl);
337 }
338 
createFromExistingBuffer(uint8_t * buffer,uint32_t size)339 RunTimePoolInfo RunTimePoolInfo::createFromExistingBuffer(uint8_t* buffer, uint32_t size) {
340     auto mapping = Mapping{.pointer = buffer, .size = size};
341     const auto impl = std::make_shared<const RunTimePoolInfoImpl>(std::make_shared<const Memory>(),
342                                                                   std::move(mapping));
343     return RunTimePoolInfo(impl);
344 }
345 
RunTimePoolInfo(const std::shared_ptr<const RunTimePoolInfoImpl> & impl)346 RunTimePoolInfo::RunTimePoolInfo(const std::shared_ptr<const RunTimePoolInfoImpl>& impl)
347     : mImpl(impl) {}
348 
getBuffer() const349 uint8_t* RunTimePoolInfo::getBuffer() const {
350     return mImpl->getBuffer();
351 }
352 
getSize() const353 uint32_t RunTimePoolInfo::getSize() const {
354     return mImpl->getSize();
355 }
356 
flush() const357 bool RunTimePoolInfo::flush() const {
358     return mImpl->flush();
359 }
360 
getMemory() const361 const SharedMemory& RunTimePoolInfo::getMemory() const {
362     return mImpl->getMemory();
363 }
364 
setRunTimePoolInfosFromCanonicalMemories(std::vector<RunTimePoolInfo> * poolInfos,const std::vector<SharedMemory> & pools)365 bool setRunTimePoolInfosFromCanonicalMemories(std::vector<RunTimePoolInfo>* poolInfos,
366                                               const std::vector<SharedMemory>& pools) {
367     CHECK(poolInfos != nullptr);
368     poolInfos->clear();
369     poolInfos->reserve(pools.size());
370     for (const auto& pool : pools) {
371         if (std::optional<RunTimePoolInfo> poolInfo = RunTimePoolInfo::createFromMemory(pool)) {
372             poolInfos->push_back(*poolInfo);
373         } else {
374             LOG(ERROR) << "Could not map pools";
375             poolInfos->clear();
376             return false;
377         }
378     }
379     return true;
380 }
381 
setRunTimePoolInfosFromMemoryPools(std::vector<RunTimePoolInfo> * poolInfos,const std::vector<Request::MemoryPool> & pools)382 bool setRunTimePoolInfosFromMemoryPools(std::vector<RunTimePoolInfo>* poolInfos,
383                                         const std::vector<Request::MemoryPool>& pools) {
384     CHECK(poolInfos != nullptr);
385     poolInfos->clear();
386     poolInfos->reserve(pools.size());
387     for (const auto& pool : pools) {
388         if (!std::holds_alternative<SharedMemory>(pool)) {
389             LOG(ERROR) << "Unknown memory token";
390             poolInfos->clear();
391             return false;
392         }
393         if (std::optional<RunTimePoolInfo> poolInfo =
394                     RunTimePoolInfo::createFromMemory(std::get<SharedMemory>(pool))) {
395             poolInfos->push_back(*poolInfo);
396         } else {
397             LOG(ERROR) << "Could not map pools";
398             poolInfos->clear();
399             return false;
400         }
401     }
402     return true;
403 }
404 
405 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
// Copies a 4-D tensor from |from| to |to|, moving the axis at position 1 of
// |fromDim| to the innermost position: the element at source coordinate
// (n, c, hw) is written so that c varies fastest in the output.
//
// |fromDim| holds the four source extents; extents 2 and 3 are treated as a
// single flattened axis. |to| must have room for every source element.
// Always returns true.
template <typename T>
inline bool convertToNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
    const uint32_t planeSize = fromDim[2] * fromDim[3];
    T* out = to;
    for (uint32_t batch = 0; batch < fromDim[0]; ++batch) {
        for (uint32_t pixel = 0; pixel < planeSize; ++pixel) {
            for (uint32_t channel = 0; channel < fromDim[1]; ++channel) {
                const uint32_t batchOffset = batch * fromDim[1] * planeSize;
                const uint32_t channelOffset = channel * planeSize;
                *out = from[batchOffset + channelOffset + pixel];
                ++out;
            }
        }
    }
    return true;
}
419 
// Copies a 4-D tensor from |from| to |to|, moving the innermost axis of
// |fromDim| to position 1: the element at source coordinate (n, hw, c) is
// written so that hw varies fastest in the output.
//
// |fromDim| holds the four source extents; extents 1 and 2 are treated as a
// single flattened axis. |to| must have room for every source element.
// Always returns true.
template <typename T>
inline bool convertFromNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
    const uint32_t planeSize = fromDim[1] * fromDim[2];
    T* out = to;
    for (uint32_t batch = 0; batch < fromDim[0]; ++batch) {
        for (uint32_t channel = 0; channel < fromDim[3]; ++channel) {
            for (uint32_t pixel = 0; pixel < planeSize; ++pixel) {
                const uint32_t batchOffset = batch * planeSize * fromDim[3];
                const uint32_t pixelOffset = pixel * fromDim[3];
                *out = from[batchOffset + pixelOffset + channel];
                ++out;
            }
        }
    }
    return true;
}
433 
convertToNhwc(RunTimeOperandInfo & to,const RunTimeOperandInfo & from,std::unique_ptr<uint8_t[]> & ptr_guard,bool data_layout)434 static bool convertToNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
435                           std::unique_ptr<uint8_t[]>& ptr_guard, bool data_layout) {
436     int result;
437     if (from.dimensions.size() != 4) {
438         LOG(ERROR) << "Error converting a non-4-D tensor to NHWC layout";
439         return false;
440     }
441     to.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
442     if (data_layout) {
443         // convert dimensions
444         Shape inShape = from.shape();
445         auto& fromDim = from.dimensions;
446         inShape.dimensions = {fromDim[0], fromDim[2], fromDim[3], fromDim[1]};
447         // allocate buffer
448         to.buffer = nullptr;
449         if (!setInfoAndAllocateIfNeeded(&to, inShape, &result)) {
450             return false;
451         }
452         ptr_guard.reset(to.buffer);
453         // convert value
454         if (from.type == OperandType::TENSOR_FLOAT32) {
455             return convertToNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
456                                             reinterpret_cast<const float*>(from.buffer), fromDim);
457         } else if (from.type == OperandType::TENSOR_FLOAT16) {
458             return convertToNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
459                                                reinterpret_cast<const _Float16*>(from.buffer),
460                                                fromDim);
461         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
462             return convertToNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
463                                               reinterpret_cast<const uint8_t*>(from.buffer),
464                                               fromDim);
465         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
466             return convertToNhwcImpl<int8_t>(reinterpret_cast<int8_t*>(to.buffer),
467                                              reinterpret_cast<const int8_t*>(from.buffer), fromDim);
468         } else {
469             LOG(ERROR) << "Unsupported data type";
470             return false;
471         }
472     } else {
473         to = from;
474     }
475     return true;
476 }
477 
convertFromNhwc(RunTimeOperandInfo & to,const RunTimeOperandInfo & from,bool data_layout,int * result)478 static bool convertFromNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
479                             bool data_layout, int* result) {
480     if (from.dimensions.size() != 4) {
481         LOG(ERROR) << "Error converting a non-4-D tensor from NHWC layout";
482         return false;
483     }
484     if (data_layout) {
485         // convert dimensions
486         Shape outShape = from.shape();
487         auto& fromDim = from.dimensions;
488         outShape.dimensions = {fromDim[0], fromDim[3], fromDim[1], fromDim[2]};
489         // allocate buffer
490         if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
491             return false;
492         }
493         // convert value
494         if (from.type == OperandType::TENSOR_FLOAT32) {
495             return convertFromNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
496                                               reinterpret_cast<const float*>(from.buffer), fromDim);
497         } else if (from.type == OperandType::TENSOR_FLOAT16) {
498             return convertFromNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
499                                                  reinterpret_cast<const _Float16*>(from.buffer),
500                                                  fromDim);
501         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
502             return convertFromNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
503                                                 reinterpret_cast<const uint8_t*>(from.buffer),
504                                                 fromDim);
505         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
506             return convertFromNhwcImpl<int8_t>(reinterpret_cast<int8_t*>(to.buffer),
507                                                reinterpret_cast<const int8_t*>(from.buffer),
508                                                fromDim);
509         } else {
510             LOG(ERROR) << "Unsupported data type";
511             return false;
512         }
513     } else {
514         Shape outShape = from.shape();
515         to.buffer = from.buffer;
516         to.length = from.length;
517         if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
518             return false;
519         }
520     }
521     return true;
522 }
523 #endif  // NN_INCLUDE_CPU_IMPLEMENTATION
524 
525 // Decrements the usage count for the operands listed.  Frees the memory
526 // allocated for any temporary variable with a count of zero.
consumeOperationInputs(const std::vector<uint32_t> & inputs,RunTimeOperandInfo * operands)527 static void consumeOperationInputs(const std::vector<uint32_t>& inputs,
528                                    RunTimeOperandInfo* operands) {
529     for (uint32_t i : inputs) {
530         auto& info = operands[i];
531         // Check if it's a static or model input/output.
532         if (info.numberOfUsesLeft == 0) {
533             continue;
534         }
535         info.numberOfUsesLeft--;
536         if (info.numberOfUsesLeft == 0 && info.buffer != nullptr) {
537             delete[] info.buffer;
538             info.buffer = nullptr;
539         }
540     }
541 }
542 
543 // This function only frees TEMPORARY_VARIABLE operands that are unused
544 // outputs because consumeOperationInputs takes care of any operands
545 // that are inputs to an operation.
freeUnusedSubgraphOperands(std::vector<RunTimeOperandInfo> * operands)546 static void freeUnusedSubgraphOperands(std::vector<RunTimeOperandInfo>* operands) {
547     for (auto& info : *operands) {
548         if (info.lifetime == Operand::LifeTime::TEMPORARY_VARIABLE && info.numberOfUsesLeft == 0 &&
549             info.buffer != nullptr) {
550             delete[] info.buffer;
551             info.buffer = nullptr;
552         }
553     }
554 }
555 
556 // Ignore the .pools entry in model and request.  This will have been taken care of
557 // by the caller.
run(const Model & model,const Request & request,const std::vector<RunTimePoolInfo> & modelPoolInfos,const std::vector<RunTimePoolInfo> & requestPoolInfos)558 int CpuExecutor::run(const Model& model, const Request& request,
559                      const std::vector<RunTimePoolInfo>& modelPoolInfos,
560                      const std::vector<RunTimePoolInfo>& requestPoolInfos) {
561     NNTRACE_CPU(NNTRACE_PHASE_EXECUTION, "run");
562     VLOG(CPUEXE) << "CpuExecutor::run() with request(" << SHOW_IF_DEBUG(request) << ")";
563     mModelOperandValues = model.operandValues.data();
564     mModelPoolInfos = &modelPoolInfos;
565     mReferencedSubgraphs = &model.referenced;
566 
567     // b/109953668, disable OpenMP
568 #ifdef NNAPI_OPENMP
569     ScopedOpenmpSettings openMpSettings;
570 #endif  // NNAPI_OPENMP
571 
572     std::vector<RunTimeOperandInfo> operands = initializeRunTimeInfo(model.main);
573     updateForArguments(model.main.inputIndexes, request.inputs, requestPoolInfos, operands.data());
574     updateForArguments(model.main.outputIndexes, request.outputs, requestPoolInfos,
575                        operands.data());
576     int result = executeSubgraph(model.main, operands.data());
577     freeUnusedSubgraphOperands(&operands);
578 
579     if (result == ANEURALNETWORKS_NO_ERROR) {
580         VLOG(CPUEXE) << "Completed run normally";
581         for (auto& runtimeInfo : requestPoolInfos) {
582             runtimeInfo.flush();
583         }
584     }
585 
586     // Only report the output shapes when the result code is NO_ERROR or OUTPUT_INSUFFICIENT_SIZE.
587     if (result == ANEURALNETWORKS_NO_ERROR || result == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
588         setOutputShapes(model.main.outputIndexes, operands);
589     } else {
590         mOutputShapes.clear();
591     }
592 
593     mFinished = true;
594     mModelOperandValues = nullptr;
595     mModelPoolInfos = nullptr;
596     mReferencedSubgraphs = nullptr;
597     return result;
598 }
599 
executeSubgraph(const Model::Subgraph & subgraph,RunTimeOperandInfo * operands)600 int CpuExecutor::executeSubgraph(const Model::Subgraph& subgraph, RunTimeOperandInfo* operands) {
601     VLOG(CPUEXE) << "CpuExecutor::executeSubgraph " << subgraph;
602     // The graph has serialized the operation in execution order.
603     for (const auto& operation : subgraph.operations) {
604         NN_RETURN_IF_ERROR(executeOperation(operation, operands));
605     }
606     return ANEURALNETWORKS_NO_ERROR;
607 }
608 
initializeRunTimeInfo(const Model::Subgraph & subgraph)609 std::vector<RunTimeOperandInfo> CpuExecutor::initializeRunTimeInfo(
610         const Model::Subgraph& subgraph) {
611     VLOG(CPUEXE) << "CpuExecutor::initializeRunTimeInfo";
612     const size_t count = subgraph.operands.size();
613     std::vector<RunTimeOperandInfo> operands(count);
614     std::vector<uint32_t> numberOfConsumers =
615             countNumberOfConsumers(count, subgraph.operations).value();
616     for (size_t i = 0; i < count; i++) {
617         const Operand& from = subgraph.operands[i];
618         RunTimeOperandInfo& to = operands[i];
619         to.type = from.type;
620         to.dimensions = from.dimensions;
621         to.scale = from.scale;
622         to.zeroPoint = from.zeroPoint;
623         to.length = from.location.length;
624         to.lifetime = from.lifetime;
625         to.extraParams = from.extraParams;
626         switch (from.lifetime) {
627             case Operand::LifeTime::TEMPORARY_VARIABLE:
628                 to.buffer = nullptr;
629                 to.numberOfUsesLeft = numberOfConsumers[i];
630                 break;
631             case Operand::LifeTime::CONSTANT_COPY:
632                 to.buffer = const_cast<uint8_t*>(mModelOperandValues + from.location.offset);
633                 to.numberOfUsesLeft = 0;
634                 break;
635             case Operand::LifeTime::CONSTANT_REFERENCE: {
636                 auto poolIndex = from.location.poolIndex;
637                 CHECK_LT(poolIndex, mModelPoolInfos->size());
638                 auto& r = (*mModelPoolInfos)[poolIndex];
639                 to.buffer = r.getBuffer() + from.location.offset;
640                 to.numberOfUsesLeft = 0;
641                 break;
642             }
643             case Operand::LifeTime::SUBGRAPH: {
644                 auto subgraphIndex = from.location.offset;
645                 CHECK_LT(subgraphIndex, mReferencedSubgraphs->size());
646                 to.buffer = reinterpret_cast<uint8_t*>(
647                         const_cast<Model::Subgraph*>(&(*mReferencedSubgraphs)[subgraphIndex]));
648                 to.numberOfUsesLeft = 0;
649             } break;
650             case Operand::LifeTime::POINTER: {
651                 to.buffer = reinterpret_cast<uint8_t*>(
652                         const_cast<void*>(std::get<const void*>(from.location.pointer)));
653                 to.numberOfUsesLeft = 0;
654             } break;
655             case Operand::LifeTime::SUBGRAPH_INPUT:
656             case Operand::LifeTime::SUBGRAPH_OUTPUT:
657             case Operand::LifeTime::NO_VALUE:
658                 to.buffer = nullptr;
659                 to.numberOfUsesLeft = 0;
660                 break;
661         }
662     }
663     return operands;
664 }
665 
updateForArguments(const std::vector<uint32_t> & indexes,const std::vector<Request::Argument> & arguments,const std::vector<RunTimePoolInfo> & requestPoolInfos,RunTimeOperandInfo * operands)666 void CpuExecutor::updateForArguments(const std::vector<uint32_t>& indexes,
667                                      const std::vector<Request::Argument>& arguments,
668                                      const std::vector<RunTimePoolInfo>& requestPoolInfos,
669                                      RunTimeOperandInfo* operands) {
670     CHECK_EQ(indexes.size(), arguments.size());
671     for (size_t i = 0; i < indexes.size(); i++) {
672         const uint32_t operandIndex = indexes[i];
673         const Request::Argument& from = arguments[i];
674         RunTimeOperandInfo& to = operands[operandIndex];
675         if (!from.dimensions.empty()) {
676             // It's the responsibility of the caller to validate that
677             // from.dimensions only modifies the dimensions that were
678             // unspecified in the model.  That's the case in SampleDriver.cpp
679             // with the call to validateRequest().
680             // TODO make sure that's the case for the default CPU path.
681             to.dimensions = from.dimensions;
682         }
683         switch (from.lifetime) {
684             case Request::Argument::LifeTime::NO_VALUE: {
685                 to.lifetime = Operand::LifeTime::NO_VALUE;
686                 CHECK(to.buffer == nullptr);
687                 to.length = 0;
688                 break;
689             }
690             case Request::Argument::LifeTime::POOL: {
691                 auto poolIndex = from.location.poolIndex;
692                 CHECK_LT(poolIndex, requestPoolInfos.size());
693                 auto& r = requestPoolInfos[poolIndex];
694                 to.buffer = r.getBuffer() + from.location.offset;
695                 if (from.location.offset == 0 && from.location.length == 0) {
696                     // Use the entire memory region.
697                     to.length = r.getSize();
698                 } else {
699                     to.length = from.location.length;
700                 }
701                 break;
702             }
703             case Request::Argument::LifeTime::POINTER: {
704                 constexpr auto fn = [](const void* ptr) {
705                     return static_cast<const uint8_t*>(ptr);
706                 };
707                 auto ptr = std::visit(fn, from.location.pointer);
708                 // Writing to a const buffer may lead to undefined behavior.
709                 // TODO: Refactor the code to avoid the const_cast.
710                 to.buffer = const_cast<uint8_t*>(ptr);
711                 to.length = from.location.length;
712                 break;
713             }
714         }
715     }
716 }
717 
executeOperation(const Operation & operation,RunTimeOperandInfo * operands)718 int CpuExecutor::executeOperation([[maybe_unused]] const Operation& operation,
719                                   [[maybe_unused]] RunTimeOperandInfo* operands) {
720 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
721     if (hasDeadlinePassed(mDeadline)) {
722         return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
723     }
724     if (operation.type == OperationType::IF) {
725         int result = executeIfOperation(operation, operands);
726         if (result != ANEURALNETWORKS_NO_ERROR) {
727             LOG(ERROR) << "IF failed.";
728         }
729         return result;
730     }
731     if (operation.type == OperationType::WHILE) {
732         int result = executeWhileOperation(operation, operands);
733         if (result != ANEURALNETWORKS_NO_ERROR) {
734             LOG(ERROR) << "WHILE failed.";
735         }
736         return result;
737     }
738 
739     // VLOG(CPUEXE) << "CpuExecutor::executeOperation(" << operation << ")";
740     const std::vector<uint32_t>& ins = operation.inputs;
741     const std::vector<uint32_t>& outs = operation.outputs;
742     bool success = false;
743     int result = ANEURALNETWORKS_NO_ERROR;
744 
745     // Function to verify that the number of input and output parameters
746     // matches what is expected.  Also checks that all the parameters have
747     // values. This function is to be used only for operations that do not
748     // accept optional arguments.
749     // TODO Have a version that works for optional arguments.
750     auto allParametersPresent = [&operation, &operands, &ins, &outs](size_t requiredIns,
751                                                                      size_t requiredOuts) -> bool {
752         auto verify = [&operation, &operands](size_t requiredCount,
753                                               const std::vector<uint32_t>& indexes,
754                                               const char* type) -> bool {
755             size_t actualCount = indexes.size();
756             if (actualCount != requiredCount) {
757                 LOG(ERROR) << operation.type << ": Invalid number of " << type << " operands. Got "
758                            << actualCount << " of " << requiredCount;
759                 return false;
760             }
761             for (size_t i = 0; i < actualCount; i++) {
762                 if (operands[indexes[i]].lifetime == Operand::LifeTime::NO_VALUE) {
763                     LOG(ERROR) << operation.type << " " << type << " operand " << i
764                                << " is required but missing.";
765                     return false;
766                 }
767             }
768             return true;
769         };
770 
771         auto verifyNoZeroSizedInputs = [&operation,
772                                         &operands](const std::vector<uint32_t>& indexes) {
773             for (size_t i = 0; i < indexes.size(); i++) {
774                 for (size_t j = 0; j < operands[indexes[i]].dimensions.size(); j++) {
775                     if (operands[indexes[i]].dimensions[j] == 0) {
776                         LOG(ERROR) << operation.type
777                                    << " does not support zero-sized tensor, but input " << i
778                                    << " dimension " << j << " is zero.";
779                         return false;
780                     }
781                 }
782             }
783             return true;
784         };
785 
786         return verify(requiredIns, ins, "in") && verify(requiredOuts, outs, "out") &&
787                verifyNoZeroSizedInputs(ins);
788     };
789 
790     switch (operation.type) {
791         case OperationType::OEM_OPERATION: {
792             LOG(ERROR) << "OEM operation not supported for CPU execution";
793             success = false;
794         } break;
795         case OperationType::RESHAPE: {
796             if (!allParametersPresent(2, 1)) {
797                 return ANEURALNETWORKS_BAD_DATA;
798             }
799             const RunTimeOperandInfo& input = operands[ins[0]];
800             const RunTimeOperandInfo& targetShape = operands[ins[1]];
801 
802             RunTimeOperandInfo& output = operands[outs[0]];
803             Shape outShape = output.shape();
804 
805             success = reshapePrepare(input.shape(),
806                                      reinterpret_cast<const int32_t*>(targetShape.buffer),
807                                      getNumberOfElements(targetShape.shape()), &outShape) &&
808                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
809                       copyData(input.buffer, input.shape(), output.buffer, outShape);
810         } break;
811         case OperationType::DEPTH_TO_SPACE: {
812             const size_t inCount = ins.size();
813             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
814                 return ANEURALNETWORKS_BAD_DATA;
815             }
816             const RunTimeOperandInfo& input = operands[ins[0]];
817             int32_t blockSize = getScalarData<int32_t>(operands[ins[1]]);
818             bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
819 
820             RunTimeOperandInfo& output = operands[outs[0]];
821             Shape outShape = output.shape();
822 
823             RunTimeOperandInfo input_tmp, output_tmp;
824             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
825             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
826                 success = false;
827                 break;
828             }
829             output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
830             output_tmp.buffer = data_layout ? nullptr : output.buffer;
831             output_tmp.length = data_layout ? 0 : output.length;
832             if (!depthToSpacePrepare(input_tmp.shape(), blockSize, &outShape) ||
833                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
834                 if (!data_layout) output.dimensions = output_tmp.dimensions;
835                 break;
836             }
837             switch (input_tmp.type) {
838                 case OperandType::TENSOR_FLOAT32: {
839                     success = depthToSpaceGeneric(
840                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
841                             blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
842                     break;
843                 }
844                 case OperandType::TENSOR_FLOAT16: {
845                     success = depthToSpaceGeneric(
846                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
847                             blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
848                     break;
849                 }
850                 case OperandType::TENSOR_QUANT8_ASYMM: {
851                     success = depthToSpaceGeneric(
852                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
853                             blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
854                     break;
855                 }
856                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
857                     success = depthToSpaceGeneric(
858                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
859                             blockSize, reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
860                     break;
861                 }
862                 default: {
863                     LOG(ERROR) << "Unsupported data type";
864                     success = false;
865                 }
866             }
867             if (data_layout) {
868                 output_tmp_guard.reset(output_tmp.buffer);
869             }
870             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
871                 success = false;
872                 break;
873             }
874         } break;
875         case OperationType::SPACE_TO_DEPTH: {
876             const size_t inCount = ins.size();
877             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
878                 return ANEURALNETWORKS_BAD_DATA;
879             }
880             const RunTimeOperandInfo& input = operands[ins[0]];
881             int32_t blockSize = getScalarData<int32_t>(operands[ins[1]]);
882             bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
883 
884             RunTimeOperandInfo& output = operands[outs[0]];
885             Shape outShape = output.shape();
886 
887             RunTimeOperandInfo input_tmp, output_tmp;
888             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
889             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
890                 success = false;
891                 break;
892             }
893             output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
894             output_tmp.buffer = data_layout ? nullptr : output.buffer;
895             output_tmp.length = data_layout ? 0 : output.length;
896 
897             if (!spaceToDepthPrepare(input_tmp.shape(), blockSize, &outShape) ||
898                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
899                 if (!data_layout) output.dimensions = output_tmp.dimensions;
900                 break;
901             }
902             switch (input_tmp.type) {
903                 case OperandType::TENSOR_FLOAT32: {
904                     success = spaceToDepthGeneric(
905                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
906                             blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
907                     break;
908                 }
909                 case OperandType::TENSOR_FLOAT16: {
910                     success = spaceToDepthGeneric(
911                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
912                             blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
913                     break;
914                 }
915                 case OperandType::TENSOR_QUANT8_ASYMM: {
916                     success = spaceToDepthGeneric(
917                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
918                             blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
919                     break;
920                 }
921                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
922                     success = spaceToDepthGeneric(
923                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
924                             blockSize, reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
925                     break;
926                 }
927                 default: {
928                     LOG(ERROR) << "Unsupported data type";
929                     success = false;
930                 }
931             }
932             if (data_layout) {
933                 output_tmp_guard.reset(output_tmp.buffer);
934             }
935             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
936                 success = false;
937                 break;
938             }
939         } break;
940         case OperationType::EMBEDDING_LOOKUP: {
941             if (!allParametersPresent(2, 1)) {
942                 return ANEURALNETWORKS_BAD_DATA;
943             }
944             const RunTimeOperandInfo& values = operands[ins[EmbeddingLookup::kValueTensor]];
945             const RunTimeOperandInfo& lookups = operands[ins[EmbeddingLookup::kLookupTensor]];
946             RunTimeOperandInfo& output = operands[outs[EmbeddingLookup::kOutputTensor]];
947 
948             Shape outputShape;
949             EmbeddingLookup lookup(operation, operands);
950 
951             success = embeddingLookupPrepare(values.shape(), lookups.shape(), &outputShape) &&
952                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lookup.Eval();
953         } break;
954         case OperationType::HASHTABLE_LOOKUP: {
955             if (!allParametersPresent(3, 2)) {
956                 return ANEURALNETWORKS_BAD_DATA;
957             }
958             const RunTimeOperandInfo& lookups = operands[ins[HashtableLookup::kLookupTensor]];
959             const RunTimeOperandInfo& keys = operands[ins[HashtableLookup::kKeyTensor]];
960             const RunTimeOperandInfo& values = operands[ins[HashtableLookup::kValueTensor]];
961 
962             RunTimeOperandInfo& output = operands[outs[HashtableLookup::kOutputTensor]];
963             RunTimeOperandInfo& hits = operands[outs[HashtableLookup::kHitsTensor]];
964 
965             Shape outputShape, hitShape;
966             HashtableLookup lookup(operation, operands);
967 
968             success = hashtableLookupPrepare(lookups.shape(), keys.shape(), values.shape(),
969                                              &outputShape, &hitShape) &&
970                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
971                       setInfoAndAllocateIfNeeded(&hits, hitShape, &result) && lookup.Eval();
972         } break;
973         case OperationType::LSH_PROJECTION: {
974             RunTimeOperandInfo& output = operands[outs[LSHProjection::kOutputTensor]];
975             Shape outputShape;
976             if (!LSHProjection::Prepare(operation, operands, &outputShape) ||
977                 !setInfoAndAllocateIfNeeded(&output, outputShape, &result)) {
978                 break;
979             }
980 
981             LSHProjection lsh(operation, operands);
982             const RunTimeOperandInfo& hash = operands[ins[LSHProjection::kHashTensor]];
983             switch (hash.type) {
984                 case OperandType::TENSOR_FLOAT32: {
985                     success = lsh.Eval<float>();
986                     break;
987                 }
988                 case OperandType::TENSOR_FLOAT16: {
989                     success = lsh.Eval<_Float16>();
990                     break;
991                 }
992                 default: {
993                     success = false;
994                     LOG(ERROR) << "Unsupported data type";
995                 }
996             }
997         } break;
998         case OperationType::BIDIRECTIONAL_SEQUENCE_LSTM: {
999             const auto merge_outputs = getScalarData<bool>(
1000                     operands[ins[BidirectionalSequenceLSTM::kMergeOutputsParam]]);
1001             const bool output_state = (outs.size() == 5 || outs.size() == 6);
1002             RunTimeOperandInfo& fwOutput =
1003                     operands[outs[BidirectionalSequenceLSTM::kFwOutputTensor]];
1004             Shape fwOutputShape, bwOutputShape, fwOutputActivationStateShape,
1005                     fwOutputCellStateShape, bwOutputActivationStateShape, bwOutputCellStateShape;
1006 
1007             BidirectionalSequenceLSTM lstm(operation, operands);
1008             success = lstm.Prepare(operation, operands, &fwOutputShape, &bwOutputShape,
1009                                    &fwOutputActivationStateShape, &fwOutputCellStateShape,
1010                                    &bwOutputActivationStateShape, &bwOutputCellStateShape) &&
1011                       setInfoAndAllocateIfNeeded(&fwOutput, fwOutputShape, &result);
1012             if (!merge_outputs) {
1013                 RunTimeOperandInfo& bwOutput =
1014                         operands[outs[BidirectionalSequenceLSTM::kBwOutputTensor]];
1015                 success = success && setInfoAndAllocateIfNeeded(&bwOutput, bwOutputShape, &result);
1016             }
1017             if (output_state) {
1018                 uint32_t delta = merge_outputs ? 1 : 0;
1019                 RunTimeOperandInfo& fwOutputActivationState =
1020                         operands[outs[BidirectionalSequenceLSTM::kFwOutputActivationStateTensor -
1021                                       delta]];
1022                 RunTimeOperandInfo& fwOutputCellState =
1023                         operands[outs[BidirectionalSequenceLSTM::kFwOutputCellStateTensor - delta]];
1024                 RunTimeOperandInfo& bwOutputActivationState =
1025                         operands[outs[BidirectionalSequenceLSTM::kBwOutputActivationStateTensor -
1026                                       delta]];
1027                 RunTimeOperandInfo& bwOutputCellState =
1028                         operands[outs[BidirectionalSequenceLSTM::kBwOutputCellStateTensor - delta]];
1029                 success = success &&
1030                           setInfoAndAllocateIfNeeded(&fwOutputActivationState,
1031                                                      fwOutputActivationStateShape, &result) &&
1032                           setInfoAndAllocateIfNeeded(&fwOutputCellState, fwOutputCellStateShape,
1033                                                      &result) &&
1034                           setInfoAndAllocateIfNeeded(&bwOutputActivationState,
1035                                                      bwOutputActivationStateShape, &result) &&
1036                           setInfoAndAllocateIfNeeded(&bwOutputCellState, bwOutputCellStateShape,
1037                                                      &result);
1038             }
1039             success = success && lstm.Eval();
1040         } break;
1041         case OperationType::LSTM: {
1042             RunTimeOperandInfo& scratch = operands[outs[LSTMCell::kScratchBufferTensor]];
1043             RunTimeOperandInfo& outputStateOut = operands[outs[LSTMCell::kOutputStateOutTensor]];
1044             RunTimeOperandInfo& cellStateOut = operands[outs[LSTMCell::kCellStateOutTensor]];
1045             RunTimeOperandInfo& output = operands[outs[LSTMCell::kOutputTensor]];
1046 
1047             Shape scratchShape, outputStateShape, cellStateShape, outputShape;
1048             LSTMCell lstm_cell(operation, operands);
1049 
1050             success = lstm_cell.Prepare(operation, operands, &scratchShape, &outputStateShape,
1051                                         &cellStateShape, &outputShape) &&
1052                       setInfoAndAllocateIfNeeded(&scratch, scratchShape, &result) &&
1053                       setInfoAndAllocateIfNeeded(&outputStateOut, outputStateShape, &result) &&
1054                       setInfoAndAllocateIfNeeded(&cellStateOut, cellStateShape, &result) &&
1055                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lstm_cell.Eval();
1056         } break;
1057         case OperationType::RANDOM_MULTINOMIAL: {
1058             if (!allParametersPresent(3, 1)) {
1059                 return ANEURALNETWORKS_BAD_DATA;
1060             }
1061             RunTimeOperandInfo& output = operands[outs[Multinomial::kOutputTensor]];
1062 
1063             Shape outputShape;
1064             Multinomial multinomial(operation, operands);
1065 
1066             success = Multinomial::Prepare(operation, operands, &outputShape) &&
1067                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1068                       multinomial.Eval();
1069         } break;
1070         case OperationType::RNN: {
1071             if (!allParametersPresent(6, 2)) {
1072                 return ANEURALNETWORKS_BAD_DATA;
1073             }
1074 
1075             RunTimeOperandInfo& hiddenStateOut = operands[outs[RNN::kHiddenStateOutTensor]];
1076             RunTimeOperandInfo& output = operands[outs[RNN::kOutputTensor]];
1077 
1078             Shape hiddenStateShape, outputShape;
1079             RNN rnn_cell(operation, operands);
1080 
1081             success = RNN::Prepare(operation, operands, &hiddenStateShape, &outputShape) &&
1082                       setInfoAndAllocateIfNeeded(&hiddenStateOut, hiddenStateShape, &result) &&
1083                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && rnn_cell.Eval();
1084         } break;
1085         case OperationType::SVDF: {
1086             RunTimeOperandInfo& stateOut = operands[outs[SVDF::kStateOutTensor]];
1087             RunTimeOperandInfo& output = operands[outs[SVDF::kOutputTensor]];
1088 
1089             Shape stateShape, outputShape;
1090             SVDF svdf(operation, operands);
1091 
1092             success = SVDF::Prepare(operation, operands, &stateShape, &outputShape) &&
1093                       setInfoAndAllocateIfNeeded(&stateOut, stateShape, &result) &&
1094                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && svdf.Eval();
1095         } break;
1096         case OperationType::BATCH_TO_SPACE_ND: {
1097             const size_t inCount = ins.size();
1098             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
1099                 return ANEURALNETWORKS_BAD_DATA;
1100             }
1101             const RunTimeOperandInfo& input = operands[ins[0]];
1102             const RunTimeOperandInfo& blockSize = operands[ins[1]];
1103             bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
1104 
1105             RunTimeOperandInfo& output = operands[outs[0]];
1106             Shape outShape = output.shape();
1107 
1108             RunTimeOperandInfo input_tmp, output_tmp;
1109             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1110             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1111                 success = false;
1112                 break;
1113             }
1114             output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
1115             output_tmp.buffer = data_layout ? nullptr : output.buffer;
1116             output_tmp.length = data_layout ? 0 : output.length;
1117 
1118             if (!batchToSpacePrepare(input_tmp.shape(),
1119                                      reinterpret_cast<const int32_t*>(blockSize.buffer),
1120                                      blockSize.shape(), &outShape) ||
1121                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1122                 if (!data_layout) output.dimensions = output_tmp.dimensions;
1123                 break;
1124             }
1125             switch (input_tmp.type) {
1126                 case OperandType::TENSOR_FLOAT32: {
1127                     success = batchToSpaceGeneric(
1128                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1129                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1130                             reinterpret_cast<float*>(output_tmp.buffer), outShape);
1131                     break;
1132                 }
1133                 case OperandType::TENSOR_FLOAT16: {
1134                     success = batchToSpaceGeneric(
1135                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1136                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1137                             reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1138                     break;
1139                 }
1140                 case OperandType::TENSOR_QUANT8_ASYMM: {
1141                     success = batchToSpaceGeneric(
1142                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1143                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1144                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1145                     break;
1146                 }
1147                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1148                     success = batchToSpaceGeneric(
1149                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1150                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1151                             reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1152                     break;
1153                 }
1154                 default: {
1155                     LOG(ERROR) << "Unsupported data type";
1156                     success = false;
1157                 }
1158             }
1159             if (data_layout) {
1160                 output_tmp_guard.reset(output_tmp.buffer);
1161             }
1162             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1163                 success = false;
1164                 break;
1165             }
1166         } break;
1167         case OperationType::SPACE_TO_BATCH_ND: {
1168             const size_t inCount = ins.size();
1169             if ((inCount != 4 && inCount != 3) || !allParametersPresent(inCount, 1)) {
1170                 return ANEURALNETWORKS_BAD_DATA;
1171             }
1172             const RunTimeOperandInfo& input = operands[ins[0]];
1173             const RunTimeOperandInfo& blockSize = operands[ins[1]];
1174             const RunTimeOperandInfo& paddings = operands[ins[2]];
1175             bool data_layout = inCount == 4 ? getScalarData<bool>(operands[ins[3]]) : false;
1176 
1177             RunTimeOperandInfo& output = operands[outs[0]];
1178             Shape outShape = output.shape();
1179 
1180             RunTimeOperandInfo input_tmp, output_tmp;
1181             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1182             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1183                 success = false;
1184                 break;
1185             }
1186             output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
1187             output_tmp.buffer = data_layout ? nullptr : output.buffer;
1188             output_tmp.length = data_layout ? 0 : output.length;
1189 
1190             if (!spaceToBatchPrepare(
1191                         input_tmp.shape(), reinterpret_cast<const int32_t*>(blockSize.buffer),
1192                         blockSize.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1193                         paddings.shape(), &outShape) ||
1194                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1195                 if (!data_layout) output.dimensions = output_tmp.dimensions;
1196                 break;
1197             }
1198             switch (input_tmp.type) {
1199                 case OperandType::TENSOR_FLOAT32: {
1200                     success = spaceToBatchGeneric(
1201                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1202                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1203                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1204                             reinterpret_cast<float*>(output_tmp.buffer), outShape);
1205                     break;
1206                 }
1207                 case OperandType::TENSOR_FLOAT16: {
1208                     success = spaceToBatchGeneric(
1209                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1210                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1211                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1212                             reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1213                     break;
1214                 }
1215                 case OperandType::TENSOR_QUANT8_ASYMM: {
1216                     success = spaceToBatchGeneric(
1217                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1218                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1219                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1220                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1221                     break;
1222                 }
1223                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1224                     success = spaceToBatchGeneric(
1225                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1226                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1227                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1228                             reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1229                     break;
1230                 }
1231                 default: {
1232                     LOG(ERROR) << "Unsupported data type";
1233                     success = false;
1234                 }
1235             }
1236             if (data_layout) {
1237                 output_tmp_guard.reset(output_tmp.buffer);
1238             }
1239             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1240                 success = false;
1241                 break;
1242             }
1243         } break;
1244         case OperationType::PAD:
1245         case OperationType::PAD_V2: {
1246             const bool isV2 = operation.type == OperationType::PAD_V2;
1247             if (!allParametersPresent(isV2 ? 3 : 2, 1)) {
1248                 return ANEURALNETWORKS_BAD_DATA;
1249             }
1250             const RunTimeOperandInfo& input = operands[ins[0]];
1251             const RunTimeOperandInfo& paddings = operands[ins[1]];
1252 
1253             RunTimeOperandInfo& output = operands[outs[0]];
1254             Shape outShape = output.shape();
1255 
1256             if (!padPrepare(input.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1257                             paddings.shape(), &outShape) ||
1258                 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1259                 break;
1260             }
1261             if (input.type == OperandType::TENSOR_FLOAT32) {
1262                 float pad_value = isV2 ? getScalarData<float>(operands[ins[2]]) : 0;
1263                 success = padGeneric(reinterpret_cast<const float*>(input.buffer), input.shape(),
1264                                      reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1265                                      reinterpret_cast<float*>(output.buffer), outShape);
1266             } else if (input.type == OperandType::TENSOR_FLOAT16) {
1267                 _Float16 pad_value = isV2 ? getScalarData<_Float16>(operands[ins[2]]) : 0;
1268                 success = padGeneric(reinterpret_cast<const _Float16*>(input.buffer), input.shape(),
1269                                      reinterpret_cast<const int32_t*>(paddings.buffer),
1270                                      static_cast<_Float16>(pad_value),
1271                                      reinterpret_cast<_Float16*>(output.buffer), outShape);
1272             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1273                 uint8_t pad_value =
1274                         isV2 ? getScalarData<uint8_t>(operands[ins[2]]) : outShape.offset;
1275                 success = padGeneric(input.buffer, input.shape(),
1276                                      reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1277                                      output.buffer, outShape);
1278             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1279                 uint8_t pad_value =
1280                         isV2 ? getScalarData<int8_t>(operands[ins[2]]) : outShape.offset;
1281                 success = padGeneric(input.buffer, input.shape(),
1282                                      reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1283                                      output.buffer, outShape);
1284             }
1285         } break;
1286         case OperationType::CAST: {
1287             if (!allParametersPresent(1, 1)) {
1288                 return ANEURALNETWORKS_BAD_DATA;
1289             }
1290             const RunTimeOperandInfo& input = operands[ins[0]];
1291 
1292             RunTimeOperandInfo& output = operands[outs[0]];
1293             Shape outShape = output.shape();
1294 
1295             success = cast::prepare(input.shape(), &outShape) &&
1296                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1297                       cast::eval(input.buffer, input.shape(), output.buffer, outShape);
1298         } break;
1299         case OperationType::MEAN: {
1300             if (!allParametersPresent(3, 1)) {
1301                 return ANEURALNETWORKS_BAD_DATA;
1302             }
1303             const RunTimeOperandInfo& input = operands[ins[0]];
1304             const RunTimeOperandInfo& axis = operands[ins[1]];
1305             int32_t keepDims = getScalarData<int32_t>(operands[ins[2]]);
1306 
1307             RunTimeOperandInfo& output = operands[outs[0]];
1308             Shape outShape = output.shape();
1309 
1310             if (!meanPrepare(input.shape(), reinterpret_cast<const int32_t*>(axis.buffer),
1311                              axis.shape(), keepDims > 0, &outShape) ||
1312                 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1313                 break;
1314             }
1315             if (input.type == OperandType::TENSOR_FLOAT16) {
1316                 success = meanFloat16(reinterpret_cast<_Float16*>(input.buffer), input.shape(),
1317                                       reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(),
1318                                       keepDims > 0, reinterpret_cast<_Float16*>(output.buffer),
1319                                       outShape);
1320             } else if (input.type == OperandType::TENSOR_FLOAT32) {
1321                 success = meanGeneric<float, float>(
1322                         reinterpret_cast<float*>(input.buffer), input.shape(),
1323                         reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1324                         reinterpret_cast<float*>(output.buffer), outShape);
1325             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1326                 success = meanGeneric<uint8_t, int32_t>(
1327                         reinterpret_cast<uint8_t*>(input.buffer), input.shape(),
1328                         reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1329                         reinterpret_cast<uint8_t*>(output.buffer), outShape);
1330             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1331                 success = meanGeneric<int8_t, int32_t>(
1332                         reinterpret_cast<int8_t*>(input.buffer), input.shape(),
1333                         reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1334                         reinterpret_cast<int8_t*>(output.buffer), outShape);
1335             }
1336         } break;
1337         case OperationType::ARGMAX:
1338         case OperationType::ARGMIN: {
1339             if (!allParametersPresent(2, 1)) {
1340                 return ANEURALNETWORKS_BAD_DATA;
1341             }
1342             const RunTimeOperandInfo& input = operands[ins[0]];
1343             int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1344 
1345             RunTimeOperandInfo& output = operands[outs[0]];
1346             Shape outShape = output.shape();
1347 
1348             const bool isArgMin = operation.type == OperationType::ARGMIN;
1349             success = argMinMaxPrepare(input.shape(), axis, &outShape) &&
1350                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1351                       argMinMaxGeneric(input.buffer, input.shape(), axis, isArgMin, output.buffer,
1352                                        outShape);
1353         } break;
1354         case OperationType::EXPAND_DIMS: {
1355             if (!allParametersPresent(2, 1)) {
1356                 return ANEURALNETWORKS_BAD_DATA;
1357             }
1358             const RunTimeOperandInfo& input = operands[ins[0]];
1359             int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1360 
1361             RunTimeOperandInfo& output = operands[outs[0]];
1362             Shape outShape = output.shape();
1363 
1364             success = expand_dims::prepare(input.shape(), axis, &outShape) &&
1365                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1366                       expand_dims::eval(input.buffer, input.shape(), axis, output.buffer, outShape);
1367         } break;
1368         case OperationType::SPLIT: {
1369             const size_t outCount = outs.size();
1370             if (!allParametersPresent(3, outCount)) {
1371                 return ANEURALNETWORKS_BAD_DATA;
1372             }
1373 
1374             const RunTimeOperandInfo& input = operands[ins[0]];
1375             const int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1376             const int32_t numOutputs = getScalarData<int32_t>(operands[ins[2]]);
1377 
1378             if (static_cast<size_t>(numOutputs) != outs.size()) {
1379                 return ANEURALNETWORKS_BAD_DATA;
1380             }
1381 
1382             std::vector<Shape> outputShapes(numOutputs);
1383             for (int i = 0; i < numOutputs; ++i) {
1384                 outputShapes[i] = operands[outs[i]].shape();
1385             }
1386 
1387             success = splitPrepare(input.shape(), axis, numOutputs, &outputShapes);
1388             for (int i = 0; i < numOutputs; ++i) {
1389                 success = success && setInfoAndAllocateIfNeeded(&(operands[outs[i]]),
1390                                                                 outputShapes[i], &result);
1391             }
1392             switch (input.type) {
1393                 case OperandType::TENSOR_FLOAT16: {
1394                     std::vector<_Float16*> outputDataPtrs(numOutputs);
1395                     for (int i = 0; i < numOutputs; ++i) {
1396                         outputDataPtrs[i] = reinterpret_cast<_Float16*>(operands[outs[i]].buffer);
1397                     }
1398                     success = success &&
1399                               splitFloat16(reinterpret_cast<const _Float16*>(input.buffer),
1400                                            input.shape(), axis, &outputDataPtrs, outputShapes);
1401                 } break;
1402                 case OperandType::TENSOR_FLOAT32: {
1403                     std::vector<float*> outputDataPtrs(numOutputs);
1404                     for (int i = 0; i < numOutputs; ++i) {
1405                         outputDataPtrs[i] = reinterpret_cast<float*>(operands[outs[i]].buffer);
1406                     }
1407                     success = success &&
1408                               splitFloat32(reinterpret_cast<const float*>(input.buffer),
1409                                            input.shape(), axis, &outputDataPtrs, outputShapes);
1410                 } break;
1411                 case OperandType::TENSOR_INT32: {
1412                     std::vector<int32_t*> outputDataPtrs(numOutputs);
1413                     for (int i = 0; i < numOutputs; ++i) {
1414                         outputDataPtrs[i] = reinterpret_cast<int32_t*>(operands[outs[i]].buffer);
1415                     }
1416                     success = success &&
1417                               splitInt32(reinterpret_cast<const int32_t*>(input.buffer),
1418                                          input.shape(), axis, &outputDataPtrs, outputShapes);
1419                 } break;
1420                 case OperandType::TENSOR_QUANT8_ASYMM: {
1421                     std::vector<uint8_t*> outputDataPtrs(numOutputs);
1422                     for (int i = 0; i < numOutputs; ++i) {
1423                         outputDataPtrs[i] = reinterpret_cast<uint8_t*>(operands[outs[i]].buffer);
1424                     }
1425                     success = success &&
1426                               splitQuant8(reinterpret_cast<const uint8_t*>(input.buffer),
1427                                           input.shape(), axis, &outputDataPtrs, outputShapes);
1428                 } break;
1429                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1430                     std::vector<int8_t*> outputDataPtrs(numOutputs);
1431                     for (int i = 0; i < numOutputs; ++i) {
1432                         outputDataPtrs[i] = reinterpret_cast<int8_t*>(operands[outs[i]].buffer);
1433                     }
1434                     success = success &&
1435                               splitQuant8Signed(reinterpret_cast<const int8_t*>(input.buffer),
1436                                                 input.shape(), axis, &outputDataPtrs, outputShapes);
1437                 } break;
1438                 default: {
1439                     return ANEURALNETWORKS_BAD_DATA;
1440                 }
1441             }
1442         } break;
1443         case OperationType::MAXIMUM:
1444         case OperationType::MINIMUM: {
1445             if (!allParametersPresent(2, 1)) {
1446                 return ANEURALNETWORKS_BAD_DATA;
1447             }
1448             const RunTimeOperandInfo& in1 = operands[ins[0]];
1449             const RunTimeOperandInfo& in2 = operands[ins[1]];
1450 
1451             RunTimeOperandInfo& output = operands[outs[0]];
1452             Shape outputShape = output.shape();
1453 
1454             const bool isMinimum = operation.type == OperationType::MINIMUM;
1455             success = maximum_minimum::prepare(in1.shape(), in2.shape(), &outputShape) &&
1456                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1457                       maximum_minimum::eval(in1.buffer, in1.shape(), in2.buffer, in2.shape(),
1458                                             isMinimum, output.buffer, outputShape);
1459         } break;
1460         case OperationType::GROUPED_CONV_2D: {
1461             const size_t inCount = ins.size();
1462             if ((inCount != 12 && inCount != 9) || !allParametersPresent(inCount, 1)) {
1463                 return ANEURALNETWORKS_BAD_DATA;
1464             }
1465             const RunTimeOperandInfo& input = operands[ins[0]];
1466             const RunTimeOperandInfo& filter = operands[ins[1]];
1467             const RunTimeOperandInfo& bias = operands[ins[2]];
1468 
1469             int32_t padding_left, padding_right;
1470             int32_t padding_top, padding_bottom;
1471             int32_t padding_implicit = 0;
1472             int32_t stride_width, stride_height;
1473             int32_t numGroups;
1474             int32_t activation;
1475             bool data_layout = false;
1476 
1477             if (inCount == 12) {
1478                 padding_left = getScalarData<int32_t>(operands[ins[3]]);
1479                 padding_right = getScalarData<int32_t>(operands[ins[4]]);
1480                 padding_top = getScalarData<int32_t>(operands[ins[5]]);
1481                 padding_bottom = getScalarData<int32_t>(operands[ins[6]]);
1482                 stride_width = getScalarData<int32_t>(operands[ins[7]]);
1483                 stride_height = getScalarData<int32_t>(operands[ins[8]]);
1484                 numGroups = getScalarData<int32_t>(operands[ins[9]]);
1485                 activation = getScalarData<int32_t>(operands[ins[10]]);
1486                 data_layout = getScalarData<bool>(operands[ins[11]]);
1487             } else {
1488                 padding_implicit = getScalarData<int32_t>(operands[ins[3]]);
1489                 stride_width = getScalarData<int32_t>(operands[ins[4]]);
1490                 stride_height = getScalarData<int32_t>(operands[ins[5]]);
1491                 numGroups = getScalarData<int32_t>(operands[ins[6]]);
1492                 activation = getScalarData<int32_t>(operands[ins[7]]);
1493                 data_layout = getScalarData<bool>(operands[ins[8]]);
1494             }
1495 
1496             RunTimeOperandInfo& output = operands[outs[0]];
1497             Shape outShape = output.shape();
1498 
1499             RunTimeOperandInfo input_tmp, output_tmp;
1500             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1501             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1502                 success = false;
1503                 break;
1504             }
1505             output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
1506             output_tmp.buffer = data_layout ? nullptr : output.buffer;
1507             output_tmp.length = data_layout ? 0 : output.length;
1508 
1509             if (inCount == 9) {
1510                 Shape inputShape = input_tmp.shape();
1511                 Shape filterShape = filter.shape();
1512                 int32_t input_width = getSizeOfDimension(inputShape, 2);
1513                 int32_t input_height = getSizeOfDimension(inputShape, 1);
1514                 int32_t filter_width = getSizeOfDimension(filterShape, 2);
1515                 int32_t filter_height = getSizeOfDimension(filterShape, 1);
1516                 calculateExplicitPadding(input_width, stride_width, filter_width, padding_implicit,
1517                                          &padding_left, &padding_right);
1518                 calculateExplicitPadding(input_height, stride_height, filter_height,
1519                                          padding_implicit, &padding_top, &padding_bottom);
1520             }
1521 
1522             if (!groupedConvPrepare(input_tmp.shape(), filter.shape(), bias.shape(), padding_left,
1523                                     padding_right, padding_top, padding_bottom, stride_width,
1524                                     stride_height, numGroups, &outShape) ||
1525                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1526                 if (!data_layout) output.dimensions = output_tmp.dimensions;
1527                 success = false;
1528                 break;
1529             }
1530 
1531             if (input_tmp.type == OperandType::TENSOR_FLOAT32) {
1532                 success = groupedConvFloat32(
1533                         reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1534                         reinterpret_cast<const float*>(filter.buffer), filter.shape(),
1535                         reinterpret_cast<const float*>(bias.buffer), bias.shape(), padding_left,
1536                         padding_right, padding_top, padding_bottom, stride_width, stride_height,
1537                         numGroups, activation, reinterpret_cast<float*>(output_tmp.buffer),
1538                         outShape);
1539             } else if (input_tmp.type == OperandType::TENSOR_FLOAT16) {
1540                 success = groupedConvFloat16(
1541                         reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1542                         reinterpret_cast<const _Float16*>(filter.buffer), filter.shape(),
1543                         reinterpret_cast<const _Float16*>(bias.buffer), bias.shape(), padding_left,
1544                         padding_right, padding_top, padding_bottom, stride_width, stride_height,
1545                         numGroups, activation, reinterpret_cast<_Float16*>(output_tmp.buffer),
1546                         outShape);
1547             } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM) {
1548                 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
1549                     success = groupedConvQuant8PerChannel(
1550                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1551                             reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1552                             std::get<Operand::SymmPerChannelQuantParams>(filter.extraParams)
1553                                     .scales.data(),
1554                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1555                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1556                             stride_height, numGroups, activation,
1557                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1558                 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM) {
1559                     success = groupedConvQuant8(
1560                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1561                             reinterpret_cast<const uint8_t*>(filter.buffer), filter.shape(),
1562                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1563                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1564                             stride_height, numGroups, activation,
1565                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1566                 }
1567             } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1568                 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
1569                     success = groupedConvQuant8PerChannel(
1570                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1571                             reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1572                             std::get<Operand::SymmPerChannelQuantParams>(filter.extraParams)
1573                                     .scales.data(),
1574                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1575                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1576                             stride_height, numGroups, activation,
1577                             reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1578                 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1579                     success = groupedConvQuant8(
1580                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1581                             reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1582                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1583                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1584                             stride_height, numGroups, activation,
1585                             reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1586                 }
1587             }
1588 
1589             if (data_layout) {
1590                 output_tmp_guard.reset(output_tmp.buffer);
1591             }
1592             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1593                 success = false;
1594                 break;
1595             }
1596         } break;
1597         case OperationType::TILE: {
1598             if (!allParametersPresent(2, 1)) {
1599                 return ANEURALNETWORKS_BAD_DATA;
1600             }
1601             const RunTimeOperandInfo& input = operands[ins[0]];
1602             const RunTimeOperandInfo& multiples = operands[ins[1]];
1603 
1604             RunTimeOperandInfo& output = operands[outs[0]];
1605             Shape outShape = output.shape();
1606 
1607             success =
1608                     tile::prepare(input.shape(), reinterpret_cast<const int32_t*>(multiples.buffer),
1609                                   multiples.shape(), &outShape) &&
1610                     setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1611                     tile::eval(input.buffer, input.shape(),
1612                                reinterpret_cast<const int32_t*>(multiples.buffer), output.buffer,
1613                                outShape);
1614         } break;
1615         case OperationType::QUANTIZED_16BIT_LSTM: {
1616             if (!allParametersPresent(15, 2)) {
1617                 return ANEURALNETWORKS_BAD_DATA;
1618             }
1619 
1620             RunTimeOperandInfo& cellStateOut =
1621                     operands[outs[QuantizedLSTMCell::kCellStateOutTensor]];
1622             RunTimeOperandInfo& output = operands[outs[QuantizedLSTMCell::kOutputTensor]];
1623 
1624             Shape cellStateOutShape, outputShape;
1625             QuantizedLSTMCell quantizedLSTMCell(operation, operands);
1626 
1627             success = QuantizedLSTMCell::prepare(operation, operands, &cellStateOutShape,
1628                                                  &outputShape) &&
1629                       setInfoAndAllocateIfNeeded(&cellStateOut, cellStateOutShape, &result) &&
1630                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1631                       quantizedLSTMCell.eval();
1632         } break;
1633         case OperationType::POW: {
1634             if (!allParametersPresent(2, 1)) {
1635                 return ANEURALNETWORKS_BAD_DATA;
1636             }
1637             const RunTimeOperandInfo& base = operands[ins[0]];
1638             const RunTimeOperandInfo& exponent = operands[ins[1]];
1639 
1640             RunTimeOperandInfo& output = operands[outs[0]];
1641             Shape outShape = output.shape();
1642 
1643             success = pow::prepare(base.shape(), exponent.shape(), &outShape) &&
1644                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1645                       pow::eval(base.buffer, base.shape(), exponent.buffer, exponent.shape(),
1646                                 output.buffer, outShape);
1647         } break;
1648         default: {
1649             const OperationRegistration* operationRegistration =
1650                     mOperationResolver->findOperation(operation.type);
1651             if (operationRegistration == nullptr) {
1652                 LOG(ERROR) << operation.type << " not registered";
1653             } else if (operationRegistration->prepare == nullptr ||
1654                        operationRegistration->execute == nullptr) {
1655                 LOG(ERROR) << "Incomplete operation registration: " << operation.type;
1656             } else {
1657                 OperationExecutionContext context(&operation, operands);
1658                 success = operationRegistration->flags.allowOmittedOperand ||
1659                           context.checkNoOmittedOperand();
1660                 success = success && (operationRegistration->flags.allowZeroSizedInput ||
1661                                       context.checkNoZeroSizedInput());
1662                 success = success && operationRegistration->prepare(&context) &&
1663                           operationRegistration->execute(&context);
1664                 result = context.getResultCode();
1665             }
1666         }
1667     }
1668     if (!success && result == ANEURALNETWORKS_NO_ERROR) {
1669         result = ANEURALNETWORKS_OP_FAILED;
1670     }
1671     if (result != ANEURALNETWORKS_NO_ERROR) {
1672         LOG(ERROR) << operation.type << " failed.";
1673     }
1674 
1675     consumeOperationInputs(ins, operands);
1676     return result;
1677 #else
1678     LOG(ERROR) << "Built without CPU execution support";
1679     return ANEURALNETWORKS_OP_FAILED;
1680 #endif  // NN_INCLUDE_CPU_IMPLEMENTATION
1681 }
1682 
1683 // Copies RunTimeOperandInfo, preserving the original lifetime and numberOfUsesLeft
1684 // to prevent deallocation of subgraph inputs and outputs.
setInfoExceptLifetime(RunTimeOperandInfo * to,const RunTimeOperandInfo & from)1685 static void setInfoExceptLifetime(RunTimeOperandInfo* to, const RunTimeOperandInfo& from) {
1686     auto originalLifetime = to->lifetime;
1687     auto originalNumberOfUsesLeft = to->numberOfUsesLeft;
1688     *to = from;
1689     to->lifetime = originalLifetime;
1690     to->numberOfUsesLeft = originalNumberOfUsesLeft;
1691 }
1692 
executeIfOperation(const Operation & operation,RunTimeOperandInfo * operands)1693 int CpuExecutor::executeIfOperation(const Operation& operation, RunTimeOperandInfo* operands) {
1694     namespace op = operation_if;
1695     const RunTimeOperandInfo& condOperand = operands[operation.inputs[op::kCondBoolOperand]];
1696     if (condOperand.buffer == nullptr) {
1697         LOG(ERROR) << "Cannot read IF condition operand value";
1698         return ANEURALNETWORKS_OP_FAILED;
1699     }
1700     const bool condValue = *reinterpret_cast<const bool8*>(condOperand.buffer);
1701     VLOG(CPUEXE) << "CpuExecutor::executeIfOperation: condition value: " << condValue;
1702 
1703     const uint32_t branchInputIndex = condValue ? op::kThenModelOperand : op::kElseModelOperand;
1704     const RunTimeOperandInfo& branchOperand = operands[operation.inputs[branchInputIndex]];
1705     const Model::Subgraph& branchSubgraph =
1706             *reinterpret_cast<const Model::Subgraph*>(branchOperand.buffer);
1707     std::vector<RunTimeOperandInfo> branchOperands = initializeRunTimeInfo(branchSubgraph);
1708 
1709     // Initialize inner input and output operands from outer operands.
1710     for (uint32_t i = 0, n = branchSubgraph.inputIndexes.size(); i < n; ++i) {
1711         setInfoExceptLifetime(&branchOperands[branchSubgraph.inputIndexes[i]],
1712                               operands[operation.inputs[op::kFirstInput + i]]);
1713     }
1714     for (uint32_t i = 0, n = branchSubgraph.outputIndexes.size(); i < n; ++i) {
1715         setInfoExceptLifetime(&branchOperands[branchSubgraph.outputIndexes[i]],
1716                               operands[operation.outputs[i]]);
1717     }
1718 
1719     NN_RETURN_IF_ERROR(executeSubgraph(branchSubgraph, branchOperands.data()));
1720     freeUnusedSubgraphOperands(&branchOperands);
1721 
1722     // Update outer outputs.
1723     for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1724         setInfoExceptLifetime(&operands[operation.outputs[i]],
1725                               branchOperands[branchSubgraph.outputIndexes[i]]);
1726     }
1727 
1728     consumeOperationInputs(operation.inputs, operands);
1729     return ANEURALNETWORKS_NO_ERROR;
1730 }
1731 
executeWhileOperation(const Operation & operation,RunTimeOperandInfo * operands)1732 int CpuExecutor::executeWhileOperation(const Operation& operation, RunTimeOperandInfo* operands) {
1733     namespace op = operation_while;
1734     const RunTimeOperandInfo& condModelOperand = operands[operation.inputs[op::kCondModelOperand]];
1735     const RunTimeOperandInfo& bodyModelOperand = operands[operation.inputs[op::kBodyModelOperand]];
1736     const Model::Subgraph& condSubgraph =
1737             *reinterpret_cast<const Model::Subgraph*>(condModelOperand.buffer);
1738     const Model::Subgraph& bodySubgraph =
1739             *reinterpret_cast<const Model::Subgraph*>(bodyModelOperand.buffer);
1740     std::vector<RunTimeOperandInfo> condOperands = initializeRunTimeInfo(condSubgraph);
1741     std::vector<RunTimeOperandInfo> bodyOperands = initializeRunTimeInfo(bodySubgraph);
1742 
1743     // The code below implements the following sequence of subgraph input and output buffer
1744     // assignments:
1745     // iteration = 0   cond inputs = body inputs = outer inputs   body outputs = tmp1
1746     // iteration = 1   cond inputs = body inputs = tmp1           body outputs = tmp2
1747     // iteration = 2   cond inputs = body inputs = tmp2           body outputs = tmp1
1748     // iteration = 3   cond inputs = body inputs = ...            body outputs = ...
1749 
1750     // For body output double buffering.
1751     std::vector<uint8_t*> tmp1(bodySubgraph.outputIndexes.size());
1752     std::vector<uint8_t*> tmp2(bodySubgraph.outputIndexes.size());
1753 
1754     // Ensure objects are freed
1755     auto cleanupGuard = base::make_scope_guard(
1756             [&tmp1, &tmp2, &condOperands, &bodyOperands, &operation, &operands] {
1757                 auto freeLoopOutputs = [](const std::vector<uint8_t*>& tmp) {
1758                     for (auto buffer : tmp) {
1759                         if (buffer != nullptr) {
1760                             delete[] buffer;
1761                         }
1762                     }
1763                 };
1764 
1765                 freeLoopOutputs(tmp1);
1766                 freeLoopOutputs(tmp2);
1767                 freeUnusedSubgraphOperands(&condOperands);
1768                 freeUnusedSubgraphOperands(&bodyOperands);
1769                 consumeOperationInputs(operation.inputs, operands);
1770             });
1771 
1772     // For body outputs with unknown shape, we skip double buffering and
1773     // allocate on each iteration instead. This allows growing output tensors
1774     // inside a WHILE loop.
1775     std::vector<bool> bodyOutputHasUnknownShape(bodySubgraph.outputIndexes.size());
1776     for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1777         const Operand& operand = bodySubgraph.operands[bodySubgraph.outputIndexes[i]];
1778         bodyOutputHasUnknownShape[i] = nonExtensionOperandSizeOfData(operand) == 0;
1779     }
1780 
1781     // Initialize condition inputs from outer operands.
1782     for (uint32_t i = 0, n = condSubgraph.inputIndexes.size(); i < n; ++i) {
1783         setInfoExceptLifetime(&condOperands[condSubgraph.inputIndexes[i]],
1784                               operands[operation.inputs[op::kFirstInput + i]]);
1785     }
1786 
1787     // Store condition output on the stack.
1788     RunTimeOperandInfo& condOutput = condOperands[condSubgraph.outputIndexes[0]];
1789     bool8 condValue = {/* initialized memory */};
1790     condOutput.buffer = &condValue;
1791     condOutput.length = sizeof(condValue);
1792 
1793     std::chrono::nanoseconds timeoutDuration(mLoopTimeoutDuration);
1794     const auto startTime = Clock::now();
1795     for (uint32_t iteration = 0;; ++iteration) {
1796         VLOG(CPUEXE) << "CpuExecutor::executeWhileOperation: iteration " << iteration;
1797         if (iteration != 0) {
1798             // Set condition inputs from previous iteration outputs.
1799             for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1800                 setInfoExceptLifetime(&condOperands[condSubgraph.inputIndexes[i]],
1801                                       bodyOperands[bodySubgraph.outputIndexes[i]]);
1802             }
1803         }
1804         NN_RETURN_IF_ERROR(executeSubgraph(condSubgraph, condOperands.data()));
1805         VLOG(CPUEXE) << "CpuExecutor::executeWhileOperation: condition value: "
1806                      << static_cast<int>(condValue);
1807         if (!condValue) {
1808             break;
1809         }
1810 
1811         const auto duration = Clock::now() - startTime;
1812         if (duration > timeoutDuration) {
1813             LOG(ERROR) << "CpuExecutor::executeWhileOperation: timed out after "
1814                        << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
1815                        << " ms";
1816             return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
1817         }
1818 
1819         // Set body inputs from condition inputs.
1820         for (uint32_t i = 0, n = bodySubgraph.inputIndexes.size(); i < n; ++i) {
1821             bodyOperands[bodySubgraph.inputIndexes[i]] = condOperands[condSubgraph.inputIndexes[i]];
1822         }
1823         // Set body outputs.
1824         auto& outputBuffer = iteration % 2 == 0 ? tmp1 : tmp2;
1825         for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1826             RunTimeOperandInfo& info = bodyOperands[bodySubgraph.outputIndexes[i]];
1827             if (bodyOutputHasUnknownShape[i]) {
1828                 // Reset dimensions and buffer.
1829                 info.dimensions = bodySubgraph.operands[bodySubgraph.outputIndexes[i]].dimensions;
1830                 if (outputBuffer[i] != nullptr) {
1831                     delete[] outputBuffer[i];
1832                     outputBuffer[i] = nullptr;
1833                 }
1834             }
1835             info.buffer = outputBuffer[i];
1836         }
1837 
1838         NN_RETURN_IF_ERROR(executeSubgraph(bodySubgraph, bodyOperands.data()));
1839 
1840         // Update output buffer information in case we have allocated new buffers.
1841         for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1842             outputBuffer[i] = bodyOperands[bodySubgraph.outputIndexes[i]].buffer;
1843         }
1844     }
1845 
1846     // Copy body outputs to outer outputs.
1847     for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1848         RunTimeOperandInfo& outerOperand = operands[operation.outputs[i]];
1849         RunTimeOperandInfo& innerOperand = condOperands[condSubgraph.inputIndexes[i]];
1850         if (int error; !setInfoAndAllocateIfNeeded(&outerOperand, innerOperand.shape(), &error)) {
1851             return error;
1852         }
1853         CHECK_EQ(outerOperand.length, innerOperand.length);
1854         // TODO: Use the outer buffer as tmp1 to avoid copies.
1855         std::memcpy(outerOperand.buffer, innerOperand.buffer, innerOperand.length);
1856     }
1857 
1858     return ANEURALNETWORKS_NO_ERROR;
1859 }
1860 
setOutputShapes(const std::vector<uint32_t> & outputIndexes,const std::vector<RunTimeOperandInfo> & operands)1861 void CpuExecutor::setOutputShapes(const std::vector<uint32_t>& outputIndexes,
1862                                   const std::vector<RunTimeOperandInfo>& operands) {
1863     mOutputShapes.resize(outputIndexes.size());
1864     for (uint32_t i = 0; i < outputIndexes.size(); i++) {
1865         const uint32_t operandIndex = outputIndexes[i];
1866         const RunTimeOperandInfo& from = operands[operandIndex];
1867         mOutputShapes[i].dimensions = from.dimensions;
1868         mOutputShapes[i].isSufficient = from.isSufficient();
1869         VLOG(EXECUTION) << "CpuExecutor::setOutputShapes: mOutputShapes[" << i
1870                         << "] = " << mOutputShapes[i];
1871     }
1872 }
1873 
1874 // b/109953668, disable OpenMP
1875 #ifdef NNAPI_OPENMP
ScopedOpenmpSettings()1876 ScopedOpenmpSettings::ScopedOpenmpSettings() {
1877     mBlocktimeInitial = kmp_get_blocktime();
1878     kmp_set_blocktime(20);  // ms, see b/109645291
1879 
1880 #if NNAPI_LIMIT_CPU_THREADS
1881     // Code not yet enabled. Choosing the number of threads to be based on
1882     // benchmarking. See longer comment by the class declaration.
1883     mMaxThreadsInitial = Eigen::nbThreads();
1884     const int nProcs = omp_get_num_procs();
1885     int threads = nProcs;
1886     if (nProcs >= 8) {
1887         threads = nProcs - 4;
1888     } else if (nProcs >= 4) {
1889         threads = nProcs - 2;
1890     }
1891     Eigen::setNbThreads(threads);
1892 #endif
1893 }
1894 
~ScopedOpenmpSettings()1895 ScopedOpenmpSettings::~ScopedOpenmpSettings() {
1896     kmp_set_blocktime(mBlocktimeInitial);
1897 #if NNAPI_LIMIT_CPU_THREADS
1898     Eigen::setNbThreads(mMaxThreadsInitial);
1899 #endif
1900 }
1901 #endif  // NNAPI_OPENMP
1902 
1903 }  // namespace nn
1904 }  // namespace android
1905