1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "CpuExecutor"
18 
#include "CpuExecutor.h"

#include "NeuralNetworks.h"
#include "OperationResolver.h"
#include "Operations.h"
#include "OperationsUtils.h"
#include "Tracing.h"

#include "Eigen/Core"
// b/109953668, disable OpenMP
#ifdef NNAPI_OPENMP
#include <omp.h>
#endif  // NNAPI_OPENMP
#include <android/hardware_buffer.h>
#include <sys/mman.h>

#include <new>
34 
35 namespace android {
36 namespace nn {
37 
38 namespace {
39 
40 class OperationExecutionContext : public IOperationExecutionContext {
41     DISALLOW_IMPLICIT_CONSTRUCTORS(OperationExecutionContext);
42 
43    public:
OperationExecutionContext(const Operation * operation,RunTimeOperandInfo * operands)44     OperationExecutionContext(const Operation* operation, RunTimeOperandInfo* operands)
45         : operation(operation), operands(operands) {}
46 
47     uint32_t getNumInputs() const override;
48     OperandType getInputType(uint32_t index) const override;
49     Shape getInputShape(uint32_t index) const override;
50     const void* getInputBuffer(uint32_t index) const override;
51     const Operand::ExtraParams getInputExtraParams(uint32_t index) const override;
52 
53     uint32_t getNumOutputs() const override;
54     OperandType getOutputType(uint32_t index) const override;
55     Shape getOutputShape(uint32_t index) const override;
56     void* getOutputBuffer(uint32_t index) override;
57 
58     // Return false on failure and store the result code.
59     // Use getResultCode() to retrieve it at the end of the operation execution.
60     bool setOutputShape(uint32_t index, const Shape& shape) override;
61     int getResultCode() const;
62 
63     bool isOmittedInput(uint32_t index) const override;
64     bool isOmittedOutput(uint32_t index) const override;
65 
66     // Return false if any of inputs or outputs is omitted, i.e. has lifetime of NO_VALUE.
67     bool checkNoOmittedOperand() const;
68     // Return false if any of inputs has dimension 0.
69     bool checkNoZeroSizedInput() const;
70 
71    private:
72     const RunTimeOperandInfo* getInputInfo(uint32_t index) const;
73     const RunTimeOperandInfo* getOutputInfo(uint32_t index) const;
74     RunTimeOperandInfo* getOutputInfo(uint32_t index);
75 
76     const Operation* operation;
77     RunTimeOperandInfo* operands;
78 
79     int result = ANEURALNETWORKS_NO_ERROR;
80 };
81 
getInputInfo(uint32_t index) const82 const RunTimeOperandInfo* OperationExecutionContext::getInputInfo(uint32_t index) const {
83     CHECK(index < operation->inputs.size());
84     return &operands[operation->inputs[index]];
85 }
86 
getOutputInfo(uint32_t index) const87 const RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) const {
88     CHECK(index < operation->outputs.size());
89     return &operands[operation->outputs[index]];
90 }
91 
getOutputInfo(uint32_t index)92 RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) {
93     CHECK(index < operation->outputs.size());
94     return &operands[operation->outputs[index]];
95 }
96 
getInputType(uint32_t index) const97 OperandType OperationExecutionContext::getInputType(uint32_t index) const {
98     return getInputInfo(index)->type;
99 }
100 
getInputShape(uint32_t index) const101 Shape OperationExecutionContext::getInputShape(uint32_t index) const {
102     return getInputInfo(index)->shape();
103 }
104 
getInputBuffer(uint32_t index) const105 const void* OperationExecutionContext::getInputBuffer(uint32_t index) const {
106     return getInputInfo(index)->buffer;
107 }
108 
getInputExtraParams(uint32_t index) const109 const Operand::ExtraParams OperationExecutionContext::getInputExtraParams(uint32_t index) const {
110     return getInputInfo(index)->extraParams;
111 }
112 
getOutputType(uint32_t index) const113 OperandType OperationExecutionContext::getOutputType(uint32_t index) const {
114     return getOutputInfo(index)->type;
115 }
116 
getOutputShape(uint32_t index) const117 Shape OperationExecutionContext::getOutputShape(uint32_t index) const {
118     return getOutputInfo(index)->shape();
119 }
120 
getOutputBuffer(uint32_t index)121 void* OperationExecutionContext::getOutputBuffer(uint32_t index) {
122     return getOutputInfo(index)->buffer;
123 }
124 
getNumInputs() const125 uint32_t OperationExecutionContext::getNumInputs() const {
126     return operation->inputs.size();
127 }
128 
getNumOutputs() const129 uint32_t OperationExecutionContext::getNumOutputs() const {
130     return operation->outputs.size();
131 }
132 
getResultCode() const133 int OperationExecutionContext::getResultCode() const {
134     return result;
135 }
136 
137 // TODO: Return error code directly once we've fully integrated OperationResolver with all ops.
138 // Updates the RunTimeOperandInfo with the newly calculated shape.
139 // Allocate the buffer if we need to.
setInfoAndAllocateIfNeeded(RunTimeOperandInfo * info,const Shape & shape,int * result)140 bool setInfoAndAllocateIfNeeded(RunTimeOperandInfo* info, const Shape& shape, int* result) {
141     // For user-provided model output operands, the parameters must match the Shape
142     // calculated from the preparation step.
143     if (info->lifetime == OperandLifeTime::MODEL_OUTPUT) {
144         if (info->type != shape.type) {
145             LOG(ERROR) << "Invalid type for model output";
146             *result = ANEURALNETWORKS_OP_FAILED;
147             return false;
148         }
149         if (info->type == OperandType::TENSOR_QUANT8_ASYMM) {
150             if (info->scale != shape.scale) {
151                 LOG(ERROR) << "Invalid scale for model output";
152                 *result = ANEURALNETWORKS_OP_FAILED;
153                 return false;
154             }
155             if (info->zeroPoint != shape.offset) {
156                 LOG(ERROR) << "Invalid zeroPoint for model output";
157                 *result = ANEURALNETWORKS_OP_FAILED;
158                 return false;
159             }
160         }
161         if (info->extraParams != shape.extraParams) {
162             LOG(ERROR) << "Invalid extraParams for model output";
163             *result = ANEURALNETWORKS_OP_FAILED;
164             return false;
165         }
166     }
167 
168     std::vector<uint32_t> combined;
169     if (!combineDimensions(shape.dimensions, info->dimensions, &combined)) {
170         LOG(ERROR) << "Invalid dimensions for model operand";
171         *result = ANEURALNETWORKS_OP_FAILED;
172         return false;
173     }
174     info->dimensions = combined;
175     info->type = shape.type;
176     info->scale = shape.scale;
177     info->zeroPoint = shape.offset;
178     info->extraParams = shape.extraParams;
179 
180     // Allocate the buffer only if the combined dimension is fully specified
181     if (info->lifetime == OperandLifeTime::TEMPORARY_VARIABLE && info->buffer == nullptr) {
182         if (isExtensionOperandType(info->type)) {
183             LOG(ERROR) << "Cannot allocate a temporary variable of an extension type";
184             *result = ANEURALNETWORKS_OP_FAILED;
185             return false;
186         }
187         uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
188         if (length > 0) {
189             info->buffer = new uint8_t[length];
190             if (info->buffer == nullptr) {
191                 *result = ANEURALNETWORKS_OUT_OF_MEMORY;
192                 return false;
193             }
194             info->length = length;
195         }
196     }
197     if (!info->isSufficient()) {
198         uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
199         LOG(ERROR) << "Insufficient size for model operand: require = " << length
200                    << ", provided = " << info->length;
201         *result = ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
202         return false;
203     }
204     *result = ANEURALNETWORKS_NO_ERROR;
205     return true;
206 }
207 
setOutputShape(uint32_t index,const Shape & shape)208 bool OperationExecutionContext::setOutputShape(uint32_t index, const Shape& shape) {
209     return setInfoAndAllocateIfNeeded(getOutputInfo(index), shape, &result);
210 }
211 
isOmittedInput(uint32_t index) const212 bool OperationExecutionContext::isOmittedInput(uint32_t index) const {
213     return getInputInfo(index)->lifetime == OperandLifeTime::NO_VALUE;
214 }
215 
isOmittedOutput(uint32_t index) const216 bool OperationExecutionContext::isOmittedOutput(uint32_t index) const {
217     return getOutputInfo(index)->lifetime == OperandLifeTime::NO_VALUE;
218 }
219 
checkNoOmittedOperand() const220 bool OperationExecutionContext::checkNoOmittedOperand() const {
221     for (uint32_t i = 0; i < operation->inputs.size(); i++) {
222         NN_RET_CHECK(!isOmittedInput(i)) << getOperationName(operation->type) << " input operand "
223                                          << i << " is required but missing.";
224     }
225     for (uint32_t i = 0; i < operation->outputs.size(); i++) {
226         NN_RET_CHECK(!isOmittedOutput(i)) << getOperationName(operation->type) << " output operand "
227                                           << i << " is required but missing.";
228     }
229     return true;
230 }
231 
checkNoZeroSizedInput() const232 bool OperationExecutionContext::checkNoZeroSizedInput() const {
233     for (uint32_t i = 0; i < operation->inputs.size(); i++) {
234         if (isOmittedInput(i)) continue;
235         for (uint32_t j = 0; j < getInputInfo(i)->dimensions.size(); j++) {
236             NN_RET_CHECK_NE(getInputInfo(i)->dimensions[j], 0)
237                     << getOperationName(operation->type)
238                     << " does not support zero-sized tensor, but input " << i << " dimension " << j
239                     << " is 0.";
240         }
241     }
242     return true;
243 }
244 
245 }  // namespace
246 
247 // Used to keep a pointer to a memory pool.
248 //
249 // In the case of an "mmap_fd" pool, owns the mmap region
250 // returned by getBuffer() -- i.e., that region goes away
251 // when the RunTimePoolInfo is destroyed or is assigned to.
252 class RunTimePoolInfo::RunTimePoolInfoImpl {
253    public:
254     RunTimePoolInfoImpl(const hidl_memory& hidlMemory, uint8_t* buffer, const sp<IMemory>& memory,
255                         const sp<GraphicBuffer>& graphicBuffer);
256 
257     // rule of five...
258     ~RunTimePoolInfoImpl();
259     RunTimePoolInfoImpl(const RunTimePoolInfoImpl&) = delete;
260     RunTimePoolInfoImpl(RunTimePoolInfoImpl&&) noexcept = delete;
261     RunTimePoolInfoImpl& operator=(const RunTimePoolInfoImpl&) = delete;
262     RunTimePoolInfoImpl& operator=(RunTimePoolInfoImpl&&) noexcept = delete;
263 
getBuffer() const264     uint8_t* getBuffer() const { return mBuffer; }
265 
266     bool update() const;
267 
getHidlMemory() const268     hidl_memory getHidlMemory() const { return mHidlMemory; }
269 
270    private:
271     const hidl_memory mHidlMemory;     // always used
272     uint8_t* const mBuffer = nullptr;  // always used
273     const sp<IMemory> mMemory;         // only used when hidlMemory.name() == "ashmem"
274     const sp<GraphicBuffer>
275             mGraphicBuffer;  // only used when hidlMemory.name() == "hardware_buffer_blob"
276 };
277 
RunTimePoolInfoImpl(const hidl_memory & hidlMemory,uint8_t * buffer,const sp<IMemory> & memory,const sp<GraphicBuffer> & graphicBuffer)278 RunTimePoolInfo::RunTimePoolInfoImpl::RunTimePoolInfoImpl(const hidl_memory& hidlMemory,
279                                                           uint8_t* buffer,
280                                                           const sp<IMemory>& memory,
281                                                           const sp<GraphicBuffer>& graphicBuffer)
282     : mHidlMemory(hidlMemory), mBuffer(buffer), mMemory(memory), mGraphicBuffer(graphicBuffer) {}
283 
~RunTimePoolInfoImpl()284 RunTimePoolInfo::RunTimePoolInfoImpl::~RunTimePoolInfoImpl() {
285     if (mBuffer == nullptr) {
286         return;
287     }
288 
289     const std::string memType = mHidlMemory.name();
290     if (memType == "ashmem") {
291         // nothing to do
292     } else if (memType == "mmap_fd") {
293         const size_t size = mHidlMemory.size();
294         if (munmap(mBuffer, size)) {
295             LOG(ERROR) << "RunTimePoolInfoImpl::~RunTimePoolInfo(): Can't munmap";
296         }
297     } else if (memType == "hardware_buffer_blob") {
298         mGraphicBuffer->unlock();
299     } else if (memType == "") {
300         // Represents a POINTER argument; nothing to do
301     } else {
302         LOG(ERROR) << "RunTimePoolInfoImpl::~RunTimePoolInfoImpl(): unsupported hidl_memory type";
303     }
304 }
305 
306 // Making sure the output data are correctly updated after execution.
update() const307 bool RunTimePoolInfo::RunTimePoolInfoImpl::update() const {
308     const std::string memType = mHidlMemory.name();
309     if (memType == "ashmem") {
310         mMemory->commit();
311         return true;
312     }
313     if (memType == "mmap_fd") {
314         int prot = mHidlMemory.handle()->data[1];
315         if (prot & PROT_WRITE) {
316             const size_t size = mHidlMemory.size();
317             return msync(mBuffer, size, MS_SYNC) == 0;
318         }
319     }
320     // No-op for other types of memory.
321     return true;
322 }
323 
324 // TODO: short term, make share memory mapping and updating a utility function.
325 // TODO: long term, implement mmap_fd as a hidl IMemory service.
createFromHidlMemory(const hidl_memory & hidlMemory)326 std::optional<RunTimePoolInfo> RunTimePoolInfo::createFromHidlMemory(
327         const hidl_memory& hidlMemory) {
328     uint8_t* buffer = nullptr;
329     sp<IMemory> memory;
330     sp<GraphicBuffer> graphicBuffer;
331 
332     const auto& memType = hidlMemory.name();
333     if (memType == "ashmem") {
334         memory = mapMemory(hidlMemory);
335         if (memory == nullptr) {
336             LOG(ERROR) << "Can't map shared memory.";
337             return std::nullopt;
338         }
339         memory->update();
340         buffer = reinterpret_cast<uint8_t*>(static_cast<void*>(memory->getPointer()));
341         if (buffer == nullptr) {
342             LOG(ERROR) << "Can't access shared memory.";
343             return std::nullopt;
344         }
345     } else if (memType == "mmap_fd") {
346         size_t size = hidlMemory.size();
347         int fd = hidlMemory.handle()->data[0];
348         int prot = hidlMemory.handle()->data[1];
349         size_t offset = getSizeFromInts(hidlMemory.handle()->data[2], hidlMemory.handle()->data[3]);
350         buffer = static_cast<uint8_t*>(mmap(nullptr, size, prot, MAP_SHARED, fd, offset));
351         if (buffer == MAP_FAILED) {
352             LOG(ERROR) << "RunTimePoolInfo::set(): Can't mmap the file descriptor.";
353             return std::nullopt;
354         }
355     } else if (memType == "hardware_buffer_blob") {
356         auto handle = hidlMemory.handle();
357         auto format = AHARDWAREBUFFER_FORMAT_BLOB;
358         auto usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN;
359         const uint32_t width = hidlMemory.size();
360         const uint32_t height = 1;  // height is always 1 for BLOB mode AHardwareBuffer.
361         const uint32_t layers = 1;  // layers is always 1 for BLOB mode AHardwareBuffer.
362         const uint32_t stride = hidlMemory.size();
363         graphicBuffer = new GraphicBuffer(handle, GraphicBuffer::HandleWrapMethod::CLONE_HANDLE,
364                                           width, height, format, layers, usage, stride);
365         void* gBuffer = nullptr;
366         int32_t outBytesPerPixel, outBytesPerStride;
367         status_t status =
368                 graphicBuffer->lock(usage, &gBuffer, &outBytesPerPixel, &outBytesPerStride);
369         if (status != NO_ERROR) {
370             LOG(ERROR) << "RunTimePoolInfo Can't lock the AHardwareBuffer.";
371             return std::nullopt;
372         }
373         buffer = static_cast<uint8_t*>(gBuffer);
374     } else {
375         LOG(ERROR) << "RunTimePoolInfo::set(): unsupported hidl_memory type";
376         return std::nullopt;
377     }
378 
379     const auto impl =
380             std::make_shared<const RunTimePoolInfoImpl>(hidlMemory, buffer, memory, graphicBuffer);
381     return {RunTimePoolInfo(impl)};
382 }
383 
createFromExistingBuffer(uint8_t * buffer)384 RunTimePoolInfo RunTimePoolInfo::createFromExistingBuffer(uint8_t* buffer) {
385     const auto impl =
386             std::make_shared<const RunTimePoolInfoImpl>(hidl_memory{}, buffer, nullptr, nullptr);
387     return {impl};
388 }
389 
RunTimePoolInfo(const std::shared_ptr<const RunTimePoolInfoImpl> & impl)390 RunTimePoolInfo::RunTimePoolInfo(const std::shared_ptr<const RunTimePoolInfoImpl>& impl)
391     : mImpl(impl) {}
392 
getBuffer() const393 uint8_t* RunTimePoolInfo::getBuffer() const {
394     return mImpl->getBuffer();
395 }
396 
update() const397 bool RunTimePoolInfo::update() const {
398     return mImpl->update();
399 }
400 
getHidlMemory() const401 hidl_memory RunTimePoolInfo::getHidlMemory() const {
402     return mImpl->getHidlMemory();
403 }
404 
setRunTimePoolInfosFromHidlMemories(std::vector<RunTimePoolInfo> * poolInfos,const hidl_vec<hidl_memory> & pools)405 bool setRunTimePoolInfosFromHidlMemories(std::vector<RunTimePoolInfo>* poolInfos,
406                                          const hidl_vec<hidl_memory>& pools) {
407     CHECK(poolInfos != nullptr);
408     poolInfos->clear();
409     poolInfos->reserve(pools.size());
410     for (const auto& pool : pools) {
411         if (std::optional<RunTimePoolInfo> poolInfo = RunTimePoolInfo::createFromHidlMemory(pool)) {
412             poolInfos->push_back(*poolInfo);
413         } else {
414             LOG(ERROR) << "Could not map pools";
415             poolInfos->clear();
416             return false;
417         }
418     }
419     return true;
420 }
421 
// Copies a 4-D tensor whose channels are in dimension 1 (dimensions
// {N, C, H, W} given in |fromDim|) from |from| into |to|, writing the
// elements in channels-last (N, H, W, C) order. |to| must have room for
// every element of |from|. Always returns true.
template <typename T>
inline bool convertToNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
    const uint32_t batches = fromDim[0];
    const uint32_t channels = fromDim[1];
    const uint32_t pixels = fromDim[2] * fromDim[3];

    T* out = to;
    for (uint32_t b = 0; b < batches; ++b) {
        // Offset of the first element of batch b in the source.
        const uint32_t batchBase = b * channels * pixels;
        for (uint32_t p = 0; p < pixels; ++p) {
            for (uint32_t ch = 0; ch < channels; ++ch) {
                *out = from[batchBase + ch * pixels + p];
                ++out;
            }
        }
    }
    return true;
}
435 
// Copies a 4-D channels-last tensor (dimensions {N, H, W, C} given in
// |fromDim|) from |from| into |to|, writing the elements in channels-first
// (N, C, H, W) order. |to| must have room for every element of |from|.
// Always returns true.
template <typename T>
inline bool convertFromNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
    const uint32_t batches = fromDim[0];
    const uint32_t pixels = fromDim[1] * fromDim[2];
    const uint32_t channels = fromDim[3];

    T* out = to;
    for (uint32_t b = 0; b < batches; ++b) {
        // Offset of the first element of batch b in the source.
        const uint32_t batchBase = b * pixels * channels;
        for (uint32_t ch = 0; ch < channels; ++ch) {
            for (uint32_t p = 0; p < pixels; ++p) {
                *out = from[batchBase + p * channels + ch];
                ++out;
            }
        }
    }
    return true;
}
449 
convertToNhwc(RunTimeOperandInfo & to,const RunTimeOperandInfo & from,std::unique_ptr<uint8_t[]> & ptr_guard,bool data_layout)450 static bool convertToNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
451                           std::unique_ptr<uint8_t[]>& ptr_guard, bool data_layout) {
452     int result;
453     if (from.dimensions.size() != 4) {
454         LOG(ERROR) << "Error converting a non-4-D tensor to NHWC layout";
455         return false;
456     }
457     to.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
458     if (data_layout) {
459         // convert dimensions
460         Shape inShape = from.shape();
461         auto& fromDim = from.dimensions;
462         inShape.dimensions = {fromDim[0], fromDim[2], fromDim[3], fromDim[1]};
463         // allocate buffer
464         to.buffer = nullptr;
465         if (!setInfoAndAllocateIfNeeded(&to, inShape, &result)) {
466             return false;
467         }
468         ptr_guard.reset(to.buffer);
469         // convert value
470         if (from.type == OperandType::TENSOR_FLOAT32) {
471             return convertToNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
472                                             reinterpret_cast<const float*>(from.buffer), fromDim);
473         } else if (from.type == OperandType::TENSOR_FLOAT16) {
474             return convertToNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
475                                                reinterpret_cast<const _Float16*>(from.buffer),
476                                                fromDim);
477         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
478             return convertToNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
479                                               reinterpret_cast<const uint8_t*>(from.buffer),
480                                               fromDim);
481         } else {
482             LOG(ERROR) << "Unsupported data type";
483             return false;
484         }
485     } else {
486         to = from;
487     }
488     return true;
489 }
490 
convertFromNhwc(RunTimeOperandInfo & to,const RunTimeOperandInfo & from,bool data_layout,int * result)491 static bool convertFromNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
492                             bool data_layout, int* result) {
493     if (from.dimensions.size() != 4) {
494         LOG(ERROR) << "Error converting a non-4-D tensor from NHWC layout";
495         return false;
496     }
497     if (data_layout) {
498         // convert dimensions
499         Shape outShape = from.shape();
500         auto& fromDim = from.dimensions;
501         outShape.dimensions = {fromDim[0], fromDim[3], fromDim[1], fromDim[2]};
502         // allocate buffer
503         if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
504             return false;
505         }
506         // convert value
507         if (from.type == OperandType::TENSOR_FLOAT32) {
508             return convertFromNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
509                                               reinterpret_cast<const float*>(from.buffer), fromDim);
510         } else if (from.type == OperandType::TENSOR_FLOAT16) {
511             return convertFromNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
512                                                  reinterpret_cast<const _Float16*>(from.buffer),
513                                                  fromDim);
514         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
515             return convertFromNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
516                                                 reinterpret_cast<const uint8_t*>(from.buffer),
517                                                 fromDim);
518         } else {
519             LOG(ERROR) << "Unsupported data type";
520             return false;
521         }
522     } else {
523         Shape outShape = from.shape();
524         to.buffer = from.buffer;
525         to.length = from.length;
526         if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
527             return false;
528         }
529     }
530     return true;
531 }
532 
533 // Ignore the .pools entry in model and request.  This will have been taken care of
534 // by the caller.
run(const Model & model,const Request & request,const std::vector<RunTimePoolInfo> & modelPoolInfos,const std::vector<RunTimePoolInfo> & requestPoolInfos)535 int CpuExecutor::run(const Model& model, const Request& request,
536                      const std::vector<RunTimePoolInfo>& modelPoolInfos,
537                      const std::vector<RunTimePoolInfo>& requestPoolInfos) {
538     NNTRACE_CPU(NNTRACE_PHASE_EXECUTION, "run");
539     VLOG(CPUEXE) << "CpuExecutor::run() with request(" << SHOW_IF_DEBUG(toString(request)) << ")";
540 
541     // b/109953668, disable OpenMP
542 #ifdef NNAPI_OPENMP
543     ScopedOpenmpSettings openMpSettings;
544 #endif  // NNAPI_OPENMP
545 
546     mModel = &model;
547     mRequest = &request;  // TODO check if mRequest is needed
548     initializeRunTimeInfo(modelPoolInfos, requestPoolInfos);
549     // The model has serialized the operation in execution order.
550     for (const auto& operation : model.operations) {
551         int n = executeOperation(operation);
552         if (n != ANEURALNETWORKS_NO_ERROR) {
553             finish(n);
554             return n;
555         }
556     }
557     for (auto& runtimeInfo : modelPoolInfos) {
558         runtimeInfo.update();
559     }
560     for (auto& runtimeInfo : requestPoolInfos) {
561         runtimeInfo.update();
562     }
563     finish(ANEURALNETWORKS_NO_ERROR);
564     VLOG(CPUEXE) << "Completed run normally";
565     return ANEURALNETWORKS_NO_ERROR;
566 }
567 
initializeRunTimeInfo(const std::vector<RunTimePoolInfo> & modelPoolInfos,const std::vector<RunTimePoolInfo> & requestPoolInfos)568 bool CpuExecutor::initializeRunTimeInfo(const std::vector<RunTimePoolInfo>& modelPoolInfos,
569                                         const std::vector<RunTimePoolInfo>& requestPoolInfos) {
570     VLOG(CPUEXE) << "CpuExecutor::initializeRunTimeInfo";
571     const size_t count = mModel->operands.size();
572     mOperands.resize(count);
573 
574     // Start by setting the runtime info to what's in the model.
575     for (size_t i = 0; i < count; i++) {
576         const Operand& from = mModel->operands[i];
577         RunTimeOperandInfo& to = mOperands[i];
578         to.type = from.type;
579         to.dimensions = from.dimensions;
580         to.scale = from.scale;
581         to.zeroPoint = from.zeroPoint;
582         to.length = from.location.length;
583         to.lifetime = from.lifetime;
584         to.extraParams = from.extraParams;
585         switch (from.lifetime) {
586             case OperandLifeTime::TEMPORARY_VARIABLE:
587                 to.buffer = nullptr;
588                 to.numberOfUsesLeft = from.numberOfConsumers;
589                 break;
590             case OperandLifeTime::CONSTANT_COPY:
591                 to.buffer = const_cast<uint8_t*>(&mModel->operandValues[from.location.offset]);
592                 to.numberOfUsesLeft = 0;
593                 break;
594             case OperandLifeTime::CONSTANT_REFERENCE: {
595                 auto poolIndex = from.location.poolIndex;
596                 nnAssert(poolIndex < modelPoolInfos.size());
597                 auto& r = modelPoolInfos[poolIndex];
598                 to.buffer = r.getBuffer() + from.location.offset;
599                 to.numberOfUsesLeft = 0;
600                 break;
601             }
602             case OperandLifeTime::MODEL_INPUT:
603             case OperandLifeTime::MODEL_OUTPUT:
604             case OperandLifeTime::NO_VALUE:
605                 to.buffer = nullptr;
606                 to.numberOfUsesLeft = 0;
607                 break;
608             default:
609                 nnAssert(false);
610                 break;
611         }
612     }
613 
614     // Adjust the runtime info for the arguments passed to the model,
615     // modifying the buffer location, and possibly the dimensions.
616     auto updateForArguments = [this, &requestPoolInfos](
617                                       const std::vector<uint32_t>& indexes,
618                                       const hidl_vec<RequestArgument>& arguments) {
619         nnAssert(indexes.size() == arguments.size());
620         for (size_t i = 0; i < indexes.size(); i++) {
621             const uint32_t operandIndex = indexes[i];
622             const RequestArgument& from = arguments[i];
623             RunTimeOperandInfo& to = mOperands[operandIndex];
624             if (from.dimensions.size() > 0) {
625                 // It's the responsibility of the caller to validate that
626                 // from.dimensions only modifies the dimensions that were
627                 // unspecified in the model.  That's the case in SampleDriver.cpp
628                 // with the call to validateRequest().
629                 // TODO make sure that's the case for the default CPU path.
630                 to.dimensions = from.dimensions;
631             }
632             if (from.hasNoValue) {
633                 to.lifetime = OperandLifeTime::NO_VALUE;
634                 nnAssert(to.buffer == nullptr);
635                 to.length = 0;
636             } else {
637                 auto poolIndex = from.location.poolIndex;
638                 nnAssert(poolIndex < requestPoolInfos.size());
639                 auto& r = requestPoolInfos[poolIndex];
640                 to.buffer = r.getBuffer() + from.location.offset;
641                 to.length = from.location.length;
642             }
643         }
644     };
645     updateForArguments(mModel->inputIndexes, mRequest->inputs);
646     updateForArguments(mModel->outputIndexes, mRequest->outputs);
647 
648     return true;
649 }
650 
freeNoLongerUsedOperands(const std::vector<uint32_t> & inputs)651 void CpuExecutor::freeNoLongerUsedOperands(const std::vector<uint32_t>& inputs) {
652     for (uint32_t i : inputs) {
653         auto& info = mOperands[i];
654         // Check if it's a static or model input/output.
655         if (info.numberOfUsesLeft == 0) {
656             continue;
657         }
658         info.numberOfUsesLeft--;
659         if (info.numberOfUsesLeft == 0 && info.buffer != nullptr) {
660             delete[] info.buffer;
661             info.buffer = nullptr;
662         }
663     }
664 }
665 
executeOperation(const Operation & operation)666 int CpuExecutor::executeOperation(const Operation& operation) {
667     // VLOG(CPUEXE) << "CpuExecutor::executeOperation(" << toString(operation) << ")";
668     const hidl_vec<uint32_t>& ins = operation.inputs;
669     const hidl_vec<uint32_t>& outs = operation.outputs;
670     bool success = false;
671     int result = ANEURALNETWORKS_NO_ERROR;
672 
673     // Function to verify that the number of input and output parameters
674     // matches what is expected.  Also checks that all the parameters have
675     // values. This function is to be used only for operations that do not
676     // accept optional arguments.
677     // TODO Have a version that works for optional arguments.
678     auto allParametersPresent = [&operation, &ins, &outs, this](size_t requiredIns,
679                                                                 size_t requiredOuts) -> bool {
680         auto verify = [&operation, this](size_t requiredCount, const hidl_vec<uint32_t>& indexes,
681                                          const char* type) -> bool {
682             size_t actualCount = indexes.size();
683             if (actualCount != requiredCount) {
684                 LOG(ERROR) << getOperationName(operation.type) << ": Invalid number of " << type
685                            << " operands. Got " << actualCount << " of " << requiredCount;
686                 return false;
687             }
688             for (size_t i = 0; i < actualCount; i++) {
689                 if (mOperands[indexes[i]].lifetime == OperandLifeTime::NO_VALUE) {
690                     LOG(ERROR) << getOperationName(operation.type) << " " << type << " operand "
691                                << i << " is required but missing.";
692                     return false;
693                 }
694             }
695             return true;
696         };
697 
698         auto verifyNoZeroSizedInputs = [&operation, this](const hidl_vec<uint32_t>& indexes) {
699             for (size_t i = 0; i < indexes.size(); i++) {
700                 for (size_t j = 0; j < mOperands[indexes[i]].dimensions.size(); j++) {
701                     if (mOperands[indexes[i]].dimensions[j] == 0) {
702                         LOG(ERROR) << getOperationName(operation.type)
703                                    << " does not support zero-sized tensor, but input " << i
704                                    << " dimension " << j << " is zero.";
705                         return false;
706                     }
707                 }
708             }
709             return true;
710         };
711 
712         return verify(requiredIns, ins, "in") && verify(requiredOuts, outs, "out") &&
713                verifyNoZeroSizedInputs(ins);
714     };
715 
716     switch (operation.type) {
717         case OperationType::OEM_OPERATION: {
718             LOG(ERROR) << "OEM operation not supported for CPU execution";
719             success = false;
720         } break;
721         case OperationType::FLOOR: {
722             if (!allParametersPresent(1, 1)) {
723                 return ANEURALNETWORKS_BAD_DATA;
724             }
725             const RunTimeOperandInfo& input = mOperands[ins[0]];
726             RunTimeOperandInfo& output = mOperands[outs[0]];
727             Shape outShape = output.shape();
728 
729             if (!floorPrepare(input.shape(), &outShape) ||
730                 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
731                 break;
732             }
733             if (input.type == OperandType::TENSOR_FLOAT32) {
734                 success = floorFloat32(reinterpret_cast<const float*>(input.buffer),
735                                        reinterpret_cast<float*>(output.buffer), outShape);
736             } else if (input.type == OperandType::TENSOR_FLOAT16) {
737                 success = floorFloat16(reinterpret_cast<const _Float16*>(input.buffer),
738                                        reinterpret_cast<_Float16*>(output.buffer), outShape);
739             }
740         } break;
741         case OperationType::DEPTHWISE_CONV_2D: {
742             const size_t inCount = ins.size();
743             if ((inCount != 14 && inCount != 12 && inCount != 11 && inCount != 9 && inCount != 8) ||
744                 !allParametersPresent(inCount, 1)) {
745                 return ANEURALNETWORKS_BAD_DATA;
746             }
747             const RunTimeOperandInfo& input = mOperands[ins[0]];
748             const RunTimeOperandInfo& filter = mOperands[ins[1]];
749             const RunTimeOperandInfo& bias = mOperands[ins[2]];
750 
751             int32_t padding_left, padding_right;
752             int32_t padding_top, padding_bottom;
753             int32_t padding_implicit = 0;
754             int32_t stride_width, stride_height;
755             int32_t dilation_width_factor = 1, dilation_height_factor = 1;
756             int32_t depth_multiplier;
757             int32_t activation;
758             bool data_layout = false;
759             bool useImplicitPadding = false;
760 
761             if ((inCount >= 9 && mOperands[ins[8]].type == OperandType::BOOL) || inCount == 8) {
762                 padding_implicit = getScalarData<int32_t>(mOperands[ins[3]]);
763                 stride_width = getScalarData<int32_t>(mOperands[ins[4]]);
764                 stride_height = getScalarData<int32_t>(mOperands[ins[5]]);
765                 depth_multiplier = getScalarData<int32_t>(mOperands[ins[6]]);
766                 activation = getScalarData<int32_t>(mOperands[ins[7]]);
767                 if (inCount >= 9) {
768                     data_layout = getScalarData<bool>(mOperands[ins[8]]);
769                 }
770                 if (inCount == 11) {
771                     dilation_width_factor = getScalarData<int32_t>(mOperands[ins[9]]);
772                     dilation_height_factor = getScalarData<int32_t>(mOperands[ins[10]]);
773                 }
774                 useImplicitPadding = true;
775             } else if (inCount >= 11 && mOperands[ins[8]].type == OperandType::INT32) {
776                 padding_left = getScalarData<int32_t>(mOperands[ins[3]]);
777                 padding_right = getScalarData<int32_t>(mOperands[ins[4]]);
778                 padding_top = getScalarData<int32_t>(mOperands[ins[5]]);
779                 padding_bottom = getScalarData<int32_t>(mOperands[ins[6]]);
780                 stride_width = getScalarData<int32_t>(mOperands[ins[7]]);
781                 stride_height = getScalarData<int32_t>(mOperands[ins[8]]);
782                 depth_multiplier = getScalarData<int32_t>(mOperands[ins[9]]);
783                 activation = getScalarData<int32_t>(mOperands[ins[10]]);
784                 if (inCount >= 12) {
785                     data_layout = getScalarData<bool>(mOperands[ins[11]]);
786                 }
787                 if (inCount == 14) {
788                     dilation_width_factor = getScalarData<int32_t>(mOperands[ins[12]]);
789                     dilation_height_factor = getScalarData<int32_t>(mOperands[ins[13]]);
790                 }
791             } else {
792                 return ANEURALNETWORKS_BAD_DATA;
793             }
794 
795             RunTimeOperandInfo& output = mOperands[outs[0]];
796             Shape outShape = output.shape();
797 
798             RunTimeOperandInfo input_tmp, output_tmp;
799             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
800             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
801                 success = false;
802                 break;
803             }
804             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
805             output_tmp.buffer = data_layout ? nullptr : output.buffer;
806             output_tmp.length = data_layout ? 0 : output.length;
807 
808             if (useImplicitPadding) {
809                 Shape inputShape = input_tmp.shape();
810                 Shape filterShape = filter.shape();
811                 int32_t input_width = getSizeOfDimension(inputShape, 2);
812                 int32_t input_height = getSizeOfDimension(inputShape, 1);
813                 int32_t filter_width = getSizeOfDimension(filterShape, 2);
814                 int32_t filter_height = getSizeOfDimension(filterShape, 1);
815                 calculateExplicitPadding(input_width, stride_width, dilation_width_factor,
816                                          filter_width, padding_implicit, &padding_left,
817                                          &padding_right);
818                 calculateExplicitPadding(input_height, stride_height, dilation_height_factor,
819                                          filter_height, padding_implicit, &padding_top,
820                                          &padding_bottom);
821             }
822 
823             if (!depthwiseConvPrepare(input_tmp.shape(), filter.shape(), bias.shape(), padding_left,
824                                       padding_right, padding_top, padding_bottom, stride_width,
825                                       stride_height, depth_multiplier, dilation_width_factor,
826                                       dilation_height_factor, &outShape) ||
827                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
828                 if (!data_layout) output.dimensions = output_tmp.dimensions;
829                 success = false;
830                 break;
831             }
832             if (input_tmp.type == OperandType::TENSOR_FLOAT32) {
833                 success = depthwiseConvFloat32(
834                         reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
835                         reinterpret_cast<const float*>(filter.buffer), filter.shape(),
836                         reinterpret_cast<const float*>(bias.buffer), bias.shape(), padding_left,
837                         padding_right, padding_top, padding_bottom, stride_width, stride_height,
838                         dilation_width_factor, dilation_height_factor, depth_multiplier, activation,
839                         reinterpret_cast<float*>(output_tmp.buffer), outShape);
840             } else if (input_tmp.type == OperandType::TENSOR_FLOAT16) {
841                 success = depthwiseConvFloat16(
842                         reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
843                         reinterpret_cast<const _Float16*>(filter.buffer), filter.shape(),
844                         reinterpret_cast<const _Float16*>(bias.buffer), bias.shape(), padding_left,
845                         padding_right, padding_top, padding_bottom, stride_width, stride_height,
846                         dilation_width_factor, dilation_height_factor, depth_multiplier, activation,
847                         reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
848             } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM) {
849                 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
850                     success = depthwiseConvQuant8PerChannel(
851                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
852                             reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
853                             filter.extraParams.channelQuant().scales.data(),
854                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
855                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
856                             stride_height, dilation_width_factor, dilation_height_factor,
857                             depth_multiplier, activation,
858                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
859                 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM) {
860                     success = depthwiseConvQuant8(
861                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
862                             reinterpret_cast<const uint8_t*>(filter.buffer), filter.shape(),
863                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
864                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
865                             stride_height, dilation_width_factor, dilation_height_factor,
866                             depth_multiplier, activation,
867                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
868                 }
869             }
870             if (data_layout) {
871                 output_tmp_guard.reset(output_tmp.buffer);
872             }
873             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
874                 success = false;
875                 break;
876             }
877         } break;
878         case OperationType::LOCAL_RESPONSE_NORMALIZATION: {
879             const size_t inCount = ins.size();
880             if ((inCount != 6 && inCount != 5) || !allParametersPresent(inCount, 1)) {
881                 return ANEURALNETWORKS_BAD_DATA;
882             }
883             const RunTimeOperandInfo& input = mOperands[ins[0]];
884             int32_t radius = getScalarData<int32_t>(mOperands[ins[1]]);
885             float bias = (input.type == OperandType::TENSOR_FLOAT16)
886                                  ? getScalarData<_Float16>(mOperands[ins[2]])
887                                  : getScalarData<float>(mOperands[ins[2]]);
888             float alpha = (input.type == OperandType::TENSOR_FLOAT16)
889                                   ? getScalarData<_Float16>(mOperands[ins[3]])
890                                   : getScalarData<float>(mOperands[ins[3]]);
891             float beta = (input.type == OperandType::TENSOR_FLOAT16)
892                                  ? getScalarData<_Float16>(mOperands[ins[4]])
893                                  : getScalarData<float>(mOperands[ins[4]]);
894             const int32_t axis = inCount == 6 ? getScalarData<int32_t>(mOperands[ins[5]]) : -1;
895 
896             RunTimeOperandInfo& output = mOperands[outs[0]];
897             Shape outShape = output.shape();
898 
899             if (!genericNormalizationPrepare(input.shape(), &outShape) ||
900                 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
901                 success = false;
902                 break;
903             }
904             if (input.type == OperandType::TENSOR_FLOAT32) {
905                 success = localResponseNormFloat32(
906                         reinterpret_cast<const float*>(input.buffer), input.shape(), radius, bias,
907                         alpha, beta, axis, reinterpret_cast<float*>(output.buffer), outShape);
908             } else if (input.type == OperandType::TENSOR_FLOAT16) {
909                 success = localResponseNormFloat16(reinterpret_cast<const _Float16*>(input.buffer),
910                                                    input.shape(), radius, bias, alpha, beta, axis,
911                                                    reinterpret_cast<_Float16*>(output.buffer),
912                                                    outShape);
913             }
914         } break;
915         case OperationType::RESHAPE: {
916             if (!allParametersPresent(2, 1)) {
917                 return ANEURALNETWORKS_BAD_DATA;
918             }
919             const RunTimeOperandInfo& input = mOperands[ins[0]];
920             const RunTimeOperandInfo& targetShape = mOperands[ins[1]];
921 
922             RunTimeOperandInfo& output = mOperands[outs[0]];
923             Shape outShape = output.shape();
924 
925             success = reshapePrepare(input.shape(),
926                                      reinterpret_cast<const int32_t*>(targetShape.buffer),
927                                      getNumberOfElements(targetShape.shape()), &outShape) &&
928                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
929                       copyData(input.buffer, input.shape(), output.buffer, outShape);
930         } break;
931         case OperationType::DEPTH_TO_SPACE: {
932             const size_t inCount = ins.size();
933             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
934                 return ANEURALNETWORKS_BAD_DATA;
935             }
936             const RunTimeOperandInfo& input = mOperands[ins[0]];
937             int32_t blockSize = getScalarData<int32_t>(mOperands[ins[1]]);
938             bool data_layout = inCount == 3 ? getScalarData<bool>(mOperands[ins[2]]) : false;
939 
940             RunTimeOperandInfo& output = mOperands[outs[0]];
941             Shape outShape = output.shape();
942 
943             RunTimeOperandInfo input_tmp, output_tmp;
944             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
945             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
946                 success = false;
947                 break;
948             }
949             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
950             output_tmp.buffer = data_layout ? nullptr : output.buffer;
951             output_tmp.length = data_layout ? 0 : output.length;
952             if (!depthToSpacePrepare(input_tmp.shape(), blockSize, &outShape) ||
953                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
954                 if (!data_layout) output.dimensions = output_tmp.dimensions;
955                 break;
956             }
957             switch (input_tmp.type) {
958                 case OperandType::TENSOR_FLOAT32: {
959                     success = depthToSpaceGeneric(
960                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
961                             blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
962                     break;
963                 }
964                 case OperandType::TENSOR_FLOAT16: {
965                     success = depthToSpaceGeneric(
966                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
967                             blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
968                     break;
969                 }
970                 case OperandType::TENSOR_QUANT8_ASYMM: {
971                     success = depthToSpaceGeneric(
972                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
973                             blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
974                     break;
975                 }
976                 default: {
977                     LOG(ERROR) << "Unsupported data type";
978                     success = false;
979                 }
980             }
981             if (data_layout) {
982                 output_tmp_guard.reset(output_tmp.buffer);
983             }
984             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
985                 success = false;
986                 break;
987             }
988         } break;
989         case OperationType::SPACE_TO_DEPTH: {
990             const size_t inCount = ins.size();
991             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
992                 return ANEURALNETWORKS_BAD_DATA;
993             }
994             const RunTimeOperandInfo& input = mOperands[ins[0]];
995             int32_t blockSize = getScalarData<int32_t>(mOperands[ins[1]]);
996             bool data_layout = inCount == 3 ? getScalarData<bool>(mOperands[ins[2]]) : false;
997 
998             RunTimeOperandInfo& output = mOperands[outs[0]];
999             Shape outShape = output.shape();
1000 
1001             RunTimeOperandInfo input_tmp, output_tmp;
1002             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1003             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1004                 success = false;
1005                 break;
1006             }
1007             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
1008             output_tmp.buffer = data_layout ? nullptr : output.buffer;
1009             output_tmp.length = data_layout ? 0 : output.length;
1010 
1011             if (!spaceToDepthPrepare(input_tmp.shape(), blockSize, &outShape) ||
1012                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1013                 if (!data_layout) output.dimensions = output_tmp.dimensions;
1014                 break;
1015             }
1016             switch (input_tmp.type) {
1017                 case OperandType::TENSOR_FLOAT32: {
1018                     success = spaceToDepthGeneric(
1019                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1020                             blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
1021                     break;
1022                 }
1023                 case OperandType::TENSOR_FLOAT16: {
1024                     success = spaceToDepthGeneric(
1025                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1026                             blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1027                     break;
1028                 }
1029                 case OperandType::TENSOR_QUANT8_ASYMM: {
1030                     success = spaceToDepthGeneric(
1031                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1032                             blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1033                     break;
1034                 }
1035                 default: {
1036                     LOG(ERROR) << "Unsupported data type";
1037                     success = false;
1038                 }
1039             }
1040             if (data_layout) {
1041                 output_tmp_guard.reset(output_tmp.buffer);
1042             }
1043             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1044                 success = false;
1045                 break;
1046             }
1047         } break;
1048         case OperationType::EMBEDDING_LOOKUP: {
1049             const RunTimeOperandInfo& values = mOperands[ins[EmbeddingLookup::kValueTensor]];
1050             const RunTimeOperandInfo& lookups = mOperands[ins[EmbeddingLookup::kLookupTensor]];
1051             RunTimeOperandInfo& output = mOperands[outs[EmbeddingLookup::kOutputTensor]];
1052 
1053             Shape outputShape;
1054             EmbeddingLookup lookup(operation, mOperands);
1055 
1056             success = embeddingLookupPrepare(values.shape(), lookups.shape(), &outputShape) &&
1057                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lookup.Eval();
1058         } break;
1059         case OperationType::HASHTABLE_LOOKUP: {
1060             const RunTimeOperandInfo& lookups = mOperands[ins[HashtableLookup::kLookupTensor]];
1061             const RunTimeOperandInfo& keys = mOperands[ins[HashtableLookup::kKeyTensor]];
1062             const RunTimeOperandInfo& values = mOperands[ins[HashtableLookup::kValueTensor]];
1063 
1064             RunTimeOperandInfo& output = mOperands[outs[HashtableLookup::kOutputTensor]];
1065             RunTimeOperandInfo& hits = mOperands[outs[HashtableLookup::kHitsTensor]];
1066 
1067             Shape outputShape, hitShape;
1068             HashtableLookup lookup(operation, mOperands);
1069 
1070             success = hashtableLookupPrepare(lookups.shape(), keys.shape(), values.shape(),
1071                                              &outputShape, &hitShape) &&
1072                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1073                       setInfoAndAllocateIfNeeded(&hits, hitShape, &result) && lookup.Eval();
1074         } break;
1075         case OperationType::LSH_PROJECTION: {
1076             RunTimeOperandInfo& output = mOperands[outs[LSHProjection::kOutputTensor]];
1077             Shape outputShape;
1078             if (!LSHProjection::Prepare(operation, mOperands, &outputShape) ||
1079                 !setInfoAndAllocateIfNeeded(&output, outputShape, &result)) {
1080                 break;
1081             }
1082 
1083             LSHProjection lsh(operation, mOperands);
1084             const RunTimeOperandInfo& hash = mOperands[ins[LSHProjection::kHashTensor]];
1085             switch (hash.type) {
1086                 case OperandType::TENSOR_FLOAT32: {
1087                     success = lsh.Eval<float>();
1088                     break;
1089                 }
1090                 case OperandType::TENSOR_FLOAT16: {
1091                     success = lsh.Eval<_Float16>();
1092                     break;
1093                 }
1094                 default: {
1095                     success = false;
1096                     LOG(ERROR) << "Unsupported data type";
1097                 }
1098             }
1099         } break;
1100         case OperationType::BIDIRECTIONAL_SEQUENCE_LSTM: {
1101             const auto merge_outputs = getScalarData<bool>(
1102                     mOperands[ins[BidirectionalSequenceLSTM::kMergeOutputsParam]]);
1103             RunTimeOperandInfo& fwOutput =
1104                     mOperands[outs[BidirectionalSequenceLSTM::kFwOutputTensor]];
1105             Shape fwOutputShape, bwOutputShape;
1106 
1107             BidirectionalSequenceLSTM lstm(operation, mOperands);
1108             success = lstm.Prepare(operation, mOperands, &fwOutputShape, &bwOutputShape) &&
1109                       setInfoAndAllocateIfNeeded(&fwOutput, fwOutputShape, &result);
1110             if (!merge_outputs) {
1111                 RunTimeOperandInfo& bwOutput =
1112                         mOperands[outs[BidirectionalSequenceLSTM::kBwOutputTensor]];
1113                 success = success && setInfoAndAllocateIfNeeded(&bwOutput, bwOutputShape, &result);
1114             }
1115             success = success && lstm.Eval();
1116         } break;
1117         case OperationType::LSTM: {
1118             RunTimeOperandInfo& scratch = mOperands[outs[LSTMCell::kScratchBufferTensor]];
1119             RunTimeOperandInfo& outputStateOut = mOperands[outs[LSTMCell::kOutputStateOutTensor]];
1120             RunTimeOperandInfo& cellStateOut = mOperands[outs[LSTMCell::kCellStateOutTensor]];
1121             RunTimeOperandInfo& output = mOperands[outs[LSTMCell::kOutputTensor]];
1122 
1123             Shape scratchShape, outputStateShape, cellStateShape, outputShape;
1124             LSTMCell lstm_cell(operation, mOperands);
1125 
1126             success = lstm_cell.Prepare(operation, mOperands, &scratchShape, &outputStateShape,
1127                                         &cellStateShape, &outputShape) &&
1128                       setInfoAndAllocateIfNeeded(&scratch, scratchShape, &result) &&
1129                       setInfoAndAllocateIfNeeded(&outputStateOut, outputStateShape, &result) &&
1130                       setInfoAndAllocateIfNeeded(&cellStateOut, cellStateShape, &result) &&
1131                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lstm_cell.Eval();
1132         } break;
1133         case OperationType::RANDOM_MULTINOMIAL: {
1134             const RunTimeOperandInfo& lookups = mOperands[ins[HashtableLookup::kLookupTensor]];
1135             const RunTimeOperandInfo& keys = mOperands[ins[HashtableLookup::kKeyTensor]];
1136             const RunTimeOperandInfo& values = mOperands[ins[HashtableLookup::kValueTensor]];
1137             RunTimeOperandInfo& output = mOperands[outs[Multinomial::kOutputTensor]];
1138 
1139             Shape outputShape;
1140             Multinomial multinomial(operation, mOperands);
1141 
1142             success = Multinomial::Prepare(operation, mOperands, &outputShape) &&
1143                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1144                       multinomial.Eval();
1145         } break;
1146         case OperationType::RNN: {
1147             RunTimeOperandInfo& hiddenStateOut = mOperands[outs[RNN::kHiddenStateOutTensor]];
1148             RunTimeOperandInfo& output = mOperands[outs[RNN::kOutputTensor]];
1149 
1150             Shape hiddenStateShape, outputShape;
1151             RNN rnn_cell(operation, mOperands);
1152 
1153             success = RNN::Prepare(operation, mOperands, &hiddenStateShape, &outputShape) &&
1154                       setInfoAndAllocateIfNeeded(&hiddenStateOut, hiddenStateShape, &result) &&
1155                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && rnn_cell.Eval();
1156         } break;
1157         case OperationType::SVDF: {
1158             RunTimeOperandInfo& stateOut = mOperands[outs[SVDF::kStateOutTensor]];
1159             RunTimeOperandInfo& output = mOperands[outs[SVDF::kOutputTensor]];
1160 
1161             Shape stateShape, outputShape;
1162             SVDF svdf(operation, mOperands);
1163 
1164             success = SVDF::Prepare(operation, mOperands, &stateShape, &outputShape) &&
1165                       setInfoAndAllocateIfNeeded(&stateOut, stateShape, &result) &&
1166                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && svdf.Eval();
1167         } break;
1168         case OperationType::BATCH_TO_SPACE_ND: {
1169             const size_t inCount = ins.size();
1170             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
1171                 return ANEURALNETWORKS_BAD_DATA;
1172             }
1173             const RunTimeOperandInfo& input = mOperands[ins[0]];
1174             const RunTimeOperandInfo& blockSize = mOperands[ins[1]];
1175             bool data_layout = inCount == 3 ? getScalarData<bool>(mOperands[ins[2]]) : false;
1176 
1177             RunTimeOperandInfo& output = mOperands[outs[0]];
1178             Shape outShape = output.shape();
1179 
1180             RunTimeOperandInfo input_tmp, output_tmp;
1181             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1182             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1183                 success = false;
1184                 break;
1185             }
1186             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
1187             output_tmp.buffer = data_layout ? nullptr : output.buffer;
1188             output_tmp.length = data_layout ? 0 : output.length;
1189 
1190             if (!batchToSpacePrepare(input_tmp.shape(),
1191                                      reinterpret_cast<const int32_t*>(blockSize.buffer),
1192                                      blockSize.shape(), &outShape) ||
1193                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1194                 if (!data_layout) output.dimensions = output_tmp.dimensions;
1195                 break;
1196             }
1197             switch (input_tmp.type) {
1198                 case OperandType::TENSOR_FLOAT32: {
1199                     success = batchToSpaceGeneric(
1200                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1201                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1202                             reinterpret_cast<float*>(output_tmp.buffer), outShape);
1203                     break;
1204                 }
1205                 case OperandType::TENSOR_FLOAT16: {
1206                     success = batchToSpaceGeneric(
1207                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1208                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1209                             reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1210                     break;
1211                 }
1212                 case OperandType::TENSOR_QUANT8_ASYMM: {
1213                     success = batchToSpaceGeneric(
1214                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1215                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1216                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1217                     break;
1218                 }
1219                 default: {
1220                     LOG(ERROR) << "Unsupported data type";
1221                     success = false;
1222                 }
1223             }
1224             if (data_layout) {
1225                 output_tmp_guard.reset(output_tmp.buffer);
1226             }
1227             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1228                 success = false;
1229                 break;
1230             }
1231         } break;
1232         case OperationType::SPACE_TO_BATCH_ND: {
1233             const size_t inCount = ins.size();
1234             if ((inCount != 4 && inCount != 3) || !allParametersPresent(inCount, 1)) {
1235                 return ANEURALNETWORKS_BAD_DATA;
1236             }
1237             const RunTimeOperandInfo& input = mOperands[ins[0]];
1238             const RunTimeOperandInfo& blockSize = mOperands[ins[1]];
1239             const RunTimeOperandInfo& paddings = mOperands[ins[2]];
1240             bool data_layout = inCount == 4 ? getScalarData<bool>(mOperands[ins[3]]) : false;
1241 
1242             RunTimeOperandInfo& output = mOperands[outs[0]];
1243             Shape outShape = output.shape();
1244 
1245             RunTimeOperandInfo input_tmp, output_tmp;
1246             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1247             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1248                 success = false;
1249                 break;
1250             }
1251             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
1252             output_tmp.buffer = data_layout ? nullptr : output.buffer;
1253             output_tmp.length = data_layout ? 0 : output.length;
1254 
1255             if (!spaceToBatchPrepare(
1256                         input_tmp.shape(), reinterpret_cast<const int32_t*>(blockSize.buffer),
1257                         blockSize.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1258                         paddings.shape(), &outShape) ||
1259                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1260                 if (!data_layout) output.dimensions = output_tmp.dimensions;
1261                 break;
1262             }
1263             switch (input_tmp.type) {
1264                 case OperandType::TENSOR_FLOAT32: {
1265                     success = spaceToBatchGeneric(
1266                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1267                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1268                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1269                             reinterpret_cast<float*>(output_tmp.buffer), outShape);
1270                     break;
1271                 }
1272                 case OperandType::TENSOR_FLOAT16: {
1273                     success = spaceToBatchGeneric(
1274                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1275                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1276                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1277                             reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1278                     break;
1279                 }
1280                 case OperandType::TENSOR_QUANT8_ASYMM: {
1281                     success = spaceToBatchGeneric(
1282                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1283                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1284                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1285                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1286                     break;
1287                 }
1288                 default: {
1289                     LOG(ERROR) << "Unsupported data type";
1290                     success = false;
1291                 }
1292             }
1293             if (data_layout) {
1294                 output_tmp_guard.reset(output_tmp.buffer);
1295             }
1296             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1297                 success = false;
1298                 break;
1299             }
1300         } break;
1301         case OperationType::PAD:
1302         case OperationType::PAD_V2: {
1303             const bool isV2 = operation.type == OperationType::PAD_V2;
1304             if (!allParametersPresent(isV2 ? 3 : 2, 1)) {
1305                 return ANEURALNETWORKS_BAD_DATA;
1306             }
1307             const RunTimeOperandInfo& input = mOperands[ins[0]];
1308             const RunTimeOperandInfo& paddings = mOperands[ins[1]];
1309 
1310             RunTimeOperandInfo& output = mOperands[outs[0]];
1311             Shape outShape = output.shape();
1312 
1313             if (!padPrepare(input.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1314                             paddings.shape(), &outShape) ||
1315                 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1316                 break;
1317             }
1318             if (input.type == OperandType::TENSOR_FLOAT32) {
1319                 float pad_value = isV2 ? getScalarData<float>(mOperands[ins[2]]) : 0;
1320                 success = padGeneric(reinterpret_cast<const float*>(input.buffer), input.shape(),
1321                                      reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1322                                      reinterpret_cast<float*>(output.buffer), outShape);
1323             } else if (input.type == OperandType::TENSOR_FLOAT16) {
1324                 _Float16 pad_value = isV2 ? getScalarData<_Float16>(mOperands[ins[2]]) : 0;
1325                 success = padGeneric(reinterpret_cast<const _Float16*>(input.buffer), input.shape(),
1326                                      reinterpret_cast<const int32_t*>(paddings.buffer),
1327                                      static_cast<_Float16>(pad_value),
1328                                      reinterpret_cast<_Float16*>(output.buffer), outShape);
1329             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1330                 uint8_t pad_value =
1331                         isV2 ? getScalarData<uint8_t>(mOperands[ins[2]]) : outShape.offset;
1332                 success = padGeneric(input.buffer, input.shape(),
1333                                      reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1334                                      output.buffer, outShape);
1335             }
1336         } break;
1337         case OperationType::CAST: {
1338             if (!allParametersPresent(1, 1)) {
1339                 return ANEURALNETWORKS_BAD_DATA;
1340             }
1341             const RunTimeOperandInfo& input = mOperands[ins[0]];
1342 
1343             RunTimeOperandInfo& output = mOperands[outs[0]];
1344             Shape outShape = output.shape();
1345 
1346             success = cast::prepare(input.shape(), &outShape) &&
1347                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1348                       cast::eval(input.buffer, input.shape(), output.buffer, outShape);
1349         } break;
1350         case OperationType::SQUEEZE: {
1351             if (ins.size() != 2 || outs.size() != 1 ||
1352                 mOperands[ins[0]].lifetime == OperandLifeTime::NO_VALUE ||
1353                 mOperands[outs[0]].lifetime == OperandLifeTime::NO_VALUE) {
1354                 LOG(ERROR) << "Wrong input/output count or lifetime for SQUEEZE op.";
1355                 return ANEURALNETWORKS_BAD_DATA;
1356             }
1357             const RunTimeOperandInfo& input = mOperands[ins[0]];
1358             const RunTimeOperandInfo& squeezeDims = mOperands[ins[1]];
1359 
1360             RunTimeOperandInfo& output = mOperands[outs[0]];
1361             Shape outShape = output.shape();
1362 
1363             success = squeezePrepare(input.shape(),
1364                                      reinterpret_cast<const int32_t*>(squeezeDims.buffer),
1365                                      squeezeDims.shape(), &outShape) &&
1366                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1367                       copyData(input.buffer, input.shape(), output.buffer, outShape);
1368         } break;
1369         case OperationType::STRIDED_SLICE: {
1370             if (!allParametersPresent(7, 1)) {
1371                 return ANEURALNETWORKS_BAD_DATA;
1372             }
1373             const RunTimeOperandInfo& input = mOperands[ins[0]];
1374             const RunTimeOperandInfo& begins = mOperands[ins[1]];
1375             const RunTimeOperandInfo& ends = mOperands[ins[2]];
1376             const RunTimeOperandInfo& strides = mOperands[ins[3]];
1377             int32_t beginMask = getScalarData<int32_t>(mOperands[ins[4]]);
1378             int32_t endMask = getScalarData<int32_t>(mOperands[ins[5]]);
1379             int32_t shrinkAxisMask = getScalarData<int32_t>(mOperands[ins[6]]);
1380 
1381             RunTimeOperandInfo& output = mOperands[outs[0]];
1382             Shape outShape = output.shape();
1383 
1384             success =
1385                     stridedSlicePrepare(
1386                             input.shape(), reinterpret_cast<const int32_t*>(begins.buffer),
1387                             begins.shape(), reinterpret_cast<const int32_t*>(ends.buffer),
1388                             ends.shape(), reinterpret_cast<const int32_t*>(strides.buffer),
1389                             strides.shape(), beginMask, endMask, shrinkAxisMask, &outShape) &&
1390                     setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1391                     stridedSliceGeneric(input.buffer, input.shape(),
1392                                         reinterpret_cast<const int32_t*>(begins.buffer),
1393                                         reinterpret_cast<const int32_t*>(ends.buffer),
1394                                         reinterpret_cast<const int32_t*>(strides.buffer), beginMask,
1395                                         endMask, shrinkAxisMask, output.buffer, outShape);
1396         } break;
1397         case OperationType::MEAN: {
1398             if (!allParametersPresent(3, 1)) {
1399                 return ANEURALNETWORKS_BAD_DATA;
1400             }
1401             const RunTimeOperandInfo& input = mOperands[ins[0]];
1402             const RunTimeOperandInfo& axis = mOperands[ins[1]];
1403             int32_t keepDims = getScalarData<int32_t>(mOperands[ins[2]]);
1404 
1405             RunTimeOperandInfo& output = mOperands[outs[0]];
1406             Shape outShape = output.shape();
1407 
1408             if (!meanPrepare(input.shape(), reinterpret_cast<const int32_t*>(axis.buffer),
1409                              axis.shape(), keepDims > 0, &outShape) ||
1410                 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1411                 break;
1412             }
1413             if (input.type == OperandType::TENSOR_FLOAT16) {
1414                 success = meanFloat16(reinterpret_cast<_Float16*>(input.buffer), input.shape(),
1415                                       reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(),
1416                                       keepDims > 0, reinterpret_cast<_Float16*>(output.buffer),
1417                                       outShape);
1418             } else if (input.type == OperandType::TENSOR_FLOAT32) {
1419                 success = meanGeneric<float, float>(
1420                         reinterpret_cast<float*>(input.buffer), input.shape(),
1421                         reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1422                         reinterpret_cast<float*>(output.buffer), outShape);
1423             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1424                 success = meanGeneric<uint8_t, int32_t>(
1425                         reinterpret_cast<uint8_t*>(input.buffer), input.shape(),
1426                         reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1427                         reinterpret_cast<uint8_t*>(output.buffer), outShape);
1428             }
1429         } break;
1430         case OperationType::ARGMAX:
1431         case OperationType::ARGMIN: {
1432             if (!allParametersPresent(2, 1)) {
1433                 return ANEURALNETWORKS_BAD_DATA;
1434             }
1435             const RunTimeOperandInfo& input = mOperands[ins[0]];
1436             int32_t axis = getScalarData<int32_t>(mOperands[ins[1]]);
1437 
1438             RunTimeOperandInfo& output = mOperands[outs[0]];
1439             Shape outShape = output.shape();
1440 
1441             const bool isArgMin = operation.type == OperationType::ARGMIN;
1442             success = argMinMaxPrepare(input.shape(), axis, &outShape) &&
1443                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1444                       argMinMaxGeneric(input.buffer, input.shape(), axis, isArgMin, output.buffer,
1445                                        outShape);
1446         } break;
1447         case OperationType::EXPAND_DIMS: {
1448             if (!allParametersPresent(2, 1)) {
1449                 return ANEURALNETWORKS_BAD_DATA;
1450             }
1451             const RunTimeOperandInfo& input = mOperands[ins[0]];
1452             int32_t axis = getScalarData<int32_t>(mOperands[ins[1]]);
1453 
1454             RunTimeOperandInfo& output = mOperands[outs[0]];
1455             Shape outShape = output.shape();
1456 
1457             success = expand_dims::prepare(input.shape(), axis, &outShape) &&
1458                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1459                       expand_dims::eval(input.buffer, input.shape(), axis, output.buffer, outShape);
1460         } break;
1461         case OperationType::SPLIT: {
1462             if (ins.size() != 3) {
1463                 LOG(ERROR) << "Wrong input count";
1464                 return ANEURALNETWORKS_BAD_DATA;
1465             }
1466 
1467             const RunTimeOperandInfo& input = mOperands[ins[0]];
1468             const int32_t axis = getScalarData<int32_t>(mOperands[ins[1]]);
1469             const int32_t numOutputs = getScalarData<int32_t>(mOperands[ins[2]]);
1470 
1471             if (numOutputs != outs.size()) {
1472                 return ANEURALNETWORKS_BAD_DATA;
1473             }
1474 
1475             std::vector<Shape> outputShapes(numOutputs);
1476             for (int i = 0; i < numOutputs; ++i) {
1477                 outputShapes[i] = mOperands[outs[i]].shape();
1478             }
1479 
1480             success = splitPrepare(input.shape(), axis, numOutputs, &outputShapes);
1481             for (int i = 0; i < numOutputs; ++i) {
1482                 success = success && setInfoAndAllocateIfNeeded(&(mOperands[outs[i]]),
1483                                                                 outputShapes[i], &result);
1484             }
1485             switch (input.type) {
1486                 case OperandType::TENSOR_FLOAT16: {
1487                     std::vector<_Float16*> outputDataPtrs(numOutputs);
1488                     for (int i = 0; i < numOutputs; ++i) {
1489                         outputDataPtrs[i] = reinterpret_cast<_Float16*>(mOperands[outs[i]].buffer);
1490                     }
1491                     success = success &&
1492                               splitFloat16(reinterpret_cast<const _Float16*>(input.buffer),
1493                                            input.shape(), axis, &outputDataPtrs, outputShapes);
1494                 } break;
1495                 case OperandType::TENSOR_FLOAT32: {
1496                     std::vector<float*> outputDataPtrs(numOutputs);
1497                     for (int i = 0; i < numOutputs; ++i) {
1498                         outputDataPtrs[i] = reinterpret_cast<float*>(mOperands[outs[i]].buffer);
1499                     }
1500                     success = success &&
1501                               splitFloat32(reinterpret_cast<const float*>(input.buffer),
1502                                            input.shape(), axis, &outputDataPtrs, outputShapes);
1503                 } break;
1504                 case OperandType::TENSOR_INT32: {
1505                     std::vector<int32_t*> outputDataPtrs(numOutputs);
1506                     for (int i = 0; i < numOutputs; ++i) {
1507                         outputDataPtrs[i] = reinterpret_cast<int32_t*>(mOperands[outs[i]].buffer);
1508                     }
1509                     success = success &&
1510                               splitInt32(reinterpret_cast<const int32_t*>(input.buffer),
1511                                          input.shape(), axis, &outputDataPtrs, outputShapes);
1512                 } break;
1513                 case OperandType::TENSOR_QUANT8_ASYMM: {
1514                     std::vector<uint8_t*> outputDataPtrs(numOutputs);
1515                     for (int i = 0; i < numOutputs; ++i) {
1516                         outputDataPtrs[i] = reinterpret_cast<uint8_t*>(mOperands[outs[i]].buffer);
1517                     }
1518                     success = success &&
1519                               splitQuant8(reinterpret_cast<const uint8_t*>(input.buffer),
1520                                           input.shape(), axis, &outputDataPtrs, outputShapes);
1521                 } break;
1522                 default: {
1523                     return ANEURALNETWORKS_BAD_DATA;
1524                 }
1525             }
1526         } break;
1527         case OperationType::MAXIMUM:
1528         case OperationType::MINIMUM: {
1529             if (!allParametersPresent(2, 1)) {
1530                 return ANEURALNETWORKS_BAD_DATA;
1531             }
1532             const RunTimeOperandInfo& in1 = mOperands[ins[0]];
1533             const RunTimeOperandInfo& in2 = mOperands[ins[1]];
1534 
1535             RunTimeOperandInfo& output = mOperands[outs[0]];
1536             Shape outputShape = output.shape();
1537 
1538             const bool isMinimum = operation.type == OperationType::MINIMUM;
1539             success = maximum_minimum::prepare(in1.shape(), in2.shape(), &outputShape) &&
1540                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1541                       maximum_minimum::eval(in1.buffer, in1.shape(), in2.buffer, in2.shape(),
1542                                             isMinimum, output.buffer, outputShape);
1543         } break;
1544         case OperationType::GROUPED_CONV_2D: {
1545             const size_t inCount = ins.size();
1546             if ((inCount != 12 && inCount != 9) || !allParametersPresent(inCount, 1)) {
1547                 return ANEURALNETWORKS_BAD_DATA;
1548             }
1549             const RunTimeOperandInfo& input = mOperands[ins[0]];
1550             const RunTimeOperandInfo& filter = mOperands[ins[1]];
1551             const RunTimeOperandInfo& bias = mOperands[ins[2]];
1552 
1553             int32_t padding_left, padding_right;
1554             int32_t padding_top, padding_bottom;
1555             int32_t padding_implicit = 0;
1556             int32_t stride_width, stride_height;
1557             int32_t numGroups;
1558             int32_t activation;
1559             bool data_layout = false;
1560 
1561             if (inCount == 12) {
1562                 padding_left = getScalarData<int32_t>(mOperands[ins[3]]);
1563                 padding_right = getScalarData<int32_t>(mOperands[ins[4]]);
1564                 padding_top = getScalarData<int32_t>(mOperands[ins[5]]);
1565                 padding_bottom = getScalarData<int32_t>(mOperands[ins[6]]);
1566                 stride_width = getScalarData<int32_t>(mOperands[ins[7]]);
1567                 stride_height = getScalarData<int32_t>(mOperands[ins[8]]);
1568                 numGroups = getScalarData<int32_t>(mOperands[ins[9]]);
1569                 activation = getScalarData<int32_t>(mOperands[ins[10]]);
1570                 data_layout = getScalarData<bool>(mOperands[ins[11]]);
1571             } else {
1572                 padding_implicit = getScalarData<int32_t>(mOperands[ins[3]]);
1573                 stride_width = getScalarData<int32_t>(mOperands[ins[4]]);
1574                 stride_height = getScalarData<int32_t>(mOperands[ins[5]]);
1575                 numGroups = getScalarData<int32_t>(mOperands[ins[6]]);
1576                 activation = getScalarData<int32_t>(mOperands[ins[7]]);
1577                 data_layout = getScalarData<bool>(mOperands[ins[8]]);
1578             }
1579 
1580             RunTimeOperandInfo& output = mOperands[outs[0]];
1581             Shape outShape = output.shape();
1582 
1583             RunTimeOperandInfo input_tmp, output_tmp;
1584             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1585             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1586                 success = false;
1587                 break;
1588             }
1589             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
1590             output_tmp.buffer = data_layout ? nullptr : output.buffer;
1591             output_tmp.length = data_layout ? 0 : output.length;
1592 
1593             if (inCount == 9) {
1594                 Shape inputShape = input_tmp.shape();
1595                 Shape filterShape = filter.shape();
1596                 int32_t input_width = getSizeOfDimension(inputShape, 2);
1597                 int32_t input_height = getSizeOfDimension(inputShape, 1);
1598                 int32_t filter_width = getSizeOfDimension(filterShape, 2);
1599                 int32_t filter_height = getSizeOfDimension(filterShape, 1);
1600                 calculateExplicitPadding(input_width, stride_width, filter_width, padding_implicit,
1601                                          &padding_left, &padding_right);
1602                 calculateExplicitPadding(input_height, stride_height, filter_height,
1603                                          padding_implicit, &padding_top, &padding_bottom);
1604             }
1605 
1606             if (!groupedConvPrepare(input_tmp.shape(), filter.shape(), bias.shape(), padding_left,
1607                                     padding_right, padding_top, padding_bottom, stride_width,
1608                                     stride_height, numGroups, &outShape) ||
1609                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1610                 if (!data_layout) output.dimensions = output_tmp.dimensions;
1611                 success = false;
1612                 break;
1613             }
1614 
1615             if (input_tmp.type == OperandType::TENSOR_FLOAT32) {
1616                 success = groupedConvFloat32(
1617                         reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1618                         reinterpret_cast<const float*>(filter.buffer), filter.shape(),
1619                         reinterpret_cast<const float*>(bias.buffer), bias.shape(), padding_left,
1620                         padding_right, padding_top, padding_bottom, stride_width, stride_height,
1621                         numGroups, activation, reinterpret_cast<float*>(output_tmp.buffer),
1622                         outShape);
1623             } else if (input_tmp.type == OperandType::TENSOR_FLOAT16) {
1624                 success = groupedConvFloat16(
1625                         reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1626                         reinterpret_cast<const _Float16*>(filter.buffer), filter.shape(),
1627                         reinterpret_cast<const _Float16*>(bias.buffer), bias.shape(), padding_left,
1628                         padding_right, padding_top, padding_bottom, stride_width, stride_height,
1629                         numGroups, activation, reinterpret_cast<_Float16*>(output_tmp.buffer),
1630                         outShape);
1631             } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM) {
1632                 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
1633                     success = groupedConvQuant8PerChannel(
1634                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1635                             reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1636                             filter.extraParams.channelQuant().scales.data(),
1637                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1638                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1639                             stride_height, numGroups, activation,
1640                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1641                 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM) {
1642                     success = groupedConvQuant8(
1643                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1644                             reinterpret_cast<const uint8_t*>(filter.buffer), filter.shape(),
1645                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1646                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1647                             stride_height, numGroups, activation,
1648                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1649                 }
1650             }
1651 
1652             if (data_layout) {
1653                 output_tmp_guard.reset(output_tmp.buffer);
1654             }
1655             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1656                 success = false;
1657                 break;
1658             }
1659         } break;
1660         case OperationType::TILE: {
1661             if (!allParametersPresent(2, 1)) {
1662                 return ANEURALNETWORKS_BAD_DATA;
1663             }
1664             const RunTimeOperandInfo& input = mOperands[ins[0]];
1665             const RunTimeOperandInfo& multiples = mOperands[ins[1]];
1666 
1667             RunTimeOperandInfo& output = mOperands[outs[0]];
1668             Shape outShape = output.shape();
1669 
1670             success =
1671                     tile::prepare(input.shape(), reinterpret_cast<const int32_t*>(multiples.buffer),
1672                                   multiples.shape(), &outShape) &&
1673                     setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1674                     tile::eval(input.buffer, input.shape(),
1675                                reinterpret_cast<const int32_t*>(multiples.buffer), output.buffer,
1676                                outShape);
1677         } break;
1678         case OperationType::QUANTIZED_16BIT_LSTM: {
1679             if (!allParametersPresent(15, 2)) {
1680                 return ANEURALNETWORKS_BAD_DATA;
1681             }
1682 
1683             RunTimeOperandInfo& cellStateOut =
1684                     mOperands[outs[QuantizedLSTMCell::kCellStateOutTensor]];
1685             RunTimeOperandInfo& output = mOperands[outs[QuantizedLSTMCell::kOutputTensor]];
1686 
1687             Shape cellStateOutShape, outputShape;
1688             QuantizedLSTMCell quantizedLSTMCell(operation, mOperands);
1689 
1690             success = QuantizedLSTMCell::prepare(operation, mOperands, &cellStateOutShape,
1691                                                  &outputShape) &&
1692                       setInfoAndAllocateIfNeeded(&cellStateOut, cellStateOutShape, &result) &&
1693                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1694                       quantizedLSTMCell.eval();
1695         } break;
1696         case OperationType::POW: {
1697             if (!allParametersPresent(2, 1)) {
1698                 return ANEURALNETWORKS_BAD_DATA;
1699             }
1700             const RunTimeOperandInfo& base = mOperands[ins[0]];
1701             const RunTimeOperandInfo& exponent = mOperands[ins[1]];
1702 
1703             RunTimeOperandInfo& output = mOperands[outs[0]];
1704             Shape outShape = output.shape();
1705 
1706             success = pow::prepare(base.shape(), exponent.shape(), &outShape) &&
1707                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1708                       pow::eval(base.buffer, base.shape(), exponent.buffer, exponent.shape(),
1709                                 output.buffer, outShape);
1710         } break;
1711         case OperationType::TOPK_V2: {
1712             if (!allParametersPresent(2, 2)) {
1713                 return ANEURALNETWORKS_BAD_DATA;
1714             }
1715             const RunTimeOperandInfo& input = mOperands[ins[0]];
1716             int32_t k = getScalarData<int32_t>(mOperands[ins[1]]);
1717 
1718             RunTimeOperandInfo& values = mOperands[outs[0]];
1719             Shape valuesShape = values.shape();
1720             RunTimeOperandInfo& indices = mOperands[outs[1]];
1721             Shape indicesShape = indices.shape();
1722 
1723             success = topk_v2::prepare(input.shape(), k, &valuesShape, &indicesShape) &&
1724                       setInfoAndAllocateIfNeeded(&values, valuesShape, &result) &&
1725                       setInfoAndAllocateIfNeeded(&indices, indicesShape, &result) &&
1726                       topk_v2::eval(input.buffer, input.shape(), k, values.buffer, valuesShape,
1727                                     indices.buffer, indicesShape);
1728         } break;
1729         default: {
1730             const OperationRegistration* operationRegistration =
1731                     mOperationResolver->findOperation(operation.type);
1732             if (operationRegistration == nullptr) {
1733                 LOG(ERROR) << getOperationName(operation.type) << " not registered";
1734             } else if (operationRegistration->prepare == nullptr ||
1735                        operationRegistration->execute == nullptr) {
1736                 LOG(ERROR) << "Incomplete operation registration: "
1737                            << getOperationName(operation.type);
1738             } else {
1739                 OperationExecutionContext context(&operation, mOperands.data());
1740                 success = operationRegistration->flags.allowOmittedOperand ||
1741                           context.checkNoOmittedOperand();
1742                 success = success && (operationRegistration->flags.allowZeroSizedInput ||
1743                                       context.checkNoZeroSizedInput());
1744                 success = success && operationRegistration->prepare(&context) &&
1745                           operationRegistration->execute(&context);
1746                 result = context.getResultCode();
1747             }
1748         }
1749     }
1750     if (!success && result == ANEURALNETWORKS_NO_ERROR) {
1751         result = ANEURALNETWORKS_OP_FAILED;
1752     }
1753     if (result != ANEURALNETWORKS_NO_ERROR) {
1754         LOG(ERROR) << getOperationName(operation.type) << " failed.";
1755         return result;
1756     }
1757 
1758     freeNoLongerUsedOperands(ins);
1759     return ANEURALNETWORKS_NO_ERROR;
1760 }
1761 
finish(int result)1762 void CpuExecutor::finish(int result) {
1763     // Free allocated temporary operands.
1764     for (auto& info : mOperands) {
1765         if (info.lifetime == OperandLifeTime::TEMPORARY_VARIABLE && info.buffer != nullptr) {
1766             delete[] info.buffer;
1767             info.buffer = nullptr;
1768         }
1769     }
1770 
1771     // Only report the output shapes when the result code is NO_ERROR or
1772     // OUTPUT_INSUFFICIENT_SIZE.
1773     if (result == ANEURALNETWORKS_NO_ERROR || result == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
1774         const auto& outputs = mModel->outputIndexes;
1775         mOutputShapes.resize(outputs.size());
1776         for (uint32_t i = 0; i < outputs.size(); i++) {
1777             const uint32_t operandIndex = outputs[i];
1778             RunTimeOperandInfo& from = mOperands[operandIndex];
1779             mOutputShapes[i].dimensions = from.dimensions;
1780             mOutputShapes[i].isSufficient = from.isSufficient();
1781         }
1782     } else {
1783         mOutputShapes.clear();
1784     }
1785 
1786     mModel = nullptr;
1787     mRequest = nullptr;
1788     mFinished = true;
1789 }
1790 
1791 // b/109953668, disable OpenMP
1792 #ifdef NNAPI_OPENMP
ScopedOpenmpSettings()1793 ScopedOpenmpSettings::ScopedOpenmpSettings() {
1794     mBlocktimeInitial = kmp_get_blocktime();
1795     kmp_set_blocktime(20);  // ms, see b/109645291
1796 
1797 #if NNAPI_LIMIT_CPU_THREADS
1798     // Code not yet enabled. Choosing the number of threads to be based on
1799     // benchmarking. See longer comment by the class declaration.
1800     mMaxThreadsInitial = Eigen::nbThreads();
1801     const int nProcs = omp_get_num_procs();
1802     int threads = nProcs;
1803     if (nProcs >= 8) {
1804         threads = nProcs - 4;
1805     } else if (nProcs >= 4) {
1806         threads = nProcs - 2;
1807     }
1808     Eigen::setNbThreads(threads);
1809 #endif
1810 }
1811 
~ScopedOpenmpSettings()1812 ScopedOpenmpSettings::~ScopedOpenmpSettings() {
1813     kmp_set_blocktime(mBlocktimeInitial);
1814 #if NNAPI_LIMIT_CPU_THREADS
1815     Eigen::setNbThreads(mMaxThreadsInitial);
1816 #endif
1817 }
1818 #endif  // NNAPI_OPENMP
1819 
1820 }  // namespace nn
1821 }  // namespace android
1822