1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "CpuExecutor"
18 
19 #include "CpuExecutor.h"
20 
21 #include <android/hardware_buffer.h>
22 #include <sys/mman.h>
23 #include <vndk/hardware_buffer.h>
24 
25 #include <Eigen/Core>
26 #include <memory>
27 #include <utility>
28 #include <vector>
29 
30 // b/109953668, disable OpenMP
31 #ifdef NNAPI_OPENMP
32 #include <omp.h>
33 #endif  // NNAPI_OPENMP
34 
35 #include "ControlFlow.h"
36 #include "NeuralNetworks.h"
37 #include "OperationResolver.h"
38 #include "Operations.h"
39 #include "OperationsUtils.h"
40 #include "Tracing.h"
41 
42 namespace android {
43 namespace nn {
44 
45 namespace {
46 
47 using namespace hal;
48 
49 class OperationExecutionContext : public IOperationExecutionContext {
50     DISALLOW_IMPLICIT_CONSTRUCTORS(OperationExecutionContext);
51 
52    public:
53     OperationExecutionContext(const Operation* operation, RunTimeOperandInfo* operands)
54         : operation(operation), operands(operands) {}
55 
56     uint32_t getNumInputs() const override;
57     OperandType getInputType(uint32_t index) const override;
58     Shape getInputShape(uint32_t index) const override;
59     const void* getInputBuffer(uint32_t index) const override;
60     const OperandExtraParams getInputExtraParams(uint32_t index) const override;
61 
62     uint32_t getNumOutputs() const override;
63     OperandType getOutputType(uint32_t index) const override;
64     Shape getOutputShape(uint32_t index) const override;
65     void* getOutputBuffer(uint32_t index) override;
66 
67     // Return false on failure and store the result code.
68     // Use getResultCode() to retrieve it at the end of the operation execution.
69     bool setOutputShape(uint32_t index, const Shape& shape) override;
70     int getResultCode() const;
71 
72     bool isOmittedInput(uint32_t index) const override;
73     bool isOmittedOutput(uint32_t index) const override;
74 
75     // Return false if any of the inputs or outputs is omitted, i.e. has a lifetime of NO_VALUE.
76     bool checkNoOmittedOperand() const;
77     // Return false if any input has a dimension of size 0.
78     bool checkNoZeroSizedInput() const;
79 
80    private:
81     const RunTimeOperandInfo* getInputInfo(uint32_t index) const;
82     const RunTimeOperandInfo* getOutputInfo(uint32_t index) const;
83     RunTimeOperandInfo* getOutputInfo(uint32_t index);
84 
85     const Operation* operation;
86     RunTimeOperandInfo* operands;
87 
88     int result = ANEURALNETWORKS_NO_ERROR;
89 };
90 
91 const RunTimeOperandInfo* OperationExecutionContext::getInputInfo(uint32_t index) const {
92     CHECK(index < operation->inputs.size());
93     return &operands[operation->inputs[index]];
94 }
95 
96 const RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) const {
97     CHECK(index < operation->outputs.size());
98     return &operands[operation->outputs[index]];
99 }
100 
101 RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) {
102     CHECK(index < operation->outputs.size());
103     return &operands[operation->outputs[index]];
104 }
105 
106 OperandType OperationExecutionContext::getInputType(uint32_t index) const {
107     return getInputInfo(index)->type;
108 }
109 
110 Shape OperationExecutionContext::getInputShape(uint32_t index) const {
111     return getInputInfo(index)->shape();
112 }
113 
114 const void* OperationExecutionContext::getInputBuffer(uint32_t index) const {
115     return getInputInfo(index)->buffer;
116 }
117 
118 const OperandExtraParams OperationExecutionContext::getInputExtraParams(uint32_t index) const {
119     return getInputInfo(index)->extraParams;
120 }
121 
122 OperandType OperationExecutionContext::getOutputType(uint32_t index) const {
123     return getOutputInfo(index)->type;
124 }
125 
126 Shape OperationExecutionContext::getOutputShape(uint32_t index) const {
127     return getOutputInfo(index)->shape();
128 }
129 
130 void* OperationExecutionContext::getOutputBuffer(uint32_t index) {
131     return getOutputInfo(index)->buffer;
132 }
133 
134 uint32_t OperationExecutionContext::getNumInputs() const {
135     return operation->inputs.size();
136 }
137 
138 uint32_t OperationExecutionContext::getNumOutputs() const {
139     return operation->outputs.size();
140 }
141 
142 int OperationExecutionContext::getResultCode() const {
143     return result;
144 }
145 
146 // TODO: Return error code directly once we've fully integrated OperationResolver with all ops.
147 // Updates the RunTimeOperandInfo with the newly calculated shape.
148 // Allocates the buffer if needed.
149 //
150 // TODO(b/153081229): This function currently cannot handle extension operands well. We need to
151 //                    propagate the extension type info into this function.
152 bool setInfoAndAllocateIfNeeded(RunTimeOperandInfo* info, const Shape& shape, int* result) {
153     // For user-provided model output operands, the parameters must match the Shape
154     // calculated from the preparation step.
155     if (info->lifetime == OperandLifeTime::SUBGRAPH_OUTPUT) {
156         if (info->type != shape.type) {
157             LOG(ERROR) << "Invalid type for model output";
158             *result = ANEURALNETWORKS_OP_FAILED;
159             return false;
160         }
161         if (info->scale != shape.scale) {
162             LOG(ERROR) << "Invalid scale for model output";
163             *result = ANEURALNETWORKS_OP_FAILED;
164             return false;
165         }
166         if (info->zeroPoint != shape.offset) {
167             LOG(ERROR) << "Invalid zeroPoint for model output";
168             *result = ANEURALNETWORKS_OP_FAILED;
169             return false;
170         }
171         if (info->extraParams != shape.extraParams) {
172             LOG(ERROR) << "Invalid extraParams for model output";
173             *result = ANEURALNETWORKS_OP_FAILED;
174             return false;
175         }
176     }
177 
178     auto combined = combineDimensions(shape.dimensions, info->dimensions);
179     if (!combined.has_value()) {
180         LOG(ERROR) << "Invalid dimensions for model operand";
181         *result = ANEURALNETWORKS_OP_FAILED;
182         return false;
183     }
184     info->dimensions = std::move(combined.value());
185     info->type = shape.type;
186     info->scale = shape.scale;
187     info->zeroPoint = shape.offset;
188     info->extraParams = shape.extraParams;
189 
190     // TODO(b/153081229): We bypass the overflow check on extension operands because we do not know
191     //                    the sizes of extension types.
192     if (!isExtensionOperandType(info->type) &&
193         nonExtensionOperandSizeOfDataOverflowsUInt32(info->type, info->dimensions)) {
194         LOG(ERROR) << "Operand data size overflows uint32_t";
195         *result = ANEURALNETWORKS_OP_FAILED;
196         return false;
197     }
198 
199     // Allocate the buffer only if the combined dimensions are fully specified.
200     if (info->buffer == nullptr && (info->lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
201                                     info->lifetime == OperandLifeTime::SUBGRAPH_OUTPUT)) {
202         if (isExtensionOperandType(info->type)) {
203             LOG(ERROR) << "Cannot allocate a variable of an extension type";
204             *result = ANEURALNETWORKS_OP_FAILED;
205             return false;
206         }
207         uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
208         if (length > 0) {
209             info->buffer = new uint8_t[length];
210             if (info->buffer == nullptr) {
211                 *result = ANEURALNETWORKS_OUT_OF_MEMORY;
212                 return false;
213             }
214             info->length = length;
215         }
216     }
217     if (!info->isSufficient()) {
218         uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
219         LOG(ERROR) << "Insufficient size for model operand: require = " << length
220                    << ", provided = " << info->length;
221         *result = ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
222         return false;
223     }
224     *result = ANEURALNETWORKS_NO_ERROR;
225     return true;
226 }
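//
// Typical usage pattern, as seen in the operation cases in executeOperation()
// below (illustrative sketch only; "fooPrepare" and "fooEval" are hypothetical
// placeholders for a per-operation prepare/eval pair):
//
//     Shape outShape = output.shape();
//     success = fooPrepare(input.shape(), &outShape) &&
//               setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
//               fooEval(input.buffer, input.shape(), output.buffer, outShape);
//
// The prepare step computes the output Shape, this helper validates it against
// the declared operand and allocates a TEMPORARY_VARIABLE or SUBGRAPH_OUTPUT
// buffer when necessary, and only then is the kernel run.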
227 
228 bool OperationExecutionContext::setOutputShape(uint32_t index, const Shape& shape) {
229     return setInfoAndAllocateIfNeeded(getOutputInfo(index), shape, &result);
230 }
231 
232 bool OperationExecutionContext::isOmittedInput(uint32_t index) const {
233     return getInputInfo(index)->lifetime == OperandLifeTime::NO_VALUE;
234 }
235 
236 bool OperationExecutionContext::isOmittedOutput(uint32_t index) const {
237     return getOutputInfo(index)->lifetime == OperandLifeTime::NO_VALUE;
238 }
239 
240 bool OperationExecutionContext::checkNoOmittedOperand() const {
241     for (uint32_t i = 0; i < operation->inputs.size(); i++) {
242         NN_RET_CHECK(!isOmittedInput(i)) << getOperationName(operation->type) << " input operand "
243                                          << i << " is required but missing.";
244     }
245     for (uint32_t i = 0; i < operation->outputs.size(); i++) {
246         NN_RET_CHECK(!isOmittedOutput(i)) << getOperationName(operation->type) << " output operand "
247                                           << i << " is required but missing.";
248     }
249     return true;
250 }
251 
252 bool OperationExecutionContext::checkNoZeroSizedInput() const {
253     for (uint32_t i = 0; i < operation->inputs.size(); i++) {
254         if (isOmittedInput(i)) continue;
255         for (uint32_t j = 0; j < getInputInfo(i)->dimensions.size(); j++) {
256             NN_RET_CHECK_NE(getInputInfo(i)->dimensions[j], 0)
257                     << getOperationName(operation->type)
258                     << " does not support zero-sized tensor, but input " << i << " dimension " << j
259                     << " is 0.";
260         }
261     }
262     return true;
263 }
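//
// Illustrative sketch (not part of this file) of how an operation implementation
// registered through OperationResolver would consume this context; the function
// name "executeSomeOp" and the float-only kernel are hypothetical:
//
//     bool executeSomeOp(IOperationExecutionContext* context) {
//         // Hypothetical shape inference: output 0 has the same shape as input 0.
//         NN_RET_CHECK(context->setOutputShape(0, context->getInputShape(0)));
//         const float* in = static_cast<const float*>(context->getInputBuffer(0));
//         float* out = static_cast<float*>(context->getOutputBuffer(0));
//         // ... run the kernel over the elements described by getInputShape(0) ...
//         return true;
//     }
//
// setOutputShape() funnels into setInfoAndAllocateIfNeeded() above, so the output
// buffer is guaranteed to be allocated before getOutputBuffer() is used.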
264 
265 }  // namespace
266 
267 // Used to keep a pointer to a memory pool.
268 //
269 // In the case of an "mmap_fd" pool, owns the mmap region
270 // returned by getBuffer() -- i.e., that region goes away
271 // when the RunTimePoolInfo is destroyed or is assigned to.
272 class RunTimePoolInfo::RunTimePoolInfoImpl {
273    public:
274     RunTimePoolInfoImpl(const hidl_memory& hidlMemory, uint8_t* buffer, const sp<IMemory>& memory,
275                         AHardwareBuffer* hardwareBuffer, uint32_t size);
276 
277     // rule of five...
278     ~RunTimePoolInfoImpl();
279     RunTimePoolInfoImpl(const RunTimePoolInfoImpl&) = delete;
280     RunTimePoolInfoImpl(RunTimePoolInfoImpl&&) noexcept = delete;
281     RunTimePoolInfoImpl& operator=(const RunTimePoolInfoImpl&) = delete;
282     RunTimePoolInfoImpl& operator=(RunTimePoolInfoImpl&&) noexcept = delete;
283 
284     uint8_t* getBuffer() const { return mBuffer; }
285     uint32_t getSize() const { return mSize; }
286 
287     bool flush() const;
288 
289     const hidl_memory& getHidlMemory() const { return mHidlMemory; }
290 
291    private:
292     const hidl_memory mHidlMemory;     // always used
293     uint8_t* const mBuffer = nullptr;  // always used
294     const sp<IMemory> mMemory;         // only used when hidlMemory.name() == "ashmem"
295     AHardwareBuffer*
296             mAHardwareBuffer;  // only used when hidlMemory.name() == "hardware_buffer_blob"
297     const uint32_t mSize;
298 };
299 
300 RunTimePoolInfo::RunTimePoolInfoImpl::RunTimePoolInfoImpl(const hidl_memory& hidlMemory,
301                                                           uint8_t* buffer,
302                                                           const sp<IMemory>& memory,
303                                                           AHardwareBuffer* hardwareBuffer,
304                                                           uint32_t size)
305     : mHidlMemory(hidlMemory),
306       mBuffer(buffer),
307       mMemory(memory),
308       mAHardwareBuffer(hardwareBuffer),
309       mSize(size) {}
310 
311 RunTimePoolInfo::RunTimePoolInfoImpl::~RunTimePoolInfoImpl() {
312     if (mBuffer == nullptr) {
313         return;
314     }
315 
316     const auto& memType = mHidlMemory.name();
317     if (memType == "ashmem") {
318         // nothing to do
319     } else if (memType == "mmap_fd") {
320         const size_t size = mHidlMemory.size();
321         if (munmap(mBuffer, size)) {
322             LOG(ERROR) << "RunTimePoolInfoImpl::~RunTimePoolInfo(): Can't munmap";
323         }
324     } else if (memType == "hardware_buffer_blob") {
325         AHardwareBuffer_unlock(mAHardwareBuffer, nullptr);
326     } else if (memType == "") {
327         // Represents a POINTER argument; nothing to do
328     } else {
329         LOG(ERROR) << "RunTimePoolInfoImpl::~RunTimePoolInfoImpl(): unsupported hidl_memory type";
330     }
331 
332     if (mAHardwareBuffer != nullptr) {
333         AHardwareBuffer_release(mAHardwareBuffer);
334     }
335 }
336 
337 // Makes sure the output data is flushed back to the underlying memory after execution.
338 bool RunTimePoolInfo::RunTimePoolInfoImpl::flush() const {
339     const auto& memType = mHidlMemory.name();
340     if (memType == "mmap_fd") {
341         const int prot = mHidlMemory.handle()->data[1];
342         if (prot & PROT_WRITE) {
343             const size_t size = mHidlMemory.size();
344             return msync(mBuffer, size, MS_SYNC) == 0;
345         }
346     }
347     // No-op for other types of memory.
348     return true;
349 }
350 
351 // TODO: short term, make shared memory mapping and updating a utility function.
352 // TODO: long term, implement mmap_fd as a hidl IMemory service.
353 std::optional<RunTimePoolInfo> RunTimePoolInfo::createFromHidlMemory(
354         const hidl_memory& hidlMemory) {
355     uint8_t* buffer = nullptr;
356     sp<IMemory> memory;
357     AHardwareBuffer* hardwareBuffer = nullptr;
358 
359     const auto& memType = hidlMemory.name();
360     if (memType == "ashmem") {
361         memory = mapMemory(hidlMemory);
362         if (memory == nullptr) {
363             LOG(ERROR) << "Can't map shared memory.";
364             return std::nullopt;
365         }
366         buffer = static_cast<uint8_t*>(static_cast<void*>(memory->getPointer()));
367         if (buffer == nullptr) {
368             LOG(ERROR) << "Can't access shared memory.";
369             return std::nullopt;
370         }
371     } else if (memType == "mmap_fd") {
372         size_t size = hidlMemory.size();
373         int fd = hidlMemory.handle()->data[0];
374         int prot = hidlMemory.handle()->data[1];
375         size_t offset = getSizeFromInts(hidlMemory.handle()->data[2], hidlMemory.handle()->data[3]);
376         buffer = static_cast<uint8_t*>(mmap(nullptr, size, prot, MAP_SHARED, fd, offset));
377         if (buffer == MAP_FAILED) {
378             LOG(ERROR) << "RunTimePoolInfo::set(): Can't mmap the file descriptor.";
379             return std::nullopt;
380         }
381     } else if (memType == "hardware_buffer_blob") {
382         auto handle = hidlMemory.handle();
383         auto format = AHARDWAREBUFFER_FORMAT_BLOB;
384         auto usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN;
385         const uint32_t width = hidlMemory.size();
386         const uint32_t height = 1;  // height is always 1 for BLOB mode AHardwareBuffer.
387         const uint32_t layers = 1;  // layers is always 1 for BLOB mode AHardwareBuffer.
388         const uint32_t stride = hidlMemory.size();
389 
390         AHardwareBuffer_Desc desc{
391                 .width = width,
392                 .format = format,
393                 .height = height,
394                 .layers = layers,
395                 .usage = usage,
396                 .stride = stride,
397         };
398         status_t status = AHardwareBuffer_createFromHandle(
399                 &desc, handle, AHARDWAREBUFFER_CREATE_FROM_HANDLE_METHOD_CLONE, &hardwareBuffer);
400         if (status != NO_ERROR) {
401             LOG(ERROR) << "RunTimePoolInfo Can't create AHardwareBuffer from handle. Error: "
402                        << status;
403             return std::nullopt;
404         }
405         void* gBuffer = nullptr;
406         status = AHardwareBuffer_lock(hardwareBuffer, usage, -1, nullptr, &gBuffer);
407         if (status != NO_ERROR) {
408             LOG(ERROR) << "RunTimePoolInfo Can't lock the AHardwareBuffer. Error: " << status;
409             return std::nullopt;
410         }
411         buffer = static_cast<uint8_t*>(gBuffer);
412     } else {
413         LOG(ERROR) << "RunTimePoolInfo::set(): unsupported hidl_memory type";
414         return std::nullopt;
415     }
416 
417     const auto impl = std::make_shared<const RunTimePoolInfoImpl>(
418             hidlMemory, buffer, memory, hardwareBuffer, hidlMemory.size());
419     return {RunTimePoolInfo(impl)};
420 }
421 
422 RunTimePoolInfo RunTimePoolInfo::createFromExistingBuffer(uint8_t* buffer, uint32_t size) {
423     const auto impl = std::make_shared<const RunTimePoolInfoImpl>(hidl_memory{}, buffer, nullptr,
424                                                                   nullptr, size);
425     return {impl};
426 }
427 
428 RunTimePoolInfo::RunTimePoolInfo(const std::shared_ptr<const RunTimePoolInfoImpl>& impl)
429     : mImpl(impl) {}
430 
431 uint8_t* RunTimePoolInfo::getBuffer() const {
432     return mImpl->getBuffer();
433 }
434 
435 uint32_t RunTimePoolInfo::getSize() const {
436     return mImpl->getSize();
437 }
438 
439 bool RunTimePoolInfo::flush() const {
440     return mImpl->flush();
441 }
442 
443 const hidl_memory& RunTimePoolInfo::getHidlMemory() const {
444     return mImpl->getHidlMemory();
445 }
446 
447 bool setRunTimePoolInfosFromHidlMemories(std::vector<RunTimePoolInfo>* poolInfos,
448                                          const hidl_vec<hidl_memory>& pools) {
449     CHECK(poolInfos != nullptr);
450     poolInfos->clear();
451     poolInfos->reserve(pools.size());
452     for (const auto& pool : pools) {
453         if (std::optional<RunTimePoolInfo> poolInfo = RunTimePoolInfo::createFromHidlMemory(pool)) {
454             poolInfos->push_back(*poolInfo);
455         } else {
456             LOG(ERROR) << "Could not map pools";
457             poolInfos->clear();
458             return false;
459         }
460     }
461     return true;
462 }
463 
464 bool setRunTimePoolInfosFromMemoryPools(std::vector<RunTimePoolInfo>* poolInfos,
465                                         const hidl_vec<Request::MemoryPool>& pools) {
466     CHECK(poolInfos != nullptr);
467     poolInfos->clear();
468     poolInfos->reserve(pools.size());
469     for (const auto& pool : pools) {
470         if (pool.getDiscriminator() != Request::MemoryPool::hidl_discriminator::hidlMemory) {
471             LOG(ERROR) << "Unknown memory token";
472             poolInfos->clear();
473             return false;
474         }
475         if (std::optional<RunTimePoolInfo> poolInfo =
476                     RunTimePoolInfo::createFromHidlMemory(pool.hidlMemory())) {
477             poolInfos->push_back(*poolInfo);
478         } else {
479             LOG(ERROR) << "Could not map pools";
480             poolInfos->clear();
481             return false;
482         }
483     }
484     return true;
485 }
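//
// Hedged usage sketch: a driver typically maps both the model pools and the
// request pools with these helpers before invoking CpuExecutor::run(). Variable
// names and the surrounding error handling are illustrative only:
//
//     std::vector<RunTimePoolInfo> modelPoolInfos, requestPoolInfos;
//     if (!setRunTimePoolInfosFromHidlMemories(&modelPoolInfos, model.pools) ||
//         !setRunTimePoolInfosFromMemoryPools(&requestPoolInfos, request.pools)) {
//         return ANEURALNETWORKS_BAD_DATA;
//     }
//     CpuExecutor executor;  // construction details omitted
//     int status = executor.run(model, request, modelPoolInfos, requestPoolInfos);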
486 
487 template <typename T>
488 inline bool convertToNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
489     uint32_t spatialSize = fromDim[2] * fromDim[3];
490     for (uint32_t n = 0; n < fromDim[0]; n++) {
491         for (uint32_t hw = 0; hw < spatialSize; hw++) {
492             for (uint32_t c = 0; c < fromDim[1]; c++) {
493                 uint32_t fromIndex = n * fromDim[1] * spatialSize + c * spatialSize + hw;
494                 *to++ = from[fromIndex];
495             }
496         }
497     }
498     return true;
499 }
500 
501 template <typename T>
502 inline bool convertFromNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
503     uint32_t spatialSize = fromDim[1] * fromDim[2];
504     for (uint32_t n = 0; n < fromDim[0]; n++) {
505         for (uint32_t c = 0; c < fromDim[3]; c++) {
506             for (uint32_t hw = 0; hw < spatialSize; hw++) {
507                 uint32_t fromIndex = n * spatialSize * fromDim[3] + hw * fromDim[3] + c;
508                 *to++ = from[fromIndex];
509             }
510         }
511     }
512     return true;
513 }
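//
// Worked example for convertToNhwcImpl above: with fromDim = {N=1, C=2, H=1, W=2}
// (NCHW), spatialSize = H*W = 2 and fromIndex = n*C*spatialSize + c*spatialSize + hw,
// so an NCHW buffer {a0, a1, b0, b1} (all of channel a, then all of channel b) is
// emitted in NHWC order as {a0, b0, a1, b1}. convertFromNhwcImpl applies the
// inverse mapping.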
514 
515 static bool convertToNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
516                           std::unique_ptr<uint8_t[]>& ptr_guard, bool data_layout) {
517     int result;
518     if (from.dimensions.size() != 4) {
519         LOG(ERROR) << "Error converting a non-4-D tensor to NHWC layout";
520         return false;
521     }
522     to.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
523     if (data_layout) {
524         // convert dimensions
525         Shape inShape = from.shape();
526         auto& fromDim = from.dimensions;
527         inShape.dimensions = {fromDim[0], fromDim[2], fromDim[3], fromDim[1]};
528         // allocate buffer
529         to.buffer = nullptr;
530         if (!setInfoAndAllocateIfNeeded(&to, inShape, &result)) {
531             return false;
532         }
533         ptr_guard.reset(to.buffer);
534         // convert value
535         if (from.type == OperandType::TENSOR_FLOAT32) {
536             return convertToNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
537                                             reinterpret_cast<const float*>(from.buffer), fromDim);
538         } else if (from.type == OperandType::TENSOR_FLOAT16) {
539             return convertToNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
540                                                reinterpret_cast<const _Float16*>(from.buffer),
541                                                fromDim);
542         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
543             return convertToNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
544                                               reinterpret_cast<const uint8_t*>(from.buffer),
545                                               fromDim);
546         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
547             return convertToNhwcImpl<int8_t>(reinterpret_cast<int8_t*>(to.buffer),
548                                              reinterpret_cast<const int8_t*>(from.buffer), fromDim);
549         } else {
550             LOG(ERROR) << "Unsupported data type";
551             return false;
552         }
553     } else {
554         to = from;
555     }
556     return true;
557 }
558 
559 static bool convertFromNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
560                             bool data_layout, int* result) {
561     if (from.dimensions.size() != 4) {
562         LOG(ERROR) << "Error converting a non-4-D tensor from NHWC layout";
563         return false;
564     }
565     if (data_layout) {
566         // convert dimensions
567         Shape outShape = from.shape();
568         auto& fromDim = from.dimensions;
569         outShape.dimensions = {fromDim[0], fromDim[3], fromDim[1], fromDim[2]};
570         // allocate buffer
571         if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
572             return false;
573         }
574         // convert value
575         if (from.type == OperandType::TENSOR_FLOAT32) {
576             return convertFromNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
577                                               reinterpret_cast<const float*>(from.buffer), fromDim);
578         } else if (from.type == OperandType::TENSOR_FLOAT16) {
579             return convertFromNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
580                                                  reinterpret_cast<const _Float16*>(from.buffer),
581                                                  fromDim);
582         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
583             return convertFromNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
584                                                 reinterpret_cast<const uint8_t*>(from.buffer),
585                                                 fromDim);
586         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
587             return convertFromNhwcImpl<int8_t>(reinterpret_cast<int8_t*>(to.buffer),
588                                                reinterpret_cast<const int8_t*>(from.buffer),
589                                                fromDim);
590         } else {
591             LOG(ERROR) << "Unsupported data type";
592             return false;
593         }
594     } else {
595         Shape outShape = from.shape();
596         to.buffer = from.buffer;
597         to.length = from.length;
598         if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
599             return false;
600         }
601     }
602     return true;
603 }
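//
// These two helpers implement the NCHW handling shared by the layout-aware
// operations in executeOperation() below (DEPTH_TO_SPACE, SPACE_TO_DEPTH,
// BATCH_TO_SPACE_ND, ...). Sketch of that pattern, with "fooPrepare" and
// "fooGeneric" standing in for the per-operation functions:
//
//     RunTimeOperandInfo input_tmp, output_tmp;
//     std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
//     if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) break;
//     output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
//     output_tmp.buffer = data_layout ? nullptr : output.buffer;
//     output_tmp.length = data_layout ? 0 : output.length;
//     if (fooPrepare(input_tmp.shape(), ..., &outShape) &&
//         setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
//         success = fooGeneric(input_tmp.buffer, input_tmp.shape(), ...,
//                              output_tmp.buffer, outShape);
//     }
//     if (data_layout) output_tmp_guard.reset(output_tmp.buffer);
//     success = success && convertFromNhwc(output, output_tmp, data_layout, &result);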
604 
605 // Decrements the usage count for the operands listed.  Frees the memory
606 // allocated for any temporary variable with a count of zero.
607 static void consumeOperationInputs(const std::vector<uint32_t>& inputs,
608                                    RunTimeOperandInfo* operands) {
609     for (uint32_t i : inputs) {
610         auto& info = operands[i];
611         // Check if it's a static or model input/output.
612         if (info.numberOfUsesLeft == 0) {
613             continue;
614         }
615         info.numberOfUsesLeft--;
616         if (info.numberOfUsesLeft == 0 && info.buffer != nullptr) {
617             delete[] info.buffer;
618             info.buffer = nullptr;
619         }
620     }
621 }
622 
623 // This function only frees TEMPORARY_VARIABLE operands that are unused
624 // outputs because consumeOperationInputs takes care of any operands
625 // that are inputs to an operation.
626 static void freeUnusedSubgraphOperands(std::vector<RunTimeOperandInfo>* operands) {
627     for (auto& info : *operands) {
628         if (info.lifetime == OperandLifeTime::TEMPORARY_VARIABLE && info.numberOfUsesLeft == 0 &&
629             info.buffer != nullptr) {
630             delete[] info.buffer;
631             info.buffer = nullptr;
632         }
633     }
634 }
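//
// Lifecycle note: initializeRunTimeInfo() seeds numberOfUsesLeft from the
// operand's numberOfConsumers for TEMPORARY_VARIABLE operands (and 0 for
// everything else). After an operation runs, consumeOperationInputs() is
// applied to its inputs, so a temporary's buffer is released as soon as its
// count reaches zero; this function then reclaims any temporaries that were
// only ever produced as unused outputs.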
635 
636 // Ignore the .pools entry in model and request.  This will have been taken care of
637 // by the caller.
638 int CpuExecutor::run(const Model& model, const Request& request,
639                      const std::vector<RunTimePoolInfo>& modelPoolInfos,
640                      const std::vector<RunTimePoolInfo>& requestPoolInfos) {
641     NNTRACE_CPU(NNTRACE_PHASE_EXECUTION, "run");
642     VLOG(CPUEXE) << "CpuExecutor::run() with request(" << SHOW_IF_DEBUG(toString(request)) << ")";
643     mModelOperandValues = &model.operandValues;
644     mModelPoolInfos = &modelPoolInfos;
645     mReferencedSubgraphs = &model.referenced;
646 
647     // b/109953668, disable OpenMP
648 #ifdef NNAPI_OPENMP
649     ScopedOpenmpSettings openMpSettings;
650 #endif  // NNAPI_OPENMP
651 
652     std::vector<RunTimeOperandInfo> operands = initializeRunTimeInfo(model.main);
653     updateForArguments(model.main.inputIndexes, request.inputs, requestPoolInfos, operands.data());
654     updateForArguments(model.main.outputIndexes, request.outputs, requestPoolInfos,
655                        operands.data());
656     int result = executeSubgraph(model.main, operands.data());
657     freeUnusedSubgraphOperands(&operands);
658 
659     if (result == ANEURALNETWORKS_NO_ERROR) {
660         VLOG(CPUEXE) << "Completed run normally";
661         for (auto& runtimeInfo : requestPoolInfos) {
662             runtimeInfo.flush();
663         }
664     }
665 
666     // Only report the output shapes when the result code is NO_ERROR or OUTPUT_INSUFFICIENT_SIZE.
667     if (result == ANEURALNETWORKS_NO_ERROR || result == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
668         setOutputShapes(model.main.outputIndexes, operands);
669     } else {
670         mOutputShapes.clear();
671     }
672 
673     mFinished = true;
674     mModelOperandValues = nullptr;
675     mModelPoolInfos = nullptr;
676     mReferencedSubgraphs = nullptr;
677     return result;
678 }
679 
680 int CpuExecutor::executeSubgraph(const Subgraph& subgraph, RunTimeOperandInfo* operands) {
681     VLOG(CPUEXE) << "CpuExecutor::executeSubgraph " << toString(subgraph);
682     // The subgraph stores its operations serialized in execution order.
683     for (const auto& operation : subgraph.operations) {
684         NN_RETURN_IF_ERROR(executeOperation(operation, operands));
685     }
686     return ANEURALNETWORKS_NO_ERROR;
687 }
688 
689 std::vector<RunTimeOperandInfo> CpuExecutor::initializeRunTimeInfo(const Subgraph& subgraph) {
690     VLOG(CPUEXE) << "CpuExecutor::initializeRunTimeInfo";
691     const size_t count = subgraph.operands.size();
692     std::vector<RunTimeOperandInfo> operands(count);
693     for (size_t i = 0; i < count; i++) {
694         const Operand& from = subgraph.operands[i];
695         RunTimeOperandInfo& to = operands[i];
696         to.type = from.type;
697         to.dimensions = from.dimensions;
698         to.scale = from.scale;
699         to.zeroPoint = from.zeroPoint;
700         to.length = from.location.length;
701         to.lifetime = from.lifetime;
702         to.extraParams = from.extraParams;
703         switch (from.lifetime) {
704             case OperandLifeTime::TEMPORARY_VARIABLE:
705                 to.buffer = nullptr;
706                 to.numberOfUsesLeft = from.numberOfConsumers;
707                 break;
708             case OperandLifeTime::CONSTANT_COPY:
709                 to.buffer = const_cast<uint8_t*>(&(*mModelOperandValues)[from.location.offset]);
710                 to.numberOfUsesLeft = 0;
711                 break;
712             case OperandLifeTime::CONSTANT_REFERENCE: {
713                 auto poolIndex = from.location.poolIndex;
714                 CHECK_LT(poolIndex, mModelPoolInfos->size());
715                 auto& r = (*mModelPoolInfos)[poolIndex];
716                 to.buffer = r.getBuffer() + from.location.offset;
717                 to.numberOfUsesLeft = 0;
718                 break;
719             }
720             case OperandLifeTime::SUBGRAPH: {
721                 auto subgraphIndex = from.location.offset;
722                 CHECK_LT(subgraphIndex, mReferencedSubgraphs->size());
723                 to.buffer = reinterpret_cast<uint8_t*>(
724                         const_cast<Subgraph*>(&(*mReferencedSubgraphs)[subgraphIndex]));
725                 to.numberOfUsesLeft = 0;
726             } break;
727             case OperandLifeTime::SUBGRAPH_INPUT:
728             case OperandLifeTime::SUBGRAPH_OUTPUT:
729             case OperandLifeTime::NO_VALUE:
730                 to.buffer = nullptr;
731                 to.numberOfUsesLeft = 0;
732                 break;
733         }
734     }
735     return operands;
736 }
737 
738 void CpuExecutor::updateForArguments(const std::vector<uint32_t>& indexes,
739                                      const hal::hidl_vec<hal::RequestArgument>& arguments,
740                                      const std::vector<RunTimePoolInfo>& requestPoolInfos,
741                                      RunTimeOperandInfo* operands) {
742     CHECK_EQ(indexes.size(), arguments.size());
743     for (size_t i = 0; i < indexes.size(); i++) {
744         const uint32_t operandIndex = indexes[i];
745         const RequestArgument& from = arguments[i];
746         RunTimeOperandInfo& to = operands[operandIndex];
747         if (from.dimensions.size() > 0) {
748             // It's the responsibility of the caller to validate that
749             // from.dimensions only modifies the dimensions that were
750             // unspecified in the model.  That's the case in SampleDriver.cpp
751             // with the call to validateRequest().
752             // TODO make sure that's the case for the default CPU path.
753             to.dimensions = from.dimensions;
754         }
755         if (from.hasNoValue) {
756             to.lifetime = OperandLifeTime::NO_VALUE;
757             CHECK(to.buffer == nullptr);
758             to.length = 0;
759         } else {
760             auto poolIndex = from.location.poolIndex;
761             CHECK_LT(poolIndex, requestPoolInfos.size());
762             auto& r = requestPoolInfos[poolIndex];
763             to.buffer = r.getBuffer() + from.location.offset;
764             if (from.location.offset == 0 && from.location.length == 0) {
765                 // Use the entire memory region.
766                 to.length = r.getSize();
767             } else {
768                 to.length = from.location.length;
769             }
770         }
771     }
772 }
773 
774 int CpuExecutor::executeOperation(const Operation& operation, RunTimeOperandInfo* operands) {
775     if (hasDeadlinePassed(mDeadline)) {
776         return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
777     }
778     if (operation.type == OperationType::IF) {
779         int result = executeIfOperation(operation, operands);
780         if (result != ANEURALNETWORKS_NO_ERROR) {
781             LOG(ERROR) << "IF failed.";
782         }
783         return result;
784     }
785     if (operation.type == OperationType::WHILE) {
786         int result = executeWhileOperation(operation, operands);
787         if (result != ANEURALNETWORKS_NO_ERROR) {
788             LOG(ERROR) << "WHILE failed.";
789         }
790         return result;
791     }
792 
793     // VLOG(CPUEXE) << "CpuExecutor::executeOperation(" << toString(operation) << ")";
794     const hidl_vec<uint32_t>& ins = operation.inputs;
795     const hidl_vec<uint32_t>& outs = operation.outputs;
796     bool success = false;
797     int result = ANEURALNETWORKS_NO_ERROR;
798 
799     // Function to verify that the number of input and output parameters
800     // matches what is expected.  Also checks that all the parameters have
801     // values. This function is to be used only for operations that do not
802     // accept optional arguments.
803     // TODO Have a version that works for optional arguments.
804     auto allParametersPresent = [&operation, &operands, &ins, &outs](size_t requiredIns,
805                                                                      size_t requiredOuts) -> bool {
806         auto verify = [&operation, &operands](size_t requiredCount,
807                                               const hidl_vec<uint32_t>& indexes,
808                                               const char* type) -> bool {
809             size_t actualCount = indexes.size();
810             if (actualCount != requiredCount) {
811                 LOG(ERROR) << getOperationName(operation.type) << ": Invalid number of " << type
812                            << " operands. Got " << actualCount << " of " << requiredCount;
813                 return false;
814             }
815             for (size_t i = 0; i < actualCount; i++) {
816                 if (operands[indexes[i]].lifetime == OperandLifeTime::NO_VALUE) {
817                     LOG(ERROR) << getOperationName(operation.type) << " " << type << " operand "
818                                << i << " is required but missing.";
819                     return false;
820                 }
821             }
822             return true;
823         };
824 
825         auto verifyNoZeroSizedInputs = [&operation, &operands](const hidl_vec<uint32_t>& indexes) {
826             for (size_t i = 0; i < indexes.size(); i++) {
827                 for (size_t j = 0; j < operands[indexes[i]].dimensions.size(); j++) {
828                     if (operands[indexes[i]].dimensions[j] == 0) {
829                         LOG(ERROR) << getOperationName(operation.type)
830                                    << " does not support zero-sized tensor, but input " << i
831                                    << " dimension " << j << " is zero.";
832                         return false;
833                     }
834                 }
835             }
836             return true;
837         };
838 
839         return verify(requiredIns, ins, "in") && verify(requiredOuts, outs, "out") &&
840                verifyNoZeroSizedInputs(ins);
841     };
842 
843     switch (operation.type) {
844         case OperationType::OEM_OPERATION: {
845             LOG(ERROR) << "OEM operation not supported for CPU execution";
846             success = false;
847         } break;
848         case OperationType::RESHAPE: {
849             if (!allParametersPresent(2, 1)) {
850                 return ANEURALNETWORKS_BAD_DATA;
851             }
852             const RunTimeOperandInfo& input = operands[ins[0]];
853             const RunTimeOperandInfo& targetShape = operands[ins[1]];
854 
855             RunTimeOperandInfo& output = operands[outs[0]];
856             Shape outShape = output.shape();
857 
858             success = reshapePrepare(input.shape(),
859                                      reinterpret_cast<const int32_t*>(targetShape.buffer),
860                                      getNumberOfElements(targetShape.shape()), &outShape) &&
861                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
862                       copyData(input.buffer, input.shape(), output.buffer, outShape);
863         } break;
864         case OperationType::DEPTH_TO_SPACE: {
865             const size_t inCount = ins.size();
866             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
867                 return ANEURALNETWORKS_BAD_DATA;
868             }
869             const RunTimeOperandInfo& input = operands[ins[0]];
870             int32_t blockSize = getScalarData<int32_t>(operands[ins[1]]);
871             bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
872 
873             RunTimeOperandInfo& output = operands[outs[0]];
874             Shape outShape = output.shape();
875 
876             RunTimeOperandInfo input_tmp, output_tmp;
877             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
878             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
879                 success = false;
880                 break;
881             }
882             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
883             output_tmp.buffer = data_layout ? nullptr : output.buffer;
884             output_tmp.length = data_layout ? 0 : output.length;
885             if (!depthToSpacePrepare(input_tmp.shape(), blockSize, &outShape) ||
886                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
887                 if (!data_layout) output.dimensions = output_tmp.dimensions;
888                 break;
889             }
890             switch (input_tmp.type) {
891                 case OperandType::TENSOR_FLOAT32: {
892                     success = depthToSpaceGeneric(
893                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
894                             blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
895                     break;
896                 }
897                 case OperandType::TENSOR_FLOAT16: {
898                     success = depthToSpaceGeneric(
899                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
900                             blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
901                     break;
902                 }
903                 case OperandType::TENSOR_QUANT8_ASYMM: {
904                     success = depthToSpaceGeneric(
905                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
906                             blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
907                     break;
908                 }
909                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
910                     success = depthToSpaceGeneric(
911                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
912                             blockSize, reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
913                     break;
914                 }
915                 default: {
916                     LOG(ERROR) << "Unsupported data type";
917                     success = false;
918                 }
919             }
920             if (data_layout) {
921                 output_tmp_guard.reset(output_tmp.buffer);
922             }
923             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
924                 success = false;
925                 break;
926             }
927         } break;
928         case OperationType::SPACE_TO_DEPTH: {
929             const size_t inCount = ins.size();
930             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
931                 return ANEURALNETWORKS_BAD_DATA;
932             }
933             const RunTimeOperandInfo& input = operands[ins[0]];
934             int32_t blockSize = getScalarData<int32_t>(operands[ins[1]]);
935             bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
936 
937             RunTimeOperandInfo& output = operands[outs[0]];
938             Shape outShape = output.shape();
939 
940             RunTimeOperandInfo input_tmp, output_tmp;
941             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
942             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
943                 success = false;
944                 break;
945             }
946             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
947             output_tmp.buffer = data_layout ? nullptr : output.buffer;
948             output_tmp.length = data_layout ? 0 : output.length;
949 
950             if (!spaceToDepthPrepare(input_tmp.shape(), blockSize, &outShape) ||
951                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
952                 if (!data_layout) output.dimensions = output_tmp.dimensions;
953                 break;
954             }
955             switch (input_tmp.type) {
956                 case OperandType::TENSOR_FLOAT32: {
957                     success = spaceToDepthGeneric(
958                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
959                             blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
960                     break;
961                 }
962                 case OperandType::TENSOR_FLOAT16: {
963                     success = spaceToDepthGeneric(
964                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
965                             blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
966                     break;
967                 }
968                 case OperandType::TENSOR_QUANT8_ASYMM: {
969                     success = spaceToDepthGeneric(
970                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
971                             blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
972                     break;
973                 }
974                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
975                     success = spaceToDepthGeneric(
976                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
977                             blockSize, reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
978                     break;
979                 }
980                 default: {
981                     LOG(ERROR) << "Unsupported data type";
982                     success = false;
983                 }
984             }
985             if (data_layout) {
986                 output_tmp_guard.reset(output_tmp.buffer);
987             }
988             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
989                 success = false;
990                 break;
991             }
992         } break;
993         case OperationType::EMBEDDING_LOOKUP: {
994             if (!allParametersPresent(2, 1)) {
995                 return ANEURALNETWORKS_BAD_DATA;
996             }
997             const RunTimeOperandInfo& values = operands[ins[EmbeddingLookup::kValueTensor]];
998             const RunTimeOperandInfo& lookups = operands[ins[EmbeddingLookup::kLookupTensor]];
999             RunTimeOperandInfo& output = operands[outs[EmbeddingLookup::kOutputTensor]];
1000 
1001             Shape outputShape;
1002             EmbeddingLookup lookup(operation, operands);
1003 
1004             success = embeddingLookupPrepare(values.shape(), lookups.shape(), &outputShape) &&
1005                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lookup.Eval();
1006         } break;
1007         case OperationType::HASHTABLE_LOOKUP: {
1008             if (!allParametersPresent(3, 2)) {
1009                 return ANEURALNETWORKS_BAD_DATA;
1010             }
1011             const RunTimeOperandInfo& lookups = operands[ins[HashtableLookup::kLookupTensor]];
1012             const RunTimeOperandInfo& keys = operands[ins[HashtableLookup::kKeyTensor]];
1013             const RunTimeOperandInfo& values = operands[ins[HashtableLookup::kValueTensor]];
1014 
1015             RunTimeOperandInfo& output = operands[outs[HashtableLookup::kOutputTensor]];
1016             RunTimeOperandInfo& hits = operands[outs[HashtableLookup::kHitsTensor]];
1017 
1018             Shape outputShape, hitShape;
1019             HashtableLookup lookup(operation, operands);
1020 
1021             success = hashtableLookupPrepare(lookups.shape(), keys.shape(), values.shape(),
1022                                              &outputShape, &hitShape) &&
1023                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1024                       setInfoAndAllocateIfNeeded(&hits, hitShape, &result) && lookup.Eval();
1025         } break;
1026         case OperationType::LSH_PROJECTION: {
1027             RunTimeOperandInfo& output = operands[outs[LSHProjection::kOutputTensor]];
1028             Shape outputShape;
1029             if (!LSHProjection::Prepare(operation, operands, &outputShape) ||
1030                 !setInfoAndAllocateIfNeeded(&output, outputShape, &result)) {
1031                 break;
1032             }
1033 
1034             LSHProjection lsh(operation, operands);
1035             const RunTimeOperandInfo& hash = operands[ins[LSHProjection::kHashTensor]];
1036             switch (hash.type) {
1037                 case OperandType::TENSOR_FLOAT32: {
1038                     success = lsh.Eval<float>();
1039                     break;
1040                 }
1041                 case OperandType::TENSOR_FLOAT16: {
1042                     success = lsh.Eval<_Float16>();
1043                     break;
1044                 }
1045                 default: {
1046                     success = false;
1047                     LOG(ERROR) << "Unsupported data type";
1048                 }
1049             }
1050         } break;
1051         case OperationType::BIDIRECTIONAL_SEQUENCE_LSTM: {
1052             const auto merge_outputs = getScalarData<bool>(
1053                     operands[ins[BidirectionalSequenceLSTM::kMergeOutputsParam]]);
1054             const bool output_state = (outs.size() == 5 || outs.size() == 6);
1055             RunTimeOperandInfo& fwOutput =
1056                     operands[outs[BidirectionalSequenceLSTM::kFwOutputTensor]];
1057             Shape fwOutputShape, bwOutputShape, fwOutputActivationStateShape,
1058                     fwOutputCellStateShape, bwOutputActivationStateShape, bwOutputCellStateShape;
1059 
1060             BidirectionalSequenceLSTM lstm(operation, operands);
1061             success = lstm.Prepare(operation, operands, &fwOutputShape, &bwOutputShape,
1062                                    &fwOutputActivationStateShape, &fwOutputCellStateShape,
1063                                    &bwOutputActivationStateShape, &bwOutputCellStateShape) &&
1064                       setInfoAndAllocateIfNeeded(&fwOutput, fwOutputShape, &result);
1065             if (!merge_outputs) {
1066                 RunTimeOperandInfo& bwOutput =
1067                         operands[outs[BidirectionalSequenceLSTM::kBwOutputTensor]];
1068                 success = success && setInfoAndAllocateIfNeeded(&bwOutput, bwOutputShape, &result);
1069             }
1070             if (output_state) {
1071                 uint32_t delta = merge_outputs ? 1 : 0;
1072                 RunTimeOperandInfo& fwOutputActivationState =
1073                         operands[outs[BidirectionalSequenceLSTM::kFwOutputActivationStateTensor -
1074                                       delta]];
1075                 RunTimeOperandInfo& fwOutputCellState =
1076                         operands[outs[BidirectionalSequenceLSTM::kFwOutputCellStateTensor - delta]];
1077                 RunTimeOperandInfo& bwOutputActivationState =
1078                         operands[outs[BidirectionalSequenceLSTM::kBwOutputActivationStateTensor -
1079                                       delta]];
1080                 RunTimeOperandInfo& bwOutputCellState =
1081                         operands[outs[BidirectionalSequenceLSTM::kBwOutputCellStateTensor - delta]];
1082                 success = success &&
1083                           setInfoAndAllocateIfNeeded(&fwOutputActivationState,
1084                                                      fwOutputActivationStateShape, &result) &&
1085                           setInfoAndAllocateIfNeeded(&fwOutputCellState, fwOutputCellStateShape,
1086                                                      &result) &&
1087                           setInfoAndAllocateIfNeeded(&bwOutputActivationState,
1088                                                      bwOutputActivationStateShape, &result) &&
1089                           setInfoAndAllocateIfNeeded(&bwOutputCellState, bwOutputCellStateShape,
1090                                                      &result);
1091             }
1092             success = success && lstm.Eval();
1093         } break;
1094         case OperationType::LSTM: {
1095             RunTimeOperandInfo& scratch = operands[outs[LSTMCell::kScratchBufferTensor]];
1096             RunTimeOperandInfo& outputStateOut = operands[outs[LSTMCell::kOutputStateOutTensor]];
1097             RunTimeOperandInfo& cellStateOut = operands[outs[LSTMCell::kCellStateOutTensor]];
1098             RunTimeOperandInfo& output = operands[outs[LSTMCell::kOutputTensor]];
1099 
1100             Shape scratchShape, outputStateShape, cellStateShape, outputShape;
1101             LSTMCell lstm_cell(operation, operands);
1102 
1103             success = lstm_cell.Prepare(operation, operands, &scratchShape, &outputStateShape,
1104                                         &cellStateShape, &outputShape) &&
1105                       setInfoAndAllocateIfNeeded(&scratch, scratchShape, &result) &&
1106                       setInfoAndAllocateIfNeeded(&outputStateOut, outputStateShape, &result) &&
1107                       setInfoAndAllocateIfNeeded(&cellStateOut, cellStateShape, &result) &&
1108                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lstm_cell.Eval();
1109         } break;
1110         case OperationType::RANDOM_MULTINOMIAL: {
1111             if (!allParametersPresent(3, 1)) {
1112                 return ANEURALNETWORKS_BAD_DATA;
1113             }
1114             const RunTimeOperandInfo& lookups = operands[ins[HashtableLookup::kLookupTensor]];
1115             const RunTimeOperandInfo& keys = operands[ins[HashtableLookup::kKeyTensor]];
1116             const RunTimeOperandInfo& values = operands[ins[HashtableLookup::kValueTensor]];
1117             RunTimeOperandInfo& output = operands[outs[Multinomial::kOutputTensor]];
1118 
1119             Shape outputShape;
1120             Multinomial multinomial(operation, operands);
1121 
1122             success = Multinomial::Prepare(operation, operands, &outputShape) &&
1123                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1124                       multinomial.Eval();
1125         } break;
1126         case OperationType::RNN: {
1127             if (!allParametersPresent(6, 2)) {
1128                 return ANEURALNETWORKS_BAD_DATA;
1129             }
1130 
1131             RunTimeOperandInfo& hiddenStateOut = operands[outs[RNN::kHiddenStateOutTensor]];
1132             RunTimeOperandInfo& output = operands[outs[RNN::kOutputTensor]];
1133 
1134             Shape hiddenStateShape, outputShape;
1135             RNN rnn_cell(operation, operands);
1136 
1137             success = RNN::Prepare(operation, operands, &hiddenStateShape, &outputShape) &&
1138                       setInfoAndAllocateIfNeeded(&hiddenStateOut, hiddenStateShape, &result) &&
1139                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && rnn_cell.Eval();
1140         } break;
1141         case OperationType::SVDF: {
1142             RunTimeOperandInfo& stateOut = operands[outs[SVDF::kStateOutTensor]];
1143             RunTimeOperandInfo& output = operands[outs[SVDF::kOutputTensor]];
1144 
1145             Shape stateShape, outputShape;
1146             SVDF svdf(operation, operands);
1147 
1148             success = SVDF::Prepare(operation, operands, &stateShape, &outputShape) &&
1149                       setInfoAndAllocateIfNeeded(&stateOut, stateShape, &result) &&
1150                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && svdf.Eval();
1151         } break;
1152         case OperationType::BATCH_TO_SPACE_ND: {
1153             const size_t inCount = ins.size();
1154             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
1155                 return ANEURALNETWORKS_BAD_DATA;
1156             }
1157             const RunTimeOperandInfo& input = operands[ins[0]];
1158             const RunTimeOperandInfo& blockSize = operands[ins[1]];
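            // The optional layout scalar selects NCHW when true; the kernels below operate on
            // NHWC, so NCHW data is converted before and after the computation.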
1159             bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
1160 
1161             RunTimeOperandInfo& output = operands[outs[0]];
1162             Shape outShape = output.shape();
1163 
1164             RunTimeOperandInfo input_tmp, output_tmp;
1165             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1166             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1167                 success = false;
1168                 break;
1169             }
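            // Stage the result in output_tmp: for NHWC input it aliases the caller's output
            // buffer; for NCHW input a temporary NHWC buffer is allocated here and converted
            // back at the end of this case.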
1170             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
1171             output_tmp.buffer = data_layout ? nullptr : output.buffer;
1172             output_tmp.length = data_layout ? 0 : output.length;
1173 
1174             if (!batchToSpacePrepare(input_tmp.shape(),
1175                                      reinterpret_cast<const int32_t*>(blockSize.buffer),
1176                                      blockSize.shape(), &outShape) ||
1177                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1178                 if (!data_layout) output.dimensions = output_tmp.dimensions;
1179                 break;
1180             }
1181             switch (input_tmp.type) {
1182                 case OperandType::TENSOR_FLOAT32: {
1183                     success = batchToSpaceGeneric(
1184                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1185                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1186                             reinterpret_cast<float*>(output_tmp.buffer), outShape);
1187                     break;
1188                 }
1189                 case OperandType::TENSOR_FLOAT16: {
1190                     success = batchToSpaceGeneric(
1191                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1192                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1193                             reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1194                     break;
1195                 }
1196                 case OperandType::TENSOR_QUANT8_ASYMM: {
1197                     success = batchToSpaceGeneric(
1198                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1199                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1200                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1201                     break;
1202                 }
1203                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1204                     success = batchToSpaceGeneric(
1205                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1206                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1207                             reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1208                     break;
1209                 }
1210                 default: {
1211                     LOG(ERROR) << "Unsupported data type";
1212                     success = false;
1213                 }
1214             }
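            // For NCHW, output_tmp owns a freshly allocated NHWC buffer; hand it to the guard
            // so it is released even if the conversion back to NCHW below fails.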
1215             if (data_layout) {
1216                 output_tmp_guard.reset(output_tmp.buffer);
1217             }
1218             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1219                 success = false;
1220                 break;
1221             }
1222         } break;
1223         case OperationType::SPACE_TO_BATCH_ND: {
1224             const size_t inCount = ins.size();
1225             if ((inCount != 4 && inCount != 3) || !allParametersPresent(inCount, 1)) {
1226                 return ANEURALNETWORKS_BAD_DATA;
1227             }
1228             const RunTimeOperandInfo& input = operands[ins[0]];
1229             const RunTimeOperandInfo& blockSize = operands[ins[1]];
1230             const RunTimeOperandInfo& paddings = operands[ins[2]];
1231             bool data_layout = inCount == 4 ? getScalarData<bool>(operands[ins[3]]) : false;
1232 
1233             RunTimeOperandInfo& output = operands[outs[0]];
1234             Shape outShape = output.shape();
1235 
1236             RunTimeOperandInfo input_tmp, output_tmp;
1237             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1238             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1239                 success = false;
1240                 break;
1241             }
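            // Same NHWC staging pattern as BATCH_TO_SPACE_ND above.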
1242             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
1243             output_tmp.buffer = data_layout ? nullptr : output.buffer;
1244             output_tmp.length = data_layout ? 0 : output.length;
1245 
1246             if (!spaceToBatchPrepare(
1247                         input_tmp.shape(), reinterpret_cast<const int32_t*>(blockSize.buffer),
1248                         blockSize.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1249                         paddings.shape(), &outShape) ||
1250                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1251                 if (!data_layout) output.dimensions = output_tmp.dimensions;
1252                 break;
1253             }
1254             switch (input_tmp.type) {
1255                 case OperandType::TENSOR_FLOAT32: {
1256                     success = spaceToBatchGeneric(
1257                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1258                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1259                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1260                             reinterpret_cast<float*>(output_tmp.buffer), outShape);
1261                     break;
1262                 }
1263                 case OperandType::TENSOR_FLOAT16: {
1264                     success = spaceToBatchGeneric(
1265                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1266                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1267                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1268                             reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1269                     break;
1270                 }
1271                 case OperandType::TENSOR_QUANT8_ASYMM: {
1272                     success = spaceToBatchGeneric(
1273                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1274                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1275                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1276                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1277                     break;
1278                 }
1279                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1280                     success = spaceToBatchGeneric(
1281                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1282                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1283                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1284                             reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1285                     break;
1286                 }
1287                 default: {
1288                     LOG(ERROR) << "Unsupported data type";
1289                     success = false;
1290                 }
1291             }
1292             if (data_layout) {
1293                 output_tmp_guard.reset(output_tmp.buffer);
1294             }
1295             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1296                 success = false;
1297                 break;
1298             }
1299         } break;
1300         case OperationType::PAD:
1301         case OperationType::PAD_V2: {
1302             const bool isV2 = operation.type == OperationType::PAD_V2;
1303             if (!allParametersPresent(isV2 ? 3 : 2, 1)) {
1304                 return ANEURALNETWORKS_BAD_DATA;
1305             }
1306             const RunTimeOperandInfo& input = operands[ins[0]];
1307             const RunTimeOperandInfo& paddings = operands[ins[1]];
1308 
1309             RunTimeOperandInfo& output = operands[outs[0]];
1310             Shape outShape = output.shape();
1311 
1312             if (!padPrepare(input.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1313                             paddings.shape(), &outShape) ||
1314                 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1315                 break;
1316             }
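            // PAD uses an implicit pad value (0 for float types, the zero point for quantized
            // types); PAD_V2 reads an explicit pad value from the third input.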
1317             if (input.type == OperandType::TENSOR_FLOAT32) {
1318                 float pad_value = isV2 ? getScalarData<float>(operands[ins[2]]) : 0;
1319                 success = padGeneric(reinterpret_cast<const float*>(input.buffer), input.shape(),
1320                                      reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1321                                      reinterpret_cast<float*>(output.buffer), outShape);
1322             } else if (input.type == OperandType::TENSOR_FLOAT16) {
1323                 _Float16 pad_value = isV2 ? getScalarData<_Float16>(operands[ins[2]]) : 0;
1324                 success = padGeneric(reinterpret_cast<const _Float16*>(input.buffer), input.shape(),
1325                                      reinterpret_cast<const int32_t*>(paddings.buffer),
1326                                      static_cast<_Float16>(pad_value),
1327                                      reinterpret_cast<_Float16*>(output.buffer), outShape);
1328             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1329                 uint8_t pad_value =
1330                         isV2 ? getScalarData<uint8_t>(operands[ins[2]]) : outShape.offset;
1331                 success = padGeneric(input.buffer, input.shape(),
1332                                      reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1333                                      output.buffer, outShape);
1334             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1335                 uint8_t pad_value =
1336                         isV2 ? getScalarData<int8_t>(operands[ins[2]]) : outShape.offset;
1337                 success = padGeneric(input.buffer, input.shape(),
1338                                      reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1339                                      output.buffer, outShape);
1340             }
1341         } break;
1342         case OperationType::CAST: {
1343             if (!allParametersPresent(1, 1)) {
1344                 return ANEURALNETWORKS_BAD_DATA;
1345             }
1346             const RunTimeOperandInfo& input = operands[ins[0]];
1347 
1348             RunTimeOperandInfo& output = operands[outs[0]];
1349             Shape outShape = output.shape();
1350 
1351             success = cast::prepare(input.shape(), &outShape) &&
1352                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1353                       cast::eval(input.buffer, input.shape(), output.buffer, outShape);
1354         } break;
1355         case OperationType::MEAN: {
1356             if (!allParametersPresent(3, 1)) {
1357                 return ANEURALNETWORKS_BAD_DATA;
1358             }
1359             const RunTimeOperandInfo& input = operands[ins[0]];
1360             const RunTimeOperandInfo& axis = operands[ins[1]];
1361             int32_t keepDims = getScalarData<int32_t>(operands[ins[2]]);
1362 
1363             RunTimeOperandInfo& output = operands[outs[0]];
1364             Shape outShape = output.shape();
1365 
1366             if (!meanPrepare(input.shape(), reinterpret_cast<const int32_t*>(axis.buffer),
1367                              axis.shape(), keepDims > 0, &outShape) ||
1368                 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1369                 break;
1370             }
1371             if (input.type == OperandType::TENSOR_FLOAT16) {
1372                 success = meanFloat16(reinterpret_cast<_Float16*>(input.buffer), input.shape(),
1373                                       reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(),
1374                                       keepDims > 0, reinterpret_cast<_Float16*>(output.buffer),
1375                                       outShape);
1376             } else if (input.type == OperandType::TENSOR_FLOAT32) {
1377                 success = meanGeneric<float, float>(
1378                         reinterpret_cast<float*>(input.buffer), input.shape(),
1379                         reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1380                         reinterpret_cast<float*>(output.buffer), outShape);
1381             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1382                 success = meanGeneric<uint8_t, int32_t>(
1383                         reinterpret_cast<uint8_t*>(input.buffer), input.shape(),
1384                         reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1385                         reinterpret_cast<uint8_t*>(output.buffer), outShape);
1386             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1387                 success = meanGeneric<int8_t, int32_t>(
1388                         reinterpret_cast<int8_t*>(input.buffer), input.shape(),
1389                         reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1390                         reinterpret_cast<int8_t*>(output.buffer), outShape);
1391             }
1392         } break;
1393         case OperationType::ARGMAX:
1394         case OperationType::ARGMIN: {
1395             if (!allParametersPresent(2, 1)) {
1396                 return ANEURALNETWORKS_BAD_DATA;
1397             }
1398             const RunTimeOperandInfo& input = operands[ins[0]];
1399             int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1400 
1401             RunTimeOperandInfo& output = operands[outs[0]];
1402             Shape outShape = output.shape();
1403 
1404             const bool isArgMin = operation.type == OperationType::ARGMIN;
1405             success = argMinMaxPrepare(input.shape(), axis, &outShape) &&
1406                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1407                       argMinMaxGeneric(input.buffer, input.shape(), axis, isArgMin, output.buffer,
1408                                        outShape);
1409         } break;
1410         case OperationType::EXPAND_DIMS: {
1411             if (!allParametersPresent(2, 1)) {
1412                 return ANEURALNETWORKS_BAD_DATA;
1413             }
1414             const RunTimeOperandInfo& input = operands[ins[0]];
1415             int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1416 
1417             RunTimeOperandInfo& output = operands[outs[0]];
1418             Shape outShape = output.shape();
1419 
1420             success = expand_dims::prepare(input.shape(), axis, &outShape) &&
1421                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1422                       expand_dims::eval(input.buffer, input.shape(), axis, output.buffer, outShape);
1423         } break;
1424         case OperationType::SPLIT: {
1425             const size_t outCount = outs.size();
1426             if (!allParametersPresent(3, outCount)) {
1427                 return ANEURALNETWORKS_BAD_DATA;
1428             }
1429 
1430             const RunTimeOperandInfo& input = operands[ins[0]];
1431             const int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1432             const int32_t numOutputs = getScalarData<int32_t>(operands[ins[2]]);
1433 
1434             if (numOutputs != outs.size()) {
1435                 return ANEURALNETWORKS_BAD_DATA;
1436             }
1437 
1438             std::vector<Shape> outputShapes(numOutputs);
1439             for (int i = 0; i < numOutputs; ++i) {
1440                 outputShapes[i] = operands[outs[i]].shape();
1441             }
1442 
1443             success = splitPrepare(input.shape(), axis, numOutputs, &outputShapes);
1444             for (int i = 0; i < numOutputs; ++i) {
1445                 success = success && setInfoAndAllocateIfNeeded(&(operands[outs[i]]),
1446                                                                 outputShapes[i], &result);
1447             }
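            // Collect the raw output pointer for each slice and dispatch to the split kernel
            // matching the input element type.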
1448             switch (input.type) {
1449                 case OperandType::TENSOR_FLOAT16: {
1450                     std::vector<_Float16*> outputDataPtrs(numOutputs);
1451                     for (int i = 0; i < numOutputs; ++i) {
1452                         outputDataPtrs[i] = reinterpret_cast<_Float16*>(operands[outs[i]].buffer);
1453                     }
1454                     success = success &&
1455                               splitFloat16(reinterpret_cast<const _Float16*>(input.buffer),
1456                                            input.shape(), axis, &outputDataPtrs, outputShapes);
1457                 } break;
1458                 case OperandType::TENSOR_FLOAT32: {
1459                     std::vector<float*> outputDataPtrs(numOutputs);
1460                     for (int i = 0; i < numOutputs; ++i) {
1461                         outputDataPtrs[i] = reinterpret_cast<float*>(operands[outs[i]].buffer);
1462                     }
1463                     success = success &&
1464                               splitFloat32(reinterpret_cast<const float*>(input.buffer),
1465                                            input.shape(), axis, &outputDataPtrs, outputShapes);
1466                 } break;
1467                 case OperandType::TENSOR_INT32: {
1468                     std::vector<int32_t*> outputDataPtrs(numOutputs);
1469                     for (int i = 0; i < numOutputs; ++i) {
1470                         outputDataPtrs[i] = reinterpret_cast<int32_t*>(operands[outs[i]].buffer);
1471                     }
1472                     success = success &&
1473                               splitInt32(reinterpret_cast<const int32_t*>(input.buffer),
1474                                          input.shape(), axis, &outputDataPtrs, outputShapes);
1475                 } break;
1476                 case OperandType::TENSOR_QUANT8_ASYMM: {
1477                     std::vector<uint8_t*> outputDataPtrs(numOutputs);
1478                     for (int i = 0; i < numOutputs; ++i) {
1479                         outputDataPtrs[i] = reinterpret_cast<uint8_t*>(operands[outs[i]].buffer);
1480                     }
1481                     success = success &&
1482                               splitQuant8(reinterpret_cast<const uint8_t*>(input.buffer),
1483                                           input.shape(), axis, &outputDataPtrs, outputShapes);
1484                 } break;
1485                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1486                     std::vector<int8_t*> outputDataPtrs(numOutputs);
1487                     for (int i = 0; i < numOutputs; ++i) {
1488                         outputDataPtrs[i] = reinterpret_cast<int8_t*>(operands[outs[i]].buffer);
1489                     }
1490                     success = success &&
1491                               splitQuant8Signed(reinterpret_cast<const int8_t*>(input.buffer),
1492                                                 input.shape(), axis, &outputDataPtrs, outputShapes);
1493                 } break;
1494                 default: {
1495                     return ANEURALNETWORKS_BAD_DATA;
1496                 }
1497             }
1498         } break;
1499         case OperationType::MAXIMUM:
1500         case OperationType::MINIMUM: {
1501             if (!allParametersPresent(2, 1)) {
1502                 return ANEURALNETWORKS_BAD_DATA;
1503             }
1504             const RunTimeOperandInfo& in1 = operands[ins[0]];
1505             const RunTimeOperandInfo& in2 = operands[ins[1]];
1506 
1507             RunTimeOperandInfo& output = operands[outs[0]];
1508             Shape outputShape = output.shape();
1509 
1510             const bool isMinimum = operation.type == OperationType::MINIMUM;
1511             success = maximum_minimum::prepare(in1.shape(), in2.shape(), &outputShape) &&
1512                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1513                       maximum_minimum::eval(in1.buffer, in1.shape(), in2.buffer, in2.shape(),
1514                                             isMinimum, output.buffer, outputShape);
1515         } break;
1516         case OperationType::GROUPED_CONV_2D: {
1517             const size_t inCount = ins.size();
1518             if ((inCount != 12 && inCount != 9) || !allParametersPresent(inCount, 1)) {
1519                 return ANEURALNETWORKS_BAD_DATA;
1520             }
1521             const RunTimeOperandInfo& input = operands[ins[0]];
1522             const RunTimeOperandInfo& filter = operands[ins[1]];
1523             const RunTimeOperandInfo& bias = operands[ins[2]];
1524 
1525             int32_t padding_left, padding_right;
1526             int32_t padding_top, padding_bottom;
1527             int32_t padding_implicit = 0;
1528             int32_t stride_width, stride_height;
1529             int32_t numGroups;
1530             int32_t activation;
1531             bool data_layout = false;
1532 
1533             if (inCount == 12) {
1534                 padding_left = getScalarData<int32_t>(operands[ins[3]]);
1535                 padding_right = getScalarData<int32_t>(operands[ins[4]]);
1536                 padding_top = getScalarData<int32_t>(operands[ins[5]]);
1537                 padding_bottom = getScalarData<int32_t>(operands[ins[6]]);
1538                 stride_width = getScalarData<int32_t>(operands[ins[7]]);
1539                 stride_height = getScalarData<int32_t>(operands[ins[8]]);
1540                 numGroups = getScalarData<int32_t>(operands[ins[9]]);
1541                 activation = getScalarData<int32_t>(operands[ins[10]]);
1542                 data_layout = getScalarData<bool>(operands[ins[11]]);
1543             } else {
1544                 padding_implicit = getScalarData<int32_t>(operands[ins[3]]);
1545                 stride_width = getScalarData<int32_t>(operands[ins[4]]);
1546                 stride_height = getScalarData<int32_t>(operands[ins[5]]);
1547                 numGroups = getScalarData<int32_t>(operands[ins[6]]);
1548                 activation = getScalarData<int32_t>(operands[ins[7]]);
1549                 data_layout = getScalarData<bool>(operands[ins[8]]);
1550             }
1551 
1552             RunTimeOperandInfo& output = operands[outs[0]];
1553             Shape outShape = output.shape();
1554 
1555             RunTimeOperandInfo input_tmp, output_tmp;
1556             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1557             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1558                 success = false;
1559                 break;
1560             }
1561             output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
1562             output_tmp.buffer = data_layout ? nullptr : output.buffer;
1563             output_tmp.length = data_layout ? 0 : output.length;
1564 
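            // The 9-input form carries an implicit padding scheme; derive explicit per-edge
            // padding from the NHWC input and filter dimensions (height = dim 1, width = dim 2).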
1565             if (inCount == 9) {
1566                 Shape inputShape = input_tmp.shape();
1567                 Shape filterShape = filter.shape();
1568                 int32_t input_width = getSizeOfDimension(inputShape, 2);
1569                 int32_t input_height = getSizeOfDimension(inputShape, 1);
1570                 int32_t filter_width = getSizeOfDimension(filterShape, 2);
1571                 int32_t filter_height = getSizeOfDimension(filterShape, 1);
1572                 calculateExplicitPadding(input_width, stride_width, filter_width, padding_implicit,
1573                                          &padding_left, &padding_right);
1574                 calculateExplicitPadding(input_height, stride_height, filter_height,
1575                                          padding_implicit, &padding_top, &padding_bottom);
1576             }
1577 
1578             if (!groupedConvPrepare(input_tmp.shape(), filter.shape(), bias.shape(), padding_left,
1579                                     padding_right, padding_top, padding_bottom, stride_width,
1580                                     stride_height, numGroups, &outShape) ||
1581                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1582                 if (!data_layout) output.dimensions = output_tmp.dimensions;
1583                 success = false;
1584                 break;
1585             }
1586 
1587             if (input_tmp.type == OperandType::TENSOR_FLOAT32) {
1588                 success = groupedConvFloat32(
1589                         reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1590                         reinterpret_cast<const float*>(filter.buffer), filter.shape(),
1591                         reinterpret_cast<const float*>(bias.buffer), bias.shape(), padding_left,
1592                         padding_right, padding_top, padding_bottom, stride_width, stride_height,
1593                         numGroups, activation, reinterpret_cast<float*>(output_tmp.buffer),
1594                         outShape);
1595             } else if (input_tmp.type == OperandType::TENSOR_FLOAT16) {
1596                 success = groupedConvFloat16(
1597                         reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1598                         reinterpret_cast<const _Float16*>(filter.buffer), filter.shape(),
1599                         reinterpret_cast<const _Float16*>(bias.buffer), bias.shape(), padding_left,
1600                         padding_right, padding_top, padding_bottom, stride_width, stride_height,
1601                         numGroups, activation, reinterpret_cast<_Float16*>(output_tmp.buffer),
1602                         outShape);
1603             } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM) {
1604                 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
1605                     success = groupedConvQuant8PerChannel(
1606                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1607                             reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1608                             filter.extraParams.channelQuant().scales.data(),
1609                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1610                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1611                             stride_height, numGroups, activation,
1612                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1613                 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM) {
1614                     success = groupedConvQuant8(
1615                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1616                             reinterpret_cast<const uint8_t*>(filter.buffer), filter.shape(),
1617                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1618                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1619                             stride_height, numGroups, activation,
1620                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1621                 }
1622             } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1623                 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
1624                     success = groupedConvQuant8PerChannel(
1625                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1626                             reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1627                             filter.extraParams.channelQuant().scales.data(),
1628                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1629                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1630                             stride_height, numGroups, activation,
1631                             reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1632                 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1633                     success = groupedConvQuant8(
1634                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1635                             reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1636                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1637                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1638                             stride_height, numGroups, activation,
1639                             reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1640                 }
1641             }
1642 
1643             if (data_layout) {
1644                 output_tmp_guard.reset(output_tmp.buffer);
1645             }
1646             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1647                 success = false;
1648                 break;
1649             }
1650         } break;
1651         case OperationType::TILE: {
1652             if (!allParametersPresent(2, 1)) {
1653                 return ANEURALNETWORKS_BAD_DATA;
1654             }
1655             const RunTimeOperandInfo& input = operands[ins[0]];
1656             const RunTimeOperandInfo& multiples = operands[ins[1]];
1657 
1658             RunTimeOperandInfo& output = operands[outs[0]];
1659             Shape outShape = output.shape();
1660 
1661             success =
1662                     tile::prepare(input.shape(), reinterpret_cast<const int32_t*>(multiples.buffer),
1663                                   multiples.shape(), &outShape) &&
1664                     setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1665                     tile::eval(input.buffer, input.shape(),
1666                                reinterpret_cast<const int32_t*>(multiples.buffer), output.buffer,
1667                                outShape);
1668         } break;
1669         case OperationType::QUANTIZED_16BIT_LSTM: {
1670             if (!allParametersPresent(15, 2)) {
1671                 return ANEURALNETWORKS_BAD_DATA;
1672             }
1673 
1674             RunTimeOperandInfo& cellStateOut =
1675                     operands[outs[QuantizedLSTMCell::kCellStateOutTensor]];
1676             RunTimeOperandInfo& output = operands[outs[QuantizedLSTMCell::kOutputTensor]];
1677 
1678             Shape cellStateOutShape, outputShape;
1679             QuantizedLSTMCell quantizedLSTMCell(operation, operands);
1680 
1681             success = QuantizedLSTMCell::prepare(operation, operands, &cellStateOutShape,
1682                                                  &outputShape) &&
1683                       setInfoAndAllocateIfNeeded(&cellStateOut, cellStateOutShape, &result) &&
1684                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1685                       quantizedLSTMCell.eval();
1686         } break;
1687         case OperationType::POW: {
1688             if (!allParametersPresent(2, 1)) {
1689                 return ANEURALNETWORKS_BAD_DATA;
1690             }
1691             const RunTimeOperandInfo& base = operands[ins[0]];
1692             const RunTimeOperandInfo& exponent = operands[ins[1]];
1693 
1694             RunTimeOperandInfo& output = operands[outs[0]];
1695             Shape outShape = output.shape();
1696 
1697             success = pow::prepare(base.shape(), exponent.shape(), &outShape) &&
1698                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1699                       pow::eval(base.buffer, base.shape(), exponent.buffer, exponent.shape(),
1700                                 output.buffer, outShape);
1701         } break;
1702         default: {
1703             const OperationRegistration* operationRegistration =
1704                     mOperationResolver->findOperation(operation.type);
1705             if (operationRegistration == nullptr) {
1706                 LOG(ERROR) << getOperationName(operation.type) << " not registered";
1707             } else if (operationRegistration->prepare == nullptr ||
1708                        operationRegistration->execute == nullptr) {
1709                 LOG(ERROR) << "Incomplete operation registration: "
1710                            << getOperationName(operation.type);
1711             } else {
1712                 OperationExecutionContext context(&operation, operands);
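                // Operations routed through the OperationResolver declare via registration
                // flags whether they tolerate omitted operands or zero-sized inputs; enforce
                // those constraints before running prepare and execute.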
1713                 success = operationRegistration->flags.allowOmittedOperand ||
1714                           context.checkNoOmittedOperand();
1715                 success = success && (operationRegistration->flags.allowZeroSizedInput ||
1716                                       context.checkNoZeroSizedInput());
1717                 success = success && operationRegistration->prepare(&context) &&
1718                           operationRegistration->execute(&context);
1719                 result = context.getResultCode();
1720             }
1721         }
1722     }
1723     if (!success && result == ANEURALNETWORKS_NO_ERROR) {
1724         result = ANEURALNETWORKS_OP_FAILED;
1725     }
1726     if (result != ANEURALNETWORKS_NO_ERROR) {
1727         LOG(ERROR) << getOperationName(operation.type) << " failed.";
1728         return result;
1729     }
1730 
1731     consumeOperationInputs(ins, operands);
1732     return ANEURALNETWORKS_NO_ERROR;
1733 }
1734 
1735 // Copies RunTimeOperandInfo, preserving the original lifetime and numberOfUsesLeft
1736 // to prevent deallocation of subgraph inputs and outputs.
1737 static void setInfoExceptLifetime(RunTimeOperandInfo* to, const RunTimeOperandInfo& from) {
1738     auto originalLifetime = to->lifetime;
1739     auto originalNumberOfUsesLeft = to->numberOfUsesLeft;
1740     *to = from;
1741     to->lifetime = originalLifetime;
1742     to->numberOfUsesLeft = originalNumberOfUsesLeft;
1743 }
1744 
1745 int CpuExecutor::executeIfOperation(const Operation& operation, RunTimeOperandInfo* operands) {
1746     namespace op = operation_if;
1747     const RunTimeOperandInfo& condOperand = operands[operation.inputs[op::kCondBoolOperand]];
1748     if (condOperand.buffer == nullptr) {
1749         LOG(ERROR) << "Cannot read IF condition operand value";
1750         return ANEURALNETWORKS_OP_FAILED;
1751     }
1752     const bool condValue = *reinterpret_cast<const bool8*>(condOperand.buffer);
1753     VLOG(CPUEXE) << "CpuExecutor::executeIfOperation: condition value: " << condValue;
1754 
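    // Pick the THEN or ELSE subgraph based on the condition value and run it with its own set
    // of RunTimeOperandInfo, wired to the outer operation's inputs and outputs below.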
1755     const uint32_t branchInputIndex = condValue ? op::kThenModelOperand : op::kElseModelOperand;
1756     const RunTimeOperandInfo& branchOperand = operands[operation.inputs[branchInputIndex]];
1757     const Subgraph& branchSubgraph = *reinterpret_cast<const Subgraph*>(branchOperand.buffer);
1758     std::vector<RunTimeOperandInfo> branchOperands = initializeRunTimeInfo(branchSubgraph);
1759 
1760     // Initialize inner input and output operands from outer operands.
1761     for (uint32_t i = 0, n = branchSubgraph.inputIndexes.size(); i < n; ++i) {
1762         setInfoExceptLifetime(&branchOperands[branchSubgraph.inputIndexes[i]],
1763                               operands[operation.inputs[op::kFirstInput + i]]);
1764     }
1765     for (uint32_t i = 0, n = branchSubgraph.outputIndexes.size(); i < n; ++i) {
1766         setInfoExceptLifetime(&branchOperands[branchSubgraph.outputIndexes[i]],
1767                               operands[operation.outputs[i]]);
1768     }
1769 
1770     NN_RETURN_IF_ERROR(executeSubgraph(branchSubgraph, branchOperands.data()));
1771     freeUnusedSubgraphOperands(&branchOperands);
1772 
1773     // Update outer outputs.
1774     for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1775         setInfoExceptLifetime(&operands[operation.outputs[i]],
1776                               branchOperands[branchSubgraph.outputIndexes[i]]);
1777     }
1778 
1779     consumeOperationInputs(operation.inputs, operands);
1780     return ANEURALNETWORKS_NO_ERROR;
1781 }
1782 
1783 int CpuExecutor::executeWhileOperation(const Operation& operation, RunTimeOperandInfo* operands) {
1784     namespace op = operation_while;
1785     const RunTimeOperandInfo& condModelOperand = operands[operation.inputs[op::kCondModelOperand]];
1786     const RunTimeOperandInfo& bodyModelOperand = operands[operation.inputs[op::kBodyModelOperand]];
1787     const Subgraph& condSubgraph = *reinterpret_cast<const Subgraph*>(condModelOperand.buffer);
1788     const Subgraph& bodySubgraph = *reinterpret_cast<const Subgraph*>(bodyModelOperand.buffer);
1789     std::vector<RunTimeOperandInfo> condOperands = initializeRunTimeInfo(condSubgraph);
1790     std::vector<RunTimeOperandInfo> bodyOperands = initializeRunTimeInfo(bodySubgraph);
1791 
1792     // The code below implements the following sequence of subgraph input and output buffer
1793     // assignments:
1794     // iteration = 0   cond inputs = body inputs = outer inputs   body outputs = tmp1
1795     // iteration = 1   cond inputs = body inputs = tmp1           body outputs = tmp2
1796     // iteration = 2   cond inputs = body inputs = tmp2           body outputs = tmp1
1797     // iteration = 3   cond inputs = body inputs = ...            body outputs = ...
1798 
1799     // For body output double buffering.
1800     std::vector<uint8_t*> tmp1(bodySubgraph.outputIndexes.size());
1801     std::vector<uint8_t*> tmp2(bodySubgraph.outputIndexes.size());
1802 
1803     // For body outputs with unknown shape, we skip double buffering and
1804     // allocate on each iteration instead. This allows growing output tensors
1805     // inside a WHILE loop.
1806     std::vector<bool> bodyOutputHasUnknownShape(bodySubgraph.outputIndexes.size());
1807     for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1808         const Operand& operand = bodySubgraph.operands[bodySubgraph.outputIndexes[i]];
1809         bodyOutputHasUnknownShape[i] = nonExtensionOperandSizeOfData(operand) == 0;
1810     }
1811 
1812     // Initialize condition inputs from outer operands.
1813     for (uint32_t i = 0, n = condSubgraph.inputIndexes.size(); i < n; ++i) {
1814         setInfoExceptLifetime(&condOperands[condSubgraph.inputIndexes[i]],
1815                               operands[operation.inputs[op::kFirstInput + i]]);
1816     }
1817 
1818     // Store condition output on the stack.
1819     RunTimeOperandInfo& condOutput = condOperands[condSubgraph.outputIndexes[0]];
1820     bool8 condValue = {/* initialized memory */};
1821     condOutput.buffer = &condValue;
1822     condOutput.length = sizeof(condValue);
1823 
1824     std::chrono::nanoseconds timeoutDuration(mLoopTimeoutDuration);
1825     const auto startTime = std::chrono::steady_clock::now();
1826     for (uint32_t iteration = 0;; ++iteration) {
1827         VLOG(CPUEXE) << "CpuExecutor::executeWhileOperation: iteration " << iteration;
1828         if (iteration != 0) {
1829             // Set condition inputs from previous iteration outputs.
1830             for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1831                 setInfoExceptLifetime(&condOperands[condSubgraph.inputIndexes[i]],
1832                                       bodyOperands[bodySubgraph.outputIndexes[i]]);
1833             }
1834         }
1835         NN_RETURN_IF_ERROR(executeSubgraph(condSubgraph, condOperands.data()));
1836         VLOG(CPUEXE) << "CpuExecutor::executeWhileOperation: condition value: "
1837                      << static_cast<int>(condValue);
1838         if (!condValue) {
1839             break;
1840         }
1841 
1842         const auto duration = std::chrono::steady_clock::now() - startTime;
1843         if (duration > timeoutDuration) {
1844             LOG(ERROR) << "CpuExecutor::executeWhileOperation: timed out after "
1845                        << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
1846                        << " ms";
1847             return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
1848         }
1849 
1850         // Set body inputs from condition inputs.
1851         for (uint32_t i = 0, n = bodySubgraph.inputIndexes.size(); i < n; ++i) {
1852             bodyOperands[bodySubgraph.inputIndexes[i]] = condOperands[condSubgraph.inputIndexes[i]];
1853         }
1854         // Set body outputs.
1855         auto& outputBuffer = iteration % 2 == 0 ? tmp1 : tmp2;
1856         for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1857             RunTimeOperandInfo& info = bodyOperands[bodySubgraph.outputIndexes[i]];
1858             if (bodyOutputHasUnknownShape[i]) {
1859                 // Reset dimensions and buffer.
1860                 info.dimensions = bodySubgraph.operands[bodySubgraph.outputIndexes[i]].dimensions;
1861                 if (outputBuffer[i] != nullptr) {
1862                     delete[] outputBuffer[i];
1863                     outputBuffer[i] = nullptr;
1864                 }
1865             }
1866             info.buffer = outputBuffer[i];
1867         }
1868 
1869         NN_RETURN_IF_ERROR(executeSubgraph(bodySubgraph, bodyOperands.data()));
1870 
1871         // Update output buffer information in case we have allocated new buffers.
1872         for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1873             outputBuffer[i] = bodyOperands[bodySubgraph.outputIndexes[i]].buffer;
1874         }
1875     }
1876 
1877     // Copy body outputs to outer outputs.
1878     for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1879         RunTimeOperandInfo& outerOperand = operands[operation.outputs[i]];
1880         RunTimeOperandInfo& innerOperand = condOperands[condSubgraph.inputIndexes[i]];
1881         if (int error; !setInfoAndAllocateIfNeeded(&outerOperand, innerOperand.shape(), &error)) {
1882             return error;
1883         }
1884         CHECK_EQ(outerOperand.length, innerOperand.length);
1885         // TODO: Use the outer buffer as tmp1 to avoid copies.
1886         std::memcpy(outerOperand.buffer, innerOperand.buffer, innerOperand.length);
1887     }
1888 
1889     auto freeLoopOutputs = [](const std::vector<uint8_t*>& tmp) {
1890         for (auto buffer : tmp) {
1891             if (buffer != nullptr) {
1892                 delete[] buffer;
1893             }
1894         }
1895     };
1896     freeLoopOutputs(tmp1);
1897     freeLoopOutputs(tmp2);
1898     freeUnusedSubgraphOperands(&condOperands);
1899     freeUnusedSubgraphOperands(&bodyOperands);
1900     consumeOperationInputs(operation.inputs, operands);
1901 
1902     return ANEURALNETWORKS_NO_ERROR;
1903 }
1904 
1905 void CpuExecutor::setOutputShapes(const std::vector<uint32_t>& outputIndexes,
1906                                   const std::vector<RunTimeOperandInfo>& operands) {
1907     mOutputShapes.resize(outputIndexes.size());
1908     for (uint32_t i = 0; i < outputIndexes.size(); i++) {
1909         const uint32_t operandIndex = outputIndexes[i];
1910         const RunTimeOperandInfo& from = operands[operandIndex];
1911         mOutputShapes[i].dimensions = from.dimensions;
1912         mOutputShapes[i].isSufficient = from.isSufficient();
1913     }
1914 }
1915 
1916 // b/109953668, disable OpenMP
1917 #ifdef NNAPI_OPENMP
1918 ScopedOpenmpSettings::ScopedOpenmpSettings() {
1919     mBlocktimeInitial = kmp_get_blocktime();
1920     kmp_set_blocktime(20);  // ms, see b/109645291
1921 
1922 #if NNAPI_LIMIT_CPU_THREADS
1923     // Code not yet enabled. The number of threads is chosen based on
1924     // benchmarking; see the longer comment by the class declaration.
1925     mMaxThreadsInitial = Eigen::nbThreads();
1926     const int nProcs = omp_get_num_procs();
1927     int threads = nProcs;
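    // Heuristic: on devices with 8 or more cores use all but four, on 4-7 cores use all but
    // two; the numbers come from the benchmarking referenced above.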
1928     if (nProcs >= 8) {
1929         threads = nProcs - 4;
1930     } else if (nProcs >= 4) {
1931         threads = nProcs - 2;
1932     }
1933     Eigen::setNbThreads(threads);
1934 #endif
1935 }
1936 
1937 ScopedOpenmpSettings::~ScopedOpenmpSettings() {
1938     kmp_set_blocktime(mBlocktimeInitial);
1939 #if NNAPI_LIMIT_CPU_THREADS
1940     Eigen::setNbThreads(mMaxThreadsInitial);
1941 #endif
1942 }
1943 #endif  // NNAPI_OPENMP
1944 
1945 }  // namespace nn
1946 }  // namespace android
1947