1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "CpuExecutor"
18
19 #include "CpuExecutor.h"
20
21 #include <android/hardware_buffer.h>
22 #include <sys/mman.h>
23 #include <vndk/hardware_buffer.h>
24
25 #include <Eigen/Core>
26 #include <memory>
27 #include <utility>
28 #include <vector>
29
30 // b/109953668, disable OpenMP
31 #ifdef NNAPI_OPENMP
32 #include <omp.h>
33 #endif // NNAPI_OPENMP
34
35 #include "ControlFlow.h"
36 #include "NeuralNetworks.h"
37 #include "OperationResolver.h"
38 #include "Operations.h"
39 #include "OperationsUtils.h"
40 #include "Tracing.h"
41
42 namespace android {
43 namespace nn {
44
45 namespace {
46
47 using namespace hal;
48
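// Adapter that exposes one Operation's inputs and outputs (looked up in the
// runtime operand table) through the IOperationExecutionContext interface used
// by operations registered with OperationResolver.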
49 class OperationExecutionContext : public IOperationExecutionContext {
50 DISALLOW_IMPLICIT_CONSTRUCTORS(OperationExecutionContext);
51
52 public:
53 OperationExecutionContext(const Operation* operation, RunTimeOperandInfo* operands)
54 : operation(operation), operands(operands) {}
55
56 uint32_t getNumInputs() const override;
57 OperandType getInputType(uint32_t index) const override;
58 Shape getInputShape(uint32_t index) const override;
59 const void* getInputBuffer(uint32_t index) const override;
60 const OperandExtraParams getInputExtraParams(uint32_t index) const override;
61
62 uint32_t getNumOutputs() const override;
63 OperandType getOutputType(uint32_t index) const override;
64 Shape getOutputShape(uint32_t index) const override;
65 void* getOutputBuffer(uint32_t index) override;
66
67 // Return false on failure and store the result code.
68 // Use getResultCode() to retrieve it at the end of the operation execution.
69 bool setOutputShape(uint32_t index, const Shape& shape) override;
70 int getResultCode() const;
71
72 bool isOmittedInput(uint32_t index) const override;
73 bool isOmittedOutput(uint32_t index) const override;
74
75 // Return false if any input or output is omitted, i.e. has a lifetime of NO_VALUE.
76 bool checkNoOmittedOperand() const;
77 // Return false if any input has a dimension of 0.
78 bool checkNoZeroSizedInput() const;
79
80 private:
81 const RunTimeOperandInfo* getInputInfo(uint32_t index) const;
82 const RunTimeOperandInfo* getOutputInfo(uint32_t index) const;
83 RunTimeOperandInfo* getOutputInfo(uint32_t index);
84
85 const Operation* operation;
86 RunTimeOperandInfo* operands;
87
88 int result = ANEURALNETWORKS_NO_ERROR;
89 };
90
91 const RunTimeOperandInfo* OperationExecutionContext::getInputInfo(uint32_t index) const {
92 CHECK(index < operation->inputs.size());
93 return &operands[operation->inputs[index]];
94 }
95
96 const RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) const {
97 CHECK(index < operation->outputs.size());
98 return &operands[operation->outputs[index]];
99 }
100
101 RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) {
102 CHECK(index < operation->outputs.size());
103 return &operands[operation->outputs[index]];
104 }
105
106 OperandType OperationExecutionContext::getInputType(uint32_t index) const {
107 return getInputInfo(index)->type;
108 }
109
110 Shape OperationExecutionContext::getInputShape(uint32_t index) const {
111 return getInputInfo(index)->shape();
112 }
113
114 const void* OperationExecutionContext::getInputBuffer(uint32_t index) const {
115 return getInputInfo(index)->buffer;
116 }
117
118 const OperandExtraParams OperationExecutionContext::getInputExtraParams(uint32_t index) const {
119 return getInputInfo(index)->extraParams;
120 }
121
122 OperandType OperationExecutionContext::getOutputType(uint32_t index) const {
123 return getOutputInfo(index)->type;
124 }
125
126 Shape OperationExecutionContext::getOutputShape(uint32_t index) const {
127 return getOutputInfo(index)->shape();
128 }
129
130 void* OperationExecutionContext::getOutputBuffer(uint32_t index) {
131 return getOutputInfo(index)->buffer;
132 }
133
134 uint32_t OperationExecutionContext::getNumInputs() const {
135 return operation->inputs.size();
136 }
137
138 uint32_t OperationExecutionContext::getNumOutputs() const {
139 return operation->outputs.size();
140 }
141
142 int OperationExecutionContext::getResultCode() const {
143 return result;
144 }
145
146 // TODO: Return error code directly once we've fully integrated OperationResolver with all ops.
147 // Updates the RunTimeOperandInfo with the newly calculated shape.
148 // Allocates the buffer if needed.
149 //
150 // TODO(b/153081229): This function currently cannot handle extension operands well. We need to
151 // propagate the extension type info into this function.
152 bool setInfoAndAllocateIfNeeded(RunTimeOperandInfo* info, const Shape& shape, int* result) {
153 // For user-provided model output operands, the parameters must match the Shape
154 // calculated from the preparation step.
155 if (info->lifetime == OperandLifeTime::SUBGRAPH_OUTPUT) {
156 if (info->type != shape.type) {
157 LOG(ERROR) << "Invalid type for model output";
158 *result = ANEURALNETWORKS_OP_FAILED;
159 return false;
160 }
161 if (info->scale != shape.scale) {
162 LOG(ERROR) << "Invalid scale for model output";
163 *result = ANEURALNETWORKS_OP_FAILED;
164 return false;
165 }
166 if (info->zeroPoint != shape.offset) {
167 LOG(ERROR) << "Invalid zeroPoint for model output";
168 *result = ANEURALNETWORKS_OP_FAILED;
169 return false;
170 }
171 if (info->extraParams != shape.extraParams) {
172 LOG(ERROR) << "Invalid extraParams for model output";
173 *result = ANEURALNETWORKS_OP_FAILED;
174 return false;
175 }
176 }
177
178 auto combined = combineDimensions(shape.dimensions, info->dimensions);
179 if (!combined.has_value()) {
180 LOG(ERROR) << "Invalid dimensions for model operand";
181 *result = ANEURALNETWORKS_OP_FAILED;
182 return false;
183 }
184 info->dimensions = std::move(combined.value());
185 info->type = shape.type;
186 info->scale = shape.scale;
187 info->zeroPoint = shape.offset;
188 info->extraParams = shape.extraParams;
189
190 // TODO(b/153081229): We bypass the overflow check on extension operands because we do not know
191 // the sizes of extension types.
192 if (!isExtensionOperandType(info->type) &&
193 nonExtensionOperandSizeOfDataOverflowsUInt32(info->type, info->dimensions)) {
194 LOG(ERROR) << "Operand data size overflows uint32_t";
195 *result = ANEURALNETWORKS_OP_FAILED;
196 return false;
197 }
198
199 // Allocate the buffer only if the combined dimension is fully specified
200 if (info->buffer == nullptr && (info->lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
201 info->lifetime == OperandLifeTime::SUBGRAPH_OUTPUT)) {
202 if (isExtensionOperandType(info->type)) {
203 LOG(ERROR) << "Cannot allocate a variable of an extension type";
204 *result = ANEURALNETWORKS_OP_FAILED;
205 return false;
206 }
207 uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
208 if (length > 0) {
209 info->buffer = new uint8_t[length];
210 if (info->buffer == nullptr) {
211 *result = ANEURALNETWORKS_OUT_OF_MEMORY;
212 return false;
213 }
214 info->length = length;
215 }
216 }
217 if (!info->isSufficient()) {
218 uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
219 LOG(ERROR) << "Insufficient size for model operand: require = " << length
220 << ", provided = " << info->length;
221 *result = ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
222 return false;
223 }
224 *result = ANEURALNETWORKS_NO_ERROR;
225 return true;
226 }
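// Illustrative sketch (not part of the original sources): a typical operation
// body first computes the output Shape, then calls setInfoAndAllocateIfNeeded()
// to validate and allocate the output before writing into it, e.g.
// (somePrepare/someEval are placeholders):
//
//   Shape outShape = output.shape();
//   success = somePrepare(input.shape(), &outShape) &&
//             setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
//             someEval(input.buffer, input.shape(), output.buffer, outShape);
//
// See the RESHAPE case in CpuExecutor::executeOperation below for a concrete instance.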
227
228 bool OperationExecutionContext::setOutputShape(uint32_t index, const Shape& shape) {
229 return setInfoAndAllocateIfNeeded(getOutputInfo(index), shape, &result);
230 }
231
232 bool OperationExecutionContext::isOmittedInput(uint32_t index) const {
233 return getInputInfo(index)->lifetime == OperandLifeTime::NO_VALUE;
234 }
235
236 bool OperationExecutionContext::isOmittedOutput(uint32_t index) const {
237 return getOutputInfo(index)->lifetime == OperandLifeTime::NO_VALUE;
238 }
239
240 bool OperationExecutionContext::checkNoOmittedOperand() const {
241 for (uint32_t i = 0; i < operation->inputs.size(); i++) {
242 NN_RET_CHECK(!isOmittedInput(i)) << getOperationName(operation->type) << " input operand "
243 << i << " is required but missing.";
244 }
245 for (uint32_t i = 0; i < operation->outputs.size(); i++) {
246 NN_RET_CHECK(!isOmittedOutput(i)) << getOperationName(operation->type) << " output operand "
247 << i << " is required but missing.";
248 }
249 return true;
250 }
251
252 bool OperationExecutionContext::checkNoZeroSizedInput() const {
253 for (uint32_t i = 0; i < operation->inputs.size(); i++) {
254 if (isOmittedInput(i)) continue;
255 for (uint32_t j = 0; j < getInputInfo(i)->dimensions.size(); j++) {
256 NN_RET_CHECK_NE(getInputInfo(i)->dimensions[j], 0)
257 << getOperationName(operation->type)
258 << " does not support zero-sized tensor, but input " << i << " dimension " << j
259 << " is 0.";
260 }
261 }
262 return true;
263 }
264
265 } // namespace
266
267 // Used to keep a pointer to a memory pool.
268 //
269 // In the case of an "mmap_fd" pool, owns the mmap region
270 // returned by getBuffer() -- i.e., that region goes away
271 // when the RunTimePoolInfo is destroyed or is assigned to.
272 class RunTimePoolInfo::RunTimePoolInfoImpl {
273 public:
274 RunTimePoolInfoImpl(const hidl_memory& hidlMemory, uint8_t* buffer, const sp<IMemory>& memory,
275 AHardwareBuffer* hardwareBuffer, uint32_t size);
276
277 // rule of five...
278 ~RunTimePoolInfoImpl();
279 RunTimePoolInfoImpl(const RunTimePoolInfoImpl&) = delete;
280 RunTimePoolInfoImpl(RunTimePoolInfoImpl&&) noexcept = delete;
281 RunTimePoolInfoImpl& operator=(const RunTimePoolInfoImpl&) = delete;
282 RunTimePoolInfoImpl& operator=(RunTimePoolInfoImpl&&) noexcept = delete;
283
284 uint8_t* getBuffer() const { return mBuffer; }
285 uint32_t getSize() const { return mSize; }
286
287 bool flush() const;
288
289 const hidl_memory& getHidlMemory() const { return mHidlMemory; }
290
291 private:
292 const hidl_memory mHidlMemory; // always used
293 uint8_t* const mBuffer = nullptr; // always used
294 const sp<IMemory> mMemory; // only used when hidlMemory.name() == "ashmem"
295 AHardwareBuffer*
296 mAHardwareBuffer; // only used when hidlMemory.name() == "hardware_buffer_blob"
297 const uint32_t mSize;
298 };
299
300 RunTimePoolInfo::RunTimePoolInfoImpl::RunTimePoolInfoImpl(const hidl_memory& hidlMemory,
301 uint8_t* buffer,
302 const sp<IMemory>& memory,
303 AHardwareBuffer* hardwareBuffer,
304 uint32_t size)
305 : mHidlMemory(hidlMemory),
306 mBuffer(buffer),
307 mMemory(memory),
308 mAHardwareBuffer(hardwareBuffer),
309 mSize(size) {}
310
311 RunTimePoolInfo::RunTimePoolInfoImpl::~RunTimePoolInfoImpl() {
312 if (mBuffer == nullptr) {
313 return;
314 }
315
316 const auto& memType = mHidlMemory.name();
317 if (memType == "ashmem") {
318 // nothing to do
319 } else if (memType == "mmap_fd") {
320 const size_t size = mHidlMemory.size();
321 if (munmap(mBuffer, size)) {
322 LOG(ERROR) << "RunTimePoolInfoImpl::~RunTimePoolInfo(): Can't munmap";
323 }
324 } else if (memType == "hardware_buffer_blob") {
325 AHardwareBuffer_unlock(mAHardwareBuffer, nullptr);
326 } else if (memType == "") {
327 // Represents a POINTER argument; nothing to do
328 } else {
329 LOG(ERROR) << "RunTimePoolInfoImpl::~RunTimePoolInfoImpl(): unsupported hidl_memory type";
330 }
331
332 if (mAHardwareBuffer != nullptr) {
333 AHardwareBuffer_release(mAHardwareBuffer);
334 }
335 }
336
337 // Makes sure the output data is correctly updated after execution.
338 bool RunTimePoolInfo::RunTimePoolInfoImpl::flush() const {
339 const auto& memType = mHidlMemory.name();
340 if (memType == "mmap_fd") {
341 const int prot = mHidlMemory.handle()->data[1];
342 if (prot & PROT_WRITE) {
343 const size_t size = mHidlMemory.size();
344 return msync(mBuffer, size, MS_SYNC) == 0;
345 }
346 }
347 // No-op for other types of memory.
348 return true;
349 }
350
351 // TODO: short term, make shared memory mapping and updating a utility function.
352 // TODO: long term, implement mmap_fd as a hidl IMemory service.
353 std::optional<RunTimePoolInfo> RunTimePoolInfo::createFromHidlMemory(
354 const hidl_memory& hidlMemory) {
355 uint8_t* buffer = nullptr;
356 sp<IMemory> memory;
357 AHardwareBuffer* hardwareBuffer = nullptr;
358
359 const auto& memType = hidlMemory.name();
360 if (memType == "ashmem") {
361 memory = mapMemory(hidlMemory);
362 if (memory == nullptr) {
363 LOG(ERROR) << "Can't map shared memory.";
364 return std::nullopt;
365 }
366 buffer = static_cast<uint8_t*>(static_cast<void*>(memory->getPointer()));
367 if (buffer == nullptr) {
368 LOG(ERROR) << "Can't access shared memory.";
369 return std::nullopt;
370 }
371 } else if (memType == "mmap_fd") {
372 size_t size = hidlMemory.size();
373 int fd = hidlMemory.handle()->data[0];
374 int prot = hidlMemory.handle()->data[1];
375 size_t offset = getSizeFromInts(hidlMemory.handle()->data[2], hidlMemory.handle()->data[3]);
376 buffer = static_cast<uint8_t*>(mmap(nullptr, size, prot, MAP_SHARED, fd, offset));
377 if (buffer == MAP_FAILED) {
378 LOG(ERROR) << "RunTimePoolInfo::set(): Can't mmap the file descriptor.";
379 return std::nullopt;
380 }
381 } else if (memType == "hardware_buffer_blob") {
382 auto handle = hidlMemory.handle();
383 auto format = AHARDWAREBUFFER_FORMAT_BLOB;
384 auto usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN;
385 const uint32_t width = hidlMemory.size();
386 const uint32_t height = 1; // height is always 1 for BLOB mode AHardwareBuffer.
387 const uint32_t layers = 1; // layers is always 1 for BLOB mode AHardwareBuffer.
388 const uint32_t stride = hidlMemory.size();
389
390 AHardwareBuffer_Desc desc{
391 .width = width,
392 .format = format,
393 .height = height,
394 .layers = layers,
395 .usage = usage,
396 .stride = stride,
397 };
398 status_t status = AHardwareBuffer_createFromHandle(
399 &desc, handle, AHARDWAREBUFFER_CREATE_FROM_HANDLE_METHOD_CLONE, &hardwareBuffer);
400 if (status != NO_ERROR) {
401 LOG(ERROR) << "RunTimePoolInfo Can't create AHardwareBuffer from handle. Error: "
402 << status;
403 return std::nullopt;
404 }
405 void* gBuffer = nullptr;
406 status = AHardwareBuffer_lock(hardwareBuffer, usage, -1, nullptr, &gBuffer);
407 if (status != NO_ERROR) {
408 LOG(ERROR) << "RunTimePoolInfo Can't lock the AHardwareBuffer. Error: " << status;
409 return std::nullopt;
410 }
411 buffer = static_cast<uint8_t*>(gBuffer);
412 } else {
413 LOG(ERROR) << "RunTimePoolInfo::set(): unsupported hidl_memory type";
414 return std::nullopt;
415 }
416
417 const auto impl = std::make_shared<const RunTimePoolInfoImpl>(
418 hidlMemory, buffer, memory, hardwareBuffer, hidlMemory.size());
419 return {RunTimePoolInfo(impl)};
420 }
421
422 RunTimePoolInfo RunTimePoolInfo::createFromExistingBuffer(uint8_t* buffer, uint32_t size) {
423 const auto impl = std::make_shared<const RunTimePoolInfoImpl>(hidl_memory{}, buffer, nullptr,
424 nullptr, size);
425 return {impl};
426 }
427
428 RunTimePoolInfo::RunTimePoolInfo(const std::shared_ptr<const RunTimePoolInfoImpl>& impl)
429 : mImpl(impl) {}
430
431 uint8_t* RunTimePoolInfo::getBuffer() const {
432 return mImpl->getBuffer();
433 }
434
435 uint32_t RunTimePoolInfo::getSize() const {
436 return mImpl->getSize();
437 }
438
439 bool RunTimePoolInfo::flush() const {
440 return mImpl->flush();
441 }
442
443 const hidl_memory& RunTimePoolInfo::getHidlMemory() const {
444 return mImpl->getHidlMemory();
445 }
446
447 bool setRunTimePoolInfosFromHidlMemories(std::vector<RunTimePoolInfo>* poolInfos,
448 const hidl_vec<hidl_memory>& pools) {
449 CHECK(poolInfos != nullptr);
450 poolInfos->clear();
451 poolInfos->reserve(pools.size());
452 for (const auto& pool : pools) {
453 if (std::optional<RunTimePoolInfo> poolInfo = RunTimePoolInfo::createFromHidlMemory(pool)) {
454 poolInfos->push_back(*poolInfo);
455 } else {
456 LOG(ERROR) << "Could not map pools";
457 poolInfos->clear();
458 return false;
459 }
460 }
461 return true;
462 }
463
464 bool setRunTimePoolInfosFromMemoryPools(std::vector<RunTimePoolInfo>* poolInfos,
465 const hidl_vec<Request::MemoryPool>& pools) {
466 CHECK(poolInfos != nullptr);
467 poolInfos->clear();
468 poolInfos->reserve(pools.size());
469 for (const auto& pool : pools) {
470 if (pool.getDiscriminator() != Request::MemoryPool::hidl_discriminator::hidlMemory) {
471 LOG(ERROR) << "Unknown memory token";
472 poolInfos->clear();
473 return false;
474 }
475 if (std::optional<RunTimePoolInfo> poolInfo =
476 RunTimePoolInfo::createFromHidlMemory(pool.hidlMemory())) {
477 poolInfos->push_back(*poolInfo);
478 } else {
479 LOG(ERROR) << "Could not map pools";
480 poolInfos->clear();
481 return false;
482 }
483 }
484 return true;
485 }
486
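// Copies a 4-D tensor stored in NCHW order (fromDim = {N, C, H, W}) into `to`
// in NHWC order. Element (n, c, h, w) is read from index
// n*C*H*W + c*H*W + (h*W + w) and written sequentially in (n, h, w, c) order.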
487 template <typename T>
488 inline bool convertToNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
489 uint32_t spatialSize = fromDim[2] * fromDim[3];
490 for (uint32_t n = 0; n < fromDim[0]; n++) {
491 for (uint32_t hw = 0; hw < spatialSize; hw++) {
492 for (uint32_t c = 0; c < fromDim[1]; c++) {
493 uint32_t fromIndex = n * fromDim[1] * spatialSize + c * spatialSize + hw;
494 *to++ = from[fromIndex];
495 }
496 }
497 }
498 return true;
499 }
500
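// Inverse of convertToNhwcImpl: copies a 4-D tensor stored in NHWC order
// (fromDim = {N, H, W, C}) into `to` in NCHW order. Element (n, h, w, c) is
// read from index n*H*W*C + (h*W + w)*C + c and written sequentially in
// (n, c, h, w) order.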
501 template <typename T>
502 inline bool convertFromNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
503 uint32_t spatialSize = fromDim[1] * fromDim[2];
504 for (uint32_t n = 0; n < fromDim[0]; n++) {
505 for (uint32_t c = 0; c < fromDim[3]; c++) {
506 for (uint32_t hw = 0; hw < spatialSize; hw++) {
507 uint32_t fromIndex = n * spatialSize * fromDim[3] + hw * fromDim[3] + c;
508 *to++ = from[fromIndex];
509 }
510 }
511 }
512 return true;
513 }
514
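// Prepares `to` as an NHWC view of `from` for the NHWC-only kernels below. When
// data_layout is true (the operand is NCHW), a temporary NHWC buffer is
// allocated, filled via convertToNhwcImpl(), and handed to ptr_guard for
// cleanup; otherwise `to` simply aliases `from`.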
515 static bool convertToNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
516 std::unique_ptr<uint8_t[]>& ptr_guard, bool data_layout) {
517 int result;
518 if (from.dimensions.size() != 4) {
519 LOG(ERROR) << "Error converting a non-4-D tensor to NHWC layout";
520 return false;
521 }
522 to.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
523 if (data_layout) {
524 // convert dimensions
525 Shape inShape = from.shape();
526 auto& fromDim = from.dimensions;
527 inShape.dimensions = {fromDim[0], fromDim[2], fromDim[3], fromDim[1]};
528 // allocate buffer
529 to.buffer = nullptr;
530 if (!setInfoAndAllocateIfNeeded(&to, inShape, &result)) {
531 return false;
532 }
533 ptr_guard.reset(to.buffer);
534 // convert value
535 if (from.type == OperandType::TENSOR_FLOAT32) {
536 return convertToNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
537 reinterpret_cast<const float*>(from.buffer), fromDim);
538 } else if (from.type == OperandType::TENSOR_FLOAT16) {
539 return convertToNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
540 reinterpret_cast<const _Float16*>(from.buffer),
541 fromDim);
542 } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
543 return convertToNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
544 reinterpret_cast<const uint8_t*>(from.buffer),
545 fromDim);
546 } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
547 return convertToNhwcImpl<int8_t>(reinterpret_cast<int8_t*>(to.buffer),
548 reinterpret_cast<const int8_t*>(from.buffer), fromDim);
549 } else {
550 LOG(ERROR) << "Unsupported data type";
551 return false;
552 }
553 } else {
554 to = from;
555 }
556 return true;
557 }
558
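// Writes an NHWC result produced by a kernel back into the caller-visible
// operand `to`. When data_layout is true, the data is transposed back to NCHW
// via convertFromNhwcImpl(); otherwise `to` adopts the buffer as-is and only
// its shape information is updated.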
559 static bool convertFromNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
560 bool data_layout, int* result) {
561 if (from.dimensions.size() != 4) {
562 LOG(ERROR) << "Error converting a non-4-D tensor from NHWC layout";
563 return false;
564 }
565 if (data_layout) {
566 // convert dimensions
567 Shape outShape = from.shape();
568 auto& fromDim = from.dimensions;
569 outShape.dimensions = {fromDim[0], fromDim[3], fromDim[1], fromDim[2]};
570 // allocate buffer
571 if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
572 return false;
573 }
574 // convert value
575 if (from.type == OperandType::TENSOR_FLOAT32) {
576 return convertFromNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
577 reinterpret_cast<const float*>(from.buffer), fromDim);
578 } else if (from.type == OperandType::TENSOR_FLOAT16) {
579 return convertFromNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
580 reinterpret_cast<const _Float16*>(from.buffer),
581 fromDim);
582 } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
583 return convertFromNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
584 reinterpret_cast<const uint8_t*>(from.buffer),
585 fromDim);
586 } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
587 return convertFromNhwcImpl<int8_t>(reinterpret_cast<int8_t*>(to.buffer),
588 reinterpret_cast<const int8_t*>(from.buffer),
589 fromDim);
590 } else {
591 LOG(ERROR) << "Unsupported data type";
592 return false;
593 }
594 } else {
595 Shape outShape = from.shape();
596 to.buffer = from.buffer;
597 to.length = from.length;
598 if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
599 return false;
600 }
601 }
602 return true;
603 }
604
605 // Decrements the usage count for the operands listed. Frees the memory
606 // allocated for any temporary variable with a count of zero.
607 static void consumeOperationInputs(const std::vector<uint32_t>& inputs,
608 RunTimeOperandInfo* operands) {
609 for (uint32_t i : inputs) {
610 auto& info = operands[i];
611 // Check if it's a static or model input/output.
612 if (info.numberOfUsesLeft == 0) {
613 continue;
614 }
615 info.numberOfUsesLeft--;
616 if (info.numberOfUsesLeft == 0 && info.buffer != nullptr) {
617 delete[] info.buffer;
618 info.buffer = nullptr;
619 }
620 }
621 }
622
623 // This function only frees TEMPORARY_VARIABLE operands that are unused
624 // outputs because consumeOperationInputs takes care of any operands
625 // that are inputs to an operation.
626 static void freeUnusedSubgraphOperands(std::vector<RunTimeOperandInfo>* operands) {
627 for (auto& info : *operands) {
628 if (info.lifetime == OperandLifeTime::TEMPORARY_VARIABLE && info.numberOfUsesLeft == 0 &&
629 info.buffer != nullptr) {
630 delete[] info.buffer;
631 info.buffer = nullptr;
632 }
633 }
634 }
635
636 // Ignore the .pools entry in model and request. This will have been taken care of
637 // by the caller.
638 int CpuExecutor::run(const Model& model, const Request& request,
639 const std::vector<RunTimePoolInfo>& modelPoolInfos,
640 const std::vector<RunTimePoolInfo>& requestPoolInfos) {
641 NNTRACE_CPU(NNTRACE_PHASE_EXECUTION, "run");
642 VLOG(CPUEXE) << "CpuExecutor::run() with request(" << SHOW_IF_DEBUG(toString(request)) << ")";
643 mModelOperandValues = &model.operandValues;
644 mModelPoolInfos = &modelPoolInfos;
645 mReferencedSubgraphs = &model.referenced;
646
647 // b/109953668, disable OpenMP
648 #ifdef NNAPI_OPENMP
649 ScopedOpenmpSettings openMpSettings;
650 #endif // NNAPI_OPENMP
651
652 std::vector<RunTimeOperandInfo> operands = initializeRunTimeInfo(model.main);
653 updateForArguments(model.main.inputIndexes, request.inputs, requestPoolInfos, operands.data());
654 updateForArguments(model.main.outputIndexes, request.outputs, requestPoolInfos,
655 operands.data());
656 int result = executeSubgraph(model.main, operands.data());
657 freeUnusedSubgraphOperands(&operands);
658
659 if (result == ANEURALNETWORKS_NO_ERROR) {
660 VLOG(CPUEXE) << "Completed run normally";
661 for (auto& runtimeInfo : requestPoolInfos) {
662 runtimeInfo.flush();
663 }
664 }
665
666 // Only report the output shapes when the result code is NO_ERROR or OUTPUT_INSUFFICIENT_SIZE.
667 if (result == ANEURALNETWORKS_NO_ERROR || result == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
668 setOutputShapes(model.main.outputIndexes, operands);
669 } else {
670 mOutputShapes.clear();
671 }
672
673 mFinished = true;
674 mModelOperandValues = nullptr;
675 mModelPoolInfos = nullptr;
676 mReferencedSubgraphs = nullptr;
677 return result;
678 }
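// A minimal caller-side sketch (illustrative only; assumes a validated Model
// and Request and a default-constructible CpuExecutor):
//
//   std::vector<RunTimePoolInfo> modelPools, requestPools;
//   if (!setRunTimePoolInfosFromHidlMemories(&modelPools, model.pools) ||
//       !setRunTimePoolInfosFromMemoryPools(&requestPools, request.pools)) {
//       return ANEURALNETWORKS_UNMAPPABLE;
//   }
//   CpuExecutor executor;
//   int n = executor.run(model, request, modelPools, requestPools);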
679
680 int CpuExecutor::executeSubgraph(const Subgraph& subgraph, RunTimeOperandInfo* operands) {
681 VLOG(CPUEXE) << "CpuExecutor::executeSubgraph " << toString(subgraph);
682 // The subgraph stores its operations in execution order.
683 for (const auto& operation : subgraph.operations) {
684 NN_RETURN_IF_ERROR(executeOperation(operation, operands));
685 }
686 return ANEURALNETWORKS_NO_ERROR;
687 }
688
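// Builds the RunTimeOperandInfo table for a subgraph: constants point into
// model.operandValues or a mapped model pool, SUBGRAPH operands point at the
// referenced Subgraph, and temporaries start with a null buffer plus a use
// count so they can be allocated lazily and freed by consumeOperationInputs().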
689 std::vector<RunTimeOperandInfo> CpuExecutor::initializeRunTimeInfo(const Subgraph& subgraph) {
690 VLOG(CPUEXE) << "CpuExecutor::initializeRunTimeInfo";
691 const size_t count = subgraph.operands.size();
692 std::vector<RunTimeOperandInfo> operands(count);
693 for (size_t i = 0; i < count; i++) {
694 const Operand& from = subgraph.operands[i];
695 RunTimeOperandInfo& to = operands[i];
696 to.type = from.type;
697 to.dimensions = from.dimensions;
698 to.scale = from.scale;
699 to.zeroPoint = from.zeroPoint;
700 to.length = from.location.length;
701 to.lifetime = from.lifetime;
702 to.extraParams = from.extraParams;
703 switch (from.lifetime) {
704 case OperandLifeTime::TEMPORARY_VARIABLE:
705 to.buffer = nullptr;
706 to.numberOfUsesLeft = from.numberOfConsumers;
707 break;
708 case OperandLifeTime::CONSTANT_COPY:
709 to.buffer = const_cast<uint8_t*>(&(*mModelOperandValues)[from.location.offset]);
710 to.numberOfUsesLeft = 0;
711 break;
712 case OperandLifeTime::CONSTANT_REFERENCE: {
713 auto poolIndex = from.location.poolIndex;
714 CHECK_LT(poolIndex, mModelPoolInfos->size());
715 auto& r = (*mModelPoolInfos)[poolIndex];
716 to.buffer = r.getBuffer() + from.location.offset;
717 to.numberOfUsesLeft = 0;
718 break;
719 }
720 case OperandLifeTime::SUBGRAPH: {
721 auto subgraphIndex = from.location.offset;
722 CHECK_LT(subgraphIndex, mReferencedSubgraphs->size());
723 to.buffer = reinterpret_cast<uint8_t*>(
724 const_cast<Subgraph*>(&(*mReferencedSubgraphs)[subgraphIndex]));
725 to.numberOfUsesLeft = 0;
726 } break;
727 case OperandLifeTime::SUBGRAPH_INPUT:
728 case OperandLifeTime::SUBGRAPH_OUTPUT:
729 case OperandLifeTime::NO_VALUE:
730 to.buffer = nullptr;
731 to.numberOfUsesLeft = 0;
732 break;
733 }
734 }
735 return operands;
736 }
737
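// Binds the listed subgraph inputs/outputs to the request arguments: overrides
// any dimensions left unspecified in the model, marks omitted arguments as
// NO_VALUE, and otherwise points the operand at the mapped request pool at the
// given offset (the whole pool when offset and length are both 0).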
738 void CpuExecutor::updateForArguments(const std::vector<uint32_t>& indexes,
739 const hal::hidl_vec<hal::RequestArgument>& arguments,
740 const std::vector<RunTimePoolInfo>& requestPoolInfos,
741 RunTimeOperandInfo* operands) {
742 CHECK_EQ(indexes.size(), arguments.size());
743 for (size_t i = 0; i < indexes.size(); i++) {
744 const uint32_t operandIndex = indexes[i];
745 const RequestArgument& from = arguments[i];
746 RunTimeOperandInfo& to = operands[operandIndex];
747 if (from.dimensions.size() > 0) {
748 // It's the responsibility of the caller to validate that
749 // from.dimensions only modifies the dimensions that were
750 // unspecified in the model. That's the case in SampleDriver.cpp
751 // with the call to validateRequest().
752 // TODO make sure that's the case for the default CPU path.
753 to.dimensions = from.dimensions;
754 }
755 if (from.hasNoValue) {
756 to.lifetime = OperandLifeTime::NO_VALUE;
757 CHECK(to.buffer == nullptr);
758 to.length = 0;
759 } else {
760 auto poolIndex = from.location.poolIndex;
761 CHECK_LT(poolIndex, requestPoolInfos.size());
762 auto& r = requestPoolInfos[poolIndex];
763 to.buffer = r.getBuffer() + from.location.offset;
764 if (from.location.offset == 0 && from.location.length == 0) {
765 // Use the entire memory region.
766 to.length = r.getSize();
767 } else {
768 to.length = from.location.length;
769 }
770 }
771 }
772 }
773
774 int CpuExecutor::executeOperation(const Operation& operation, RunTimeOperandInfo* operands) {
775 if (hasDeadlinePassed(mDeadline)) {
776 return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
777 }
778 if (operation.type == OperationType::IF) {
779 int result = executeIfOperation(operation, operands);
780 if (result != ANEURALNETWORKS_NO_ERROR) {
781 LOG(ERROR) << "IF failed.";
782 }
783 return result;
784 }
785 if (operation.type == OperationType::WHILE) {
786 int result = executeWhileOperation(operation, operands);
787 if (result != ANEURALNETWORKS_NO_ERROR) {
788 LOG(ERROR) << "WHILE failed.";
789 }
790 return result;
791 }
792
793 // VLOG(CPUEXE) << "CpuExecutor::executeOperation(" << toString(operation) << ")";
794 const hidl_vec<uint32_t>& ins = operation.inputs;
795 const hidl_vec<uint32_t>& outs = operation.outputs;
796 bool success = false;
797 int result = ANEURALNETWORKS_NO_ERROR;
798
799 // Function to verify that the number of input and output parameters
800 // matches what is expected. Also checks that all the parameters have
801 // values. This function is to be used only for operations that do not
802 // accept optional arguments.
803 // TODO Have a version that works for optional arguments.
804 auto allParametersPresent = [&operation, &operands, &ins, &outs](size_t requiredIns,
805 size_t requiredOuts) -> bool {
806 auto verify = [&operation, &operands](size_t requiredCount,
807 const hidl_vec<uint32_t>& indexes,
808 const char* type) -> bool {
809 size_t actualCount = indexes.size();
810 if (actualCount != requiredCount) {
811 LOG(ERROR) << getOperationName(operation.type) << ": Invalid number of " << type
812 << " operands. Got " << actualCount << " of " << requiredCount;
813 return false;
814 }
815 for (size_t i = 0; i < actualCount; i++) {
816 if (operands[indexes[i]].lifetime == OperandLifeTime::NO_VALUE) {
817 LOG(ERROR) << getOperationName(operation.type) << " " << type << " operand "
818 << i << " is required but missing.";
819 return false;
820 }
821 }
822 return true;
823 };
824
825 auto verifyNoZeroSizedInputs = [&operation, &operands](const hidl_vec<uint32_t>& indexes) {
826 for (size_t i = 0; i < indexes.size(); i++) {
827 for (size_t j = 0; j < operands[indexes[i]].dimensions.size(); j++) {
828 if (operands[indexes[i]].dimensions[j] == 0) {
829 LOG(ERROR) << getOperationName(operation.type)
830 << " does not support zero-sized tensor, but input " << i
831 << " dimension " << j << " is zero.";
832 return false;
833 }
834 }
835 }
836 return true;
837 };
838
839 return verify(requiredIns, ins, "in") && verify(requiredOuts, outs, "out") &&
840 verifyNoZeroSizedInputs(ins);
841 };
842
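// Each case below prepares the output Shape, allocates the output through
// setInfoAndAllocateIfNeeded(), and then invokes the corresponding reference
// kernel for the operand's data type.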
843 switch (operation.type) {
844 case OperationType::OEM_OPERATION: {
845 LOG(ERROR) << "OEM operation not supported for CPU execution";
846 success = false;
847 } break;
848 case OperationType::RESHAPE: {
849 if (!allParametersPresent(2, 1)) {
850 return ANEURALNETWORKS_BAD_DATA;
851 }
852 const RunTimeOperandInfo& input = operands[ins[0]];
853 const RunTimeOperandInfo& targetShape = operands[ins[1]];
854
855 RunTimeOperandInfo& output = operands[outs[0]];
856 Shape outShape = output.shape();
857
858 success = reshapePrepare(input.shape(),
859 reinterpret_cast<const int32_t*>(targetShape.buffer),
860 getNumberOfElements(targetShape.shape()), &outShape) &&
861 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
862 copyData(input.buffer, input.shape(), output.buffer, outShape);
863 } break;
864 case OperationType::DEPTH_TO_SPACE: {
865 const size_t inCount = ins.size();
866 if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
867 return ANEURALNETWORKS_BAD_DATA;
868 }
869 const RunTimeOperandInfo& input = operands[ins[0]];
870 int32_t blockSize = getScalarData<int32_t>(operands[ins[1]]);
871 bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
872
873 RunTimeOperandInfo& output = operands[outs[0]];
874 Shape outShape = output.shape();
875
876 RunTimeOperandInfo input_tmp, output_tmp;
877 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
878 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
879 success = false;
880 break;
881 }
882 output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
883 output_tmp.buffer = data_layout ? nullptr : output.buffer;
884 output_tmp.length = data_layout ? 0 : output.length;
885 if (!depthToSpacePrepare(input_tmp.shape(), blockSize, &outShape) ||
886 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
887 if (!data_layout) output.dimensions = output_tmp.dimensions;
888 break;
889 }
890 switch (input_tmp.type) {
891 case OperandType::TENSOR_FLOAT32: {
892 success = depthToSpaceGeneric(
893 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
894 blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
895 break;
896 }
897 case OperandType::TENSOR_FLOAT16: {
898 success = depthToSpaceGeneric(
899 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
900 blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
901 break;
902 }
903 case OperandType::TENSOR_QUANT8_ASYMM: {
904 success = depthToSpaceGeneric(
905 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
906 blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
907 break;
908 }
909 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
910 success = depthToSpaceGeneric(
911 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
912 blockSize, reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
913 break;
914 }
915 default: {
916 LOG(ERROR) << "Unsupported data type";
917 success = false;
918 }
919 }
920 if (data_layout) {
921 output_tmp_guard.reset(output_tmp.buffer);
922 }
923 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
924 success = false;
925 break;
926 }
927 } break;
928 case OperationType::SPACE_TO_DEPTH: {
929 const size_t inCount = ins.size();
930 if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
931 return ANEURALNETWORKS_BAD_DATA;
932 }
933 const RunTimeOperandInfo& input = operands[ins[0]];
934 int32_t blockSize = getScalarData<int32_t>(operands[ins[1]]);
935 bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
936
937 RunTimeOperandInfo& output = operands[outs[0]];
938 Shape outShape = output.shape();
939
940 RunTimeOperandInfo input_tmp, output_tmp;
941 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
942 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
943 success = false;
944 break;
945 }
946 output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
947 output_tmp.buffer = data_layout ? nullptr : output.buffer;
948 output_tmp.length = data_layout ? 0 : output.length;
949
950 if (!spaceToDepthPrepare(input_tmp.shape(), blockSize, &outShape) ||
951 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
952 if (!data_layout) output.dimensions = output_tmp.dimensions;
953 break;
954 }
955 switch (input_tmp.type) {
956 case OperandType::TENSOR_FLOAT32: {
957 success = spaceToDepthGeneric(
958 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
959 blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
960 break;
961 }
962 case OperandType::TENSOR_FLOAT16: {
963 success = spaceToDepthGeneric(
964 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
965 blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
966 break;
967 }
968 case OperandType::TENSOR_QUANT8_ASYMM: {
969 success = spaceToDepthGeneric(
970 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
971 blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
972 break;
973 }
974 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
975 success = spaceToDepthGeneric(
976 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
977 blockSize, reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
978 break;
979 }
980 default: {
981 LOG(ERROR) << "Unsupported data type";
982 success = false;
983 }
984 }
985 if (data_layout) {
986 output_tmp_guard.reset(output_tmp.buffer);
987 }
988 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
989 success = false;
990 break;
991 }
992 } break;
993 case OperationType::EMBEDDING_LOOKUP: {
994 if (!allParametersPresent(2, 1)) {
995 return ANEURALNETWORKS_BAD_DATA;
996 }
997 const RunTimeOperandInfo& values = operands[ins[EmbeddingLookup::kValueTensor]];
998 const RunTimeOperandInfo& lookups = operands[ins[EmbeddingLookup::kLookupTensor]];
999 RunTimeOperandInfo& output = operands[outs[EmbeddingLookup::kOutputTensor]];
1000
1001 Shape outputShape;
1002 EmbeddingLookup lookup(operation, operands);
1003
1004 success = embeddingLookupPrepare(values.shape(), lookups.shape(), &outputShape) &&
1005 setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lookup.Eval();
1006 } break;
1007 case OperationType::HASHTABLE_LOOKUP: {
1008 if (!allParametersPresent(3, 2)) {
1009 return ANEURALNETWORKS_BAD_DATA;
1010 }
1011 const RunTimeOperandInfo& lookups = operands[ins[HashtableLookup::kLookupTensor]];
1012 const RunTimeOperandInfo& keys = operands[ins[HashtableLookup::kKeyTensor]];
1013 const RunTimeOperandInfo& values = operands[ins[HashtableLookup::kValueTensor]];
1014
1015 RunTimeOperandInfo& output = operands[outs[HashtableLookup::kOutputTensor]];
1016 RunTimeOperandInfo& hits = operands[outs[HashtableLookup::kHitsTensor]];
1017
1018 Shape outputShape, hitShape;
1019 HashtableLookup lookup(operation, operands);
1020
1021 success = hashtableLookupPrepare(lookups.shape(), keys.shape(), values.shape(),
1022 &outputShape, &hitShape) &&
1023 setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1024 setInfoAndAllocateIfNeeded(&hits, hitShape, &result) && lookup.Eval();
1025 } break;
1026 case OperationType::LSH_PROJECTION: {
1027 RunTimeOperandInfo& output = operands[outs[LSHProjection::kOutputTensor]];
1028 Shape outputShape;
1029 if (!LSHProjection::Prepare(operation, operands, &outputShape) ||
1030 !setInfoAndAllocateIfNeeded(&output, outputShape, &result)) {
1031 break;
1032 }
1033
1034 LSHProjection lsh(operation, operands);
1035 const RunTimeOperandInfo& hash = operands[ins[LSHProjection::kHashTensor]];
1036 switch (hash.type) {
1037 case OperandType::TENSOR_FLOAT32: {
1038 success = lsh.Eval<float>();
1039 break;
1040 }
1041 case OperandType::TENSOR_FLOAT16: {
1042 success = lsh.Eval<_Float16>();
1043 break;
1044 }
1045 default: {
1046 success = false;
1047 LOG(ERROR) << "Unsupported data type";
1048 }
1049 }
1050 } break;
1051 case OperationType::BIDIRECTIONAL_SEQUENCE_LSTM: {
1052 const auto merge_outputs = getScalarData<bool>(
1053 operands[ins[BidirectionalSequenceLSTM::kMergeOutputsParam]]);
1054 const bool output_state = (outs.size() == 5 || outs.size() == 6);
1055 RunTimeOperandInfo& fwOutput =
1056 operands[outs[BidirectionalSequenceLSTM::kFwOutputTensor]];
1057 Shape fwOutputShape, bwOutputShape, fwOutputActivationStateShape,
1058 fwOutputCellStateShape, bwOutputActivationStateShape, bwOutputCellStateShape;
1059
1060 BidirectionalSequenceLSTM lstm(operation, operands);
1061 success = lstm.Prepare(operation, operands, &fwOutputShape, &bwOutputShape,
1062 &fwOutputActivationStateShape, &fwOutputCellStateShape,
1063 &bwOutputActivationStateShape, &bwOutputCellStateShape) &&
1064 setInfoAndAllocateIfNeeded(&fwOutput, fwOutputShape, &result);
1065 if (!merge_outputs) {
1066 RunTimeOperandInfo& bwOutput =
1067 operands[outs[BidirectionalSequenceLSTM::kBwOutputTensor]];
1068 success = success && setInfoAndAllocateIfNeeded(&bwOutput, bwOutputShape, &result);
1069 }
1070 if (output_state) {
1071 uint32_t delta = merge_outputs ? 1 : 0;
1072 RunTimeOperandInfo& fwOutputActivationState =
1073 operands[outs[BidirectionalSequenceLSTM::kFwOutputActivationStateTensor -
1074 delta]];
1075 RunTimeOperandInfo& fwOutputCellState =
1076 operands[outs[BidirectionalSequenceLSTM::kFwOutputCellStateTensor - delta]];
1077 RunTimeOperandInfo& bwOutputActivationState =
1078 operands[outs[BidirectionalSequenceLSTM::kBwOutputActivationStateTensor -
1079 delta]];
1080 RunTimeOperandInfo& bwOutputCellState =
1081 operands[outs[BidirectionalSequenceLSTM::kBwOutputCellStateTensor - delta]];
1082 success = success &&
1083 setInfoAndAllocateIfNeeded(&fwOutputActivationState,
1084 fwOutputActivationStateShape, &result) &&
1085 setInfoAndAllocateIfNeeded(&fwOutputCellState, fwOutputCellStateShape,
1086 &result) &&
1087 setInfoAndAllocateIfNeeded(&bwOutputActivationState,
1088 bwOutputActivationStateShape, &result) &&
1089 setInfoAndAllocateIfNeeded(&bwOutputCellState, bwOutputCellStateShape,
1090 &result);
1091 }
1092 success = success && lstm.Eval();
1093 } break;
1094 case OperationType::LSTM: {
1095 RunTimeOperandInfo& scratch = operands[outs[LSTMCell::kScratchBufferTensor]];
1096 RunTimeOperandInfo& outputStateOut = operands[outs[LSTMCell::kOutputStateOutTensor]];
1097 RunTimeOperandInfo& cellStateOut = operands[outs[LSTMCell::kCellStateOutTensor]];
1098 RunTimeOperandInfo& output = operands[outs[LSTMCell::kOutputTensor]];
1099
1100 Shape scratchShape, outputStateShape, cellStateShape, outputShape;
1101 LSTMCell lstm_cell(operation, operands);
1102
1103 success = lstm_cell.Prepare(operation, operands, &scratchShape, &outputStateShape,
1104 &cellStateShape, &outputShape) &&
1105 setInfoAndAllocateIfNeeded(&scratch, scratchShape, &result) &&
1106 setInfoAndAllocateIfNeeded(&outputStateOut, outputStateShape, &result) &&
1107 setInfoAndAllocateIfNeeded(&cellStateOut, cellStateShape, &result) &&
1108 setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lstm_cell.Eval();
1109 } break;
1110 case OperationType::RANDOM_MULTINOMIAL: {
1111 if (!allParametersPresent(3, 1)) {
1112 return ANEURALNETWORKS_BAD_DATA;
1113 }
1114 const RunTimeOperandInfo& lookups = operands[ins[HashtableLookup::kLookupTensor]];
1115 const RunTimeOperandInfo& keys = operands[ins[HashtableLookup::kKeyTensor]];
1116 const RunTimeOperandInfo& values = operands[ins[HashtableLookup::kValueTensor]];
1117 RunTimeOperandInfo& output = operands[outs[Multinomial::kOutputTensor]];
1118
1119 Shape outputShape;
1120 Multinomial multinomial(operation, operands);
1121
1122 success = Multinomial::Prepare(operation, operands, &outputShape) &&
1123 setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1124 multinomial.Eval();
1125 } break;
1126 case OperationType::RNN: {
1127 if (!allParametersPresent(6, 2)) {
1128 return ANEURALNETWORKS_BAD_DATA;
1129 }
1130
1131 RunTimeOperandInfo& hiddenStateOut = operands[outs[RNN::kHiddenStateOutTensor]];
1132 RunTimeOperandInfo& output = operands[outs[RNN::kOutputTensor]];
1133
1134 Shape hiddenStateShape, outputShape;
1135 RNN rnn_cell(operation, operands);
1136
1137 success = RNN::Prepare(operation, operands, &hiddenStateShape, &outputShape) &&
1138 setInfoAndAllocateIfNeeded(&hiddenStateOut, hiddenStateShape, &result) &&
1139 setInfoAndAllocateIfNeeded(&output, outputShape, &result) && rnn_cell.Eval();
1140 } break;
1141 case OperationType::SVDF: {
1142 RunTimeOperandInfo& stateOut = operands[outs[SVDF::kStateOutTensor]];
1143 RunTimeOperandInfo& output = operands[outs[SVDF::kOutputTensor]];
1144
1145 Shape stateShape, outputShape;
1146 SVDF svdf(operation, operands);
1147
1148 success = SVDF::Prepare(operation, operands, &stateShape, &outputShape) &&
1149 setInfoAndAllocateIfNeeded(&stateOut, stateShape, &result) &&
1150 setInfoAndAllocateIfNeeded(&output, outputShape, &result) && svdf.Eval();
1151 } break;
1152 case OperationType::BATCH_TO_SPACE_ND: {
1153 const size_t inCount = ins.size();
1154 if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
1155 return ANEURALNETWORKS_BAD_DATA;
1156 }
1157 const RunTimeOperandInfo& input = operands[ins[0]];
1158 const RunTimeOperandInfo& blockSize = operands[ins[1]];
1159 bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
1160
1161 RunTimeOperandInfo& output = operands[outs[0]];
1162 Shape outShape = output.shape();
1163
1164 RunTimeOperandInfo input_tmp, output_tmp;
1165 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1166 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1167 success = false;
1168 break;
1169 }
1170 output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
1171 output_tmp.buffer = data_layout ? nullptr : output.buffer;
1172 output_tmp.length = data_layout ? 0 : output.length;
1173
1174 if (!batchToSpacePrepare(input_tmp.shape(),
1175 reinterpret_cast<const int32_t*>(blockSize.buffer),
1176 blockSize.shape(), &outShape) ||
1177 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1178 if (!data_layout) output.dimensions = output_tmp.dimensions;
1179 break;
1180 }
1181 switch (input_tmp.type) {
1182 case OperandType::TENSOR_FLOAT32: {
1183 success = batchToSpaceGeneric(
1184 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1185 reinterpret_cast<const int32_t*>(blockSize.buffer),
1186 reinterpret_cast<float*>(output_tmp.buffer), outShape);
1187 break;
1188 }
1189 case OperandType::TENSOR_FLOAT16: {
1190 success = batchToSpaceGeneric(
1191 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1192 reinterpret_cast<const int32_t*>(blockSize.buffer),
1193 reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1194 break;
1195 }
1196 case OperandType::TENSOR_QUANT8_ASYMM: {
1197 success = batchToSpaceGeneric(
1198 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1199 reinterpret_cast<const int32_t*>(blockSize.buffer),
1200 reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1201 break;
1202 }
1203 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1204 success = batchToSpaceGeneric(
1205 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1206 reinterpret_cast<const int32_t*>(blockSize.buffer),
1207 reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1208 break;
1209 }
1210 default: {
1211 LOG(ERROR) << "Unsupported data type";
1212 success = false;
1213 }
1214 }
1215 if (data_layout) {
1216 output_tmp_guard.reset(output_tmp.buffer);
1217 }
1218 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1219 success = false;
1220 break;
1221 }
1222 } break;
1223 case OperationType::SPACE_TO_BATCH_ND: {
1224 const size_t inCount = ins.size();
1225 if ((inCount != 4 && inCount != 3) || !allParametersPresent(inCount, 1)) {
1226 return ANEURALNETWORKS_BAD_DATA;
1227 }
1228 const RunTimeOperandInfo& input = operands[ins[0]];
1229 const RunTimeOperandInfo& blockSize = operands[ins[1]];
1230 const RunTimeOperandInfo& paddings = operands[ins[2]];
1231 bool data_layout = inCount == 4 ? getScalarData<bool>(operands[ins[3]]) : false;
1232
1233 RunTimeOperandInfo& output = operands[outs[0]];
1234 Shape outShape = output.shape();
1235
1236 RunTimeOperandInfo input_tmp, output_tmp;
1237 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1238 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1239 success = false;
1240 break;
1241 }
1242 output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
1243 output_tmp.buffer = data_layout ? nullptr : output.buffer;
1244 output_tmp.length = data_layout ? 0 : output.length;
1245
1246 if (!spaceToBatchPrepare(
1247 input_tmp.shape(), reinterpret_cast<const int32_t*>(blockSize.buffer),
1248 blockSize.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1249 paddings.shape(), &outShape) ||
1250 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1251 if (!data_layout) output.dimensions = output_tmp.dimensions;
1252 break;
1253 }
1254 switch (input_tmp.type) {
1255 case OperandType::TENSOR_FLOAT32: {
1256 success = spaceToBatchGeneric(
1257 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1258 reinterpret_cast<const int32_t*>(blockSize.buffer),
1259 reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1260 reinterpret_cast<float*>(output_tmp.buffer), outShape);
1261 break;
1262 }
1263 case OperandType::TENSOR_FLOAT16: {
1264 success = spaceToBatchGeneric(
1265 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1266 reinterpret_cast<const int32_t*>(blockSize.buffer),
1267 reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1268 reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1269 break;
1270 }
1271 case OperandType::TENSOR_QUANT8_ASYMM: {
1272 success = spaceToBatchGeneric(
1273 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1274 reinterpret_cast<const int32_t*>(blockSize.buffer),
1275 reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1276 reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1277 break;
1278 }
1279 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1280 success = spaceToBatchGeneric(
1281 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1282 reinterpret_cast<const int32_t*>(blockSize.buffer),
1283 reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1284 reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1285 break;
1286 }
1287 default: {
1288 LOG(ERROR) << "Unsupported data type";
1289 success = false;
1290 }
1291 }
1292 if (data_layout) {
1293 output_tmp_guard.reset(output_tmp.buffer);
1294 }
1295 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1296 success = false;
1297 break;
1298 }
1299 } break;
1300 case OperationType::PAD:
1301 case OperationType::PAD_V2: {
1302 const bool isV2 = operation.type == OperationType::PAD_V2;
1303 if (!allParametersPresent(isV2 ? 3 : 2, 1)) {
1304 return ANEURALNETWORKS_BAD_DATA;
1305 }
1306 const RunTimeOperandInfo& input = operands[ins[0]];
1307 const RunTimeOperandInfo& paddings = operands[ins[1]];
1308
1309 RunTimeOperandInfo& output = operands[outs[0]];
1310 Shape outShape = output.shape();
1311
1312 if (!padPrepare(input.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1313 paddings.shape(), &outShape) ||
1314 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1315 break;
1316 }
1317 if (input.type == OperandType::TENSOR_FLOAT32) {
1318 float pad_value = isV2 ? getScalarData<float>(operands[ins[2]]) : 0;
1319 success = padGeneric(reinterpret_cast<const float*>(input.buffer), input.shape(),
1320 reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1321 reinterpret_cast<float*>(output.buffer), outShape);
1322 } else if (input.type == OperandType::TENSOR_FLOAT16) {
1323 _Float16 pad_value = isV2 ? getScalarData<_Float16>(operands[ins[2]]) : 0;
1324 success = padGeneric(reinterpret_cast<const _Float16*>(input.buffer), input.shape(),
1325 reinterpret_cast<const int32_t*>(paddings.buffer),
1326 static_cast<_Float16>(pad_value),
1327 reinterpret_cast<_Float16*>(output.buffer), outShape);
1328 } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1329 uint8_t pad_value =
1330 isV2 ? getScalarData<uint8_t>(operands[ins[2]]) : outShape.offset;
1331 success = padGeneric(input.buffer, input.shape(),
1332 reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1333 output.buffer, outShape);
1334 } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1335 uint8_t pad_value =
1336 isV2 ? getScalarData<int8_t>(operands[ins[2]]) : outShape.offset;
1337 success = padGeneric(input.buffer, input.shape(),
1338 reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1339 output.buffer, outShape);
1340 }
1341 } break;
1342 case OperationType::CAST: {
1343 if (!allParametersPresent(1, 1)) {
1344 return ANEURALNETWORKS_BAD_DATA;
1345 }
1346 const RunTimeOperandInfo& input = operands[ins[0]];
1347
1348 RunTimeOperandInfo& output = operands[outs[0]];
1349 Shape outShape = output.shape();
1350
1351 success = cast::prepare(input.shape(), &outShape) &&
1352 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1353 cast::eval(input.buffer, input.shape(), output.buffer, outShape);
1354 } break;
1355 case OperationType::MEAN: {
1356 if (!allParametersPresent(3, 1)) {
1357 return ANEURALNETWORKS_BAD_DATA;
1358 }
1359 const RunTimeOperandInfo& input = operands[ins[0]];
1360 const RunTimeOperandInfo& axis = operands[ins[1]];
1361 int32_t keepDims = getScalarData<int32_t>(operands[ins[2]]);
1362
1363 RunTimeOperandInfo& output = operands[outs[0]];
1364 Shape outShape = output.shape();
1365
1366 if (!meanPrepare(input.shape(), reinterpret_cast<const int32_t*>(axis.buffer),
1367 axis.shape(), keepDims > 0, &outShape) ||
1368 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1369 break;
1370 }
1371 if (input.type == OperandType::TENSOR_FLOAT16) {
1372 success = meanFloat16(reinterpret_cast<_Float16*>(input.buffer), input.shape(),
1373 reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(),
1374 keepDims > 0, reinterpret_cast<_Float16*>(output.buffer),
1375 outShape);
1376 } else if (input.type == OperandType::TENSOR_FLOAT32) {
1377 success = meanGeneric<float, float>(
1378 reinterpret_cast<float*>(input.buffer), input.shape(),
1379 reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1380 reinterpret_cast<float*>(output.buffer), outShape);
1381 } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1382 success = meanGeneric<uint8_t, int32_t>(
1383 reinterpret_cast<uint8_t*>(input.buffer), input.shape(),
1384 reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1385 reinterpret_cast<uint8_t*>(output.buffer), outShape);
1386 } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1387 success = meanGeneric<int8_t, int32_t>(
1388 reinterpret_cast<int8_t*>(input.buffer), input.shape(),
1389 reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1390 reinterpret_cast<int8_t*>(output.buffer), outShape);
1391 }
1392 } break;
1393 case OperationType::ARGMAX:
1394 case OperationType::ARGMIN: {
1395 if (!allParametersPresent(2, 1)) {
1396 return ANEURALNETWORKS_BAD_DATA;
1397 }
1398 const RunTimeOperandInfo& input = operands[ins[0]];
1399 int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1400
1401 RunTimeOperandInfo& output = operands[outs[0]];
1402 Shape outShape = output.shape();
1403
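            // ARGMAX and ARGMIN share one kernel; isArgMin selects the comparison direction.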
1404 const bool isArgMin = operation.type == OperationType::ARGMIN;
1405 success = argMinMaxPrepare(input.shape(), axis, &outShape) &&
1406 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1407 argMinMaxGeneric(input.buffer, input.shape(), axis, isArgMin, output.buffer,
1408 outShape);
1409 } break;
1410 case OperationType::EXPAND_DIMS: {
1411 if (!allParametersPresent(2, 1)) {
1412 return ANEURALNETWORKS_BAD_DATA;
1413 }
1414 const RunTimeOperandInfo& input = operands[ins[0]];
1415 int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1416
1417 RunTimeOperandInfo& output = operands[outs[0]];
1418 Shape outShape = output.shape();
1419
1420 success = expand_dims::prepare(input.shape(), axis, &outShape) &&
1421 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1422 expand_dims::eval(input.buffer, input.shape(), axis, output.buffer, outShape);
1423 } break;
1424 case OperationType::SPLIT: {
1425 const size_t outCount = outs.size();
1426 if (!allParametersPresent(3, outCount)) {
1427 return ANEURALNETWORKS_BAD_DATA;
1428 }
1429
1430 const RunTimeOperandInfo& input = operands[ins[0]];
1431 const int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1432 const int32_t numOutputs = getScalarData<int32_t>(operands[ins[2]]);
1433
1434 if (numOutputs != outs.size()) {
1435 return ANEURALNETWORKS_BAD_DATA;
1436 }
1437
1438 std::vector<Shape> outputShapes(numOutputs);
1439 for (int i = 0; i < numOutputs; ++i) {
1440 outputShapes[i] = operands[outs[i]].shape();
1441 }
1442
1443 success = splitPrepare(input.shape(), axis, numOutputs, &outputShapes);
1444 for (int i = 0; i < numOutputs; ++i) {
1445 success = success && setInfoAndAllocateIfNeeded(&(operands[outs[i]]),
1446 outputShapes[i], &result);
1447 }
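            // Dispatch on the element type: each branch gathers the raw output pointers
            // and invokes the matching split kernel over the prepared shapes.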
1448 switch (input.type) {
1449 case OperandType::TENSOR_FLOAT16: {
1450 std::vector<_Float16*> outputDataPtrs(numOutputs);
1451 for (int i = 0; i < numOutputs; ++i) {
1452 outputDataPtrs[i] = reinterpret_cast<_Float16*>(operands[outs[i]].buffer);
1453 }
1454 success = success &&
1455 splitFloat16(reinterpret_cast<const _Float16*>(input.buffer),
1456 input.shape(), axis, &outputDataPtrs, outputShapes);
1457 } break;
1458 case OperandType::TENSOR_FLOAT32: {
1459 std::vector<float*> outputDataPtrs(numOutputs);
1460 for (int i = 0; i < numOutputs; ++i) {
1461 outputDataPtrs[i] = reinterpret_cast<float*>(operands[outs[i]].buffer);
1462 }
1463 success = success &&
1464 splitFloat32(reinterpret_cast<const float*>(input.buffer),
1465 input.shape(), axis, &outputDataPtrs, outputShapes);
1466 } break;
1467 case OperandType::TENSOR_INT32: {
1468 std::vector<int32_t*> outputDataPtrs(numOutputs);
1469 for (int i = 0; i < numOutputs; ++i) {
1470 outputDataPtrs[i] = reinterpret_cast<int32_t*>(operands[outs[i]].buffer);
1471 }
1472 success = success &&
1473 splitInt32(reinterpret_cast<const int32_t*>(input.buffer),
1474 input.shape(), axis, &outputDataPtrs, outputShapes);
1475 } break;
1476 case OperandType::TENSOR_QUANT8_ASYMM: {
1477 std::vector<uint8_t*> outputDataPtrs(numOutputs);
1478 for (int i = 0; i < numOutputs; ++i) {
1479 outputDataPtrs[i] = reinterpret_cast<uint8_t*>(operands[outs[i]].buffer);
1480 }
1481 success = success &&
1482 splitQuant8(reinterpret_cast<const uint8_t*>(input.buffer),
1483 input.shape(), axis, &outputDataPtrs, outputShapes);
1484 } break;
1485 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1486 std::vector<int8_t*> outputDataPtrs(numOutputs);
1487 for (int i = 0; i < numOutputs; ++i) {
1488 outputDataPtrs[i] = reinterpret_cast<int8_t*>(operands[outs[i]].buffer);
1489 }
1490 success = success &&
1491 splitQuant8Signed(reinterpret_cast<const int8_t*>(input.buffer),
1492 input.shape(), axis, &outputDataPtrs, outputShapes);
1493 } break;
1494 default: {
1495 return ANEURALNETWORKS_BAD_DATA;
1496 }
1497 }
1498 } break;
1499 case OperationType::MAXIMUM:
1500 case OperationType::MINIMUM: {
1501 if (!allParametersPresent(2, 1)) {
1502 return ANEURALNETWORKS_BAD_DATA;
1503 }
1504 const RunTimeOperandInfo& in1 = operands[ins[0]];
1505 const RunTimeOperandInfo& in2 = operands[ins[1]];
1506
1507 RunTimeOperandInfo& output = operands[outs[0]];
1508 Shape outputShape = output.shape();
1509
1510 const bool isMinimum = operation.type == OperationType::MINIMUM;
1511 success = maximum_minimum::prepare(in1.shape(), in2.shape(), &outputShape) &&
1512 setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1513 maximum_minimum::eval(in1.buffer, in1.shape(), in2.buffer, in2.shape(),
1514 isMinimum, output.buffer, outputShape);
1515 } break;
1516 case OperationType::GROUPED_CONV_2D: {
1517 const size_t inCount = ins.size();
1518 if ((inCount != 12 && inCount != 9) || !allParametersPresent(inCount, 1)) {
1519 return ANEURALNETWORKS_BAD_DATA;
1520 }
1521 const RunTimeOperandInfo& input = operands[ins[0]];
1522 const RunTimeOperandInfo& filter = operands[ins[1]];
1523 const RunTimeOperandInfo& bias = operands[ins[2]];
1524
1525 int32_t padding_left, padding_right;
1526 int32_t padding_top, padding_bottom;
1527 int32_t padding_implicit = 0;
1528 int32_t stride_width, stride_height;
1529 int32_t numGroups;
1530 int32_t activation;
1531 bool data_layout = false;
1532
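            // Two signatures are accepted: 12 inputs with explicit paddings, or 9 inputs
            // with an implicit padding scheme. data_layout is false for NHWC and true
            // for NCHW.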
1533 if (inCount == 12) {
1534 padding_left = getScalarData<int32_t>(operands[ins[3]]);
1535 padding_right = getScalarData<int32_t>(operands[ins[4]]);
1536 padding_top = getScalarData<int32_t>(operands[ins[5]]);
1537 padding_bottom = getScalarData<int32_t>(operands[ins[6]]);
1538 stride_width = getScalarData<int32_t>(operands[ins[7]]);
1539 stride_height = getScalarData<int32_t>(operands[ins[8]]);
1540 numGroups = getScalarData<int32_t>(operands[ins[9]]);
1541 activation = getScalarData<int32_t>(operands[ins[10]]);
1542 data_layout = getScalarData<bool>(operands[ins[11]]);
1543 } else {
1544 padding_implicit = getScalarData<int32_t>(operands[ins[3]]);
1545 stride_width = getScalarData<int32_t>(operands[ins[4]]);
1546 stride_height = getScalarData<int32_t>(operands[ins[5]]);
1547 numGroups = getScalarData<int32_t>(operands[ins[6]]);
1548 activation = getScalarData<int32_t>(operands[ins[7]]);
1549 data_layout = getScalarData<bool>(operands[ins[8]]);
1550 }
1551
1552 RunTimeOperandInfo& output = operands[outs[0]];
1553 Shape outShape = output.shape();
1554
1555 RunTimeOperandInfo input_tmp, output_tmp;
1556 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1557 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1558 success = false;
1559 break;
1560 }
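            // When data_layout is NCHW, output_tmp starts without a buffer and
            // setInfoAndAllocateIfNeeded below allocates a temporary NHWC buffer for it;
            // otherwise it writes directly into the output operand.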
1561 output_tmp.lifetime = OperandLifeTime::TEMPORARY_VARIABLE;
1562 output_tmp.buffer = data_layout ? nullptr : output.buffer;
1563 output_tmp.length = data_layout ? 0 : output.length;
1564
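            // For the implicit-padding signature, derive the explicit paddings from the
            // NHWC input and filter sizes.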
1565 if (inCount == 9) {
1566 Shape inputShape = input_tmp.shape();
1567 Shape filterShape = filter.shape();
1568 int32_t input_width = getSizeOfDimension(inputShape, 2);
1569 int32_t input_height = getSizeOfDimension(inputShape, 1);
1570 int32_t filter_width = getSizeOfDimension(filterShape, 2);
1571 int32_t filter_height = getSizeOfDimension(filterShape, 1);
1572 calculateExplicitPadding(input_width, stride_width, filter_width, padding_implicit,
1573 &padding_left, &padding_right);
1574 calculateExplicitPadding(input_height, stride_height, filter_height,
1575 padding_implicit, &padding_top, &padding_bottom);
1576 }
1577
1578 if (!groupedConvPrepare(input_tmp.shape(), filter.shape(), bias.shape(), padding_left,
1579 padding_right, padding_top, padding_bottom, stride_width,
1580 stride_height, numGroups, &outShape) ||
1581 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1582 if (!data_layout) output.dimensions = output_tmp.dimensions;
1583 success = false;
1584 break;
1585 }
1586
1587 if (input_tmp.type == OperandType::TENSOR_FLOAT32) {
1588 success = groupedConvFloat32(
1589 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1590 reinterpret_cast<const float*>(filter.buffer), filter.shape(),
1591 reinterpret_cast<const float*>(bias.buffer), bias.shape(), padding_left,
1592 padding_right, padding_top, padding_bottom, stride_width, stride_height,
1593 numGroups, activation, reinterpret_cast<float*>(output_tmp.buffer),
1594 outShape);
1595 } else if (input_tmp.type == OperandType::TENSOR_FLOAT16) {
1596 success = groupedConvFloat16(
1597 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1598 reinterpret_cast<const _Float16*>(filter.buffer), filter.shape(),
1599 reinterpret_cast<const _Float16*>(bias.buffer), bias.shape(), padding_left,
1600 padding_right, padding_top, padding_bottom, stride_width, stride_height,
1601 numGroups, activation, reinterpret_cast<_Float16*>(output_tmp.buffer),
1602 outShape);
1603 } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM) {
1604 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
1605 success = groupedConvQuant8PerChannel(
1606 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1607 reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1608 filter.extraParams.channelQuant().scales.data(),
1609 reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1610 padding_left, padding_right, padding_top, padding_bottom, stride_width,
1611 stride_height, numGroups, activation,
1612 reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1613 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM) {
1614 success = groupedConvQuant8(
1615 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1616 reinterpret_cast<const uint8_t*>(filter.buffer), filter.shape(),
1617 reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1618 padding_left, padding_right, padding_top, padding_bottom, stride_width,
1619 stride_height, numGroups, activation,
1620 reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1621 }
1622 } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1623 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
1624 success = groupedConvQuant8PerChannel(
1625 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1626 reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1627 filter.extraParams.channelQuant().scales.data(),
1628 reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1629 padding_left, padding_right, padding_top, padding_bottom, stride_width,
1630 stride_height, numGroups, activation,
1631 reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1632 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1633 success = groupedConvQuant8(
1634 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1635 reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1636 reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1637 padding_left, padding_right, padding_top, padding_bottom, stride_width,
1638 stride_height, numGroups, activation,
1639 reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1640 }
1641 }
1642
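            // Hand ownership of the temporary NHWC output buffer to output_tmp_guard so
            // it is released after the result has been converted back to NCHW.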
1643 if (data_layout) {
1644 output_tmp_guard.reset(output_tmp.buffer);
1645 }
1646 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1647 success = false;
1648 break;
1649 }
1650 } break;
1651 case OperationType::TILE: {
1652 if (!allParametersPresent(2, 1)) {
1653 return ANEURALNETWORKS_BAD_DATA;
1654 }
1655 const RunTimeOperandInfo& input = operands[ins[0]];
1656 const RunTimeOperandInfo& multiples = operands[ins[1]];
1657
1658 RunTimeOperandInfo& output = operands[outs[0]];
1659 Shape outShape = output.shape();
1660
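            // multiples is a 1-D INT32 tensor holding the repeat count for each dimension
            // of the input.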
1661 success =
1662 tile::prepare(input.shape(), reinterpret_cast<const int32_t*>(multiples.buffer),
1663 multiples.shape(), &outShape) &&
1664 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1665 tile::eval(input.buffer, input.shape(),
1666 reinterpret_cast<const int32_t*>(multiples.buffer), output.buffer,
1667 outShape);
1668 } break;
1669 case OperationType::QUANTIZED_16BIT_LSTM: {
1670 if (!allParametersPresent(15, 2)) {
1671 return ANEURALNETWORKS_BAD_DATA;
1672 }
1673
1674 RunTimeOperandInfo& cellStateOut =
1675 operands[outs[QuantizedLSTMCell::kCellStateOutTensor]];
1676 RunTimeOperandInfo& output = operands[outs[QuantizedLSTMCell::kOutputTensor]];
1677
1678 Shape cellStateOutShape, outputShape;
1679 QuantizedLSTMCell quantizedLSTMCell(operation, operands);
1680
1681 success = QuantizedLSTMCell::prepare(operation, operands, &cellStateOutShape,
1682 &outputShape) &&
1683 setInfoAndAllocateIfNeeded(&cellStateOut, cellStateOutShape, &result) &&
1684 setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1685 quantizedLSTMCell.eval();
1686 } break;
1687 case OperationType::POW: {
1688 if (!allParametersPresent(2, 1)) {
1689 return ANEURALNETWORKS_BAD_DATA;
1690 }
1691 const RunTimeOperandInfo& base = operands[ins[0]];
1692 const RunTimeOperandInfo& exponent = operands[ins[1]];
1693
1694 RunTimeOperandInfo& output = operands[outs[0]];
1695 Shape outShape = output.shape();
1696
1697 success = pow::prepare(base.shape(), exponent.shape(), &outShape) &&
1698 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1699 pow::eval(base.buffer, base.shape(), exponent.buffer, exponent.shape(),
1700 output.buffer, outShape);
1701 } break;
1702 default: {
1703 const OperationRegistration* operationRegistration =
1704 mOperationResolver->findOperation(operation.type);
1705 if (operationRegistration == nullptr) {
1706 LOG(ERROR) << getOperationName(operation.type) << " not registered";
1707 } else if (operationRegistration->prepare == nullptr ||
1708 operationRegistration->execute == nullptr) {
1709 LOG(ERROR) << "Incomplete operation registration: "
1710 << getOperationName(operation.type);
1711 } else {
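                // Operations without an explicit case above are dispatched through the
                // OperationResolver. Omitted or zero-sized operands are rejected unless
                // the registration opts in via its flags.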
1712 OperationExecutionContext context(&operation, operands);
1713 success = operationRegistration->flags.allowOmittedOperand ||
1714 context.checkNoOmittedOperand();
1715 success = success && (operationRegistration->flags.allowZeroSizedInput ||
1716 context.checkNoZeroSizedInput());
1717 success = success && operationRegistration->prepare(&context) &&
1718 operationRegistration->execute(&context);
1719 result = context.getResultCode();
1720 }
1721 }
1722 }
1723 if (!success && result == ANEURALNETWORKS_NO_ERROR) {
1724 result = ANEURALNETWORKS_OP_FAILED;
1725 }
1726 if (result != ANEURALNETWORKS_NO_ERROR) {
1727 LOG(ERROR) << getOperationName(operation.type) << " failed.";
1728 return result;
1729 }
1730
1731 consumeOperationInputs(ins, operands);
1732 return ANEURALNETWORKS_NO_ERROR;
1733 }
1734
1735 // Copies RunTimeOperandInfo, preserving the original lifetime and numberOfUsesLeft
1736 // to prevent deallocation of subgraph inputs and outputs.
1737 static void setInfoExceptLifetime(RunTimeOperandInfo* to, const RunTimeOperandInfo& from) {
1738 auto originalLifetime = to->lifetime;
1739 auto originalNumberOfUsesLeft = to->numberOfUsesLeft;
1740 *to = from;
1741 to->lifetime = originalLifetime;
1742 to->numberOfUsesLeft = originalNumberOfUsesLeft;
1743 }
1744
1745 int CpuExecutor::executeIfOperation(const Operation& operation, RunTimeOperandInfo* operands) {
1746 namespace op = operation_if;
1747 const RunTimeOperandInfo& condOperand = operands[operation.inputs[op::kCondBoolOperand]];
1748 if (condOperand.buffer == nullptr) {
1749 LOG(ERROR) << "Cannot read IF condition operand value";
1750 return ANEURALNETWORKS_OP_FAILED;
1751 }
1752 const bool condValue = *reinterpret_cast<const bool8*>(condOperand.buffer);
1753 VLOG(CPUEXE) << "CpuExecutor::executeIfOperation: condition value: " << condValue;
1754
1755 const uint32_t branchInputIndex = condValue ? op::kThenModelOperand : op::kElseModelOperand;
1756 const RunTimeOperandInfo& branchOperand = operands[operation.inputs[branchInputIndex]];
1757 const Subgraph& branchSubgraph = *reinterpret_cast<const Subgraph*>(branchOperand.buffer);
1758 std::vector<RunTimeOperandInfo> branchOperands = initializeRunTimeInfo(branchSubgraph);
1759
1760 // Initialize inner input and output operands from outer operands.
1761 for (uint32_t i = 0, n = branchSubgraph.inputIndexes.size(); i < n; ++i) {
1762 setInfoExceptLifetime(&branchOperands[branchSubgraph.inputIndexes[i]],
1763 operands[operation.inputs[op::kFirstInput + i]]);
1764 }
1765 for (uint32_t i = 0, n = branchSubgraph.outputIndexes.size(); i < n; ++i) {
1766 setInfoExceptLifetime(&branchOperands[branchSubgraph.outputIndexes[i]],
1767 operands[operation.outputs[i]]);
1768 }
1769
1770 NN_RETURN_IF_ERROR(executeSubgraph(branchSubgraph, branchOperands.data()));
1771 freeUnusedSubgraphOperands(&branchOperands);
1772
1773 // Update outer outputs.
1774 for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1775 setInfoExceptLifetime(&operands[operation.outputs[i]],
1776 branchOperands[branchSubgraph.outputIndexes[i]]);
1777 }
1778
1779 consumeOperationInputs(operation.inputs, operands);
1780 return ANEURALNETWORKS_NO_ERROR;
1781 }
1782
1783 int CpuExecutor::executeWhileOperation(const Operation& operation, RunTimeOperandInfo* operands) {
1784 namespace op = operation_while;
1785 const RunTimeOperandInfo& condModelOperand = operands[operation.inputs[op::kCondModelOperand]];
1786 const RunTimeOperandInfo& bodyModelOperand = operands[operation.inputs[op::kBodyModelOperand]];
1787 const Subgraph& condSubgraph = *reinterpret_cast<const Subgraph*>(condModelOperand.buffer);
1788 const Subgraph& bodySubgraph = *reinterpret_cast<const Subgraph*>(bodyModelOperand.buffer);
1789 std::vector<RunTimeOperandInfo> condOperands = initializeRunTimeInfo(condSubgraph);
1790 std::vector<RunTimeOperandInfo> bodyOperands = initializeRunTimeInfo(bodySubgraph);
1791
1792 // The code below implements the following sequence of subgraph input and output buffer
1793 // assignments:
1794 // iteration = 0 cond inputs = body inputs = outer inputs body outputs = tmp1
1795 // iteration = 1 cond inputs = body inputs = tmp1 body outputs = tmp2
1796 // iteration = 2 cond inputs = body inputs = tmp2 body outputs = tmp1
1797 // iteration = 3 cond inputs = body inputs = ... body outputs = ...
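    // In other words, tmp1 and tmp2 alternate as the body's output buffers, and on each
    // iteration the condition (and the next body run) reads whatever the previous body
    // execution produced.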
1798
1799 // For body output double buffering.
1800 std::vector<uint8_t*> tmp1(bodySubgraph.outputIndexes.size());
1801 std::vector<uint8_t*> tmp2(bodySubgraph.outputIndexes.size());
1802
1803 // For body outputs with unknown shape, we skip double buffering and
1804 // allocate on each iteration instead. This allows growing output tensors
1805 // inside a WHILE loop.
1806 std::vector<bool> bodyOutputHasUnknownShape(bodySubgraph.outputIndexes.size());
1807 for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1808 const Operand& operand = bodySubgraph.operands[bodySubgraph.outputIndexes[i]];
1809 bodyOutputHasUnknownShape[i] = nonExtensionOperandSizeOfData(operand) == 0;
1810 }
1811
1812 // Initialize condition inputs from outer operands.
1813 for (uint32_t i = 0, n = condSubgraph.inputIndexes.size(); i < n; ++i) {
1814 setInfoExceptLifetime(&condOperands[condSubgraph.inputIndexes[i]],
1815 operands[operation.inputs[op::kFirstInput + i]]);
1816 }
1817
1818 // Store condition output on the stack.
1819 RunTimeOperandInfo& condOutput = condOperands[condSubgraph.outputIndexes[0]];
1820 bool8 condValue = {/* initialized memory */};
1821 condOutput.buffer = &condValue;
1822 condOutput.length = sizeof(condValue);
1823
1824 std::chrono::nanoseconds timeoutDuration(mLoopTimeoutDuration);
1825 const auto startTime = std::chrono::steady_clock::now();
1826 for (uint32_t iteration = 0;; ++iteration) {
1827 VLOG(CPUEXE) << "CpuExecutor::executeWhileOperation: iteration " << iteration;
1828 if (iteration != 0) {
1829 // Set condition inputs from previous iteration outputs.
1830 for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1831 setInfoExceptLifetime(&condOperands[condSubgraph.inputIndexes[i]],
1832 bodyOperands[bodySubgraph.outputIndexes[i]]);
1833 }
1834 }
1835 NN_RETURN_IF_ERROR(executeSubgraph(condSubgraph, condOperands.data()));
1836 VLOG(CPUEXE) << "CpuExecutor::executeWhileOperation: condition value: "
1837 << static_cast<int>(condValue);
1838 if (!condValue) {
1839 break;
1840 }
1841
1842 const auto duration = std::chrono::steady_clock::now() - startTime;
1843 if (duration > timeoutDuration) {
1844 LOG(ERROR) << "CpuExecutor::executeWhileOperation: timed out after "
1845 << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
1846 << " ms";
1847 return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
1848 }
1849
1850 // Set body inputs from condition inputs.
1851 for (uint32_t i = 0, n = bodySubgraph.inputIndexes.size(); i < n; ++i) {
1852 bodyOperands[bodySubgraph.inputIndexes[i]] = condOperands[condSubgraph.inputIndexes[i]];
1853 }
1854 // Set body outputs.
1855 auto& outputBuffer = iteration % 2 == 0 ? tmp1 : tmp2;
1856 for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1857 RunTimeOperandInfo& info = bodyOperands[bodySubgraph.outputIndexes[i]];
1858 if (bodyOutputHasUnknownShape[i]) {
1859 // Reset dimensions and buffer.
1860 info.dimensions = bodySubgraph.operands[bodySubgraph.outputIndexes[i]].dimensions;
1861 if (outputBuffer[i] != nullptr) {
1862 delete[] outputBuffer[i];
1863 outputBuffer[i] = nullptr;
1864 }
1865 }
1866 info.buffer = outputBuffer[i];
1867 }
1868
1869 NN_RETURN_IF_ERROR(executeSubgraph(bodySubgraph, bodyOperands.data()));
1870
1871 // Update output buffer information in case we have allocated new buffers.
1872 for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1873 outputBuffer[i] = bodyOperands[bodySubgraph.outputIndexes[i]].buffer;
1874 }
1875 }
1876
1877 // Copy body outputs to outer outputs.
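    // After the loop exits, the condition inputs hold the outputs of the final body
    // execution (or the outer inputs if the body never ran), so those are the values
    // copied out here.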
1878 for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1879 RunTimeOperandInfo& outerOperand = operands[operation.outputs[i]];
1880 RunTimeOperandInfo& innerOperand = condOperands[condSubgraph.inputIndexes[i]];
1881 if (int error; !setInfoAndAllocateIfNeeded(&outerOperand, innerOperand.shape(), &error)) {
1882 return error;
1883 }
1884 CHECK_EQ(outerOperand.length, innerOperand.length);
1885 // TODO: Use the outer buffer as tmp1 to avoid copies.
1886 std::memcpy(outerOperand.buffer, innerOperand.buffer, innerOperand.length);
1887 }
1888
1889 auto freeLoopOutputs = [](const std::vector<uint8_t*>& tmp) {
1890 for (auto buffer : tmp) {
1891 if (buffer != nullptr) {
1892 delete[] buffer;
1893 }
1894 }
1895 };
1896 freeLoopOutputs(tmp1);
1897 freeLoopOutputs(tmp2);
1898 freeUnusedSubgraphOperands(&condOperands);
1899 freeUnusedSubgraphOperands(&bodyOperands);
1900 consumeOperationInputs(operation.inputs, operands);
1901
1902 return ANEURALNETWORKS_NO_ERROR;
1903 }
1904
1905 void CpuExecutor::setOutputShapes(const std::vector<uint32_t>& outputIndexes,
1906 const std::vector<RunTimeOperandInfo>& operands) {
1907 mOutputShapes.resize(outputIndexes.size());
1908 for (uint32_t i = 0; i < outputIndexes.size(); i++) {
1909 const uint32_t operandIndex = outputIndexes[i];
1910 const RunTimeOperandInfo& from = operands[operandIndex];
1911 mOutputShapes[i].dimensions = from.dimensions;
1912 mOutputShapes[i].isSufficient = from.isSufficient();
1913 }
1914 }
1915
1916 // b/109953668, disable OpenMP
1917 #ifdef NNAPI_OPENMP
1918 ScopedOpenmpSettings::ScopedOpenmpSettings() {
1919 mBlocktimeInitial = kmp_get_blocktime();
1920 kmp_set_blocktime(20); // ms, see b/109645291
1921
1922 #if NNAPI_LIMIT_CPU_THREADS
1923     // Code not yet enabled. The number of threads to use will be chosen based on
1924     // benchmarking. See the longer comment by the class declaration.
1925 mMaxThreadsInitial = Eigen::nbThreads();
1926 const int nProcs = omp_get_num_procs();
1927 int threads = nProcs;
1928 if (nProcs >= 8) {
1929 threads = nProcs - 4;
1930 } else if (nProcs >= 4) {
1931 threads = nProcs - 2;
1932 }
1933 Eigen::setNbThreads(threads);
1934 #endif
1935 }
1936
1937 ScopedOpenmpSettings::~ScopedOpenmpSettings() {
1938 kmp_set_blocktime(mBlocktimeInitial);
1939 #if NNAPI_LIMIT_CPU_THREADS
1940 Eigen::setNbThreads(mMaxThreadsInitial);
1941 #endif
1942 }
1943 #endif // NNAPI_OPENMP
1944
1945 } // namespace nn
1946 } // namespace android
1947