/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Manager"

#include "Manager.h"

#include <android/hidl/manager/1.2/IServiceManager.h>
#include <android/sync.h>
#include <build/version.h>
#include <cutils/native_handle.h>
#include <hidl/HidlTransportSupport.h>
#include <hidl/ServiceManagement.h>
#include <unistd.h>

#include <algorithm>
#include <cstring>
#include <functional>
#include <memory>
#include <string>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

#include "Callbacks.h"
#include "CpuExecutor.h"
#include "ExecutionBurstController.h"
#include "HalInterfaces.h"
#include "Memory.h"
#include "MetaModel.h"
#include "ModelArgumentInfo.h"
#include "Tracing.h"
#include "TypeManager.h"
#include "Utils.h"
#include "VersionedInterfaces.h"

namespace android {
namespace nn {

using namespace hal;

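// Sentinel Timing value meaning "no measurement available": per the NNAPI HAL definition of
// Timing, a value of UINT64_MAX for timeOnDevice or timeInDriver indicates that the corresponding
// duration was not measured.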
const Timing kNoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};

// A Device with actual underlying driver
class DriverDevice : public Device {
   public:
    // Create a DriverDevice from a name and a DeviceFactory function.
    // Returns nullptr on failure.
    static std::shared_ptr<DriverDevice> create(const std::string& name,
                                                const DeviceFactory& makeDevice);

    // Prefer using DriverDevice::create
    DriverDevice(std::shared_ptr<VersionedIDevice> device);

    const std::string& getName() const override { return kInterface->getName(); }
    const std::string& getVersionString() const override { return kInterface->getVersionString(); }
    int64_t getFeatureLevel() const override { return kInterface->getFeatureLevel(); }
    int32_t getType() const override { return kInterface->getType(); }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kInterface->getSupportedExtensions();
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    PerformanceInfo getPerformance(OperandType type) const override {
        const auto& capabilities = kInterface->getCapabilities();
        return lookup(capabilities.operandPerformance, type);
    }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.relaxedFloat32toFloat16PerformanceScalar;
    }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.relaxedFloat32toFloat16PerformanceTensor;
    }
    PerformanceInfo getIfPerformance() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.ifPerformance;
    }
    PerformanceInfo getWhilePerformance() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.whilePerformance;
    }
    bool isCachingSupported() const override {
        // Caching is supported if either numModelCache or numDataCache is greater than 0.
        const auto [numModelCacheFiles, numDataCacheFiles] =
                kInterface->getNumberOfCacheFilesNeeded();
        return numModelCacheFiles > 0 || numDataCacheFiles > 0;
    }
    int wait() const override { return kInterface->wait(); }

    std::pair<int, std::shared_ptr<PreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const std::optional<Deadline>& deadline, const std::string& cacheDir,
            const std::optional<CacheToken>& maybeToken) const override;

    std::pair<int, std::unique_ptr<Memory>> allocate(const MemoryDescriptor& desc,
                                                     hal::OperandType) const override;

   private:
    const std::shared_ptr<VersionedIDevice> kInterface;

#ifdef NN_DEBUGGABLE
    // For debugging: behavior of IDevice::getSupportedOperations for SampleDriver.
    // 0 - all operations reported by IDevice::getSupportedOperations() supported
    // 1 - some operations reported by IDevice::getSupportedOperations() supported
    uint32_t mSupported = 0;
#endif  // NN_DEBUGGABLE
};
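
// Illustrative usage of DriverDevice (mirrors DeviceManager::registerDevice below): a
// DeviceFactory is a callable that fetches the driver's IDevice interface, and
// DriverDevice::create wraps the result:
//
//     const DeviceFactory makeDevice = [](bool blocking) {
//         return blocking ? V1_0::IDevice::getService("example-driver")
//                         : V1_0::IDevice::tryGetService("example-driver");
//     };
//     std::shared_ptr<DriverDevice> device = DriverDevice::create("example-driver", makeDevice);
//
// "example-driver" is a placeholder instance name, not a real service.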

// A PreparedModel with an underlying IPreparedModel instance returned by the actual driver.
class DriverPreparedModel : public PreparedModel {
   public:
    DriverPreparedModel(const Device* device,
                        const std::shared_ptr<VersionedIPreparedModel>& preparedModel)
        : mDevice(device), mPreparedModel(preparedModel) {
        CHECK(mDevice != nullptr);
        CHECK(mPreparedModel != nullptr);
    }

    const Device* getDevice() const override { return mDevice; }
    std::shared_ptr<VersionedIPreparedModel> getInterface() const override {
        return mPreparedModel;
    }
    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories,
            const std::shared_ptr<ExecutionBurstController>& burstController, MeasureTiming measure,
            const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration) const override;

    std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories, const std::vector<int>& waitFor,
            MeasureTiming measure, const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration,
            const hal::OptionalTimeoutDuration& timeoutDurationAfterFence) const override;

    std::shared_ptr<ExecutionBurstController> configureExecutionBurst(
            bool preferPowerOverLatency) const override {
        return mPreparedModel->configureExecutionBurst(preferPowerOverLatency);
    }

   private:
    const Device* mDevice;
    const std::shared_ptr<VersionedIPreparedModel> mPreparedModel;
};

DriverDevice::DriverDevice(std::shared_ptr<VersionedIDevice> device)
    : kInterface(std::move(device)) {
    CHECK(kInterface != nullptr);
#ifdef NN_DEBUGGABLE
    static const char samplePrefix[] = "sample";
    if (getName().substr(0, sizeof(samplePrefix) - 1) == samplePrefix) {
        mSupported = getProp("debug.nn.sample.supported");
    }
#endif  // NN_DEBUGGABLE
}

std::shared_ptr<DriverDevice> DriverDevice::create(const std::string& name,
                                                   const DeviceFactory& makeDevice) {
    CHECK(makeDevice != nullptr);
    std::shared_ptr<VersionedIDevice> device = VersionedIDevice::create(name, makeDevice);
    if (device == nullptr) {
        LOG(ERROR) << "DriverDevice::create failed to create VersionedIDevice object for service "
                   << name;
        return nullptr;
    }

    return std::make_shared<DriverDevice>(std::move(device));
}

std::vector<bool> DriverDevice::getSupportedOperations(const MetaModel& metaModel) const {
    // Query the driver for what it can do.
    ErrorStatus status = ErrorStatus::GENERAL_FAILURE;
    std::vector<bool> supportedOperations;
    std::tie(status, supportedOperations) = kInterface->getSupportedOperations(metaModel);

    const Model& hidlModel = metaModel.getModel();
    const uint32_t operationCount = hidlModel.main.operations.size();
    if (status != ErrorStatus::NONE) {
        LOG(ERROR) << "IDevice::getSupportedOperations returned the error " << toString(status);
        // Set the supported operation vectors to all false, so we won't use this driver.
        return std::vector<bool>(operationCount, false);
    }
    if (supportedOperations.size() != operationCount) {
        LOG(ERROR) << "IDevice::getSupportedOperations returned a vector of length "
                   << supportedOperations.size() << " when expecting " << operationCount;
        // Set the supported operation vectors to all false, so we won't use this driver.
        return std::vector<bool>(operationCount, false);
    }

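    // Debug-only pruning (describes the NN_DEBUGGABLE block below): when debug.nn.sample.supported
    // set mSupported to 1 for a sample driver, deterministically flip some of the driver's
    // "supported" answers to false based on a hash of the device name combined with each
    // operation's signature (type, operand types, dimensions, lifetimes). The hash is stable
    // across runs, so the same operations are pruned each time; this exercises partitioning and
    // fallback paths.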
#ifdef NN_DEBUGGABLE
    if (mSupported != 1) {
        return supportedOperations;
    }

    const uint32_t baseAccumulator = std::hash<std::string>{}(getName());
    for (size_t operationIndex = 0; operationIndex < supportedOperations.size(); operationIndex++) {
        if (!supportedOperations[operationIndex]) {
            continue;
        }

        uint32_t accumulator = baseAccumulator;
        const Operation& operation = hidlModel.main.operations[operationIndex];
        accumulator ^= static_cast<uint32_t>(operation.type);
        auto accumulateOperands = [&hidlModel, &accumulator](const hidl_vec<uint32_t>& operands) {
            for (uint32_t operandIndex : operands) {
                const Operand& operand = hidlModel.main.operands[operandIndex];
                accumulator ^= static_cast<uint32_t>(operand.type);
                accumulator ^= operand.dimensions.size();
                for (uint32_t dimension : operand.dimensions) {
                    accumulator ^= dimension;
                    if (operand.lifetime == OperandLifeTime::CONSTANT_COPY ||
                        operand.lifetime == OperandLifeTime::CONSTANT_REFERENCE) {
                        accumulator ^= 1;
                    }
                }
            }
        };
        accumulateOperands(operation.inputs);
        accumulateOperands(operation.outputs);
        if (accumulator & 1) {
            supportedOperations[operationIndex] = false;
        }
    }
#endif  // NN_DEBUGGABLE

    return supportedOperations;
}

std::pair<int, std::shared_ptr<PreparedModel>> DriverDevice::prepareModel(
        const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
        const std::optional<Deadline>& deadline, const std::string& cacheDir,
        const std::optional<CacheToken>& maybeToken) const {
    const auto [n, preparedModel] = kInterface->prepareModel(makeModel, preference, priority,
                                                             deadline, cacheDir, maybeToken);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return {n, nullptr};
    }
    CHECK(preparedModel != nullptr) << "prepareModel returned nullptr without error code";
    return {ANEURALNETWORKS_NO_ERROR, std::make_shared<DriverPreparedModel>(this, preparedModel)};
}

std::pair<int, std::unique_ptr<Memory>> DriverDevice::allocate(const MemoryDescriptor& desc,
                                                               hal::OperandType) const {
    const BufferDesc hidlDesc = {.dimensions = desc.dimensions};
    std::vector<std::shared_ptr<VersionedIPreparedModel>> preparedModels(
            desc.preparedModels.size());
    std::transform(desc.preparedModels.begin(), desc.preparedModels.end(), preparedModels.begin(),
                   [](const auto* preparedModel) {
                       const auto versionedPreparedModel = preparedModel->getInterface();
                       CHECK(versionedPreparedModel != nullptr);
                       return versionedPreparedModel;
                   });
    auto [status, buffer, token] =
            kInterface->allocate(hidlDesc, preparedModels, desc.inputRoles, desc.outputRoles);
    if (status != ErrorStatus::NONE) {
        LOG(ERROR) << "DriverDevice::allocate -- memory allocation on device " << getName()
                   << " failed!";
        return {convertErrorStatusToResultCode(status), nullptr};
    }
    return MemoryFromDevice::create(std::move(buffer), token);
}

// Figures out how to place each of the inputs or outputs in a buffer. This just
// does the layout and memory allocation; it does not copy data. Each argument is
// aligned within the pool as needed.
static std::tuple<int, std::unique_ptr<MemoryAshmem>, std::vector<DataLocation>>
allocatePointerArgumentsToPool(const std::vector<ModelArgumentInfo>& args,
                               std::vector<const Memory*>* memories) {
    CHECK(memories != nullptr);
    std::vector<DataLocation> ptrArgsLocations;
    const uint32_t nextPoolIndex = memories->size();
    int64_t total = 0;
    for (const auto& info : args) {
        if (info.state() == ModelArgumentInfo::POINTER) {
            // TODO Good enough alignment?
            total += alignBytesNeeded(static_cast<uint32_t>(total), info.length());
            ptrArgsLocations.push_back({.poolIndex = nextPoolIndex,
                                        .offset = static_cast<uint32_t>(total),
                                        .length = info.length()});
            total += info.length();
        }
    }
    if (total > 0xFFFFFFFF) {
        LOG(ERROR) << "allocatePointerArgumentsToPool: ANeuralNetworksExecution: Size of all "
                      "inputs or outputs exceeds 2^32.";
        return {ANEURALNETWORKS_BAD_DATA, nullptr, std::vector<DataLocation>{}};
    }
    if (total <= 0) {
        return {ANEURALNETWORKS_NO_ERROR, nullptr, std::vector<DataLocation>{}};
    }
    auto [n, memory] = MemoryAshmem::create(total);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return {n, nullptr, std::vector<DataLocation>{}};
    }
    memories->push_back(memory.get());
    return {ANEURALNETWORKS_NO_ERROR, std::move(memory), std::move(ptrArgsLocations)};
}
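
// Illustrative layout produced by allocatePointerArgumentsToPool (an example, not real code):
// given two POINTER arguments of 10 and 20 bytes, and assuming alignBytesNeeded() pads the second
// argument to a 4-byte boundary, the returned locations would be
//     {.poolIndex = N, .offset = 0,  .length = 10} and
//     {.poolIndex = N, .offset = 12, .length = 20},
// with a single 32-byte MemoryAshmem pool appended to *memories at index N.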

// Perform computation on an actual HIDL driver.
//
// Because HIDL cannot take raw pointers, two separate memory pools will be allocated for inputs
// and outputs specified by pointers. The input pointer data will be copied to the input pool
// prior to execution, and the output pointer data will be copied out from the output pool after
// the execution.
//
// The HIDL invocation will choose between sync/async execution according to
// DeviceManager::mSyncExecHal.
std::tuple<int, std::vector<OutputShape>, Timing> DriverPreparedModel::execute(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const Memory*>& memories,
        const std::shared_ptr<ExecutionBurstController>& burstController, MeasureTiming measure,
        const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::execute");

    // Make a copy of the memory tracker as we will append memory pools for pointer arguments.
    std::vector<const Memory*> localMemories = memories;

    // We separate the input & output pools so accelerators only need to copy
    // the contents of the input pools. We could also use it to set protection
    // on read only memory but that's not currently done.

    // Layout the input and output data
    const auto [n1, inputPtrArgsMemory, inputPtrArgsLocations] =
            allocatePointerArgumentsToPool(inputs, &localMemories);
    if (n1 != ANEURALNETWORKS_NO_ERROR) {
        return {n1, {}, kNoTiming};
    }
    const auto [n2, outputPtrArgsMemory, outputPtrArgsLocations] =
            allocatePointerArgumentsToPool(outputs, &localMemories);
    if (n2 != ANEURALNETWORKS_NO_ERROR) {
        return {n2, {}, kNoTiming};
    }

    // Copy the input data that was specified via a pointer.
    if (inputPtrArgsMemory != nullptr) {
        uint32_t ptrInputIndex = 0;
        for (const auto& info : inputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = inputPtrArgsLocations[ptrInputIndex++];
                uint8_t* const data = inputPtrArgsMemory->getPointer();
                memcpy(data + loc.offset, info.buffer(), loc.length);
            }
        }
    }

    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);
    uint32_t count = localMemories.size();
    request.pools.resize(count);
    for (uint32_t i = 0; i < count; i++) {
        request.pools[i] = localMemories[i]->getMemoryPool();
    }

    NNTRACE_FULL_SWITCH(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
                        "DriverPreparedModel::execute::execute");

    int n = ANEURALNETWORKS_OP_FAILED;
    std::vector<OutputShape> outputShapes;
    Timing timing = kNoTiming;

    // compute using burst if present
    const bool burstCompute = (burstController != nullptr);
    bool burstFallback = true;
    if (burstCompute) {
        const bool compliant = compliantWithV1_2(request);
        if (compliant) {
            V1_0::Request request12 = convertToV1_2(request);
            std::vector<intptr_t> memoryIds;
            memoryIds.reserve(localMemories.size());
            for (const Memory* memory : localMemories) {
                memory->usedBy(burstController);
                memoryIds.push_back(memory->getKey());
            }

            VLOG(EXECUTION) << "Before ExecutionBurstController->compute() "
                            << SHOW_IF_DEBUG(toString(request12));
            std::tie(n, outputShapes, timing, burstFallback) =
                    burstController->compute(request12, measure, memoryIds);
        }
    }

    // compute from IPreparedModel if either:
    // (1) burst was not supplied, or
    // (2) the burst execution failed and requested a fallback execution
    if (!burstCompute || burstFallback) {
        const bool preferSynchronous = DeviceManager::get()->syncExecHal();
        std::tie(n, outputShapes, timing) = mPreparedModel->execute(
                request, measure, deadline, loopTimeoutDuration, preferSynchronous);
    }

    if (n != ANEURALNETWORKS_NO_ERROR) {
        VLOG(EXECUTION) << "**Execution failed**";
        return {n, std::move(outputShapes), timing};
    }

    // Copy the output data from shared memory to the output buffers.
    NNTRACE_RT_SWITCH(NNTRACE_PHASE_RESULTS, "DriverPreparedModel::execute");
    if (outputPtrArgsMemory != nullptr) {
        uint32_t ptrOutputIndex = 0;
        for (const auto& info : outputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = outputPtrArgsLocations[ptrOutputIndex++];
                const uint8_t* const data = outputPtrArgsMemory->getPointer();
                memcpy(info.buffer(), data + loc.offset, loc.length);
            }
        }
    }

    VLOG(EXECUTION) << "DriverPreparedModel::execute completed";
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing>
DriverPreparedModel::executeFenced(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const Memory*>& memories, const std::vector<int>& waitFor,
        hal::MeasureTiming measure, const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration,
        const hal::OptionalTimeoutDuration& timeoutDurationAfterFence) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::executeFenced");
    CHECK(std::all_of(waitFor.begin(), waitFor.end(), [](int fd) { return fd > 0; }));
    // Make a copy of the memory tracker as we will append memory pools for pointer arguments.
    std::vector<const Memory*> localMemories = memories;
    sp<hal::IFencedExecutionCallback> executeFencedCallback;
    hal::Timing timing = kNoTiming;

    // We separate the input & output pools so accelerators only need to copy
    // the contents of the input pools. We could also use it to set protection
    // on read only memory but that's not currently done.

    // Layout the input and output data
    const auto [n1, inputPtrArgsMemory, inputPtrArgsLocations] =
            allocatePointerArgumentsToPool(inputs, &localMemories);
    if (n1 != ANEURALNETWORKS_NO_ERROR) {
        return {n1, -1, nullptr, timing};
    }
    const auto [n2, outputPtrArgsMemory, outputPtrArgsLocations] =
            allocatePointerArgumentsToPool(outputs, &localMemories);
    if (n2 != ANEURALNETWORKS_NO_ERROR) {
        return {n2, -1, nullptr, timing};
    }

    // Copy the input data that was specified via a pointer.
    if (inputPtrArgsMemory != nullptr) {
        uint32_t ptrInputIndex = 0;
        for (const auto& info : inputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = inputPtrArgsLocations[ptrInputIndex++];
                uint8_t* const data = inputPtrArgsMemory->getPointer();
                memcpy(data + loc.offset, info.buffer(), loc.length);
            }
        }
    }

    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);
    uint32_t count = localMemories.size();
    request.pools.resize(count);
    for (uint32_t i = 0; i < count; i++) {
        request.pools[i] = localMemories[i]->getMemoryPool();
    }

    NNTRACE_FULL_SWITCH(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
                        "DriverPreparedModel::executeFenced");

    int n = ANEURALNETWORKS_OP_FAILED;
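    // Wrap each wait-for sync fence fd in a one-fd native_handle_t owned by a hidl_handle so it
    // can be passed across the HIDL boundary; with setTo(..., /*shouldOwn=*/true) the hidl_handle
    // becomes responsible for closing the dup'd fd and freeing the handle when it is destroyed.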
    hidl_vec<hidl_handle> waitForHandles;
    waitForHandles.resize(waitFor.size());
    for (uint32_t i = 0; i < waitFor.size(); i++) {
        native_handle_t* nativeHandle = native_handle_create(1, 0);
        if (nativeHandle == nullptr) {
            LOG(ERROR) << "Failed to create native_handle";
            return {n, -1, nullptr, timing};
        }
        int dupFd = dup(waitFor[i]);
        if (dupFd <= 0) {
            LOG(ERROR) << "Unable to dup the file descriptor";
            return {n, -1, nullptr, timing};
        }
        nativeHandle->data[0] = dupFd;
        hidl_handle hidlHandle;
        hidlHandle.setTo(nativeHandle, /*shouldOwn=*/true);
        waitForHandles[i] = std::move(hidlHandle);
    }

    hidl_handle syncFence;
    std::tie(n, syncFence, executeFencedCallback, timing) =
            mPreparedModel->executeFenced(request, waitForHandles, measure, deadline,
                                          loopTimeoutDuration, timeoutDurationAfterFence);

    if (n != ANEURALNETWORKS_NO_ERROR) {
        VLOG(EXECUTION) << "**executeFenced failed**";
        return {n, -1, nullptr, timing};
    }

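    // The sync fence handle returned by the driver (if any) signals completion of the fenced
    // execution; dup the underlying fd so the caller gets a descriptor whose lifetime is
    // independent of the hidl_handle returned over HIDL.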
    int syncFenceFd = -1;
    if (syncFence.getNativeHandle()) {
        syncFenceFd = dup(syncFence.getNativeHandle()->data[0]);
        if (syncFenceFd < 0) {
            LOG(ERROR) << "Failed to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, timing};
        }
    }
    // If an output buffer is provided as a malloc pointer, wait for the execution to finish,
    // then copy the output data from shared memory to the output buffers.
    if (outputPtrArgsMemory != nullptr) {
        NNTRACE_RT_SWITCH(NNTRACE_PHASE_RESULTS, "DriverPreparedModel::executeFenced");
        if (syncFenceFd > 0) {
            auto r = syncWait(syncFenceFd, -1);
            if (r != FenceState::SIGNALED) {
                LOG(ERROR) << "syncWait failed, fd: " << syncFenceFd;
                return {ANEURALNETWORKS_OP_FAILED, syncFenceFd, nullptr, timing};
            }
        }
        uint32_t ptrOutputIndex = 0;
        for (const auto& info : outputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = outputPtrArgsLocations[ptrOutputIndex++];
                const uint8_t* const data = outputPtrArgsMemory->getPointer();
                memcpy(info.buffer(), data + loc.offset, loc.length);
            }
        }
    }

    VLOG(EXECUTION) << "DriverPreparedModel::executeFenced completed";
    return {ANEURALNETWORKS_NO_ERROR, syncFenceFd, executeFencedCallback, timing};
}

// A special abstracted device for the CPU. Only one instance of this class will exist.
// Use get() to retrieve it.
class CpuDevice : public Device {
   public:
    // Returns the singleton CPU fallback device.
    static std::shared_ptr<CpuDevice> get() {
        static std::shared_ptr<CpuDevice> instance(new CpuDevice);
        return instance;
    }

    const std::string& getName() const override { return kName; }
    const std::string& getVersionString() const override { return kVersionString; }
    int64_t getFeatureLevel() const override { return kFeatureLevel; }
    int32_t getType() const override { return ANEURALNETWORKS_DEVICE_CPU; }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kSupportedExtensions;
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    PerformanceInfo getPerformance(OperandType) const override { return kPerformance; }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        return kPerformance;
    }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        return kPerformance;
    }
    PerformanceInfo getIfPerformance() const override { return kPerformance; }
    PerformanceInfo getWhilePerformance() const override { return kPerformance; }
    bool isCachingSupported() const override { return false; }
    int wait() const override { return ANEURALNETWORKS_NO_ERROR; }

    std::pair<int, std::shared_ptr<PreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const std::optional<Deadline>& deadline, const std::string& cacheDir,
            const std::optional<CacheToken>& maybeToken) const override;

    std::pair<int, std::unique_ptr<Memory>> allocate(const MemoryDescriptor& desc,
                                                     OperandType type) const override;

   private:
    CpuDevice() = default;
    const int64_t kFeatureLevel = __ANDROID_API__;
    const std::string kName = "nnapi-reference";
    const std::string kVersionString = build::GetBuildNumber();
    // Since the performance is a ratio compared to the CPU performance,
    // by definition the performance of the CPU is 1.0.
    const PerformanceInfo kPerformance = {.execTime = 1.0f, .powerUsage = 1.0f};
    const std::vector<Extension> kSupportedExtensions{/* No extensions. */};
};

// A special abstracted PreparedModel for the CPU, constructed by CpuDevice.
class CpuPreparedModel : public PreparedModel {
   public:
    // Factory method for CpuPreparedModel. Returns ANEURALNETWORKS_NO_ERROR and
    // a prepared model object if successfully created. Returns an error code
    // and nullptr otherwise.
    static std::pair<int, std::shared_ptr<PreparedModel>> create(Model hidlModel);

    const Device* getDevice() const override { return CpuDevice::get().get(); }
    std::shared_ptr<VersionedIPreparedModel> getInterface() const override { return nullptr; }

    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories,
            const std::shared_ptr<ExecutionBurstController>& burstController, MeasureTiming measure,
            const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration) const override;

    std::shared_ptr<ExecutionBurstController> configureExecutionBurst(
            bool /*preferPowerOverLatency*/) const override {
        return nullptr;
    }

    std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories, const std::vector<int>& wait_for,
            MeasureTiming measure, const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration,
            const hal::OptionalTimeoutDuration& timeoutDurationAfterFence) const override;

    // Prefer to use CpuPreparedModel::create.
    CpuPreparedModel(Model model, std::vector<RunTimePoolInfo> poolInfos)
        : mModel(std::move(model)), mModelPoolInfos(std::move(poolInfos)) {}

   private:
    const Model mModel;
    const std::vector<RunTimePoolInfo> mModelPoolInfos;
};

std::vector<bool> CpuDevice::getSupportedOperations(const MetaModel& metaModel) const {
    const Model& hidlModel = metaModel.getModel();
    const size_t count = hidlModel.main.operations.size();
    std::vector<bool> result(count, false);
    for (size_t i = 0; i < count; i++) {
        // TODO(b/119870033): Decide whether and how post-P operations would be supported on CPU.
        //                    We may want to use the slicer for CpuDevice just as we do for
        //                    DriverDevice.
        OperationType operationType = hidlModel.main.operations[i].type;
        result[i] = !isExtensionOperationType(operationType) &&
                    operationType != OperationType::OEM_OPERATION;
    }
    return result;
}

std::pair<int, std::shared_ptr<PreparedModel>> CpuDevice::prepareModel(
        const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
        const std::optional<Deadline>& deadline, const std::string& /*cacheDir*/,
        const std::optional<CacheToken>& maybeToken) const {
    CHECK(!maybeToken.has_value())
            << "Should never call prepareModel with cache information on CpuDevice";

    const Model model = makeModel();
    if (!validateModel(model, ValidationMode::RUNTIME) ||
        !validateExecutionPreference(preference) || !validatePriority(priority)) {
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, nullptr};
    }

    return CpuPreparedModel::create(model);
}

std::pair<int, std::unique_ptr<Memory>> CpuDevice::allocate(const MemoryDescriptor& desc,
                                                            OperandType type) const {
    uint32_t size = TypeManager::get()->getSizeOfData(type, desc.dimensions);
    if (size == 0) {
        LOG(ERROR) << "CpuDevice::allocate -- does not support unknown dimensions.";
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    return MemoryAshmem::create(size);
}

std::pair<int, std::shared_ptr<PreparedModel>> CpuPreparedModel::create(Model hidlModel) {
    std::vector<RunTimePoolInfo> poolInfos;
    if (!setRunTimePoolInfosFromHidlMemories(&poolInfos, hidlModel.pools)) {
        return {ANEURALNETWORKS_UNMAPPABLE, nullptr};
    }

    std::shared_ptr<PreparedModel> preparedModel =
            std::make_shared<CpuPreparedModel>(std::move(hidlModel), std::move(poolInfos));
    return {ANEURALNETWORKS_NO_ERROR, std::move(preparedModel)};
}

static std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpu(
        const Model& model, const Request& request,
        const std::vector<RunTimePoolInfo>& modelPoolInfos,
        const std::vector<RunTimePoolInfo>& requestPoolInfos,
        const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "computeOnCpu");
    CpuExecutor executor;
    if (loopTimeoutDuration.getDiscriminator() !=
        OptionalTimeoutDuration::hidl_discriminator::none) {
        executor.setLoopTimeout(loopTimeoutDuration.nanoseconds());
    }
    if (deadline.has_value()) {
        executor.setDeadline(*deadline);
    }
    int err = executor.run(model, request, modelPoolInfos, requestPoolInfos);
    const auto& outputShapes = executor.getOutputShapes();
    return {err, outputShapes, kNoTiming};
}

std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing>
CpuPreparedModel::executeFenced(const std::vector<ModelArgumentInfo>& inputs,
                                const std::vector<ModelArgumentInfo>& outputs,
                                const std::vector<const Memory*>& memories,
                                const std::vector<int>& waitFor, hal::MeasureTiming measure,
                                const std::optional<Deadline>& deadline,
                                const OptionalTimeoutDuration& loopTimeoutDuration,
                                const hal::OptionalTimeoutDuration& duration) const {
    VLOG(EXECUTION)
            << "CpuPreparedModel::executeFenced wait for sync fences to signal before execution";
    for (int syncFd : waitFor) {
        if (syncFd > 0) {
            auto r = syncWait(syncFd, -1);
            if (r != FenceState::SIGNALED) {
                LOG(ERROR) << "sync wait failed, fd: " << syncFd;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {UINT64_MAX, UINT64_MAX}};
            }
        }
    }

    // Update deadline if the timeout duration is closer than the deadline.
    auto closestDeadline = deadline;
    if (duration.getDiscriminator() != OptionalTimeoutDuration::hidl_discriminator::none) {
        const auto timeoutDurationDeadline = makeDeadline(duration.nanoseconds());
        if (!closestDeadline.has_value() || *closestDeadline > timeoutDurationDeadline) {
            closestDeadline = timeoutDurationDeadline;
        }
    }

    const auto [result, outputShapes, timing] = execute(inputs, outputs, memories, nullptr, measure,
                                                        closestDeadline, loopTimeoutDuration);
    return {result, -1, nullptr, timing};
}

// Perform computation on the NNAPI CPU reference implementation.
//
// Unlike DriverPreparedModel::execute, the NNAPI CPU reference executor lives in the same process
// as the NNAPI runtime and can take raw pointers. To avoid copying data, this method creates one
// memory pool per pointer-backed input and output.
//
// Will choose between sync/async execution according to DeviceManager::mSyncExecCpu.
std::tuple<int, std::vector<OutputShape>, Timing> CpuPreparedModel::execute(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const Memory*>& memories,
        const std::shared_ptr<ExecutionBurstController>& /*burstController*/,
        MeasureTiming /*measure*/, const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration) const {
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, {}, kNoTiming};
    }

    std::vector<RunTimePoolInfo> requestPoolInfos;
    requestPoolInfos.reserve(memories.size());
    for (const Memory* mem : memories) {
        if (std::optional<RunTimePoolInfo> poolInfo = mem->getRunTimePoolInfo()) {
            requestPoolInfos.emplace_back(*poolInfo);
        } else {
            return {ANEURALNETWORKS_UNMAPPABLE, {}, kNoTiming};
        }
    }
    // Create one pool for each input or output that is specified by a pointer.
    auto fixPointerArguments =
            [&requestPoolInfos](const std::vector<ModelArgumentInfo>& argumentInfos) {
                std::vector<DataLocation> ptrArgsLocations;
                for (const ModelArgumentInfo& argumentInfo : argumentInfos) {
                    if (argumentInfo.state() == ModelArgumentInfo::POINTER) {
                        ptrArgsLocations.push_back(
                                {.poolIndex = static_cast<uint32_t>(requestPoolInfos.size()),
                                 .offset = 0,
                                 .length = argumentInfo.length()});
                        requestPoolInfos.emplace_back(RunTimePoolInfo::createFromExistingBuffer(
                                static_cast<uint8_t*>(argumentInfo.buffer())));
                    }
                }
                return ptrArgsLocations;
            };
    const std::vector<DataLocation> inputPtrArgsLocations = fixPointerArguments(inputs);
    const std::vector<DataLocation> outputPtrArgsLocations = fixPointerArguments(outputs);

    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);

    if (!DeviceManager::get()->syncExecCpu()) {
        // TODO: use a thread pool
        // TODO(mikie): this could have NNTRACE so we could measure the overhead
        //              of spinning up a new thread.
        std::tuple<int, std::vector<OutputShape>, Timing> result = {};
        std::thread([this, &request, &requestPoolInfos, &deadline, &loopTimeoutDuration, &result] {
            result = computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
                                  loopTimeoutDuration);
        }).join();
        return result;
    }

    return computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
                        loopTimeoutDuration);
}

DeviceManager* DeviceManager::get() {
    static DeviceManager manager;
    return &manager;
}

std::shared_ptr<Device> DeviceManager::getCpuDevice() {
    return CpuDevice::get();
}

std::shared_ptr<Device> DeviceManager::forTest_makeDriverDevice(const std::string& name,
                                                                const sp<V1_0::IDevice>& device) {
    const DeviceFactory makeDevice = [device](bool /*blocking*/) { return device; };
    const auto driverDevice = DriverDevice::create(name, makeDevice);
    CHECK(driverDevice != nullptr);
    return driverDevice;
}

void DeviceManager::findAvailableDevices() {
    VLOG(MANAGER) << "findAvailableDevices";

    // register driver devices
    const auto names = hardware::getAllHalInstanceNames(V1_0::IDevice::descriptor);
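    // Each name is the instance name of a registered android.hardware.neuralnetworks IDevice
    // service; for example, the sample drivers register instances whose names start with "sample"
    // (see the samplePrefix check in the DriverDevice constructor above). The actual set depends
    // on the drivers installed on the device.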
    for (const auto& name : names) {
        VLOG(MANAGER) << "Found interface " << name;
        const DeviceFactory makeDevice = [name](bool blocking) {
            return blocking ? V1_0::IDevice::getService(name) : V1_0::IDevice::tryGetService(name);
        };
        registerDevice(name, makeDevice);
    }

    // register CPU fallback device
    mDevices.push_back(CpuDevice::get());
    mDevicesCpuOnly.push_back(CpuDevice::get());
}

void DeviceManager::registerDevice(const std::string& name, const DeviceFactory& makeDevice) {
    if (auto device = DriverDevice::create(name, makeDevice)) {
        mDevices.push_back(std::move(device));
    }
}

DeviceManager::DeviceManager() {
    VLOG(MANAGER) << "DeviceManager::DeviceManager";
    findAvailableDevices();
#ifdef NN_DEBUGGABLE
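    // The debug.nn.* system properties below are only honored when NN_DEBUGGABLE is defined
    // (debuggable builds); e.g. "adb shell setprop debug.nn.partition 0" would disable
    // partitioning on such a build (an illustrative command, assuming the standard Android
    // property mechanism).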
    mStrictSlicing = (getProp("debug.nn.strict-slicing") != 0);
    mPartitioning = getProp("debug.nn.partition", kPartitioningDefault);
    mDebugNNCpuOnly = (getProp("debug.nn.cpuonly") != 0);
    mSyncExecCpu = (getProp("debug.nn.syncexec-cpu", 1) != 0);
    if (!mSyncExecHalSetter) {
        mSyncExecHal = (getProp("debug.nn.syncexec-hal", 1) != 0);
    }
    mSyncExecRuntime = (getProp("debug.nn.syncexec-runtime") != 0);
#endif  // NN_DEBUGGABLE
}

}  // namespace nn
}  // namespace android