/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Manager"

#include "Manager.h"

#include <android/hidl/manager/1.2/IServiceManager.h>
#include <android/sync.h>
#include <build/version.h>
#include <cutils/native_handle.h>
#include <hidl/HidlTransportSupport.h>
#include <hidl/ServiceManagement.h>

#include <algorithm>
#include <functional>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "Callbacks.h"
#include "CpuExecutor.h"
#include "ExecutionBurstController.h"
#include "HalInterfaces.h"
#include "Memory.h"
#include "MetaModel.h"
#include "ModelArgumentInfo.h"
#include "Tracing.h"
#include "TypeManager.h"
#include "Utils.h"
#include "VersionedInterfaces.h"

namespace android {
namespace nn {

using namespace hal;

const Timing kNoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};

// A Device with actual underlying driver
class DriverDevice : public Device {
   public:
    // Create a DriverDevice from a name and a DeviceFactory function.
    // Returns nullptr on failure.
    static std::shared_ptr<DriverDevice> create(const std::string& name,
                                                const DeviceFactory& makeDevice);

    // Prefer using DriverDevice::create
    DriverDevice(std::shared_ptr<VersionedIDevice> device);

    const std::string& getName() const override { return kInterface->getName(); }
    const std::string& getVersionString() const override { return kInterface->getVersionString(); }
    int64_t getFeatureLevel() const override { return kInterface->getFeatureLevel(); }
    int32_t getType() const override { return kInterface->getType(); }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kInterface->getSupportedExtensions();
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    PerformanceInfo getPerformance(OperandType type) const override {
        const auto& capabilities = kInterface->getCapabilities();
        return lookup(capabilities.operandPerformance, type);
    }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.relaxedFloat32toFloat16PerformanceScalar;
    }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.relaxedFloat32toFloat16PerformanceTensor;
    }
    PerformanceInfo getIfPerformance() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.ifPerformance;
    }
    PerformanceInfo getWhilePerformance() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.whilePerformance;
    }
    bool isCachingSupported() const override {
        // Caching is supported if either of numModelCache or numDataCache is greater than 0.
        const auto [numModelCacheFiles, numDataCacheFiles] =
                kInterface->getNumberOfCacheFilesNeeded();
        return numModelCacheFiles > 0 || numDataCacheFiles > 0;
    }
    int wait() const override { return kInterface->wait(); }

    std::pair<int, std::shared_ptr<PreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const std::optional<Deadline>& deadline, const std::string& cacheDir,
            const std::optional<CacheToken>& maybeToken) const override;

    std::pair<int, std::unique_ptr<Memory>> allocate(const MemoryDescriptor& desc,
                                                     hal::OperandType) const override;

   private:
    const std::shared_ptr<VersionedIDevice> kInterface;

#ifdef NN_DEBUGGABLE
    // For debugging: behavior of IDevice::getSupportedOperations for SampleDriver.
    // 0 - all operations reported by IDevice::getSupportedOperations() supported
    // 1 - some operations reported by IDevice::getSupportedOperations() supported
    uint32_t mSupported = 0;
#endif  // NN_DEBUGGABLE
};

// A PreparedModel with an underlying IPreparedModel instance returned by the actual driver.
class DriverPreparedModel : public PreparedModel {
   public:
    DriverPreparedModel(const Device* device,
                        const std::shared_ptr<VersionedIPreparedModel>& preparedModel)
        : mDevice(device), mPreparedModel(preparedModel) {
        CHECK(mDevice != nullptr);
        CHECK(mPreparedModel != nullptr);
    }

    const Device* getDevice() const override { return mDevice; }
    std::shared_ptr<VersionedIPreparedModel> getInterface() const override {
        return mPreparedModel;
    }
    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories,
            const std::shared_ptr<ExecutionBurstController>& burstController, MeasureTiming measure,
            const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration) const override;

    std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories, const std::vector<int>& waitFor,
            MeasureTiming measure, const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration,
            const hal::OptionalTimeoutDuration& timeoutDurationAfterFence) const override;

    std::shared_ptr<ExecutionBurstController> configureExecutionBurst(
            bool preferPowerOverLatency) const override {
        return mPreparedModel->configureExecutionBurst(preferPowerOverLatency);
    }

   private:
    const Device* mDevice;
    const std::shared_ptr<VersionedIPreparedModel> mPreparedModel;
};

DriverDevice::DriverDevice(std::shared_ptr<VersionedIDevice> device)
    : kInterface(std::move(device)) {
    CHECK(kInterface != nullptr);
#ifdef NN_DEBUGGABLE
    static const char samplePrefix[] = "sample";
    if (getName().substr(0, sizeof(samplePrefix) - 1) == samplePrefix) {
        mSupported = getProp("debug.nn.sample.supported");
    }
#endif  // NN_DEBUGGABLE
}

std::shared_ptr<DriverDevice> DriverDevice::create(const std::string& name,
                                                   const DeviceFactory& makeDevice) {
    CHECK(makeDevice != nullptr);
    std::shared_ptr<VersionedIDevice> device = VersionedIDevice::create(name, makeDevice);
    if (device == nullptr) {
        LOG(ERROR) << "DriverDevice::create failed to create VersionedIDevice object for service "
                   << name;
        return nullptr;
    }

    return std::make_shared<DriverDevice>(std::move(device));
}

std::vector<bool> DriverDevice::getSupportedOperations(const MetaModel& metaModel) const {
    // Query the driver for what it can do.
    ErrorStatus status = ErrorStatus::GENERAL_FAILURE;
    std::vector<bool> supportedOperations;
    std::tie(status, supportedOperations) = kInterface->getSupportedOperations(metaModel);

    const Model& hidlModel = metaModel.getModel();
    const uint32_t operationCount = hidlModel.main.operations.size();
    if (status != ErrorStatus::NONE) {
        LOG(ERROR) << "IDevice::getSupportedOperations returned the error " << toString(status);
        // Set the supported operation vectors to all false, so we won't use this driver.
        return std::vector<bool>(operationCount, false);
    }
    if (supportedOperations.size() != operationCount) {
        LOG(ERROR) << "IDevice::getSupportedOperations returned a vector of length "
                   << supportedOperations.size() << " when expecting " << operationCount;
        // Set the supported operation vectors to all false, so we won't use this driver.
        return std::vector<bool>(operationCount, false);
    }

#ifdef NN_DEBUGGABLE
    if (mSupported != 1) {
        return supportedOperations;
    }

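    // Deterministically disable a pseudo-random subset of the operations the driver reports as
    // supported: each supported operation's signature (type, operand types, dimensions, and
    // whether operands are constants) is hashed together with the device name, and operations
    // whose hash is odd are marked unsupported. This exercises partitioning across devices.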
    const uint32_t baseAccumulator = std::hash<std::string>{}(getName());
    for (size_t operationIndex = 0; operationIndex < supportedOperations.size(); operationIndex++) {
        if (!supportedOperations[operationIndex]) {
            continue;
        }

        uint32_t accumulator = baseAccumulator;
        const Operation& operation = hidlModel.main.operations[operationIndex];
        accumulator ^= static_cast<uint32_t>(operation.type);
        auto accumulateOperands = [&hidlModel, &accumulator](const hidl_vec<uint32_t>& operands) {
            for (uint32_t operandIndex : operands) {
                const Operand& operand = hidlModel.main.operands[operandIndex];
                accumulator ^= static_cast<uint32_t>(operand.type);
                accumulator ^= operand.dimensions.size();
                for (uint32_t dimension : operand.dimensions) {
                    accumulator ^= dimension;
                    if (operand.lifetime == OperandLifeTime::CONSTANT_COPY ||
                        operand.lifetime == OperandLifeTime::CONSTANT_REFERENCE) {
                        accumulator ^= 1;
                    }
                }
            }
        };
        accumulateOperands(operation.inputs);
        accumulateOperands(operation.outputs);
        if (accumulator & 1) {
            supportedOperations[operationIndex] = false;
        }
    }
#endif  // NN_DEBUGGABLE

    return supportedOperations;
}

std::pair<int, std::shared_ptr<PreparedModel>> DriverDevice::prepareModel(
        const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
        const std::optional<Deadline>& deadline, const std::string& cacheDir,
        const std::optional<CacheToken>& maybeToken) const {
    const auto [n, preparedModel] = kInterface->prepareModel(makeModel, preference, priority,
                                                             deadline, cacheDir, maybeToken);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return {n, nullptr};
    }
    CHECK(preparedModel != nullptr) << "prepareModel returned nullptr without error code";
    return {ANEURALNETWORKS_NO_ERROR, std::make_shared<DriverPreparedModel>(this, preparedModel)};
}

std::pair<int, std::unique_ptr<Memory>> DriverDevice::allocate(const MemoryDescriptor& desc,
                                                               hal::OperandType) const {
    const BufferDesc hidlDesc = {.dimensions = desc.dimensions};
    std::vector<std::shared_ptr<VersionedIPreparedModel>> preparedModels(
            desc.preparedModels.size());
    std::transform(desc.preparedModels.begin(), desc.preparedModels.end(), preparedModels.begin(),
                   [](const auto* preparedModel) {
                       const auto versionedPreparedModel = preparedModel->getInterface();
                       CHECK(versionedPreparedModel != nullptr);
                       return versionedPreparedModel;
                   });
    auto [status, buffer, token] =
            kInterface->allocate(hidlDesc, preparedModels, desc.inputRoles, desc.outputRoles);
    if (status != ErrorStatus::NONE) {
        LOG(ERROR) << "DriverDevice::allocate -- memory allocation on device " << getName()
                   << " failed!";
        return {convertErrorStatusToResultCode(status), nullptr};
    }
    return MemoryFromDevice::create(std::move(buffer), token);
}

// Figures out how to place each of the inputs or outputs in a buffer. This just
// does the layout and memory allocation; it does not copy data. Aligns each
// input a bit.
static std::tuple<int, std::unique_ptr<MemoryAshmem>, std::vector<DataLocation>>
allocatePointerArgumentsToPool(const std::vector<ModelArgumentInfo>& args,
                               std::vector<const Memory*>* memories) {
    CHECK(memories != nullptr);
    std::vector<DataLocation> ptrArgsLocations;
    const uint32_t nextPoolIndex = memories->size();
    int64_t total = 0;
    for (const auto& info : args) {
        if (info.state() == ModelArgumentInfo::POINTER) {
            // TODO Good enough alignment?
            total += alignBytesNeeded(static_cast<uint32_t>(total), info.length());
            ptrArgsLocations.push_back({.poolIndex = nextPoolIndex,
                                        .offset = static_cast<uint32_t>(total),
                                        .length = info.length()});
            total += info.length();
        }
    }
    if (total > 0xFFFFFFFF) {
        LOG(ERROR) << "allocatePointerArgumentsToPool: ANeuralNetworksExecution: Size of all "
                      "inputs or outputs exceeds 2^32.";
        return {ANEURALNETWORKS_BAD_DATA, nullptr, std::vector<DataLocation>{}};
    }
    if (total <= 0) {
        return {ANEURALNETWORKS_NO_ERROR, nullptr, std::vector<DataLocation>{}};
    }
    auto [n, memory] = MemoryAshmem::create(total);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return {n, nullptr, std::vector<DataLocation>{}};
    }
    memories->push_back(memory.get());
    return {ANEURALNETWORKS_NO_ERROR, std::move(memory), std::move(ptrArgsLocations)};
}

// Perform computation on an actual HIDL driver.
//
// Because HIDL cannot take raw pointers, two separate memory pools will be allocated for inputs and
// outputs specified by pointers. The input pointer data will be copied to the input pool prior to
// execution, and the output pointer data will be copied out from the output pool after the
// execution.
//
// The HIDL invocation will choose between sync/async execution according to
// DeviceManager::mSyncExecHal.
std::tuple<int, std::vector<OutputShape>, Timing> DriverPreparedModel::execute(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const Memory*>& memories,
        const std::shared_ptr<ExecutionBurstController>& burstController, MeasureTiming measure,
        const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::execute");

    // Make a copy of the memory tracker as we will append memory pools for pointer arguments.
    std::vector<const Memory*> localMemories = memories;

    // We separate the input & output pools so accelerators only need to copy
    // the contents of the input pools. We could also use it to set protection
    // on read only memory but that's not currently done.

    // Layout the input and output data
    const auto [n1, inputPtrArgsMemory, inputPtrArgsLocations] =
            allocatePointerArgumentsToPool(inputs, &localMemories);
    if (n1 != ANEURALNETWORKS_NO_ERROR) {
        return {n1, {}, kNoTiming};
    }
    const auto [n2, outputPtrArgsMemory, outputPtrArgsLocations] =
            allocatePointerArgumentsToPool(outputs, &localMemories);
    if (n2 != ANEURALNETWORKS_NO_ERROR) {
        return {n2, {}, kNoTiming};
    }

    // Copy the input data that was specified via a pointer.
    if (inputPtrArgsMemory != nullptr) {
        uint32_t ptrInputIndex = 0;
        for (const auto& info : inputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = inputPtrArgsLocations[ptrInputIndex++];
                uint8_t* const data = inputPtrArgsMemory->getPointer();
                memcpy(data + loc.offset, info.buffer(), loc.length);
            }
        }
    }

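    // Build the HIDL request. The pool list is the caller-provided memories plus any pools
    // appended above for pointer arguments.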
    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);
    uint32_t count = localMemories.size();
    request.pools.resize(count);
    for (uint32_t i = 0; i < count; i++) {
        request.pools[i] = localMemories[i]->getMemoryPool();
    }

    NNTRACE_FULL_SWITCH(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
                        "DriverPreparedModel::execute::execute");

    int n = ANEURALNETWORKS_OP_FAILED;
    std::vector<OutputShape> outputShapes;
    Timing timing = kNoTiming;

    // compute using burst if present
    const bool burstCompute = (burstController != nullptr);
    bool burstFallback = true;
    if (burstCompute) {
        const bool compliant = compliantWithV1_2(request);
        if (compliant) {
            V1_0::Request request12 = convertToV1_2(request);
            std::vector<intptr_t> memoryIds;
            memoryIds.reserve(localMemories.size());
            for (const Memory* memory : localMemories) {
                memory->usedBy(burstController);
                memoryIds.push_back(memory->getKey());
            }

            VLOG(EXECUTION) << "Before ExecutionBurstController->compute() "
                            << SHOW_IF_DEBUG(toString(request12));
            std::tie(n, outputShapes, timing, burstFallback) =
                    burstController->compute(request12, measure, memoryIds);
        }
    }

    // compute from IPreparedModel if either:
    // (1) burst was not supplied, or
    // (2) the burst execution failed and requested a fallback execution
    if (!burstCompute || burstFallback) {
        const bool preferSynchronous = DeviceManager::get()->syncExecHal();
        std::tie(n, outputShapes, timing) = mPreparedModel->execute(
                request, measure, deadline, loopTimeoutDuration, preferSynchronous);
    }

    if (n != ANEURALNETWORKS_NO_ERROR) {
        VLOG(EXECUTION) << "**Execution failed**";
        return {n, std::move(outputShapes), timing};
    }

    // Copy the output data from shared memory to the output buffers.
    NNTRACE_RT_SWITCH(NNTRACE_PHASE_RESULTS, "DriverPreparedModel::execute");
    if (outputPtrArgsMemory != nullptr) {
        uint32_t ptrOutputIndex = 0;
        for (const auto& info : outputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = outputPtrArgsLocations[ptrOutputIndex++];
                const uint8_t* const data = outputPtrArgsMemory->getPointer();
                memcpy(info.buffer(), data + loc.offset, loc.length);
            }
        }
    }

    VLOG(EXECUTION) << "DriverPreparedModel::execute completed";
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing>
DriverPreparedModel::executeFenced(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const Memory*>& memories, const std::vector<int>& waitFor,
        hal::MeasureTiming measure, const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration,
        const hal::OptionalTimeoutDuration& timeoutDurationAfterFence) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::executeFenced");
    CHECK(std::all_of(waitFor.begin(), waitFor.end(), [](int fd) { return fd > 0; }));
    // Make a copy of the memory tracker as we will append memory pools for pointer arguments.
    std::vector<const Memory*> localMemories = memories;
    sp<hal::IFencedExecutionCallback> executeFencedCallback;
    hal::Timing timing = kNoTiming;

    // We separate the input & output pools so accelerators only need to copy
    // the contents of the input pools. We could also use it to set protection
    // on read only memory but that's not currently done.

    // Layout the input and output data
    const auto [n1, inputPtrArgsMemory, inputPtrArgsLocations] =
            allocatePointerArgumentsToPool(inputs, &localMemories);
    if (n1 != ANEURALNETWORKS_NO_ERROR) {
        return {n1, -1, nullptr, timing};
    }
    const auto [n2, outputPtrArgsMemory, outputPtrArgsLocations] =
            allocatePointerArgumentsToPool(outputs, &localMemories);
    if (n2 != ANEURALNETWORKS_NO_ERROR) {
        return {n2, -1, nullptr, timing};
    }

    // Copy the input data that was specified via a pointer.
    if (inputPtrArgsMemory != nullptr) {
        uint32_t ptrInputIndex = 0;
        for (const auto& info : inputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = inputPtrArgsLocations[ptrInputIndex++];
                uint8_t* const data = inputPtrArgsMemory->getPointer();
                memcpy(data + loc.offset, info.buffer(), loc.length);
            }
        }
    }

    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);
    uint32_t count = localMemories.size();
    request.pools.resize(count);
    for (uint32_t i = 0; i < count; i++) {
        request.pools[i] = localMemories[i]->getMemoryPool();
    }

    NNTRACE_FULL_SWITCH(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
                        "DriverPreparedModel::executeFenced");

    int n = ANEURALNETWORKS_OP_FAILED;
    hidl_vec<hidl_handle> waitForHandles;
    waitForHandles.resize(waitFor.size());
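    // Each wait-for sync fd is duplicated and wrapped in a single-fd native_handle, owned by
    // the hidl_handle, so it can be passed across the HIDL boundary without transferring
    // ownership of the caller's fd.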
    for (uint32_t i = 0; i < waitFor.size(); i++) {
        native_handle_t* nativeHandle = native_handle_create(1, 0);
        if (nativeHandle == nullptr) {
            LOG(ERROR) << "Failed to create native_handle";
            return {n, -1, nullptr, timing};
        }
        int dupFd = dup(waitFor[i]);
        if (dupFd <= 0) {
            LOG(ERROR) << "Unable to dup the file descriptor";
            return {n, -1, nullptr, timing};
        }
        nativeHandle->data[0] = dupFd;
        hidl_handle hidlHandle;
        hidlHandle.setTo(nativeHandle, /*shouldOwn=*/true);
        waitForHandles[i] = std::move(hidlHandle);
    }

    hidl_handle syncFence;
    std::tie(n, syncFence, executeFencedCallback, timing) =
            mPreparedModel->executeFenced(request, waitForHandles, measure, deadline,
                                          loopTimeoutDuration, timeoutDurationAfterFence);

    if (n != ANEURALNETWORKS_NO_ERROR) {
        VLOG(EXECUTION) << "**executeFenced failed**";
        return {n, -1, nullptr, timing};
    }

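    // Duplicate the sync fence returned by the driver so the runtime owns an fd that outlives
    // the hidl_handle.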
    int syncFenceFd = -1;
    if (syncFence.getNativeHandle()) {
        syncFenceFd = dup(syncFence.getNativeHandle()->data[0]);
        if (syncFenceFd < 0) {
            LOG(ERROR) << "Failed to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, timing};
        }
    }
    // If output buffer is provided as a malloc pointer, wait for the execution to finish.
    // Then copy the output data from shared memory to the output buffers.
    if (outputPtrArgsMemory != nullptr) {
        NNTRACE_RT_SWITCH(NNTRACE_PHASE_RESULTS, "DriverPreparedModel::executeFenced");
        if (syncFenceFd > 0) {
            auto r = syncWait(syncFenceFd, -1);
            if (r != FenceState::SIGNALED) {
                LOG(ERROR) << "syncWait failed, fd: " << syncFenceFd;
                return {ANEURALNETWORKS_OP_FAILED, syncFenceFd, nullptr, timing};
            }
        }
        uint32_t ptrOutputIndex = 0;
        for (const auto& info : outputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = outputPtrArgsLocations[ptrOutputIndex++];
                const uint8_t* const data = outputPtrArgsMemory->getPointer();
                memcpy(info.buffer(), data + loc.offset, loc.length);
            }
        }
    }

    VLOG(EXECUTION) << "DriverPreparedModel::executeFenced completed";
    return {ANEURALNETWORKS_NO_ERROR, syncFenceFd, executeFencedCallback, timing};
}

// A special abstracted device for the CPU. Only one instance of this class will exist.
// Use get() to retrieve it.
class CpuDevice : public Device {
   public:
    // Returns the singleton CPU fallback device.
    static std::shared_ptr<CpuDevice> get() {
        static std::shared_ptr<CpuDevice> instance(new CpuDevice);
        return instance;
    }

    const std::string& getName() const override { return kName; }
    const std::string& getVersionString() const override { return kVersionString; }
    int64_t getFeatureLevel() const override { return kFeatureLevel; }
    int32_t getType() const override { return ANEURALNETWORKS_DEVICE_CPU; }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kSupportedExtensions;
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    PerformanceInfo getPerformance(OperandType) const override { return kPerformance; }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        return kPerformance;
    }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        return kPerformance;
    }
    PerformanceInfo getIfPerformance() const override { return kPerformance; }
    PerformanceInfo getWhilePerformance() const override { return kPerformance; }
    bool isCachingSupported() const override { return false; }
    int wait() const override { return ANEURALNETWORKS_NO_ERROR; }

    std::pair<int, std::shared_ptr<PreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const std::optional<Deadline>& deadline, const std::string& cacheDir,
            const std::optional<CacheToken>& maybeToken) const override;

    std::pair<int, std::unique_ptr<Memory>> allocate(const MemoryDescriptor& desc,
                                                     OperandType type) const override;

   private:
    CpuDevice() = default;
    const int64_t kFeatureLevel = __ANDROID_API__;
    const std::string kName = "nnapi-reference";
    const std::string kVersionString = build::GetBuildNumber();
    // Since the performance is a ratio compared to the CPU performance,
    // by definition the performance of the CPU is 1.0.
    const PerformanceInfo kPerformance = {.execTime = 1.0f, .powerUsage = 1.0f};
    const std::vector<Extension> kSupportedExtensions{/* No extensions. */};
};

// A special abstracted PreparedModel for the CPU, constructed by CpuDevice.
class CpuPreparedModel : public PreparedModel {
   public:
    // Factory method for CpuPreparedModel. Returns ANEURALNETWORKS_NO_ERROR and
    // a prepared model object if successfully created. Returns an error code
    // and nullptr otherwise.
    static std::pair<int, std::shared_ptr<PreparedModel>> create(Model hidlModel);

    const Device* getDevice() const override { return CpuDevice::get().get(); }
    std::shared_ptr<VersionedIPreparedModel> getInterface() const override { return nullptr; }

    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories,
            const std::shared_ptr<ExecutionBurstController>& burstController, MeasureTiming measure,
            const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration) const override;

    std::shared_ptr<ExecutionBurstController> configureExecutionBurst(
            bool /*preferPowerOverLatency*/) const override {
        return nullptr;
    }

    std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories, const std::vector<int>& wait_for,
            MeasureTiming measure, const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration,
            const hal::OptionalTimeoutDuration& timeoutDurationAfterFence) const override;

    // Prefer to use CpuPreparedModel::create.
    CpuPreparedModel(Model model, std::vector<RunTimePoolInfo> poolInfos)
        : mModel(std::move(model)), mModelPoolInfos(std::move(poolInfos)) {}

   private:
    const Model mModel;
    const std::vector<RunTimePoolInfo> mModelPoolInfos;
};

std::vector<bool> CpuDevice::getSupportedOperations(const MetaModel& metaModel) const {
    const Model& hidlModel = metaModel.getModel();
    const size_t count = hidlModel.main.operations.size();
    std::vector<bool> result(count, false);
    for (size_t i = 0; i < count; i++) {
        // TODO(b/119870033): Decide whether and how post-P operations would be supported on CPU.
        //                    We may want to use the slicer for CpuDevice just as we do for
        //                    DriverDevice.
        OperationType operationType = hidlModel.main.operations[i].type;
        result[i] = !isExtensionOperationType(operationType) &&
                    operationType != OperationType::OEM_OPERATION;
    }
    return result;
}

std::pair<int, std::shared_ptr<PreparedModel>> CpuDevice::prepareModel(
        const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
        const std::optional<Deadline>& deadline, const std::string& /*cacheDir*/,
        const std::optional<CacheToken>& maybeToken) const {
    CHECK(!maybeToken.has_value())
            << "Should never call prepareModel with cache information on CpuDevice";

    const Model model = makeModel();
    if (!validateModel(model, ValidationMode::RUNTIME) ||
        !validateExecutionPreference(preference) || !validatePriority(priority)) {
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, nullptr};
    }

    return CpuPreparedModel::create(model);
}

std::pair<int, std::unique_ptr<Memory>> CpuDevice::allocate(const MemoryDescriptor& desc,
                                                            OperandType type) const {
    uint32_t size = TypeManager::get()->getSizeOfData(type, desc.dimensions);
    if (size == 0) {
        LOG(ERROR) << "CpuDevice::allocate -- does not support unknown dimensions.";
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    return MemoryAshmem::create(size);
}

std::pair<int, std::shared_ptr<PreparedModel>> CpuPreparedModel::create(Model hidlModel) {
    std::vector<RunTimePoolInfo> poolInfos;
    if (!setRunTimePoolInfosFromHidlMemories(&poolInfos, hidlModel.pools)) {
        return {ANEURALNETWORKS_UNMAPPABLE, nullptr};
    }

    std::shared_ptr<PreparedModel> preparedModel =
            std::make_shared<CpuPreparedModel>(std::move(hidlModel), std::move(poolInfos));
    return {ANEURALNETWORKS_NO_ERROR, std::move(preparedModel)};
}

static std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpu(
        const Model& model, const Request& request,
        const std::vector<RunTimePoolInfo>& modelPoolInfos,
        const std::vector<RunTimePoolInfo>& requestPoolInfos,
        const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "computeOnCpu");
    CpuExecutor executor;
    if (loopTimeoutDuration.getDiscriminator() !=
        OptionalTimeoutDuration::hidl_discriminator::none) {
        executor.setLoopTimeout(loopTimeoutDuration.nanoseconds());
    }
    if (deadline.has_value()) {
        executor.setDeadline(*deadline);
    }
    int err = executor.run(model, request, modelPoolInfos, requestPoolInfos);
    const auto& outputShapes = executor.getOutputShapes();
    return {err, outputShapes, kNoTiming};
}

std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing>
CpuPreparedModel::executeFenced(const std::vector<ModelArgumentInfo>& inputs,
                                const std::vector<ModelArgumentInfo>& outputs,
                                const std::vector<const Memory*>& memories,
                                const std::vector<int>& waitFor, hal::MeasureTiming measure,
                                const std::optional<Deadline>& deadline,
                                const OptionalTimeoutDuration& loopTimeoutDuration,
                                const hal::OptionalTimeoutDuration& duration) const {
    VLOG(EXECUTION)
            << "CpuPreparedModel::executeFenced wait for sync fences to signal before execution";
    for (int syncFd : waitFor) {
        if (syncFd > 0) {
            auto r = syncWait(syncFd, -1);
            if (r != FenceState::SIGNALED) {
                LOG(ERROR) << "sync wait failed, fd: " << syncFd;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {UINT64_MAX, UINT64_MAX}};
            }
        }
    }

    // Update deadline if the timeout duration is closer than the deadline.
    auto closestDeadline = deadline;
    if (duration.getDiscriminator() != OptionalTimeoutDuration::hidl_discriminator::none) {
        const auto timeoutDurationDeadline = makeDeadline(duration.nanoseconds());
        if (!closestDeadline.has_value() || *closestDeadline > timeoutDurationDeadline) {
            closestDeadline = timeoutDurationDeadline;
        }
    }

    const auto [result, outputShapes, timing] = execute(inputs, outputs, memories, nullptr, measure,
                                                        closestDeadline, loopTimeoutDuration);
    return {result, -1, nullptr, timing};
}

// Perform computation on the NNAPI CPU reference implementation.
//
// Contrary to DriverPreparedModel::execute, the NNAPI CPU reference executor lives in the
// same process as the NNAPI runtime and can take raw pointers. We will create as many pools as
// there are inputs/outputs in this method to avoid data copying.
//
// Will choose between sync/async execution according to DeviceManager::mSyncExecCpu.
std::tuple<int, std::vector<OutputShape>, Timing> CpuPreparedModel::execute(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const Memory*>& memories,
        const std::shared_ptr<ExecutionBurstController>& /*burstController*/,
        MeasureTiming /*measure*/, const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration) const {
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, {}, kNoTiming};
    }

    std::vector<RunTimePoolInfo> requestPoolInfos;
    requestPoolInfos.reserve(memories.size());
    for (const Memory* mem : memories) {
        if (std::optional<RunTimePoolInfo> poolInfo = mem->getRunTimePoolInfo()) {
            requestPoolInfos.emplace_back(*poolInfo);
        } else {
            return {ANEURALNETWORKS_UNMAPPABLE, {}, kNoTiming};
        }
    }
    // Create as many pools as there are inputs / outputs.
    auto fixPointerArguments =
            [&requestPoolInfos](const std::vector<ModelArgumentInfo>& argumentInfos) {
                std::vector<DataLocation> ptrArgsLocations;
                for (const ModelArgumentInfo& argumentInfo : argumentInfos) {
                    if (argumentInfo.state() == ModelArgumentInfo::POINTER) {
                        ptrArgsLocations.push_back(
                                {.poolIndex = static_cast<uint32_t>(requestPoolInfos.size()),
                                 .offset = 0,
                                 .length = argumentInfo.length()});
                        requestPoolInfos.emplace_back(RunTimePoolInfo::createFromExistingBuffer(
                                static_cast<uint8_t*>(argumentInfo.buffer())));
                    }
                }
                return ptrArgsLocations;
            };
    const std::vector<DataLocation> inputPtrArgsLocations = fixPointerArguments(inputs);
    const std::vector<DataLocation> outputPtrArgsLocations = fixPointerArguments(outputs);

    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);

    if (!DeviceManager::get()->syncExecCpu()) {
        // TODO: use a thread pool
        // TODO(mikie): this could have NNTRACE so we could measure the overhead
        //              of spinning up a new thread.
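        // Note: the "asynchronous" path still blocks here because the worker thread is joined
        // immediately; the captures by reference stay valid and result is fully written before
        // this function returns.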
        std::tuple<int, std::vector<OutputShape>, Timing> result = {};
        std::thread([this, &request, &requestPoolInfos, &deadline, &loopTimeoutDuration, &result] {
            result = computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
                                  loopTimeoutDuration);
        }).join();
        return result;
    }

    return computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
                        loopTimeoutDuration);
}

DeviceManager* DeviceManager::get() {
    static DeviceManager manager;
    return &manager;
}

std::shared_ptr<Device> DeviceManager::getCpuDevice() {
    return CpuDevice::get();
}

std::shared_ptr<Device> DeviceManager::forTest_makeDriverDevice(const std::string& name,
                                                                const sp<V1_0::IDevice>& device) {
    const DeviceFactory makeDevice = [device](bool /*blocking*/) { return device; };
    const auto driverDevice = DriverDevice::create(name, makeDevice);
    CHECK(driverDevice != nullptr);
    return driverDevice;
}

void DeviceManager::findAvailableDevices() {
    VLOG(MANAGER) << "findAvailableDevices";

    // register driver devices
    const auto names = hardware::getAllHalInstanceNames(V1_0::IDevice::descriptor);
    for (const auto& name : names) {
        VLOG(MANAGER) << "Found interface " << name;
        const DeviceFactory makeDevice = [name](bool blocking) {
            return blocking ? V1_0::IDevice::getService(name) : V1_0::IDevice::tryGetService(name);
        };
        registerDevice(name, makeDevice);
    }

    // register CPU fallback device
    mDevices.push_back(CpuDevice::get());
    mDevicesCpuOnly.push_back(CpuDevice::get());
}

void DeviceManager::registerDevice(const std::string& name, const DeviceFactory& makeDevice) {
    if (auto device = DriverDevice::create(name, makeDevice)) {
        mDevices.push_back(std::move(device));
    }
}

DeviceManager::DeviceManager() {
    VLOG(MANAGER) << "DeviceManager::DeviceManager";
    findAvailableDevices();
#ifdef NN_DEBUGGABLE
    mStrictSlicing = (getProp("debug.nn.strict-slicing") != 0);
    mPartitioning = getProp("debug.nn.partition", kPartitioningDefault);
    mDebugNNCpuOnly = (getProp("debug.nn.cpuonly") != 0);
    mSyncExecCpu = (getProp("debug.nn.syncexec-cpu", 1) != 0);
    if (!mSyncExecHalSetter) {
        mSyncExecHal = (getProp("debug.nn.syncexec-hal", 1) != 0);
    }
    mSyncExecRuntime = (getProp("debug.nn.syncexec-runtime") != 0);
#endif  // NN_DEBUGGABLE
}

}  // namespace nn
}  // namespace android