ExecutionBuilder.cpp revision 4d83c52f52613585f7b86368be762b2857f7460f
/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionBuilder"

#include "ExecutionBuilder.h"

#include "CompilationBuilder.h"
#include "CpuExecutor.h"
#include "HalInterfaces.h"
#include "Manager.h"
#include "ModelBuilder.h"

#include <algorithm>
#include <cstring>
#include <mutex>
#include <thread>
#include <vector>

namespace android {
namespace nn {

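// ModelArgumentInfo tracks how the caller supplied each input or output of an
// execution.  The 'state' field distinguishes the cases handled below:
//   POINTER      - the data lives in a caller-owned buffer ('buffer'); such
//                  arguments are later packed into a dedicated pool by
//                  StepExecutor::allocatePointerArgumentsToPool().
//   MEMORY       - the data lives in a Memory object registered in the
//                  mMemories list; locationAndLength gives the pool index,
//                  offset, and length.
//   HAS_NO_VALUE - the caller passed a null buffer with zero length.
//   UNSPECIFIED  - not set yet; startCompute() rejects executions that still
//                  have unspecified arguments.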
int ModelArgumentInfo::setFromPointer(const Operand& operand,
                                      const ANeuralNetworksOperandType* type, void* data,
                                      uint32_t length) {
    int n = updateDimensionInfo(operand, type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return n;
    }
    if (data == nullptr) {
        if (length) {
            LOG(ERROR) << "Setting argument as having no value, but a non-zero length was passed.";
            return ANEURALNETWORKS_BAD_DATA;
        }
        state = ModelArgumentInfo::HAS_NO_VALUE;
    } else {
        state = ModelArgumentInfo::POINTER;
    }
    buffer = data;
    locationAndLength = {.poolIndex = 0, .offset = 0, .length = length};
    return ANEURALNETWORKS_NO_ERROR;
}

int ModelArgumentInfo::setFromMemory(const Operand& operand, const ANeuralNetworksOperandType* type,
                                     uint32_t poolIndex, uint32_t offset, uint32_t length) {
    int n = updateDimensionInfo(operand, type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return n;
    }
    state = ModelArgumentInfo::MEMORY;
    locationAndLength = {.poolIndex = poolIndex, .offset = offset, .length = length};
    buffer = nullptr;
    return ANEURALNETWORKS_NO_ERROR;
}

int ModelArgumentInfo::setFromTemporaryMemory(const Operand& operand, uint32_t poolIndex) {
    dimensions = operand.dimensions;
    state = ModelArgumentInfo::MEMORY;
    locationAndLength =
            {.poolIndex = poolIndex, .offset = 0, .length = sizeOfData(operand)};
    buffer = nullptr;
    return ANEURALNETWORKS_NO_ERROR;
}

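// Checks an optional ANeuralNetworksOperandType override against the operand
// declared in the model and records its dimensions.  A null newType means
// "use the type exactly as declared in the model"; a non-null newType must
// match the operand's type and rank and supplies the dimension values to use.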
int ModelArgumentInfo::updateDimensionInfo(const Operand& operand,
                                           const ANeuralNetworksOperandType* newType) {
    if (newType == nullptr) {
        dimensions = hidl_vec<uint32_t>();
    } else {
        uint32_t count = newType->dimensionCount;
        if (static_cast<OperandType>(newType->type) != operand.type ||
            count != operand.dimensions.size()) {
            LOG(ERROR) << "ANeuralNetworksExecution_setInput/Output incompatible types";
            return ANEURALNETWORKS_BAD_DATA;
        }
        dimensions.resize(count);  // Size the vector before writing the dimension values.
        for (uint32_t i = 0; i < count; i++) {
            dimensions[i] = newType->dimensions[i];
        }
    }
    return ANEURALNETWORKS_NO_ERROR;
}

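// An ExecutionBuilder is created from a CompilationBuilder and accumulates the
// caller's input/output bindings before the computation is launched.  A rough
// sketch of the expected client flow, using the NDK entry points named in the
// log messages in this file (the wrapper plumbing lives elsewhere, and the
// event-wait call is assumed here):
//
//     ANeuralNetworksExecution_setInput(execution, 0, nullptr, inputBuffer, inputSize);
//     ANeuralNetworksExecution_setOutput(execution, 0, nullptr, outputBuffer, outputSize);
//     ANeuralNetworksExecution_startCompute(execution, &event);
//     ANeuralNetworksEvent_wait(event);
//
// Each wrapper is expected to forward to the corresponding method below
// (setInput(), setOutput(), startCompute(), ...).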
ExecutionBuilder::ExecutionBuilder(const CompilationBuilder* compilation) :
        mModel(compilation->mModel),
        mPlan(&compilation->mPlan),
        mInputs(mModel->inputCount()),
        mOutputs(mModel->outputCount()),
        mMemories(mModel->getMemories()) {
    LOG(DEBUG) << "ExecutionBuilder::ExecutionBuilder";
}

int ExecutionBuilder::setInput(uint32_t index, const ANeuralNetworksOperandType* type,
                               const void* buffer, size_t length) {
    uint32_t count = static_cast<uint32_t>(mInputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput bad index " << index << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (type != nullptr) {
        int n = validateOperandType(*type, "ANeuralNetworksExecution_setInput", false);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            return n;
        }
    }
    if (length > 0xFFFFFFFF) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput input exceeds max length " << length;
        return ANEURALNETWORKS_BAD_DATA;
    }
    uint32_t l = static_cast<uint32_t>(length);
    return mInputs[index].setFromPointer(mModel->getInputOperand(index), type,
                                         const_cast<void*>(buffer), l);
}

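// Binds an input to a region of a caller-provided Memory object.  The Memory is
// added to mMemories, and the resulting pool index together with the caller's
// offset and length becomes the argument's DataLocation.
// setOutputFromMemory() below is the mirror image of this for outputs.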
int ExecutionBuilder::setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                                         const Memory* memory, size_t offset, size_t length) {
    // Should be similar to StepExecutor::setInputOrOutputFromTemporaryMemory()

    uint32_t count = static_cast<uint32_t>(mInputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!memory->validateSize(offset, length)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    // TODO validate the rest
    uint32_t poolIndex = mMemories.add(memory);
    return mInputs[index].setFromMemory(mModel->getInputOperand(index), type, poolIndex, offset,
                                        length);
}

int ExecutionBuilder::setOutput(uint32_t index, const ANeuralNetworksOperandType* type, void* buffer,
                                size_t length) {
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput bad index " << index << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (type != nullptr) {
        int n = validateOperandType(*type, "ANeuralNetworksExecution_setOutput", false);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            return n;
        }
    }
    if (length > 0xFFFFFFFF) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput output exceeds max length " << length;
        return ANEURALNETWORKS_BAD_DATA;
    }
    uint32_t l = static_cast<uint32_t>(length);
    return mOutputs[index].setFromPointer(mModel->getOutputOperand(index), type, buffer, l);
}

int ExecutionBuilder::setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                                          const Memory* memory, size_t offset, size_t length) {
    // Should be similar to StepExecutor::setInputOrOutputFromTemporaryMemory()

    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!memory->validateSize(offset, length)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    // TODO validate the rest
    uint32_t poolIndex = mMemories.add(memory);
    return mOutputs[index].setFromMemory(mModel->getOutputOperand(index), type, poolIndex, offset,
                                         length);
}

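// Launches the computation.  As interpreted by the code below, DeviceManager's
// partitioning setting selects the path taken:
//   <= 0 - skip the ExecutionPlan path and go straight to the single-device /
//          CPU path.
//   1    - "simulation": walk the plan's steps without executing them, then
//          fall through to the single-device / CPU path.
//   2    - execute through the plan when a Controller can be created, otherwise
//          fall back to the single-device / CPU path.
//   > 2  - execute through the plan, and fail if a Controller cannot be created.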
int ExecutionBuilder::startCompute(sp<ExecutionCallback>* synchronizationCallback) {
    *synchronizationCallback = nullptr;

    // TODO: Validate that we have full types for all inputs and outputs,
    // and that the graph is not cyclic.

    for (auto& p : mInputs) {
        if (p.state == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution_startCompute not all inputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    for (auto& p : mOutputs) {
        if (p.state == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution_startCompute not all outputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }

    // TODO: Remove the non-plan-based path once we've fully integrated ExecutionPlan
    // with the compilation and execution phases of the NN API?  Or retain that path
    // as a fallback in the case of partitioning failure?
    //
    // TODO: Entire plan-based-path should run in an asynchronous thread --
    // take the asynchronous thread logic out of startComputeOnCpu() and use
    // it to wrap the plan-based-path.
    const int partitioning = DeviceManager::get()->getPartitioning();
    if (partitioning > 0) {
        const bool simulation = (partitioning == 1);
        std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this);
        if (controller == nullptr) {
            const bool fallback = (partitioning == 2);
            if (!simulation && !fallback) {
                return ANEURALNETWORKS_OP_FAILED;
            }
        } else {
            LOG(DEBUG) << "ExecutionBuilder::startCompute"
                       << (simulation ? " SIMULATION" : "")
                       << " (from plan, iteratively)";
            while (true) {
                std::shared_ptr<StepExecutor> executor;
                LOG(DEBUG) << "looking for next StepExecutor";
                int n = mPlan->next(controller, &executor);
                if (n != ANEURALNETWORKS_NO_ERROR || executor == nullptr) {
                    if (!simulation) {
                        return n;
                    }

                    // simulation
                    if (n != ANEURALNETWORKS_NO_ERROR) {
                        LOG(DEBUG) << "ExecutionBuilder::startCompute SIMULATION failed "
                                   << "with error " << n;
                    }
                    break;
                }
                if (simulation) {
                    continue;
                }

                n = executor->startCompute(synchronizationCallback);
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    return n;
                }
                (*synchronizationCallback)->wait();
                if ((*synchronizationCallback)->getStatus() != ErrorStatus::NONE) {
                    return ANEURALNETWORKS_OP_FAILED;
                }
            }
        }
    }

    // Find a driver that can handle all the operations.
    Model hidlModel;
    mModel->setHidlModel(&hidlModel);
    const std::vector<std::shared_ptr<Device>>& devices = DeviceManager::get()->getDrivers();
    for (const auto& device : devices) {
        hidl_vec<bool> supports;
        LOG(DEBUG) << "Checking " << device->getName();
        device->getSupportedOperations(hidlModel, &supports);
        if (std::find(supports.begin(), supports.end(), false) == supports.end()) {
            LOG(DEBUG) << "ExecutionBuilder::startCompute (without plan) on " << device->getName();
            StepExecutor executor(this, mModel, device->getInterface(),
                                  nullptr /* no IPreparedModel, so compile */);
            executor.mapInputsAndOutputsTrivially();
            return executor.startCompute(synchronizationCallback);
        }
    }
    // If none can, run on the CPU.
    LOG(DEBUG) << "ExecutionBuilder::startCompute (without plan) on CPU";
    StepExecutor executor(this, mModel,
                          nullptr /* no IDevice, so CPU */,
                          nullptr /* no IPreparedModel */);
    executor.mapInputsAndOutputsTrivially();
    return executor.startCompute(synchronizationCallback);
}

// Figures out how to place each of the inputs or outputs in a shared memory pool. This only
// does the layout; it does not copy any data.  Each pointer-backed argument is aligned within
// the pool (see alignBytesNeeded()).
int StepExecutor::allocatePointerArgumentsToPool(std::vector<ModelArgumentInfo>* args,
                                                 Memory* memory) {
    uint32_t nextPoolIndex = mMemories.size();
    int64_t total = 0;
    for (auto& info : *args) {
        if (info.state == ModelArgumentInfo::POINTER) {
            DataLocation& loc = info.locationAndLength;
            // TODO Good enough alignment?
            total += alignBytesNeeded(static_cast<uint32_t>(total), loc.length);
            loc.poolIndex = nextPoolIndex;
            loc.offset = static_cast<uint32_t>(total);
            total += loc.length;
        }
    }
    if (total > 0xFFFFFFFF) {
        LOG(ERROR) << "ANeuralNetworksExecution_startCompute Size of all inputs or outputs exceeds "
                      "2^32.";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (total > 0) {
        memory->create(total);  // TODO check error
        mMemories.add(memory);
    }
    return ANEURALNETWORKS_NO_ERROR;
}

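// Builds the hidl_vec<RequestArgument> for Request::inputs or Request::outputs
// from the accumulated ModelArgumentInfo entries.  By this point every
// POINTER-state argument must already carry a valid pool index and offset (see
// allocatePointerArgumentsToPool() above and fixPointerArguments() below).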
static void setRequestArgumentArray(const std::vector<ModelArgumentInfo>& argumentInfos,
                                    hidl_vec<RequestArgument>* ioInfos) {
    size_t count = argumentInfos.size();
    ioInfos->resize(count);
    for (size_t i = 0; i < count; i++) {
        const auto& info = argumentInfos[i];
        (*ioInfos)[i] = { .hasNoValue = info.state == ModelArgumentInfo::HAS_NO_VALUE,
                          .location = info.locationAndLength,
                          .dimensions = info.dimensions,
                        };
    }
}

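// A StepExecutor runs one "step" of an execution: either a partition handed out
// by the ExecutionPlan or, on the non-plan path, the entire model.  A null
// driver selects the CPU path; a null preparedModel means the model still has
// to be compiled on the driver (see startComputeOnDevice()).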
StepExecutor::StepExecutor(const ExecutionBuilder* executionBuilder,
                           const ModelBuilder* model,
                           sp<IDevice> driver, sp<IPreparedModel> preparedModel) :
    mExecutionBuilder(executionBuilder), mModel(model),
    mDriver(driver), mPreparedModel(preparedModel),
    mInputs(model->inputCount()), mOutputs(model->outputCount()) {}

void StepExecutor::mapInputsAndOutputsTrivially() {
    mInputs = mExecutionBuilder->mInputs;
    mOutputs = mExecutionBuilder->mOutputs;
    mMemories = mExecutionBuilder->mMemories;
}

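// Copies one input/output binding from the ExecutionBuilder into this executor.
// For MEMORY-state arguments the pool index is rewritten, because the executor
// keeps its own mMemories list whose indices need not match the builder's.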
void StepExecutor::mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
                                    ModelArgumentInfo* executorInputOrOutput) {
    *executorInputOrOutput = builderInputOrOutput;
    switch (executorInputOrOutput->state) {
        default:
            nnAssert(!"unexpected ModelArgumentInfo::state");
        case ModelArgumentInfo::POINTER:
        case ModelArgumentInfo::UNSPECIFIED:
            break;
        case ModelArgumentInfo::MEMORY: {
            const uint32_t builderPoolIndex =
                    builderInputOrOutput.locationAndLength.poolIndex;
            const Memory* memory = mExecutionBuilder->mMemories[builderPoolIndex];
            const uint32_t executorPoolIndex = mMemories.add(memory);
            executorInputOrOutput->locationAndLength.poolIndex =
                    executorPoolIndex;
            break;
        }
    }
}

int StepExecutor::setInputOrOutputFromTemporaryMemory(const Operand& inputOrOutputOperand,
                                                      const Memory* memory,
                                                      ModelArgumentInfo* inputOrOutputInfo) {
    // Should be similar to
    //     ExecutionBuilder::setInputFromMemory()
    //     ExecutionBuilder::setOutputFromMemory()

    uint32_t poolIndex = mMemories.add(memory);
    return inputOrOutputInfo->setFromTemporaryMemory(inputOrOutputOperand, poolIndex);
}

int StepExecutor::startCompute(sp<ExecutionCallback>* synchronizationCallback) {
    if (mDriver == nullptr) {
        return startComputeOnCpu(synchronizationCallback);
    } else {
        return startComputeOnDevice(synchronizationCallback);
    }
}

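// Driver path.  The flow below is currently synchronous end to end:
//   1. compile the model on the driver if no IPreparedModel was supplied,
//   2. pack the pointer-backed inputs and outputs into two shared memory pools,
//   3. copy the pointer-backed input data into the input pool,
//   4. build the Request and call IPreparedModel::execute(),
//   5. wait on the callback, then copy pointer-backed outputs back out.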
int StepExecutor::startComputeOnDevice(sp<ExecutionCallback>* synchronizationCallback) {
    nnAssert(mDriver != nullptr);

    *synchronizationCallback = nullptr;

    // TODO: Remove the mPreparedModel == nullptr case once we've fully integrated
    // ExecutionPlan with the compilation and execution phases of the NN API
    if (mPreparedModel == nullptr) {
        Model model;
        mModel->setHidlModel(&model);

        // TODO Dangerous!  In a truly asynchronous flow, the local model would need to
        // outlive this call. Safe for now because we wait for preparation to finish below.
        sp<PreparedModelCallback> preparedModelCallback = new PreparedModelCallback();
        Return<ErrorStatus> prepareLaunchStatus =
                mDriver->prepareModel(model, preparedModelCallback);
        if (!prepareLaunchStatus.isOk() || prepareLaunchStatus != ErrorStatus::NONE) {
            return ANEURALNETWORKS_OP_FAILED;
        }

        // Immediately synchronize with the callback object for now.
        // TODO: change to asynchronous later
        preparedModelCallback->wait();
        ErrorStatus prepareReturnStatus = preparedModelCallback->getStatus();
        mPreparedModel = preparedModelCallback->getPreparedModel();
        if (prepareReturnStatus != ErrorStatus::NONE || mPreparedModel == nullptr) {
            return ANEURALNETWORKS_OP_FAILED;
        }
    }

    // We separate the input and output pools so that we reduce the copying done if we
    // eventually do remoting (hidl_memory->update()).  We could also use separate pools
    // to set protection on read-only memory, but that's not currently done.
    Memory inputPointerArguments;
    Memory outputPointerArguments;

    // Lay out the input and output data
    int n = allocatePointerArgumentsToPool(&mInputs, &inputPointerArguments);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return n;
    }
    n = allocatePointerArgumentsToPool(&mOutputs, &outputPointerArguments);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return n;
    }

    // Copy the input data that was specified via a pointer.
    // inputPointerArguments.update();
    for (auto& info : mInputs) {
        if (info.state == ModelArgumentInfo::POINTER) {
            DataLocation& loc = info.locationAndLength;
            uint8_t* data = nullptr;
            int n = inputPointerArguments.getPointer(&data);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                return n;
            }
            memcpy(data + loc.offset, info.buffer, loc.length);
        }
    }
    // TODO: Add inputPointerArguments.commit() and .update() at all the right places

    Request request;
    setRequestArgumentArray(mInputs, &request.inputs);
    setRequestArgumentArray(mOutputs, &request.outputs);
    uint32_t count = mMemories.size();
    request.pools.resize(count);
    for (uint32_t i = 0; i < count; i++) {
        request.pools[i] = mMemories[i]->getHidlMemory();
    }

    // Prepare the callback for asynchronous execution. The sp<ExecutionCallback>
    // object is returned to the caller only when the execution has been successfully
    // launched; otherwise *synchronizationCallback stays nullptr. The executionCallback
    // is abstracted in the NN API as an "event".
    //
    // The sp is used for ref-counting purposes. Without it, the HIDL service
    // could attempt to communicate with a dead callback object.
    //
    // TODO: Explain the "dead callback" problem further, either here or
    // in the design document.
    sp<ExecutionCallback> executionCallback = new ExecutionCallback();

    LOG(DEBUG) << "Before mPreparedModel->execute() " << toString(request);
    // Execute.
    // TODO: What happens to the Callback if the service dies abnormally
    // -- won't that keep the Callback live forever, because the service
    // never has the opportunity to bump the reference count down? Or
    // maybe the HIDL infrastructure handles this magically? At worst,
    // it seems like this is a small memory leak, if the Callback stays
    // alive forever.
    if (mPreparedModel->execute(request, executionCallback) != ErrorStatus::NONE) {
        LOG(DEBUG) << "**Execute failed**";
        return ANEURALNETWORKS_OP_FAILED;
    }

    // TODO: Remove this synchronization point when the block of code below is
    // removed.
    executionCallback->wait();
    Return<ErrorStatus> executionStatus = executionCallback->getStatus();
    if (!executionStatus.isOk() || executionStatus != ErrorStatus::NONE) {
        LOG(DEBUG) << "**Execute async failed**";
        return ANEURALNETWORKS_OP_FAILED;
    }

    // Copy the output data from shared memory to the output buffers.
    // TODO: Move this block of code somewhere else. It should not be in the
    // startCompute function.
    // TODO: outputMemory->update(); outputMemory->commit()
    for (auto& info : mOutputs) {
        if (info.state == ModelArgumentInfo::POINTER) {
            DataLocation& loc = info.locationAndLength;
            uint8_t* data = nullptr;
            int n = outputPointerArguments.getPointer(&data);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                return n;
            }
            memcpy(info.buffer, data + loc.offset, loc.length);
        }
    }
    LOG(DEBUG) << "StepExecutor::startComputeOnDevice completed";

    *synchronizationCallback = executionCallback;
    return ANEURALNETWORKS_NO_ERROR;
}

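// Runs CpuExecutor on an already fixed-up request and reports the result
// through the execution callback.  Intended to run on the worker thread
// spawned by startComputeOnCpu() below.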
static void asyncStartComputeOnCpu(const Model& model, const Request& request,
                                   const std::vector<RunTimePoolInfo>& runTimePoolInfos,
                                   const sp<IExecutionCallback>& executionCallback) {
    CpuExecutor executor;
    int err = executor.run(model, request, runTimePoolInfos);
    ErrorStatus status = err == ANEURALNETWORKS_NO_ERROR ?
            ErrorStatus::NONE : ErrorStatus::GENERAL_FAILURE;
    executionCallback->notify(status);
}

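// CPU path: builds the HIDL model and request, wraps every pointer-backed
// argument in its own RunTimePoolInfo, and hands the work to
// asyncStartComputeOnCpu() on a worker thread that is bound to the returned
// ExecutionCallback.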
int StepExecutor::startComputeOnCpu(sp<ExecutionCallback>* synchronizationCallback) {
    // TODO: use a thread pool

    Model model;
    mModel->setHidlModel(&model);

    // Prepare the callback for asynchronous execution. The sp<ExecutionCallback>
    // object is returned to the caller only when the execution has been successfully
    // launched; until then *synchronizationCallback stays nullptr. The executionCallback
    // is abstracted in the NN API as an "event".
    sp<ExecutionCallback> executionCallback = new ExecutionCallback();
    *synchronizationCallback = nullptr;

    std::vector<RunTimePoolInfo> runTimePoolInfos;
    uint32_t count = mMemories.size();
    runTimePoolInfos.resize(count);
    for (uint32_t i = 0; i < count; i++) {
        const Memory* mem = mMemories[i];
        runTimePoolInfos[i].set(mem->getHidlMemory());
    }
    // Create one additional pool for each input or output that was specified via a pointer.
    auto fixPointerArguments = [&runTimePoolInfos](std::vector<ModelArgumentInfo>& argumentInfos) {
        for (ModelArgumentInfo& argumentInfo : argumentInfos) {
            if (argumentInfo.state == ModelArgumentInfo::POINTER) {
                RunTimePoolInfo runTimeInfo = {
                            .buffer = static_cast<uint8_t*>(argumentInfo.buffer)};
                argumentInfo.locationAndLength.poolIndex =
                            static_cast<uint32_t>(runTimePoolInfos.size());
                argumentInfo.locationAndLength.offset = 0;
                runTimePoolInfos.push_back(runTimeInfo);
            }
        }
    };
    fixPointerArguments(mInputs);
    fixPointerArguments(mOutputs);

    Request request;
    setRequestArgumentArray(mInputs, &request.inputs);
    setRequestArgumentArray(mOutputs, &request.outputs);

    // TODO: should model be passed by reference (std::cref) instead of copied?
    std::thread thread(asyncStartComputeOnCpu, model, std::move(request),
                       std::move(runTimePoolInfos), executionCallback);
    executionCallback->bind_thread(std::move(thread));

    *synchronizationCallback = executionCallback;
    return ANEURALNETWORKS_NO_ERROR;
}

}  // namespace nn
}  // namespace android