rsCpuScriptGroup2.cpp revision 3e5318a36be470ba7a8c5cf82bbe069178733b11
1#include "rsCpuScriptGroup2.h"
2
#include <dlfcn.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <vector>
10
11#ifndef RS_COMPATIBILITY_LIB
12#include "bcc/Config/Config.h"
13#include <sys/wait.h>
14#endif
15
16#include "cpu_ref/rsCpuCore.h"
17#include "rsClosure.h"
18#include "rsContext.h"
19#include "rsCpuCore.h"
20#include "rsCpuExecutable.h"
21#include "rsCpuScript.h"
22#include "rsScript.h"
23#include "rsScriptGroup2.h"
24#include "rsScriptIntrinsic.h"
25
26using std::string;
27using std::vector;
28
29namespace android {
30namespace renderscript {
31
32namespace {
33
34const size_t DefaultKernelArgCount = 2;
35
36void groupRoot(const RsExpandKernelParams *kparams, uint32_t xstart,
37               uint32_t xend, uint32_t outstep) {
38    const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kparams->usr;
39    RsExpandKernelParams *mutable_kparams = (RsExpandKernelParams *)kparams;
40    const void **oldIns  = kparams->ins;
41    uint32_t *oldStrides = kparams->inEStrides;
42
43    std::vector<const void*> ins(DefaultKernelArgCount);
44    std::vector<uint32_t> strides(DefaultKernelArgCount);
45
46    for (CPUClosure* cpuClosure : closures) {
47        const Closure* closure = cpuClosure->mClosure;
48
49        auto in_iter = ins.begin();
50        auto stride_iter = strides.begin();
51
52        for (size_t i = 0; i < closure->mNumArg; i++) {
53            const void* arg = closure->mArgs[i];
54            const Allocation* a = (const Allocation*)arg;
55            const uint32_t eStride = a->mHal.state.elementSizeBytes;
56            const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) +
57                    eStride * xstart;
58            if (kparams->dimY > 1) {
59                ptr += a->mHal.drvState.lod[0].stride * kparams->y;
60            }
61            *in_iter++ = ptr;
62            *stride_iter++ = eStride;
63        }
64
65        mutable_kparams->ins = &ins[0];
66        mutable_kparams->inEStrides = &strides[0];
67
68        const Allocation* out = closure->mReturnValue;
69        const uint32_t ostep = out->mHal.state.elementSizeBytes;
70        const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) +
71                ostep * xstart;
72        if (kparams->dimY > 1) {
73            ptr += out->mHal.drvState.lod[0].stride * kparams->y;
74        }
75
76        mutable_kparams->out = (void*)ptr;
77
78        mutable_kparams->usr = cpuClosure->mUsrPtr;
79
80        cpuClosure->mFunc(kparams, xstart, xend, ostep);
81    }
82
83    mutable_kparams->ins        = oldIns;
84    mutable_kparams->inEStrides = oldStrides;
85    mutable_kparams->usr        = &closures;
86}
87
88}  // namespace
89
90Batch::~Batch() {
91    for (CPUClosure* c : mClosures) {
92        delete c;
93    }
94    if (mScriptObj) {
95        dlclose(mScriptObj);
96    }
97}
98
99bool Batch::conflict(CPUClosure* cpuClosure) const {
100    if (mClosures.empty()) {
101        return false;
102    }
103
104    const Closure* closure = cpuClosure->mClosure;
105
106    if (closure->mKernelID.get() == nullptr ||
107        mClosures.front()->mClosure->mKernelID.get() == nullptr) {
108        // An invoke should be in a batch by itself, so it conflicts with any other
109        // closure.
110        return true;
111    }
112
113    const auto& globalDeps = closure->mGlobalDeps;
114    const auto& argDeps = closure->mArgDeps;
115
116    for (CPUClosure* c : mClosures) {
117        const Closure* batched = c->mClosure;
118        if (globalDeps.find(batched) != globalDeps.end()) {
119            return true;
120        }
121        const auto& it = argDeps.find(batched);
122        if (it != argDeps.end()) {
123            const auto& args = (*it).second;
124            for (const auto &p1 : *args) {
125                if (p1.second->get() != nullptr) {
126                    return true;
127                }
128            }
129        }
130    }
131
132    return false;
133}
134
135CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
136                                         const ScriptGroupBase *sg) :
137    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)) {
138    rsAssert(!mGroup->mClosures.empty());
139
140    Batch* batch = new Batch(this);
141    for (Closure* closure: mGroup->mClosures) {
142        const ScriptKernelID* kernelID = closure->mKernelID.get();
143        RsdCpuScriptImpl* si;
144        CPUClosure* cc;
145        if (kernelID != nullptr) {
146            si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(kernelID->mScript);
147            MTLaunchStruct mtls;
148            si->forEachKernelSetup(kernelID->mSlot, &mtls);
149            // TODO: Is mtls.fep.usrLen ever used?
150            cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel,
151                                mtls.fep.usr, mtls.fep.usrLen);
152        } else {
153            si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(
154                    closure->mInvokeID->mScript);
155            cc = new CPUClosure(closure, si);
156        }
157
158        if (batch->conflict(cc)) {
159            mBatches.push_back(batch);
160            batch = new Batch(this);
161        }
162
163        batch->mClosures.push_back(cc);
164    }
165
166    rsAssert(!batch->mClosures.empty());
167    mBatches.push_back(batch);
168
169#ifndef RS_COMPATIBILITY_LIB
170    for (Batch* batch : mBatches) {
171        batch->tryToCreateFusedKernel(mGroup->mCacheDir);
172    }
173#endif
174}
175
176CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
177    for (Batch* batch : mBatches) {
178        delete batch;
179    }
180}
181
182namespace {
183
184#ifndef RS_COMPATIBILITY_LIB
185
// Returns the last component of |path|: everything after the final '/' or
// '\\'. A path without any separator is returned unchanged; a path ending in a
// separator yields an empty string.
std::string getFileName(std::string path) {
    // Keep the full-width index type; narrowing npos to 'unsigned' only worked
    // by accident of 32-bit wraparound.
    const std::string::size_type found = path.find_last_of("/\\");
    if (found == std::string::npos) {
        return path;
    }
    return path.substr(found + 1);
}
190
191void setupCompileArguments(
192        const vector<string>& inputs, const vector<int>& kernels,
193        const string& output_dir, const string& output_filename,
194        const string& rsLib, vector<const char*>* args) {
195    args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH);
196    args->push_back("-fPIC");
197    args->push_back("-embedRSInfo");
198    args->push_back("-mtriple");
199    args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
200    args->push_back("-bclib");
201    args->push_back(rsLib.c_str());
202    for (const string& input : inputs) {
203        args->push_back(input.c_str());
204    }
205    for (int kernel : kernels) {
206        args->push_back("-k");
207        string strKernel = std::to_string(kernel);
208        args->push_back(strKernel.c_str());
209    }
210    args->push_back("-output_path");
211    args->push_back(output_dir.c_str());
212    args->push_back("-o");
213    args->push_back(output_filename.c_str());
214    args->push_back(nullptr);
215}
216
217bool fuseAndCompile(const char** arguments,
218                    const string& commandLine) {
219    const pid_t pid = fork();
220
221    if (pid == -1) {
222        ALOGE("Couldn't fork for bcc execution");
223        return false;
224    }
225
226    if (pid == 0) {
227        // Child process
228        ALOGV("Invoking BCC with: %s", commandLine.c_str());
229        execv(RsdCpuScriptImpl::BCC_EXE_PATH, (char* const*)arguments);
230
231        ALOGE("execv() failed: %s", strerror(errno));
232        abort();
233        return false;
234    }
235
236    // Parent process
237    int status = 0;
238    const pid_t w = waitpid(pid, &status, 0);
239    if (w == -1) {
240        return false;
241    }
242
243    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0 ) {
244        ALOGE("bcc terminated unexpectedly");
245        return false;
246    }
247
248    return true;
249}
250#endif
251
252}  // anonymous namespace
253
254void Batch::tryToCreateFusedKernel(const char *cacheDir) {
255#ifndef RS_COMPATIBILITY_LIB
256    if (mClosures.size() < 2) {
257        return;
258    }
259
260    //===--------------------------------------------------------------------===//
261    // Fuse the input kernels and generate native code in an object file
262    //===--------------------------------------------------------------------===//
263
264    std::vector<string> inputFiles;
265    std::vector<int> slots;
266
267    for (CPUClosure* cpuClosure : mClosures) {
268        const Closure* closure = cpuClosure->mClosure;
269        const ScriptKernelID* kernelID = closure->mKernelID.get();
270        const Script* script = kernelID->mScript;
271
272        if (script->isIntrinsic()) {
273            return;
274        }
275
276        const RsdCpuScriptImpl *cpuScript =
277                (const RsdCpuScriptImpl*)script->mHal.drv;
278
279        const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
280
281        inputFiles.push_back(bitcodeFilename);
282        slots.push_back(kernelID->mSlot);
283    }
284
285    rsAssert(cacheDir != nullptr);
286    string objFilePath(cacheDir);
287    objFilePath.append("/fusedXXXXXX.o");
288    // Find unique object file name, to make following file names unique.
289    int tempfd = mkstemps(&objFilePath[0], 2);
290    if (tempfd == -1) {
291      return;
292    }
293    TEMP_FAILURE_RETRY(close(tempfd));
294
295    string outputFileName = getFileName(objFilePath.substr(0, objFilePath.size() - 2));
296    string rsLibPath(SYSLIBPATH"/libclcore.bc");
297    vector<const char*> arguments;
298    setupCompileArguments(inputFiles, slots, cacheDir, outputFileName, rsLibPath,
299                          &arguments);
300    std::unique_ptr<const char> joined(
301        rsuJoinStrings(arguments.size() - 1, arguments.data()));
302    string commandLine (joined.get());
303
304    if (!fuseAndCompile(arguments.data(), commandLine)) {
305        unlink(objFilePath.c_str());
306        return;
307    }
308
309    //===--------------------------------------------------------------------===//
310    // Create and load the shared lib
311    //===--------------------------------------------------------------------===//
312
313    const char* resName = outputFileName.c_str();
314
315    if (!SharedLibraryUtils::createSharedLibrary(cacheDir, resName)) {
316        ALOGE("Failed to link object file '%s'", resName);
317        return;
318    }
319
320    void* mSharedObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
321    if (mSharedObj == nullptr) {
322        ALOGE("Unable to load '%s'", resName);
323        return;
324    }
325
326    mExecutable = ScriptExecutable::createFromSharedObject(
327                                                           nullptr,  // RS context. Unused.
328                                                           mSharedObj);
329
330#endif  // RS_COMPATIBILITY_LIB
331}
332
333void CpuScriptGroup2Impl::execute() {
334    for (auto batch : mBatches) {
335        batch->setGlobalsForBatch();
336        batch->run();
337    }
338}
339
340void Batch::setGlobalsForBatch() {
341    for (CPUClosure* cpuClosure : mClosures) {
342        const Closure* closure = cpuClosure->mClosure;
343        const ScriptKernelID* kernelID = closure->mKernelID.get();
344        Script* s;
345        if (kernelID != nullptr) {
346            s = kernelID->mScript;
347        } else {
348            s = cpuClosure->mClosure->mInvokeID->mScript;
349        }
350        for (const auto& p : closure->mGlobals) {
351            const void* value = p.second.first;
352            int size = p.second.second;
353            if (value == nullptr && size == 0) {
354                // This indicates the current closure depends on another closure for a
355                // global in their shared module (script). In this case we don't need to
356                // copy the value. For example, an invoke intializes a global variable
357                // which a kernel later reads.
358                continue;
359            }
360            rsAssert(p.first != nullptr);
361            ALOGV("Evaluating closure %p, setting field %p (Script %p, slot: %d)",
362                  closure, p.first, p.first->mScript, p.first->mSlot);
363            // We use -1 size to indicate an ObjectBase rather than a primitive type
364            if (size < 0) {
365                s->setVarObj(p.first->mSlot, (ObjectBase*)value);
366            } else {
367                s->setVar(p.first->mSlot, (const void*)&value, size);
368            }
369        }
370    }
371}
372
373void Batch::run() {
374    if (mExecutable != nullptr) {
375        MTLaunchStruct mtls;
376        const CPUClosure* firstCpuClosure = mClosures.front();
377        const CPUClosure* lastCpuClosure = mClosures.back();
378
379        firstCpuClosure->mSi->forEachMtlsSetup(
380                (const Allocation**)firstCpuClosure->mClosure->mArgs,
381                firstCpuClosure->mClosure->mNumArg,
382                lastCpuClosure->mClosure->mReturnValue,
383                nullptr, 0, nullptr, &mtls);
384
385        mtls.script = nullptr;
386        mtls.fep.usr = nullptr;
387        mtls.kernel = mExecutable->getForEachFunction(0);
388
389        mGroup->getCpuRefImpl()->launchThreads(
390                (const Allocation**)firstCpuClosure->mClosure->mArgs,
391                firstCpuClosure->mClosure->mNumArg,
392                lastCpuClosure->mClosure->mReturnValue,
393                nullptr, &mtls);
394
395        return;
396    }
397
398    if (mClosures.size() == 1 &&
399        mClosures.front()->mClosure->mKernelID.get() == nullptr) {
400        // This closure is for an invoke function
401        CPUClosure* cc = mClosures.front();
402        const Closure* c = cc->mClosure;
403        const ScriptInvokeID* invokeID = c->mInvokeID;
404        rsAssert(invokeID != nullptr);
405        cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength);
406        return;
407    }
408
409    for (CPUClosure* cpuClosure : mClosures) {
410        const Closure* closure = cpuClosure->mClosure;
411        const ScriptKernelID* kernelID = closure->mKernelID.get();
412        cpuClosure->mSi->preLaunch(kernelID->mSlot,
413                                   (const Allocation**)closure->mArgs,
414                                   closure->mNumArg, closure->mReturnValue,
415                                   cpuClosure->mUsrPtr, cpuClosure->mUsrSize,
416                                   nullptr);
417    }
418
419    const CPUClosure* cpuClosure = mClosures.front();
420    const Closure* closure = cpuClosure->mClosure;
421    MTLaunchStruct mtls;
422
423    if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs,
424                                          closure->mNumArg,
425                                          closure->mReturnValue,
426                                          nullptr, 0, nullptr, &mtls)) {
427
428        mtls.script = nullptr;
429        mtls.kernel = (void (*)())&groupRoot;
430        mtls.fep.usr = &mClosures;
431
432        mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
433    }
434
435    for (CPUClosure* cpuClosure : mClosures) {
436        const Closure* closure = cpuClosure->mClosure;
437        const ScriptKernelID* kernelID = closure->mKernelID.get();
438        cpuClosure->mSi->postLaunch(kernelID->mSlot,
439                                    (const Allocation**)closure->mArgs,
440                                    closure->mNumArg, closure->mReturnValue,
441                                    nullptr, 0, nullptr);
442    }
443}
444
445}  // namespace renderscript
446}  // namespace android
447