rsCpuScriptGroup2.cpp revision 1efae29f4bbe6c165caf6dfc4b89cf8a5f8c469b
1#include "rsCpuScriptGroup2.h"
2
3#include <dlfcn.h>
4#include <stdio.h>
5#include <stdlib.h>
6#include <unistd.h>
7
8#include <set>
9#include <sstream>
10#include <string>
11#include <vector>
12
13#ifndef RS_COMPATIBILITY_LIB
14#include "bcc/Config/Config.h"
15#endif
16
17#include "cpu_ref/rsCpuCore.h"
18#include "rsClosure.h"
19#include "rsContext.h"
20#include "rsCpuCore.h"
21#include "rsCpuExecutable.h"
22#include "rsCpuScript.h"
23#include "rsScript.h"
24#include "rsScriptGroup2.h"
25#include "rsScriptIntrinsic.h"
26
27using std::string;
28using std::vector;
29
30namespace android {
31namespace renderscript {
32
33namespace {
34
// NOTE(review): no use of this constant is visible in this file — confirm
// whether it is still needed.
constexpr size_t DefaultKernelArgCount = 2;
36
37void groupRoot(const RsExpandKernelDriverInfo *kinfo, uint32_t xstart,
38               uint32_t xend, uint32_t outstep) {
39    const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kinfo->usr;
40    RsExpandKernelDriverInfo *mutable_kinfo = const_cast<RsExpandKernelDriverInfo *>(kinfo);
41
42    const size_t oldInLen = mutable_kinfo->inLen;
43
44    decltype(mutable_kinfo->inStride) oldInStride;
45    memcpy(&oldInStride, &mutable_kinfo->inStride, sizeof(oldInStride));
46
47    for (CPUClosure* cpuClosure : closures) {
48        const Closure* closure = cpuClosure->mClosure;
49
50        // There had better be enough space in mutable_kinfo
51        rsAssert(closure->mNumArg <= RS_KERNEL_INPUT_LIMIT);
52
53        for (size_t i = 0; i < closure->mNumArg; i++) {
54            const void* arg = closure->mArgs[i];
55            const Allocation* a = (const Allocation*)arg;
56            const uint32_t eStride = a->mHal.state.elementSizeBytes;
57            const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) +
58                    eStride * xstart;
59            if (kinfo->dim.y > 1) {
60                ptr += a->mHal.drvState.lod[0].stride * kinfo->current.y;
61            }
62            mutable_kinfo->inPtr[i] = ptr;
63            mutable_kinfo->inStride[i] = eStride;
64        }
65        mutable_kinfo->inLen = closure->mNumArg;
66
67        const Allocation* out = closure->mReturnValue;
68        const uint32_t ostep = out->mHal.state.elementSizeBytes;
69        const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) +
70                ostep * xstart;
71        if (kinfo->dim.y > 1) {
72            ptr += out->mHal.drvState.lod[0].stride * kinfo->current.y;
73        }
74
75        rsAssert(kinfo->outLen <= 1);
76        mutable_kinfo->outPtr[0] = const_cast<uint8_t*>(ptr);
77
78        cpuClosure->mFunc(kinfo, xstart, xend, ostep);
79    }
80
81    mutable_kinfo->inLen = oldInLen;
82    memcpy(&mutable_kinfo->inStride, &oldInStride, sizeof(oldInStride));
83}
84
85}  // namespace
86
87Batch::Batch(CpuScriptGroup2Impl* group, const char* name) :
88    mGroup(group), mFunc(nullptr) {
89    mName = strndup(name, strlen(name));
90}
91
92Batch::~Batch() {
93    for (CPUClosure* c : mClosures) {
94        delete c;
95    }
96    free(mName);
97}
98
99bool Batch::conflict(CPUClosure* cpuClosure) const {
100    if (mClosures.empty()) {
101        return false;
102    }
103
104    const Closure* closure = cpuClosure->mClosure;
105
106    if (!closure->mIsKernel || !mClosures.front()->mClosure->mIsKernel) {
107        // An invoke should be in a batch by itself, so it conflicts with any other
108        // closure.
109        return true;
110    }
111
112    const auto& globalDeps = closure->mGlobalDeps;
113    const auto& argDeps = closure->mArgDeps;
114
115    for (CPUClosure* c : mClosures) {
116        const Closure* batched = c->mClosure;
117        if (globalDeps.find(batched) != globalDeps.end()) {
118            return true;
119        }
120        const auto& it = argDeps.find(batched);
121        if (it != argDeps.end()) {
122            const auto& args = (*it).second;
123            for (const auto &p1 : *args) {
124                if (p1.second.get() != nullptr) {
125                    return true;
126                }
127            }
128        }
129    }
130
131    // The compiler fusion pass in bcc expects that kernels chained up through
132    // (1st) input and output.
133
134    const Closure* lastBatched = mClosures.back()->mClosure;
135    const auto& it = argDeps.find(lastBatched);
136
137    if (it == argDeps.end()) {
138        return true;
139    }
140
141    const auto& args = (*it).second;
142    for (const auto &p1 : *args) {
143        if (p1.first == 0 && p1.second.get() == nullptr) {
144            // The new closure depends on the last batched closure's return
145            // value (fieldId being nullptr) for its first argument (argument 0)
146            return false;
147        }
148    }
149
150    return true;
151}
152
153CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
154                                         const ScriptGroupBase *sg) :
155    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)),
156    mExecutable(nullptr), mScriptObj(nullptr) {
157    rsAssert(!mGroup->mClosures.empty());
158
159    mCpuRefImpl->lockMutex();
160    Batch* batch = new Batch(this, "Batch0");
161    int i = 0;
162    for (Closure* closure: mGroup->mClosures) {
163        CPUClosure* cc;
164        const IDBase* funcID = closure->mFunctionID.get();
165        RsdCpuScriptImpl* si =
166                (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(funcID->mScript);
167        if (closure->mIsKernel) {
168            MTLaunchStruct mtls;
169            si->forEachKernelSetup(funcID->mSlot, &mtls);
170            cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel);
171        } else {
172            cc = new CPUClosure(closure, si);
173        }
174
175        if (batch->conflict(cc)) {
176            mBatches.push_back(batch);
177            std::stringstream ss;
178            ss << "Batch" << ++i;
179            batch = new Batch(this, ss.str().c_str());
180        }
181
182        batch->mClosures.push_back(cc);
183    }
184
185    rsAssert(!batch->mClosures.empty());
186    mBatches.push_back(batch);
187
188#ifndef RS_COMPATIBILITY_LIB
189    compile(mGroup->mCacheDir);
190    if (mScriptObj != nullptr && mExecutable != nullptr) {
191        for (Batch* batch : mBatches) {
192            batch->resolveFuncPtr(mScriptObj);
193        }
194    }
195#endif  // RS_COMPATIBILITY_LIB
196    mCpuRefImpl->unlockMutex();
197}
198
199void Batch::resolveFuncPtr(void* sharedObj) {
200    std::string funcName(mName);
201    if (mClosures.front()->mClosure->mIsKernel) {
202        funcName.append(".expand");
203    }
204    mFunc = dlsym(sharedObj, funcName.c_str());
205    rsAssert (mFunc != nullptr);
206}
207
208CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
209    for (Batch* batch : mBatches) {
210        delete batch;
211    }
212    delete mExecutable;
213    // TODO: move this dlclose into ~ScriptExecutable().
214    if (mScriptObj != nullptr) {
215        dlclose(mScriptObj);
216    }
217}
218
219namespace {
220
221#ifndef RS_COMPATIBILITY_LIB
222
223string getCoreLibPath(Context* context, string* coreLibRelaxedPath) {
224    *coreLibRelaxedPath = "";
225
226    // If we're debugging, use the debug library.
227    if (context->getContextType() == RS_CONTEXT_TYPE_DEBUG) {
228        return SYSLIBPATH"/libclcore_debug.bc";
229    }
230
231    // Check for a platform specific library
232
233#if defined(ARCH_ARM_HAVE_NEON) && !defined(DISABLE_CLCORE_NEON)
234    // NEON-capable ARMv7a devices can use an accelerated math library
235    // for all reduced precision scripts.
236    // ARMv8 does not use NEON, as ASIMD can be used with all precision
237    // levels.
238    *coreLibRelaxedPath = SYSLIBPATH"/libclcore_neon.bc";
239#endif
240
241#if defined(__i386__) || defined(__x86_64__)
242    // x86 devices will use an optimized library.
243    return SYSLIBPATH"/libclcore_x86.bc";
244#else
245    return SYSLIBPATH"/libclcore.bc";
246#endif
247}
248
249void setupCompileArguments(
250        const vector<const char*>& inputs, const vector<string>& kernelBatches,
251        const vector<string>& invokeBatches,
252        const char* outputDir, const char* outputFileName,
253        const char* coreLibPath, const char* coreLibRelaxedPath,
254        const bool emitGlobalInfo, const bool emitGlobalInfoSkipConstant,
255        vector<const char*>* args) {
256    args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH);
257    args->push_back("-fPIC");
258    args->push_back("-embedRSInfo");
259    if (emitGlobalInfo) {
260        args->push_back("-rs-global-info");
261        if (emitGlobalInfoSkipConstant) {
262            args->push_back("-rs-global-info-skip-constant");
263        }
264    }
265    args->push_back("-mtriple");
266    args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
267    args->push_back("-bclib");
268    args->push_back(coreLibPath);
269    args->push_back("-bclib_relaxed");
270    args->push_back(coreLibRelaxedPath);
271    for (const char* input : inputs) {
272        args->push_back(input);
273    }
274    for (const string& batch : kernelBatches) {
275        args->push_back("-merge");
276        args->push_back(batch.c_str());
277    }
278    for (const string& batch : invokeBatches) {
279        args->push_back("-invoke");
280        args->push_back(batch.c_str());
281    }
282    args->push_back("-output_path");
283    args->push_back(outputDir);
284
285    // The output filename has to be the last, in case we need to pop it out and
286    // replace with a different name.
287    args->push_back("-o");
288    args->push_back(outputFileName);
289}
290
291void generateSourceSlot(RsdCpuReferenceImpl* ctxt,
292                        const Closure& closure,
293                        const std::vector<const char*>& inputs,
294                        std::stringstream& ss) {
295    const IDBase* funcID = (const IDBase*)closure.mFunctionID.get();
296    const Script* script = funcID->mScript;
297
298    rsAssert (!script->isIntrinsic());
299
300    const RsdCpuScriptImpl *cpuScript =
301            (const RsdCpuScriptImpl *)ctxt->lookupScript(script);
302    const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
303
304    const int index = find(inputs.begin(), inputs.end(), bitcodeFilename) -
305            inputs.begin();
306
307    ss << index << "," << funcID->mSlot << ".";
308}
309
#endif  // RS_COMPATIBILITY_LIB
311
312}  // anonymous namespace
313
314void CpuScriptGroup2Impl::compile(const char* cacheDir) {
315#ifndef RS_COMPATIBILITY_LIB
316    if (mGroup->mClosures.size() < 2) {
317        return;
318    }
319
320    auto comparator = [](const char* str1, const char* str2) -> bool {
321        return strcmp(str1, str2) < 0;
322    };
323    std::set<const char*, decltype(comparator)> inputSet(comparator);
324
325    for (Closure* closure : mGroup->mClosures) {
326        const Script* script = closure->mFunctionID.get()->mScript;
327
328        // If any script is an intrinsic, give up trying fusing the kernels.
329        if (script->isIntrinsic()) {
330            return;
331        }
332
333        const RsdCpuScriptImpl *cpuScript =
334            (const RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(script);
335
336        const char* bitcodeFilename = cpuScript->getBitcodeFilePath();
337        inputSet.insert(bitcodeFilename);
338    }
339
340    std::vector<const char*> inputs(inputSet.begin(), inputSet.end());
341
342    std::vector<string> kernelBatches;
343    std::vector<string> invokeBatches;
344
345    int i = 0;
346    for (const auto& batch : mBatches) {
347        rsAssert(batch->size() > 0);
348
349        std::stringstream ss;
350        ss << batch->mName << ":";
351
352        if (!batch->mClosures.front()->mClosure->mIsKernel) {
353            rsAssert(batch->size() == 1);
354            generateSourceSlot(mCpuRefImpl, *batch->mClosures.front()->mClosure, inputs, ss);
355            invokeBatches.push_back(ss.str());
356        } else {
357            for (const auto& cpuClosure : batch->mClosures) {
358                generateSourceSlot(mCpuRefImpl, *cpuClosure->mClosure, inputs, ss);
359            }
360            kernelBatches.push_back(ss.str());
361        }
362    }
363
364    rsAssert(cacheDir != nullptr);
365    string objFilePath(cacheDir);
366    objFilePath.append("/");
367    objFilePath.append(mGroup->mName);
368    objFilePath.append(".o");
369
370    const char* resName = mGroup->mName;
371    string coreLibRelaxedPath;
372    const string& coreLibPath = getCoreLibPath(getCpuRefImpl()->getContext(),
373                                               &coreLibRelaxedPath);
374
375    vector<const char*> arguments;
376    bool emitGlobalInfo = getCpuRefImpl()->getEmbedGlobalInfo();
377    bool emitGlobalInfoSkipConstant = getCpuRefImpl()->getEmbedGlobalInfoSkipConstant();
378    setupCompileArguments(inputs, kernelBatches, invokeBatches, cacheDir,
379                          resName, coreLibPath.c_str(), coreLibRelaxedPath.c_str(),
380                          emitGlobalInfo, emitGlobalInfoSkipConstant,
381                          &arguments);
382
383    std::unique_ptr<const char> cmdLine(rsuJoinStrings(arguments.size() - 1,
384                                                       arguments.data()));
385
386    inputs.push_back(coreLibPath.c_str());
387    inputs.push_back(coreLibRelaxedPath.c_str());
388
389    uint32_t checksum = constructBuildChecksum(nullptr, 0, cmdLine.get(),
390                                               inputs.data(), inputs.size());
391
392    if (checksum == 0) {
393        return;
394    }
395
396    std::stringstream ss;
397    ss << std::hex << checksum;
398    const char* checksumStr = ss.str().c_str();
399
400    //===--------------------------------------------------------------------===//
401    // Try to load a shared lib from code cache matching filename and checksum
402    //===--------------------------------------------------------------------===//
403
404    bool alreadyLoaded = false;
405    std::string cloneName;
406
407    mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName, nullptr,
408                                                       &alreadyLoaded);
409    if (mScriptObj != nullptr) {
410        // A shared library named resName is found in code cache directory
411        // cacheDir, and loaded with the handle stored in mScriptObj.
412
413        mExecutable = ScriptExecutable::createFromSharedObject(
414            getCpuRefImpl()->getContext(), mScriptObj, checksum);
415
416        if (mExecutable != nullptr) {
417            // The loaded shared library in mScriptObj has a matching checksum.
418            // An executable object has been created.
419            return;
420        }
421
422        ALOGV("Failed to create an executable object from so file due to "
423              "mismatching checksum");
424
425        if (alreadyLoaded) {
426            // The shared object found in code cache has already been loaded.
427            // A different file name is needed for the new shared library, to
428            // avoid corrupting the currently loaded instance.
429
430            cloneName.append(resName);
431            cloneName.append("#");
432            cloneName.append(SharedLibraryUtils::getRandomString(6).string());
433
434            // The last element in arguments is the output filename.
435            arguments.pop_back();
436            arguments.push_back(cloneName.c_str());
437        }
438
439        dlclose(mScriptObj);
440        mScriptObj = nullptr;
441    }
442
443    //===--------------------------------------------------------------------===//
444    // Fuse the input kernels and generate native code in an object file
445    //===--------------------------------------------------------------------===//
446
447    arguments.push_back("-build-checksum");
448    arguments.push_back(checksumStr);
449    arguments.push_back(nullptr);
450
451    bool compiled = rsuExecuteCommand(RsdCpuScriptImpl::BCC_EXE_PATH,
452                                      arguments.size()-1,
453                                      arguments.data());
454    if (!compiled) {
455        return;
456    }
457
458    //===--------------------------------------------------------------------===//
459    // Create and load the shared lib
460    //===--------------------------------------------------------------------===//
461
462    if (!SharedLibraryUtils::createSharedLibrary(
463            getCpuRefImpl()->getContext()->getDriverName(), cacheDir, resName)) {
464        ALOGE("Failed to link object file '%s'", resName);
465        unlink(objFilePath.c_str());
466        return;
467    }
468
469    unlink(objFilePath.c_str());
470
471    mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
472    if (mScriptObj == nullptr) {
473        ALOGE("Unable to load '%s'", resName);
474        return;
475    }
476
477    if (alreadyLoaded) {
478        // Delete the temporary, random-named file that we created to avoid
479        // interfering with an already loaded shared library.
480        string cloneFilePath(cacheDir);
481        cloneFilePath.append("/");
482        cloneFilePath.append(cloneName.c_str());
483        cloneFilePath.append(".so");
484        unlink(cloneFilePath.c_str());
485    }
486
487    mExecutable = ScriptExecutable::createFromSharedObject(
488        getCpuRefImpl()->getContext(),
489        mScriptObj);
490
491#endif  // RS_COMPATIBILITY_LIB
492}
493
494void CpuScriptGroup2Impl::execute() {
495    for (auto batch : mBatches) {
496        batch->setGlobalsForBatch();
497        batch->run();
498    }
499}
500
501void Batch::setGlobalsForBatch() {
502    for (CPUClosure* cpuClosure : mClosures) {
503        const Closure* closure = cpuClosure->mClosure;
504        const IDBase* funcID = closure->mFunctionID.get();
505        Script* s = funcID->mScript;;
506        for (const auto& p : closure->mGlobals) {
507            const void* value = p.second.first;
508            int size = p.second.second;
509            if (value == nullptr && size == 0) {
510                // This indicates the current closure depends on another closure for a
511                // global in their shared module (script). In this case we don't need to
512                // copy the value. For example, an invoke intializes a global variable
513                // which a kernel later reads.
514                continue;
515            }
516            rsAssert(p.first != nullptr);
517            Script* script = p.first->mScript;
518            RsdCpuReferenceImpl* ctxt = mGroup->getCpuRefImpl();
519            const RsdCpuScriptImpl *cpuScript =
520                    (const RsdCpuScriptImpl *)ctxt->lookupScript(script);
521            int slot = p.first->mSlot;
522            ScriptExecutable* exec = mGroup->getExecutable();
523            if (exec != nullptr) {
524                const char* varName = cpuScript->getFieldName(slot);
525                void* addr = exec->getFieldAddress(varName);
526                if (size < 0) {
527                    rsrSetObject(mGroup->getCpuRefImpl()->getContext(),
528                                 (rs_object_base*)addr, (ObjectBase*)value);
529                } else {
530                    memcpy(addr, (const void*)&value, size);
531                }
532            } else {
533                // We use -1 size to indicate an ObjectBase rather than a primitive type
534                if (size < 0) {
535                    s->setVarObj(slot, (ObjectBase*)value);
536                } else {
537                    s->setVar(slot, (const void*)&value, size);
538                }
539            }
540        }
541    }
542}
543
544void Batch::run() {
545    if (!mClosures.front()->mClosure->mIsKernel) {
546        rsAssert(mClosures.size() == 1);
547
548        // This batch contains a single closure for an invoke function
549        CPUClosure* cc = mClosures.front();
550        const Closure* c = cc->mClosure;
551
552        if (mFunc != nullptr) {
553            // TODO: Need align pointers for x86_64.
554            // See RsdCpuScriptImpl::invokeFunction in rsCpuScript.cpp
555            ((InvokeFuncTy)mFunc)(c->mParams, c->mParamLength);
556        } else {
557            const ScriptInvokeID* invokeID = (const ScriptInvokeID*)c->mFunctionID.get();
558            rsAssert(invokeID != nullptr);
559            cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength);
560        }
561
562        return;
563    }
564
565    if (mFunc != nullptr) {
566        MTLaunchStruct mtls;
567        const CPUClosure* firstCpuClosure = mClosures.front();
568        const CPUClosure* lastCpuClosure = mClosures.back();
569
570        firstCpuClosure->mSi->forEachMtlsSetup(
571                (const Allocation**)firstCpuClosure->mClosure->mArgs,
572                firstCpuClosure->mClosure->mNumArg,
573                lastCpuClosure->mClosure->mReturnValue,
574                nullptr, 0, nullptr, &mtls);
575
576        mtls.script = nullptr;
577        mtls.fep.usr = nullptr;
578        mtls.kernel = (ForEachFunc_t)mFunc;
579
580        mGroup->getCpuRefImpl()->launchThreads(
581                (const Allocation**)firstCpuClosure->mClosure->mArgs,
582                firstCpuClosure->mClosure->mNumArg,
583                lastCpuClosure->mClosure->mReturnValue,
584                nullptr, &mtls);
585
586        return;
587    }
588
589    for (CPUClosure* cpuClosure : mClosures) {
590        const Closure* closure = cpuClosure->mClosure;
591        const ScriptKernelID* kernelID =
592                (const ScriptKernelID*)closure->mFunctionID.get();
593        cpuClosure->mSi->preLaunch(kernelID->mSlot,
594                                   (const Allocation**)closure->mArgs,
595                                   closure->mNumArg, closure->mReturnValue,
596                                   nullptr, 0, nullptr);
597    }
598
599    const CPUClosure* cpuClosure = mClosures.front();
600    const Closure* closure = cpuClosure->mClosure;
601    MTLaunchStruct mtls;
602
603    if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs,
604                                          closure->mNumArg,
605                                          closure->mReturnValue,
606                                          nullptr, 0, nullptr, &mtls)) {
607
608        mtls.script = nullptr;
609        mtls.kernel = (void (*)())&groupRoot;
610        mtls.fep.usr = &mClosures;
611
612        mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
613    }
614
615    for (CPUClosure* cpuClosure : mClosures) {
616        const Closure* closure = cpuClosure->mClosure;
617        const ScriptKernelID* kernelID =
618                (const ScriptKernelID*)closure->mFunctionID.get();
619        cpuClosure->mSi->postLaunch(kernelID->mSlot,
620                                    (const Allocation**)closure->mArgs,
621                                    closure->mNumArg, closure->mReturnValue,
622                                    nullptr, 0, nullptr);
623    }
624}
625
626}  // namespace renderscript
627}  // namespace android
628