rsCpuScriptGroup2.cpp revision f02a2b0a2749d4a4f07edbc23eddff2e51d11b72
1#include "rsCpuScriptGroup2.h"
2
3#include <dlfcn.h>
4#include <stdio.h>
5#include <stdlib.h>
6#include <unistd.h>
7
8#include <set>
9#include <sstream>
10#include <string>
11#include <vector>
12
13#ifndef RS_COMPATIBILITY_LIB
14#include <zlib.h>
15
16#include "bcc/Config/Config.h"
17#endif
18
19#include "cpu_ref/rsCpuCore.h"
20#include "rsClosure.h"
21#include "rsContext.h"
22#include "rsCpuCore.h"
23#include "rsCpuExecutable.h"
24#include "rsCpuScript.h"
25#include "rsScript.h"
26#include "rsScriptGroup2.h"
27#include "rsScriptIntrinsic.h"
28
29using std::string;
30using std::vector;
31
32namespace android {
33namespace renderscript {
34
35namespace {
36
// Not referenced anywhere in the visible part of this file.
// NOTE(review): presumably the default number of arguments of a kernel --
// confirm before relying on it or removing it.
constexpr size_t DefaultKernelArgCount = 2;
38
39void groupRoot(const RsExpandKernelDriverInfo *kinfo, uint32_t xstart,
40               uint32_t xend, uint32_t outstep) {
41    const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kinfo->usr;
42    RsExpandKernelDriverInfo *mutable_kinfo = const_cast<RsExpandKernelDriverInfo *>(kinfo);
43
44    const size_t oldInLen = mutable_kinfo->inLen;
45
46    decltype(mutable_kinfo->inStride) oldInStride;
47    memcpy(&oldInStride, &mutable_kinfo->inStride, sizeof(oldInStride));
48
49    for (CPUClosure* cpuClosure : closures) {
50        const Closure* closure = cpuClosure->mClosure;
51
52        // There had better be enough space in mutable_kinfo
53        rsAssert(closure->mNumArg <= RS_KERNEL_INPUT_LIMIT);
54
55        for (size_t i = 0; i < closure->mNumArg; i++) {
56            const void* arg = closure->mArgs[i];
57            const Allocation* a = (const Allocation*)arg;
58            const uint32_t eStride = a->mHal.state.elementSizeBytes;
59            const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) +
60                    eStride * xstart;
61            if (kinfo->dim.y > 1) {
62                ptr += a->mHal.drvState.lod[0].stride * kinfo->current.y;
63            }
64            mutable_kinfo->inPtr[i] = ptr;
65            mutable_kinfo->inStride[i] = eStride;
66        }
67        mutable_kinfo->inLen = closure->mNumArg;
68
69        const Allocation* out = closure->mReturnValue;
70        const uint32_t ostep = out->mHal.state.elementSizeBytes;
71        const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) +
72                ostep * xstart;
73        if (kinfo->dim.y > 1) {
74            ptr += out->mHal.drvState.lod[0].stride * kinfo->current.y;
75        }
76
77        rsAssert(kinfo->outLen <= 1);
78        mutable_kinfo->outPtr[0] = const_cast<uint8_t*>(ptr);
79
80        cpuClosure->mFunc(kinfo, xstart, xend, ostep);
81    }
82
83    mutable_kinfo->inLen = oldInLen;
84    memcpy(&mutable_kinfo->inStride, &oldInStride, sizeof(oldInStride));
85}
86
87}  // namespace
88
89Batch::Batch(CpuScriptGroup2Impl* group, const char* name) :
90    mGroup(group), mFunc(nullptr) {
91    mName = strndup(name, strlen(name));
92}
93
94Batch::~Batch() {
95    for (CPUClosure* c : mClosures) {
96        delete c;
97    }
98    free(mName);
99}
100
101bool Batch::conflict(CPUClosure* cpuClosure) const {
102    if (mClosures.empty()) {
103        return false;
104    }
105
106    const Closure* closure = cpuClosure->mClosure;
107
108    if (!closure->mIsKernel || !mClosures.front()->mClosure->mIsKernel) {
109        // An invoke should be in a batch by itself, so it conflicts with any other
110        // closure.
111        return true;
112    }
113
114    const auto& globalDeps = closure->mGlobalDeps;
115    const auto& argDeps = closure->mArgDeps;
116
117    for (CPUClosure* c : mClosures) {
118        const Closure* batched = c->mClosure;
119        if (globalDeps.find(batched) != globalDeps.end()) {
120            return true;
121        }
122        const auto& it = argDeps.find(batched);
123        if (it != argDeps.end()) {
124            const auto& args = (*it).second;
125            for (const auto &p1 : *args) {
126                if (p1.second.get() != nullptr) {
127                    return true;
128                }
129            }
130        }
131    }
132
133    return false;
134}
135
136CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
137                                         const ScriptGroupBase *sg) :
138    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)),
139    mExecutable(nullptr), mScriptObj(nullptr) {
140    rsAssert(!mGroup->mClosures.empty());
141
142    Batch* batch = new Batch(this, "Batch0");
143    int i = 0;
144    for (Closure* closure: mGroup->mClosures) {
145        CPUClosure* cc;
146        const IDBase* funcID = closure->mFunctionID.get();
147        RsdCpuScriptImpl* si =
148                (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(funcID->mScript);
149        if (closure->mIsKernel) {
150            MTLaunchStruct mtls;
151            si->forEachKernelSetup(funcID->mSlot, &mtls);
152            cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel);
153        } else {
154            cc = new CPUClosure(closure, si);
155        }
156
157        if (batch->conflict(cc)) {
158            mBatches.push_back(batch);
159            std::stringstream ss;
160            ss << "Batch" << ++i;
161            batch = new Batch(this, ss.str().c_str());
162        }
163
164        batch->mClosures.push_back(cc);
165    }
166
167    rsAssert(!batch->mClosures.empty());
168    mBatches.push_back(batch);
169
170#ifndef RS_COMPATIBILITY_LIB
171    compile(mGroup->mCacheDir);
172    if (mScriptObj != nullptr && mExecutable != nullptr) {
173        for (Batch* batch : mBatches) {
174            batch->resolveFuncPtr(mScriptObj);
175        }
176    }
177#endif  // RS_COMPATIBILITY_LIB
178}
179
180void Batch::resolveFuncPtr(void* sharedObj) {
181    std::string funcName(mName);
182    if (mClosures.front()->mClosure->mIsKernel) {
183        funcName.append(".expand");
184    }
185    mFunc = dlsym(sharedObj, funcName.c_str());
186    rsAssert (mFunc != nullptr);
187}
188
189CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
190    for (Batch* batch : mBatches) {
191        delete batch;
192    }
193    delete mExecutable;
194    // TODO: move this dlclose into ~ScriptExecutable().
195    if (mScriptObj != nullptr) {
196        dlclose(mScriptObj);
197    }
198}
199
200namespace {
201
202#ifndef RS_COMPATIBILITY_LIB
203
204string getCoreLibPath(Context* context, string* coreLibRelaxedPath) {
205    *coreLibRelaxedPath = "";
206
207    // If we're debugging, use the debug library.
208    if (context->getContextType() == RS_CONTEXT_TYPE_DEBUG) {
209        return SYSLIBPATH"/libclcore_debug.bc";
210    }
211
212    // Check for a platform specific library
213
214#if defined(ARCH_ARM_HAVE_NEON) && !defined(DISABLE_CLCORE_NEON)
215    // NEON-capable ARMv7a devices can use an accelerated math library
216    // for all reduced precision scripts.
217    // ARMv8 does not use NEON, as ASIMD can be used with all precision
218    // levels.
219    *coreLibRelaxedPath = SYSLIBPATH"/libclcore_neon.bc";
220#endif
221
222#if defined(__i386__) || defined(__x86_64__)
223    // x86 devices will use an optimized library.
224    return SYSLIBPATH"/libclcore_x86.bc";
225#else
226    return SYSLIBPATH"/libclcore.bc";
227#endif
228}
229
230bool getChecksum(const std::vector<string>& inputBitcodeFilenames,
231                 const string& coreLibPath, const string& coreLibRelaxedPath,
232                 const char* commandLine,
233                 char* checksumStr) {
234    uint32_t checksum = adler32(0L, Z_NULL, 0);
235
236    for (const auto& bcFilename : inputBitcodeFilenames) {
237        if (!android::renderscript::addFileToChecksum(bcFilename.c_str(), checksum)) {
238            return false;
239        }
240    }
241
242    if (!android::renderscript::addFileToChecksum(coreLibPath.c_str(), checksum)) {
243        return false;
244    }
245
246    if (!coreLibRelaxedPath.empty() &&
247        !android::renderscript::addFileToChecksum(coreLibRelaxedPath.c_str(), checksum)) {
248        return false;
249    }
250
251    // include checksum of command line arguments
252    checksum = adler32(checksum, (const unsigned char *) commandLine,
253                       strlen(commandLine));
254
255    sprintf(checksumStr, "%08x", checksum);
256
257    return true;
258}
259
260void setupCompileArguments(
261        const vector<string>& inputs, const vector<string>& kernelBatches,
262        const vector<string>& invokeBatches,
263        const string& output_dir, const string& output_filename,
264        const string& coreLibPath, const string& coreLibRelaxedPath,
265        vector<const char*>* args) {
266    args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH);
267    args->push_back("-fPIC");
268    args->push_back("-embedRSInfo");
269    args->push_back("-mtriple");
270    args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
271    args->push_back("-bclib");
272    args->push_back(coreLibPath.c_str());
273    args->push_back("-bclib_relaxed");
274    args->push_back(coreLibRelaxedPath.c_str());
275    for (const string& input : inputs) {
276        args->push_back(input.c_str());
277    }
278    for (const string& batch : kernelBatches) {
279        args->push_back("-merge");
280        args->push_back(batch.c_str());
281    }
282    for (const string& batch : invokeBatches) {
283        args->push_back("-invoke");
284        args->push_back(batch.c_str());
285    }
286    args->push_back("-output_path");
287    args->push_back(output_dir.c_str());
288    args->push_back("-o");
289    args->push_back(output_filename.c_str());
290}
291
292void generateSourceSlot(const Closure& closure,
293                        const std::vector<std::string>& inputs,
294                        std::stringstream& ss) {
295    const IDBase* funcID = (const IDBase*)closure.mFunctionID.get();
296    const Script* script = funcID->mScript;
297
298    rsAssert (!script->isIntrinsic());
299
300    const RsdCpuScriptImpl *cpuScript =
301            (const RsdCpuScriptImpl*)script->mHal.drv;
302    const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
303
304    const int index = find(inputs.begin(), inputs.end(), bitcodeFilename) -
305            inputs.begin();
306
307    ss << index << "," << funcID->mSlot << ".";
308}
309
#endif  // RS_COMPATIBILITY_LIB
311
312}  // anonymous namespace
313
314void CpuScriptGroup2Impl::compile(const char* cacheDir) {
315#ifndef RS_COMPATIBILITY_LIB
316    if (mGroup->mClosures.size() < 2) {
317        return;
318    }
319
320    std::set<string> inputSet;
321    for (Closure* closure : mGroup->mClosures) {
322        const Script* script = closure->mFunctionID.get()->mScript;
323
324        // If any script is an intrinsic, give up trying fusing the kernels.
325        if (script->isIntrinsic()) {
326            return;
327        }
328
329        const RsdCpuScriptImpl *cpuScript =
330                (const RsdCpuScriptImpl*)script->mHal.drv;
331        const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
332        inputSet.insert(bitcodeFilename);
333    }
334
335    std::vector<string> inputs(inputSet.begin(), inputSet.end());
336
337    std::vector<string> kernelBatches;
338    std::vector<string> invokeBatches;
339
340    int i = 0;
341    for (const auto& batch : mBatches) {
342        rsAssert(batch->size() > 0);
343
344        std::stringstream ss;
345        ss << batch->mName << ":";
346
347        if (!batch->mClosures.front()->mClosure->mIsKernel) {
348            rsAssert(batch->size() == 1);
349            generateSourceSlot(*batch->mClosures.front()->mClosure, inputs, ss);
350            invokeBatches.push_back(ss.str());
351        } else {
352            for (const auto& cpuClosure : batch->mClosures) {
353                generateSourceSlot(*cpuClosure->mClosure, inputs, ss);
354            }
355            kernelBatches.push_back(ss.str());
356        }
357    }
358
359    rsAssert(cacheDir != nullptr);
360    string objFilePath(cacheDir);
361    objFilePath.append("/");
362    objFilePath.append(mGroup->mName);
363    objFilePath.append(".o");
364
365    string outputFileName(mGroup->mName);
366    string coreLibRelaxedPath;
367    const string& coreLibPath = getCoreLibPath(getCpuRefImpl()->getContext(),
368                                               &coreLibRelaxedPath);
369
370    vector<const char*> arguments;
371    string output_dir(cacheDir);
372    setupCompileArguments(inputs, kernelBatches, invokeBatches, output_dir,
373                          outputFileName, coreLibPath, coreLibRelaxedPath,
374                          &arguments);
375
376    std::unique_ptr<const char> cmdLine(rsuJoinStrings(arguments.size() - 1,
377                                                  arguments.data()));
378
379    if (!getChecksum(inputs, coreLibPath, coreLibRelaxedPath, cmdLine.get(),
380                     mChecksum)) {
381        return;
382    }
383
384    const char* resName = outputFileName.c_str();
385
386    //===--------------------------------------------------------------------===//
387    // Try to load a shared lib from code cache matching filename and checksum
388    //===--------------------------------------------------------------------===//
389
390    mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
391    if (mScriptObj != nullptr) {
392        mExecutable = ScriptExecutable::createFromSharedObject(
393            getCpuRefImpl()->getContext(), mScriptObj);
394        if (mExecutable != nullptr) {
395            if (mExecutable->isChecksumValid(mChecksum)) {
396                return;
397            } else {
398                ALOGE("Invalid checksum from cached so: %s (expected: %s)",
399                      mExecutable->getBuildChecksum(), mChecksum);
400            }
401            delete mExecutable;
402            mExecutable = nullptr;
403        } else {
404            ALOGE("Failed to create an executable object from so file");
405        }
406        dlclose(mScriptObj);
407        mScriptObj = nullptr;
408    }
409
410    //===--------------------------------------------------------------------===//
411    // Fuse the input kernels and generate native code in an object file
412    //===--------------------------------------------------------------------===//
413
414    arguments.push_back("-build-checksum");
415    arguments.push_back(mChecksum);
416    arguments.push_back(nullptr);
417
418    bool compiled = rsuExecuteCommand(RsdCpuScriptImpl::BCC_EXE_PATH,
419                                      arguments.size()-1,
420                                      arguments.data());
421    if (!compiled) {
422        return;
423    }
424
425    //===--------------------------------------------------------------------===//
426    // Create and load the shared lib
427    //===--------------------------------------------------------------------===//
428
429    if (!SharedLibraryUtils::createSharedLibrary(cacheDir, resName)) {
430        ALOGE("Failed to link object file '%s'", resName);
431        unlink(objFilePath.c_str());
432        return;
433    }
434
435    unlink(objFilePath.c_str());
436
437    mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
438    if (mScriptObj == nullptr) {
439        ALOGE("Unable to load '%s'", resName);
440        return;
441    }
442
443    mExecutable = ScriptExecutable::createFromSharedObject(
444        getCpuRefImpl()->getContext(),
445        mScriptObj);
446
447#endif  // RS_COMPATIBILITY_LIB
448}
449
450void CpuScriptGroup2Impl::execute() {
451    for (auto batch : mBatches) {
452        batch->setGlobalsForBatch();
453        batch->run();
454    }
455}
456
457void Batch::setGlobalsForBatch() {
458    for (CPUClosure* cpuClosure : mClosures) {
459        const Closure* closure = cpuClosure->mClosure;
460        const IDBase* funcID = closure->mFunctionID.get();
461        Script* s = funcID->mScript;;
462        for (const auto& p : closure->mGlobals) {
463            const void* value = p.second.first;
464            int size = p.second.second;
465            if (value == nullptr && size == 0) {
466                // This indicates the current closure depends on another closure for a
467                // global in their shared module (script). In this case we don't need to
468                // copy the value. For example, an invoke intializes a global variable
469                // which a kernel later reads.
470                continue;
471            }
472            rsAssert(p.first != nullptr);
473            Script* script = p.first->mScript;
474            const RsdCpuScriptImpl *cpuScript =
475                    (const RsdCpuScriptImpl*)script->mHal.drv;
476            int slot = p.first->mSlot;
477            ScriptExecutable* exec = mGroup->getExecutable();
478            if (exec != nullptr) {
479                const char* varName = cpuScript->getFieldName(slot);
480                void* addr = exec->getFieldAddress(varName);
481                if (size < 0) {
482                    rsrSetObject(mGroup->getCpuRefImpl()->getContext(),
483                                 (rs_object_base*)addr, (ObjectBase*)value);
484                } else {
485                    memcpy(addr, (const void*)&value, size);
486                }
487            } else {
488                // We use -1 size to indicate an ObjectBase rather than a primitive type
489                if (size < 0) {
490                    s->setVarObj(slot, (ObjectBase*)value);
491                } else {
492                    s->setVar(slot, (const void*)&value, size);
493                }
494            }
495        }
496    }
497}
498
499void Batch::run() {
500    if (!mClosures.front()->mClosure->mIsKernel) {
501        rsAssert(mClosures.size() == 1);
502
503        // This batch contains a single closure for an invoke function
504        CPUClosure* cc = mClosures.front();
505        const Closure* c = cc->mClosure;
506
507        if (mFunc != nullptr) {
508            // TODO: Need align pointers for x86_64.
509            // See RsdCpuScriptImpl::invokeFunction in rsCpuScript.cpp
510            ((InvokeFuncTy)mFunc)(c->mParams, c->mParamLength);
511        } else {
512            const ScriptInvokeID* invokeID = (const ScriptInvokeID*)c->mFunctionID.get();
513            rsAssert(invokeID != nullptr);
514            cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength);
515        }
516
517        return;
518    }
519
520    if (mFunc != nullptr) {
521        MTLaunchStruct mtls;
522        const CPUClosure* firstCpuClosure = mClosures.front();
523        const CPUClosure* lastCpuClosure = mClosures.back();
524
525        firstCpuClosure->mSi->forEachMtlsSetup(
526                (const Allocation**)firstCpuClosure->mClosure->mArgs,
527                firstCpuClosure->mClosure->mNumArg,
528                lastCpuClosure->mClosure->mReturnValue,
529                nullptr, 0, nullptr, &mtls);
530
531        mtls.script = nullptr;
532        mtls.fep.usr = nullptr;
533        mtls.kernel = (ForEachFunc_t)mFunc;
534
535        mGroup->getCpuRefImpl()->launchThreads(
536                (const Allocation**)firstCpuClosure->mClosure->mArgs,
537                firstCpuClosure->mClosure->mNumArg,
538                lastCpuClosure->mClosure->mReturnValue,
539                nullptr, &mtls);
540
541        return;
542    }
543
544    for (CPUClosure* cpuClosure : mClosures) {
545        const Closure* closure = cpuClosure->mClosure;
546        const ScriptKernelID* kernelID =
547                (const ScriptKernelID*)closure->mFunctionID.get();
548        cpuClosure->mSi->preLaunch(kernelID->mSlot,
549                                   (const Allocation**)closure->mArgs,
550                                   closure->mNumArg, closure->mReturnValue,
551                                   nullptr, 0, nullptr);
552    }
553
554    const CPUClosure* cpuClosure = mClosures.front();
555    const Closure* closure = cpuClosure->mClosure;
556    MTLaunchStruct mtls;
557
558    if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs,
559                                          closure->mNumArg,
560                                          closure->mReturnValue,
561                                          nullptr, 0, nullptr, &mtls)) {
562
563        mtls.script = nullptr;
564        mtls.kernel = (void (*)())&groupRoot;
565        mtls.fep.usr = &mClosures;
566
567        mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
568    }
569
570    for (CPUClosure* cpuClosure : mClosures) {
571        const Closure* closure = cpuClosure->mClosure;
572        const ScriptKernelID* kernelID =
573                (const ScriptKernelID*)closure->mFunctionID.get();
574        cpuClosure->mSi->postLaunch(kernelID->mSlot,
575                                    (const Allocation**)closure->mArgs,
576                                    closure->mNumArg, closure->mReturnValue,
577                                    nullptr, 0, nullptr);
578    }
579}
580
581}  // namespace renderscript
582}  // namespace android
583