rsCpuScriptGroup2.cpp revision 958d8b23ac969d13ea3da0a2d9a355f5951afa8c
#include "rsCpuScriptGroup2.h"

#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include <algorithm>
#include <set>
#include <sstream>
#include <string>
#include <vector>

#ifndef RS_COMPATIBILITY_LIB
#include "bcc/Config/Config.h"
#include <sys/wait.h>
#endif

#include "cpu_ref/rsCpuCore.h"
#include "cpu_ref/rsCpuCoreRuntime.h"
#include "rsClosure.h"
#include "rsContext.h"
#include "rsCpuCore.h"
#include "rsCpuExecutable.h"
#include "rsCpuScript.h"
#include "rsScript.h"
#include "rsScriptGroup2.h"
#include "rsScriptIntrinsic.h"
28
29using std::string;
30using std::vector;
31
32namespace android {
33namespace renderscript {
34
35namespace {
36
37const size_t DefaultKernelArgCount = 2;
38
39void groupRoot(const RsExpandKernelParams *kparams, uint32_t xstart,
40               uint32_t xend, uint32_t outstep) {
41    const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kparams->usr;
42    RsExpandKernelParams *mutable_kparams = (RsExpandKernelParams *)kparams;
43    const void **oldIns  = kparams->ins;
44    uint32_t *oldStrides = kparams->inEStrides;
45
46    std::vector<const void*> ins(DefaultKernelArgCount);
47    std::vector<uint32_t> strides(DefaultKernelArgCount);
48
49    for (CPUClosure* cpuClosure : closures) {
50        const Closure* closure = cpuClosure->mClosure;
51
52        auto in_iter = ins.begin();
53        auto stride_iter = strides.begin();
54
55        for (size_t i = 0; i < closure->mNumArg; i++) {
56            const void* arg = closure->mArgs[i];
57            const Allocation* a = (const Allocation*)arg;
58            const uint32_t eStride = a->mHal.state.elementSizeBytes;
59            const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) +
60                    eStride * xstart;
61            if (kparams->dimY > 1) {
62                ptr += a->mHal.drvState.lod[0].stride * kparams->y;
63            }
64            *in_iter++ = ptr;
65            *stride_iter++ = eStride;
66        }
67
68        mutable_kparams->ins = &ins[0];
69        mutable_kparams->inEStrides = &strides[0];
70
71        const Allocation* out = closure->mReturnValue;
72        const uint32_t ostep = out->mHal.state.elementSizeBytes;
73        const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) +
74                ostep * xstart;
75        if (kparams->dimY > 1) {
76            ptr += out->mHal.drvState.lod[0].stride * kparams->y;
77        }
78
79        mutable_kparams->out = (void*)ptr;
80
81        cpuClosure->mFunc(kparams, xstart, xend, ostep);
82    }
83
84    mutable_kparams->ins        = oldIns;
85    mutable_kparams->inEStrides = oldStrides;
86}
87
88}  // namespace
89
90Batch::Batch(CpuScriptGroup2Impl* group, const char* name) :
91    mGroup(group), mFunc(nullptr) {
92    mName = strndup(name, strlen(name));
93}
94
95Batch::~Batch() {
96    for (CPUClosure* c : mClosures) {
97        delete c;
98    }
99    free(mName);
100}
101
102bool Batch::conflict(CPUClosure* cpuClosure) const {
103    if (mClosures.empty()) {
104        return false;
105    }
106
107    const Closure* closure = cpuClosure->mClosure;
108
109    if (!closure->mIsKernel || !mClosures.front()->mClosure->mIsKernel) {
110        // An invoke should be in a batch by itself, so it conflicts with any other
111        // closure.
112        return true;
113    }
114
115    const auto& globalDeps = closure->mGlobalDeps;
116    const auto& argDeps = closure->mArgDeps;
117
118    for (CPUClosure* c : mClosures) {
119        const Closure* batched = c->mClosure;
120        if (globalDeps.find(batched) != globalDeps.end()) {
121            return true;
122        }
123        const auto& it = argDeps.find(batched);
124        if (it != argDeps.end()) {
125            const auto& args = (*it).second;
126            for (const auto &p1 : *args) {
127                if (p1.second->get() != nullptr) {
128                    return true;
129                }
130            }
131        }
132    }
133
134    return false;
135}
136
137CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
138                                         const ScriptGroupBase *sg) :
139    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)),
140    mExecutable(nullptr), mScriptObj(nullptr) {
141    rsAssert(!mGroup->mClosures.empty());
142
143    Batch* batch = new Batch(this, "Batch0");
144    int i = 0;
145    for (Closure* closure: mGroup->mClosures) {
146        CPUClosure* cc;
147        const IDBase* funcID = closure->mFunctionID.get();
148        RsdCpuScriptImpl* si =
149                (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(funcID->mScript);
150        if (closure->mIsKernel) {
151            MTLaunchStruct mtls;
152            si->forEachKernelSetup(funcID->mSlot, &mtls);
153            cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel);
154        } else {
155            cc = new CPUClosure(closure, si);
156        }
157
158        if (batch->conflict(cc)) {
159            mBatches.push_back(batch);
160            std::stringstream ss;
161            ss << "Batch" << ++i;
162            batch = new Batch(this, ss.str().c_str());
163        }
164
165        batch->mClosures.push_back(cc);
166    }
167
168    rsAssert(!batch->mClosures.empty());
169    mBatches.push_back(batch);
170
171#ifndef RS_COMPATIBILITY_LIB
172    compile(mGroup->mCacheDir);
173    if (mScriptObj != nullptr && mExecutable != nullptr) {
174        for (Batch* batch : mBatches) {
175            batch->resolveFuncPtr(mScriptObj);
176        }
177    }
178#endif  // RS_COMPATIBILITY_LIB
179}
180
181void Batch::resolveFuncPtr(void* sharedObj) {
182    std::string funcName(mName);
183    if (mClosures.front()->mClosure->mIsKernel) {
184        funcName.append(".expand");
185    }
186    mFunc = dlsym(sharedObj, funcName.c_str());
187    rsAssert (mFunc != nullptr);
188}
189
190CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
191    for (Batch* batch : mBatches) {
192        delete batch;
193    }
194    // TODO: move this dlclose into ~ScriptExecutable().
195    if (mScriptObj != nullptr) {
196        dlclose(mScriptObj);
197    }
198    delete mExecutable;
199}
200
201namespace {
202
203#ifndef RS_COMPATIBILITY_LIB
204
205string getCoreLibPath(Context* context, string* coreLibRelaxedPath) {
206    *coreLibRelaxedPath = "";
207
208    // If we're debugging, use the debug library.
209    if (context->getContextType() == RS_CONTEXT_TYPE_DEBUG) {
210        return SYSLIBPATH"/libclcore_debug.bc";
211    }
212
213    // Check for a platform specific library
214
215#if defined(ARCH_ARM_HAVE_NEON) && !defined(DISABLE_CLCORE_NEON)
216    // NEON-capable ARMv7a devices can use an accelerated math library
217    // for all reduced precision scripts.
218    // ARMv8 does not use NEON, as ASIMD can be used with all precision
219    // levels.
220    *coreLibRelaxedPath = SYSLIBPATH"/libclcore_neon.bc";
221#endif
222
223#if defined(__i386__) || defined(__x86_64__)
224    // x86 devices will use an optimized library.
225    return SYSLIBPATH"/libclcore_x86.bc";
226#else
227    return SYSLIBPATH"/libclcore.bc";
228#endif
229}
230
// Returns the part of |path| that follows the last '/' or '\\' separator.
// A path without any separator is returned unchanged; a path that ends in a
// separator yields an empty string.
std::string getFileName(const std::string& path) {
    // Keep the position in the string's own size type so that npos is
    // compared exactly instead of being narrowed.
    const std::string::size_type lastSeparator = path.find_last_of("/\\");
    if (lastSeparator == std::string::npos) {
        return path;
    }
    return path.substr(lastSeparator + 1);
}
235
236void setupCompileArguments(
237        const vector<string>& inputs, const vector<string>& kernelBatches,
238        const vector<string>& invokeBatches,
239        const string& output_dir, const string& output_filename,
240        const string& coreLibPath, const string& coreLibRelaxedPath,
241        vector<const char*>* args) {
242    args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH);
243    args->push_back("-fPIC");
244    args->push_back("-embedRSInfo");
245    args->push_back("-mtriple");
246    args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
247    args->push_back("-bclib");
248    args->push_back(coreLibPath.c_str());
249    args->push_back("-bclib_relaxed");
250    args->push_back(coreLibRelaxedPath.c_str());
251    for (const string& input : inputs) {
252        args->push_back(input.c_str());
253    }
254    for (const string& batch : kernelBatches) {
255        args->push_back("-merge");
256        args->push_back(batch.c_str());
257    }
258    for (const string& batch : invokeBatches) {
259        args->push_back("-invoke");
260        args->push_back(batch.c_str());
261    }
262    args->push_back("-output_path");
263    args->push_back(output_dir.c_str());
264    args->push_back("-o");
265    args->push_back(output_filename.c_str());
266    args->push_back(nullptr);
267}
268
269bool fuseAndCompile(const char** arguments,
270                    const string& commandLine) {
271    const pid_t pid = fork();
272
273    if (pid == -1) {
274        ALOGE("Couldn't fork for bcc execution");
275        return false;
276    }
277
278    if (pid == 0) {
279        // Child process
280        ALOGV("Invoking BCC with: %s", commandLine.c_str());
281        execv(RsdCpuScriptImpl::BCC_EXE_PATH, (char* const*)arguments);
282
283        ALOGE("execv() failed: %s", strerror(errno));
284        abort();
285        return false;
286    }
287
288    // Parent process
289    int status = 0;
290    const pid_t w = waitpid(pid, &status, 0);
291    if (w == -1) {
292        return false;
293    }
294
295    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0 ) {
296        ALOGE("bcc terminated unexpectedly");
297        return false;
298    }
299
300    return true;
301}
302
303void generateSourceSlot(const Closure& closure,
304                        const std::vector<std::string>& inputs,
305                        std::stringstream& ss) {
306    const IDBase* funcID = (const IDBase*)closure.mFunctionID.get();
307    const Script* script = funcID->mScript;
308
309    rsAssert (!script->isIntrinsic());
310
311    const RsdCpuScriptImpl *cpuScript =
312            (const RsdCpuScriptImpl*)script->mHal.drv;
313    const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
314
315    const int index = find(inputs.begin(), inputs.end(), bitcodeFilename) -
316            inputs.begin();
317
318    ss << index << "," << funcID->mSlot << ".";
319}
320
#endif  // RS_COMPATIBILITY_LIB
322
323}  // anonymous namespace
324
325void CpuScriptGroup2Impl::compile(const char* cacheDir) {
326#ifndef RS_COMPATIBILITY_LIB
327    if (mGroup->mClosures.size() < 2) {
328        return;
329    }
330
331    //===--------------------------------------------------------------------===//
332    // Fuse the input kernels and generate native code in an object file
333    //===--------------------------------------------------------------------===//
334
335    std::set<string> inputSet;
336    for (Closure* closure : mGroup->mClosures) {
337        const Script* script = closure->mFunctionID.get()->mScript;
338
339        // If any script is an intrinsic, give up trying fusing the kernels.
340        if (script->isIntrinsic()) {
341            return;
342        }
343
344        const RsdCpuScriptImpl *cpuScript =
345                (const RsdCpuScriptImpl*)script->mHal.drv;
346        const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
347        inputSet.insert(bitcodeFilename);
348    }
349
350    std::vector<string> inputs(inputSet.begin(), inputSet.end());
351
352    std::vector<string> kernelBatches;
353    std::vector<string> invokeBatches;
354
355    int i = 0;
356    for (const auto& batch : mBatches) {
357        rsAssert(batch->size() > 0);
358
359        std::stringstream ss;
360        ss << batch->mName << ":";
361
362        if (!batch->mClosures.front()->mClosure->mIsKernel) {
363            rsAssert(batch->size() == 1);
364            generateSourceSlot(*batch->mClosures.front()->mClosure, inputs, ss);
365            invokeBatches.push_back(ss.str());
366        } else {
367            for (const auto& cpuClosure : batch->mClosures) {
368                generateSourceSlot(*cpuClosure->mClosure, inputs, ss);
369            }
370            kernelBatches.push_back(ss.str());
371        }
372    }
373
374    rsAssert(cacheDir != nullptr);
375    string objFilePath(cacheDir);
376    objFilePath.append("/fusedXXXXXX.o");
377    // Find unique object file name, to make following file names unique.
378    int tempfd = mkstemps(&objFilePath[0], 2);
379    if (tempfd == -1) {
380      return;
381    }
382    TEMP_FAILURE_RETRY(close(tempfd));
383
384    string outputFileName = getFileName(objFilePath.substr(0, objFilePath.size() - 2));
385    string coreLibRelaxedPath;
386    const string& coreLibPath = getCoreLibPath(getCpuRefImpl()->getContext(),
387                                               &coreLibRelaxedPath);
388    vector<const char*> arguments;
389    setupCompileArguments(inputs, kernelBatches, invokeBatches, cacheDir,
390                          outputFileName, coreLibPath, coreLibRelaxedPath, &arguments);
391    std::unique_ptr<const char> joined(
392        rsuJoinStrings(arguments.size() - 1, arguments.data()));
393    string commandLine (joined.get());
394
395    if (!fuseAndCompile(arguments.data(), commandLine)) {
396        unlink(objFilePath.c_str());
397        return;
398    }
399
400    //===--------------------------------------------------------------------===//
401    // Create and load the shared lib
402    //===--------------------------------------------------------------------===//
403
404    const char* resName = outputFileName.c_str();
405
406    if (!SharedLibraryUtils::createSharedLibrary(cacheDir, resName)) {
407        ALOGE("Failed to link object file '%s'", resName);
408        return;
409    }
410
411    mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
412    if (mScriptObj == nullptr) {
413        ALOGE("Unable to load '%s'", resName);
414        return;
415    }
416
417    mExecutable = ScriptExecutable::createFromSharedObject(
418        nullptr,  // RS context. Unused.
419        mScriptObj);
420
421#endif  // RS_COMPATIBILITY_LIB
422}
423
424void CpuScriptGroup2Impl::execute() {
425    for (auto batch : mBatches) {
426        batch->setGlobalsForBatch();
427        batch->run();
428    }
429}
430
431void Batch::setGlobalsForBatch() {
432    for (CPUClosure* cpuClosure : mClosures) {
433        const Closure* closure = cpuClosure->mClosure;
434        const IDBase* funcID = closure->mFunctionID.get();
435        Script* s = funcID->mScript;;
436        for (const auto& p : closure->mGlobals) {
437            const void* value = p.second.first;
438            int size = p.second.second;
439            if (value == nullptr && size == 0) {
440                // This indicates the current closure depends on another closure for a
441                // global in their shared module (script). In this case we don't need to
442                // copy the value. For example, an invoke intializes a global variable
443                // which a kernel later reads.
444                continue;
445            }
446            rsAssert(p.first != nullptr);
447            ALOGV("Evaluating closure %p, setting field %p (Script %p, slot: %d)",
448                  closure, p.first, p.first->mScript, p.first->mSlot);
449            Script* script = p.first->mScript;
450            const RsdCpuScriptImpl *cpuScript =
451                    (const RsdCpuScriptImpl*)script->mHal.drv;
452            int slot = p.first->mSlot;
453            ScriptExecutable* exec = mGroup->getExecutable();
454            if (exec != nullptr) {
455                const char* varName = cpuScript->getFieldName(slot);
456                void* addr = exec->getFieldAddress(varName);
457                if (size < 0) {
458                    rsrSetObject(mGroup->getCpuRefImpl()->getContext(),
459                                 (rs_object_base*)addr, (ObjectBase*)value);
460                } else {
461                    memcpy(addr, (const void*)&value, size);
462                }
463            } else {
464                // We use -1 size to indicate an ObjectBase rather than a primitive type
465                if (size < 0) {
466                    s->setVarObj(slot, (ObjectBase*)value);
467                } else {
468                    s->setVar(slot, (const void*)&value, size);
469                }
470            }
471        }
472    }
473}
474
475void Batch::run() {
476    if (!mClosures.front()->mClosure->mIsKernel) {
477        rsAssert(mClosures.size() == 1);
478
479        // This batch contains a single closure for an invoke function
480        CPUClosure* cc = mClosures.front();
481        const Closure* c = cc->mClosure;
482
483        if (mFunc != nullptr) {
484            // TODO: Need align pointers for x86_64.
485            // See RsdCpuScriptImpl::invokeFunction in rsCpuScript.cpp
486            ((InvokeFuncTy)mFunc)(c->mParams, c->mParamLength);
487        } else {
488            const ScriptInvokeID* invokeID = (const ScriptInvokeID*)c->mFunctionID.get();
489            rsAssert(invokeID != nullptr);
490            cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength);
491        }
492
493        return;
494    }
495
496    if (mFunc != nullptr) {
497        MTLaunchStruct mtls;
498        const CPUClosure* firstCpuClosure = mClosures.front();
499        const CPUClosure* lastCpuClosure = mClosures.back();
500
501        firstCpuClosure->mSi->forEachMtlsSetup(
502                (const Allocation**)firstCpuClosure->mClosure->mArgs,
503                firstCpuClosure->mClosure->mNumArg,
504                lastCpuClosure->mClosure->mReturnValue,
505                nullptr, 0, nullptr, &mtls);
506
507        mtls.script = nullptr;
508        mtls.fep.usr = nullptr;
509        mtls.kernel = (ForEachFunc_t)mFunc;
510
511        mGroup->getCpuRefImpl()->launchThreads(
512                (const Allocation**)firstCpuClosure->mClosure->mArgs,
513                firstCpuClosure->mClosure->mNumArg,
514                lastCpuClosure->mClosure->mReturnValue,
515                nullptr, &mtls);
516
517        return;
518    }
519
520    for (CPUClosure* cpuClosure : mClosures) {
521        const Closure* closure = cpuClosure->mClosure;
522        const ScriptKernelID* kernelID =
523                (const ScriptKernelID*)closure->mFunctionID.get();
524        cpuClosure->mSi->preLaunch(kernelID->mSlot,
525                                   (const Allocation**)closure->mArgs,
526                                   closure->mNumArg, closure->mReturnValue,
527                                   nullptr, 0, nullptr);
528    }
529
530    const CPUClosure* cpuClosure = mClosures.front();
531    const Closure* closure = cpuClosure->mClosure;
532    MTLaunchStruct mtls;
533
534    if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs,
535                                          closure->mNumArg,
536                                          closure->mReturnValue,
537                                          nullptr, 0, nullptr, &mtls)) {
538
539        mtls.script = nullptr;
540        mtls.kernel = (void (*)())&groupRoot;
541        mtls.fep.usr = &mClosures;
542
543        mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
544    }
545
546    for (CPUClosure* cpuClosure : mClosures) {
547        const Closure* closure = cpuClosure->mClosure;
548        const ScriptKernelID* kernelID =
549                (const ScriptKernelID*)closure->mFunctionID.get();
550        cpuClosure->mSi->postLaunch(kernelID->mSlot,
551                                    (const Allocation**)closure->mArgs,
552                                    closure->mNumArg, closure->mReturnValue,
553                                    nullptr, 0, nullptr);
554    }
555}
556
557}  // namespace renderscript
558}  // namespace android
559