rsCpuScriptGroup2.cpp revision 062c287f573ecc06c38ee4295e5627e12c52ac3d
1#include "rsCpuScriptGroup2.h"
2
3#include <dlfcn.h>
4#include <stdio.h>
5#include <stdlib.h>
6#include <unistd.h>
7
8#include <set>
9#include <sstream>
10#include <string>
11#include <vector>
12
13#ifndef RS_COMPATIBILITY_LIB
14#include "bcc/Config/Config.h"
15#include <sys/wait.h>
16#endif
17
18#include "cpu_ref/rsCpuCore.h"
19#include "rsClosure.h"
20#include "rsContext.h"
21#include "rsCpuCore.h"
22#include "rsCpuExecutable.h"
23#include "rsCpuScript.h"
24#include "rsScript.h"
25#include "rsScriptGroup2.h"
26#include "rsScriptIntrinsic.h"
27
28using std::string;
29using std::vector;
30
31namespace android {
32namespace renderscript {
33
34namespace {
35
36const size_t DefaultKernelArgCount = 2;
37
38void groupRoot(const RsExpandKernelParams *kparams, uint32_t xstart,
39               uint32_t xend, uint32_t outstep) {
40    const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kparams->usr;
41    RsExpandKernelParams *mutable_kparams = (RsExpandKernelParams *)kparams;
42    const void **oldIns  = kparams->ins;
43    uint32_t *oldStrides = kparams->inEStrides;
44
45    std::vector<const void*> ins(DefaultKernelArgCount);
46    std::vector<uint32_t> strides(DefaultKernelArgCount);
47
48    for (CPUClosure* cpuClosure : closures) {
49        const Closure* closure = cpuClosure->mClosure;
50
51        auto in_iter = ins.begin();
52        auto stride_iter = strides.begin();
53
54        for (size_t i = 0; i < closure->mNumArg; i++) {
55            const void* arg = closure->mArgs[i];
56            const Allocation* a = (const Allocation*)arg;
57            const uint32_t eStride = a->mHal.state.elementSizeBytes;
58            const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) +
59                    eStride * xstart;
60            if (kparams->dimY > 1) {
61                ptr += a->mHal.drvState.lod[0].stride * kparams->y;
62            }
63            *in_iter++ = ptr;
64            *stride_iter++ = eStride;
65        }
66
67        mutable_kparams->ins = &ins[0];
68        mutable_kparams->inEStrides = &strides[0];
69
70        const Allocation* out = closure->mReturnValue;
71        const uint32_t ostep = out->mHal.state.elementSizeBytes;
72        const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) +
73                ostep * xstart;
74        if (kparams->dimY > 1) {
75            ptr += out->mHal.drvState.lod[0].stride * kparams->y;
76        }
77
78        mutable_kparams->out = (void*)ptr;
79
80        cpuClosure->mFunc(kparams, xstart, xend, ostep);
81    }
82
83    mutable_kparams->ins        = oldIns;
84    mutable_kparams->inEStrides = oldStrides;
85}
86
87}  // namespace
88
89Batch::Batch(CpuScriptGroup2Impl* group, const char* name) :
90    mGroup(group), mFunc(nullptr) {
91    mName = strndup(name, strlen(name));
92}
93
94Batch::~Batch() {
95    for (CPUClosure* c : mClosures) {
96        delete c;
97    }
98    free(mName);
99}
100
101bool Batch::conflict(CPUClosure* cpuClosure) const {
102    if (mClosures.empty()) {
103        return false;
104    }
105
106    const Closure* closure = cpuClosure->mClosure;
107
108    if (!closure->mIsKernel || !mClosures.front()->mClosure->mIsKernel) {
109        // An invoke should be in a batch by itself, so it conflicts with any other
110        // closure.
111        return true;
112    }
113
114    const auto& globalDeps = closure->mGlobalDeps;
115    const auto& argDeps = closure->mArgDeps;
116
117    for (CPUClosure* c : mClosures) {
118        const Closure* batched = c->mClosure;
119        if (globalDeps.find(batched) != globalDeps.end()) {
120            return true;
121        }
122        const auto& it = argDeps.find(batched);
123        if (it != argDeps.end()) {
124            const auto& args = (*it).second;
125            for (const auto &p1 : *args) {
126                if (p1.second->get() != nullptr) {
127                    return true;
128                }
129            }
130        }
131    }
132
133    return false;
134}
135
136CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
137                                         const ScriptGroupBase *sg) :
138    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)),
139    mExecutable(nullptr), mScriptObj(nullptr) {
140    rsAssert(!mGroup->mClosures.empty());
141
142    Batch* batch = new Batch(this, "Batch0");
143    int i = 0;
144    for (Closure* closure: mGroup->mClosures) {
145        CPUClosure* cc;
146        const IDBase* funcID = closure->mFunctionID.get();
147        RsdCpuScriptImpl* si =
148                (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(funcID->mScript);
149        if (closure->mIsKernel) {
150            MTLaunchStruct mtls;
151            si->forEachKernelSetup(funcID->mSlot, &mtls);
152            cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel);
153        } else {
154            cc = new CPUClosure(closure, si);
155        }
156
157        if (batch->conflict(cc)) {
158            mBatches.push_back(batch);
159            std::stringstream ss;
160            ss << "Batch" << ++i;
161            batch = new Batch(this, ss.str().c_str());
162        }
163
164        batch->mClosures.push_back(cc);
165    }
166
167    rsAssert(!batch->mClosures.empty());
168    mBatches.push_back(batch);
169
170#ifndef RS_COMPATIBILITY_LIB
171    compile(mGroup->mCacheDir);
172    if (mScriptObj != nullptr && mExecutable != nullptr) {
173        for (Batch* batch : mBatches) {
174            batch->resolveFuncPtr(mScriptObj);
175        }
176    }
177#endif  // RS_COMPATIBILITY_LIB
178}
179
180void Batch::resolveFuncPtr(void* sharedObj) {
181    std::string funcName(mName);
182    if (mClosures.front()->mClosure->mIsKernel) {
183        funcName.append(".expand");
184    }
185    mFunc = dlsym(sharedObj, funcName.c_str());
186    rsAssert (mFunc != nullptr);
187}
188
189CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
190    for (Batch* batch : mBatches) {
191        delete batch;
192    }
193    // TODO: move this dlclose into ~ScriptExecutable().
194    if (mScriptObj != nullptr) {
195        dlclose(mScriptObj);
196    }
197    delete mExecutable;
198}
199
200namespace {
201
202#ifndef RS_COMPATIBILITY_LIB
203
// Returns the last component of |path|: everything after the final '/' or
// '\\'. A path without any separator is returned unchanged, and a path that
// ends in a separator yields an empty string.
std::string getFileName(std::string path) {
    // Keep the index in the string's own size type; narrowing it to a smaller
    // unsigned type would truncate large indices and only handle npos through
    // wraparound.
    const std::string::size_type lastSeparator = path.find_last_of("/\\");
    if (lastSeparator == std::string::npos) {
        return path;
    }
    return path.substr(lastSeparator + 1);
}
208
209void setupCompileArguments(
210        const vector<string>& inputs, const vector<string>& kernelBatches,
211        const vector<string>& invokeBatches,
212        const string& output_dir, const string& output_filename,
213        const string& rsLib, vector<const char*>* args) {
214    args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH);
215    args->push_back("-fPIC");
216    args->push_back("-embedRSInfo");
217    args->push_back("-mtriple");
218    args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
219    args->push_back("-bclib");
220    args->push_back(rsLib.c_str());
221    for (const string& input : inputs) {
222        args->push_back(input.c_str());
223    }
224    for (const string& batch : kernelBatches) {
225        args->push_back("-merge");
226        args->push_back(batch.c_str());
227    }
228    for (const string& batch : invokeBatches) {
229        args->push_back("-invoke");
230        args->push_back(batch.c_str());
231    }
232    args->push_back("-output_path");
233    args->push_back(output_dir.c_str());
234    args->push_back("-o");
235    args->push_back(output_filename.c_str());
236    args->push_back(nullptr);
237}
238
239bool fuseAndCompile(const char** arguments,
240                    const string& commandLine) {
241    const pid_t pid = fork();
242
243    if (pid == -1) {
244        ALOGE("Couldn't fork for bcc execution");
245        return false;
246    }
247
248    if (pid == 0) {
249        // Child process
250        ALOGV("Invoking BCC with: %s", commandLine.c_str());
251        execv(RsdCpuScriptImpl::BCC_EXE_PATH, (char* const*)arguments);
252
253        ALOGE("execv() failed: %s", strerror(errno));
254        abort();
255        return false;
256    }
257
258    // Parent process
259    int status = 0;
260    const pid_t w = waitpid(pid, &status, 0);
261    if (w == -1) {
262        return false;
263    }
264
265    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0 ) {
266        ALOGE("bcc terminated unexpectedly");
267        return false;
268    }
269
270    return true;
271}
272
273void generateSourceSlot(const Closure& closure,
274                        const std::vector<std::string>& inputs,
275                        std::stringstream& ss) {
276    const IDBase* funcID = (const IDBase*)closure.mFunctionID.get();
277    const Script* script = funcID->mScript;
278
279    rsAssert (!script->isIntrinsic());
280
281    const RsdCpuScriptImpl *cpuScript =
282            (const RsdCpuScriptImpl*)script->mHal.drv;
283    const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
284
285    const int index = find(inputs.begin(), inputs.end(), bitcodeFilename) -
286            inputs.begin();
287
288    ss << index << "," << funcID->mSlot << ".";
289}
290
291#endif  // RS_COMPATIBILTY_LIB
292
293}  // anonymous namespace
294
295void CpuScriptGroup2Impl::compile(const char* cacheDir) {
296#ifndef RS_COMPATIBILITY_LIB
297    if (mGroup->mClosures.size() < 2) {
298        return;
299    }
300
301    //===--------------------------------------------------------------------===//
302    // Fuse the input kernels and generate native code in an object file
303    //===--------------------------------------------------------------------===//
304
305    std::set<string> inputSet;
306    for (Closure* closure : mGroup->mClosures) {
307        const Script* script = closure->mFunctionID.get()->mScript;
308
309        // If any script is an intrinsic, give up trying fusing the kernels.
310        if (script->isIntrinsic()) {
311            return;
312        }
313
314        const RsdCpuScriptImpl *cpuScript =
315                (const RsdCpuScriptImpl*)script->mHal.drv;
316        const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
317        inputSet.insert(bitcodeFilename);
318    }
319
320    std::vector<string> inputs(inputSet.begin(), inputSet.end());
321
322    std::vector<string> kernelBatches;
323    std::vector<string> invokeBatches;
324
325    int i = 0;
326    for (const auto& batch : mBatches) {
327        rsAssert(batch->size() > 0);
328
329        std::stringstream ss;
330        ss << batch->mName << ":";
331
332        if (!batch->mClosures.front()->mClosure->mIsKernel) {
333            rsAssert(batch->size() == 1);
334            generateSourceSlot(*batch->mClosures.front()->mClosure, inputs, ss);
335            invokeBatches.push_back(ss.str());
336        } else {
337            for (const auto& cpuClosure : batch->mClosures) {
338                generateSourceSlot(*cpuClosure->mClosure, inputs, ss);
339            }
340            kernelBatches.push_back(ss.str());
341        }
342    }
343
344    rsAssert(cacheDir != nullptr);
345    string objFilePath(cacheDir);
346    objFilePath.append("/fusedXXXXXX.o");
347    // Find unique object file name, to make following file names unique.
348    int tempfd = mkstemps(&objFilePath[0], 2);
349    if (tempfd == -1) {
350      return;
351    }
352    TEMP_FAILURE_RETRY(close(tempfd));
353
354    string outputFileName = getFileName(objFilePath.substr(0, objFilePath.size() - 2));
355    string rsLibPath(SYSLIBPATH"/libclcore.bc");
356    vector<const char*> arguments;
357    setupCompileArguments(inputs, kernelBatches, invokeBatches, cacheDir,
358                          outputFileName, rsLibPath, &arguments);
359    std::unique_ptr<const char> joined(
360        rsuJoinStrings(arguments.size() - 1, arguments.data()));
361    string commandLine (joined.get());
362
363    if (!fuseAndCompile(arguments.data(), commandLine)) {
364        unlink(objFilePath.c_str());
365        return;
366    }
367
368    //===--------------------------------------------------------------------===//
369    // Create and load the shared lib
370    //===--------------------------------------------------------------------===//
371
372    const char* resName = outputFileName.c_str();
373
374    if (!SharedLibraryUtils::createSharedLibrary(cacheDir, resName)) {
375        ALOGE("Failed to link object file '%s'", resName);
376        return;
377    }
378
379    mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
380    if (mScriptObj == nullptr) {
381        ALOGE("Unable to load '%s'", resName);
382        return;
383    }
384
385    mExecutable = ScriptExecutable::createFromSharedObject(
386        nullptr,  // RS context. Unused.
387        mScriptObj);
388
389#endif  // RS_COMPATIBILITY_LIB
390}
391
392void CpuScriptGroup2Impl::execute() {
393    for (auto batch : mBatches) {
394        batch->setGlobalsForBatch();
395        batch->run();
396    }
397}
398
399void Batch::setGlobalsForBatch() {
400    for (CPUClosure* cpuClosure : mClosures) {
401        const Closure* closure = cpuClosure->mClosure;
402        const IDBase* funcID = closure->mFunctionID.get();
403        Script* s = funcID->mScript;;
404        for (const auto& p : closure->mGlobals) {
405            const void* value = p.second.first;
406            int size = p.second.second;
407            if (value == nullptr && size == 0) {
408                // This indicates the current closure depends on another closure for a
409                // global in their shared module (script). In this case we don't need to
410                // copy the value. For example, an invoke intializes a global variable
411                // which a kernel later reads.
412                continue;
413            }
414            rsAssert(p.first != nullptr);
415            ALOGV("Evaluating closure %p, setting field %p (Script %p, slot: %d)",
416                  closure, p.first, p.first->mScript, p.first->mSlot);
417            Script* script = p.first->mScript;
418            const RsdCpuScriptImpl *cpuScript =
419                    (const RsdCpuScriptImpl*)script->mHal.drv;
420            int slot = p.first->mSlot;
421            ScriptExecutable* exec = mGroup->getExecutable();
422            if (exec != nullptr) {
423                const char* varName = cpuScript->getFieldName(slot);
424                void* addr = exec->getFieldAddress(varName);
425                if (size < 0) {
426                    rsrSetObject(mGroup->getCpuRefImpl()->getContext(),
427                                 (rs_object_base*)addr, (ObjectBase*)value);
428                } else {
429                    memcpy(addr, (const void*)&value, size);
430                }
431            } else {
432                // We use -1 size to indicate an ObjectBase rather than a primitive type
433                if (size < 0) {
434                    s->setVarObj(slot, (ObjectBase*)value);
435                } else {
436                    s->setVar(slot, (const void*)&value, size);
437                }
438            }
439        }
440    }
441}
442
443void Batch::run() {
444    if (!mClosures.front()->mClosure->mIsKernel) {
445        rsAssert(mClosures.size() == 1);
446
447        // This batch contains a single closure for an invoke function
448        CPUClosure* cc = mClosures.front();
449        const Closure* c = cc->mClosure;
450
451        if (mFunc != nullptr) {
452            // TODO: Need align pointers for x86_64.
453            // See RsdCpuScriptImpl::invokeFunction in rsCpuScript.cpp
454            ((InvokeFuncTy)mFunc)(c->mParams, c->mParamLength);
455        } else {
456            const ScriptInvokeID* invokeID = (const ScriptInvokeID*)c->mFunctionID.get();
457            rsAssert(invokeID != nullptr);
458            cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength);
459        }
460
461        return;
462    }
463
464    if (mFunc != nullptr) {
465        MTLaunchStruct mtls;
466        const CPUClosure* firstCpuClosure = mClosures.front();
467        const CPUClosure* lastCpuClosure = mClosures.back();
468
469        firstCpuClosure->mSi->forEachMtlsSetup(
470                (const Allocation**)firstCpuClosure->mClosure->mArgs,
471                firstCpuClosure->mClosure->mNumArg,
472                lastCpuClosure->mClosure->mReturnValue,
473                nullptr, 0, nullptr, &mtls);
474
475        mtls.script = nullptr;
476        mtls.fep.usr = nullptr;
477        mtls.kernel = (ForEachFunc_t)mFunc;
478
479        mGroup->getCpuRefImpl()->launchThreads(
480                (const Allocation**)firstCpuClosure->mClosure->mArgs,
481                firstCpuClosure->mClosure->mNumArg,
482                lastCpuClosure->mClosure->mReturnValue,
483                nullptr, &mtls);
484
485        return;
486    }
487
488    for (CPUClosure* cpuClosure : mClosures) {
489        const Closure* closure = cpuClosure->mClosure;
490        const ScriptKernelID* kernelID =
491                (const ScriptKernelID*)closure->mFunctionID.get();
492        cpuClosure->mSi->preLaunch(kernelID->mSlot,
493                                   (const Allocation**)closure->mArgs,
494                                   closure->mNumArg, closure->mReturnValue,
495                                   nullptr, 0, nullptr);
496    }
497
498    const CPUClosure* cpuClosure = mClosures.front();
499    const Closure* closure = cpuClosure->mClosure;
500    MTLaunchStruct mtls;
501
502    if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs,
503                                          closure->mNumArg,
504                                          closure->mReturnValue,
505                                          nullptr, 0, nullptr, &mtls)) {
506
507        mtls.script = nullptr;
508        mtls.kernel = (void (*)())&groupRoot;
509        mtls.fep.usr = &mClosures;
510
511        mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
512    }
513
514    for (CPUClosure* cpuClosure : mClosures) {
515        const Closure* closure = cpuClosure->mClosure;
516        const ScriptKernelID* kernelID =
517                (const ScriptKernelID*)closure->mFunctionID.get();
518        cpuClosure->mSi->postLaunch(kernelID->mSlot,
519                                    (const Allocation**)closure->mArgs,
520                                    closure->mNumArg, closure->mReturnValue,
521                                    nullptr, 0, nullptr);
522    }
523}
524
525}  // namespace renderscript
526}  // namespace android
527