rsCpuScriptGroup2.cpp revision 2abfcc6d129fe3defddef4540aa95cc445c03a7a
1#include "rsCpuScriptGroup2.h"
2
#include <dlfcn.h>
#include <stdlib.h>

#include <string>
#include <vector>
7
8#ifndef RS_COMPATIBILITY_LIB
9#include "bcc/Config/Config.h"
10#include <sys/wait.h>
11#endif
12
13#include "cpu_ref/rsCpuCore.h"
14#include "rsClosure.h"
15#include "rsContext.h"
16#include "rsCpuCore.h"
17#include "rsCpuExecutable.h"
18#include "rsCpuScript.h"
19#include "rsScript.h"
20#include "rsScriptGroup2.h"
21#include "rsScriptIntrinsic.h"
22
23using std::string;
24using std::vector;
25
26namespace android {
27namespace renderscript {
28
29namespace {
30
31const size_t DefaultKernelArgCount = 2;
32
33void groupRoot(const RsExpandKernelParams *kparams, uint32_t xstart,
34               uint32_t xend, uint32_t outstep) {
35    const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kparams->usr;
36    RsExpandKernelParams *mutable_kparams = (RsExpandKernelParams *)kparams;
37    const void **oldIns  = kparams->ins;
38    uint32_t *oldStrides = kparams->inEStrides;
39
40    std::vector<const void*> ins(DefaultKernelArgCount);
41    std::vector<uint32_t> strides(DefaultKernelArgCount);
42
43    for (CPUClosure* cpuClosure : closures) {
44        const Closure* closure = cpuClosure->mClosure;
45
46        auto in_iter = ins.begin();
47        auto stride_iter = strides.begin();
48
49        for (size_t i = 0; i < closure->mNumArg; i++) {
50            const void* arg = closure->mArgs[i];
51            const Allocation* a = (const Allocation*)arg;
52            const uint32_t eStride = a->mHal.state.elementSizeBytes;
53            const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) +
54                    eStride * xstart;
55            if (kparams->dimY > 1) {
56                ptr += a->mHal.drvState.lod[0].stride * kparams->y;
57            }
58            *in_iter++ = ptr;
59            *stride_iter++ = eStride;
60        }
61
62        mutable_kparams->ins = &ins[0];
63        mutable_kparams->inEStrides = &strides[0];
64
65        const Allocation* out = closure->mReturnValue;
66        const uint32_t ostep = out->mHal.state.elementSizeBytes;
67        const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) +
68                ostep * xstart;
69        if (kparams->dimY > 1) {
70            ptr += out->mHal.drvState.lod[0].stride * kparams->y;
71        }
72
73        mutable_kparams->out = (void*)ptr;
74
75        mutable_kparams->usr = cpuClosure->mUsrPtr;
76
77        cpuClosure->mFunc(kparams, xstart, xend, ostep);
78    }
79
80    mutable_kparams->ins        = oldIns;
81    mutable_kparams->inEStrides = oldStrides;
82    mutable_kparams->usr        = &closures;
83}
84
85}  // namespace
86
87Batch::~Batch() {
88    for (CPUClosure* c : mClosures) {
89        delete c;
90    }
91    if (mScriptObj) {
92        dlclose(mScriptObj);
93    }
94}
95
96bool Batch::conflict(CPUClosure* cpuClosure) const {
97    if (mClosures.empty()) {
98        return false;
99    }
100
101    const Closure* closure = cpuClosure->mClosure;
102
103    if (closure->mKernelID.get() == nullptr ||
104        mClosures.front()->mClosure->mKernelID.get() == nullptr) {
105        // An invoke should be in a batch by itself, so it conflicts with any other
106        // closure.
107        return true;
108    }
109
110    const auto& globalDeps = closure->mGlobalDeps;
111    const auto& argDeps = closure->mArgDeps;
112
113    for (CPUClosure* c : mClosures) {
114        const Closure* batched = c->mClosure;
115        if (globalDeps.find(batched) != globalDeps.end()) {
116            return true;
117        }
118        const auto& it = argDeps.find(batched);
119        if (it != argDeps.end()) {
120            const auto& args = (*it).second;
121            for (const auto &p1 : *args) {
122                if (p1.second->get() != nullptr) {
123                    return true;
124                }
125            }
126        }
127    }
128
129    return false;
130}
131
132CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
133                                         const ScriptGroupBase *sg) :
134    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)) {
135    rsAssert(!mGroup->mClosures.empty());
136
137    Batch* batch = new Batch(this);
138    for (Closure* closure: mGroup->mClosures) {
139        const ScriptKernelID* kernelID = closure->mKernelID.get();
140        RsdCpuScriptImpl* si;
141        CPUClosure* cc;
142        if (kernelID != nullptr) {
143            si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(kernelID->mScript);
144            MTLaunchStruct mtls;
145            si->forEachKernelSetup(kernelID->mSlot, &mtls);
146            // TODO: Is mtls.fep.usrLen ever used?
147            cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel,
148                                mtls.fep.usr, mtls.fep.usrLen);
149        } else {
150            si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(
151                    closure->mInvokeID->mScript);
152            cc = new CPUClosure(closure, si);
153        }
154
155        if (batch->conflict(cc)) {
156            mBatches.push_back(batch);
157            batch = new Batch(this);
158        }
159
160        batch->mClosures.push_back(cc);
161    }
162
163    rsAssert(!batch->mClosures.empty());
164    mBatches.push_back(batch);
165
166#ifndef RS_COMPATIBILITY_LIB
167    for (Batch* batch : mBatches) {
168        batch->tryToCreateFusedKernel(mGroup->mCacheDir);
169    }
170#endif
171}
172
173CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
174    for (Batch* batch : mBatches) {
175        delete batch;
176    }
177}
178
179namespace {
180
181#ifndef RS_COMPATIBILITY_LIB
182
// Returns the part of `path` after the last '/' or '\\' separator. A path
// with no separator is returned unchanged; a path ending in a separator
// yields an empty string.
std::string getFileName(std::string path) {
    // Keep the full-width index type: narrowing it (e.g. to unsigned) would
    // make the "not found" case depend on wraparound of a truncated npos.
    const std::string::size_type lastSeparator = path.find_last_of("/\\");
    if (lastSeparator == std::string::npos) {
        return path;
    }
    return path.substr(lastSeparator + 1);
}
187
188void setupCompileArguments(
189        const vector<string>& inputs, const vector<int>& kernels,
190        const string& output_dir, const string& output_filename,
191        const string& rsLib, vector<const char*>* args) {
192    args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH);
193    args->push_back("-fPIC");
194    args->push_back("-embedRSInfo");
195    args->push_back("-mtriple");
196    args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
197    args->push_back("-bclib");
198    args->push_back(rsLib.c_str());
199    for (const string& input : inputs) {
200        args->push_back(input.c_str());
201    }
202    for (int kernel : kernels) {
203        args->push_back("-k");
204        string strKernel = std::to_string(kernel);
205        args->push_back(strKernel.c_str());
206    }
207    args->push_back("-output_path");
208    args->push_back(output_dir.c_str());
209    args->push_back("-o");
210    args->push_back(output_filename.c_str());
211    args->push_back(nullptr);
212}
213
214bool fuseAndCompile(const char** arguments,
215                    const string& commandLine) {
216    const pid_t pid = fork();
217
218    if (pid == -1) {
219        ALOGE("Couldn't fork for bcc execution");
220        return false;
221    }
222
223    if (pid == 0) {
224        // Child process
225        ALOGV("Invoking BCC with: %s", commandLine.c_str());
226        execv(RsdCpuScriptImpl::BCC_EXE_PATH, (char* const*)arguments);
227
228        ALOGE("execv() failed: %s", strerror(errno));
229        abort();
230        return false;
231    }
232
233    // Parent process
234    int status = 0;
235    const pid_t w = waitpid(pid, &status, 0);
236    if (w == -1) {
237        return false;
238    }
239
240    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0 ) {
241        ALOGE("bcc terminated unexpectedly");
242        return false;
243    }
244
245    return true;
246}
247#endif
248
249}  // anonymous namespace
250
251void Batch::tryToCreateFusedKernel(const char *cacheDir) {
252#ifndef RS_COMPATIBILITY_LIB
253    if (mClosures.size() < 2) {
254        return;
255    }
256
257    //===--------------------------------------------------------------------===//
258    // Fuse the input kernels and generate native code in an object file
259    //===--------------------------------------------------------------------===//
260
261    std::vector<string> inputFiles;
262    std::vector<int> slots;
263
264    for (CPUClosure* cpuClosure : mClosures) {
265        const Closure* closure = cpuClosure->mClosure;
266        const ScriptKernelID* kernelID = closure->mKernelID.get();
267        const Script* script = kernelID->mScript;
268
269        if (script->isIntrinsic()) {
270            return;
271        }
272
273        const RsdCpuScriptImpl *cpuScript =
274                (const RsdCpuScriptImpl*)script->mHal.drv;
275
276        const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
277
278        inputFiles.push_back(bitcodeFilename);
279        slots.push_back(kernelID->mSlot);
280    }
281
282    string outputPath(tempnam(cacheDir, "fused"));
283    string outputFileName = getFileName(outputPath);
284    string objFilePath(outputPath);
285    objFilePath.append(".o");
286    string rsLibPath(SYSLIBPATH"/libclcore.bc");
287    vector<const char*> arguments;
288    setupCompileArguments(inputFiles, slots, cacheDir, outputFileName, rsLibPath,
289                          &arguments);
290    std::unique_ptr<const char> joined(
291        rsuJoinStrings(arguments.size() - 1, arguments.data()));
292    string commandLine (joined.get());
293
294    if (!fuseAndCompile(arguments.data(), commandLine)) {
295        return;
296    }
297
298    //===--------------------------------------------------------------------===//
299    // Create and load the shared lib
300    //===--------------------------------------------------------------------===//
301
302    const char* resName = outputFileName.c_str();
303
304    if (!SharedLibraryUtils::createSharedLibrary(cacheDir, resName)) {
305        ALOGE("Failed to link object file '%s'", resName);
306        return;
307    }
308
309    void* mSharedObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
310    if (mSharedObj == nullptr) {
311        ALOGE("Unable to load '%s'", resName);
312        return;
313    }
314
315    mExecutable = ScriptExecutable::createFromSharedObject(
316                                                           nullptr,  // RS context. Unused.
317                                                           mSharedObj);
318
319#endif  // RS_COMPATIBILITY_LIB
320}
321
322void CpuScriptGroup2Impl::execute() {
323    for (auto batch : mBatches) {
324        batch->setGlobalsForBatch();
325        batch->run();
326    }
327}
328
329void Batch::setGlobalsForBatch() {
330    for (CPUClosure* cpuClosure : mClosures) {
331        const Closure* closure = cpuClosure->mClosure;
332        const ScriptKernelID* kernelID = closure->mKernelID.get();
333        Script* s;
334        if (kernelID != nullptr) {
335            s = kernelID->mScript;
336        } else {
337            s = cpuClosure->mClosure->mInvokeID->mScript;
338        }
339        for (const auto& p : closure->mGlobals) {
340            const void* value = p.second.first;
341            int size = p.second.second;
342            if (value == nullptr && size == 0) {
343                // This indicates the current closure depends on another closure for a
344                // global in their shared module (script). In this case we don't need to
345                // copy the value. For example, an invoke intializes a global variable
346                // which a kernel later reads.
347                continue;
348            }
349            rsAssert(p.first != nullptr);
350            ALOGV("Evaluating closure %p, setting field %p (Script %p, slot: %d)",
351                  closure, p.first, p.first->mScript, p.first->mSlot);
352            // We use -1 size to indicate an ObjectBase rather than a primitive type
353            if (size < 0) {
354                s->setVarObj(p.first->mSlot, (ObjectBase*)value);
355            } else {
356                s->setVar(p.first->mSlot, (const void*)&value, size);
357            }
358        }
359    }
360}
361
362void Batch::run() {
363    if (mExecutable != nullptr) {
364        MTLaunchStruct mtls;
365        const CPUClosure* firstCpuClosure = mClosures.front();
366        const CPUClosure* lastCpuClosure = mClosures.back();
367
368        firstCpuClosure->mSi->forEachMtlsSetup(
369                (const Allocation**)firstCpuClosure->mClosure->mArgs,
370                firstCpuClosure->mClosure->mNumArg,
371                lastCpuClosure->mClosure->mReturnValue,
372                nullptr, 0, nullptr, &mtls);
373
374        mtls.script = nullptr;
375        mtls.fep.usr = nullptr;
376        mtls.kernel = mExecutable->getForEachFunction(0);
377
378        mGroup->getCpuRefImpl()->launchThreads(
379                (const Allocation**)firstCpuClosure->mClosure->mArgs,
380                firstCpuClosure->mClosure->mNumArg,
381                lastCpuClosure->mClosure->mReturnValue,
382                nullptr, &mtls);
383
384        return;
385    }
386
387    if (mClosures.size() == 1 &&
388        mClosures.front()->mClosure->mKernelID.get() == nullptr) {
389        // This closure is for an invoke function
390        CPUClosure* cc = mClosures.front();
391        const Closure* c = cc->mClosure;
392        const ScriptInvokeID* invokeID = c->mInvokeID;
393        rsAssert(invokeID != nullptr);
394        cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength);
395        return;
396    }
397
398    for (CPUClosure* cpuClosure : mClosures) {
399        const Closure* closure = cpuClosure->mClosure;
400        const ScriptKernelID* kernelID = closure->mKernelID.get();
401        cpuClosure->mSi->preLaunch(kernelID->mSlot,
402                                   (const Allocation**)closure->mArgs,
403                                   closure->mNumArg, closure->mReturnValue,
404                                   cpuClosure->mUsrPtr, cpuClosure->mUsrSize,
405                                   nullptr);
406    }
407
408    const CPUClosure* cpuClosure = mClosures.front();
409    const Closure* closure = cpuClosure->mClosure;
410    MTLaunchStruct mtls;
411
412    if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs,
413                                          closure->mNumArg,
414                                          closure->mReturnValue,
415                                          nullptr, 0, nullptr, &mtls)) {
416
417        mtls.script = nullptr;
418        mtls.kernel = (void (*)())&groupRoot;
419        mtls.fep.usr = &mClosures;
420
421        mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
422    }
423
424    for (CPUClosure* cpuClosure : mClosures) {
425        const Closure* closure = cpuClosure->mClosure;
426        const ScriptKernelID* kernelID = closure->mKernelID.get();
427        cpuClosure->mSi->postLaunch(kernelID->mSlot,
428                                    (const Allocation**)closure->mArgs,
429                                    closure->mNumArg, closure->mReturnValue,
430                                    nullptr, 0, nullptr);
431    }
432}
433
434}  // namespace renderscript
435}  // namespace android
436