rsCpuScriptGroup2.cpp revision 433558f0f9abbf07770db288183a15fd261cace2
1#include "rsCpuScriptGroup2.h"
2
3#include <dlfcn.h>
4#include <stdio.h>
5#include <stdlib.h>
6#include <unistd.h>
7
8#include <string>
9#include <vector>
10
11#ifndef RS_COMPATIBILITY_LIB
12#include "bcc/Config/Config.h"
13#include <sys/wait.h>
14#endif
15
16#include "cpu_ref/rsCpuCore.h"
17#include "rsClosure.h"
18#include "rsContext.h"
19#include "rsCpuCore.h"
20#include "rsCpuScript.h"
21#include "rsScript.h"
22#include "rsScriptGroup2.h"
23#include "rsScriptIntrinsic.h"
24
25using std::string;
26using std::vector;
27
28namespace android {
29namespace renderscript {
30
31namespace {
32
33const size_t DefaultKernelArgCount = 2;
34
35void groupRoot(const RsExpandKernelParams *kparams, uint32_t xstart,
36               uint32_t xend, uint32_t outstep) {
37    const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kparams->usr;
38    RsExpandKernelParams *mutable_kparams = (RsExpandKernelParams *)kparams;
39    const void **oldIns  = kparams->ins;
40    uint32_t *oldStrides = kparams->inEStrides;
41
42    std::vector<const void*> ins(DefaultKernelArgCount);
43    std::vector<uint32_t> strides(DefaultKernelArgCount);
44
45    for (CPUClosure* cpuClosure : closures) {
46        const Closure* closure = cpuClosure->mClosure;
47
48        auto in_iter = ins.begin();
49        auto stride_iter = strides.begin();
50
51        for (size_t i = 0; i < closure->mNumArg; i++) {
52            const void* arg = closure->mArgs[i];
53            const Allocation* a = (const Allocation*)arg;
54            const uint32_t eStride = a->mHal.state.elementSizeBytes;
55            const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) +
56                    eStride * xstart;
57            if (kparams->dimY > 1) {
58                ptr += a->mHal.drvState.lod[0].stride * kparams->y;
59            }
60            *in_iter++ = ptr;
61            *stride_iter++ = eStride;
62        }
63
64        mutable_kparams->ins = &ins[0];
65        mutable_kparams->inEStrides = &strides[0];
66
67        const Allocation* out = closure->mReturnValue;
68        const uint32_t ostep = out->mHal.state.elementSizeBytes;
69        const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) +
70                ostep * xstart;
71        if (kparams->dimY > 1) {
72            ptr += out->mHal.drvState.lod[0].stride * kparams->y;
73        }
74
75        mutable_kparams->out = (void*)ptr;
76
77        mutable_kparams->usr = cpuClosure->mUsrPtr;
78
79        cpuClosure->mFunc(kparams, xstart, xend, ostep);
80    }
81
82    mutable_kparams->ins        = oldIns;
83    mutable_kparams->inEStrides = oldStrides;
84    mutable_kparams->usr        = &closures;
85}
86
87}  // namespace
88
89Batch::~Batch() {
90    for (CPUClosure* c : mClosures) {
91        delete c;
92    }
93    if (mScriptObj) {
94        dlclose(mScriptObj);
95    }
96}
97
98bool Batch::conflict(CPUClosure* cpuClosure) const {
99    if (mClosures.empty()) {
100        return false;
101    }
102
103    const Closure* closure = cpuClosure->mClosure;
104
105    if (closure->mKernelID.get() == nullptr ||
106        mClosures.front()->mClosure->mKernelID.get() == nullptr) {
107        // An invoke should be in a batch by itself, so it conflicts with any other
108        // closure.
109        return true;
110    }
111
112    const auto& globalDeps = closure->mGlobalDeps;
113    const auto& argDeps = closure->mArgDeps;
114
115    for (CPUClosure* c : mClosures) {
116        const Closure* batched = c->mClosure;
117        if (globalDeps.find(batched) != globalDeps.end()) {
118            return true;
119        }
120        const auto& it = argDeps.find(batched);
121        if (it != argDeps.end()) {
122            const auto& args = (*it).second;
123            for (const auto &p1 : *args) {
124                if (p1.second->get() != nullptr) {
125                    return true;
126                }
127            }
128        }
129    }
130
131    return false;
132}
133
134CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
135                                         const ScriptGroupBase *sg) :
136    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)) {
137    rsAssert(!mGroup->mClosures.empty());
138
139    Batch* batch = new Batch(this);
140    for (Closure* closure: mGroup->mClosures) {
141        const ScriptKernelID* kernelID = closure->mKernelID.get();
142        RsdCpuScriptImpl* si;
143        CPUClosure* cc;
144        if (kernelID != nullptr) {
145            si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(kernelID->mScript);
146            MTLaunchStruct mtls;
147            si->forEachKernelSetup(kernelID->mSlot, &mtls);
148            // TODO: Is mtls.fep.usrLen ever used?
149            cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel,
150                                mtls.fep.usr, mtls.fep.usrLen);
151        } else {
152            si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(
153                    closure->mInvokeID->mScript);
154            cc = new CPUClosure(closure, si);
155        }
156
157        if (batch->conflict(cc)) {
158            mBatches.push_back(batch);
159            batch = new Batch(this);
160        }
161
162        batch->mClosures.push_back(cc);
163    }
164
165    rsAssert(!batch->mClosures.empty());
166    mBatches.push_back(batch);
167
168#ifndef RS_COMPATIBILITY_LIB
169    for (Batch* batch : mBatches) {
170        batch->tryToCreateFusedKernel(mGroup->mCacheDir);
171    }
172#endif
173}
174
175CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
176    for (Batch* batch : mBatches) {
177        delete batch;
178    }
179}
180
181namespace {
182
183#ifndef RS_COMPATIBILITY_LIB
184
// Returns the last component of path: everything after the final '/' or '\\'.
// A path without any separator is returned unchanged, and a path that ends in
// a separator yields an empty string.
string getFileName(string path) {
    // Keep the full-width index type; narrowing it would truncate npos and
    // any offset that does not fit in the narrower type.
    const string::size_type lastSeparator = path.find_last_of("/\\");
    if (lastSeparator == string::npos) {
        return path;
    }
    return path.substr(lastSeparator + 1);
}
189
190void setupCompileArguments(
191        const vector<string>& inputs, const vector<int>& kernels,
192        const string& output_dir, const string& output_filename,
193        const string& rsLib, vector<const char*>* args) {
194    args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH);
195    args->push_back("-fPIC");
196    args->push_back("-embedRSInfo");
197    args->push_back("-mtriple");
198    args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
199    args->push_back("-bclib");
200    args->push_back(rsLib.c_str());
201    for (const string& input : inputs) {
202        args->push_back(input.c_str());
203    }
204    for (int kernel : kernels) {
205        args->push_back("-k");
206        string strKernel = std::to_string(kernel);
207        args->push_back(strKernel.c_str());
208    }
209    args->push_back("-output_path");
210    args->push_back(output_dir.c_str());
211    args->push_back("-o");
212    args->push_back(output_filename.c_str());
213    args->push_back(nullptr);
214}
215
// Joins the first n C strings of strs with single spaces.
// Returns an empty string when n is not positive or strs is null.
string convertListToString(int n, const char* const* strs) {
    string ret;
    if (n <= 0 || strs == nullptr) {
        // Nothing to join; in particular do not touch strs[0].
        return ret;
    }

    ret.append(strs[0]);
    for (int i = 1; i < n; i++) {
        ret.append(" ");
        ret.append(strs[i]);
    }
    return ret;
}
225
226bool fuseAndCompile(const char** arguments,
227                    const string& commandLine) {
228    const pid_t pid = fork();
229
230    if (pid == -1) {
231        ALOGE("Couldn't fork for bcc execution");
232        return false;
233    }
234
235    if (pid == 0) {
236        // Child process
237        ALOGV("Invoking BCC with: %s", commandLine.c_str());
238        execv(RsdCpuScriptImpl::BCC_EXE_PATH, (char* const*)arguments);
239
240        ALOGE("execv() failed: %s", strerror(errno));
241        abort();
242        return false;
243    }
244
245    // Parent process
246    int status = 0;
247    const pid_t w = waitpid(pid, &status, 0);
248    if (w == -1) {
249        return false;
250    }
251
252    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0 ) {
253        ALOGE("bcc terminated unexpectedly");
254        return false;
255    }
256
257    return true;
258}
259#endif
260
261}  // anonymous namespace
262
263void Batch::tryToCreateFusedKernel(const char *cacheDir) {
264#ifndef RS_COMPATIBILITY_LIB
265    if (mClosures.size() < 2) {
266        return;
267    }
268
269    //===--------------------------------------------------------------------===//
270    // Fuse the input kernels and generate native code in an object file
271    //===--------------------------------------------------------------------===//
272
273    std::vector<string> inputFiles;
274    std::vector<int> slots;
275
276    for (CPUClosure* cpuClosure : mClosures) {
277        const Closure* closure = cpuClosure->mClosure;
278        const ScriptKernelID* kernelID = closure->mKernelID.get();
279        const Script* script = kernelID->mScript;
280
281        if (script->isIntrinsic()) {
282            return;
283        }
284
285        const RsdCpuScriptImpl *cpuScript =
286                (const RsdCpuScriptImpl*)script->mHal.drv;
287
288        const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
289
290        inputFiles.push_back(bitcodeFilename);
291        slots.push_back(kernelID->mSlot);
292    }
293
294    rsAssert(cacheDir != nullptr);
295    string objFilePath(cacheDir);
296    objFilePath.append("/fusedXXXXXX.o");
297    // Find unique object file name, to make following file names unique.
298    int tempfd = mkstemps(&objFilePath[0], 2);
299    if (tempfd == -1) {
300      return;
301    }
302    TEMP_FAILURE_RETRY(close(tempfd));
303
304    string outputFileName = getFileName(objFilePath.substr(0, objFilePath.size() - 2));
305    string rsLibPath(SYSLIBPATH"/libclcore.bc");
306    vector<const char*> arguments;
307    setupCompileArguments(inputFiles, slots, cacheDir, outputFileName, rsLibPath,
308                          &arguments);
309    string commandLine =
310            convertListToString(arguments.size() - 1, arguments.data());
311
312    if (!fuseAndCompile(arguments.data(), commandLine)) {
313        unlink(objFilePath.c_str());
314        return;
315    }
316
317    //===--------------------------------------------------------------------===//
318    // Create and load the shared lib
319    //===--------------------------------------------------------------------===//
320
321    const char* resName = outputFileName.c_str();
322
323    if (!SharedLibraryUtils::createSharedLibrary(cacheDir, resName)) {
324        ALOGE("Failed to link object file '%s'", resName);
325        return;
326    }
327
328    void* mSharedObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
329    if (mSharedObj == nullptr) {
330        ALOGE("Unable to load '%s'", resName);
331        return;
332    }
333
334    mExecutable = ScriptExecutable::createFromSharedObject(
335                                                           nullptr,  // RS context. Unused.
336                                                           mSharedObj);
337
338#endif  // RS_COMPATIBILITY_LIB
339}
340
341void CpuScriptGroup2Impl::execute() {
342    for (auto batch : mBatches) {
343        batch->setGlobalsForBatch();
344        batch->run();
345    }
346}
347
348void Batch::setGlobalsForBatch() {
349    for (CPUClosure* cpuClosure : mClosures) {
350        const Closure* closure = cpuClosure->mClosure;
351        const ScriptKernelID* kernelID = closure->mKernelID.get();
352        Script* s;
353        if (kernelID != nullptr) {
354            s = kernelID->mScript;
355        } else {
356            s = cpuClosure->mClosure->mInvokeID->mScript;
357        }
358        for (const auto& p : closure->mGlobals) {
359            const void* value = p.second.first;
360            int size = p.second.second;
361            if (value == nullptr && size == 0) {
362                // This indicates the current closure depends on another closure for a
363                // global in their shared module (script). In this case we don't need to
364                // copy the value. For example, an invoke intializes a global variable
365                // which a kernel later reads.
366                continue;
367            }
368            rsAssert(p.first != nullptr);
369            ALOGV("Evaluating closure %p, setting field %p (Script %p, slot: %d)",
370                  closure, p.first, p.first->mScript, p.first->mSlot);
371            // We use -1 size to indicate an ObjectBase rather than a primitive type
372            if (size < 0) {
373                s->setVarObj(p.first->mSlot, (ObjectBase*)value);
374            } else {
375                s->setVar(p.first->mSlot, (const void*)&value, size);
376            }
377        }
378    }
379}
380
381void Batch::run() {
382    if (mExecutable != nullptr) {
383        MTLaunchStruct mtls;
384        const CPUClosure* firstCpuClosure = mClosures.front();
385        const CPUClosure* lastCpuClosure = mClosures.back();
386
387        firstCpuClosure->mSi->forEachMtlsSetup(
388                (const Allocation**)firstCpuClosure->mClosure->mArgs,
389                firstCpuClosure->mClosure->mNumArg,
390                lastCpuClosure->mClosure->mReturnValue,
391                nullptr, 0, nullptr, &mtls);
392
393        mtls.script = nullptr;
394        mtls.fep.usr = nullptr;
395        mtls.kernel = mExecutable->getForEachFunction(0);
396
397        mGroup->getCpuRefImpl()->launchThreads(
398                (const Allocation**)firstCpuClosure->mClosure->mArgs,
399                firstCpuClosure->mClosure->mNumArg,
400                lastCpuClosure->mClosure->mReturnValue,
401                nullptr, &mtls);
402
403        return;
404    }
405
406    if (mClosures.size() == 1 &&
407        mClosures.front()->mClosure->mKernelID.get() == nullptr) {
408        // This closure is for an invoke function
409        CPUClosure* cc = mClosures.front();
410        const Closure* c = cc->mClosure;
411        const ScriptInvokeID* invokeID = c->mInvokeID;
412        rsAssert(invokeID != nullptr);
413        cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength);
414        return;
415    }
416
417    for (CPUClosure* cpuClosure : mClosures) {
418        const Closure* closure = cpuClosure->mClosure;
419        const ScriptKernelID* kernelID = closure->mKernelID.get();
420        cpuClosure->mSi->preLaunch(kernelID->mSlot,
421                                   (const Allocation**)closure->mArgs,
422                                   closure->mNumArg, closure->mReturnValue,
423                                   cpuClosure->mUsrPtr, cpuClosure->mUsrSize,
424                                   nullptr);
425    }
426
427    const CPUClosure* cpuClosure = mClosures.front();
428    const Closure* closure = cpuClosure->mClosure;
429    MTLaunchStruct mtls;
430
431    if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs,
432                                          closure->mNumArg,
433                                          closure->mReturnValue,
434                                          nullptr, 0, nullptr, &mtls)) {
435
436        mtls.script = nullptr;
437        mtls.kernel = (void (*)())&groupRoot;
438        mtls.fep.usr = &mClosures;
439
440        mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
441    }
442
443    for (CPUClosure* cpuClosure : mClosures) {
444        const Closure* closure = cpuClosure->mClosure;
445        const ScriptKernelID* kernelID = closure->mKernelID.get();
446        cpuClosure->mSi->postLaunch(kernelID->mSlot,
447                                    (const Allocation**)closure->mArgs,
448                                    closure->mNumArg, closure->mReturnValue,
449                                    nullptr, 0, nullptr);
450    }
451}
452
453}  // namespace renderscript
454}  // namespace android
455