rsCpuScriptGroup2.cpp revision ff2bb54ebf593b1d19d3a2e4cfa70a8ea4432c0d
#include "rsCpuScriptGroup2.h"

#include <dlfcn.h>
#include <stdlib.h>

#include <string>
#include <vector>

#ifndef RS_COMPATIBILITY_LIB
#include "bcc/Config/Config.h"
#include <sys/wait.h>
#endif

#include "cpu_ref/rsCpuCore.h"
#include "rsClosure.h"
#include "rsContext.h"
#include "rsCpuCore.h"
#include "rsCpuScript.h"
#include "rsScript.h"
#include "rsScriptGroup2.h"
#include "rsScriptIntrinsic.h"
21
22using std::string;
23using std::vector;
24
25namespace android {
26namespace renderscript {
27
28namespace {
29
30const size_t DefaultKernelArgCount = 2;
31
32void groupRoot(const RsExpandKernelParams *kparams, uint32_t xstart,
33               uint32_t xend, uint32_t outstep) {
34    const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kparams->usr;
35    RsExpandKernelParams *mutable_kparams = (RsExpandKernelParams *)kparams;
36    const void **oldIns  = kparams->ins;
37    uint32_t *oldStrides = kparams->inEStrides;
38
39    std::vector<const void*> ins(DefaultKernelArgCount);
40    std::vector<uint32_t> strides(DefaultKernelArgCount);
41
42    for (CPUClosure* cpuClosure : closures) {
43        const Closure* closure = cpuClosure->mClosure;
44
45        auto in_iter = ins.begin();
46        auto stride_iter = strides.begin();
47
48        for (size_t i = 0; i < closure->mNumArg; i++) {
49            const void* arg = closure->mArgs[i];
50            const Allocation* a = (const Allocation*)arg;
51            const uint32_t eStride = a->mHal.state.elementSizeBytes;
52            const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) +
53                    eStride * xstart;
54            if (kparams->dimY > 1) {
55                ptr += a->mHal.drvState.lod[0].stride * kparams->y;
56            }
57            *in_iter++ = ptr;
58            *stride_iter++ = eStride;
59        }
60
61        mutable_kparams->ins = &ins[0];
62        mutable_kparams->inEStrides = &strides[0];
63
64        const Allocation* out = closure->mReturnValue;
65        const uint32_t ostep = out->mHal.state.elementSizeBytes;
66        const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) +
67                ostep * xstart;
68        if (kparams->dimY > 1) {
69            ptr += out->mHal.drvState.lod[0].stride * kparams->y;
70        }
71
72        mutable_kparams->out = (void*)ptr;
73
74        mutable_kparams->usr = cpuClosure->mUsrPtr;
75
76        cpuClosure->mFunc(kparams, xstart, xend, ostep);
77    }
78
79    mutable_kparams->ins        = oldIns;
80    mutable_kparams->inEStrides = oldStrides;
81    mutable_kparams->usr        = &closures;
82}
83
84}  // namespace
85
86Batch::~Batch() {
87    for (CPUClosure* c : mClosures) {
88        delete c;
89    }
90    if (mScriptObj) {
91        dlclose(mScriptObj);
92    }
93}
94
95bool Batch::conflict(CPUClosure* cpuClosure) const {
96    if (mClosures.empty()) {
97        return false;
98    }
99
100    const Closure* closure = cpuClosure->mClosure;
101
102    if (closure->mKernelID.get() == nullptr ||
103        mClosures.front()->mClosure->mKernelID.get() == nullptr) {
104        // An invoke should be in a batch by itself, so it conflicts with any other
105        // closure.
106        return true;
107    }
108
109    const auto& globalDeps = closure->mGlobalDeps;
110    const auto& argDeps = closure->mArgDeps;
111
112    for (CPUClosure* c : mClosures) {
113        const Closure* batched = c->mClosure;
114        if (globalDeps.find(batched) != globalDeps.end()) {
115            return true;
116        }
117        const auto& it = argDeps.find(batched);
118        if (it != argDeps.end()) {
119            const auto& args = (*it).second;
120            for (const auto &p1 : *args) {
121                if (p1.second->get() != nullptr) {
122                    return true;
123                }
124            }
125        }
126    }
127
128    return false;
129}
130
131CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
132                                         const ScriptGroupBase *sg) :
133    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)) {
134    rsAssert(!mGroup->mClosures.empty());
135
136    Batch* batch = new Batch(this);
137    for (Closure* closure: mGroup->mClosures) {
138        const ScriptKernelID* kernelID = closure->mKernelID.get();
139        RsdCpuScriptImpl* si;
140        CPUClosure* cc;
141        if (kernelID != nullptr) {
142            si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(kernelID->mScript);
143            MTLaunchStruct mtls;
144            si->forEachKernelSetup(kernelID->mSlot, &mtls);
145            // TODO: Is mtls.fep.usrLen ever used?
146            cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel,
147                                mtls.fep.usr, mtls.fep.usrLen);
148        } else {
149            si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(
150                    closure->mInvokeID->mScript);
151            cc = new CPUClosure(closure, si);
152        }
153
154        if (batch->conflict(cc)) {
155            mBatches.push_back(batch);
156            batch = new Batch(this);
157        }
158
159        batch->mClosures.push_back(cc);
160    }
161
162    rsAssert(!batch->mClosures.empty());
163    mBatches.push_back(batch);
164
165#ifndef RS_COMPATIBILITY_LIB
166    for (Batch* batch : mBatches) {
167        batch->tryToCreateFusedKernel(mGroup->mCacheDir);
168    }
169#endif
170}
171
172CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
173    for (Batch* batch : mBatches) {
174        delete batch;
175    }
176}
177
178namespace {
179
180#ifndef RS_COMPATIBILITY_LIB
181
182string getFileName(string path) {
183    unsigned found = path.find_last_of("/\\");
184    return path.substr(found + 1);
185}
186
187void setupCompileArguments(
188        const vector<string>& inputs, const vector<int>& kernels,
189        const string& output_dir, const string& output_filename,
190        const string& rsLib, vector<const char*>* args) {
191    args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH);
192    args->push_back("-fPIC");
193    args->push_back("-embedRSInfo");
194    args->push_back("-mtriple");
195    args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
196    args->push_back("-bclib");
197    args->push_back(rsLib.c_str());
198    for (const string& input : inputs) {
199        args->push_back(input.c_str());
200    }
201    for (int kernel : kernels) {
202        args->push_back("-k");
203        string strKernel = std::to_string(kernel);
204        args->push_back(strKernel.c_str());
205    }
206    args->push_back("-output_path");
207    args->push_back(output_dir.c_str());
208    args->push_back("-o");
209    args->push_back(output_filename.c_str());
210    args->push_back(nullptr);
211}
212
213string convertListToString(int n, const char* const* strs) {
214    string ret;
215    ret.append(strs[0]);
216    for (int i = 1; i < n; i++) {
217        ret.append(" ");
218        ret.append(strs[i]);
219    }
220    return ret;
221}
222
223bool fuseAndCompile(const char** arguments,
224                    const string& commandLine) {
225    const pid_t pid = fork();
226
227    if (pid == -1) {
228        ALOGE("Couldn't fork for bcc execution");
229        return false;
230    }
231
232    if (pid == 0) {
233        // Child process
234        ALOGV("Invoking BCC with: %s", commandLine.c_str());
235        execv(RsdCpuScriptImpl::BCC_EXE_PATH, (char* const*)arguments);
236
237        ALOGE("execv() failed: %s", strerror(errno));
238        abort();
239        return false;
240    }
241
242    // Parent process
243    int status = 0;
244    const pid_t w = waitpid(pid, &status, 0);
245    if (w == -1) {
246        return false;
247    }
248
249    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0 ) {
250        ALOGE("bcc terminated unexpectedly");
251        return false;
252    }
253
254    return true;
255}
256#endif
257
258}  // anonymous namespace
259
260void Batch::tryToCreateFusedKernel(const char *cacheDir) {
261#ifndef RS_COMPATIBILITY_LIB
262    if (mClosures.size() < 2) {
263        return;
264    }
265
266    //===--------------------------------------------------------------------===//
267    // Fuse the input kernels and generate native code in an object file
268    //===--------------------------------------------------------------------===//
269
270    std::vector<string> inputFiles;
271    std::vector<int> slots;
272
273    for (CPUClosure* cpuClosure : mClosures) {
274        const Closure* closure = cpuClosure->mClosure;
275        const ScriptKernelID* kernelID = closure->mKernelID.get();
276        const Script* script = kernelID->mScript;
277
278        if (script->isIntrinsic()) {
279            return;
280        }
281
282        const RsdCpuScriptImpl *cpuScript =
283                (const RsdCpuScriptImpl*)script->mHal.drv;
284
285        const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
286
287        inputFiles.push_back(bitcodeFilename);
288        slots.push_back(kernelID->mSlot);
289    }
290
291    string outputPath(tempnam(cacheDir, "fused"));
292    string outputFileName = getFileName(outputPath);
293    string objFilePath(outputPath);
294    objFilePath.append(".o");
295    string rsLibPath(SYSLIBPATH"/libclcore.bc");
296    vector<const char*> arguments;
297    setupCompileArguments(inputFiles, slots, cacheDir, outputFileName, rsLibPath,
298                          &arguments);
299    string commandLine =
300            convertListToString(arguments.size() - 1, arguments.data());
301
302    if (!fuseAndCompile(arguments.data(), commandLine)) {
303        return;
304    }
305
306    //===--------------------------------------------------------------------===//
307    // Create and load the shared lib
308    //===--------------------------------------------------------------------===//
309
310    const char* resName = outputFileName.c_str();
311
312    if (!SharedLibraryUtils::createSharedLibrary(cacheDir, resName)) {
313        ALOGE("Failed to link object file '%s'", resName);
314        return;
315    }
316
317    void* mSharedObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
318    if (mSharedObj == nullptr) {
319        ALOGE("Unable to load '%s'", resName);
320        return;
321    }
322
323    mExecutable = ScriptExecutable::createFromSharedObject(
324                                                           nullptr,  // RS context. Unused.
325                                                           mSharedObj);
326
327#endif  // RS_COMPATIBILITY_LIB
328}
329
330void CpuScriptGroup2Impl::execute() {
331    for (auto batch : mBatches) {
332        batch->setGlobalsForBatch();
333        batch->run();
334    }
335}
336
337void Batch::setGlobalsForBatch() {
338    for (CPUClosure* cpuClosure : mClosures) {
339        const Closure* closure = cpuClosure->mClosure;
340        const ScriptKernelID* kernelID = closure->mKernelID.get();
341        Script* s;
342        if (kernelID != nullptr) {
343            s = kernelID->mScript;
344        } else {
345            s = cpuClosure->mClosure->mInvokeID->mScript;
346        }
347        for (const auto& p : closure->mGlobals) {
348            const void* value = p.second.first;
349            int size = p.second.second;
350            if (value == nullptr && size == 0) {
351                // This indicates the current closure depends on another closure for a
352                // global in their shared module (script). In this case we don't need to
353                // copy the value. For example, an invoke intializes a global variable
354                // which a kernel later reads.
355                continue;
356            }
357            rsAssert(p.first != nullptr);
358            ALOGV("Evaluating closure %p, setting field %p (Script %p, slot: %d)",
359                  closure, p.first, p.first->mScript, p.first->mSlot);
360            // We use -1 size to indicate an ObjectBase rather than a primitive type
361            if (size < 0) {
362                s->setVarObj(p.first->mSlot, (ObjectBase*)value);
363            } else {
364                s->setVar(p.first->mSlot, (const void*)&value, size);
365            }
366        }
367    }
368}
369
370void Batch::run() {
371    if (mExecutable != nullptr) {
372        MTLaunchStruct mtls;
373        const CPUClosure* firstCpuClosure = mClosures.front();
374        const CPUClosure* lastCpuClosure = mClosures.back();
375
376        firstCpuClosure->mSi->forEachMtlsSetup(
377                (const Allocation**)firstCpuClosure->mClosure->mArgs,
378                firstCpuClosure->mClosure->mNumArg,
379                lastCpuClosure->mClosure->mReturnValue,
380                nullptr, 0, nullptr, &mtls);
381
382        mtls.script = nullptr;
383        mtls.fep.usr = nullptr;
384        mtls.kernel = mExecutable->getForEachFunction(0);
385
386        mGroup->getCpuRefImpl()->launchThreads(
387                (const Allocation**)firstCpuClosure->mClosure->mArgs,
388                firstCpuClosure->mClosure->mNumArg,
389                lastCpuClosure->mClosure->mReturnValue,
390                nullptr, &mtls);
391
392        return;
393    }
394
395    if (mClosures.size() == 1 &&
396        mClosures.front()->mClosure->mKernelID.get() == nullptr) {
397        // This closure is for an invoke function
398        CPUClosure* cc = mClosures.front();
399        const Closure* c = cc->mClosure;
400        const ScriptInvokeID* invokeID = c->mInvokeID;
401        rsAssert(invokeID != nullptr);
402        cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength);
403        return;
404    }
405
406    for (CPUClosure* cpuClosure : mClosures) {
407        const Closure* closure = cpuClosure->mClosure;
408        const ScriptKernelID* kernelID = closure->mKernelID.get();
409        cpuClosure->mSi->preLaunch(kernelID->mSlot,
410                                   (const Allocation**)closure->mArgs,
411                                   closure->mNumArg, closure->mReturnValue,
412                                   cpuClosure->mUsrPtr, cpuClosure->mUsrSize,
413                                   nullptr);
414    }
415
416    const CPUClosure* cpuClosure = mClosures.front();
417    const Closure* closure = cpuClosure->mClosure;
418    MTLaunchStruct mtls;
419
420    if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs,
421                                          closure->mNumArg,
422                                          closure->mReturnValue,
423                                          nullptr, 0, nullptr, &mtls)) {
424
425        mtls.script = nullptr;
426        mtls.kernel = (void (*)())&groupRoot;
427        mtls.fep.usr = &mClosures;
428
429        mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
430    }
431
432    for (CPUClosure* cpuClosure : mClosures) {
433        const Closure* closure = cpuClosure->mClosure;
434        const ScriptKernelID* kernelID = closure->mKernelID.get();
435        cpuClosure->mSi->postLaunch(kernelID->mSlot,
436                                    (const Allocation**)closure->mArgs,
437                                    closure->mNumArg, closure->mReturnValue,
438                                    nullptr, 0, nullptr);
439    }
440}
441
442}  // namespace renderscript
443}  // namespace android
444