rsCpuScriptGroup2.cpp revision bd0af2d161e36e52e6782ccb2d15dd5a36467704
1#include "rsCpuScriptGroup2.h"
2
3#include <dlfcn.h>
4#include <stdio.h>
5#include <stdlib.h>
6#include <unistd.h>
7
8#include <set>
9#include <sstream>
10#include <string>
11#include <vector>
12
13#ifndef RS_COMPATIBILITY_LIB
14#include "bcc/Config/Config.h"
15#endif
16
17#include "cpu_ref/rsCpuCore.h"
18#include "rsClosure.h"
19#include "rsContext.h"
20#include "rsCpuCore.h"
21#include "rsCpuExecutable.h"
22#include "rsCpuScript.h"
23#include "rsScript.h"
24#include "rsScriptGroup2.h"
25#include "rsScriptIntrinsic.h"
26
27using std::string;
28using std::vector;
29
30namespace android {
31namespace renderscript {
32
33namespace {
34
// NOTE(review): this constant is not referenced anywhere in the visible part
// of this file. Presumably it is the default number of arguments assumed for
// a kernel -- confirm the intended use or remove it.
const size_t DefaultKernelArgCount = 2;
36
37void groupRoot(const RsExpandKernelDriverInfo *kinfo, uint32_t xstart,
38               uint32_t xend, uint32_t outstep) {
39    const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kinfo->usr;
40    RsExpandKernelDriverInfo *mutable_kinfo = const_cast<RsExpandKernelDriverInfo *>(kinfo);
41
42    const size_t oldInLen = mutable_kinfo->inLen;
43
44    decltype(mutable_kinfo->inStride) oldInStride;
45    memcpy(&oldInStride, &mutable_kinfo->inStride, sizeof(oldInStride));
46
47    for (CPUClosure* cpuClosure : closures) {
48        const Closure* closure = cpuClosure->mClosure;
49
50        // There had better be enough space in mutable_kinfo
51        rsAssert(closure->mNumArg <= RS_KERNEL_INPUT_LIMIT);
52
53        for (size_t i = 0; i < closure->mNumArg; i++) {
54            const void* arg = closure->mArgs[i];
55            const Allocation* a = (const Allocation*)arg;
56            const uint32_t eStride = a->mHal.state.elementSizeBytes;
57            const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) +
58                    eStride * xstart;
59            if (kinfo->dim.y > 1) {
60                ptr += a->mHal.drvState.lod[0].stride * kinfo->current.y;
61            }
62            mutable_kinfo->inPtr[i] = ptr;
63            mutable_kinfo->inStride[i] = eStride;
64        }
65        mutable_kinfo->inLen = closure->mNumArg;
66
67        const Allocation* out = closure->mReturnValue;
68        const uint32_t ostep = out->mHal.state.elementSizeBytes;
69        const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) +
70                ostep * xstart;
71        if (kinfo->dim.y > 1) {
72            ptr += out->mHal.drvState.lod[0].stride * kinfo->current.y;
73        }
74
75        rsAssert(kinfo->outLen <= 1);
76        mutable_kinfo->outPtr[0] = const_cast<uint8_t*>(ptr);
77
78        cpuClosure->mFunc(kinfo, xstart, xend, ostep);
79    }
80
81    mutable_kinfo->inLen = oldInLen;
82    memcpy(&mutable_kinfo->inStride, &oldInStride, sizeof(oldInStride));
83}
84
85}  // namespace
86
87Batch::Batch(CpuScriptGroup2Impl* group, const char* name) :
88    mGroup(group), mFunc(nullptr) {
89    mName = strndup(name, strlen(name));
90}
91
92Batch::~Batch() {
93    for (CPUClosure* c : mClosures) {
94        delete c;
95    }
96    free(mName);
97}
98
99bool Batch::conflict(CPUClosure* cpuClosure) const {
100    if (mClosures.empty()) {
101        return false;
102    }
103
104    const Closure* closure = cpuClosure->mClosure;
105
106    if (!closure->mIsKernel || !mClosures.front()->mClosure->mIsKernel) {
107        // An invoke should be in a batch by itself, so it conflicts with any other
108        // closure.
109        return true;
110    }
111
112    const auto& globalDeps = closure->mGlobalDeps;
113    const auto& argDeps = closure->mArgDeps;
114
115    for (CPUClosure* c : mClosures) {
116        const Closure* batched = c->mClosure;
117        if (globalDeps.find(batched) != globalDeps.end()) {
118            return true;
119        }
120        const auto& it = argDeps.find(batched);
121        if (it != argDeps.end()) {
122            const auto& args = (*it).second;
123            for (const auto &p1 : *args) {
124                if (p1.second.get() != nullptr) {
125                    return true;
126                }
127            }
128        }
129    }
130
131    return false;
132}
133
134CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
135                                         const ScriptGroupBase *sg) :
136    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)),
137    mExecutable(nullptr), mScriptObj(nullptr) {
138    rsAssert(!mGroup->mClosures.empty());
139
140    Batch* batch = new Batch(this, "Batch0");
141    int i = 0;
142    for (Closure* closure: mGroup->mClosures) {
143        CPUClosure* cc;
144        const IDBase* funcID = closure->mFunctionID.get();
145        RsdCpuScriptImpl* si =
146                (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(funcID->mScript);
147        if (closure->mIsKernel) {
148            MTLaunchStruct mtls;
149            si->forEachKernelSetup(funcID->mSlot, &mtls);
150            cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel);
151        } else {
152            cc = new CPUClosure(closure, si);
153        }
154
155        if (batch->conflict(cc)) {
156            mBatches.push_back(batch);
157            std::stringstream ss;
158            ss << "Batch" << ++i;
159            batch = new Batch(this, ss.str().c_str());
160        }
161
162        batch->mClosures.push_back(cc);
163    }
164
165    rsAssert(!batch->mClosures.empty());
166    mBatches.push_back(batch);
167
168#ifndef RS_COMPATIBILITY_LIB
169    compile(mGroup->mCacheDir);
170    if (mScriptObj != nullptr && mExecutable != nullptr) {
171        for (Batch* batch : mBatches) {
172            batch->resolveFuncPtr(mScriptObj);
173        }
174    }
175#endif  // RS_COMPATIBILITY_LIB
176}
177
178void Batch::resolveFuncPtr(void* sharedObj) {
179    std::string funcName(mName);
180    if (mClosures.front()->mClosure->mIsKernel) {
181        funcName.append(".expand");
182    }
183    mFunc = dlsym(sharedObj, funcName.c_str());
184    rsAssert (mFunc != nullptr);
185}
186
187CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
188    for (Batch* batch : mBatches) {
189        delete batch;
190    }
191    delete mExecutable;
192    // TODO: move this dlclose into ~ScriptExecutable().
193    if (mScriptObj != nullptr) {
194        dlclose(mScriptObj);
195    }
196}
197
198namespace {
199
200#ifndef RS_COMPATIBILITY_LIB
201
202string getCoreLibPath(Context* context, string* coreLibRelaxedPath) {
203    *coreLibRelaxedPath = "";
204
205    // If we're debugging, use the debug library.
206    if (context->getContextType() == RS_CONTEXT_TYPE_DEBUG) {
207        return SYSLIBPATH"/libclcore_debug.bc";
208    }
209
210    // Check for a platform specific library
211
212#if defined(ARCH_ARM_HAVE_NEON) && !defined(DISABLE_CLCORE_NEON)
213    // NEON-capable ARMv7a devices can use an accelerated math library
214    // for all reduced precision scripts.
215    // ARMv8 does not use NEON, as ASIMD can be used with all precision
216    // levels.
217    *coreLibRelaxedPath = SYSLIBPATH"/libclcore_neon.bc";
218#endif
219
220#if defined(__i386__) || defined(__x86_64__)
221    // x86 devices will use an optimized library.
222    return SYSLIBPATH"/libclcore_x86.bc";
223#else
224    return SYSLIBPATH"/libclcore.bc";
225#endif
226}
227
// Returns the last component of `path`: everything after the final '/' or
// '\\'. A path without any separator is returned unchanged, and a path that
// ends with a separator yields an empty string.
std::string getFileName(std::string path) {
    // Keep the result in the string's own size type; storing it in a narrower
    // `unsigned` truncates the position and hides the npos case behind
    // integer wrap-around.
    const std::string::size_type lastSeparator = path.find_last_of("/\\");
    if (lastSeparator == std::string::npos) {
        return path;
    }
    return path.substr(lastSeparator + 1);
}
232
233void setupCompileArguments(
234        const vector<string>& inputs, const vector<string>& kernelBatches,
235        const vector<string>& invokeBatches,
236        const string& output_dir, const string& output_filename,
237        const string& coreLibPath, const string& coreLibRelaxedPath,
238        vector<const char*>* args) {
239    args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH);
240    args->push_back("-fPIC");
241    args->push_back("-embedRSInfo");
242    args->push_back("-mtriple");
243    args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
244    args->push_back("-bclib");
245    args->push_back(coreLibPath.c_str());
246    args->push_back("-bclib_relaxed");
247    args->push_back(coreLibRelaxedPath.c_str());
248    for (const string& input : inputs) {
249        args->push_back(input.c_str());
250    }
251    for (const string& batch : kernelBatches) {
252        args->push_back("-merge");
253        args->push_back(batch.c_str());
254    }
255    for (const string& batch : invokeBatches) {
256        args->push_back("-invoke");
257        args->push_back(batch.c_str());
258    }
259    args->push_back("-output_path");
260    args->push_back(output_dir.c_str());
261    args->push_back("-o");
262    args->push_back(output_filename.c_str());
263    args->push_back(nullptr);
264}
265
266void generateSourceSlot(const Closure& closure,
267                        const std::vector<std::string>& inputs,
268                        std::stringstream& ss) {
269    const IDBase* funcID = (const IDBase*)closure.mFunctionID.get();
270    const Script* script = funcID->mScript;
271
272    rsAssert (!script->isIntrinsic());
273
274    const RsdCpuScriptImpl *cpuScript =
275            (const RsdCpuScriptImpl*)script->mHal.drv;
276    const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
277
278    const int index = find(inputs.begin(), inputs.end(), bitcodeFilename) -
279            inputs.begin();
280
281    ss << index << "," << funcID->mSlot << ".";
282}
283
#endif  // RS_COMPATIBILITY_LIB
285
286}  // anonymous namespace
287
288void CpuScriptGroup2Impl::compile(const char* cacheDir) {
289#ifndef RS_COMPATIBILITY_LIB
290    if (mGroup->mClosures.size() < 2) {
291        return;
292    }
293
294    //===--------------------------------------------------------------------===//
295    // Fuse the input kernels and generate native code in an object file
296    //===--------------------------------------------------------------------===//
297
298    std::set<string> inputSet;
299    for (Closure* closure : mGroup->mClosures) {
300        const Script* script = closure->mFunctionID.get()->mScript;
301
302        // If any script is an intrinsic, give up trying fusing the kernels.
303        if (script->isIntrinsic()) {
304            return;
305        }
306
307        const RsdCpuScriptImpl *cpuScript =
308                (const RsdCpuScriptImpl*)script->mHal.drv;
309        const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
310        inputSet.insert(bitcodeFilename);
311    }
312
313    std::vector<string> inputs(inputSet.begin(), inputSet.end());
314
315    std::vector<string> kernelBatches;
316    std::vector<string> invokeBatches;
317
318    int i = 0;
319    for (const auto& batch : mBatches) {
320        rsAssert(batch->size() > 0);
321
322        std::stringstream ss;
323        ss << batch->mName << ":";
324
325        if (!batch->mClosures.front()->mClosure->mIsKernel) {
326            rsAssert(batch->size() == 1);
327            generateSourceSlot(*batch->mClosures.front()->mClosure, inputs, ss);
328            invokeBatches.push_back(ss.str());
329        } else {
330            for (const auto& cpuClosure : batch->mClosures) {
331                generateSourceSlot(*cpuClosure->mClosure, inputs, ss);
332            }
333            kernelBatches.push_back(ss.str());
334        }
335    }
336
337    rsAssert(cacheDir != nullptr);
338    string objFilePath(cacheDir);
339    objFilePath.append("/fusedXXXXXX.o");
340    // Find unique object file name, to make following file names unique.
341    int tempfd = mkstemps(&objFilePath[0], 2);
342    if (tempfd == -1) {
343      return;
344    }
345    TEMP_FAILURE_RETRY(close(tempfd));
346
347    string outputFileName = getFileName(objFilePath.substr(0, objFilePath.size() - 2));
348    string coreLibRelaxedPath;
349    const string& coreLibPath = getCoreLibPath(getCpuRefImpl()->getContext(),
350                                               &coreLibRelaxedPath);
351    vector<const char*> arguments;
352    string output_dir(cacheDir);
353    setupCompileArguments(inputs, kernelBatches, invokeBatches, output_dir,
354                          outputFileName, coreLibPath, coreLibRelaxedPath, &arguments);
355
356    bool compiled = rsuExecuteCommand(RsdCpuScriptImpl::BCC_EXE_PATH,
357                                     arguments.size()-1,
358                                     arguments.data());
359    if (!compiled) {
360        unlink(objFilePath.c_str());
361        return;
362    }
363
364    //===--------------------------------------------------------------------===//
365    // Create and load the shared lib
366    //===--------------------------------------------------------------------===//
367
368    const char* resName = outputFileName.c_str();
369
370    if (!SharedLibraryUtils::createSharedLibrary(cacheDir, resName)) {
371        ALOGE("Failed to link object file '%s'", resName);
372        return;
373    }
374
375    mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
376    if (mScriptObj == nullptr) {
377        ALOGE("Unable to load '%s'", resName);
378        return;
379    }
380
381    mExecutable = ScriptExecutable::createFromSharedObject(
382        getCpuRefImpl()->getContext(),
383        mScriptObj);
384
385#endif  // RS_COMPATIBILITY_LIB
386}
387
388void CpuScriptGroup2Impl::execute() {
389    for (auto batch : mBatches) {
390        batch->setGlobalsForBatch();
391        batch->run();
392    }
393}
394
395void Batch::setGlobalsForBatch() {
396    for (CPUClosure* cpuClosure : mClosures) {
397        const Closure* closure = cpuClosure->mClosure;
398        const IDBase* funcID = closure->mFunctionID.get();
399        Script* s = funcID->mScript;;
400        for (const auto& p : closure->mGlobals) {
401            const void* value = p.second.first;
402            int size = p.second.second;
403            if (value == nullptr && size == 0) {
404                // This indicates the current closure depends on another closure for a
405                // global in their shared module (script). In this case we don't need to
406                // copy the value. For example, an invoke intializes a global variable
407                // which a kernel later reads.
408                continue;
409            }
410            rsAssert(p.first != nullptr);
411            Script* script = p.first->mScript;
412            const RsdCpuScriptImpl *cpuScript =
413                    (const RsdCpuScriptImpl*)script->mHal.drv;
414            int slot = p.first->mSlot;
415            ScriptExecutable* exec = mGroup->getExecutable();
416            if (exec != nullptr) {
417                const char* varName = cpuScript->getFieldName(slot);
418                void* addr = exec->getFieldAddress(varName);
419                if (size < 0) {
420                    rsrSetObject(mGroup->getCpuRefImpl()->getContext(),
421                                 (rs_object_base*)addr, (ObjectBase*)value);
422                } else {
423                    memcpy(addr, (const void*)&value, size);
424                }
425            } else {
426                // We use -1 size to indicate an ObjectBase rather than a primitive type
427                if (size < 0) {
428                    s->setVarObj(slot, (ObjectBase*)value);
429                } else {
430                    s->setVar(slot, (const void*)&value, size);
431                }
432            }
433        }
434    }
435}
436
437void Batch::run() {
438    if (!mClosures.front()->mClosure->mIsKernel) {
439        rsAssert(mClosures.size() == 1);
440
441        // This batch contains a single closure for an invoke function
442        CPUClosure* cc = mClosures.front();
443        const Closure* c = cc->mClosure;
444
445        if (mFunc != nullptr) {
446            // TODO: Need align pointers for x86_64.
447            // See RsdCpuScriptImpl::invokeFunction in rsCpuScript.cpp
448            ((InvokeFuncTy)mFunc)(c->mParams, c->mParamLength);
449        } else {
450            const ScriptInvokeID* invokeID = (const ScriptInvokeID*)c->mFunctionID.get();
451            rsAssert(invokeID != nullptr);
452            cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength);
453        }
454
455        return;
456    }
457
458    if (mFunc != nullptr) {
459        MTLaunchStruct mtls;
460        const CPUClosure* firstCpuClosure = mClosures.front();
461        const CPUClosure* lastCpuClosure = mClosures.back();
462
463        firstCpuClosure->mSi->forEachMtlsSetup(
464                (const Allocation**)firstCpuClosure->mClosure->mArgs,
465                firstCpuClosure->mClosure->mNumArg,
466                lastCpuClosure->mClosure->mReturnValue,
467                nullptr, 0, nullptr, &mtls);
468
469        mtls.script = nullptr;
470        mtls.fep.usr = nullptr;
471        mtls.kernel = (ForEachFunc_t)mFunc;
472
473        mGroup->getCpuRefImpl()->launchThreads(
474                (const Allocation**)firstCpuClosure->mClosure->mArgs,
475                firstCpuClosure->mClosure->mNumArg,
476                lastCpuClosure->mClosure->mReturnValue,
477                nullptr, &mtls);
478
479        return;
480    }
481
482    for (CPUClosure* cpuClosure : mClosures) {
483        const Closure* closure = cpuClosure->mClosure;
484        const ScriptKernelID* kernelID =
485                (const ScriptKernelID*)closure->mFunctionID.get();
486        cpuClosure->mSi->preLaunch(kernelID->mSlot,
487                                   (const Allocation**)closure->mArgs,
488                                   closure->mNumArg, closure->mReturnValue,
489                                   nullptr, 0, nullptr);
490    }
491
492    const CPUClosure* cpuClosure = mClosures.front();
493    const Closure* closure = cpuClosure->mClosure;
494    MTLaunchStruct mtls;
495
496    if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs,
497                                          closure->mNumArg,
498                                          closure->mReturnValue,
499                                          nullptr, 0, nullptr, &mtls)) {
500
501        mtls.script = nullptr;
502        mtls.kernel = (void (*)())&groupRoot;
503        mtls.fep.usr = &mClosures;
504
505        mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
506    }
507
508    for (CPUClosure* cpuClosure : mClosures) {
509        const Closure* closure = cpuClosure->mClosure;
510        const ScriptKernelID* kernelID =
511                (const ScriptKernelID*)closure->mFunctionID.get();
512        cpuClosure->mSi->postLaunch(kernelID->mSlot,
513                                    (const Allocation**)closure->mArgs,
514                                    closure->mNumArg, closure->mReturnValue,
515                                    nullptr, 0, nullptr);
516    }
517}
518
519}  // namespace renderscript
520}  // namespace android
521