rsCpuCore.cpp revision cadfac411e6690e39de36c4f9e94deb9b7d2d08e
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "rsCpuCore.h"
#include "rsCpuScript.h"
#include "rsCpuScriptGroup.h"

#include <malloc.h>
#include "rsContext.h"

#include <sys/types.h>
#include <sys/resource.h>
#include <sched.h>
#include <cutils/properties.h>
#include <sys/syscall.h>
#include <string.h>
#include "utils/StopWatch.h"

using namespace android;
using namespace android::renderscript;

typedef void (*outer_foreach_t)(
    const android::renderscript::RsForEachStubParamStruct *,
    uint32_t x1, uint32_t x2,
    uint32_t instep, uint32_t outstep);


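// The TLS key below is shared by every RsdCpuReference in the process: it is
// created by the first init() and deleted when the last instance is destroyed.
// gThreadTLSKeyCount tracks that lifetime and gInitMutex guards both globals.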
static pthread_key_t gThreadTLSKey = 0;
static uint32_t gThreadTLSKeyCount = 0;
static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER;

RsdCpuReference::~RsdCpuReference() {
}

RsdCpuReference * RsdCpuReference::create(Context *rsc, uint32_t version_major,
        uint32_t version_minor, sym_lookup_t lfn, script_lookup_t slfn
#ifndef RS_COMPATIBILITY_LIB
        , bcc::RSLinkRuntimeCallback pLinkRuntimeCallback
#endif
        ) {

    RsdCpuReferenceImpl *cpu = new RsdCpuReferenceImpl(rsc);
    if (!cpu) {
        return NULL;
    }
    if (!cpu->init(version_major, version_minor, lfn, slfn)) {
        delete cpu;
        return NULL;
    }

#ifndef RS_COMPATIBILITY_LIB
    cpu->setLinkRuntimeCallback(pLinkRuntimeCallback);
#endif

    return cpu;
}


Context * RsdCpuReference::getTlsContext() {
    ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
    return tls->mContext;
}

const Script * RsdCpuReference::getTlsScript() {
    ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
    return tls->mScript;
}

pthread_key_t RsdCpuReference::getThreadTLSKey(){ return gThreadTLSKey; }

////////////////////////////////////////////////////////////
///

RsdCpuReferenceImpl::RsdCpuReferenceImpl(Context *rsc) {
    mRSC = rsc;

    version_major = 0;
    version_minor = 0;
    mInForEach = false;
    memset(&mWorkers, 0, sizeof(mWorkers));
    memset(&mTlsStruct, 0, sizeof(mTlsStruct));
    mExit = false;
#ifndef RS_COMPATIBILITY_LIB
    mLinkRuntimeCallback = NULL;
#endif
}


void * RsdCpuReferenceImpl::helperThreadProc(void *vrsc) {
    RsdCpuReferenceImpl *dc = (RsdCpuReferenceImpl *)vrsc;


    uint32_t idx = (uint32_t)android_atomic_inc(&dc->mWorkers.mLaunchCount);

    //ALOGV("RS helperThread starting %p idx=%i", dc, idx);

    dc->mWorkers.mLaunchSignals[idx].init();
    dc->mWorkers.mNativeThreadId[idx] = gettid();

    memset(&dc->mTlsStruct, 0, sizeof(dc->mTlsStruct));
    int status = pthread_setspecific(gThreadTLSKey, &dc->mTlsStruct);
    if (status) {
        ALOGE("pthread_setspecific %i", status);
    }

#if 0
    typedef struct {uint64_t bits[1024 / 64]; } cpu_set_t;
    cpu_set_t cpuset;
    memset(&cpuset, 0, sizeof(cpuset));
    cpuset.bits[idx / 64] |= 1ULL << (idx % 64);
    int ret = syscall(241, rsc->mWorkers.mNativeThreadId[idx],
              sizeof(cpuset), &cpuset);
    ALOGE("SETAFFINITY ret = %i %s", ret, EGLUtils::strerror(ret));
#endif

    while (!dc->mExit) {
        dc->mWorkers.mLaunchSignals[idx].wait();
        if (dc->mWorkers.mLaunchCallback) {
           // idx +1 is used because the calling thread is always worker 0.
           dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx+1);
        }
        android_atomic_dec(&dc->mWorkers.mRunningCount);
        dc->mWorkers.mCompleteSignal.set();
    }

    //ALOGV("RS helperThread exited %p idx=%i", dc, idx);
    return NULL;
}

void RsdCpuReferenceImpl::launchThreads(WorkerCallback_t cbk, void *data) {
    mWorkers.mLaunchData = data;
    mWorkers.mLaunchCallback = cbk;

    // fast path for very small launches
    MTLaunchStruct *mtls = (MTLaunchStruct *)data;
    if (mtls && mtls->fep.dimY <= 1 && mtls->xEnd <= mtls->xStart + mtls->mSliceSize) {
        if (mWorkers.mLaunchCallback) {
            mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
        }
        return;
    }

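    // mRunningCount is preloaded with the worker count; each helper thread
    // decrements it and sets mCompleteSignal after its callback returns, so
    // the wait loop at the end of this function unblocks once every worker
    // has finished its share of the launch.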
    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
        mWorkers.mLaunchSignals[ct].set();
    }

    // We use the calling thread as one of the workers so we can start without
    // the delay of the thread wakeup.
    if (mWorkers.mLaunchCallback) {
        mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
    }

    while (android_atomic_acquire_load(&mWorkers.mRunningCount) != 0) {
        mWorkers.mCompleteSignal.wait();
    }
}


void RsdCpuReferenceImpl::lockMutex() {
    pthread_mutex_lock(&gInitMutex);
}

void RsdCpuReferenceImpl::unlockMutex() {
    pthread_mutex_unlock(&gInitMutex);
}

bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
                               sym_lookup_t lfn, script_lookup_t slfn) {

    mSymLookupFn = lfn;
    mScriptLookupFn = slfn;

    lockMutex();
    if (!gThreadTLSKeyCount) {
        int status = pthread_key_create(&gThreadTLSKey, NULL);
        if (status) {
            ALOGE("Failed to init thread tls key.");
            unlockMutex();
            return false;
        }
    }
    gThreadTLSKeyCount++;
    unlockMutex();

    mTlsStruct.mContext = mRSC;
    mTlsStruct.mScript = NULL;
    int status = pthread_setspecific(gThreadTLSKey, &mTlsStruct);
    if (status) {
        ALOGE("pthread_setspecific %i", status);
    }

    int cpu = sysconf(_SC_NPROCESSORS_ONLN);
    if (mRSC->props.mDebugMaxThreads) {
        cpu = mRSC->props.mDebugMaxThreads;
    }
    if (cpu < 2) {
        mWorkers.mCount = 0;
        return true;
    }

    // Subtract one from the cpu count because we also use the command thread as a worker.
    mWorkers.mCount = (uint32_t)(cpu - 1);

    ALOGV("%p Launching thread(s), CPUs %i", mRSC, mWorkers.mCount);

    mWorkers.mThreadId = (pthread_t *) calloc(mWorkers.mCount, sizeof(pthread_t));
    mWorkers.mNativeThreadId = (pid_t *) calloc(mWorkers.mCount, sizeof(pid_t));
    mWorkers.mLaunchSignals = new Signal[mWorkers.mCount];
    mWorkers.mLaunchCallback = NULL;

    mWorkers.mCompleteSignal.init();

    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
    android_atomic_release_store(0, &mWorkers.mLaunchCount);

    pthread_attr_t threadAttr;
    status = pthread_attr_init(&threadAttr);
    if (status) {
        ALOGE("Failed to init thread attribute.");
        return false;
    }

    for (uint32_t ct=0; ct < mWorkers.mCount; ct++) {
        status = pthread_create(&mWorkers.mThreadId[ct], &threadAttr, helperThreadProc, this);
        if (status) {
            mWorkers.mCount = ct;
            ALOGE("Created fewer than expected number of RS threads.");
            break;
        }
    }
    while (android_atomic_acquire_load(&mWorkers.mRunningCount) != 0) {
        usleep(100);
    }

    pthread_attr_destroy(&threadAttr);
    return true;
}


void RsdCpuReferenceImpl::setPriority(int32_t priority) {
    for (uint32_t ct=0; ct < mWorkers.mCount; ct++) {
        setpriority(PRIO_PROCESS, mWorkers.mNativeThreadId[ct], priority);
    }
}

RsdCpuReferenceImpl::~RsdCpuReferenceImpl() {
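    // Shutdown: raise mExit, drop the launch callback, and wake every worker
    // so it observes mExit and leaves its loop; then join all worker threads.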
    mExit = true;
    mWorkers.mLaunchData = NULL;
    mWorkers.mLaunchCallback = NULL;
    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
        mWorkers.mLaunchSignals[ct].set();
    }
    void *res;
    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
        pthread_join(mWorkers.mThreadId[ct], &res);
    }
    rsAssert(android_atomic_acquire_load(&mWorkers.mRunningCount) == 0);

    // Global structure cleanup.
    lockMutex();
    --gThreadTLSKeyCount;
    if (!gThreadTLSKeyCount) {
        pthread_key_delete(gThreadTLSKey);
    }
    unlockMutex();

}

typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);

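// Per-worker kernels for 2D (wc_xy) and 1D (wc_x) launches. Each worker
// repeatedly claims the next slice by atomically incrementing mSliceNum,
// converts it to a row range (wc_xy) or an X range (wc_x), and runs the
// kernel over that range until the slices are exhausted, which gives dynamic
// load balancing across the thread pool.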
static void wc_xy(void *usr, uint32_t idx) {
    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
    RsForEachStubParamStruct p;
    memcpy(&p, &mtls->fep, sizeof(p));
    p.lid = idx;
    uint32_t sig = mtls->sig;

    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
    while (1) {
        uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
        uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
        uint32_t yEnd = yStart + mtls->mSliceSize;
        yEnd = rsMin(yEnd, mtls->yEnd);
        if (yEnd <= yStart) {
            return;
        }

        //ALOGE("usr idx %i, x %i,%i  y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);

        for (p.y = yStart; p.y < yEnd; p.y++) {
            p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * p.y) +
                    (mtls->fep.eStrideOut * mtls->xStart);
            p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * p.y) +
                   (mtls->fep.eStrideIn * mtls->xStart);
            fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
        }
    }
}

static void wc_x(void *usr, uint32_t idx) {
    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
    RsForEachStubParamStruct p;
    memcpy(&p, &mtls->fep, sizeof(p));
    p.lid = idx;
    uint32_t sig = mtls->sig;

    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
    while (1) {
        uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
        uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
        uint32_t xEnd = xStart + mtls->mSliceSize;
        xEnd = rsMin(xEnd, mtls->xEnd);
        if (xEnd <= xStart) {
            return;
        }

        //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd);
        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);

        p.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
        p.in = mtls->fep.ptrIn + (mtls->fep.eStrideIn * xStart);
        fn(&p, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
    }
}

void RsdCpuReferenceImpl::launchThreads(const Allocation * ain, Allocation * aout,
                                     const RsScriptCall *sc, MTLaunchStruct *mtls) {

    //android::StopWatch kernel_time("kernel time");

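    // Go multi-threaded only when worker threads exist, the kernel is marked
    // threadable, and we are not already inside a foreach; the mInForEach
    // guard keeps nested launches from re-entering the thread pool.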
    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
        const size_t targetByteChunk = 16 * 1024;
        mInForEach = true;
        if (mtls->fep.dimY > 1) {
            uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
            uint32_t s2 = 0;

            // This chooses our slice size to rate limit atomic ops to
            // one per 16k bytes of reads/writes.
            if (mtls->fep.yStrideOut) {
                s2 = targetByteChunk / mtls->fep.yStrideOut;
            } else {
                s2 = targetByteChunk / mtls->fep.yStrideIn;
            }
            mtls->mSliceSize = rsMin(s1, s2);

            if (mtls->mSliceSize < 1) {
                mtls->mSliceSize = 1;
            }
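            // Example with hypothetical numbers: a 1920x1080 RGBA_8888 output
            // has yStrideOut = 1920 * 4 = 7680 bytes, so s2 = 16384 / 7680 = 2
            // rows. With 3 workers plus the calling thread, s1 = 1080 / 16 = 67,
            // and mSliceSize = min(67, 2) = 2 rows per atomic slice claim.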

         //   mtls->mSliceSize = 2;
            launchThreads(wc_xy, mtls);
        } else {
            uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
            uint32_t s2 = 0;

            // This chooses our slice size to rate limit atomic ops to
            // one per 16k bytes of reads/writes.
            if (mtls->fep.eStrideOut) {
                s2 = targetByteChunk / mtls->fep.eStrideOut;
            } else {
                s2 = targetByteChunk / mtls->fep.eStrideIn;
            }
            mtls->mSliceSize = rsMin(s1, s2);

            if (mtls->mSliceSize < 1) {
                mtls->mSliceSize = 1;
            }

            launchThreads(wc_x, mtls);
        }
        mInForEach = false;

        //ALOGE("launch 1");
    } else {
        RsForEachStubParamStruct p;
        memcpy(&p, &mtls->fep, sizeof(p));
        uint32_t sig = mtls->sig;

        //ALOGE("launch 3");
        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
        for (p.ar[0] = mtls->arrayStart; p.ar[0] < mtls->arrayEnd; p.ar[0]++) {
            for (p.z = mtls->zStart; p.z < mtls->zEnd; p.z++) {
                for (p.y = mtls->yStart; p.y < mtls->yEnd; p.y++) {
                    uint32_t offset = mtls->fep.dimY * mtls->fep.dimZ * p.ar[0] +
                                      mtls->fep.dimY * p.z + p.y;
                    p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * offset) +
                            (mtls->fep.eStrideOut * mtls->xStart);
                    p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * offset) +
                           (mtls->fep.eStrideIn * mtls->xStart);
                    fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
                }
            }
        }
    }
}

RsdCpuScriptImpl * RsdCpuReferenceImpl::setTLS(RsdCpuScriptImpl *sc) {
    //ALOGE("setTls %p", sc);
    ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
    rsAssert(tls);
    RsdCpuScriptImpl *old = tls->mImpl;
    tls->mImpl = sc;
    tls->mContext = mRSC;
    if (sc) {
        tls->mScript = sc->getScript();
    } else {
        tls->mScript = NULL;
    }
    return old;
}

const RsdCpuReference::CpuSymbol * RsdCpuReferenceImpl::symLookup(const char *name) {
    return mSymLookupFn(mRSC, name);
}


RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createScript(const ScriptC *s,
                                    char const *resName, char const *cacheDir,
                                    uint8_t const *bitcode, size_t bitcodeSize,
                                    uint32_t flags) {

    RsdCpuScriptImpl *i = new RsdCpuScriptImpl(this, s);
    if (!i->init(resName, cacheDir, bitcode, bitcodeSize, flags)) {
        delete i;
        return NULL;
    }
    return i;
}

extern RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx,
                                             const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx,
                                                   const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
                                                   const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx,
                                           const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
                                                   const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx,
                                            const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx,
                                                const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx,
                                             const Script *s, const Element *e);

RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *s,
                                    RsScriptIntrinsicID iid, Element *e) {

    RsdCpuScriptImpl *i = NULL;
    switch (iid) {
    case RS_SCRIPT_INTRINSIC_ID_3DLUT:
        i = rsdIntrinsic_3DLUT(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3:
        i = rsdIntrinsic_Convolve3x3(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX:
        i = rsdIntrinsic_ColorMatrix(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_LUT:
        i = rsdIntrinsic_LUT(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5:
        i = rsdIntrinsic_Convolve5x5(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_BLUR:
        i = rsdIntrinsic_Blur(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB:
        i = rsdIntrinsic_YuvToRGB(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_BLEND:
        i = rsdIntrinsic_Blend(this, s, e);
        break;

    default:
        rsAssert(0);
    }

    return i;
}

RsdCpuReference::CpuScriptGroup * RsdCpuReferenceImpl::createScriptGroup(const ScriptGroup *sg) {
    CpuScriptGroupImpl *sgi = new CpuScriptGroupImpl(this, sg);
    if (!sgi->init()) {
        delete sgi;
        return NULL;
    }
    return sgi;
}
