rsCpuCore.cpp revision c905efd76fdcc1b8846b229bf7d991d185a7b4b7
1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "rsCpuCore.h"
18#include "rsCpuScript.h"
19#include "rsCpuScriptGroup.h"
20
21#include <malloc.h>
22#include "rsContext.h"
23
24#include <sys/types.h>
25#include <sys/resource.h>
26#include <sched.h>
27#include <cutils/properties.h>
28#include <sys/syscall.h>
29#include <string.h>
30#include "utils/StopWatch.h"
31
32using namespace android;
33using namespace android::renderscript;
34
35typedef void (*outer_foreach_t)(
36    const android::renderscript::RsForEachStubParamStruct *,
37    uint32_t x1, uint32_t x2,
38    uint32_t instep, uint32_t outstep);
39
40
// Process-wide state shared by every CPU reference instance in this file.
//
// gInitMutex is taken (via lockMutex()/unlockMutex()) around every update of
// gThreadTLSKeyCount and around creation/deletion of gThreadTLSKey.
static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER;
// TLS key under which threads store a pointer to a ScriptTLSStruct.
// Static storage duration: starts out zero-initialised.
static pthread_key_t gThreadTLSKey;
// Number of successfully initialised instances currently sharing the key.
static uint32_t gThreadTLSKeyCount;
44
45RsdCpuReference::~RsdCpuReference() {
46}
47
48RsdCpuReference * RsdCpuReference::create(Context *rsc, uint32_t version_major,
49                                          uint32_t version_minor, sym_lookup_t lfn,
50                                          script_lookup_t slfn) {
51
52    RsdCpuReferenceImpl *cpu = new RsdCpuReferenceImpl(rsc);
53    if (!cpu) {
54        return NULL;
55    }
56    if (!cpu->init(version_major, version_minor, lfn, slfn)) {
57        delete cpu;
58        return NULL;
59    }
60    return cpu;
61}
62
63
64Context * RsdCpuReference::getTlsContext() {
65    ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
66    return tls->mContext;
67}
68
69const Script * RsdCpuReference::getTlsScript() {
70    ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
71    return tls->mScript;
72}
73
74
75////////////////////////////////////////////////////////////
76///
77
78RsdCpuReferenceImpl::RsdCpuReferenceImpl(Context *rsc) {
79    mRSC = rsc;
80
81    version_major = 0;
82    version_minor = 0;
83    mInForEach = false;
84    memset(&mWorkers, 0, sizeof(mWorkers));
85    memset(&mTlsStruct, 0, sizeof(mTlsStruct));
86    mExit = false;
87
88}
89
90
91void * RsdCpuReferenceImpl::helperThreadProc(void *vrsc) {
92    RsdCpuReferenceImpl *dc = (RsdCpuReferenceImpl *)vrsc;
93
94
95    uint32_t idx = (uint32_t)android_atomic_inc(&dc->mWorkers.mLaunchCount);
96
97    //ALOGV("RS helperThread starting %p idx=%i", dc, idx);
98
99    dc->mWorkers.mLaunchSignals[idx].init();
100    dc->mWorkers.mNativeThreadId[idx] = gettid();
101
102    memset(&dc->mTlsStruct, 0, sizeof(dc->mTlsStruct));
103    int status = pthread_setspecific(gThreadTLSKey, &dc->mTlsStruct);
104    if (status) {
105        ALOGE("pthread_setspecific %i", status);
106    }
107
108#if 0
109    typedef struct {uint64_t bits[1024 / 64]; } cpu_set_t;
110    cpu_set_t cpuset;
111    memset(&cpuset, 0, sizeof(cpuset));
112    cpuset.bits[idx / 64] |= 1ULL << (idx % 64);
113    int ret = syscall(241, rsc->mWorkers.mNativeThreadId[idx],
114              sizeof(cpuset), &cpuset);
115    ALOGE("SETAFFINITY ret = %i %s", ret, EGLUtils::strerror(ret));
116#endif
117
118    while (!dc->mExit) {
119        dc->mWorkers.mLaunchSignals[idx].wait();
120        if (dc->mWorkers.mLaunchCallback) {
121           // idx +1 is used because the calling thread is always worker 0.
122           dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx+1);
123        }
124        android_atomic_dec(&dc->mWorkers.mRunningCount);
125        dc->mWorkers.mCompleteSignal.set();
126    }
127
128    //ALOGV("RS helperThread exited %p idx=%i", dc, idx);
129    return NULL;
130}
131
132void RsdCpuReferenceImpl::launchThreads(WorkerCallback_t cbk, void *data) {
133    mWorkers.mLaunchData = data;
134    mWorkers.mLaunchCallback = cbk;
135    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
136    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
137        mWorkers.mLaunchSignals[ct].set();
138    }
139
140    // We use the calling thread as one of the workers so we can start without
141    // the delay of the thread wakeup.
142    if (mWorkers.mLaunchCallback) {
143       mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
144    }
145
146    while (android_atomic_acquire_load(&mWorkers.mRunningCount) != 0) {
147        mWorkers.mCompleteSignal.wait();
148    }
149}
150
151
152void RsdCpuReferenceImpl::lockMutex() {
153    pthread_mutex_lock(&gInitMutex);
154}
155
156void RsdCpuReferenceImpl::unlockMutex() {
157    pthread_mutex_unlock(&gInitMutex);
158}
159
160bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
161                               sym_lookup_t lfn, script_lookup_t slfn) {
162
163    mSymLookupFn = lfn;
164    mScriptLookupFn = slfn;
165
166    lockMutex();
167    if (!gThreadTLSKeyCount) {
168        int status = pthread_key_create(&gThreadTLSKey, NULL);
169        if (status) {
170            ALOGE("Failed to init thread tls key.");
171            unlockMutex();
172            return false;
173        }
174    }
175    gThreadTLSKeyCount++;
176    unlockMutex();
177
178    mTlsStruct.mContext = mRSC;
179    mTlsStruct.mScript = NULL;
180    int status = pthread_setspecific(gThreadTLSKey, &mTlsStruct);
181    if (status) {
182        ALOGE("pthread_setspecific %i", status);
183    }
184
185    int cpu = sysconf(_SC_NPROCESSORS_ONLN);
186    if(mRSC->props.mDebugMaxThreads) {
187        cpu = mRSC->props.mDebugMaxThreads;
188    }
189    if (cpu < 2) {
190        mWorkers.mCount = 0;
191        return true;
192    }
193
194    // Subtract one from the cpu count because we also use the command thread as a worker.
195    mWorkers.mCount = (uint32_t)(cpu - 1);
196
197    ALOGV("%p Launching thread(s), CPUs %i", mRSC, mWorkers.mCount);
198
199    mWorkers.mThreadId = (pthread_t *) calloc(mWorkers.mCount, sizeof(pthread_t));
200    mWorkers.mNativeThreadId = (pid_t *) calloc(mWorkers.mCount, sizeof(pid_t));
201    mWorkers.mLaunchSignals = new Signal[mWorkers.mCount];
202    mWorkers.mLaunchCallback = NULL;
203
204    mWorkers.mCompleteSignal.init();
205
206    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
207    android_atomic_release_store(0, &mWorkers.mLaunchCount);
208
209    pthread_attr_t threadAttr;
210    status = pthread_attr_init(&threadAttr);
211    if (status) {
212        ALOGE("Failed to init thread attribute.");
213        return false;
214    }
215
216    for (uint32_t ct=0; ct < mWorkers.mCount; ct++) {
217        status = pthread_create(&mWorkers.mThreadId[ct], &threadAttr, helperThreadProc, this);
218        if (status) {
219            mWorkers.mCount = ct;
220            ALOGE("Created fewer than expected number of RS threads.");
221            break;
222        }
223    }
224    while (android_atomic_acquire_load(&mWorkers.mRunningCount) != 0) {
225        usleep(100);
226    }
227
228    pthread_attr_destroy(&threadAttr);
229    return true;
230}
231
232
233void RsdCpuReferenceImpl::setPriority(int32_t priority) {
234    for (uint32_t ct=0; ct < mWorkers.mCount; ct++) {
235        setpriority(PRIO_PROCESS, mWorkers.mNativeThreadId[ct], priority);
236    }
237}
238
239RsdCpuReferenceImpl::~RsdCpuReferenceImpl() {
240    mExit = true;
241    mWorkers.mLaunchData = NULL;
242    mWorkers.mLaunchCallback = NULL;
243    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
244    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
245        mWorkers.mLaunchSignals[ct].set();
246    }
247    void *res;
248    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
249        pthread_join(mWorkers.mThreadId[ct], &res);
250    }
251    rsAssert(android_atomic_acquire_load(&mWorkers.mRunningCount) == 0);
252
253    // Global structure cleanup.
254    lockMutex();
255    --gThreadTLSKeyCount;
256    if (!gThreadTLSKeyCount) {
257        pthread_key_delete(gThreadTLSKey);
258    }
259    unlockMutex();
260
261}
262
// Seven-argument function pointer type: three data pointers followed by four
// 32-bit values. It is not referenced in the visible part of this file.
typedef void (*rs_t)(const void *, void *, const void *,
                     uint32_t, uint32_t, uint32_t, uint32_t);
264
265static void wc_xy(void *usr, uint32_t idx) {
266    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
267    RsForEachStubParamStruct p;
268    memcpy(&p, &mtls->fep, sizeof(p));
269    p.lid = idx;
270    uint32_t sig = mtls->sig;
271
272    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
273    while (1) {
274        uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
275        uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
276        uint32_t yEnd = yStart + mtls->mSliceSize;
277        yEnd = rsMin(yEnd, mtls->yEnd);
278        if (yEnd <= yStart) {
279            return;
280        }
281
282        //ALOGE("usr idx %i, x %i,%i  y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
283        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
284
285        for (p.y = yStart; p.y < yEnd; p.y++) {
286            p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * p.y) +
287                    (mtls->fep.eStrideOut * mtls->xStart);
288            p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * p.y) +
289                   (mtls->fep.eStrideIn * mtls->xStart);
290            fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
291        }
292    }
293}
294
295static void wc_x(void *usr, uint32_t idx) {
296    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
297    RsForEachStubParamStruct p;
298    memcpy(&p, &mtls->fep, sizeof(p));
299    p.lid = idx;
300    uint32_t sig = mtls->sig;
301
302    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
303    while (1) {
304        uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
305        uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
306        uint32_t xEnd = xStart + mtls->mSliceSize;
307        xEnd = rsMin(xEnd, mtls->xEnd);
308        if (xEnd <= xStart) {
309            return;
310        }
311
312        //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd);
313        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
314
315        p.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
316        p.in = mtls->fep.ptrIn + (mtls->fep.eStrideIn * xStart);
317        fn(&p, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
318    }
319}
320
321void RsdCpuReferenceImpl::launchThreads(const Allocation * ain, Allocation * aout,
322                                     const RsScriptCall *sc, MTLaunchStruct *mtls) {
323
324    //android::StopWatch kernel_time("kernel time");
325
326    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
327        const size_t targetByteChunk = 16 * 1024;
328        mInForEach = true;
329        if (mtls->fep.dimY > 1) {
330            uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
331            uint32_t s2 = 0;
332
333            // This chooses our slice size to rate limit atomic ops to
334            // one per 16k bytes of reads/writes.
335            if (mtls->fep.yStrideOut) {
336                s2 = targetByteChunk / mtls->fep.yStrideOut;
337            } else {
338                s2 = targetByteChunk / mtls->fep.yStrideIn;
339            }
340            mtls->mSliceSize = rsMin(s1, s2);
341
342            if(mtls->mSliceSize < 1) {
343                mtls->mSliceSize = 1;
344            }
345
346         //   mtls->mSliceSize = 2;
347            launchThreads(wc_xy, mtls);
348        } else {
349            uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
350            uint32_t s2 = 0;
351
352            // This chooses our slice size to rate limit atomic ops to
353            // one per 16k bytes of reads/writes.
354            if (mtls->fep.eStrideOut) {
355                s2 = targetByteChunk / mtls->fep.eStrideOut;
356            } else {
357                s2 = targetByteChunk / mtls->fep.eStrideIn;
358            }
359            mtls->mSliceSize = rsMin(s1, s2);
360
361            if(mtls->mSliceSize < 1) {
362                mtls->mSliceSize = 1;
363            }
364
365            launchThreads(wc_x, mtls);
366        }
367        mInForEach = false;
368
369        //ALOGE("launch 1");
370    } else {
371        RsForEachStubParamStruct p;
372        memcpy(&p, &mtls->fep, sizeof(p));
373        uint32_t sig = mtls->sig;
374
375        //ALOGE("launch 3");
376        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
377        for (p.ar[0] = mtls->arrayStart; p.ar[0] < mtls->arrayEnd; p.ar[0]++) {
378            for (p.z = mtls->zStart; p.z < mtls->zEnd; p.z++) {
379                for (p.y = mtls->yStart; p.y < mtls->yEnd; p.y++) {
380                    uint32_t offset = mtls->fep.dimY * mtls->fep.dimZ * p.ar[0] +
381                                      mtls->fep.dimY * p.z + p.y;
382                    p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * offset) +
383                            (mtls->fep.eStrideOut * mtls->xStart);
384                    p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * offset) +
385                           (mtls->fep.eStrideIn * mtls->xStart);
386                    fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
387                }
388            }
389        }
390    }
391}
392
393RsdCpuScriptImpl * RsdCpuReferenceImpl::setTLS(RsdCpuScriptImpl *sc) {
394    //ALOGE("setTls %p", sc);
395    ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
396    rsAssert(tls);
397    RsdCpuScriptImpl *old = tls->mImpl;
398    tls->mImpl = sc;
399    tls->mContext = mRSC;
400    if (sc) {
401        tls->mScript = sc->getScript();
402    } else {
403        tls->mScript = NULL;
404    }
405    return old;
406}
407
408const RsdCpuReference::CpuSymbol * RsdCpuReferenceImpl::symLookup(const char *name) {
409    return mSymLookupFn(mRSC, name);
410}
411
412
413RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createScript(const ScriptC *s,
414                                    char const *resName, char const *cacheDir,
415                                    uint8_t const *bitcode, size_t bitcodeSize,
416                                    uint32_t flags) {
417
418    RsdCpuScriptImpl *i = new RsdCpuScriptImpl(this, s);
419    if (!i->init(resName, cacheDir, bitcode, bitcodeSize, flags)) {
420        delete i;
421        return NULL;
422    }
423    return i;
424}
425
426extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx,
427                                                   const Script *s, const Element *e);
428extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
429                                                   const Script *s, const Element *e);
430extern RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx,
431                                           const Script *s, const Element *e);
432extern RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
433                                                   const Script *s, const Element *e);
434extern RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx,
435                                            const Script *s, const Element *e);
436extern RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx,
437                                                const Script *s, const Element *e);
438extern RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx,
439                                             const Script *s, const Element *e);
440
441RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *s,
442                                    RsScriptIntrinsicID iid, Element *e) {
443
444    RsdCpuScriptImpl *i = NULL;
445    switch (iid) {
446    case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3:
447        i = rsdIntrinsic_Convolve3x3(this, s, e);
448        break;
449    case RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX:
450        i = rsdIntrinsic_ColorMatrix(this, s, e);
451        break;
452    case RS_SCRIPT_INTRINSIC_ID_LUT:
453        i = rsdIntrinsic_LUT(this, s, e);
454        break;
455    case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5:
456        i = rsdIntrinsic_Convolve5x5(this, s, e);
457        break;
458    case RS_SCRIPT_INTRINSIC_ID_BLUR:
459        i = rsdIntrinsic_Blur(this, s, e);
460        break;
461    case RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB:
462        i = rsdIntrinsic_YuvToRGB(this, s, e);
463        break;
464    case RS_SCRIPT_INTRINSIC_ID_BLEND:
465        i = rsdIntrinsic_Blend(this, s, e);
466        break;
467
468    default:
469        rsAssert(0);
470    }
471
472    return i;
473}
474
475RsdCpuReference::CpuScriptGroup * RsdCpuReferenceImpl::createScriptGroup(const ScriptGroup *sg) {
476    CpuScriptGroupImpl *sgi = new CpuScriptGroupImpl(this, sg);
477    if (!sgi->init()) {
478        delete sgi;
479        return NULL;
480    }
481    return sgi;
482}
483
484
485