rsCpuIntrinsicBlur.cpp revision 462de21ac2e1773b99aedee012adb374e476ae36
1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "rsCpuIntrinsic.h"
18#include "rsCpuIntrinsicInlines.h"
19
20namespace android {
21namespace renderscript {
22
23class RsdCpuScriptIntrinsicBlur : public RsdCpuScriptIntrinsic {
24public:
25    void populateScript(Script *) override;
26    void invokeFreeChildren() override;
27
28    void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
29    void setGlobalObj(uint32_t slot, ObjectBase *data) override;
30
31    ~RsdCpuScriptIntrinsicBlur() override;
32    RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
33
34protected:
35    float mFp[104];
36    uint16_t mIp[104];
37    void **mScratch;
38    size_t *mScratchSize;
39    float mRadius;
40    int mIradius;
41    ObjectBaseRef<Allocation> mAlloc;
42
43    static void kernelU4(const RsExpandKernelDriverInfo *info,
44                         uint32_t xstart, uint32_t xend,
45                         uint32_t outstep);
46    static void kernelU1(const RsExpandKernelDriverInfo *info,
47                         uint32_t xstart, uint32_t xend,
48                         uint32_t outstep);
49    void ComputeGaussianWeights();
50};
51
52
53void RsdCpuScriptIntrinsicBlur::ComputeGaussianWeights() {
54    memset(mFp, 0, sizeof(mFp));
55    memset(mIp, 0, sizeof(mIp));
56
57    // Compute gaussian weights for the blur
58    // e is the euler's number
59    // TODO Define these constants only once
60    float e = 2.718281828459045f;
61    float pi = 3.1415926535897932f;
62    // g(x) = (1 / (sqrt(2 * pi) * sigma)) * e ^ (-x^2 / (2 * sigma^2))
63    // x is of the form [-radius .. 0 .. radius]
64    // and sigma varies with the radius.
65    // Based on some experimental radius values and sigmas,
66    // we approximately fit sigma = f(radius) as
67    // sigma = radius * 0.4  + 0.6
68    // The larger the radius gets, the more our gaussian blur
69    // will resemble a box blur since with large sigma
70    // the gaussian curve begins to lose its shape
71    float sigma = 0.4f * mRadius + 0.6f;
72
73    // Now compute the coefficients. We will store some redundant values to save
74    // some math during the blur calculations precompute some values
75    float coeff1 = 1.0f / (sqrtf(2.0f * pi) * sigma);
76    float coeff2 = - 1.0f / (2.0f * sigma * sigma);
77
78    float normalizeFactor = 0.0f;
79    float floatR = 0.0f;
80    int r;
81    mIradius = (float)ceil(mRadius) + 0.5f;
82    for (r = -mIradius; r <= mIradius; r ++) {
83        floatR = (float)r;
84        mFp[r + mIradius] = coeff1 * powf(e, floatR * floatR * coeff2);
85        normalizeFactor += mFp[r + mIradius];
86    }
87
88    // Now we need to normalize the weights because all our coefficients need to add up to one
89    normalizeFactor = 1.0f / normalizeFactor;
90    for (r = -mIradius; r <= mIradius; r ++) {
91        mFp[r + mIradius] *= normalizeFactor;
92        mIp[r + mIradius] = (uint16_t)(mFp[r + mIradius] * 65536.0f + 0.5f);
93    }
94}
95
96void RsdCpuScriptIntrinsicBlur::setGlobalObj(uint32_t slot, ObjectBase *data) {
97    rsAssert(slot == 1);
98    mAlloc.set(static_cast<Allocation *>(data));
99}
100
101void RsdCpuScriptIntrinsicBlur::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
102    rsAssert(slot == 0);
103    mRadius = ((const float *)data)[0];
104    ComputeGaussianWeights();
105}
106
107
108
109static void OneVU4(const RsExpandKernelDriverInfo *info, float4 *out, int32_t x, int32_t y,
110                   const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
111
112    const uchar *pi = ptrIn + x*4;
113
114    float4 blurredPixel = 0;
115    for (int r = -iradius; r <= iradius; r ++) {
116        int validY = rsMax((y + r), 0);
117        validY = rsMin(validY, (int)(info->dim.y- 1));
118        const uchar4 *pvy = (const uchar4 *)&pi[validY * iStride];
119        float4 pf = convert_float4(pvy[0]);
120        blurredPixel += pf * gPtr[0];
121        gPtr++;
122    }
123
124    out[0] = blurredPixel;
125}
126
127static void OneVU1(const RsExpandKernelDriverInfo *info, float *out, int32_t x, int32_t y,
128                   const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
129
130    const uchar *pi = ptrIn + x;
131
132    float blurredPixel = 0;
133    for (int r = -iradius; r <= iradius; r ++) {
134        int validY = rsMax((y + r), 0);
135        validY = rsMin(validY, (int)(info->dim.y - 1));
136        float pf = (float)pi[validY * iStride];
137        blurredPixel += pf * gPtr[0];
138        gPtr++;
139    }
140
141    out[0] = blurredPixel;
142}
143
144} // namespace renderscript
145} // namespace android
146
147
148extern "C" void rsdIntrinsicBlurU1_K(uchar *out, uchar const *in, size_t w, size_t h,
149                 size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab);
150extern "C" void rsdIntrinsicBlurU4_K(uchar4 *out, uchar4 const *in, size_t w, size_t h,
151                 size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab);
152
153#if defined(ARCH_X86_HAVE_SSSE3)
154extern void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride, const void *gptr, int rct, int x1, int ct);
155extern void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct);
156extern void rsdIntrinsicBlurHFU1_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct);
157#endif
158
159using android::renderscript::gArchUseSIMD;
160
161static void OneVFU4(float4 *out,
162                    const uchar *ptrIn, int iStride, const float* gPtr, int ct,
163                    int x1, int x2) {
164    out += x1;
165#if defined(ARCH_X86_HAVE_SSSE3)
166    if (gArchUseSIMD) {
167        int t = (x2 - x1);
168        t &= ~1;
169        if (t) {
170            rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t);
171        }
172        x1 += t;
173        out += t;
174        ptrIn += t << 2;
175    }
176#endif
177    while(x2 > x1) {
178        const uchar *pi = ptrIn;
179        float4 blurredPixel = 0;
180        const float* gp = gPtr;
181
182        for (int r = 0; r < ct; r++) {
183            float4 pf = convert_float4(((const uchar4 *)pi)[0]);
184            blurredPixel += pf * gp[0];
185            pi += iStride;
186            gp++;
187        }
188        out->xyzw = blurredPixel;
189        x1++;
190        out++;
191        ptrIn+=4;
192    }
193}
194
195static void OneVFU1(float *out,
196                    const uchar *ptrIn, int iStride, const float* gPtr, int ct, int x1, int x2) {
197
198    int len = x2 - x1;
199    out += x1;
200
201    while((x2 > x1) && (((uintptr_t)ptrIn) & 0x3)) {
202        const uchar *pi = ptrIn;
203        float blurredPixel = 0;
204        const float* gp = gPtr;
205
206        for (int r = 0; r < ct; r++) {
207            float pf = (float)pi[0];
208            blurredPixel += pf * gp[0];
209            pi += iStride;
210            gp++;
211        }
212        out[0] = blurredPixel;
213        x1++;
214        out++;
215        ptrIn++;
216        len--;
217    }
218#if defined(ARCH_X86_HAVE_SSSE3)
219    if (gArchUseSIMD && (x2 > x1)) {
220        int t = (x2 - x1) >> 2;
221        t &= ~1;
222        if (t) {
223            rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, t );
224            len -= t << 2;
225            ptrIn += t << 2;
226            out += t << 2;
227        }
228    }
229#endif
230    while(len > 0) {
231        const uchar *pi = ptrIn;
232        float blurredPixel = 0;
233        const float* gp = gPtr;
234
235        for (int r = 0; r < ct; r++) {
236            float pf = (float)pi[0];
237            blurredPixel += pf * gp[0];
238            pi += iStride;
239            gp++;
240        }
241        out[0] = blurredPixel;
242        len--;
243        out++;
244        ptrIn++;
245    }
246}
247
248using android::renderscript::rsMin;
249using android::renderscript::rsMax;
250
251static void OneHU4(const RsExpandKernelDriverInfo *info, uchar4 *out, int32_t x,
252                   const float4 *ptrIn, const float* gPtr, int iradius) {
253
254    float4 blurredPixel = 0;
255    for (int r = -iradius; r <= iradius; r ++) {
256        int validX = rsMax((x + r), 0);
257        validX = rsMin(validX, (int)(info->dim.x - 1));
258        float4 pf = ptrIn[validX];
259        blurredPixel += pf * gPtr[0];
260        gPtr++;
261    }
262
263    out->xyzw = convert_uchar4(blurredPixel);
264}
265
266static void OneHU1(const RsExpandKernelDriverInfo *info, uchar *out, int32_t x,
267                   const float *ptrIn, const float* gPtr, int iradius) {
268
269    float blurredPixel = 0;
270    for (int r = -iradius; r <= iradius; r ++) {
271        int validX = rsMax((x + r), 0);
272        validX = rsMin(validX, (int)(info->dim.x - 1));
273        float pf = ptrIn[validX];
274        blurredPixel += pf * gPtr[0];
275        gPtr++;
276    }
277
278    out[0] = (uchar)blurredPixel;
279}
280
281
282namespace android {
283namespace renderscript {
284
285void RsdCpuScriptIntrinsicBlur::kernelU4(const RsExpandKernelDriverInfo *info,
286                                         uint32_t xstart, uint32_t xend,
287                                         uint32_t outstep) {
288
289    float4 stackbuf[2048];
290    float4 *buf = &stackbuf[0];
291    RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)info->usr;
292    if (!cp->mAlloc.get()) {
293        ALOGE("Blur executed without input, skipping");
294        return;
295    }
296    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
297    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
298
299    uchar4 *out = (uchar4 *)info->outPtr[0];
300    uint32_t x1 = xstart;
301    uint32_t x2 = xend;
302
303#if defined(ARCH_ARM_USE_INTRINSICS)
304    if (gArchUseSIMD && info->dim.x >= 4) {
305      rsdIntrinsicBlurU4_K(out, (uchar4 const *)(pin + stride * info->current.y),
306                 info->dim.x, info->dim.y,
307                 stride, x1, info->current.y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
308        return;
309    }
310#endif
311
312    if (info->dim.x > 2048) {
313        if ((info->dim.x > cp->mScratchSize[info->lid]) || !cp->mScratch[info->lid]) {
314            // Pad the side of the allocation by one unit to allow alignment later
315            cp->mScratch[info->lid] = realloc(cp->mScratch[info->lid], (info->dim.x + 1) * 16);
316            cp->mScratchSize[info->lid] = info->dim.x;
317        }
318        // realloc only aligns to 8 bytes so we manually align to 16.
319        buf = (float4 *) ((((intptr_t)cp->mScratch[info->lid]) + 15) & ~0xf);
320    }
321    float4 *fout = (float4 *)buf;
322    int y = info->current.y;
323    if ((y > cp->mIradius) && (y < ((int)info->dim.y - cp->mIradius))) {
324        const uchar *pi = pin + (y - cp->mIradius) * stride;
325        OneVFU4(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, 0, info->dim.x);
326    } else {
327        x1 = 0;
328        while(info->dim.x > x1) {
329            OneVU4(info, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
330            fout++;
331            x1++;
332        }
333    }
334
335    x1 = xstart;
336    while ((x1 < (uint32_t)cp->mIradius) && (x1 < x2)) {
337        OneHU4(info, out, x1, buf, cp->mFp, cp->mIradius);
338        out++;
339        x1++;
340    }
341#if defined(ARCH_X86_HAVE_SSSE3)
342    if (gArchUseSIMD) {
343        if ((x1 + cp->mIradius) < x2) {
344            rsdIntrinsicBlurHFU4_K(out, buf - cp->mIradius, cp->mFp,
345                                   cp->mIradius * 2 + 1, x1, x2 - cp->mIradius);
346            out += (x2 - cp->mIradius) - x1;
347            x1 = x2 - cp->mIradius;
348        }
349    }
350#endif
351    while(x2 > x1) {
352        OneHU4(info, out, x1, buf, cp->mFp, cp->mIradius);
353        out++;
354        x1++;
355    }
356}
357
358void RsdCpuScriptIntrinsicBlur::kernelU1(const RsExpandKernelDriverInfo *info,
359                                         uint32_t xstart, uint32_t xend,
360                                         uint32_t outstep) {
361    float buf[4 * 2048];
362    RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)info->usr;
363    if (!cp->mAlloc.get()) {
364        ALOGE("Blur executed without input, skipping");
365        return;
366    }
367    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
368    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
369
370    uchar *out = (uchar *)info->outPtr[0];
371    uint32_t x1 = xstart;
372    uint32_t x2 = xend;
373
374#if defined(ARCH_ARM_USE_INTRINSICS)
375    if (gArchUseSIMD && info->dim.x >= 16) {
376        // The specialisation for r<=8 has an awkward prefill case, which is
377        // fiddly to resolve, where starting close to the right edge can cause
378        // a read beyond the end of input.  So avoid that case here.
379        if (cp->mIradius > 8 || (info->dim.x - rsMax(0, (int32_t)x1 - 8)) >= 16) {
380            rsdIntrinsicBlurU1_K(out, pin + stride * info->current.y, info->dim.x, info->dim.y,
381                     stride, x1, info->current.y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
382            return;
383        }
384    }
385#endif
386
387    float *fout = (float *)buf;
388    int y = info->current.y;
389    if ((y > cp->mIradius) && (y < ((int)info->dim.y - cp->mIradius -1))) {
390        const uchar *pi = pin + (y - cp->mIradius) * stride;
391        OneVFU1(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, 0, info->dim.x);
392    } else {
393        x1 = 0;
394        while(info->dim.x > x1) {
395            OneVU1(info, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
396            fout++;
397            x1++;
398        }
399    }
400
401    x1 = xstart;
402    while ((x1 < x2) &&
403           ((x1 < (uint32_t)cp->mIradius) || (((uintptr_t)out) & 0x3))) {
404        OneHU1(info, out, x1, buf, cp->mFp, cp->mIradius);
405        out++;
406        x1++;
407    }
408#if defined(ARCH_X86_HAVE_SSSE3)
409    if (gArchUseSIMD) {
410        if ((x1 + cp->mIradius) < x2) {
411            uint32_t len = x2 - (x1 + cp->mIradius);
412            len &= ~3;
413            if (len > 0) {
414                rsdIntrinsicBlurHFU1_K(out, ((float *)buf) - cp->mIradius, cp->mFp,
415                                       cp->mIradius * 2 + 1, x1, x1 + len);
416                out += len;
417                x1 += len;
418            }
419        }
420    }
421#endif
422    while(x2 > x1) {
423        OneHU1(info, out, x1, buf, cp->mFp, cp->mIradius);
424        out++;
425        x1++;
426    }
427}
428
429RsdCpuScriptIntrinsicBlur::RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx,
430                                                     const Script *s, const Element *e)
431            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_BLUR) {
432
433    mRootPtr = nullptr;
434    if (e->getType() == RS_TYPE_UNSIGNED_8) {
435        switch (e->getVectorSize()) {
436        case 1:
437            mRootPtr = &kernelU1;
438            break;
439        case 4:
440            mRootPtr = &kernelU4;
441            break;
442        }
443    }
444    rsAssert(mRootPtr);
445    mRadius = 5;
446
447    mScratch = new void *[mCtx->getThreadCount()];
448    mScratchSize = new size_t[mCtx->getThreadCount()];
449    memset(mScratch, 0, sizeof(void *) * mCtx->getThreadCount());
450    memset(mScratchSize, 0, sizeof(size_t) * mCtx->getThreadCount());
451
452    ComputeGaussianWeights();
453}
454
455RsdCpuScriptIntrinsicBlur::~RsdCpuScriptIntrinsicBlur() {
456    uint32_t threads = mCtx->getThreadCount();
457    if (mScratch) {
458        for (size_t i = 0; i < threads; i++) {
459            if (mScratch[i]) {
460                free(mScratch[i]);
461            }
462        }
463        delete []mScratch;
464    }
465    if (mScratchSize) {
466        delete []mScratchSize;
467    }
468}
469
470void RsdCpuScriptIntrinsicBlur::populateScript(Script *s) {
471    s->mHal.info.exportedVariableCount = 2;
472}
473
474void RsdCpuScriptIntrinsicBlur::invokeFreeChildren() {
475    mAlloc.clear();
476}
477
478RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
479
480    return new RsdCpuScriptIntrinsicBlur(ctx, s, e);
481}
482
483} // namespace renderscript
484} // namespace android
485