1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17
18#include "rsCpuIntrinsic.h"
19#include "rsCpuIntrinsicInlines.h"
20
21using namespace android;
22using namespace android::renderscript;
23
24namespace android {
25namespace renderscript {
26
27
28class RsdCpuScriptIntrinsicConvolve3x3 : public RsdCpuScriptIntrinsic {
29public:
30    virtual void populateScript(Script *);
31    virtual void invokeFreeChildren();
32
33    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
34    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
35
36    virtual ~RsdCpuScriptIntrinsicConvolve3x3();
37    RsdCpuScriptIntrinsicConvolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *);
38
39protected:
40    float mFp[16];
41    short mIp[16];
42    ObjectBaseRef<const Allocation> mAlloc;
43    ObjectBaseRef<const Element> mElement;
44
45    static void kernelU1(const RsForEachStubParamStruct *p,
46                         uint32_t xstart, uint32_t xend,
47                         uint32_t instep, uint32_t outstep);
48    static void kernelU2(const RsForEachStubParamStruct *p,
49                         uint32_t xstart, uint32_t xend,
50                         uint32_t instep, uint32_t outstep);
51    static void kernelU4(const RsForEachStubParamStruct *p,
52                         uint32_t xstart, uint32_t xend,
53                         uint32_t instep, uint32_t outstep);
54    static void kernelF1(const RsForEachStubParamStruct *p,
55                         uint32_t xstart, uint32_t xend,
56                         uint32_t instep, uint32_t outstep);
57    static void kernelF2(const RsForEachStubParamStruct *p,
58                         uint32_t xstart, uint32_t xend,
59                         uint32_t instep, uint32_t outstep);
60    static void kernelF4(const RsForEachStubParamStruct *p,
61                         uint32_t xstart, uint32_t xend,
62                         uint32_t instep, uint32_t outstep);
63};
64
65}
66}
67
68
69void RsdCpuScriptIntrinsicConvolve3x3::setGlobalObj(uint32_t slot, ObjectBase *data) {
70    rsAssert(slot == 1);
71    mAlloc.set(static_cast<Allocation *>(data));
72}
73
74void RsdCpuScriptIntrinsicConvolve3x3::setGlobalVar(uint32_t slot, const void *data,
75                                                    size_t dataLength) {
76    rsAssert(slot == 0);
77    memcpy (&mFp, data, dataLength);
78    for(int ct=0; ct < 9; ct++) {
79        if (mFp[ct] >= 0) {
80            mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
81        } else {
82            mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f);
83        }
84    }
85}
86
// Externally implemented convolution routine (C linkage). In this file it is
// only called from kernelU4, which passes the output pointer, pointers into
// the three source rows, the fixed-point coefficient table (mIp) and a count.
// The caller advances its x position and output pointer by two pixels for
// every unit of count.
extern "C" void rsdIntrinsicConvolve3x3_K(void *dst,
                                          const void *y0, const void *y1, const void *y2,
                                          const short *coef, uint32_t count);
89
90
91static void ConvolveOneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
92                          const uchar4 *py0, const uchar4 *py1, const uchar4 *py2,
93                          const float* coeff) {
94
95    uint32_t x1 = rsMax((int32_t)x-1, 0);
96    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
97
98    float4 px = convert_float4(py0[x1]) * coeff[0] +
99                convert_float4(py0[x]) * coeff[1] +
100                convert_float4(py0[x2]) * coeff[2] +
101                convert_float4(py1[x1]) * coeff[3] +
102                convert_float4(py1[x]) * coeff[4] +
103                convert_float4(py1[x2]) * coeff[5] +
104                convert_float4(py2[x1]) * coeff[6] +
105                convert_float4(py2[x]) * coeff[7] +
106                convert_float4(py2[x2]) * coeff[8];
107
108    px = clamp(px + 0.5f, 0.f, 255.f);
109    uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w};
110    *out = o;
111}
112
113static void ConvolveOneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out,
114                          const uchar2 *py0, const uchar2 *py1, const uchar2 *py2,
115                          const float* coeff) {
116
117    uint32_t x1 = rsMax((int32_t)x-1, 0);
118    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
119
120    float2 px = convert_float2(py0[x1]) * coeff[0] +
121                convert_float2(py0[x]) * coeff[1] +
122                convert_float2(py0[x2]) * coeff[2] +
123                convert_float2(py1[x1]) * coeff[3] +
124                convert_float2(py1[x]) * coeff[4] +
125                convert_float2(py1[x2]) * coeff[5] +
126                convert_float2(py2[x1]) * coeff[6] +
127                convert_float2(py2[x]) * coeff[7] +
128                convert_float2(py2[x2]) * coeff[8];
129
130    px = clamp(px + 0.5f, 0.f, 255.f);
131    *out = convert_uchar2(px);
132}
133
134static void ConvolveOneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out,
135                          const uchar *py0, const uchar *py1, const uchar *py2,
136                          const float* coeff) {
137
138    uint32_t x1 = rsMax((int32_t)x-1, 0);
139    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
140
141    float px = ((float)py0[x1]) * coeff[0] +
142               ((float)py0[x]) * coeff[1] +
143               ((float)py0[x2]) * coeff[2] +
144               ((float)py1[x1]) * coeff[3] +
145               ((float)py1[x]) * coeff[4] +
146               ((float)py1[x2]) * coeff[5] +
147               ((float)py2[x1]) * coeff[6] +
148               ((float)py2[x]) * coeff[7] +
149               ((float)py2[x2]) * coeff[8];
150    *out = clamp(px + 0.5f, 0.f, 255.f);
151}
152
153static void ConvolveOneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
154                          const float4 *py0, const float4 *py1, const float4 *py2,
155                          const float* coeff) {
156
157    uint32_t x1 = rsMax((int32_t)x-1, 0);
158    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
159    *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
160           (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
161           (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
162}
163
164static void ConvolveOneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out,
165                          const float2 *py0, const float2 *py1, const float2 *py2,
166                          const float* coeff) {
167
168    uint32_t x1 = rsMax((int32_t)x-1, 0);
169    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
170    *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
171           (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
172           (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
173}
174
175static void ConvolveOneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out,
176                          const float *py0, const float *py1, const float *py2,
177                          const float* coeff) {
178
179    uint32_t x1 = rsMax((int32_t)x-1, 0);
180    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
181    *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
182           (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
183           (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
184}
185
186void RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsForEachStubParamStruct *p,
187                                                uint32_t xstart, uint32_t xend,
188                                                uint32_t instep, uint32_t outstep) {
189    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
190
191    if (!cp->mAlloc.get()) {
192        ALOGE("Convolve3x3 executed without input, skipping");
193        return;
194    }
195    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
196    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
197
198    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
199    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
200    const uchar4 *py0 = (const uchar4 *)(pin + stride * y2);
201    const uchar4 *py1 = (const uchar4 *)(pin + stride * p->y);
202    const uchar4 *py2 = (const uchar4 *)(pin + stride * y1);
203
204    uchar4 *out = (uchar4 *)p->out;
205    uint32_t x1 = xstart;
206    uint32_t x2 = xend;
207    if(x1 == 0) {
208        ConvolveOneU4(p, 0, out, py0, py1, py2, cp->mFp);
209        x1 ++;
210        out++;
211    }
212
213    if(x2 > x1) {
214#if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3)
215        if (gArchUseSIMD) {
216            int32_t len = (x2 - x1 - 1) >> 1;
217            if(len > 0) {
218                rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
219                x1 += len << 1;
220                out += len << 1;
221            }
222        }
223#endif
224
225        while(x1 != x2) {
226            ConvolveOneU4(p, x1, out, py0, py1, py2, cp->mFp);
227            out++;
228            x1++;
229        }
230    }
231}
232
233void RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsForEachStubParamStruct *p,
234                                                uint32_t xstart, uint32_t xend,
235                                                uint32_t instep, uint32_t outstep) {
236    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
237
238    if (!cp->mAlloc.get()) {
239        ALOGE("Convolve3x3 executed without input, skipping");
240        return;
241    }
242    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
243    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
244
245    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
246    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
247    const uchar2 *py0 = (const uchar2 *)(pin + stride * y2);
248    const uchar2 *py1 = (const uchar2 *)(pin + stride * p->y);
249    const uchar2 *py2 = (const uchar2 *)(pin + stride * y1);
250
251    uchar2 *out = (uchar2 *)p->out;
252    uint32_t x1 = xstart;
253    uint32_t x2 = xend;
254    if(x1 == 0) {
255        ConvolveOneU2(p, 0, out, py0, py1, py2, cp->mFp);
256        x1 ++;
257        out++;
258    }
259
260    if(x2 > x1) {
261#if 0//defined(ARCH_ARM_HAVE_NEON)
262        int32_t len = (x2 - x1 - 1) >> 1;
263        if(len > 0) {
264            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
265            x1 += len << 1;
266            out += len << 1;
267        }
268#endif
269
270        while(x1 != x2) {
271            ConvolveOneU2(p, x1, out, py0, py1, py2, cp->mFp);
272            out++;
273            x1++;
274        }
275    }
276}
277
278void RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsForEachStubParamStruct *p,
279                                                uint32_t xstart, uint32_t xend,
280                                                uint32_t instep, uint32_t outstep) {
281    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
282
283    if (!cp->mAlloc.get()) {
284        ALOGE("Convolve3x3 executed without input, skipping");
285        return;
286    }
287    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
288    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
289
290    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
291    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
292    const uchar *py0 = (const uchar *)(pin + stride * y2);
293    const uchar *py1 = (const uchar *)(pin + stride * p->y);
294    const uchar *py2 = (const uchar *)(pin + stride * y1);
295
296    uchar *out = (uchar *)p->out;
297    uint32_t x1 = xstart;
298    uint32_t x2 = xend;
299    if(x1 == 0) {
300        ConvolveOneU1(p, 0, out, py0, py1, py2, cp->mFp);
301        x1 ++;
302        out++;
303    }
304
305    if(x2 > x1) {
306#if 0//defined(ARCH_ARM_HAVE_NEON)
307        int32_t len = (x2 - x1 - 1) >> 1;
308        if(len > 0) {
309            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
310            x1 += len << 1;
311            out += len << 1;
312        }
313#endif
314
315        while(x1 != x2) {
316            ConvolveOneU1(p, x1, out, py0, py1, py2, cp->mFp);
317            out++;
318            x1++;
319        }
320    }
321}
322
323void RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsForEachStubParamStruct *p,
324                                                uint32_t xstart, uint32_t xend,
325                                                uint32_t instep, uint32_t outstep) {
326    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
327
328    if (!cp->mAlloc.get()) {
329        ALOGE("Convolve3x3 executed without input, skipping");
330        return;
331    }
332    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
333    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
334
335    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
336    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
337    const float4 *py0 = (const float4 *)(pin + stride * y2);
338    const float4 *py1 = (const float4 *)(pin + stride * p->y);
339    const float4 *py2 = (const float4 *)(pin + stride * y1);
340
341    float4 *out = (float4 *)p->out;
342    uint32_t x1 = xstart;
343    uint32_t x2 = xend;
344    if(x1 == 0) {
345        ConvolveOneF4(p, 0, out, py0, py1, py2, cp->mFp);
346        x1 ++;
347        out++;
348    }
349
350    if(x2 > x1) {
351#if 0//defined(ARCH_ARM_HAVE_NEON)
352        int32_t len = (x2 - x1 - 1) >> 1;
353        if(len > 0) {
354            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
355            x1 += len << 1;
356            out += len << 1;
357        }
358#endif
359
360        while(x1 != x2) {
361            ConvolveOneF4(p, x1, out, py0, py1, py2, cp->mFp);
362            out++;
363            x1++;
364        }
365    }
366}
367
368void RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsForEachStubParamStruct *p,
369                                                uint32_t xstart, uint32_t xend,
370                                                uint32_t instep, uint32_t outstep) {
371    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
372
373    if (!cp->mAlloc.get()) {
374        ALOGE("Convolve3x3 executed without input, skipping");
375        return;
376    }
377    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
378    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
379
380    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
381    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
382    const float2 *py0 = (const float2 *)(pin + stride * y2);
383    const float2 *py1 = (const float2 *)(pin + stride * p->y);
384    const float2 *py2 = (const float2 *)(pin + stride * y1);
385
386    float2 *out = (float2 *)p->out;
387    uint32_t x1 = xstart;
388    uint32_t x2 = xend;
389    if(x1 == 0) {
390        ConvolveOneF2(p, 0, out, py0, py1, py2, cp->mFp);
391        x1 ++;
392        out++;
393    }
394
395    if(x2 > x1) {
396#if 0//defined(ARCH_ARM_HAVE_NEON)
397        int32_t len = (x2 - x1 - 1) >> 1;
398        if(len > 0) {
399            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
400            x1 += len << 1;
401            out += len << 1;
402        }
403#endif
404
405        while(x1 != x2) {
406            ConvolveOneF2(p, x1, out, py0, py1, py2, cp->mFp);
407            out++;
408            x1++;
409        }
410    }
411}
412void RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsForEachStubParamStruct *p,
413                                                uint32_t xstart, uint32_t xend,
414                                                uint32_t instep, uint32_t outstep) {
415    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
416
417    if (!cp->mAlloc.get()) {
418        ALOGE("Convolve3x3 executed without input, skipping");
419        return;
420    }
421    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
422    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
423
424    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
425    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
426    const float *py0 = (const float *)(pin + stride * y2);
427    const float *py1 = (const float *)(pin + stride * p->y);
428    const float *py2 = (const float *)(pin + stride * y1);
429
430    float *out = (float *)p->out;
431    uint32_t x1 = xstart;
432    uint32_t x2 = xend;
433    if(x1 == 0) {
434        ConvolveOneF1(p, 0, out, py0, py1, py2, cp->mFp);
435        x1 ++;
436        out++;
437    }
438
439    if(x2 > x1) {
440#if 0//defined(ARCH_ARM_HAVE_NEON)
441        int32_t len = (x2 - x1 - 1) >> 1;
442        if(len > 0) {
443            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
444            x1 += len << 1;
445            out += len << 1;
446        }
447#endif
448
449        while(x1 != x2) {
450            ConvolveOneF1(p, x1, out, py0, py1, py2, cp->mFp);
451            out++;
452            x1++;
453        }
454    }
455}
456
457RsdCpuScriptIntrinsicConvolve3x3::RsdCpuScriptIntrinsicConvolve3x3(
458            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
459            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3) {
460
461    if (e->getType() == RS_TYPE_FLOAT_32) {
462        switch(e->getVectorSize()) {
463        case 1:
464            mRootPtr = &kernelF1;
465            break;
466        case 2:
467            mRootPtr = &kernelF2;
468            break;
469        case 3:
470        case 4:
471            mRootPtr = &kernelF4;
472            break;
473        }
474    } else {
475        switch(e->getVectorSize()) {
476        case 1:
477            mRootPtr = &kernelU1;
478            break;
479        case 2:
480            mRootPtr = &kernelU2;
481            break;
482        case 3:
483        case 4:
484            mRootPtr = &kernelU4;
485            break;
486        }
487    }
488    for(int ct=0; ct < 9; ct++) {
489        mFp[ct] = 1.f / 9.f;
490        mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
491    }
492}
493
494RsdCpuScriptIntrinsicConvolve3x3::~RsdCpuScriptIntrinsicConvolve3x3() {
495}
496
497void RsdCpuScriptIntrinsicConvolve3x3::populateScript(Script *s) {
498    s->mHal.info.exportedVariableCount = 2;
499}
500
501void RsdCpuScriptIntrinsicConvolve3x3::invokeFreeChildren() {
502    mAlloc.clear();
503}
504
505
506RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
507
508    return new RsdCpuScriptIntrinsicConvolve3x3(ctx, s, e);
509}
510
511
512