rsCpuIntrinsicConvolve5x5.cpp revision fdceadb811ec22c69f879ea0d0108be3d287708b
1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17
18#include "rsCpuIntrinsic.h"
19#include "rsCpuIntrinsicInlines.h"
20
21using namespace android;
22using namespace android::renderscript;
23
24namespace android {
25namespace renderscript {
26
27
28class RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic {
29public:
30    virtual void populateScript(Script *);
31    virtual void invokeFreeChildren();
32
33    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
34    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
35
36    virtual ~RsdCpuScriptIntrinsicConvolve5x5();
37    RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
38
39protected:
40    float mFp[28];
41    short mIp[28];
42    ObjectBaseRef<Allocation> alloc;
43
44
45    static void kernelU1(const RsExpandKernelParams *p,
46                         uint32_t xstart, uint32_t xend,
47                         uint32_t outstep);
48    static void kernelU2(const RsExpandKernelParams *p,
49                         uint32_t xstart, uint32_t xend,
50                         uint32_t outstep);
51    static void kernelU4(const RsExpandKernelParams *p,
52                         uint32_t xstart, uint32_t xend,
53                         uint32_t outstep);
54    static void kernelF1(const RsExpandKernelParams *p,
55                         uint32_t xstart, uint32_t xend,
56                         uint32_t outstep);
57    static void kernelF2(const RsExpandKernelParams *p,
58                         uint32_t xstart, uint32_t xend,
59                         uint32_t outstep);
60    static void kernelF4(const RsExpandKernelParams *p,
61                         uint32_t xstart, uint32_t xend,
62                         uint32_t outstep);
63
64
65};
66
67}
68}
69
70void RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) {
71    rsAssert(slot == 1);
72    alloc.set(static_cast<Allocation *>(data));
73}
74
75void RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot,
76                                                    const void *data, size_t dataLength) {
77    rsAssert(slot == 0);
78    memcpy (&mFp, data, dataLength);
79    for(int ct=0; ct < 25; ct++) {
80        if (mFp[ct] >= 0) {
81            mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
82        } else {
83            mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f);
84        }
85    }
86}
87
88
89static void OneU4(const RsExpandKernelParams *p, uint32_t x, uchar4 *out,
90                  const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
91                  const float* coeff) {
92
93    uint32_t x0 = rsMax((int32_t)x-2, 0);
94    uint32_t x1 = rsMax((int32_t)x-1, 0);
95    uint32_t x2 = x;
96    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
97    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
98
99    float4 px = convert_float4(py0[x0]) * coeff[0] +
100                convert_float4(py0[x1]) * coeff[1] +
101                convert_float4(py0[x2]) * coeff[2] +
102                convert_float4(py0[x3]) * coeff[3] +
103                convert_float4(py0[x4]) * coeff[4] +
104
105                convert_float4(py1[x0]) * coeff[5] +
106                convert_float4(py1[x1]) * coeff[6] +
107                convert_float4(py1[x2]) * coeff[7] +
108                convert_float4(py1[x3]) * coeff[8] +
109                convert_float4(py1[x4]) * coeff[9] +
110
111                convert_float4(py2[x0]) * coeff[10] +
112                convert_float4(py2[x1]) * coeff[11] +
113                convert_float4(py2[x2]) * coeff[12] +
114                convert_float4(py2[x3]) * coeff[13] +
115                convert_float4(py2[x4]) * coeff[14] +
116
117                convert_float4(py3[x0]) * coeff[15] +
118                convert_float4(py3[x1]) * coeff[16] +
119                convert_float4(py3[x2]) * coeff[17] +
120                convert_float4(py3[x3]) * coeff[18] +
121                convert_float4(py3[x4]) * coeff[19] +
122
123                convert_float4(py4[x0]) * coeff[20] +
124                convert_float4(py4[x1]) * coeff[21] +
125                convert_float4(py4[x2]) * coeff[22] +
126                convert_float4(py4[x3]) * coeff[23] +
127                convert_float4(py4[x4]) * coeff[24];
128    px = clamp(px + 0.5f, 0.f, 255.f);
129    *out = convert_uchar4(px);
130}
131
132static void OneU2(const RsExpandKernelParams *p, uint32_t x, uchar2 *out,
133                  const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4,
134                  const float* coeff) {
135
136    uint32_t x0 = rsMax((int32_t)x-2, 0);
137    uint32_t x1 = rsMax((int32_t)x-1, 0);
138    uint32_t x2 = x;
139    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
140    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
141
142    float2 px = convert_float2(py0[x0]) * coeff[0] +
143                convert_float2(py0[x1]) * coeff[1] +
144                convert_float2(py0[x2]) * coeff[2] +
145                convert_float2(py0[x3]) * coeff[3] +
146                convert_float2(py0[x4]) * coeff[4] +
147
148                convert_float2(py1[x0]) * coeff[5] +
149                convert_float2(py1[x1]) * coeff[6] +
150                convert_float2(py1[x2]) * coeff[7] +
151                convert_float2(py1[x3]) * coeff[8] +
152                convert_float2(py1[x4]) * coeff[9] +
153
154                convert_float2(py2[x0]) * coeff[10] +
155                convert_float2(py2[x1]) * coeff[11] +
156                convert_float2(py2[x2]) * coeff[12] +
157                convert_float2(py2[x3]) * coeff[13] +
158                convert_float2(py2[x4]) * coeff[14] +
159
160                convert_float2(py3[x0]) * coeff[15] +
161                convert_float2(py3[x1]) * coeff[16] +
162                convert_float2(py3[x2]) * coeff[17] +
163                convert_float2(py3[x3]) * coeff[18] +
164                convert_float2(py3[x4]) * coeff[19] +
165
166                convert_float2(py4[x0]) * coeff[20] +
167                convert_float2(py4[x1]) * coeff[21] +
168                convert_float2(py4[x2]) * coeff[22] +
169                convert_float2(py4[x3]) * coeff[23] +
170                convert_float2(py4[x4]) * coeff[24];
171    px = clamp(px + 0.5f, 0.f, 255.f);
172    *out = convert_uchar2(px);
173}
174
175static void OneU1(const RsExpandKernelParams *p, uint32_t x, uchar *out,
176                  const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4,
177                  const float* coeff) {
178
179    uint32_t x0 = rsMax((int32_t)x-2, 0);
180    uint32_t x1 = rsMax((int32_t)x-1, 0);
181    uint32_t x2 = x;
182    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
183    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
184
185    float px = (float)(py0[x0]) * coeff[0] +
186               (float)(py0[x1]) * coeff[1] +
187               (float)(py0[x2]) * coeff[2] +
188               (float)(py0[x3]) * coeff[3] +
189               (float)(py0[x4]) * coeff[4] +
190
191               (float)(py1[x0]) * coeff[5] +
192               (float)(py1[x1]) * coeff[6] +
193               (float)(py1[x2]) * coeff[7] +
194               (float)(py1[x3]) * coeff[8] +
195               (float)(py1[x4]) * coeff[9] +
196
197               (float)(py2[x0]) * coeff[10] +
198               (float)(py2[x1]) * coeff[11] +
199               (float)(py2[x2]) * coeff[12] +
200               (float)(py2[x3]) * coeff[13] +
201               (float)(py2[x4]) * coeff[14] +
202
203               (float)(py3[x0]) * coeff[15] +
204               (float)(py3[x1]) * coeff[16] +
205               (float)(py3[x2]) * coeff[17] +
206               (float)(py3[x3]) * coeff[18] +
207               (float)(py3[x4]) * coeff[19] +
208
209               (float)(py4[x0]) * coeff[20] +
210               (float)(py4[x1]) * coeff[21] +
211               (float)(py4[x2]) * coeff[22] +
212               (float)(py4[x3]) * coeff[23] +
213               (float)(py4[x4]) * coeff[24];
214    px = clamp(px + 0.5f, 0.f, 255.f);
215    *out = px;
216}
217
218static void OneF4(const RsExpandKernelParams *p, uint32_t x, float4 *out,
219                  const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4,
220                  const float* coeff) {
221
222    uint32_t x0 = rsMax((int32_t)x-2, 0);
223    uint32_t x1 = rsMax((int32_t)x-1, 0);
224    uint32_t x2 = x;
225    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
226    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
227
228    float4 px = py0[x0] * coeff[0] +
229                py0[x1] * coeff[1] +
230                py0[x2] * coeff[2] +
231                py0[x3] * coeff[3] +
232                py0[x4] * coeff[4] +
233
234                py1[x0] * coeff[5] +
235                py1[x1] * coeff[6] +
236                py1[x2] * coeff[7] +
237                py1[x3] * coeff[8] +
238                py1[x4] * coeff[9] +
239
240                py2[x0] * coeff[10] +
241                py2[x1] * coeff[11] +
242                py2[x2] * coeff[12] +
243                py2[x3] * coeff[13] +
244                py2[x4] * coeff[14] +
245
246                py3[x0] * coeff[15] +
247                py3[x1] * coeff[16] +
248                py3[x2] * coeff[17] +
249                py3[x3] * coeff[18] +
250                py3[x4] * coeff[19] +
251
252                py4[x0] * coeff[20] +
253                py4[x1] * coeff[21] +
254                py4[x2] * coeff[22] +
255                py4[x3] * coeff[23] +
256                py4[x4] * coeff[24];
257    *out = px;
258}
259
260static void OneF2(const RsExpandKernelParams *p, uint32_t x, float2 *out,
261                  const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4,
262                  const float* coeff) {
263
264    uint32_t x0 = rsMax((int32_t)x-2, 0);
265    uint32_t x1 = rsMax((int32_t)x-1, 0);
266    uint32_t x2 = x;
267    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
268    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
269
270    float2 px = py0[x0] * coeff[0] +
271                py0[x1] * coeff[1] +
272                py0[x2] * coeff[2] +
273                py0[x3] * coeff[3] +
274                py0[x4] * coeff[4] +
275
276                py1[x0] * coeff[5] +
277                py1[x1] * coeff[6] +
278                py1[x2] * coeff[7] +
279                py1[x3] * coeff[8] +
280                py1[x4] * coeff[9] +
281
282                py2[x0] * coeff[10] +
283                py2[x1] * coeff[11] +
284                py2[x2] * coeff[12] +
285                py2[x3] * coeff[13] +
286                py2[x4] * coeff[14] +
287
288                py3[x0] * coeff[15] +
289                py3[x1] * coeff[16] +
290                py3[x2] * coeff[17] +
291                py3[x3] * coeff[18] +
292                py3[x4] * coeff[19] +
293
294                py4[x0] * coeff[20] +
295                py4[x1] * coeff[21] +
296                py4[x2] * coeff[22] +
297                py4[x3] * coeff[23] +
298                py4[x4] * coeff[24];
299    *out = px;
300}
301
302static void OneF1(const RsExpandKernelParams *p, uint32_t x, float *out,
303                  const float *py0, const float *py1, const float *py2, const float *py3, const float *py4,
304                  const float* coeff) {
305
306    uint32_t x0 = rsMax((int32_t)x-2, 0);
307    uint32_t x1 = rsMax((int32_t)x-1, 0);
308    uint32_t x2 = x;
309    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
310    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
311
312    float px = py0[x0] * coeff[0] +
313               py0[x1] * coeff[1] +
314               py0[x2] * coeff[2] +
315               py0[x3] * coeff[3] +
316               py0[x4] * coeff[4] +
317
318               py1[x0] * coeff[5] +
319               py1[x1] * coeff[6] +
320               py1[x2] * coeff[7] +
321               py1[x3] * coeff[8] +
322               py1[x4] * coeff[9] +
323
324               py2[x0] * coeff[10] +
325               py2[x1] * coeff[11] +
326               py2[x2] * coeff[12] +
327               py2[x3] * coeff[13] +
328               py2[x4] * coeff[14] +
329
330               py3[x0] * coeff[15] +
331               py3[x1] * coeff[16] +
332               py3[x2] * coeff[17] +
333               py3[x3] * coeff[18] +
334               py3[x4] * coeff[19] +
335
336               py4[x0] * coeff[20] +
337               py4[x1] * coeff[21] +
338               py4[x2] * coeff[22] +
339               py4[x3] * coeff[23] +
340               py4[x4] * coeff[24];
341    *out = px;
342}
343
344
// SIMD 5x5 convolution routine with C linkage; it is not defined in this
// file.  kernelU4 passes the output pointer, the five source rows already
// offset to column x-2, the fixed-point coefficient table and a block count.
// The caller advances by 4 elements per count on the x86 path and by 2
// elements per count on the ARM path.
extern "C" void rsdIntrinsicConvolve5x5_K(void *dst,
                                          const void *y0, const void *y1, const void *y2,
                                          const void *y3, const void *y4,
                                          const short *coef, uint32_t count);
348
349void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsExpandKernelParams *p,
350                                                uint32_t xstart, uint32_t xend,
351                                                uint32_t outstep) {
352    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
353    if (!cp->alloc.get()) {
354        ALOGE("Convolve5x5 executed without input, skipping");
355        return;
356    }
357    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
358    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
359
360    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
361    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
362    uint32_t y2 = p->y;
363    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
364    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
365
366    const uchar4 *py0 = (const uchar4 *)(pin + stride * y0);
367    const uchar4 *py1 = (const uchar4 *)(pin + stride * y1);
368    const uchar4 *py2 = (const uchar4 *)(pin + stride * y2);
369    const uchar4 *py3 = (const uchar4 *)(pin + stride * y3);
370    const uchar4 *py4 = (const uchar4 *)(pin + stride * y4);
371
372    uchar4 *out = (uchar4 *)p->out;
373    uint32_t x1 = xstart;
374    uint32_t x2 = xend;
375
376    while((x1 < x2) && (x1 < 2)) {
377        OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
378        out++;
379        x1++;
380    }
381#if defined(ARCH_X86_HAVE_SSSE3)
382    // for x86 SIMD, require minimum of 7 elements (4 for SIMD,
383    // 3 for end boundary where x may hit the end boundary)
384    if (gArchUseSIMD &&((x1 + 6) < x2)) {
385        // subtract 3 for end boundary
386        uint32_t len = (x2 - x1 - 3) >> 2;
387        rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
388        out += len << 2;
389        x1 += len << 2;
390    }
391#endif
392
393#if defined(ARCH_ARM_USE_INTRINSICS)
394    if(gArchUseSIMD && ((x1 + 3) < x2)) {
395        uint32_t len = (x2 - x1 - 3) >> 1;
396        rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
397        out += len << 1;
398        x1 += len << 1;
399    }
400#endif
401
402    while(x1 < x2) {
403        OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
404        out++;
405        x1++;
406    }
407}
408
409void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsExpandKernelParams *p,
410                                                uint32_t xstart, uint32_t xend,
411                                                uint32_t outstep) {
412    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
413    if (!cp->alloc.get()) {
414        ALOGE("Convolve5x5 executed without input, skipping");
415        return;
416    }
417    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
418    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
419
420    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
421    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
422    uint32_t y2 = p->y;
423    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
424    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
425
426    const uchar2 *py0 = (const uchar2 *)(pin + stride * y0);
427    const uchar2 *py1 = (const uchar2 *)(pin + stride * y1);
428    const uchar2 *py2 = (const uchar2 *)(pin + stride * y2);
429    const uchar2 *py3 = (const uchar2 *)(pin + stride * y3);
430    const uchar2 *py4 = (const uchar2 *)(pin + stride * y4);
431
432    uchar2 *out = (uchar2 *)p->out;
433    uint32_t x1 = xstart;
434    uint32_t x2 = xend;
435
436    while((x1 < x2) && (x1 < 2)) {
437        OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
438        out++;
439        x1++;
440    }
441
442#if 0//defined(ARCH_ARM_HAVE_NEON)
443    if((x1 + 3) < x2) {
444        uint32_t len = (x2 - x1 - 3) >> 1;
445        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
446        out += len << 1;
447        x1 += len << 1;
448    }
449#endif
450
451    while(x1 < x2) {
452        OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
453        out++;
454        x1++;
455    }
456}
457
458void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsExpandKernelParams *p,
459                                                uint32_t xstart, uint32_t xend,
460                                                uint32_t outstep) {
461    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
462    if (!cp->alloc.get()) {
463        ALOGE("Convolve5x5 executed without input, skipping");
464        return;
465    }
466    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
467    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
468
469    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
470    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
471    uint32_t y2 = p->y;
472    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
473    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
474
475    const uchar *py0 = (const uchar *)(pin + stride * y0);
476    const uchar *py1 = (const uchar *)(pin + stride * y1);
477    const uchar *py2 = (const uchar *)(pin + stride * y2);
478    const uchar *py3 = (const uchar *)(pin + stride * y3);
479    const uchar *py4 = (const uchar *)(pin + stride * y4);
480
481    uchar *out = (uchar *)p->out;
482    uint32_t x1 = xstart;
483    uint32_t x2 = xend;
484
485    while((x1 < x2) && (x1 < 2)) {
486        OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
487        out++;
488        x1++;
489    }
490
491#if 0//defined(ARCH_ARM_HAVE_NEON)
492    if((x1 + 3) < x2) {
493        uint32_t len = (x2 - x1 - 3) >> 1;
494        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
495        out += len << 1;
496        x1 += len << 1;
497    }
498#endif
499
500    while(x1 < x2) {
501        OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
502        out++;
503        x1++;
504    }
505}
506
507void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsExpandKernelParams *p,
508                                                uint32_t xstart, uint32_t xend,
509                                                uint32_t outstep) {
510    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
511    if (!cp->alloc.get()) {
512        ALOGE("Convolve5x5 executed without input, skipping");
513        return;
514    }
515    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
516    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
517
518    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
519    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
520    uint32_t y2 = p->y;
521    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
522    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
523
524    const float4 *py0 = (const float4 *)(pin + stride * y0);
525    const float4 *py1 = (const float4 *)(pin + stride * y1);
526    const float4 *py2 = (const float4 *)(pin + stride * y2);
527    const float4 *py3 = (const float4 *)(pin + stride * y3);
528    const float4 *py4 = (const float4 *)(pin + stride * y4);
529
530    float4 *out = (float4 *)p->out;
531    uint32_t x1 = xstart;
532    uint32_t x2 = xend;
533
534    while((x1 < x2) && (x1 < 2)) {
535        OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
536        out++;
537        x1++;
538    }
539
540#if 0//defined(ARCH_ARM_HAVE_NEON)
541    if((x1 + 3) < x2) {
542        uint32_t len = (x2 - x1 - 3) >> 1;
543        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
544        out += len << 1;
545        x1 += len << 1;
546    }
547#endif
548
549    while(x1 < x2) {
550        OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
551        out++;
552        x1++;
553    }
554}
555
556void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsExpandKernelParams *p,
557                                                uint32_t xstart, uint32_t xend,
558                                                uint32_t outstep) {
559    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
560    if (!cp->alloc.get()) {
561        ALOGE("Convolve5x5 executed without input, skipping");
562        return;
563    }
564    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
565    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
566
567    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
568    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
569    uint32_t y2 = p->y;
570    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
571    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
572
573    const float2 *py0 = (const float2 *)(pin + stride * y0);
574    const float2 *py1 = (const float2 *)(pin + stride * y1);
575    const float2 *py2 = (const float2 *)(pin + stride * y2);
576    const float2 *py3 = (const float2 *)(pin + stride * y3);
577    const float2 *py4 = (const float2 *)(pin + stride * y4);
578
579    float2 *out = (float2 *)p->out;
580    uint32_t x1 = xstart;
581    uint32_t x2 = xend;
582
583    while((x1 < x2) && (x1 < 2)) {
584        OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
585        out++;
586        x1++;
587    }
588
589#if 0//defined(ARCH_ARM_HAVE_NEON)
590    if((x1 + 3) < x2) {
591        uint32_t len = (x2 - x1 - 3) >> 1;
592        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
593        out += len << 1;
594        x1 += len << 1;
595    }
596#endif
597
598    while(x1 < x2) {
599        OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
600        out++;
601        x1++;
602    }
603}
604
605void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsExpandKernelParams *p,
606                                                uint32_t xstart, uint32_t xend,
607                                                uint32_t outstep) {
608    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
609    if (!cp->alloc.get()) {
610        ALOGE("Convolve5x5 executed without input, skipping");
611        return;
612    }
613    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
614    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
615
616    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
617    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
618    uint32_t y2 = p->y;
619    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
620    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
621
622    const float *py0 = (const float *)(pin + stride * y0);
623    const float *py1 = (const float *)(pin + stride * y1);
624    const float *py2 = (const float *)(pin + stride * y2);
625    const float *py3 = (const float *)(pin + stride * y3);
626    const float *py4 = (const float *)(pin + stride * y4);
627
628    float *out = (float *)p->out;
629    uint32_t x1 = xstart;
630    uint32_t x2 = xend;
631
632    while((x1 < x2) && (x1 < 2)) {
633        OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
634        out++;
635        x1++;
636    }
637
638#if 0//defined(ARCH_ARM_HAVE_NEON)
639    if((x1 + 3) < x2) {
640        uint32_t len = (x2 - x1 - 3) >> 1;
641        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
642        out += len << 1;
643        x1 += len << 1;
644    }
645#endif
646
647    while(x1 < x2) {
648        OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
649        out++;
650        x1++;
651    }
652}
653
654RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
655            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
656            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
657
658    if (e->getType() == RS_TYPE_FLOAT_32) {
659        switch(e->getVectorSize()) {
660        case 1:
661            mRootPtr = &kernelF1;
662            break;
663        case 2:
664            mRootPtr = &kernelF2;
665            break;
666        case 3:
667        case 4:
668            mRootPtr = &kernelF4;
669            break;
670        }
671    } else {
672        switch(e->getVectorSize()) {
673        case 1:
674            mRootPtr = &kernelU1;
675            break;
676        case 2:
677            mRootPtr = &kernelU2;
678            break;
679        case 3:
680        case 4:
681            mRootPtr = &kernelU4;
682            break;
683        }
684    }
685    for(int ct=0; ct < 25; ct++) {
686        mFp[ct] = 1.f / 25.f;
687        mIp[ct] = (short)(mFp[ct] * 256.f);
688    }
689}
690
691RsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() {
692}
693
694void RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) {
695    s->mHal.info.exportedVariableCount = 2;
696}
697
698void RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() {
699    alloc.clear();
700}
701
702
703RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
704                                            const Script *s, const Element *e) {
705
706    return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
707}
708