1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17
18#include "rsCpuIntrinsic.h"
19#include "rsCpuIntrinsicInlines.h"
20
21using namespace android;
22using namespace android::renderscript;
23
24namespace android {
25namespace renderscript {
26
27
28class RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic {
29public:
30    virtual void populateScript(Script *);
31    virtual void invokeFreeChildren();
32
33    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
34    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
35
36    virtual ~RsdCpuScriptIntrinsicConvolve5x5();
37    RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
38
39protected:
40    float mFp[28];
41    short mIp[28];
42    ObjectBaseRef<Allocation> alloc;
43
44
45    static void kernelU1(const RsForEachStubParamStruct *p,
46                         uint32_t xstart, uint32_t xend,
47                         uint32_t instep, uint32_t outstep);
48    static void kernelU2(const RsForEachStubParamStruct *p,
49                         uint32_t xstart, uint32_t xend,
50                         uint32_t instep, uint32_t outstep);
51    static void kernelU4(const RsForEachStubParamStruct *p,
52                         uint32_t xstart, uint32_t xend,
53                         uint32_t instep, uint32_t outstep);
54    static void kernelF1(const RsForEachStubParamStruct *p,
55                         uint32_t xstart, uint32_t xend,
56                         uint32_t instep, uint32_t outstep);
57    static void kernelF2(const RsForEachStubParamStruct *p,
58                         uint32_t xstart, uint32_t xend,
59                         uint32_t instep, uint32_t outstep);
60    static void kernelF4(const RsForEachStubParamStruct *p,
61                         uint32_t xstart, uint32_t xend,
62                         uint32_t instep, uint32_t outstep);
63
64
65};
66
67}
68}
69
70void RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) {
71    rsAssert(slot == 1);
72    alloc.set(static_cast<Allocation *>(data));
73}
74
75void RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot,
76                                                    const void *data, size_t dataLength) {
77    rsAssert(slot == 0);
78    memcpy (&mFp, data, dataLength);
79    for(int ct=0; ct < 25; ct++) {
80        if (mFp[ct] >= 0) {
81            mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
82        } else {
83            mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f);
84        }
85    }
86}
87
88
89static void OneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
90                  const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
91                  const float* coeff) {
92
93    uint32_t x0 = rsMax((int32_t)x-2, 0);
94    uint32_t x1 = rsMax((int32_t)x-1, 0);
95    uint32_t x2 = x;
96    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
97    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
98
99    float4 px = convert_float4(py0[x0]) * coeff[0] +
100                convert_float4(py0[x1]) * coeff[1] +
101                convert_float4(py0[x2]) * coeff[2] +
102                convert_float4(py0[x3]) * coeff[3] +
103                convert_float4(py0[x4]) * coeff[4] +
104
105                convert_float4(py1[x0]) * coeff[5] +
106                convert_float4(py1[x1]) * coeff[6] +
107                convert_float4(py1[x2]) * coeff[7] +
108                convert_float4(py1[x3]) * coeff[8] +
109                convert_float4(py1[x4]) * coeff[9] +
110
111                convert_float4(py2[x0]) * coeff[10] +
112                convert_float4(py2[x1]) * coeff[11] +
113                convert_float4(py2[x2]) * coeff[12] +
114                convert_float4(py2[x3]) * coeff[13] +
115                convert_float4(py2[x4]) * coeff[14] +
116
117                convert_float4(py3[x0]) * coeff[15] +
118                convert_float4(py3[x1]) * coeff[16] +
119                convert_float4(py3[x2]) * coeff[17] +
120                convert_float4(py3[x3]) * coeff[18] +
121                convert_float4(py3[x4]) * coeff[19] +
122
123                convert_float4(py4[x0]) * coeff[20] +
124                convert_float4(py4[x1]) * coeff[21] +
125                convert_float4(py4[x2]) * coeff[22] +
126                convert_float4(py4[x3]) * coeff[23] +
127                convert_float4(py4[x4]) * coeff[24];
128    px = clamp(px, 0.f, 255.f);
129    *out = convert_uchar4(px);
130}
131
132static void OneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out,
133                  const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4,
134                  const float* coeff) {
135
136    uint32_t x0 = rsMax((int32_t)x-2, 0);
137    uint32_t x1 = rsMax((int32_t)x-1, 0);
138    uint32_t x2 = x;
139    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
140    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
141
142    float2 px = convert_float2(py0[x0]) * coeff[0] +
143                convert_float2(py0[x1]) * coeff[1] +
144                convert_float2(py0[x2]) * coeff[2] +
145                convert_float2(py0[x3]) * coeff[3] +
146                convert_float2(py0[x4]) * coeff[4] +
147
148                convert_float2(py1[x0]) * coeff[5] +
149                convert_float2(py1[x1]) * coeff[6] +
150                convert_float2(py1[x2]) * coeff[7] +
151                convert_float2(py1[x3]) * coeff[8] +
152                convert_float2(py1[x4]) * coeff[9] +
153
154                convert_float2(py2[x0]) * coeff[10] +
155                convert_float2(py2[x1]) * coeff[11] +
156                convert_float2(py2[x2]) * coeff[12] +
157                convert_float2(py2[x3]) * coeff[13] +
158                convert_float2(py2[x4]) * coeff[14] +
159
160                convert_float2(py3[x0]) * coeff[15] +
161                convert_float2(py3[x1]) * coeff[16] +
162                convert_float2(py3[x2]) * coeff[17] +
163                convert_float2(py3[x3]) * coeff[18] +
164                convert_float2(py3[x4]) * coeff[19] +
165
166                convert_float2(py4[x0]) * coeff[20] +
167                convert_float2(py4[x1]) * coeff[21] +
168                convert_float2(py4[x2]) * coeff[22] +
169                convert_float2(py4[x3]) * coeff[23] +
170                convert_float2(py4[x4]) * coeff[24];
171    px = clamp(px, 0.f, 255.f);
172    *out = convert_uchar2(px);
173}
174
175static void OneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out,
176                  const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4,
177                  const float* coeff) {
178
179    uint32_t x0 = rsMax((int32_t)x-2, 0);
180    uint32_t x1 = rsMax((int32_t)x-1, 0);
181    uint32_t x2 = x;
182    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
183    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
184
185    float px = (float)(py0[x0]) * coeff[0] +
186               (float)(py0[x1]) * coeff[1] +
187               (float)(py0[x2]) * coeff[2] +
188               (float)(py0[x3]) * coeff[3] +
189               (float)(py0[x4]) * coeff[4] +
190
191               (float)(py1[x0]) * coeff[5] +
192               (float)(py1[x1]) * coeff[6] +
193               (float)(py1[x2]) * coeff[7] +
194               (float)(py1[x3]) * coeff[8] +
195               (float)(py1[x4]) * coeff[9] +
196
197               (float)(py2[x0]) * coeff[10] +
198               (float)(py2[x1]) * coeff[11] +
199               (float)(py2[x2]) * coeff[12] +
200               (float)(py2[x3]) * coeff[13] +
201               (float)(py2[x4]) * coeff[14] +
202
203               (float)(py3[x0]) * coeff[15] +
204               (float)(py3[x1]) * coeff[16] +
205               (float)(py3[x2]) * coeff[17] +
206               (float)(py3[x3]) * coeff[18] +
207               (float)(py3[x4]) * coeff[19] +
208
209               (float)(py4[x0]) * coeff[20] +
210               (float)(py4[x1]) * coeff[21] +
211               (float)(py4[x2]) * coeff[22] +
212               (float)(py4[x3]) * coeff[23] +
213               (float)(py4[x4]) * coeff[24];
214    px = clamp(px, 0.f, 255.f);
215    *out = px;
216}
217
218static void OneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
219                  const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4,
220                  const float* coeff) {
221
222    uint32_t x0 = rsMax((int32_t)x-2, 0);
223    uint32_t x1 = rsMax((int32_t)x-1, 0);
224    uint32_t x2 = x;
225    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
226    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
227
228    float4 px = py0[x0] * coeff[0] +
229                py0[x1] * coeff[1] +
230                py0[x2] * coeff[2] +
231                py0[x3] * coeff[3] +
232                py0[x4] * coeff[4] +
233
234                py1[x0] * coeff[5] +
235                py1[x1] * coeff[6] +
236                py1[x2] * coeff[7] +
237                py1[x3] * coeff[8] +
238                py1[x4] * coeff[9] +
239
240                py2[x0] * coeff[10] +
241                py2[x1] * coeff[11] +
242                py2[x2] * coeff[12] +
243                py2[x3] * coeff[13] +
244                py2[x4] * coeff[14] +
245
246                py3[x0] * coeff[15] +
247                py3[x1] * coeff[16] +
248                py3[x2] * coeff[17] +
249                py3[x3] * coeff[18] +
250                py3[x4] * coeff[19] +
251
252                py4[x0] * coeff[20] +
253                py4[x1] * coeff[21] +
254                py4[x2] * coeff[22] +
255                py4[x3] * coeff[23] +
256                py4[x4] * coeff[24];
257    *out = px;
258}
259
260static void OneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out,
261                  const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4,
262                  const float* coeff) {
263
264    uint32_t x0 = rsMax((int32_t)x-2, 0);
265    uint32_t x1 = rsMax((int32_t)x-1, 0);
266    uint32_t x2 = x;
267    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
268    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
269
270    float2 px = py0[x0] * coeff[0] +
271                py0[x1] * coeff[1] +
272                py0[x2] * coeff[2] +
273                py0[x3] * coeff[3] +
274                py0[x4] * coeff[4] +
275
276                py1[x0] * coeff[5] +
277                py1[x1] * coeff[6] +
278                py1[x2] * coeff[7] +
279                py1[x3] * coeff[8] +
280                py1[x4] * coeff[9] +
281
282                py2[x0] * coeff[10] +
283                py2[x1] * coeff[11] +
284                py2[x2] * coeff[12] +
285                py2[x3] * coeff[13] +
286                py2[x4] * coeff[14] +
287
288                py3[x0] * coeff[15] +
289                py3[x1] * coeff[16] +
290                py3[x2] * coeff[17] +
291                py3[x3] * coeff[18] +
292                py3[x4] * coeff[19] +
293
294                py4[x0] * coeff[20] +
295                py4[x1] * coeff[21] +
296                py4[x2] * coeff[22] +
297                py4[x3] * coeff[23] +
298                py4[x4] * coeff[24];
299    *out = px;
300}
301
302static void OneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out,
303                  const float *py0, const float *py1, const float *py2, const float *py3, const float *py4,
304                  const float* coeff) {
305
306    uint32_t x0 = rsMax((int32_t)x-2, 0);
307    uint32_t x1 = rsMax((int32_t)x-1, 0);
308    uint32_t x2 = x;
309    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
310    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
311
312    float px = py0[x0] * coeff[0] +
313               py0[x1] * coeff[1] +
314               py0[x2] * coeff[2] +
315               py0[x3] * coeff[3] +
316               py0[x4] * coeff[4] +
317
318               py1[x0] * coeff[5] +
319               py1[x1] * coeff[6] +
320               py1[x2] * coeff[7] +
321               py1[x3] * coeff[8] +
322               py1[x4] * coeff[9] +
323
324               py2[x0] * coeff[10] +
325               py2[x1] * coeff[11] +
326               py2[x2] * coeff[12] +
327               py2[x3] * coeff[13] +
328               py2[x4] * coeff[14] +
329
330               py3[x0] * coeff[15] +
331               py3[x1] * coeff[16] +
332               py3[x2] * coeff[17] +
333               py3[x3] * coeff[18] +
334               py3[x4] * coeff[19] +
335
336               py4[x0] * coeff[20] +
337               py4[x1] * coeff[21] +
338               py4[x2] * coeff[22] +
339               py4[x3] * coeff[23] +
340               py4[x4] * coeff[24];
341    *out = px;
342}
343
344
// Accelerated 5x5 convolution routine with C linkage, defined outside this
// file (NOTE(review): presumably hand-written ARM assembly - confirm).
//
// dst    - destination pixels.
// y0..y4 - the five source rows, top to bottom.
// coef   - coefficients as shorts (kernelU4 passes the x256 fixed-point table).
// count  - iteration count; kernelU4 advances its output by 2 * count pixels
//          after the call.
extern "C" void rsdIntrinsicConvolve5x5_K(
        void *dst,
        const void *y0, const void *y1, const void *y2, const void *y3, const void *y4,
        const short *coef,
        uint32_t count);
348
349void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsForEachStubParamStruct *p,
350                                                uint32_t xstart, uint32_t xend,
351                                                uint32_t instep, uint32_t outstep) {
352    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
353    if (!cp->alloc.get()) {
354        ALOGE("Convolve5x5 executed without input, skipping");
355        return;
356    }
357    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
358    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
359
360    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
361    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
362    uint32_t y2 = p->y;
363    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
364    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
365
366    const uchar4 *py0 = (const uchar4 *)(pin + stride * y0);
367    const uchar4 *py1 = (const uchar4 *)(pin + stride * y1);
368    const uchar4 *py2 = (const uchar4 *)(pin + stride * y2);
369    const uchar4 *py3 = (const uchar4 *)(pin + stride * y3);
370    const uchar4 *py4 = (const uchar4 *)(pin + stride * y4);
371
372    uchar4 *out = (uchar4 *)p->out;
373    uint32_t x1 = xstart;
374    uint32_t x2 = xend;
375
376    while((x1 < x2) && (x1 < 2)) {
377        OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
378        out++;
379        x1++;
380    }
381
382#if defined(ARCH_ARM_HAVE_VFP)
383    if(gArchUseSIMD && ((x1 + 3) < x2)) {
384        uint32_t len = (x2 - x1 - 3) >> 1;
385        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->mIp, len);
386        out += len << 1;
387        x1 += len << 1;
388    }
389#endif
390
391    while(x1 < x2) {
392        OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
393        out++;
394        x1++;
395    }
396}
397
398void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsForEachStubParamStruct *p,
399                                                uint32_t xstart, uint32_t xend,
400                                                uint32_t instep, uint32_t outstep) {
401    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
402    if (!cp->alloc.get()) {
403        ALOGE("Convolve5x5 executed without input, skipping");
404        return;
405    }
406    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
407    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
408
409    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
410    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
411    uint32_t y2 = p->y;
412    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
413    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
414
415    const uchar2 *py0 = (const uchar2 *)(pin + stride * y0);
416    const uchar2 *py1 = (const uchar2 *)(pin + stride * y1);
417    const uchar2 *py2 = (const uchar2 *)(pin + stride * y2);
418    const uchar2 *py3 = (const uchar2 *)(pin + stride * y3);
419    const uchar2 *py4 = (const uchar2 *)(pin + stride * y4);
420
421    uchar2 *out = (uchar2 *)p->out;
422    uint32_t x1 = xstart;
423    uint32_t x2 = xend;
424
425    while((x1 < x2) && (x1 < 2)) {
426        OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
427        out++;
428        x1++;
429    }
430
431#if 0//defined(ARCH_ARM_HAVE_NEON)
432    if((x1 + 3) < x2) {
433        uint32_t len = (x2 - x1 - 3) >> 1;
434        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
435        out += len << 1;
436        x1 += len << 1;
437    }
438#endif
439
440    while(x1 < x2) {
441        OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
442        out++;
443        x1++;
444    }
445}
446
447void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsForEachStubParamStruct *p,
448                                                uint32_t xstart, uint32_t xend,
449                                                uint32_t instep, uint32_t outstep) {
450    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
451    if (!cp->alloc.get()) {
452        ALOGE("Convolve5x5 executed without input, skipping");
453        return;
454    }
455    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
456    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
457
458    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
459    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
460    uint32_t y2 = p->y;
461    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
462    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
463
464    const uchar *py0 = (const uchar *)(pin + stride * y0);
465    const uchar *py1 = (const uchar *)(pin + stride * y1);
466    const uchar *py2 = (const uchar *)(pin + stride * y2);
467    const uchar *py3 = (const uchar *)(pin + stride * y3);
468    const uchar *py4 = (const uchar *)(pin + stride * y4);
469
470    uchar *out = (uchar *)p->out;
471    uint32_t x1 = xstart;
472    uint32_t x2 = xend;
473
474    while((x1 < x2) && (x1 < 2)) {
475        OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
476        out++;
477        x1++;
478    }
479
480#if 0//defined(ARCH_ARM_HAVE_NEON)
481    if((x1 + 3) < x2) {
482        uint32_t len = (x2 - x1 - 3) >> 1;
483        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
484        out += len << 1;
485        x1 += len << 1;
486    }
487#endif
488
489    while(x1 < x2) {
490        OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
491        out++;
492        x1++;
493    }
494}
495
496void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsForEachStubParamStruct *p,
497                                                uint32_t xstart, uint32_t xend,
498                                                uint32_t instep, uint32_t outstep) {
499    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
500    if (!cp->alloc.get()) {
501        ALOGE("Convolve5x5 executed without input, skipping");
502        return;
503    }
504    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
505    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
506
507    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
508    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
509    uint32_t y2 = p->y;
510    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
511    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
512
513    const float4 *py0 = (const float4 *)(pin + stride * y0);
514    const float4 *py1 = (const float4 *)(pin + stride * y1);
515    const float4 *py2 = (const float4 *)(pin + stride * y2);
516    const float4 *py3 = (const float4 *)(pin + stride * y3);
517    const float4 *py4 = (const float4 *)(pin + stride * y4);
518
519    float4 *out = (float4 *)p->out;
520    uint32_t x1 = xstart;
521    uint32_t x2 = xend;
522
523    while((x1 < x2) && (x1 < 2)) {
524        OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
525        out++;
526        x1++;
527    }
528
529#if 0//defined(ARCH_ARM_HAVE_NEON)
530    if((x1 + 3) < x2) {
531        uint32_t len = (x2 - x1 - 3) >> 1;
532        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
533        out += len << 1;
534        x1 += len << 1;
535    }
536#endif
537
538    while(x1 < x2) {
539        OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
540        out++;
541        x1++;
542    }
543}
544
545void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsForEachStubParamStruct *p,
546                                                uint32_t xstart, uint32_t xend,
547                                                uint32_t instep, uint32_t outstep) {
548    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
549    if (!cp->alloc.get()) {
550        ALOGE("Convolve5x5 executed without input, skipping");
551        return;
552    }
553    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
554    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
555
556    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
557    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
558    uint32_t y2 = p->y;
559    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
560    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
561
562    const float2 *py0 = (const float2 *)(pin + stride * y0);
563    const float2 *py1 = (const float2 *)(pin + stride * y1);
564    const float2 *py2 = (const float2 *)(pin + stride * y2);
565    const float2 *py3 = (const float2 *)(pin + stride * y3);
566    const float2 *py4 = (const float2 *)(pin + stride * y4);
567
568    float2 *out = (float2 *)p->out;
569    uint32_t x1 = xstart;
570    uint32_t x2 = xend;
571
572    while((x1 < x2) && (x1 < 2)) {
573        OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
574        out++;
575        x1++;
576    }
577
578#if 0//defined(ARCH_ARM_HAVE_NEON)
579    if((x1 + 3) < x2) {
580        uint32_t len = (x2 - x1 - 3) >> 1;
581        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
582        out += len << 1;
583        x1 += len << 1;
584    }
585#endif
586
587    while(x1 < x2) {
588        OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
589        out++;
590        x1++;
591    }
592}
593
594void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsForEachStubParamStruct *p,
595                                                uint32_t xstart, uint32_t xend,
596                                                uint32_t instep, uint32_t outstep) {
597    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
598    if (!cp->alloc.get()) {
599        ALOGE("Convolve5x5 executed without input, skipping");
600        return;
601    }
602    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
603    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
604
605    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
606    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
607    uint32_t y2 = p->y;
608    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
609    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
610
611    const float *py0 = (const float *)(pin + stride * y0);
612    const float *py1 = (const float *)(pin + stride * y1);
613    const float *py2 = (const float *)(pin + stride * y2);
614    const float *py3 = (const float *)(pin + stride * y3);
615    const float *py4 = (const float *)(pin + stride * y4);
616
617    float *out = (float *)p->out;
618    uint32_t x1 = xstart;
619    uint32_t x2 = xend;
620
621    while((x1 < x2) && (x1 < 2)) {
622        OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
623        out++;
624        x1++;
625    }
626
627#if 0//defined(ARCH_ARM_HAVE_NEON)
628    if((x1 + 3) < x2) {
629        uint32_t len = (x2 - x1 - 3) >> 1;
630        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
631        out += len << 1;
632        x1 += len << 1;
633    }
634#endif
635
636    while(x1 < x2) {
637        OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
638        out++;
639        x1++;
640    }
641}
642
643RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
644            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
645            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
646
647    if (e->getType() == RS_TYPE_FLOAT_32) {
648        switch(e->getVectorSize()) {
649        case 1:
650            mRootPtr = &kernelF1;
651            break;
652        case 2:
653            mRootPtr = &kernelF2;
654            break;
655        case 3:
656        case 4:
657            mRootPtr = &kernelF4;
658            break;
659        }
660    } else {
661        switch(e->getVectorSize()) {
662        case 1:
663            mRootPtr = &kernelU1;
664            break;
665        case 2:
666            mRootPtr = &kernelU2;
667            break;
668        case 3:
669        case 4:
670            mRootPtr = &kernelU4;
671            break;
672        }
673    }
674    for(int ct=0; ct < 25; ct++) {
675        mFp[ct] = 1.f / 25.f;
676        mIp[ct] = (short)(mFp[ct] * 256.f);
677    }
678}
679
680RsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() {
681}
682
683void RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) {
684    s->mHal.info.exportedVariableCount = 2;
685}
686
687void RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() {
688    alloc.clear();
689}
690
691
692RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
693                                            const Script *s, const Element *e) {
694
695    return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
696}
697
698
699
700