1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17
18#include "rsCpuIntrinsic.h"
19#include "rsCpuIntrinsicInlines.h"
20
21namespace android {
22namespace renderscript {
23
24
25class RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic {
26public:
27    void populateScript(Script *) override;
28    void invokeFreeChildren() override;
29
30    void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
31    void setGlobalObj(uint32_t slot, ObjectBase *data) override;
32
33    ~RsdCpuScriptIntrinsicConvolve5x5() override;
34    RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
35
36protected:
37    float mFp[28];
38    short mIp[28];
39    ObjectBaseRef<Allocation> alloc;
40
41
42    static void kernelU1(const RsExpandKernelDriverInfo *info,
43                         uint32_t xstart, uint32_t xend,
44                         uint32_t outstep);
45    static void kernelU2(const RsExpandKernelDriverInfo *info,
46                         uint32_t xstart, uint32_t xend,
47                         uint32_t outstep);
48    static void kernelU4(const RsExpandKernelDriverInfo *info,
49                         uint32_t xstart, uint32_t xend,
50                         uint32_t outstep);
51    static void kernelF1(const RsExpandKernelDriverInfo *info,
52                         uint32_t xstart, uint32_t xend,
53                         uint32_t outstep);
54    static void kernelF2(const RsExpandKernelDriverInfo *info,
55                         uint32_t xstart, uint32_t xend,
56                         uint32_t outstep);
57    static void kernelF4(const RsExpandKernelDriverInfo *info,
58                         uint32_t xstart, uint32_t xend,
59                         uint32_t outstep);
60
61
62};
63
64void RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) {
65    rsAssert(slot == 1);
66    alloc.set(static_cast<Allocation *>(data));
67}
68
69void RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot,
70                                                    const void *data, size_t dataLength) {
71    rsAssert(slot == 0);
72    memcpy (&mFp, data, dataLength);
73    for(int ct=0; ct < 25; ct++) {
74        if (mFp[ct] >= 0) {
75            mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
76        } else {
77            mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f);
78        }
79    }
80}
81
82
83static void OneU4(const RsExpandKernelDriverInfo *info, uint32_t x, uchar4 *out,
84                  const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
85                  const float* coeff) {
86
87    uint32_t x0 = rsMax((int32_t)x-2, 0);
88    uint32_t x1 = rsMax((int32_t)x-1, 0);
89    uint32_t x2 = x;
90    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
91    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
92
93    float4 px = convert_float4(py0[x0]) * coeff[0] +
94                convert_float4(py0[x1]) * coeff[1] +
95                convert_float4(py0[x2]) * coeff[2] +
96                convert_float4(py0[x3]) * coeff[3] +
97                convert_float4(py0[x4]) * coeff[4] +
98
99                convert_float4(py1[x0]) * coeff[5] +
100                convert_float4(py1[x1]) * coeff[6] +
101                convert_float4(py1[x2]) * coeff[7] +
102                convert_float4(py1[x3]) * coeff[8] +
103                convert_float4(py1[x4]) * coeff[9] +
104
105                convert_float4(py2[x0]) * coeff[10] +
106                convert_float4(py2[x1]) * coeff[11] +
107                convert_float4(py2[x2]) * coeff[12] +
108                convert_float4(py2[x3]) * coeff[13] +
109                convert_float4(py2[x4]) * coeff[14] +
110
111                convert_float4(py3[x0]) * coeff[15] +
112                convert_float4(py3[x1]) * coeff[16] +
113                convert_float4(py3[x2]) * coeff[17] +
114                convert_float4(py3[x3]) * coeff[18] +
115                convert_float4(py3[x4]) * coeff[19] +
116
117                convert_float4(py4[x0]) * coeff[20] +
118                convert_float4(py4[x1]) * coeff[21] +
119                convert_float4(py4[x2]) * coeff[22] +
120                convert_float4(py4[x3]) * coeff[23] +
121                convert_float4(py4[x4]) * coeff[24];
122    px = clamp(px + 0.5f, 0.f, 255.f);
123    *out = convert_uchar4(px);
124}
125
126static void OneU2(const RsExpandKernelDriverInfo *info, uint32_t x, uchar2 *out,
127                  const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4,
128                  const float* coeff) {
129
130    uint32_t x0 = rsMax((int32_t)x-2, 0);
131    uint32_t x1 = rsMax((int32_t)x-1, 0);
132    uint32_t x2 = x;
133    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
134    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
135
136    float2 px = convert_float2(py0[x0]) * coeff[0] +
137                convert_float2(py0[x1]) * coeff[1] +
138                convert_float2(py0[x2]) * coeff[2] +
139                convert_float2(py0[x3]) * coeff[3] +
140                convert_float2(py0[x4]) * coeff[4] +
141
142                convert_float2(py1[x0]) * coeff[5] +
143                convert_float2(py1[x1]) * coeff[6] +
144                convert_float2(py1[x2]) * coeff[7] +
145                convert_float2(py1[x3]) * coeff[8] +
146                convert_float2(py1[x4]) * coeff[9] +
147
148                convert_float2(py2[x0]) * coeff[10] +
149                convert_float2(py2[x1]) * coeff[11] +
150                convert_float2(py2[x2]) * coeff[12] +
151                convert_float2(py2[x3]) * coeff[13] +
152                convert_float2(py2[x4]) * coeff[14] +
153
154                convert_float2(py3[x0]) * coeff[15] +
155                convert_float2(py3[x1]) * coeff[16] +
156                convert_float2(py3[x2]) * coeff[17] +
157                convert_float2(py3[x3]) * coeff[18] +
158                convert_float2(py3[x4]) * coeff[19] +
159
160                convert_float2(py4[x0]) * coeff[20] +
161                convert_float2(py4[x1]) * coeff[21] +
162                convert_float2(py4[x2]) * coeff[22] +
163                convert_float2(py4[x3]) * coeff[23] +
164                convert_float2(py4[x4]) * coeff[24];
165    px = clamp(px + 0.5f, 0.f, 255.f);
166    *out = convert_uchar2(px);
167}
168
169static void OneU1(const RsExpandKernelDriverInfo *info, uint32_t x, uchar *out,
170                  const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4,
171                  const float* coeff) {
172
173    uint32_t x0 = rsMax((int32_t)x-2, 0);
174    uint32_t x1 = rsMax((int32_t)x-1, 0);
175    uint32_t x2 = x;
176    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
177    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
178
179    float px = (float)(py0[x0]) * coeff[0] +
180               (float)(py0[x1]) * coeff[1] +
181               (float)(py0[x2]) * coeff[2] +
182               (float)(py0[x3]) * coeff[3] +
183               (float)(py0[x4]) * coeff[4] +
184
185               (float)(py1[x0]) * coeff[5] +
186               (float)(py1[x1]) * coeff[6] +
187               (float)(py1[x2]) * coeff[7] +
188               (float)(py1[x3]) * coeff[8] +
189               (float)(py1[x4]) * coeff[9] +
190
191               (float)(py2[x0]) * coeff[10] +
192               (float)(py2[x1]) * coeff[11] +
193               (float)(py2[x2]) * coeff[12] +
194               (float)(py2[x3]) * coeff[13] +
195               (float)(py2[x4]) * coeff[14] +
196
197               (float)(py3[x0]) * coeff[15] +
198               (float)(py3[x1]) * coeff[16] +
199               (float)(py3[x2]) * coeff[17] +
200               (float)(py3[x3]) * coeff[18] +
201               (float)(py3[x4]) * coeff[19] +
202
203               (float)(py4[x0]) * coeff[20] +
204               (float)(py4[x1]) * coeff[21] +
205               (float)(py4[x2]) * coeff[22] +
206               (float)(py4[x3]) * coeff[23] +
207               (float)(py4[x4]) * coeff[24];
208    px = clamp(px + 0.5f, 0.f, 255.f);
209    *out = px;
210}
211
212static void OneF4(const RsExpandKernelDriverInfo *info, uint32_t x, float4 *out,
213                  const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4,
214                  const float* coeff) {
215
216    uint32_t x0 = rsMax((int32_t)x-2, 0);
217    uint32_t x1 = rsMax((int32_t)x-1, 0);
218    uint32_t x2 = x;
219    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
220    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
221
222    float4 px = py0[x0] * coeff[0] +
223                py0[x1] * coeff[1] +
224                py0[x2] * coeff[2] +
225                py0[x3] * coeff[3] +
226                py0[x4] * coeff[4] +
227
228                py1[x0] * coeff[5] +
229                py1[x1] * coeff[6] +
230                py1[x2] * coeff[7] +
231                py1[x3] * coeff[8] +
232                py1[x4] * coeff[9] +
233
234                py2[x0] * coeff[10] +
235                py2[x1] * coeff[11] +
236                py2[x2] * coeff[12] +
237                py2[x3] * coeff[13] +
238                py2[x4] * coeff[14] +
239
240                py3[x0] * coeff[15] +
241                py3[x1] * coeff[16] +
242                py3[x2] * coeff[17] +
243                py3[x3] * coeff[18] +
244                py3[x4] * coeff[19] +
245
246                py4[x0] * coeff[20] +
247                py4[x1] * coeff[21] +
248                py4[x2] * coeff[22] +
249                py4[x3] * coeff[23] +
250                py4[x4] * coeff[24];
251    *out = px;
252}
253
254static void OneF2(const RsExpandKernelDriverInfo *info, uint32_t x, float2 *out,
255                  const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4,
256                  const float* coeff) {
257
258    uint32_t x0 = rsMax((int32_t)x-2, 0);
259    uint32_t x1 = rsMax((int32_t)x-1, 0);
260    uint32_t x2 = x;
261    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
262    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
263
264    float2 px = py0[x0] * coeff[0] +
265                py0[x1] * coeff[1] +
266                py0[x2] * coeff[2] +
267                py0[x3] * coeff[3] +
268                py0[x4] * coeff[4] +
269
270                py1[x0] * coeff[5] +
271                py1[x1] * coeff[6] +
272                py1[x2] * coeff[7] +
273                py1[x3] * coeff[8] +
274                py1[x4] * coeff[9] +
275
276                py2[x0] * coeff[10] +
277                py2[x1] * coeff[11] +
278                py2[x2] * coeff[12] +
279                py2[x3] * coeff[13] +
280                py2[x4] * coeff[14] +
281
282                py3[x0] * coeff[15] +
283                py3[x1] * coeff[16] +
284                py3[x2] * coeff[17] +
285                py3[x3] * coeff[18] +
286                py3[x4] * coeff[19] +
287
288                py4[x0] * coeff[20] +
289                py4[x1] * coeff[21] +
290                py4[x2] * coeff[22] +
291                py4[x3] * coeff[23] +
292                py4[x4] * coeff[24];
293    *out = px;
294}
295
296static void OneF1(const RsExpandKernelDriverInfo *info, uint32_t x, float *out,
297                  const float *py0, const float *py1, const float *py2, const float *py3, const float *py4,
298                  const float* coeff) {
299
300    uint32_t x0 = rsMax((int32_t)x-2, 0);
301    uint32_t x1 = rsMax((int32_t)x-1, 0);
302    uint32_t x2 = x;
303    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
304    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
305
306    float px = py0[x0] * coeff[0] +
307               py0[x1] * coeff[1] +
308               py0[x2] * coeff[2] +
309               py0[x3] * coeff[3] +
310               py0[x4] * coeff[4] +
311
312               py1[x0] * coeff[5] +
313               py1[x1] * coeff[6] +
314               py1[x2] * coeff[7] +
315               py1[x3] * coeff[8] +
316               py1[x4] * coeff[9] +
317
318               py2[x0] * coeff[10] +
319               py2[x1] * coeff[11] +
320               py2[x2] * coeff[12] +
321               py2[x3] * coeff[13] +
322               py2[x4] * coeff[14] +
323
324               py3[x0] * coeff[15] +
325               py3[x1] * coeff[16] +
326               py3[x2] * coeff[17] +
327               py3[x3] * coeff[18] +
328               py3[x4] * coeff[19] +
329
330               py4[x0] * coeff[20] +
331               py4[x1] * coeff[21] +
332               py4[x2] * coeff[22] +
333               py4[x3] * coeff[23] +
334               py4[x4] * coeff[24];
335    *out = px;
336}
337
338
// Externally defined (C linkage) SIMD routine. kernelU4 calls it with the
// destination pointer, five uchar4 row pointers already offset to column
// x - 2, the short coefficient table (mIp) and an iteration count.
extern "C" void rsdIntrinsicConvolve5x5_K(void *dst,
                                          const void *y0,
                                          const void *y1,
                                          const void *y2,
                                          const void *y3,
                                          const void *y4,
                                          const short *coef,
                                          uint32_t count);
342
343void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsExpandKernelDriverInfo *info,
344                                                uint32_t xstart, uint32_t xend,
345                                                uint32_t outstep) {
346    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
347    if (!cp->alloc.get()) {
348        ALOGE("Convolve5x5 executed without input, skipping");
349        return;
350    }
351    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
352    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
353
354    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
355    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
356    uint32_t y2 = info->current.y;
357    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
358    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
359
360    const uchar4 *py0 = (const uchar4 *)(pin + stride * y0);
361    const uchar4 *py1 = (const uchar4 *)(pin + stride * y1);
362    const uchar4 *py2 = (const uchar4 *)(pin + stride * y2);
363    const uchar4 *py3 = (const uchar4 *)(pin + stride * y3);
364    const uchar4 *py4 = (const uchar4 *)(pin + stride * y4);
365
366    uchar4 *out = (uchar4 *)info->outPtr[0];
367    uint32_t x1 = xstart;
368    uint32_t x2 = xend;
369
370    while((x1 < x2) && (x1 < 2)) {
371        OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
372        out++;
373        x1++;
374    }
375#if defined(ARCH_X86_HAVE_SSSE3)
376    // for x86 SIMD, require minimum of 7 elements (4 for SIMD,
377    // 3 for end boundary where x may hit the end boundary)
378    if (gArchUseSIMD &&((x1 + 6) < x2)) {
379        // subtract 3 for end boundary
380        uint32_t len = (x2 - x1 - 3) >> 2;
381        rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
382        out += len << 2;
383        x1 += len << 2;
384    }
385#endif
386
387#if defined(ARCH_ARM_USE_INTRINSICS)
388    if(gArchUseSIMD && ((x1 + 3) < x2)) {
389        uint32_t len = (x2 - x1 - 3) >> 1;
390        rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
391        out += len << 1;
392        x1 += len << 1;
393    }
394#endif
395
396    while(x1 < x2) {
397        OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
398        out++;
399        x1++;
400    }
401}
402
403void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsExpandKernelDriverInfo *info,
404                                                uint32_t xstart, uint32_t xend,
405                                                uint32_t outstep) {
406    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
407    if (!cp->alloc.get()) {
408        ALOGE("Convolve5x5 executed without input, skipping");
409        return;
410    }
411    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
412    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
413
414    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
415    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
416    uint32_t y2 = info->current.y;
417    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
418    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
419
420    const uchar2 *py0 = (const uchar2 *)(pin + stride * y0);
421    const uchar2 *py1 = (const uchar2 *)(pin + stride * y1);
422    const uchar2 *py2 = (const uchar2 *)(pin + stride * y2);
423    const uchar2 *py3 = (const uchar2 *)(pin + stride * y3);
424    const uchar2 *py4 = (const uchar2 *)(pin + stride * y4);
425
426    uchar2 *out = (uchar2 *)info->outPtr[0];
427    uint32_t x1 = xstart;
428    uint32_t x2 = xend;
429
430    while((x1 < x2) && (x1 < 2)) {
431        OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
432        out++;
433        x1++;
434    }
435
436#if 0//defined(ARCH_ARM_HAVE_NEON)
437    if((x1 + 3) < x2) {
438        uint32_t len = (x2 - x1 - 3) >> 1;
439        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
440        out += len << 1;
441        x1 += len << 1;
442    }
443#endif
444
445    while(x1 < x2) {
446        OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
447        out++;
448        x1++;
449    }
450}
451
452void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsExpandKernelDriverInfo *info,
453                                                uint32_t xstart, uint32_t xend,
454                                                uint32_t outstep) {
455    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
456    if (!cp->alloc.get()) {
457        ALOGE("Convolve5x5 executed without input, skipping");
458        return;
459    }
460    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
461    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
462
463    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
464    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
465    uint32_t y2 = info->current.y;
466    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
467    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
468
469    const uchar *py0 = (const uchar *)(pin + stride * y0);
470    const uchar *py1 = (const uchar *)(pin + stride * y1);
471    const uchar *py2 = (const uchar *)(pin + stride * y2);
472    const uchar *py3 = (const uchar *)(pin + stride * y3);
473    const uchar *py4 = (const uchar *)(pin + stride * y4);
474
475    uchar *out = (uchar *)info->outPtr[0];
476    uint32_t x1 = xstart;
477    uint32_t x2 = xend;
478
479    while((x1 < x2) && (x1 < 2)) {
480        OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
481        out++;
482        x1++;
483    }
484
485#if 0//defined(ARCH_ARM_HAVE_NEON)
486    if((x1 + 3) < x2) {
487        uint32_t len = (x2 - x1 - 3) >> 1;
488        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
489        out += len << 1;
490        x1 += len << 1;
491    }
492#endif
493
494    while(x1 < x2) {
495        OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
496        out++;
497        x1++;
498    }
499}
500
501void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsExpandKernelDriverInfo *info,
502                                                uint32_t xstart, uint32_t xend,
503                                                uint32_t outstep) {
504    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
505    if (!cp->alloc.get()) {
506        ALOGE("Convolve5x5 executed without input, skipping");
507        return;
508    }
509    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
510    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
511
512    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
513    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
514    uint32_t y2 = info->current.y;
515    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
516    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
517
518    const float4 *py0 = (const float4 *)(pin + stride * y0);
519    const float4 *py1 = (const float4 *)(pin + stride * y1);
520    const float4 *py2 = (const float4 *)(pin + stride * y2);
521    const float4 *py3 = (const float4 *)(pin + stride * y3);
522    const float4 *py4 = (const float4 *)(pin + stride * y4);
523
524    float4 *out = (float4 *)info->outPtr[0];
525    uint32_t x1 = xstart;
526    uint32_t x2 = xend;
527
528    while((x1 < x2) && (x1 < 2)) {
529        OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
530        out++;
531        x1++;
532    }
533
534#if 0//defined(ARCH_ARM_HAVE_NEON)
535    if((x1 + 3) < x2) {
536        uint32_t len = (x2 - x1 - 3) >> 1;
537        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
538        out += len << 1;
539        x1 += len << 1;
540    }
541#endif
542
543    while(x1 < x2) {
544        OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
545        out++;
546        x1++;
547    }
548}
549
550void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsExpandKernelDriverInfo *info,
551                                                uint32_t xstart, uint32_t xend,
552                                                uint32_t outstep) {
553    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
554    if (!cp->alloc.get()) {
555        ALOGE("Convolve5x5 executed without input, skipping");
556        return;
557    }
558    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
559    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
560
561    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
562    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
563    uint32_t y2 = info->current.y;
564    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
565    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
566
567    const float2 *py0 = (const float2 *)(pin + stride * y0);
568    const float2 *py1 = (const float2 *)(pin + stride * y1);
569    const float2 *py2 = (const float2 *)(pin + stride * y2);
570    const float2 *py3 = (const float2 *)(pin + stride * y3);
571    const float2 *py4 = (const float2 *)(pin + stride * y4);
572
573    float2 *out = (float2 *)info->outPtr[0];
574    uint32_t x1 = xstart;
575    uint32_t x2 = xend;
576
577    while((x1 < x2) && (x1 < 2)) {
578        OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
579        out++;
580        x1++;
581    }
582
583#if 0//defined(ARCH_ARM_HAVE_NEON)
584    if((x1 + 3) < x2) {
585        uint32_t len = (x2 - x1 - 3) >> 1;
586        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
587        out += len << 1;
588        x1 += len << 1;
589    }
590#endif
591
592    while(x1 < x2) {
593        OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
594        out++;
595        x1++;
596    }
597}
598
599void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsExpandKernelDriverInfo *info,
600                                                uint32_t xstart, uint32_t xend,
601                                                uint32_t outstep) {
602    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
603    if (!cp->alloc.get()) {
604        ALOGE("Convolve5x5 executed without input, skipping");
605        return;
606    }
607    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
608    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
609
610    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
611    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
612    uint32_t y2 = info->current.y;
613    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
614    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
615
616    const float *py0 = (const float *)(pin + stride * y0);
617    const float *py1 = (const float *)(pin + stride * y1);
618    const float *py2 = (const float *)(pin + stride * y2);
619    const float *py3 = (const float *)(pin + stride * y3);
620    const float *py4 = (const float *)(pin + stride * y4);
621
622    float *out = (float *)info->outPtr[0];
623    uint32_t x1 = xstart;
624    uint32_t x2 = xend;
625
626    while((x1 < x2) && (x1 < 2)) {
627        OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
628        out++;
629        x1++;
630    }
631
632#if 0//defined(ARCH_ARM_HAVE_NEON)
633    if((x1 + 3) < x2) {
634        uint32_t len = (x2 - x1 - 3) >> 1;
635        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
636        out += len << 1;
637        x1 += len << 1;
638    }
639#endif
640
641    while(x1 < x2) {
642        OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
643        out++;
644        x1++;
645    }
646}
647
648RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
649            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
650            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
651
652    if (e->getType() == RS_TYPE_FLOAT_32) {
653        switch(e->getVectorSize()) {
654        case 1:
655            mRootPtr = &kernelF1;
656            break;
657        case 2:
658            mRootPtr = &kernelF2;
659            break;
660        case 3:
661        case 4:
662            mRootPtr = &kernelF4;
663            break;
664        }
665    } else {
666        switch(e->getVectorSize()) {
667        case 1:
668            mRootPtr = &kernelU1;
669            break;
670        case 2:
671            mRootPtr = &kernelU2;
672            break;
673        case 3:
674        case 4:
675            mRootPtr = &kernelU4;
676            break;
677        }
678    }
679    for(int ct=0; ct < 25; ct++) {
680        mFp[ct] = 1.f / 25.f;
681        mIp[ct] = (short)(mFp[ct] * 256.f);
682    }
683}
684
685RsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() {
686}
687
688void RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) {
689    s->mHal.info.exportedVariableCount = 2;
690}
691
692void RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() {
693    alloc.clear();
694}
695
696RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
697                                            const Script *s, const Element *e) {
698
699    return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
700}
701
702} // namespace renderscript
703} // namespace android
704