1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17
18#include "rsCpuIntrinsic.h"
19#include "rsCpuIntrinsicInlines.h"
20
21using namespace android;
22using namespace android::renderscript;
23
24namespace android {
25namespace renderscript {
26
27
28class RsdCpuScriptIntrinsicResize : public RsdCpuScriptIntrinsic {
29public:
30    void populateScript(Script *) override;
31    void invokeFreeChildren() override;
32
33    void setGlobalObj(uint32_t slot, ObjectBase *data) override;
34
35    ~RsdCpuScriptIntrinsicResize() override;
36    RsdCpuScriptIntrinsicResize(RsdCpuReferenceImpl *ctx, const Script *s, const Element *);
37
38    void preLaunch(uint32_t slot, const Allocation ** ains,
39                   uint32_t inLen, Allocation * aout, const void * usr,
40                   uint32_t usrLen, const RsScriptCall *sc) override;
41
42    float scaleX;
43    float scaleY;
44
45protected:
46    ObjectBaseRef<const Allocation> mAlloc;
47    ObjectBaseRef<const Element> mElement;
48
49    static void kernelU1(const RsExpandKernelDriverInfo *info,
50                         uint32_t xstart, uint32_t xend,
51                         uint32_t outstep);
52    static void kernelU2(const RsExpandKernelDriverInfo *info,
53                         uint32_t xstart, uint32_t xend,
54                         uint32_t outstep);
55    static void kernelU4(const RsExpandKernelDriverInfo *info,
56                         uint32_t xstart, uint32_t xend,
57                         uint32_t outstep);
58    static void kernelF1(const RsExpandKernelDriverInfo *info,
59                         uint32_t xstart, uint32_t xend,
60                         uint32_t outstep);
61    static void kernelF2(const RsExpandKernelDriverInfo *info,
62                         uint32_t xstart, uint32_t xend,
63                         uint32_t outstep);
64    static void kernelF4(const RsExpandKernelDriverInfo *info,
65                         uint32_t xstart, uint32_t xend,
66                         uint32_t outstep);
67};
68
69}
70}
71
72
73void RsdCpuScriptIntrinsicResize::setGlobalObj(uint32_t slot, ObjectBase *data) {
74    rsAssert(slot == 0);
75    mAlloc.set(static_cast<Allocation *>(data));
76}
77
78static float4 cubicInterpolate(float4 p0,float4 p1,float4 p2,float4 p3, float x) {
79    return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
80            + x * (3.f * (p1 - p2) + p3 - p0)));
81}
82
83static float2 cubicInterpolate(float2 p0,float2 p1,float2 p2,float2 p3, float x) {
84    return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
85            + x * (3.f * (p1 - p2) + p3 - p0)));
86}
87
// Scalar variant: cubic interpolation between p1 and p2 (p0 and p3 are the
// outer neighbours) at fractional position x, in nested (Horner) form.
// Returns p1 at x == 0 and p2 at x == 1.  The operation order matches the
// single-expression form exactly.
static float cubicInterpolate(float p0,float p1,float p2,float p3 , float x) {
    const float cubicTerm = 3.f * (p1 - p2) + p3 - p0;
    const float quadTerm = 2.f * p0 - 5.f * p1 + 4.f * p2 - p3 + x * cubicTerm;
    const float linTerm = p2 - p0 + x * quadTerm;
    return p1 + 0.5f * x * linTerm;
}
92
93static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2, const uchar4 *yp3,
94                         float xf, float yf, int width) {
95    int startx = (int) floor(xf - 1);
96    xf = xf - floor(xf);
97    int maxx = width - 1;
98    int xs0 = rsMax(0, startx + 0);
99    int xs1 = rsMax(0, startx + 1);
100    int xs2 = rsMin(maxx, startx + 2);
101    int xs3 = rsMin(maxx, startx + 3);
102
103    float4 p0  = cubicInterpolate(convert_float4(yp0[xs0]),
104                                  convert_float4(yp0[xs1]),
105                                  convert_float4(yp0[xs2]),
106                                  convert_float4(yp0[xs3]), xf);
107
108    float4 p1  = cubicInterpolate(convert_float4(yp1[xs0]),
109                                  convert_float4(yp1[xs1]),
110                                  convert_float4(yp1[xs2]),
111                                  convert_float4(yp1[xs3]), xf);
112
113    float4 p2  = cubicInterpolate(convert_float4(yp2[xs0]),
114                                  convert_float4(yp2[xs1]),
115                                  convert_float4(yp2[xs2]),
116                                  convert_float4(yp2[xs3]), xf);
117
118    float4 p3  = cubicInterpolate(convert_float4(yp3[xs0]),
119                                  convert_float4(yp3[xs1]),
120                                  convert_float4(yp3[xs2]),
121                                  convert_float4(yp3[xs3]), xf);
122
123    float4 p  = cubicInterpolate(p0, p1, p2, p3, yf);
124    p = clamp(p + 0.5f, 0.f, 255.f);
125    return convert_uchar4(p);
126}
127
128static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2, const uchar2 *yp3,
129                         float xf, float yf, int width) {
130    int startx = (int) floor(xf - 1);
131    xf = xf - floor(xf);
132    int maxx = width - 1;
133    int xs0 = rsMax(0, startx + 0);
134    int xs1 = rsMax(0, startx + 1);
135    int xs2 = rsMin(maxx, startx + 2);
136    int xs3 = rsMin(maxx, startx + 3);
137
138    float2 p0  = cubicInterpolate(convert_float2(yp0[xs0]),
139                                  convert_float2(yp0[xs1]),
140                                  convert_float2(yp0[xs2]),
141                                  convert_float2(yp0[xs3]), xf);
142
143    float2 p1  = cubicInterpolate(convert_float2(yp1[xs0]),
144                                  convert_float2(yp1[xs1]),
145                                  convert_float2(yp1[xs2]),
146                                  convert_float2(yp1[xs3]), xf);
147
148    float2 p2  = cubicInterpolate(convert_float2(yp2[xs0]),
149                                  convert_float2(yp2[xs1]),
150                                  convert_float2(yp2[xs2]),
151                                  convert_float2(yp2[xs3]), xf);
152
153    float2 p3  = cubicInterpolate(convert_float2(yp3[xs0]),
154                                  convert_float2(yp3[xs1]),
155                                  convert_float2(yp3[xs2]),
156                                  convert_float2(yp3[xs3]), xf);
157
158    float2 p  = cubicInterpolate(p0, p1, p2, p3, yf);
159    p = clamp(p + 0.5f, 0.f, 255.f);
160    return convert_uchar2(p);
161}
162
163static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, const uchar *yp3,
164                        float xf, float yf, int width) {
165    int startx = (int) floor(xf - 1);
166    xf = xf - floor(xf);
167    int maxx = width - 1;
168    int xs0 = rsMax(0, startx + 0);
169    int xs1 = rsMax(0, startx + 1);
170    int xs2 = rsMin(maxx, startx + 2);
171    int xs3 = rsMin(maxx, startx + 3);
172
173    float p0  = cubicInterpolate((float)yp0[xs0], (float)yp0[xs1],
174                                 (float)yp0[xs2], (float)yp0[xs3], xf);
175    float p1  = cubicInterpolate((float)yp1[xs0], (float)yp1[xs1],
176                                 (float)yp1[xs2], (float)yp1[xs3], xf);
177    float p2  = cubicInterpolate((float)yp2[xs0], (float)yp2[xs1],
178                                 (float)yp2[xs2], (float)yp2[xs3], xf);
179    float p3  = cubicInterpolate((float)yp3[xs0], (float)yp3[xs1],
180                                 (float)yp3[xs2], (float)yp3[xs3], xf);
181
182    float p  = cubicInterpolate(p0, p1, p2, p3, yf);
183    p = clamp(p + 0.5f, 0.f, 255.f);
184    return (uchar)p;
185}
186
187extern "C" uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc);
188
189extern "C" void rsdIntrinsicResizeB4_K(
190            uchar4 *dst,
191            size_t count,
192            uint32_t xf,
193            uint32_t xinc,
194            uchar4 const *srcn,
195            uchar4 const *src0,
196            uchar4 const *src1,
197            uchar4 const *src2,
198            size_t xclip,
199            size_t avail,
200            uint64_t osc_ctl,
201            int32_t const *yr);
202
203extern "C" void rsdIntrinsicResizeB2_K(
204            uchar2 *dst,
205            size_t count,
206            uint32_t xf,
207            uint32_t xinc,
208            uchar2 const *srcn,
209            uchar2 const *src0,
210            uchar2 const *src1,
211            uchar2 const *src2,
212            size_t xclip,
213            size_t avail,
214            uint64_t osc_ctl,
215            int32_t const *yr);
216
217extern "C" void rsdIntrinsicResizeB1_K(
218            uchar *dst,
219            size_t count,
220            uint32_t xf,
221            uint32_t xinc,
222            uchar const *srcn,
223            uchar const *src0,
224            uchar const *src1,
225            uchar const *src2,
226            size_t xclip,
227            size_t avail,
228            uint64_t osc_ctl,
229            int32_t const *yr);
230
231#if defined(ARCH_ARM_USE_INTRINSICS)
// Builds the four vertical filter coefficients for fractional row position
// yf, in 16.16 fixed point (0x10000 == 1.0), and stores them in yr[0..3].
// yr[0] and yr[3] are stored with their sign flipped relative to the cubic
// weights; the consumer of yr is expected to account for that (see the
// rsdIntrinsicResizeB*_K routines -- confirm).
static void mkYCoeff(int32_t *yr, float yf) {
    // yf, yf^2 and yf^3 converted to fixed point.
    const int32_t t1 = rint(yf * 0x10000);
    const int32_t t2 = rint(yf * yf * 0x10000);
    const int32_t t3 = rint(yf * yf * yf * 0x10000);

    // Doubled coefficients; the final shift halves them.
    const int32_t twiceC0 = -(2 * t2 - t3 - t1);
    const int32_t twiceC1 = 3 * t3 - 5 * t2 + 0x20000;
    const int32_t twiceC2 = -3 * t3 + 4 * t2 + t1;
    const int32_t twiceC3 = -(t3 - t2);

    yr[0] = twiceC0 >> 1;
    yr[1] = twiceC1 >> 1;
    yr[2] = twiceC2 >> 1;
    yr[3] = twiceC3 >> 1;
}
242#endif
243
244static float4 OneBiCubic(const float4 *yp0, const float4 *yp1, const float4 *yp2, const float4 *yp3,
245                         float xf, float yf, int width) {
246    int startx = (int) floor(xf - 1);
247    xf = xf - floor(xf);
248    int maxx = width - 1;
249    int xs0 = rsMax(0, startx + 0);
250    int xs1 = rsMax(0, startx + 1);
251    int xs2 = rsMin(maxx, startx + 2);
252    int xs3 = rsMin(maxx, startx + 3);
253
254    float4 p0  = cubicInterpolate(yp0[xs0], yp0[xs1],
255                                  yp0[xs2], yp0[xs3], xf);
256    float4 p1  = cubicInterpolate(yp1[xs0], yp1[xs1],
257                                  yp1[xs2], yp1[xs3], xf);
258    float4 p2  = cubicInterpolate(yp2[xs0], yp2[xs1],
259                                  yp2[xs2], yp2[xs3], xf);
260    float4 p3  = cubicInterpolate(yp3[xs0], yp3[xs1],
261                                  yp3[xs2], yp3[xs3], xf);
262
263    float4 p  = cubicInterpolate(p0, p1, p2, p3, yf);
264    return p;
265}
266
267static float2 OneBiCubic(const float2 *yp0, const float2 *yp1, const float2 *yp2, const float2 *yp3,
268                         float xf, float yf, int width) {
269    int startx = (int) floor(xf - 1);
270    xf = xf - floor(xf);
271    int maxx = width - 1;
272    int xs0 = rsMax(0, startx + 0);
273    int xs1 = rsMax(0, startx + 1);
274    int xs2 = rsMin(maxx, startx + 2);
275    int xs3 = rsMin(maxx, startx + 3);
276
277    float2 p0  = cubicInterpolate(yp0[xs0], yp0[xs1],
278                                  yp0[xs2], yp0[xs3], xf);
279    float2 p1  = cubicInterpolate(yp1[xs0], yp1[xs1],
280                                  yp1[xs2], yp1[xs3], xf);
281    float2 p2  = cubicInterpolate(yp2[xs0], yp2[xs1],
282                                  yp2[xs2], yp2[xs3], xf);
283    float2 p3  = cubicInterpolate(yp3[xs0], yp3[xs1],
284                                  yp3[xs2], yp3[xs3], xf);
285
286    float2 p  = cubicInterpolate(p0, p1, p2, p3, yf);
287    return p;
288}
289
290static float OneBiCubic(const float *yp0, const float *yp1, const float *yp2, const float *yp3,
291                        float xf, float yf, int width) {
292    int startx = (int) floor(xf - 1);
293    xf = xf - floor(xf);
294    int maxx = width - 1;
295    int xs0 = rsMax(0, startx + 0);
296    int xs1 = rsMax(0, startx + 1);
297    int xs2 = rsMin(maxx, startx + 2);
298    int xs3 = rsMin(maxx, startx + 3);
299
300    float p0  = cubicInterpolate(yp0[xs0], yp0[xs1],
301                                 yp0[xs2], yp0[xs3], xf);
302    float p1  = cubicInterpolate(yp1[xs0], yp1[xs1],
303                                 yp1[xs2], yp1[xs3], xf);
304    float p2  = cubicInterpolate(yp2[xs0], yp2[xs1],
305                                 yp2[xs2], yp2[xs3], xf);
306    float p3  = cubicInterpolate(yp3[xs0], yp3[xs1],
307                                 yp3[xs2], yp3[xs3], xf);
308
309    float p  = cubicInterpolate(p0, p1, p2, p3, yf);
310    return p;
311}
312
313void RsdCpuScriptIntrinsicResize::kernelU4(const RsExpandKernelDriverInfo *info,
314                                                uint32_t xstart, uint32_t xend,
315                                                uint32_t outstep) {
316    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
317
318    if (!cp->mAlloc.get()) {
319        ALOGE("Resize executed without input, skipping");
320        return;
321    }
322    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
323    const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
324    const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
325    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
326
327    float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
328    int starty = (int) floor(yf - 1);
329    yf = yf - floor(yf);
330    int maxy = srcHeight - 1;
331    int ys0 = rsMax(0, starty + 0);
332    int ys1 = rsMax(0, starty + 1);
333    int ys2 = rsMin(maxy, starty + 2);
334    int ys3 = rsMin(maxy, starty + 3);
335
336    const uchar4 *yp0 = (const uchar4 *)(pin + stride * ys0);
337    const uchar4 *yp1 = (const uchar4 *)(pin + stride * ys1);
338    const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2);
339    const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3);
340
341    uchar4 *out = ((uchar4 *)info->outPtr[0]) + xstart;
342    uint32_t x1 = xstart;
343    uint32_t x2 = xend;
344
345#if defined(ARCH_ARM_USE_INTRINSICS)
346    if (gArchUseSIMD && x2 > x1 && cp->scaleX < 4.0f) {
347        float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
348        long xf16 = rint(xf * 0x10000);
349        uint32_t xinc16 = rint(cp->scaleX * 0x10000);
350
351        int xoff = (xf16 >> 16) - 1;
352        int xclip = rsMax(0, xoff) - xoff;
353        int len = x2 - x1;
354
355        int32_t yr[4];
356        uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
357        mkYCoeff(yr, yf);
358
359        xoff += xclip;
360
361        rsdIntrinsicResizeB4_K(
362                out, len,
363                xf16 & 0xffff, xinc16,
364                yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
365                xclip, srcWidth - xoff + xclip,
366                osc_ctl, yr);
367        out += len;
368        x1 += len;
369    }
370#endif
371
372    while(x1 < x2) {
373        float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
374        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
375        out++;
376        x1++;
377    }
378}
379
380void RsdCpuScriptIntrinsicResize::kernelU2(const RsExpandKernelDriverInfo *info,
381                                                uint32_t xstart, uint32_t xend,
382                                                uint32_t outstep) {
383    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
384
385    if (!cp->mAlloc.get()) {
386        ALOGE("Resize executed without input, skipping");
387        return;
388    }
389    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
390    const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
391    const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
392    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
393
394    float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
395    int starty = (int) floor(yf - 1);
396    yf = yf - floor(yf);
397    int maxy = srcHeight - 1;
398    int ys0 = rsMax(0, starty + 0);
399    int ys1 = rsMax(0, starty + 1);
400    int ys2 = rsMin(maxy, starty + 2);
401    int ys3 = rsMin(maxy, starty + 3);
402
403    const uchar2 *yp0 = (const uchar2 *)(pin + stride * ys0);
404    const uchar2 *yp1 = (const uchar2 *)(pin + stride * ys1);
405    const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2);
406    const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3);
407
408    uchar2 *out = ((uchar2 *)info->outPtr[0]) + xstart;
409    uint32_t x1 = xstart;
410    uint32_t x2 = xend;
411
412#if defined(ARCH_ARM_USE_INTRINSICS)
413    if (gArchUseSIMD && x2 > x1 && cp->scaleX < 4.0f) {
414        float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
415        long xf16 = rint(xf * 0x10000);
416        uint32_t xinc16 = rint(cp->scaleX * 0x10000);
417
418        int xoff = (xf16 >> 16) - 1;
419        int xclip = rsMax(0, xoff) - xoff;
420        int len = x2 - x1;
421
422        int32_t yr[4];
423        uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
424        mkYCoeff(yr, yf);
425
426        xoff += xclip;
427
428        rsdIntrinsicResizeB2_K(
429                out, len,
430                xf16 & 0xffff, xinc16,
431                yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
432                xclip, srcWidth - xoff + xclip,
433                osc_ctl, yr);
434        out += len;
435        x1 += len;
436    }
437#endif
438
439    while(x1 < x2) {
440        float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
441        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
442        out++;
443        x1++;
444    }
445}
446
447void RsdCpuScriptIntrinsicResize::kernelU1(const RsExpandKernelDriverInfo *info,
448                                                uint32_t xstart, uint32_t xend,
449                                                uint32_t outstep) {
450    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
451
452    if (!cp->mAlloc.get()) {
453        ALOGE("Resize executed without input, skipping");
454        return;
455    }
456    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
457    const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
458    const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
459    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
460
461    float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
462    int starty = (int) floor(yf - 1);
463    yf = yf - floor(yf);
464    int maxy = srcHeight - 1;
465    int ys0 = rsMax(0, starty + 0);
466    int ys1 = rsMax(0, starty + 1);
467    int ys2 = rsMin(maxy, starty + 2);
468    int ys3 = rsMin(maxy, starty + 3);
469
470    const uchar *yp0 = pin + stride * ys0;
471    const uchar *yp1 = pin + stride * ys1;
472    const uchar *yp2 = pin + stride * ys2;
473    const uchar *yp3 = pin + stride * ys3;
474
475    uchar *out = ((uchar *)info->outPtr[0]) + xstart;
476    uint32_t x1 = xstart;
477    uint32_t x2 = xend;
478
479#if defined(ARCH_ARM_USE_INTRINSICS)
480    if (gArchUseSIMD && x2 > x1 && cp->scaleX < 4.0f) {
481        float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
482        long xf16 = rint(xf * 0x10000);
483        uint32_t xinc16 = rint(cp->scaleX * 0x10000);
484
485        int xoff = (xf16 >> 16) - 1;
486        int xclip = rsMax(0, xoff) - xoff;
487        int len = x2 - x1;
488
489        int32_t yr[4];
490        uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
491        mkYCoeff(yr, yf);
492
493        xoff += xclip;
494
495        rsdIntrinsicResizeB1_K(
496                out, len,
497                xf16 & 0xffff, xinc16,
498                yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
499                xclip, srcWidth - xoff + xclip,
500                osc_ctl, yr);
501        out += len;
502        x1 += len;
503    }
504#endif
505
506    while(x1 < x2) {
507        float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
508        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
509        out++;
510        x1++;
511    }
512}
513
514void RsdCpuScriptIntrinsicResize::kernelF4(const RsExpandKernelDriverInfo *info,
515                                                uint32_t xstart, uint32_t xend,
516                                                uint32_t outstep) {
517    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
518
519    if (!cp->mAlloc.get()) {
520        ALOGE("Resize executed without input, skipping");
521        return;
522    }
523    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
524    const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
525    const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
526    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
527
528    float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
529    int starty = (int) floor(yf - 1);
530    yf = yf - floor(yf);
531    int maxy = srcHeight - 1;
532    int ys0 = rsMax(0, starty + 0);
533    int ys1 = rsMax(0, starty + 1);
534    int ys2 = rsMin(maxy, starty + 2);
535    int ys3 = rsMin(maxy, starty + 3);
536
537    const float4 *yp0 = (const float4 *)(pin + stride * ys0);
538    const float4 *yp1 = (const float4 *)(pin + stride * ys1);
539    const float4 *yp2 = (const float4 *)(pin + stride * ys2);
540    const float4 *yp3 = (const float4 *)(pin + stride * ys3);
541
542    float4 *out = ((float4 *)info->outPtr[0]) + xstart;
543    uint32_t x1 = xstart;
544    uint32_t x2 = xend;
545
546    while(x1 < x2) {
547        float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
548        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
549        out++;
550        x1++;
551    }
552}
553
554void RsdCpuScriptIntrinsicResize::kernelF2(const RsExpandKernelDriverInfo *info,
555                                                uint32_t xstart, uint32_t xend,
556                                                uint32_t outstep) {
557    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
558
559    if (!cp->mAlloc.get()) {
560        ALOGE("Resize executed without input, skipping");
561        return;
562    }
563    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
564    const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
565    const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
566    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
567
568    float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
569    int starty = (int) floor(yf - 1);
570    yf = yf - floor(yf);
571    int maxy = srcHeight - 1;
572    int ys0 = rsMax(0, starty + 0);
573    int ys1 = rsMax(0, starty + 1);
574    int ys2 = rsMin(maxy, starty + 2);
575    int ys3 = rsMin(maxy, starty + 3);
576
577    const float2 *yp0 = (const float2 *)(pin + stride * ys0);
578    const float2 *yp1 = (const float2 *)(pin + stride * ys1);
579    const float2 *yp2 = (const float2 *)(pin + stride * ys2);
580    const float2 *yp3 = (const float2 *)(pin + stride * ys3);
581
582    float2 *out = ((float2 *)info->outPtr[0]) + xstart;
583    uint32_t x1 = xstart;
584    uint32_t x2 = xend;
585
586    while(x1 < x2) {
587        float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
588        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
589        out++;
590        x1++;
591    }
592}
593
594void RsdCpuScriptIntrinsicResize::kernelF1(const RsExpandKernelDriverInfo *info,
595                                                uint32_t xstart, uint32_t xend,
596                                                uint32_t outstep) {
597    RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
598
599    if (!cp->mAlloc.get()) {
600        ALOGE("Resize executed without input, skipping");
601        return;
602    }
603    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
604    const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
605    const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
606    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
607
608    float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
609    int starty = (int) floor(yf - 1);
610    yf = yf - floor(yf);
611    int maxy = srcHeight - 1;
612    int ys0 = rsMax(0, starty + 0);
613    int ys1 = rsMax(0, starty + 1);
614    int ys2 = rsMin(maxy, starty + 2);
615    int ys3 = rsMin(maxy, starty + 3);
616
617    const float *yp0 = (const float *)(pin + stride * ys0);
618    const float *yp1 = (const float *)(pin + stride * ys1);
619    const float *yp2 = (const float *)(pin + stride * ys2);
620    const float *yp3 = (const float *)(pin + stride * ys3);
621
622    float *out = ((float *)info->outPtr[0]) + xstart;
623    uint32_t x1 = xstart;
624    uint32_t x2 = xend;
625
626    while(x1 < x2) {
627        float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
628        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
629        out++;
630        x1++;
631    }
632}
633
634RsdCpuScriptIntrinsicResize::RsdCpuScriptIntrinsicResize (
635            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
636            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_RESIZE) {
637
638}
639
640RsdCpuScriptIntrinsicResize::~RsdCpuScriptIntrinsicResize() {
641}
642
643void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot,
644                                            const Allocation ** ains,
645                                            uint32_t inLen, Allocation * aout,
646                                            const void * usr, uint32_t usrLen,
647                                            const RsScriptCall *sc)
648{
649    if (!mAlloc.get()) {
650        ALOGE("Resize executed without input, skipping");
651        return;
652    }
653    const uint32_t srcHeight = mAlloc->mHal.drvState.lod[0].dimY;
654    const uint32_t srcWidth = mAlloc->mHal.drvState.lod[0].dimX;
655    const size_t stride = mAlloc->mHal.drvState.lod[0].stride;
656
657    //check the data type to determine F or U.
658    if (mAlloc->getType()->getElement()->getType() == RS_TYPE_UNSIGNED_8) {
659        switch(mAlloc->getType()->getElement()->getVectorSize()) {
660        case 1:
661            mRootPtr = &kernelU1;
662            break;
663        case 2:
664            mRootPtr = &kernelU2;
665            break;
666        case 3:
667        case 4:
668            mRootPtr = &kernelU4;
669            break;
670        }
671    } else {
672        switch(mAlloc->getType()->getElement()->getVectorSize()) {
673        case 1:
674            mRootPtr = &kernelF1;
675            break;
676        case 2:
677            mRootPtr = &kernelF2;
678            break;
679        case 3:
680        case 4:
681            mRootPtr = &kernelF4;
682            break;
683        }
684    }
685
686    scaleX = (float)srcWidth / aout->mHal.drvState.lod[0].dimX;
687    scaleY = (float)srcHeight / aout->mHal.drvState.lod[0].dimY;
688
689}
690
691void RsdCpuScriptIntrinsicResize::populateScript(Script *s) {
692    s->mHal.info.exportedVariableCount = 1;
693}
694
695void RsdCpuScriptIntrinsicResize::invokeFreeChildren() {
696    mAlloc.clear();
697}
698
699
700RsdCpuScriptImpl * rsdIntrinsic_Resize(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
701
702    return new RsdCpuScriptIntrinsicResize(ctx, s, e);
703}
704