1
2#include <arm_neon.h>
3
4
// Names of the six matrix procs generated by this template. MAKENAME is not
// defined in this file; it is expected to come from the including file.

// ... unfiltered variants
#define SCALE_NOFILTER_NAME     MAKENAME(_nofilter_scale)
#define AFFINE_NOFILTER_NAME    MAKENAME(_nofilter_affine)
#define PERSP_NOFILTER_NAME     MAKENAME(_nofilter_persp)

// ... filtered variants
#define SCALE_FILTER_NAME       MAKENAME(_filter_scale)
#define AFFINE_FILTER_NAME      MAKENAME(_filter_affine)
#define PERSP_FILTER_NAME       MAKENAME(_filter_persp)

// Names of the coordinate-packing helpers used by the filtered procs:
// scalar versions first, then the 4-lane NEON versions.
#define PACK_FILTER_X_NAME      MAKENAME(_pack_filter_x)
#define PACK_FILTER_Y_NAME      MAKENAME(_pack_filter_y)
#define PACK_FILTER_X4_NAME     MAKENAME(_pack_filter_x4)
#define PACK_FILTER_Y4_NAME     MAKENAME(_pack_filter_y4)
16
// Optional hooks: when the including file has not defined PREAMBLE, all five
// hooks expand to nothing, so the procs and packing helpers below get no
// extra setup statement, parameters or arguments.
#if !defined(PREAMBLE)
    #define PREAMBLE(state)
    #define PREAMBLE_PARAM_X
    #define PREAMBLE_PARAM_Y
    #define PREAMBLE_ARG_X
    #define PREAMBLE_ARG_Y
#endif
24
25static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
26                                uint32_t xy[], int count, int x, int y) {
27    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
28                             SkMatrix::kScale_Mask)) == 0);
29
30    PREAMBLE(s);
31
32    // we store y, x, x, x, x, x
33    const unsigned maxX = s.fBitmap->width() - 1;
34    SkFractionalInt fx;
35    {
36        SkPoint pt;
37        s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
38                                 SkIntToScalar(y) + SK_ScalarHalf, &pt);
39        fx = SkScalarToFractionalInt(pt.fY);
40        const unsigned maxY = s.fBitmap->height() - 1;
41        *xy++ = TILEY_PROCF(SkFractionalIntToFixed(fx), maxY);
42        fx = SkScalarToFractionalInt(pt.fX);
43    }
44
45    if (0 == maxX) {
46        // all of the following X values must be 0
47        memset(xy, 0, count * sizeof(uint16_t));
48        return;
49    }
50
51    const SkFractionalInt dx = s.fInvSxFractionalInt;
52
53#ifdef CHECK_FOR_DECAL
54    // test if we don't need to apply the tile proc
55    if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
56        decal_nofilter_scale_neon(xy, SkFractionalIntToFixed(fx),
57                             SkFractionalIntToFixed(dx), count);
58        return;
59    }
60#endif
61
62    if (count >= 8) {
63        SkFractionalInt dx2 = dx+dx;
64        SkFractionalInt dx4 = dx2+dx2;
65        SkFractionalInt dx8 = dx4+dx4;
66
67        // now build fx/fx+dx/fx+2dx/fx+3dx
68        SkFractionalInt fx1, fx2, fx3;
69        int32x4_t lbase, hbase;
70        int16_t *dst16 = (int16_t *)xy;
71
72        fx1 = fx+dx;
73        fx2 = fx1+dx;
74        fx3 = fx2+dx;
75
76        lbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
77        lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx1), lbase, 1);
78        lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx2), lbase, 2);
79        lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx3), lbase, 3);
80        hbase = vaddq_s32(lbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
81
82        // store & bump
83        while (count >= 8) {
84
85            int16x8_t fx8;
86
87            fx8 = TILEX_PROCF_NEON8(lbase, hbase, maxX);
88
89            vst1q_s16(dst16, fx8);
90
91            // but preserving base & on to the next
92            lbase = vaddq_s32 (lbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
93            hbase = vaddq_s32 (hbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
94            dst16 += 8;
95            count -= 8;
96            fx += dx8;
97        };
98        xy = (uint32_t *) dst16;
99    }
100
101    uint16_t* xx = (uint16_t*)xy;
102    for (int i = count; i > 0; --i) {
103        *xx++ = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
104        fx += dx;
105    }
106}
107
108static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
109                                 uint32_t xy[], int count, int x, int y) {
110    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
111    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
112                             SkMatrix::kScale_Mask |
113                             SkMatrix::kAffine_Mask)) == 0);
114
115    PREAMBLE(s);
116    SkPoint srcPt;
117    s.fInvProc(s.fInvMatrix,
118               SkIntToScalar(x) + SK_ScalarHalf,
119               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
120
121    SkFractionalInt fx = SkScalarToFractionalInt(srcPt.fX);
122    SkFractionalInt fy = SkScalarToFractionalInt(srcPt.fY);
123    SkFractionalInt dx = s.fInvSxFractionalInt;
124    SkFractionalInt dy = s.fInvKyFractionalInt;
125    int maxX = s.fBitmap->width() - 1;
126    int maxY = s.fBitmap->height() - 1;
127
128    if (count >= 8) {
129        SkFractionalInt dx4 = dx * 4;
130        SkFractionalInt dy4 = dy * 4;
131        SkFractionalInt dx8 = dx * 8;
132        SkFractionalInt dy8 = dy * 8;
133
134        int32x4_t xbase, ybase;
135        int32x4_t x2base, y2base;
136        int16_t *dst16 = (int16_t *) xy;
137
138        // now build fx, fx+dx, fx+2dx, fx+3dx
139        xbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
140        xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), xbase, 1);
141        xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), xbase, 2);
142        xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), xbase, 3);
143
144        // same for fy
145        ybase = vdupq_n_s32(SkFractionalIntToFixed(fy));
146        ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy), ybase, 1);
147        ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy), ybase, 2);
148        ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy+dy), ybase, 3);
149
150        x2base = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
151        y2base = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy4)));
152
153        // store & bump
154        do {
155            int16x8x2_t hi16;
156
157            hi16.val[0] = TILEX_PROCF_NEON8(xbase, x2base, maxX);
158            hi16.val[1] = TILEY_PROCF_NEON8(ybase, y2base, maxY);
159
160            vst2q_s16(dst16, hi16);
161
162            // moving base and on to the next
163            xbase = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
164            ybase = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
165            x2base = vaddq_s32(x2base, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
166            y2base = vaddq_s32(y2base, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
167
168            dst16 += 16; // 8x32 aka 16x16
169            count -= 8;
170            fx += dx8;
171            fy += dy8;
172        } while (count >= 8);
173        xy = (uint32_t *) dst16;
174    }
175
176    for (int i = count; i > 0; --i) {
177        *xy++ = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) |
178                 TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
179        fx += dx; fy += dy;
180    }
181}
182
183static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
184                                uint32_t* SK_RESTRICT xy,
185                                int count, int x, int y) {
186    SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
187
188    PREAMBLE(s);
189    // max{X,Y} are int here, but later shown/assumed to fit in 16 bits
190    int maxX = s.fBitmap->width() - 1;
191    int maxY = s.fBitmap->height() - 1;
192
193    SkPerspIter iter(s.fInvMatrix,
194                     SkIntToScalar(x) + SK_ScalarHalf,
195                     SkIntToScalar(y) + SK_ScalarHalf, count);
196
197    while ((count = iter.next()) != 0) {
198        const SkFixed* SK_RESTRICT srcXY = iter.getXY();
199
200        if (count >= 8) {
201            int32_t *mysrc = (int32_t *) srcXY;
202            int16_t *mydst = (int16_t *) xy;
203            do {
204                int16x8x2_t hi16;
205                int32x4x2_t xy1, xy2;
206
207                xy1 = vld2q_s32(mysrc);
208                xy2 = vld2q_s32(mysrc+8);
209
210                hi16.val[0] = TILEX_PROCF_NEON8(xy1.val[0], xy2.val[0], maxX);
211                hi16.val[1] = TILEY_PROCF_NEON8(xy1.val[1], xy2.val[1], maxY);
212
213                vst2q_s16(mydst, hi16);
214
215                count -= 8;  // 8 iterations
216                mysrc += 16; // 16 longs
217                mydst += 16; // 16 shorts, aka 8 longs
218            } while (count >= 8);
219            // get xy and srcXY fixed up
220            srcXY = (const SkFixed *) mysrc;
221            xy = (uint32_t *) mydst;
222        }
223
224        while (--count >= 0) {
225            *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
226                     TILEX_PROCF(srcXY[0], maxX);
227            srcXY += 2;
228        }
229    }
230}
231
232static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
233                                          SkFixed one PREAMBLE_PARAM_Y) {
234    unsigned i = TILEY_PROCF(f, max);
235    i = (i << 4) | TILEY_LOW_BITS(f, max);
236    return (i << 14) | (TILEY_PROCF((f + one), max));
237}
238
239static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
240                                          SkFixed one PREAMBLE_PARAM_X) {
241    unsigned i = TILEX_PROCF(f, max);
242    i = (i << 4) | TILEX_LOW_BITS(f, max);
243    return (i << 14) | (TILEX_PROCF((f + one), max));
244}
245
246static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max,
247                                          SkFixed one PREAMBLE_PARAM_X) {
248    int32x4_t ret, res, wide_one;
249
250    // Prepare constants
251    wide_one = vdupq_n_s32(one);
252
253    // Step 1
254    res = TILEX_PROCF_NEON4(f, max);
255
256    // Step 2
257    ret = TILEX_LOW_BITS_NEON4(f, max);
258    ret = vsliq_n_s32(ret, res, 4);
259
260    // Step 3
261    res = TILEX_PROCF_NEON4(f + wide_one, max);
262    ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
263
264    return ret;
265}
266
267static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max,
268                                          SkFixed one PREAMBLE_PARAM_X) {
269    int32x4_t ret, res, wide_one;
270
271    // Prepare constants
272    wide_one = vdupq_n_s32(one);
273
274    // Step 1
275    res = TILEY_PROCF_NEON4(f, max);
276
277    // Step 2
278    ret = TILEY_LOW_BITS_NEON4(f, max);
279    ret = vsliq_n_s32(ret, res, 4);
280
281    // Step 3
282    res = TILEY_PROCF_NEON4(f + wide_one, max);
283    ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
284
285    return ret;
286}
287
288static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
289                              uint32_t xy[], int count, int x, int y) {
290    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
291                             SkMatrix::kScale_Mask)) == 0);
292    SkASSERT(s.fInvKy == 0);
293
294    PREAMBLE(s);
295
296    const unsigned maxX = s.fBitmap->width() - 1;
297    const SkFixed one = s.fFilterOneX;
298    const SkFractionalInt dx = s.fInvSxFractionalInt;
299    SkFractionalInt fx;
300
301    {
302        SkPoint pt;
303        s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
304                                 SkIntToScalar(y) + SK_ScalarHalf, &pt);
305        const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
306        const unsigned maxY = s.fBitmap->height() - 1;
307        // compute our two Y values up front
308        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
309        // now initialize fx
310        fx = SkScalarToFractionalInt(pt.fX) - (SkFixedToFractionalInt(one) >> 1);
311    }
312
313#ifdef CHECK_FOR_DECAL
314    // test if we don't need to apply the tile proc
315    if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
316        decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx),
317                             SkFractionalIntToFixed(dx), count);
318        return;
319    }
320#endif
321    {
322
323    if (count >= 4) {
324        int32x4_t wide_fx;
325
326        wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx));
327        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1);
328        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2);
329        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3);
330
331        while (count >= 4) {
332            int32x4_t res;
333
334            res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X);
335
336            vst1q_u32(xy, vreinterpretq_u32_s32(res));
337
338            wide_fx += vdupq_n_s32(SkFractionalIntToFixed(dx+dx+dx+dx));
339            fx += dx+dx+dx+dx;
340            xy += 4;
341            count -= 4;
342        }
343    }
344
345    while (--count >= 0) {
346        *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X);
347        fx += dx;
348    }
349
350    }
351}
352
353static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
354                               uint32_t xy[], int count, int x, int y) {
355    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
356    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
357                             SkMatrix::kScale_Mask |
358                             SkMatrix::kAffine_Mask)) == 0);
359
360    PREAMBLE(s);
361    SkPoint srcPt;
362    s.fInvProc(s.fInvMatrix,
363               SkIntToScalar(x) + SK_ScalarHalf,
364               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
365
366    SkFixed oneX = s.fFilterOneX;
367    SkFixed oneY = s.fFilterOneY;
368    SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
369    SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
370    SkFixed dx = s.fInvSx;
371    SkFixed dy = s.fInvKy;
372    unsigned maxX = s.fBitmap->width() - 1;
373    unsigned maxY = s.fBitmap->height() - 1;
374
375    if (count >= 4) {
376        int32x4_t wide_fy, wide_fx;
377
378        wide_fx = vdupq_n_s32(fx);
379        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
380        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
381        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
382
383        wide_fy = vdupq_n_s32(fy);
384        wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1);
385        wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2);
386        wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3);
387
388        while (count >= 4) {
389            int32x4x2_t vxy;
390
391            // do the X side, then the Y side, then interleave them
392            vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y);
393            vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X);
394
395            // interleave as YXYXYXYX as part of the storing
396            vst2q_s32((int32_t*)xy, vxy);
397
398            // prepare next iteration
399            wide_fx += vdupq_n_s32(dx+dx+dx+dx);
400            fx += dx + dx + dx + dx;
401            wide_fy += vdupq_n_s32(dy+dy+dy+dy);
402            fy += dy+dy+dy+dy;
403            xy += 8; // 4 x's, 4 y's
404            count -= 4;
405        }
406    }
407
408    while (--count >= 0) {
409        // NB: writing Y/X
410        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
411        fy += dy;
412        *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
413        fx += dx;
414    }
415}
416
417static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
418                              uint32_t* SK_RESTRICT xy, int count,
419                              int x, int y) {
420    SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
421
422    PREAMBLE(s);
423    unsigned maxX = s.fBitmap->width() - 1;
424    unsigned maxY = s.fBitmap->height() - 1;
425    SkFixed oneX = s.fFilterOneX;
426    SkFixed oneY = s.fFilterOneY;
427
428    SkPerspIter iter(s.fInvMatrix,
429                     SkIntToScalar(x) + SK_ScalarHalf,
430                     SkIntToScalar(y) + SK_ScalarHalf, count);
431
432    while ((count = iter.next()) != 0) {
433        const SkFixed* SK_RESTRICT srcXY = iter.getXY();
434
435        while (count >= 4) {
436            int32x4_t wide_x, wide_y;
437            int32x4x2_t vxy, vresyx;
438
439            // load src:  x-y-x-y-x-y-x-y
440            vxy = vld2q_s32(srcXY);
441
442            // do the X side, then the Y side, then interleave them
443            wide_x = vsubq_s32(vxy.val[0], vdupq_n_s32(oneX>>1));
444            wide_y = vsubq_s32(vxy.val[1], vdupq_n_s32(oneY>>1));
445
446            vresyx.val[0] = PACK_FILTER_Y4_NAME(wide_y, maxY, oneY PREAMBLE_ARG_Y);
447            vresyx.val[1] = PACK_FILTER_X4_NAME(wide_x, maxX, oneX PREAMBLE_ARG_X);
448
449            // store interleaved as y-x-y-x-y-x-y-x (NB != read order)
450            vst2q_s32((int32_t*)xy, vresyx);
451
452            // on to the next iteration
453            srcXY += 2*4;
454            count -= 4;
455            xy += 2*4;
456        }
457
458        while (--count >= 0) {
459            // NB: we read x/y, we write y/x
460            *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,
461                                       oneY PREAMBLE_ARG_Y);
462            *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,
463                                       oneX PREAMBLE_ARG_X);
464            srcXY += 2;
465        }
466    }
467}
468
469const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
470    SCALE_NOFILTER_NAME,
471    SCALE_FILTER_NAME,
472    AFFINE_NOFILTER_NAME,
473    AFFINE_FILTER_NAME,
474    PERSP_NOFILTER_NAME,
475    PERSP_FILTER_NAME
476};
477
// Tear down every macro this template consumed or defined, so the file can be
// included again with a different MAKENAME / TILE* configuration.

// NEON tiling macros
#undef TILEX_PROCF_NEON8
#undef TILEY_PROCF_NEON8
#undef TILEX_PROCF_NEON4
#undef TILEY_PROCF_NEON4
#undef TILEX_LOW_BITS_NEON4
#undef TILEY_LOW_BITS_NEON4

// Scalar tiling macros and instantiation switches
#undef MAKENAME
#undef TILEX_PROCF
#undef TILEY_PROCF
#ifdef CHECK_FOR_DECAL
    #undef CHECK_FOR_DECAL
#endif

// Proc names defined at the top of this file
#undef SCALE_NOFILTER_NAME
#undef SCALE_FILTER_NAME
#undef AFFINE_NOFILTER_NAME
#undef AFFINE_FILTER_NAME
#undef PERSP_NOFILTER_NAME
#undef PERSP_FILTER_NAME

// Packing-helper names defined at the top of this file; previously these were
// left defined after the template ended.
#undef PACK_FILTER_X_NAME
#undef PACK_FILTER_Y_NAME
#undef PACK_FILTER_X4_NAME
#undef PACK_FILTER_Y4_NAME

// Optional hooks
#undef PREAMBLE
#undef PREAMBLE_PARAM_X
#undef PREAMBLE_PARAM_Y
#undef PREAMBLE_ARG_X
#undef PREAMBLE_ARG_Y

#undef TILEX_LOW_BITS
#undef TILEY_LOW_BITS
507