1/*
2 * Copyright 2014 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include <arm_neon.h>
9
// Names of the six matrix procs defined below. MAKENAME is not defined in this
// file: it has to be provided by the code that includes it, and it is #undef'd
// again at the bottom of this file.
10#define SCALE_NOFILTER_NAME     MAKENAME(_nofilter_scale)
11#define SCALE_FILTER_NAME       MAKENAME(_filter_scale)
12#define AFFINE_NOFILTER_NAME    MAKENAME(_nofilter_affine)
13#define AFFINE_FILTER_NAME      MAKENAME(_filter_affine)
14#define PERSP_NOFILTER_NAME     MAKENAME(_nofilter_persp)
15#define PERSP_FILTER_NAME       MAKENAME(_filter_persp)
16
// Names of the helpers that pack filter coordinates into 32-bit words: the
// plain variants handle one SkFixed value, the *4 variants an int32x4_t.
17#define PACK_FILTER_X_NAME  MAKENAME(_pack_filter_x)
18#define PACK_FILTER_Y_NAME  MAKENAME(_pack_filter_y)
19#define PACK_FILTER_X4_NAME MAKENAME(_pack_filter_x4)
20#define PACK_FILTER_Y4_NAME MAKENAME(_pack_filter_y4)
21
// Optional PREAMBLE hooks. If the includer did not define PREAMBLE, all five
// macros expand to nothing. PREAMBLE(state) is expanded near the top of every
// proc, the *_PARAM_* macros are appended to the pack helpers' parameter
// lists, and the *_ARG_* macros are appended at the matching call sites.
22#ifndef PREAMBLE
23    #define PREAMBLE(state)
24    #define PREAMBLE_PARAM_X
25    #define PREAMBLE_PARAM_Y
26    #define PREAMBLE_ARG_X
27    #define PREAMBLE_ARG_Y
28#endif
29
// Matrix proc for a scale/translate-only inverse matrix without filtering.
//
// Output layout: the first 32-bit word of xy[] receives the tiled Y
// coordinate; the `count` tiled X coordinates follow as consecutive 16-bit
// values.
static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
                                uint32_t xy[], int count, int x, int y) {
    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
                             SkMatrix::kScale_Mask)) == 0);

    PREAMBLE(s);

    const unsigned maxX = s.fPixmap.width() - 1;
    SkFractionalInt fx;
    {
        // Y does not change along the span: emit it once, then seed X.
        const SkBitmapProcStateAutoMapper mapper(s, x, y);
        const unsigned maxY = s.fPixmap.height() - 1;
        *xy++ = TILEY_PROCF(mapper.fixedY(), maxY);
        fx = mapper.fractionalIntX();
    }

    if (maxX == 0) {
        // Single-column source: every X coordinate is 0.
        memset(xy, 0, count * sizeof(uint16_t));
        return;
    }

    const SkFractionalInt dx = s.fInvSxFractionalInt;

#ifdef CHECK_FOR_DECAL
    // Skip the tile proc entirely when the decal helper can handle the span.
    if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
        decal_nofilter_scale_neon(xy, SkFractionalIntToFixed(fx),
                                  SkFractionalIntToFixed(dx), count);
        return;
    }
#endif

    if (count >= 8) {
        const SkFractionalInt step4 = dx * 4;
        const SkFractionalInt step8 = dx * 8;
        int16_t* out16 = (int16_t*)xy;

        // Low vector holds fx, fx+dx, fx+2dx, fx+3dx (already truncated to
        // SkFixed); the high vector is the same four lanes advanced by 4*dx.
        SkFractionalInt lane = fx;
        int32x4_t lo4 = vdupq_n_s32(SkFractionalIntToFixed(lane));
        lane += dx;
        lo4 = vsetq_lane_s32(SkFractionalIntToFixed(lane), lo4, 1);
        lane += dx;
        lo4 = vsetq_lane_s32(SkFractionalIntToFixed(lane), lo4, 2);
        lane += dx;
        lo4 = vsetq_lane_s32(SkFractionalIntToFixed(lane), lo4, 3);
        int32x4_t hi4 = vaddq_s32(lo4, vdupq_n_s32(SkFractionalIntToFixed(step4)));

        const int32x4_t bump8 = vdupq_n_s32(SkFractionalIntToFixed(step8));

        // count >= 8 is guaranteed on entry, so test at the bottom.
        do {
            const int16x8_t tiled = TILEX_PROCF_NEON8(lo4, hi4, maxX);
            vst1q_s16(out16, tiled);

            lo4 = vaddq_s32(lo4, bump8);
            hi4 = vaddq_s32(hi4, bump8);
            out16 += 8;
            count -= 8;
            // Keep the scalar accumulator in step for the tail loop below.
            fx += step8;
        } while (count >= 8);

        xy = (uint32_t*)out16;
    }

    // Scalar tail: fewer than 8 coordinates left.
    uint16_t* out = (uint16_t*)xy;
    while (count-- > 0) {
        *out++ = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
        fx += dx;
    }
}
109
// Matrix proc for an affine inverse matrix without filtering.
//
// Emits one 32-bit word per pixel: the tiled Y coordinate shifted into the
// upper 16 bits, OR'd with the tiled X coordinate.
static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
                                 uint32_t xy[], int count, int x, int y) {
    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
                             SkMatrix::kScale_Mask |
                             SkMatrix::kAffine_Mask)) == 0);

    PREAMBLE(s);
    const SkBitmapProcStateAutoMapper mapper(s, x, y);

    SkFractionalInt fx = mapper.fractionalIntX();
    SkFractionalInt fy = mapper.fractionalIntY();
    const SkFractionalInt dx = s.fInvSxFractionalInt;
    const SkFractionalInt dy = s.fInvKyFractionalInt;
    int maxX = s.fPixmap.width() - 1;
    int maxY = s.fPixmap.height() - 1;

    if (count >= 8) {
        const SkFractionalInt dx8 = dx * 8;
        const SkFractionalInt dy8 = dy * 8;
        int16_t* out16 = (int16_t*)xy;

        // Lanes 0..3 of the "lo" vectors hold the first four coordinates
        // (truncated to SkFixed); the "hi" vectors hold the next four.
        int32x4_t xLo = vdupq_n_s32(SkFractionalIntToFixed(fx));
        xLo = vsetq_lane_s32(SkFractionalIntToFixed(fx + dx), xLo, 1);
        xLo = vsetq_lane_s32(SkFractionalIntToFixed(fx + dx + dx), xLo, 2);
        xLo = vsetq_lane_s32(SkFractionalIntToFixed(fx + dx + dx + dx), xLo, 3);

        int32x4_t yLo = vdupq_n_s32(SkFractionalIntToFixed(fy));
        yLo = vsetq_lane_s32(SkFractionalIntToFixed(fy + dy), yLo, 1);
        yLo = vsetq_lane_s32(SkFractionalIntToFixed(fy + dy + dy), yLo, 2);
        yLo = vsetq_lane_s32(SkFractionalIntToFixed(fy + dy + dy + dy), yLo, 3);

        int32x4_t xHi = vaddq_s32(xLo, vdupq_n_s32(SkFractionalIntToFixed(dx * 4)));
        int32x4_t yHi = vaddq_s32(yLo, vdupq_n_s32(SkFractionalIntToFixed(dy * 4)));

        const int32x4_t xBump = vdupq_n_s32(SkFractionalIntToFixed(dx8));
        const int32x4_t yBump = vdupq_n_s32(SkFractionalIntToFixed(dy8));

        while (count >= 8) {
            int16x8x2_t packed;
            packed.val[0] = TILEX_PROCF_NEON8(xLo, xHi, maxX);
            packed.val[1] = TILEY_PROCF_NEON8(yLo, yHi, maxY);

            // Interleaving store: X, Y, X, Y, ... as 16-bit values.
            vst2q_s16(out16, packed);

            xLo = vaddq_s32(xLo, xBump);
            yLo = vaddq_s32(yLo, yBump);
            xHi = vaddq_s32(xHi, xBump);
            yHi = vaddq_s32(yHi, yBump);

            out16 += 16;  // 8 pixels, two 16-bit values each
            count -= 8;
            // Keep the scalar accumulators in step for the tail loop.
            fx += dx8;
            fy += dy8;
        }
        xy = (uint32_t*)out16;
    }

    // Scalar tail: fewer than 8 pixels left.
    while (count-- > 0) {
        *xy++ = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) |
                 TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
        fx += dx;
        fy += dy;
    }
}
181
// Matrix proc for a perspective inverse matrix without filtering.
//
// Pulls batches of interleaved x/y SkFixed pairs from an SkPerspIter and emits
// one 32-bit word per pixel: tiled Y in the upper 16 bits, tiled X below it.
static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
                                uint32_t* SK_RESTRICT xy,
                                int count, int x, int y) {
    SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);

    PREAMBLE(s);
    // Held as int; the packing below assumes the tiled results fit in 16 bits.
    int maxX = s.fPixmap.width() - 1;
    int maxY = s.fPixmap.height() - 1;

    SkPerspIter iter(s.fInvMatrix,
                     SkIntToScalar(x) + SK_ScalarHalf,
                     SkIntToScalar(y) + SK_ScalarHalf, count);

    // Each next() call yields the number of pairs available in this batch;
    // zero means the iterator is exhausted.
    while ((count = iter.next()) != 0) {
        const SkFixed* SK_RESTRICT srcXY = iter.getXY();

        const int32_t* in32 = (const int32_t*)srcXY;
        int16_t* out16 = (int16_t*)xy;

        // Vector path: 8 pixels (16 source values) per iteration.
        while (count >= 8) {
            // De-interleaving loads: val[0] gets the x's, val[1] the y's.
            const int32x4x2_t first4 = vld2q_s32(in32);
            const int32x4x2_t next4 = vld2q_s32(in32 + 8);

            int16x8x2_t packed;
            packed.val[0] = TILEX_PROCF_NEON8(first4.val[0], next4.val[0], maxX);
            packed.val[1] = TILEY_PROCF_NEON8(first4.val[1], next4.val[1], maxY);

            vst2q_s16(out16, packed);

            count -= 8;
            in32 += 16;   // 8 x/y pairs consumed
            out16 += 16;  // 8 pixels, two 16-bit values each
        }

        // Hand the advanced positions back to the scalar tail.
        srcXY = (const SkFixed*)in32;
        xy = (uint32_t*)out16;

        for (; count > 0; --count) {
            *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
                     TILEX_PROCF(srcXY[0], maxX);
            srcXY += 2;
        }
    }
}
230
231static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
232                                          SkFixed one PREAMBLE_PARAM_Y) {
233    unsigned i = TILEY_PROCF(f, max);
234    i = (i << 4) | TILEY_LOW_BITS(f, max);
235    return (i << 14) | (TILEY_PROCF((f + one), max));
236}
237
238static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
239                                          SkFixed one PREAMBLE_PARAM_X) {
240    unsigned i = TILEX_PROCF(f, max);
241    i = (i << 4) | TILEX_LOW_BITS(f, max);
242    return (i << 14) | (TILEX_PROCF((f + one), max));
243}
244
// Four-lane version of PACK_FILTER_X_NAME: packs four filtered X coordinates
// held in `f`, producing per lane
//   ((tile(f) << 4) | low_bits(f)) << 14  |  tile(f + one)
//
// f   - four SkFixed X coordinates
// max - largest valid X index, forwarded to the tile macros
// one - SkFixed step to the neighbouring sample, added to every lane
static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max,
                                          SkFixed one PREAMBLE_PARAM_X) {
    const int32x4_t wide_one = vdupq_n_s32(one);

    // Step 1: tile the first coordinate.
    int32x4_t res = TILEX_PROCF_NEON4(f, max);

    // Step 2: shift-left-and-insert puts (res << 4) above the low 4 bits of
    // the interpolation weight.
    int32x4_t ret = TILEX_LOW_BITS_NEON4(f, max);
    ret = vsliq_n_s32(ret, res, 4);

    // Step 3: tile the neighbouring coordinate. Use the intrinsic instead of
    // the GNU vector-extension '+' so this builds on any NEON toolchain and
    // the macro argument is a single call expression.
    res = TILEX_PROCF_NEON4(vaddq_s32(f, wide_one), max);
    ret = vorrq_s32(vshlq_n_s32(ret, 14), res);

    return ret;
}
265
// Four-lane version of PACK_FILTER_Y_NAME: packs four filtered Y coordinates
// held in `f`, producing per lane
//   ((tile(f) << 4) | low_bits(f)) << 14  |  tile(f + one)
//
// f   - four SkFixed Y coordinates
// max - largest valid Y index, forwarded to the tile macros
// one - SkFixed step to the neighbouring sample, added to every lane
//
// The trailing parameter hook is PREAMBLE_PARAM_Y so that it matches the
// PREAMBLE_ARG_Y every caller passes and the scalar Y helper's signature.
static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max,
                                          SkFixed one PREAMBLE_PARAM_Y) {
    const int32x4_t wide_one = vdupq_n_s32(one);

    // Step 1: tile the first coordinate.
    int32x4_t res = TILEY_PROCF_NEON4(f, max);

    // Step 2: shift-left-and-insert puts (res << 4) above the low 4 bits of
    // the interpolation weight.
    int32x4_t ret = TILEY_LOW_BITS_NEON4(f, max);
    ret = vsliq_n_s32(ret, res, 4);

    // Step 3: tile the neighbouring coordinate. Use the intrinsic instead of
    // the GNU vector-extension '+' so this builds on any NEON toolchain and
    // the macro argument is a single call expression.
    res = TILEY_PROCF_NEON4(vaddq_s32(f, wide_one), max);
    ret = vorrq_s32(vshlq_n_s32(ret, 14), res);

    return ret;
}
286
// Matrix proc for a scale/translate-only inverse matrix with filtering.
//
// Output layout: xy[0] receives the packed Y word (computed once for the whole
// span), followed by `count` packed X words, one per pixel.
static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
                              uint32_t xy[], int count, int x, int y) {
    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
                             SkMatrix::kScale_Mask)) == 0);
    SkASSERT(s.fInvKy == 0);

    PREAMBLE(s);

    const unsigned maxX = s.fPixmap.width() - 1;
    const SkFixed one = s.fFilterOneX;
    const SkFractionalInt dx = s.fInvSxFractionalInt;
    SkFractionalInt fx;

    {
        const SkBitmapProcStateAutoMapper mapper(s, x, y);
        const SkFixed fy = mapper.fixedY();
        const unsigned maxY = s.fPixmap.height() - 1;
        // compute our two Y values up front
        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
        // now initialize fx
        fx = mapper.fractionalIntX();
    }

#ifdef CHECK_FOR_DECAL
    // test if we don't need to apply the tile proc
    if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
        decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx),
                                SkFractionalIntToFixed(dx), count);
        return;
    }
#endif

    if (count >= 4) {
        // Lanes hold fx, fx+dx, fx+2dx, fx+3dx, truncated to SkFixed.
        int32x4_t wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx));
        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1);
        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2);
        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3);

        // Per-iteration advance of four pixels; invariant, so build it once.
        const SkFractionalInt dx4 = dx+dx+dx+dx;
        const int32x4_t wide_dx4 = vdupq_n_s32(SkFractionalIntToFixed(dx4));

        while (count >= 4) {
            const int32x4_t res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X);

            vst1q_u32(xy, vreinterpretq_u32_s32(res));

            // Intrinsic add rather than the GNU vector-extension '+=', matching
            // the other procs in this file.
            wide_fx = vaddq_s32(wide_fx, wide_dx4);
            // Keep the scalar accumulator in step for the tail loop.
            fx += dx4;
            xy += 4;
            count -= 4;
        }
    }

    // Scalar tail: fewer than 4 pixels left.
    while (--count >= 0) {
        *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X);
        fx += dx;
    }
}
349
// Matrix proc for an affine inverse matrix with filtering.
//
// Emits two 32-bit words per pixel, in Y-then-X order: the packed Y word
// followed by the packed X word.
static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
                               uint32_t xy[], int count, int x, int y) {
    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
                             SkMatrix::kScale_Mask |
                             SkMatrix::kAffine_Mask)) == 0);

    PREAMBLE(s);
    const SkBitmapProcStateAutoMapper mapper(s, x, y);

    SkFixed oneX = s.fFilterOneX;
    SkFixed oneY = s.fFilterOneY;
    SkFixed fx = mapper.fixedX();
    SkFixed fy = mapper.fixedY();
    SkFixed dx = s.fInvSx;
    SkFixed dy = s.fInvKy;
    unsigned maxX = s.fPixmap.width() - 1;
    unsigned maxY = s.fPixmap.height() - 1;

    if (count >= 4) {
        // Lanes hold the coordinates of four consecutive pixels.
        int32x4_t wide_fx = vdupq_n_s32(fx);
        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);

        int32x4_t wide_fy = vdupq_n_s32(fy);
        wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1);
        wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2);
        wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3);

        // Per-iteration advance of four pixels; invariant, so build it once.
        const SkFixed dx4 = dx+dx+dx+dx;
        const SkFixed dy4 = dy+dy+dy+dy;
        const int32x4_t wide_dx4 = vdupq_n_s32(dx4);
        const int32x4_t wide_dy4 = vdupq_n_s32(dy4);

        while (count >= 4) {
            int32x4x2_t vxy;

            // Y goes in val[0] and X in val[1] so the interleaving store
            // below writes Y, X, Y, X, ...
            vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y);
            vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X);

            vst2q_s32((int32_t*)xy, vxy);

            // Prepare the next iteration. Intrinsic adds rather than the GNU
            // vector-extension '+=', matching the other procs in this file.
            wide_fx = vaddq_s32(wide_fx, wide_dx4);
            fx += dx4;
            wide_fy = vaddq_s32(wide_fy, wide_dy4);
            fy += dy4;
            xy += 8; // 4 x's, 4 y's
            count -= 4;
        }
    }

    // Scalar tail: fewer than 4 pixels left.
    while (--count >= 0) {
        // NB: writing Y/X
        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
        fy += dy;
        *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
        fx += dx;
    }
}
410
// Matrix proc for a perspective inverse matrix with filtering.
//
// Pulls batches of interleaved x/y SkFixed pairs from an SkPerspIter, offsets
// each coordinate by half of the matching "one" step, and emits two 32-bit
// words per pixel in Y-then-X order.
static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
                              uint32_t* SK_RESTRICT xy, int count,
                              int x, int y) {
    SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);

    PREAMBLE(s);
    unsigned maxX = s.fPixmap.width() - 1;
    unsigned maxY = s.fPixmap.height() - 1;
    SkFixed oneX = s.fFilterOneX;
    SkFixed oneY = s.fFilterOneY;

    // Half-step offsets subtracted from every source coordinate.
    const SkFixed halfX = oneX >> 1;
    const SkFixed halfY = oneY >> 1;
    const int32x4_t wideHalfX = vdupq_n_s32(halfX);
    const int32x4_t wideHalfY = vdupq_n_s32(halfY);

    SkPerspIter iter(s.fInvMatrix,
                     SkIntToScalar(x) + SK_ScalarHalf,
                     SkIntToScalar(y) + SK_ScalarHalf, count);

    // Each next() call yields the number of pairs available in this batch;
    // zero means the iterator is exhausted.
    while ((count = iter.next()) != 0) {
        const SkFixed* SK_RESTRICT srcXY = iter.getXY();

        // Vector path: 4 pixels (8 source values) per iteration.
        for (; count >= 4; count -= 4) {
            // De-interleaving load: val[0] gets the x's, val[1] the y's.
            const int32x4x2_t src = vld2q_s32(srcXY);

            const int32x4_t wideX = vsubq_s32(src.val[0], wideHalfX);
            const int32x4_t wideY = vsubq_s32(src.val[1], wideHalfY);

            // Read order is x/y, write order is y/x.
            int32x4x2_t packed;
            packed.val[0] = PACK_FILTER_Y4_NAME(wideY, maxY, oneY PREAMBLE_ARG_Y);
            packed.val[1] = PACK_FILTER_X4_NAME(wideX, maxX, oneX PREAMBLE_ARG_X);

            vst2q_s32((int32_t*)xy, packed);

            srcXY += 2*4;
            xy += 2*4;
        }

        // Scalar tail: fewer than 4 pixels left in this batch.
        for (; count > 0; --count) {
            *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - halfY, maxY,
                                       oneY PREAMBLE_ARG_Y);
            *xy++ = PACK_FILTER_X_NAME(srcXY[0] - halfX, maxX,
                                       oneX PREAMBLE_ARG_X);
            srcXY += 2;
        }
    }
}
462
// Table of the six matrix procs defined above, ordered scale, affine,
// perspective, each as the no-filter variant followed by the filter variant.
// NOTE(review): the entry order presumably has to match how callers index this
// table (matrix class and filter flag) - confirm against the code that uses it.
463const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
464    SCALE_NOFILTER_NAME,
465    SCALE_FILTER_NAME,
466    AFFINE_NOFILTER_NAME,
467    AFFINE_FILTER_NAME,
468    PERSP_NOFILTER_NAME,
469    PERSP_FILTER_NAME
470};
471
// Tear down every macro that parameterises this template so the file can be
// included again with a different configuration.

// Tile macros supplied by the includer.
#undef TILEX_PROCF_NEON8
#undef TILEY_PROCF_NEON8
#undef TILEX_PROCF_NEON4
#undef TILEY_PROCF_NEON4
#undef TILEX_LOW_BITS_NEON4
#undef TILEY_LOW_BITS_NEON4

#undef MAKENAME
#undef TILEX_PROCF
#undef TILEY_PROCF
#ifdef CHECK_FOR_DECAL
    #undef CHECK_FOR_DECAL
#endif

// Proc names defined at the top of this file.
#undef SCALE_NOFILTER_NAME
#undef SCALE_FILTER_NAME
#undef AFFINE_NOFILTER_NAME
#undef AFFINE_FILTER_NAME
#undef PERSP_NOFILTER_NAME
#undef PERSP_FILTER_NAME

// Pack-helper names defined at the top of this file; previously left defined.
#undef PACK_FILTER_X_NAME
#undef PACK_FILTER_Y_NAME
#undef PACK_FILTER_X4_NAME
#undef PACK_FILTER_Y4_NAME

#undef PREAMBLE
#undef PREAMBLE_PARAM_X
#undef PREAMBLE_PARAM_Y
#undef PREAMBLE_ARG_X
#undef PREAMBLE_ARG_Y

#undef TILEX_LOW_BITS
#undef TILEY_LOW_BITS
501