// SkBitmapProcState_matrixProcs.cpp revision 4d0078aa5115fab8ccd8ef59519a8937ea4e8854
/* NEON optimized code (C) COPYRIGHT 2009 Motorola
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
6
7#include "SkBitmapProcState.h"
8#include "SkPerspIter.h"
9#include "SkShader.h"
10#include "SkUtils.h"
11
// Zero-filling right shift by 16. The operand is unsigned, so no sign bits
// are shifted in and the caller gets just the upper 16 bits without having
// to mask the result itself.
//
static unsigned SK_USHIFT16(unsigned x) {
    const unsigned kUpperHalfShift = 16;
    return x >> kUpperHalfShift;
}
18
19/*  returns 0...(n-1) given any x (positive or negative).
20
21    As an example, if n (which is always positive) is 5...
22
23          x: -8 -7 -6 -5 -4 -3 -2 -1  0  1  2  3  4  5  6  7  8
24    returns:  2  3  4  0  1  2  3  4  0  1  2  3  4  0  1  2  3
25 */
26static inline int sk_int_mod(int x, int n) {
27    SkASSERT(n > 0);
28    if ((unsigned)x >= (unsigned)n) {
29        if (x < 0) {
30            x = n + ~(~x % n);
31        } else {
32            x = x % n;
33        }
34    }
35    return x;
36}
37
38/*
39 *  The decal_ functions require that
40 *  1. dx > 0
41 *  2. [fx, fx+dx, fx+2dx, fx+3dx, ... fx+(count-1)dx] are all <= maxX
42 *
43 *  In addition, we use SkFractionalInt to keep more fractional precision than
44 *  just SkFixed, so we will abort the decal_ call if dx is very small, since
45 *  the decal_ function just operates on SkFixed. If that were changed, we could
46 *  skip the very_small test here.
47 */
48static inline bool can_truncate_to_fixed_for_decal(SkFractionalInt frX,
49                                                   SkFractionalInt frDx,
50                                                   int count, unsigned max) {
51    SkFixed dx = SkFractionalIntToFixed(frDx);
52
53    // if decal_ kept SkFractionalInt precision, this would just be dx <= 0
54    // I just made up the 1/256. Just don't want to perceive accumulated error
55    // if we truncate frDx and lose its low bits.
56    if (dx <= SK_Fixed1 / 256) {
57        return false;
58    }
59
60    // We cast to unsigned so we don't have to check for negative values, which
61    // will now appear as very large positive values, and thus fail our test!
62    SkFixed fx = SkFractionalIntToFixed(frX);
63    return (unsigned)SkFixedFloorToInt(fx) <= max &&
64           (unsigned)SkFixedFloorToInt(fx + dx * (count - 1)) < max;
65}
66
67void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
68void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
69
70#define MAKENAME(suffix)        ClampX_ClampY ## suffix
71#define TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max)
72#define TILEY_PROCF(fy, max)    SkClampMax((fy) >> 16, max)
73#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
74#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
75#define CHECK_FOR_DECAL
76#if	defined(__ARM_HAVE_NEON)
77    #include "SkBitmapProcState_matrix_clamp.h"
78#else
79    #include "SkBitmapProcState_matrix.h"
80#endif
81
82#define MAKENAME(suffix)        RepeatX_RepeatY ## suffix
83#define TILEX_PROCF(fx, max)    SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1))
84#define TILEY_PROCF(fy, max)    SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1))
85#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
86#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
87#if	defined(__ARM_HAVE_NEON)
88    #include "SkBitmapProcState_matrix_repeat.h"
89#else
90    #include "SkBitmapProcState_matrix.h"
91#endif
92
93#define MAKENAME(suffix)        GeneralXY ## suffix
94#define PREAMBLE(state)         SkBitmapProcState::FixedTileProc tileProcX = (state).fTileProcX; \
95                                SkBitmapProcState::FixedTileProc tileProcY = (state).fTileProcY; \
96                                SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcX = (state).fTileLowBitsProcX; \
97                                SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcY = (state).fTileLowBitsProcY
98#define PREAMBLE_PARAM_X        , SkBitmapProcState::FixedTileProc tileProcX, SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcX
99#define PREAMBLE_PARAM_Y        , SkBitmapProcState::FixedTileProc tileProcY, SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcY
100#define PREAMBLE_ARG_X          , tileProcX, tileLowBitsProcX
101#define PREAMBLE_ARG_Y          , tileProcY, tileLowBitsProcY
102#define TILEX_PROCF(fx, max)    SK_USHIFT16(tileProcX(fx) * ((max) + 1))
103#define TILEY_PROCF(fy, max)    SK_USHIFT16(tileProcY(fy) * ((max) + 1))
104#define TILEX_LOW_BITS(fx, max) tileLowBitsProcX(fx, (max) + 1)
105#define TILEY_LOW_BITS(fy, max) tileLowBitsProcY(fy, (max) + 1)
106#include "SkBitmapProcState_matrix.h"
107
108static inline U16CPU fixed_clamp(SkFixed x)
109{
110#ifdef SK_CPU_HAS_CONDITIONAL_INSTR
111    if (x >> 16)
112        x = 0xFFFF;
113    if (x < 0)
114        x = 0;
115#else
116    if (x >> 16)
117    {
118        if (x < 0)
119            x = 0;
120        else
121            x = 0xFFFF;
122    }
123#endif
124    return x;
125}
126
127static inline U16CPU fixed_repeat(SkFixed x)
128{
129    return x & 0xFFFF;
130}
131
132// Visual Studio 2010 (MSC_VER=1600) optimizes bit-shift code incorrectly.
133// See http://code.google.com/p/skia/issues/detail?id=472
134#if defined(_MSC_VER) && (_MSC_VER >= 1600)
135#pragma optimize("", off)
136#endif
137
138static inline U16CPU fixed_mirror(SkFixed x)
139{
140    SkFixed s = x << 15 >> 31;
141    // s is FFFFFFFF if we're on an odd interval, or 0 if an even interval
142    return (x ^ s) & 0xFFFF;
143}
144
145#if defined(_MSC_VER) && (_MSC_VER >= 1600)
146#pragma optimize("", on)
147#endif
148
149static SkBitmapProcState::FixedTileProc choose_tile_proc(unsigned m)
150{
151    if (SkShader::kClamp_TileMode == m)
152        return fixed_clamp;
153    if (SkShader::kRepeat_TileMode == m)
154        return fixed_repeat;
155    SkASSERT(SkShader::kMirror_TileMode == m);
156    return fixed_mirror;
157}
158
159static inline U16CPU fixed_clamp_lowbits(SkFixed x, int) {
160    return (x >> 12) & 0xF;
161}
162
163static inline U16CPU fixed_repeat_or_mirrow_lowbits(SkFixed x, int scale) {
164    return ((x * scale) >> 12) & 0xF;
165}
166
167static SkBitmapProcState::FixedTileLowBitsProc choose_tile_lowbits_proc(unsigned m) {
168    if (SkShader::kClamp_TileMode == m) {
169        return fixed_clamp_lowbits;
170    } else {
171        SkASSERT(SkShader::kMirror_TileMode == m ||
172                 SkShader::kRepeat_TileMode == m);
173        // mirror and repeat have the same behavior for the low bits.
174        return fixed_repeat_or_mirrow_lowbits;
175    }
176}
177
178static inline U16CPU int_clamp(int x, int n) {
179#ifdef SK_CPU_HAS_CONDITIONAL_INSTR
180    if (x >= n)
181        x = n - 1;
182    if (x < 0)
183        x = 0;
184#else
185    if ((unsigned)x >= (unsigned)n) {
186        if (x < 0) {
187            x = 0;
188        } else {
189            x = n - 1;
190        }
191    }
192#endif
193    return x;
194}
195
196static inline U16CPU int_repeat(int x, int n) {
197    return sk_int_mod(x, n);
198}
199
200static inline U16CPU int_mirror(int x, int n) {
201    x = sk_int_mod(x, 2 * n);
202    if (x >= n) {
203        x = n + ~(x - n);
204    }
205    return x;
206}
207
#if 0
// Disabled debugging aid: prints int_mirror(i, 3) for every i in [-8, 8].
static void test_int_tileprocs() {
    int i = -8;
    while (i <= 8) {
        SkDebugf(" int_mirror(%2d, 3) = %d\n", i, int_mirror(i, 3));
        ++i;
    }
}
#endif
215
216static SkBitmapProcState::IntTileProc choose_int_tile_proc(unsigned tm) {
217    if (SkShader::kClamp_TileMode == tm)
218        return int_clamp;
219    if (SkShader::kRepeat_TileMode == tm)
220        return int_repeat;
221    SkASSERT(SkShader::kMirror_TileMode == tm);
222    return int_mirror;
223}
224
225//////////////////////////////////////////////////////////////////////////////
226
227void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
228{
229    int i;
230
231#if	defined(__ARM_HAVE_NEON)
232    if (count >= 8) {
233        /* SkFixed is 16.16 fixed point */
234        SkFixed dx2 = dx+dx;
235        SkFixed dx4 = dx2+dx2;
236        SkFixed dx8 = dx4+dx4;
237
238        /* now build fx/fx+dx/fx+2dx/fx+3dx */
239        SkFixed fx1, fx2, fx3;
240        int32x2_t lower, upper;
241        int32x4_t lbase, hbase;
242        uint16_t *dst16 = (uint16_t *)dst;
243
244        fx1 = fx+dx;
245        fx2 = fx1+dx;
246        fx3 = fx2+dx;
247
248        /* avoid an 'lbase unitialized' warning */
249        lbase = vdupq_n_s32(fx);
250        lbase = vsetq_lane_s32(fx1, lbase, 1);
251        lbase = vsetq_lane_s32(fx2, lbase, 2);
252        lbase = vsetq_lane_s32(fx3, lbase, 3);
253        hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
254
255        /* take upper 16 of each, store, and bump everything */
256        do {
257            int32x4_t lout, hout;
258            uint16x8_t hi16;
259
260            lout = lbase;
261            hout = hbase;
262            /* gets hi's of all louts then hi's of all houts */
263            asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
264            hi16 = vreinterpretq_u16_s32(hout);
265            vst1q_u16(dst16, hi16);
266
267            /* on to the next */
268            lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
269            hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
270            dst16 += 8;
271            count -= 8;
272            fx += dx8;
273        } while (count >= 8);
274        dst = (uint32_t *) dst16;
275    }
276#else
277    for (i = (count >> 2); i > 0; --i)
278    {
279        *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
280        fx += dx+dx;
281        *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
282        fx += dx+dx;
283    }
284    count &= 3;
285#endif
286
287    uint16_t* xx = (uint16_t*)dst;
288    for (i = count; i > 0; --i) {
289        *xx++ = SkToU16(fx >> 16); fx += dx;
290    }
291}
292
293void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
294{
295
296#if	defined(__ARM_HAVE_NEON)
297    if (count >= 8) {
298        int32x4_t wide_fx;
299        int32x4_t wide_fx2;
300        int32x4_t wide_dx8 = vdupq_n_s32(dx*8);
301
302        wide_fx = vdupq_n_s32(fx);
303        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
304        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
305        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
306
307        wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx));
308
309        while (count >= 8) {
310            int32x4_t wide_out;
311            int32x4_t wide_out2;
312
313            wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
314            wide_out = vorrq_s32(wide_out,
315            vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1)));
316
317            wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
318            wide_out2 = vorrq_s32(wide_out2,
319            vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1)));
320
321            vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
322            vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));
323
324            dst += 8;
325            fx += dx*8;
326            wide_fx = vaddq_s32(wide_fx, wide_dx8);
327            wide_fx2 = vaddq_s32(wide_fx2, wide_dx8);
328            count -= 8;
329        }
330    }
331#endif
332
333    if (count & 1)
334    {
335        SkASSERT((fx >> (16 + 14)) == 0);
336        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
337        fx += dx;
338    }
339    while ((count -= 2) >= 0)
340    {
341        SkASSERT((fx >> (16 + 14)) == 0);
342        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
343        fx += dx;
344
345        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
346        fx += dx;
347    }
348}
349
///////////////////////////////////////////////////////////////////////////////
// The translate-only procs below store the same data as the SCALE procs, but
// are cheaper to compute. Also, since there is no scale, we don't need (or
// have) a FILTER version.
353
354static void fill_sequential(uint16_t xptr[], int start, int count) {
355#if 1
356    if (reinterpret_cast<intptr_t>(xptr) & 0x2) {
357        *xptr++ = start++;
358        count -= 1;
359    }
360    if (count > 3) {
361        uint32_t* xxptr = reinterpret_cast<uint32_t*>(xptr);
362        uint32_t pattern0 = PACK_TWO_SHORTS(start + 0, start + 1);
363        uint32_t pattern1 = PACK_TWO_SHORTS(start + 2, start + 3);
364        start += count & ~3;
365        int qcount = count >> 2;
366        do {
367            *xxptr++ = pattern0;
368            pattern0 += 0x40004;
369            *xxptr++ = pattern1;
370            pattern1 += 0x40004;
371        } while (--qcount != 0);
372        xptr = reinterpret_cast<uint16_t*>(xxptr);
373        count &= 3;
374    }
375    while (--count >= 0) {
376        *xptr++ = start++;
377    }
378#else
379    for (int i = 0; i < count; i++) {
380        *xptr++ = start++;
381    }
382#endif
383}
384
385static int nofilter_trans_preamble(const SkBitmapProcState& s, uint32_t** xy,
386                                   int x, int y) {
387    SkPoint pt;
388    s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
389               SkIntToScalar(y) + SK_ScalarHalf, &pt);
390    **xy = s.fIntTileProcY(SkScalarToFixed(pt.fY) >> 16,
391                           s.fBitmap->height());
392    *xy += 1;   // bump the ptr
393    // return our starting X position
394    return SkScalarToFixed(pt.fX) >> 16;
395}
396
397static void clampx_nofilter_trans(const SkBitmapProcState& s,
398                                  uint32_t xy[], int count, int x, int y) {
399    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
400
401    int xpos = nofilter_trans_preamble(s, &xy, x, y);
402    const int width = s.fBitmap->width();
403    if (1 == width) {
404        // all of the following X values must be 0
405        memset(xy, 0, count * sizeof(uint16_t));
406        return;
407    }
408
409    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
410    int n;
411
412    // fill before 0 as needed
413    if (xpos < 0) {
414        n = -xpos;
415        if (n > count) {
416            n = count;
417        }
418        memset(xptr, 0, n * sizeof(uint16_t));
419        count -= n;
420        if (0 == count) {
421            return;
422        }
423        xptr += n;
424        xpos = 0;
425    }
426
427    // fill in 0..width-1 if needed
428    if (xpos < width) {
429        n = width - xpos;
430        if (n > count) {
431            n = count;
432        }
433        fill_sequential(xptr, xpos, n);
434        count -= n;
435        if (0 == count) {
436            return;
437        }
438        xptr += n;
439    }
440
441    // fill the remaining with the max value
442    sk_memset16(xptr, width - 1, count);
443}
444
445static void repeatx_nofilter_trans(const SkBitmapProcState& s,
446                                   uint32_t xy[], int count, int x, int y) {
447    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
448
449    int xpos = nofilter_trans_preamble(s, &xy, x, y);
450    const int width = s.fBitmap->width();
451    if (1 == width) {
452        // all of the following X values must be 0
453        memset(xy, 0, count * sizeof(uint16_t));
454        return;
455    }
456
457    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
458    int start = sk_int_mod(xpos, width);
459    int n = width - start;
460    if (n > count) {
461        n = count;
462    }
463    fill_sequential(xptr, start, n);
464    xptr += n;
465    count -= n;
466
467    while (count >= width) {
468        fill_sequential(xptr, 0, width);
469        xptr += width;
470        count -= width;
471    }
472
473    if (count > 0) {
474        fill_sequential(xptr, 0, count);
475    }
476}
477
478static void fill_backwards(uint16_t xptr[], int pos, int count) {
479    for (int i = 0; i < count; i++) {
480        SkASSERT(pos >= 0);
481        xptr[i] = pos--;
482    }
483}
484
485static void mirrorx_nofilter_trans(const SkBitmapProcState& s,
486                                   uint32_t xy[], int count, int x, int y) {
487    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
488
489    int xpos = nofilter_trans_preamble(s, &xy, x, y);
490    const int width = s.fBitmap->width();
491    if (1 == width) {
492        // all of the following X values must be 0
493        memset(xy, 0, count * sizeof(uint16_t));
494        return;
495    }
496
497    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
498    // need to know our start, and our initial phase (forward or backward)
499    bool forward;
500    int n;
501    int start = sk_int_mod(xpos, 2 * width);
502    if (start >= width) {
503        start = width + ~(start - width);
504        forward = false;
505        n = start + 1;  // [start .. 0]
506    } else {
507        forward = true;
508        n = width - start;  // [start .. width)
509    }
510    if (n > count) {
511        n = count;
512    }
513    if (forward) {
514        fill_sequential(xptr, start, n);
515    } else {
516        fill_backwards(xptr, start, n);
517    }
518    forward = !forward;
519    xptr += n;
520    count -= n;
521
522    while (count >= width) {
523        if (forward) {
524            fill_sequential(xptr, 0, width);
525        } else {
526            fill_backwards(xptr, width - 1, width);
527        }
528        forward = !forward;
529        xptr += width;
530        count -= width;
531    }
532
533    if (count > 0) {
534        if (forward) {
535            fill_sequential(xptr, 0, count);
536        } else {
537            fill_backwards(xptr, width - 1, count);
538        }
539    }
540}
541
542///////////////////////////////////////////////////////////////////////////////
543
544SkBitmapProcState::MatrixProc
545SkBitmapProcState::chooseMatrixProc(bool trivial_matrix) {
546//    test_int_tileprocs();
547    // check for our special case when there is no scale/affine/perspective
548    if (trivial_matrix) {
549        SkASSERT(!fDoFilter);
550        fIntTileProcY = choose_int_tile_proc(fTileModeY);
551        switch (fTileModeX) {
552            case SkShader::kClamp_TileMode:
553                return clampx_nofilter_trans;
554            case SkShader::kRepeat_TileMode:
555                return repeatx_nofilter_trans;
556            case SkShader::kMirror_TileMode:
557                return mirrorx_nofilter_trans;
558        }
559    }
560
561    int index = 0;
562    if (fDoFilter) {
563        index = 1;
564    }
565    if (fInvType & SkMatrix::kPerspective_Mask) {
566        index += 4;
567    } else if (fInvType & SkMatrix::kAffine_Mask) {
568        index += 2;
569    }
570
571    if (SkShader::kClamp_TileMode == fTileModeX &&
572        SkShader::kClamp_TileMode == fTileModeY)
573    {
574        // clamp gets special version of filterOne
575        fFilterOneX = SK_Fixed1;
576        fFilterOneY = SK_Fixed1;
577        return ClampX_ClampY_Procs[index];
578    }
579
580    // all remaining procs use this form for filterOne
581    fFilterOneX = SK_Fixed1 / fBitmap->width();
582    fFilterOneY = SK_Fixed1 / fBitmap->height();
583
584    if (SkShader::kRepeat_TileMode == fTileModeX &&
585        SkShader::kRepeat_TileMode == fTileModeY)
586    {
587        return RepeatX_RepeatY_Procs[index];
588    }
589
590    fTileProcX = choose_tile_proc(fTileModeX);
591    fTileProcY = choose_tile_proc(fTileModeY);
592    fTileLowBitsProcX = choose_tile_lowbits_proc(fTileModeX);
593    fTileLowBitsProcY = choose_tile_lowbits_proc(fTileModeY);
594    return GeneralXY_Procs[index];
595}
596
597