SkBitmapProcState_matrixProcs.cpp revision 99c114e0ac732ba01705e24d12f5e4dd7e144abd
1/* NEON optimized code (C) COPYRIGHT 2009 Motorola
2 *
3 * Use of this source code is governed by a BSD-style license that can be
4 * found in the LICENSE file.
5 */
6
7#include "SkBitmapProcState.h"
8#include "SkPerspIter.h"
9#include "SkShader.h"
10#include "SkUtils.h"
11
// Logical (zero-filling) right shift by 16. Because the argument is taken as
// unsigned, no sign bits are dragged into the result, so callers never have
// to mask off the upper 16 bits themselves.
static unsigned SK_USHIFT16(unsigned x) {
    const unsigned upperHalf = x >> 16;
    return upperHalf;
}
18
19/*  returns 0...(n-1) given any x (positive or negative).
20
21    As an example, if n (which is always positive) is 5...
22
23          x: -8 -7 -6 -5 -4 -3 -2 -1  0  1  2  3  4  5  6  7  8
24    returns:  2  3  4  0  1  2  3  4  0  1  2  3  4  0  1  2  3
25 */
26static inline int sk_int_mod(int x, int n) {
27    SkASSERT(n > 0);
28    if ((unsigned)x >= (unsigned)n) {
29        if (x < 0) {
30            x = n + ~(~x % n);
31        } else {
32            x = x % n;
33        }
34    }
35    return x;
36}
37
// Forward declarations: both functions are defined further down in this file
// but are declared here, ahead of the matrix-proc headers included below.
38void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
39void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
40
// Clamp/clamp instantiation of the matrix procs. The macros below configure
// the header included at the end of this group; MAKENAME prefixes every
// generated name with ClampX_ClampY (chooseMatrixProc later indexes
// ClampX_ClampY_Procs, presumably emitted by that header -- confirm there).
41#define MAKENAME(suffix)        ClampX_ClampY ## suffix
// Integer tile coordinate: upper 16 bits of the fixed-point value, passed
// through SkClampMax with 'max' (presumably pins to [0, max] -- see SkClampMax).
42#define TILEX_PROCF(fx, max)    SkClampMax(SK_USHIFT16(fx), max)
43#define TILEY_PROCF(fy, max)    SkClampMax(SK_USHIFT16(fy), max)
// Bits 12..15 of the fixed-point value, i.e. the top 4 bits of its fraction.
44#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
45#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
// NOTE(review): presumably switches on the decal fast path (the two decal_*
// functions declared above) inside the included header -- confirm there.
46#define CHECK_FOR_DECAL
// NEON builds pull in a clamp-specific header; everything else uses the
// generic one.
47#if	defined(__ARM_HAVE_NEON)
48    #include "SkBitmapProcState_matrix_clamp.h"
49#else
50    #include "SkBitmapProcState_matrix.h"
51#endif
52
// Repeat/repeat instantiation of the matrix procs; generated names are
// prefixed with RepeatX_RepeatY.
// NOTE(review): these macros are redefined without a visible #undef, so the
// previously included header presumably #undef's them -- confirm there.
53#define MAKENAME(suffix)        RepeatX_RepeatY ## suffix
// Integer tile coordinate: keep only the 16 fractional bits of the value,
// scale them by (max + 1) and take the upper 16 bits of the product.
54#define TILEX_PROCF(fx, max)    SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1))
55#define TILEY_PROCF(fy, max)    SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1))
// Bits 12..15 of that same scaled product.
56#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
57#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
// NEON builds pull in a repeat-specific header; everything else uses the
// generic one.
58#if	defined(__ARM_HAVE_NEON)
59    #include "SkBitmapProcState_matrix_repeat.h"
60#else
61    #include "SkBitmapProcState_matrix.h"
62#endif
63
// General instantiation of the matrix procs for any mix of tile modes;
// generated names are prefixed with GeneralXY. Tiling goes through function
// pointers stored on the SkBitmapProcState instead of being hard-coded.
64#define MAKENAME(suffix)        GeneralXY ## suffix
// PREAMBLE copies the four tile procs out of the state into locals.
65#define PREAMBLE(state)         SkBitmapProcState::FixedTileProc tileProcX = (state).fTileProcX; \
66                                SkBitmapProcState::FixedTileProc tileProcY = (state).fTileProcY; \
67                                SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcX = (state).fTileLowBitsProcX; \
68                                SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcY = (state).fTileLowBitsProcY
// Extra parameter / argument lists carrying those procs.
// NOTE(review): presumably spliced into helper signatures and calls by the
// included header -- confirm there.
69#define PREAMBLE_PARAM_X        , SkBitmapProcState::FixedTileProc tileProcX, SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcX
70#define PREAMBLE_PARAM_Y        , SkBitmapProcState::FixedTileProc tileProcY, SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcY
71#define PREAMBLE_ARG_X          , tileProcX, tileLowBitsProcX
72#define PREAMBLE_ARG_Y          , tileProcY, tileLowBitsProcY
// Integer tile coordinate: the tile proc's result scaled by (max + 1), upper
// 16 bits of the product.
73#define TILEX_PROCF(fx, max)    SK_USHIFT16(tileProcX(fx) * ((max) + 1))
74#define TILEY_PROCF(fy, max)    SK_USHIFT16(tileProcY(fy) * ((max) + 1))
// Low bits are delegated to the low-bits proc, which receives (max + 1) as
// its scale argument.
75#define TILEX_LOW_BITS(fx, max) tileLowBitsProcX(fx, (max) + 1)
76#define TILEY_LOW_BITS(fy, max) tileLowBitsProcY(fy, (max) + 1)
77#include "SkBitmapProcState_matrix.h"
78
79static inline U16CPU fixed_clamp(SkFixed x)
80{
81#ifdef SK_CPU_HAS_CONDITIONAL_INSTR
82    if (x >> 16)
83        x = 0xFFFF;
84    if (x < 0)
85        x = 0;
86#else
87    if (x >> 16)
88    {
89        if (x < 0)
90            x = 0;
91        else
92            x = 0xFFFF;
93    }
94#endif
95    return x;
96}
97
98static inline U16CPU fixed_repeat(SkFixed x)
99{
100    return x & 0xFFFF;
101}
102
103// Visual Studio 2010 (MSC_VER=1600) optimizes bit-shift code incorrectly.
104// See http://code.google.com/p/skia/issues/detail?id=472
105#if defined(_MSC_VER) && (_MSC_VER >= 1600)
106#pragma optimize("", off)
107#endif
108
109static inline U16CPU fixed_mirror(SkFixed x)
110{
111    SkFixed s = x << 15 >> 31;
112    // s is FFFFFFFF if we're on an odd interval, or 0 if an even interval
113    return (x ^ s) & 0xFFFF;
114}
115
116#if defined(_MSC_VER) && (_MSC_VER >= 1600)
117#pragma optimize("", on)
118#endif
119
120static SkBitmapProcState::FixedTileProc choose_tile_proc(unsigned m)
121{
122    if (SkShader::kClamp_TileMode == m)
123        return fixed_clamp;
124    if (SkShader::kRepeat_TileMode == m)
125        return fixed_repeat;
126    SkASSERT(SkShader::kMirror_TileMode == m);
127    return fixed_mirror;
128}
129
130static inline U16CPU fixed_clamp_lowbits(SkFixed x, int) {
131    return (x >> 12) & 0xF;
132}
133
134static inline U16CPU fixed_repeat_or_mirrow_lowbits(SkFixed x, int scale) {
135    return ((x * scale) >> 12) & 0xF;
136}
137
138static SkBitmapProcState::FixedTileLowBitsProc choose_tile_lowbits_proc(unsigned m) {
139    if (SkShader::kClamp_TileMode == m) {
140        return fixed_clamp_lowbits;
141    } else {
142        SkASSERT(SkShader::kMirror_TileMode == m ||
143                 SkShader::kRepeat_TileMode == m);
144        // mirror and repeat have the same behavior for the low bits.
145        return fixed_repeat_or_mirrow_lowbits;
146    }
147}
148
149static inline U16CPU int_clamp(int x, int n) {
150#ifdef SK_CPU_HAS_CONDITIONAL_INSTR
151    if (x >= n)
152        x = n - 1;
153    if (x < 0)
154        x = 0;
155#else
156    if ((unsigned)x >= (unsigned)n) {
157        if (x < 0) {
158            x = 0;
159        } else {
160            x = n - 1;
161        }
162    }
163#endif
164    return x;
165}
166
167static inline U16CPU int_repeat(int x, int n) {
168    return sk_int_mod(x, n);
169}
170
171static inline U16CPU int_mirror(int x, int n) {
172    x = sk_int_mod(x, 2 * n);
173    if (x >= n) {
174        x = n + ~(x - n);
175    }
176    return x;
177}
178
#if 0
// Disabled debugging aid: prints int_mirror(i, 3) for i in -8..8.
static void test_int_tileprocs() {
    int i = -8;
    while (i <= 8) {
        SkDebugf(" int_mirror(%2d, 3) = %d\n", i, int_mirror(i, 3));
        i++;
    }
}
#endif
186
187static SkBitmapProcState::IntTileProc choose_int_tile_proc(unsigned tm) {
188    if (SkShader::kClamp_TileMode == tm)
189        return int_clamp;
190    if (SkShader::kRepeat_TileMode == tm)
191        return int_repeat;
192    SkASSERT(SkShader::kMirror_TileMode == tm);
193    return int_mirror;
194}
195
196//////////////////////////////////////////////////////////////////////////////
197
198void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
199{
200    int i;
201
202#if	defined(__ARM_HAVE_NEON)
203    if (count >= 8) {
204        /* SkFixed is 16.16 fixed point */
205        SkFixed dx2 = dx+dx;
206        SkFixed dx4 = dx2+dx2;
207        SkFixed dx8 = dx4+dx4;
208
209        /* now build fx/fx+dx/fx+2dx/fx+3dx */
210        SkFixed fx1, fx2, fx3;
211        int32x2_t lower, upper;
212        int32x4_t lbase, hbase;
213        uint16_t *dst16 = (uint16_t *)dst;
214
215        fx1 = fx+dx;
216        fx2 = fx1+dx;
217        fx3 = fx2+dx;
218
219        /* avoid an 'lbase unitialized' warning */
220        lbase = vdupq_n_s32(fx);
221        lbase = vsetq_lane_s32(fx1, lbase, 1);
222        lbase = vsetq_lane_s32(fx2, lbase, 2);
223        lbase = vsetq_lane_s32(fx3, lbase, 3);
224        hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
225
226        /* take upper 16 of each, store, and bump everything */
227        do {
228            int32x4_t lout, hout;
229            uint16x8_t hi16;
230
231            lout = lbase;
232            hout = hbase;
233            /* gets hi's of all louts then hi's of all houts */
234            asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
235            hi16 = vreinterpretq_u16_s32(hout);
236            vst1q_u16(dst16, hi16);
237
238            /* on to the next */
239            lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
240            hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
241            dst16 += 8;
242            count -= 8;
243            fx += dx8;
244        } while (count >= 8);
245        dst = (uint32_t *) dst16;
246    }
247#else
248    for (i = (count >> 2); i > 0; --i)
249    {
250        *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
251        fx += dx+dx;
252        *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
253        fx += dx+dx;
254    }
255    count &= 3;
256#endif
257
258    uint16_t* xx = (uint16_t*)dst;
259    for (i = count; i > 0; --i) {
260        *xx++ = SkToU16(fx >> 16); fx += dx;
261    }
262}
263
264void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
265{
266
267#if	defined(__ARM_HAVE_NEON)
268    if (count >= 8) {
269        int32x4_t wide_fx;
270        int32x4_t wide_fx2;
271        int32x4_t wide_dx8 = vdupq_n_s32(dx*8);
272
273        wide_fx = vdupq_n_s32(fx);
274        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
275        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
276        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
277
278        wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx));
279
280        while (count >= 8) {
281            int32x4_t wide_out;
282            int32x4_t wide_out2;
283
284            wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
285            wide_out = vorrq_s32(wide_out,
286            vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1)));
287
288            wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
289            wide_out2 = vorrq_s32(wide_out2,
290            vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1)));
291
292            vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
293            vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));
294
295            dst += 8;
296            fx += dx*8;
297            wide_fx = vaddq_s32(wide_fx, wide_dx8);
298            wide_fx2 = vaddq_s32(wide_fx2, wide_dx8);
299            count -= 8;
300        }
301    }
302#endif
303
304    if (count & 1)
305    {
306        SkASSERT((fx >> (16 + 14)) == 0);
307        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
308        fx += dx;
309    }
310    while ((count -= 2) >= 0)
311    {
312        SkASSERT((fx >> (16 + 14)) == 0);
313        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
314        fx += dx;
315
316        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
317        fx += dx;
318    }
319}
320
321///////////////////////////////////////////////////////////////////////////////
322// stores the same as SCALE, but is cheaper to compute. Also since there is no
323// scale, we don't need/have a FILTER version
324
325static void fill_sequential(uint16_t xptr[], int start, int count) {
326#if 1
327    if (reinterpret_cast<intptr_t>(xptr) & 0x2) {
328        *xptr++ = start++;
329        count -= 1;
330    }
331    if (count > 3) {
332        uint32_t* xxptr = reinterpret_cast<uint32_t*>(xptr);
333        uint32_t pattern0 = PACK_TWO_SHORTS(start + 0, start + 1);
334        uint32_t pattern1 = PACK_TWO_SHORTS(start + 2, start + 3);
335        start += count & ~3;
336        int qcount = count >> 2;
337        do {
338            *xxptr++ = pattern0;
339            pattern0 += 0x40004;
340            *xxptr++ = pattern1;
341            pattern1 += 0x40004;
342        } while (--qcount != 0);
343        xptr = reinterpret_cast<uint16_t*>(xxptr);
344        count &= 3;
345    }
346    while (--count >= 0) {
347        *xptr++ = start++;
348    }
349#else
350    for (int i = 0; i < count; i++) {
351        *xptr++ = start++;
352    }
353#endif
354}
355
356static int nofilter_trans_preamble(const SkBitmapProcState& s, uint32_t** xy,
357                                   int x, int y) {
358    SkPoint pt;
359    s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
360               SkIntToScalar(y) + SK_ScalarHalf, &pt);
361    **xy = s.fIntTileProcY(SkScalarToFixed(pt.fY) >> 16,
362                           s.fBitmap->height());
363    *xy += 1;   // bump the ptr
364    // return our starting X position
365    return SkScalarToFixed(pt.fX) >> 16;
366}
367
368static void clampx_nofilter_trans(const SkBitmapProcState& s,
369                                  uint32_t xy[], int count, int x, int y) {
370    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
371
372    int xpos = nofilter_trans_preamble(s, &xy, x, y);
373    const int width = s.fBitmap->width();
374    if (1 == width) {
375        // all of the following X values must be 0
376        memset(xy, 0, count * sizeof(uint16_t));
377        return;
378    }
379
380    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
381    int n;
382
383    // fill before 0 as needed
384    if (xpos < 0) {
385        n = -xpos;
386        if (n > count) {
387            n = count;
388        }
389        memset(xptr, 0, n * sizeof(uint16_t));
390        count -= n;
391        if (0 == count) {
392            return;
393        }
394        xptr += n;
395        xpos = 0;
396    }
397
398    // fill in 0..width-1 if needed
399    if (xpos < width) {
400        n = width - xpos;
401        if (n > count) {
402            n = count;
403        }
404        fill_sequential(xptr, xpos, n);
405        count -= n;
406        if (0 == count) {
407            return;
408        }
409        xptr += n;
410    }
411
412    // fill the remaining with the max value
413    sk_memset16(xptr, width - 1, count);
414}
415
416static void repeatx_nofilter_trans(const SkBitmapProcState& s,
417                                   uint32_t xy[], int count, int x, int y) {
418    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
419
420    int xpos = nofilter_trans_preamble(s, &xy, x, y);
421    const int width = s.fBitmap->width();
422    if (1 == width) {
423        // all of the following X values must be 0
424        memset(xy, 0, count * sizeof(uint16_t));
425        return;
426    }
427
428    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
429    int start = sk_int_mod(xpos, width);
430    int n = width - start;
431    if (n > count) {
432        n = count;
433    }
434    fill_sequential(xptr, start, n);
435    xptr += n;
436    count -= n;
437
438    while (count >= width) {
439        fill_sequential(xptr, 0, width);
440        xptr += width;
441        count -= width;
442    }
443
444    if (count > 0) {
445        fill_sequential(xptr, 0, count);
446    }
447}
448
449static void fill_backwards(uint16_t xptr[], int pos, int count) {
450    for (int i = 0; i < count; i++) {
451        SkASSERT(pos >= 0);
452        xptr[i] = pos--;
453    }
454}
455
456static void mirrorx_nofilter_trans(const SkBitmapProcState& s,
457                                   uint32_t xy[], int count, int x, int y) {
458    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
459
460    int xpos = nofilter_trans_preamble(s, &xy, x, y);
461    const int width = s.fBitmap->width();
462    if (1 == width) {
463        // all of the following X values must be 0
464        memset(xy, 0, count * sizeof(uint16_t));
465        return;
466    }
467
468    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
469    // need to know our start, and our initial phase (forward or backward)
470    bool forward;
471    int n;
472    int start = sk_int_mod(xpos, 2 * width);
473    if (start >= width) {
474        start = width + ~(start - width);
475        forward = false;
476        n = start + 1;  // [start .. 0]
477    } else {
478        forward = true;
479        n = width - start;  // [start .. width)
480    }
481    if (n > count) {
482        n = count;
483    }
484    if (forward) {
485        fill_sequential(xptr, start, n);
486    } else {
487        fill_backwards(xptr, start, n);
488    }
489    forward = !forward;
490    xptr += n;
491    count -= n;
492
493    while (count >= width) {
494        if (forward) {
495            fill_sequential(xptr, 0, width);
496        } else {
497            fill_backwards(xptr, width - 1, width);
498        }
499        forward = !forward;
500        xptr += width;
501        count -= width;
502    }
503
504    if (count > 0) {
505        if (forward) {
506            fill_sequential(xptr, 0, count);
507        } else {
508            fill_backwards(xptr, width - 1, count);
509        }
510    }
511}
512
513///////////////////////////////////////////////////////////////////////////////
514
515SkBitmapProcState::MatrixProc
516SkBitmapProcState::chooseMatrixProc(bool trivial_matrix) {
517//    test_int_tileprocs();
518    // check for our special case when there is no scale/affine/perspective
519    if (trivial_matrix) {
520        SkASSERT(!fDoFilter);
521        fIntTileProcY = choose_int_tile_proc(fTileModeY);
522        switch (fTileModeX) {
523            case SkShader::kClamp_TileMode:
524                return clampx_nofilter_trans;
525            case SkShader::kRepeat_TileMode:
526                return repeatx_nofilter_trans;
527            case SkShader::kMirror_TileMode:
528                return mirrorx_nofilter_trans;
529        }
530    }
531
532    int index = 0;
533    if (fDoFilter) {
534        index = 1;
535    }
536    if (fInvType & SkMatrix::kPerspective_Mask) {
537        index += 4;
538    } else if (fInvType & SkMatrix::kAffine_Mask) {
539        index += 2;
540    }
541
542    if (SkShader::kClamp_TileMode == fTileModeX &&
543        SkShader::kClamp_TileMode == fTileModeY)
544    {
545        // clamp gets special version of filterOne
546        fFilterOneX = SK_Fixed1;
547        fFilterOneY = SK_Fixed1;
548        return ClampX_ClampY_Procs[index];
549    }
550
551    // all remaining procs use this form for filterOne
552    fFilterOneX = SK_Fixed1 / fBitmap->width();
553    fFilterOneY = SK_Fixed1 / fBitmap->height();
554
555    if (SkShader::kRepeat_TileMode == fTileModeX &&
556        SkShader::kRepeat_TileMode == fTileModeY)
557    {
558        return RepeatX_RepeatY_Procs[index];
559    }
560
561    fTileProcX = choose_tile_proc(fTileModeX);
562    fTileProcY = choose_tile_proc(fTileModeY);
563    fTileLowBitsProcX = choose_tile_lowbits_proc(fTileModeX);
564    fTileLowBitsProcY = choose_tile_lowbits_proc(fTileModeY);
565    return GeneralXY_Procs[index];
566}
567
568