SkBitmapProcState_matrixProcs.cpp revision f444e8ccda8905a8ce16bac368e09f205786db31
1/* NEON optimized code (C) COPYRIGHT 2009 Motorola
2 *
3 * Use of this source code is governed by a BSD-style license that can be
4 * found in the LICENSE file.
5 */
6
7#include "SkBitmapProcState.h"
8#include "SkPerspIter.h"
9#include "SkShader.h"
10#include "SkUtils.h"
11
12/*  returns 0...(n-1) given any x (positive or negative).
13
14    As an example, if n (which is always positive) is 5...
15
16          x: -8 -7 -6 -5 -4 -3 -2 -1  0  1  2  3  4  5  6  7  8
17    returns:  2  3  4  0  1  2  3  4  0  1  2  3  4  0  1  2  3
18 */
19static inline int sk_int_mod(int x, int n) {
20    SkASSERT(n > 0);
21    if ((unsigned)x >= (unsigned)n) {
22        if (x < 0) {
23            x = n + ~(~x % n);
24        } else {
25            x = x % n;
26        }
27    }
28    return x;
29}
30
31void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
32void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
33
34#define MAKENAME(suffix)        ClampX_ClampY ## suffix
35#define TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max)
36#define TILEY_PROCF(fy, max)    SkClampMax((fy) >> 16, max)
37#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
38#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
39#define CHECK_FOR_DECAL
40#if	defined(__ARM_HAVE_NEON)
41    #include "SkBitmapProcState_matrix_clamp.h"
42#else
43    #include "SkBitmapProcState_matrix.h"
44#endif
45
46#define MAKENAME(suffix)        RepeatX_RepeatY ## suffix
47#define TILEX_PROCF(fx, max)    (((fx) & 0xFFFF) * ((max) + 1) >> 16)
48#define TILEY_PROCF(fy, max)    (((fy) & 0xFFFF) * ((max) + 1) >> 16)
49#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
50#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
51#if	defined(__ARM_HAVE_NEON)
52    #include "SkBitmapProcState_matrix_repeat.h"
53#else
54    #include "SkBitmapProcState_matrix.h"
55#endif
56
57#define MAKENAME(suffix)        GeneralXY ## suffix
58#define PREAMBLE(state)         SkBitmapProcState::FixedTileProc tileProcX = (state).fTileProcX; \
59                                SkBitmapProcState::FixedTileProc tileProcY = (state).fTileProcY; \
60                                SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcX = (state).fTileLowBitsProcX; \
61                                SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcY = (state).fTileLowBitsProcY
62#define PREAMBLE_PARAM_X        , SkBitmapProcState::FixedTileProc tileProcX, SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcX
63#define PREAMBLE_PARAM_Y        , SkBitmapProcState::FixedTileProc tileProcY, SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcY
64#define PREAMBLE_ARG_X          , tileProcX, tileLowBitsProcX
65#define PREAMBLE_ARG_Y          , tileProcY, tileLowBitsProcY
66#define TILEX_PROCF(fx, max)    (tileProcX(fx) * ((max) + 1) >> 16)
67#define TILEY_PROCF(fy, max)    (tileProcY(fy) * ((max) + 1) >> 16)
68#define TILEX_LOW_BITS(fx, max) tileLowBitsProcX(fx, (max) + 1)
69#define TILEY_LOW_BITS(fy, max) tileLowBitsProcY(fy, (max) + 1)
70#include "SkBitmapProcState_matrix.h"
71
72static inline U16CPU fixed_clamp(SkFixed x)
73{
74#ifdef SK_CPU_HAS_CONDITIONAL_INSTR
75    if (x >> 16)
76        x = 0xFFFF;
77    if (x < 0)
78        x = 0;
79#else
80    if (x >> 16)
81    {
82        if (x < 0)
83            x = 0;
84        else
85            x = 0xFFFF;
86    }
87#endif
88    return x;
89}
90
91static inline U16CPU fixed_repeat(SkFixed x)
92{
93    return x & 0xFFFF;
94}
95
96static inline U16CPU fixed_mirror(SkFixed x)
97{
98    SkFixed s = x << 15 >> 31;
99    // s is FFFFFFFF if we're on an odd interval, or 0 if an even interval
100    return (x ^ s) & 0xFFFF;
101}
102
103static SkBitmapProcState::FixedTileProc choose_tile_proc(unsigned m)
104{
105    if (SkShader::kClamp_TileMode == m)
106        return fixed_clamp;
107    if (SkShader::kRepeat_TileMode == m)
108        return fixed_repeat;
109    SkASSERT(SkShader::kMirror_TileMode == m);
110    return fixed_mirror;
111}
112
113static inline U16CPU fixed_clamp_lowbits(SkFixed x, int) {
114    return (x >> 12) & 0xF;
115}
116
117static inline U16CPU fixed_repeat_or_mirrow_lowbits(SkFixed x, int scale) {
118    return ((x * scale) >> 12) & 0xF;
119}
120
121static SkBitmapProcState::FixedTileLowBitsProc choose_tile_lowbits_proc(unsigned m) {
122    if (SkShader::kClamp_TileMode == m) {
123        return fixed_clamp_lowbits;
124    } else {
125        SkASSERT(SkShader::kMirror_TileMode == m ||
126                 SkShader::kRepeat_TileMode == m);
127        // mirror and repeat have the same behavior for the low bits.
128        return fixed_repeat_or_mirrow_lowbits;
129    }
130}
131
132static inline U16CPU int_clamp(int x, int n) {
133#ifdef SK_CPU_HAS_CONDITIONAL_INSTR
134    if (x >= n)
135        x = n - 1;
136    if (x < 0)
137        x = 0;
138#else
139    if ((unsigned)x >= (unsigned)n) {
140        if (x < 0) {
141            x = 0;
142        } else {
143            x = n - 1;
144        }
145    }
146#endif
147    return x;
148}
149
150static inline U16CPU int_repeat(int x, int n) {
151    return sk_int_mod(x, n);
152}
153
154static inline U16CPU int_mirror(int x, int n) {
155    x = sk_int_mod(x, 2 * n);
156    if (x >= n) {
157        x = n + ~(x - n);
158    }
159    return x;
160}
161
#if 0
// Debug helper (compiled out): prints int_mirror(i, 3) for i in -8..8.
static void test_int_tileprocs() {
    int i = -8;
    while (i <= 8) {
        SkDebugf(" int_mirror(%2d, 3) = %d\n", i, int_mirror(i, 3));
        ++i;
    }
}
#endif
169
170static SkBitmapProcState::IntTileProc choose_int_tile_proc(unsigned tm) {
171    if (SkShader::kClamp_TileMode == tm)
172        return int_clamp;
173    if (SkShader::kRepeat_TileMode == tm)
174        return int_repeat;
175    SkASSERT(SkShader::kMirror_TileMode == tm);
176    return int_mirror;
177}
178
179//////////////////////////////////////////////////////////////////////////////
180
181void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
182{
183    int i;
184
185#if	defined(__ARM_HAVE_NEON)
186    if (count >= 8) {
187        /* SkFixed is 16.16 fixed point */
188        SkFixed dx2 = dx+dx;
189        SkFixed dx4 = dx2+dx2;
190        SkFixed dx8 = dx4+dx4;
191
192        /* now build fx/fx+dx/fx+2dx/fx+3dx */
193        SkFixed fx1, fx2, fx3;
194        int32x2_t lower, upper;
195        int32x4_t lbase, hbase;
196        uint16_t *dst16 = (uint16_t *)dst;
197
198        fx1 = fx+dx;
199        fx2 = fx1+dx;
200        fx3 = fx2+dx;
201
202        /* avoid an 'lbase unitialized' warning */
203        lbase = vdupq_n_s32(fx);
204        lbase = vsetq_lane_s32(fx1, lbase, 1);
205        lbase = vsetq_lane_s32(fx2, lbase, 2);
206        lbase = vsetq_lane_s32(fx3, lbase, 3);
207        hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
208
209        /* take upper 16 of each, store, and bump everything */
210        do {
211            int32x4_t lout, hout;
212            uint16x8_t hi16;
213
214            lout = lbase;
215            hout = hbase;
216            /* gets hi's of all louts then hi's of all houts */
217            asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
218            hi16 = vreinterpretq_u16_s32(hout);
219            vst1q_u16(dst16, hi16);
220
221            /* on to the next */
222            lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
223            hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
224            dst16 += 8;
225            count -= 8;
226            fx += dx8;
227        } while (count >= 8);
228        dst = (uint32_t *) dst16;
229    }
230#else
231    for (i = (count >> 2); i > 0; --i)
232    {
233        *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
234        fx += dx+dx;
235        *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
236        fx += dx+dx;
237    }
238    count &= 3;
239#endif
240
241    uint16_t* xx = (uint16_t*)dst;
242    for (i = count; i > 0; --i) {
243        *xx++ = SkToU16(fx >> 16); fx += dx;
244    }
245}
246
247void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
248{
249
250#if	defined(__ARM_HAVE_NEON)
251    if (count >= 8) {
252        int32x4_t wide_fx;
253        int32x4_t wide_fx2;
254        int32x4_t wide_dx8 = vdupq_n_s32(dx*8);
255
256        wide_fx = vdupq_n_s32(fx);
257        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
258        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
259        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
260
261        wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx));
262
263        while (count >= 8) {
264            int32x4_t wide_out;
265            int32x4_t wide_out2;
266
267            wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
268            wide_out = vorrq_s32(wide_out,
269            vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1)));
270
271            wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
272            wide_out2 = vorrq_s32(wide_out2,
273            vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1)));
274
275            vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
276            vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));
277
278            dst += 8;
279            fx += dx*8;
280            wide_fx = vaddq_s32(wide_fx, wide_dx8);
281            wide_fx2 = vaddq_s32(wide_fx2, wide_dx8);
282            count -= 8;
283        }
284    }
285#endif
286
287    if (count & 1)
288    {
289        SkASSERT((fx >> (16 + 14)) == 0);
290        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
291        fx += dx;
292    }
293    while ((count -= 2) >= 0)
294    {
295        SkASSERT((fx >> (16 + 14)) == 0);
296        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
297        fx += dx;
298
299        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
300        fx += dx;
301    }
302}
303
304///////////////////////////////////////////////////////////////////////////////
305// stores the same as SCALE, but is cheaper to compute. Also since there is no
306// scale, we don't need/have a FILTER version
307
308static void fill_sequential(uint16_t xptr[], int start, int count) {
309#if 1
310    if (reinterpret_cast<intptr_t>(xptr) & 0x2) {
311        *xptr++ = start++;
312        count -= 1;
313    }
314    if (count > 3) {
315        uint32_t* xxptr = reinterpret_cast<uint32_t*>(xptr);
316        uint32_t pattern0 = PACK_TWO_SHORTS(start + 0, start + 1);
317        uint32_t pattern1 = PACK_TWO_SHORTS(start + 2, start + 3);
318        start += count & ~3;
319        int qcount = count >> 2;
320        do {
321            *xxptr++ = pattern0;
322            pattern0 += 0x40004;
323            *xxptr++ = pattern1;
324            pattern1 += 0x40004;
325        } while (--qcount != 0);
326        xptr = reinterpret_cast<uint16_t*>(xxptr);
327        count &= 3;
328    }
329    while (--count >= 0) {
330        *xptr++ = start++;
331    }
332#else
333    for (int i = 0; i < count; i++) {
334        *xptr++ = start++;
335    }
336#endif
337}
338
339static int nofilter_trans_preamble(const SkBitmapProcState& s, uint32_t** xy,
340                                   int x, int y) {
341    SkPoint pt;
342    s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
343               SkIntToScalar(y) + SK_ScalarHalf, &pt);
344    **xy = s.fIntTileProcY(SkScalarToFixed(pt.fY) >> 16,
345                           s.fBitmap->height());
346    *xy += 1;   // bump the ptr
347    // return our starting X position
348    return SkScalarToFixed(pt.fX) >> 16;
349}
350
351static void clampx_nofilter_trans(const SkBitmapProcState& s,
352                                  uint32_t xy[], int count, int x, int y) {
353    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
354
355    int xpos = nofilter_trans_preamble(s, &xy, x, y);
356    const int width = s.fBitmap->width();
357    if (1 == width) {
358        // all of the following X values must be 0
359        memset(xy, 0, count * sizeof(uint16_t));
360        return;
361    }
362
363    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
364    int n;
365
366    // fill before 0 as needed
367    if (xpos < 0) {
368        n = -xpos;
369        if (n > count) {
370            n = count;
371        }
372        memset(xptr, 0, n * sizeof(uint16_t));
373        count -= n;
374        if (0 == count) {
375            return;
376        }
377        xptr += n;
378        xpos = 0;
379    }
380
381    // fill in 0..width-1 if needed
382    if (xpos < width) {
383        n = width - xpos;
384        if (n > count) {
385            n = count;
386        }
387        fill_sequential(xptr, xpos, n);
388        count -= n;
389        if (0 == count) {
390            return;
391        }
392        xptr += n;
393    }
394
395    // fill the remaining with the max value
396    sk_memset16(xptr, width - 1, count);
397}
398
399static void repeatx_nofilter_trans(const SkBitmapProcState& s,
400                                   uint32_t xy[], int count, int x, int y) {
401    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
402
403    int xpos = nofilter_trans_preamble(s, &xy, x, y);
404    const int width = s.fBitmap->width();
405    if (1 == width) {
406        // all of the following X values must be 0
407        memset(xy, 0, count * sizeof(uint16_t));
408        return;
409    }
410
411    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
412    int start = sk_int_mod(xpos, width);
413    int n = width - start;
414    if (n > count) {
415        n = count;
416    }
417    fill_sequential(xptr, start, n);
418    xptr += n;
419    count -= n;
420
421    while (count >= width) {
422        fill_sequential(xptr, 0, width);
423        xptr += width;
424        count -= width;
425    }
426
427    if (count > 0) {
428        fill_sequential(xptr, 0, count);
429    }
430}
431
432static void fill_backwards(uint16_t xptr[], int pos, int count) {
433    for (int i = 0; i < count; i++) {
434        SkASSERT(pos >= 0);
435        xptr[i] = pos--;
436    }
437}
438
439static void mirrorx_nofilter_trans(const SkBitmapProcState& s,
440                                   uint32_t xy[], int count, int x, int y) {
441    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
442
443    int xpos = nofilter_trans_preamble(s, &xy, x, y);
444    const int width = s.fBitmap->width();
445    if (1 == width) {
446        // all of the following X values must be 0
447        memset(xy, 0, count * sizeof(uint16_t));
448        return;
449    }
450
451    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
452    // need to know our start, and our initial phase (forward or backward)
453    bool forward;
454    int n;
455    int start = sk_int_mod(xpos, 2 * width);
456    if (start >= width) {
457        start = width + ~(start - width);
458        forward = false;
459        n = start + 1;  // [start .. 0]
460    } else {
461        forward = true;
462        n = width - start;  // [start .. width)
463    }
464    if (n > count) {
465        n = count;
466    }
467    if (forward) {
468        fill_sequential(xptr, start, n);
469    } else {
470        fill_backwards(xptr, start, n);
471    }
472    forward = !forward;
473    xptr += n;
474    count -= n;
475
476    while (count >= width) {
477        if (forward) {
478            fill_sequential(xptr, 0, width);
479        } else {
480            fill_backwards(xptr, width - 1, width);
481        }
482        forward = !forward;
483        xptr += width;
484        count -= width;
485    }
486
487    if (count > 0) {
488        if (forward) {
489            fill_sequential(xptr, 0, count);
490        } else {
491            fill_backwards(xptr, width - 1, count);
492        }
493    }
494}
495
496///////////////////////////////////////////////////////////////////////////////
497
498SkBitmapProcState::MatrixProc
499SkBitmapProcState::chooseMatrixProc(bool trivial_matrix) {
500//    test_int_tileprocs();
501    // check for our special case when there is no scale/affine/perspective
502    if (trivial_matrix) {
503        SkASSERT(!fDoFilter);
504        fIntTileProcY = choose_int_tile_proc(fTileModeY);
505        switch (fTileModeX) {
506            case SkShader::kClamp_TileMode:
507                return clampx_nofilter_trans;
508            case SkShader::kRepeat_TileMode:
509                return repeatx_nofilter_trans;
510            case SkShader::kMirror_TileMode:
511                return mirrorx_nofilter_trans;
512        }
513    }
514
515    int index = 0;
516    if (fDoFilter) {
517        index = 1;
518    }
519    if (fInvType & SkMatrix::kPerspective_Mask) {
520        index += 4;
521    } else if (fInvType & SkMatrix::kAffine_Mask) {
522        index += 2;
523    }
524
525    if (SkShader::kClamp_TileMode == fTileModeX &&
526        SkShader::kClamp_TileMode == fTileModeY)
527    {
528        // clamp gets special version of filterOne
529        fFilterOneX = SK_Fixed1;
530        fFilterOneY = SK_Fixed1;
531        return ClampX_ClampY_Procs[index];
532    }
533
534    // all remaining procs use this form for filterOne
535    fFilterOneX = SK_Fixed1 / fBitmap->width();
536    fFilterOneY = SK_Fixed1 / fBitmap->height();
537
538    if (SkShader::kRepeat_TileMode == fTileModeX &&
539        SkShader::kRepeat_TileMode == fTileModeY)
540    {
541        return RepeatX_RepeatY_Procs[index];
542    }
543
544    fTileProcX = choose_tile_proc(fTileModeX);
545    fTileProcY = choose_tile_proc(fTileModeY);
546    fTileLowBitsProcX = choose_tile_lowbits_proc(fTileModeX);
547    fTileLowBitsProcY = choose_tile_lowbits_proc(fTileModeY);
548    return GeneralXY_Procs[index];
549}
550
551