SkBitmapProcState_matrixProcs.cpp revision ed881c2704bc81fe46a68c0cf9e292287313baa6
1/* NEON optimized code (C) COPYRIGHT 2009 Motorola */
2
3#include "SkBitmapProcState.h"
4#include "SkPerspIter.h"
5#include "SkShader.h"
6#include "SkUtils.h"
7
8/*  returns 0...(n-1) given any x (positive or negative).
9
10    As an example, if n (which is always positive) is 5...
11
12          x: -8 -7 -6 -5 -4 -3 -2 -1  0  1  2  3  4  5  6  7  8
13    returns:  2  3  4  0  1  2  3  4  0  1  2  3  4  0  1  2  3
14 */
15static inline int sk_int_mod(int x, int n) {
16    SkASSERT(n > 0);
17    if ((unsigned)x >= (unsigned)n) {
18        if (x < 0) {
19            x = n + ~(~x % n);
20        } else {
21            x = x % n;
22        }
23    }
24    return x;
25}
26
// Forward declarations of the two decal helpers defined later in this file.
// NOTE(review): presumably needed because the template headers included below
// call them when CHECK_FOR_DECAL is defined -- confirm against
// SkBitmapProcState_matrix.h.
27void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
28void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
29
// Configuration for the ClampX_ClampY family of procs. The macros below are
// consumed by the header included at the end of this section; the resulting
// table is used by chooseMatrixProc() as ClampX_ClampY_Procs.
//   MAKENAME        - prefixes every generated identifier with ClampX_ClampY
//   TILE?_PROCF     - integer part of the 16.16 coordinate, passed through
//                     SkClampMax with max as the limit
//   TILE?_LOW_BITS  - top 4 bits of the 16-bit fraction (max is unused here)
//   CHECK_FOR_DECAL - defined only for this family
// NOTE(review): the included header is presumably responsible for #undef-ing
// these macros, since the next section redefines them -- confirm.
30#define MAKENAME(suffix)        ClampX_ClampY ## suffix
31#define TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max)
32#define TILEY_PROCF(fy, max)    SkClampMax((fy) >> 16, max)
33#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
34#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
35#define CHECK_FOR_DECAL
// NEON builds use a clamp-specific header instead of the generic template.
36#if	defined(__ARM_HAVE_NEON)
37    #include "SkBitmapProcState_matrix_clamp.h"
38#else
39    #include "SkBitmapProcState_matrix.h"
40#endif
41
// Configuration for the RepeatX_RepeatY family of procs (used by
// chooseMatrixProc() as RepeatX_RepeatY_Procs).
//   TILE?_PROCF    - keeps only the 16 fractional bits of the coordinate,
//                    scales them by (max + 1) and takes the integer part.
//                    Note that * binds tighter than >>, so the shift applies
//                    to the whole product.
//   TILE?_LOW_BITS - same product, but shifted by 12 and masked to 4 bits,
//                    i.e. the 4 bits just below the integer part
// CHECK_FOR_DECAL is not defined by this section.
42#define MAKENAME(suffix)        RepeatX_RepeatY ## suffix
43#define TILEX_PROCF(fx, max)    (((fx) & 0xFFFF) * ((max) + 1) >> 16)
44#define TILEY_PROCF(fy, max)    (((fy) & 0xFFFF) * ((max) + 1) >> 16)
45#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
46#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
// NEON builds use a repeat-specific header instead of the generic template.
47#if	defined(__ARM_HAVE_NEON)
48    #include "SkBitmapProcState_matrix_repeat.h"
49#else
50    #include "SkBitmapProcState_matrix.h"
51#endif
52
// Configuration for the GeneralXY family of procs (used by chooseMatrixProc()
// as GeneralXY_Procs). Tiling is done through the function pointers stored in
// the state (fTileProcX / fTileProcY) rather than through inline arithmetic.
//   PREAMBLE         - copies the two tile procs out of the state into locals
//   PREAMBLE_PARAM_? - extra trailing parameter declaration for a tile proc
//   PREAMBLE_ARG_?   - matching extra trailing call argument
//                      NOTE(review): presumably spliced into helper signatures
//                      and calls by the included header -- confirm.
//   TILE?_PROCF      - tile proc result scaled by (max + 1), integer part
//   TILE?_LOW_BITS   - same product, shifted by 12 and masked to 4 bits
53#define MAKENAME(suffix)        GeneralXY ## suffix
54#define PREAMBLE(state)         SkBitmapProcState::FixedTileProc tileProcX = (state).fTileProcX; \
55                                SkBitmapProcState::FixedTileProc tileProcY = (state).fTileProcY
56#define PREAMBLE_PARAM_X        , SkBitmapProcState::FixedTileProc tileProcX
57#define PREAMBLE_PARAM_Y        , SkBitmapProcState::FixedTileProc tileProcY
58#define PREAMBLE_ARG_X          , tileProcX
59#define PREAMBLE_ARG_Y          , tileProcY
60#define TILEX_PROCF(fx, max)    (tileProcX(fx) * ((max) + 1) >> 16)
61#define TILEY_PROCF(fy, max)    (tileProcY(fy) * ((max) + 1) >> 16)
62#define TILEX_LOW_BITS(fx, max) ((tileProcX(fx) * ((max) + 1) >> 12) & 0xF)
63#define TILEY_LOW_BITS(fy, max) ((tileProcY(fy) * ((max) + 1) >> 12) & 0xF)
// The general family always uses the generic template, NEON or not.
64#include "SkBitmapProcState_matrix.h"
65
66static inline U16CPU fixed_clamp(SkFixed x)
67{
68#ifdef SK_CPU_HAS_CONDITIONAL_INSTR
69    if (x >> 16)
70        x = 0xFFFF;
71    if (x < 0)
72        x = 0;
73#else
74    if (x >> 16)
75    {
76        if (x < 0)
77            x = 0;
78        else
79            x = 0xFFFF;
80    }
81#endif
82    return x;
83}
84
85static inline U16CPU fixed_repeat(SkFixed x)
86{
87    return x & 0xFFFF;
88}
89
90static inline U16CPU fixed_mirror(SkFixed x)
91{
92    SkFixed s = x << 15 >> 31;
93    // s is FFFFFFFF if we're on an odd interval, or 0 if an even interval
94    return (x ^ s) & 0xFFFF;
95}
96
97static SkBitmapProcState::FixedTileProc choose_tile_proc(unsigned m)
98{
99    if (SkShader::kClamp_TileMode == m)
100        return fixed_clamp;
101    if (SkShader::kRepeat_TileMode == m)
102        return fixed_repeat;
103    SkASSERT(SkShader::kMirror_TileMode == m);
104    return fixed_mirror;
105}
106
107static inline U16CPU int_clamp(int x, int n) {
108#ifdef SK_CPU_HAS_CONDITIONAL_INSTR
109    if (x >= n)
110        x = n - 1;
111    if (x < 0)
112        x = 0;
113#else
114    if ((unsigned)x >= (unsigned)n) {
115        if (x < 0) {
116            x = 0;
117        } else {
118            x = n - 1;
119        }
120    }
121#endif
122    return x;
123}
124
125static inline U16CPU int_repeat(int x, int n) {
126    return sk_int_mod(x, n);
127}
128
129static inline U16CPU int_mirror(int x, int n) {
130    x = sk_int_mod(x, 2 * n);
131    if (x >= n) {
132        x = n + ~(x - n);
133    }
134    return x;
135}
136
// Disabled debugging aid: prints int_mirror(i, 3) for i in [-8, 8] through
// SkDebugf. Compiled out by the #if 0; the only reference to it is a
// commented-out call at the top of chooseMatrixProc().
137#if 0
138static void test_int_tileprocs() {
139    for (int i = -8; i <= 8; i++) {
140        SkDebugf(" int_mirror(%2d, 3) = %d\n", i, int_mirror(i, 3));
141    }
142}
143#endif
144
145static SkBitmapProcState::IntTileProc choose_int_tile_proc(unsigned tm) {
146    if (SkShader::kClamp_TileMode == tm)
147        return int_clamp;
148    if (SkShader::kRepeat_TileMode == tm)
149        return int_repeat;
150    SkASSERT(SkShader::kMirror_TileMode == tm);
151    return int_mirror;
152}
153
154//////////////////////////////////////////////////////////////////////////////
155
156void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
157{
158    int i;
159
160#if	defined(__ARM_HAVE_NEON)
161    if (count >= 8) {
162        /* SkFixed is 16.16 fixed point */
163        SkFixed dx2 = dx+dx;
164        SkFixed dx4 = dx2+dx2;
165        SkFixed dx8 = dx4+dx4;
166
167        /* now build fx/fx+dx/fx+2dx/fx+3dx */
168        SkFixed fx1, fx2, fx3;
169        int32x2_t lower, upper;
170        int32x4_t lbase, hbase;
171        uint16_t *dst16 = (uint16_t *)dst;
172
173        fx1 = fx+dx;
174        fx2 = fx1+dx;
175        fx3 = fx2+dx;
176
177        /* avoid an 'lbase unitialized' warning */
178        lbase = vdupq_n_s32(fx);
179        lbase = vsetq_lane_s32(fx1, lbase, 1);
180        lbase = vsetq_lane_s32(fx2, lbase, 2);
181        lbase = vsetq_lane_s32(fx3, lbase, 3);
182        hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
183
184        /* take upper 16 of each, store, and bump everything */
185        do {
186            int32x4_t lout, hout;
187            uint16x8_t hi16;
188
189            lout = lbase;
190            hout = hbase;
191            /* gets hi's of all louts then hi's of all houts */
192            asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
193            hi16 = vreinterpretq_u16_s32(hout);
194            vst1q_u16(dst16, hi16);
195
196            /* on to the next */
197            lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
198            hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
199            dst16 += 8;
200            count -= 8;
201            fx += dx8;
202        } while (count >= 8);
203        dst = (uint32_t *) dst16;
204    }
205#else
206    for (i = (count >> 2); i > 0; --i)
207    {
208        *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
209        fx += dx+dx;
210        *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
211        fx += dx+dx;
212    }
213    count &= 3;
214#endif
215
216    uint16_t* xx = (uint16_t*)dst;
217    for (i = count; i > 0; --i) {
218        *xx++ = SkToU16(fx >> 16); fx += dx;
219    }
220}
221
222void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
223{
224
225#if	defined(__ARM_HAVE_NEON)
226    if (count >= 8) {
227        int32x4_t wide_fx;
228        int32x4_t wide_fx2;
229        int32x4_t wide_dx8 = vdupq_n_s32(dx*8);
230
231        wide_fx = vdupq_n_s32(fx);
232        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
233        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
234        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
235
236        wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx));
237
238        while (count >= 8) {
239            int32x4_t wide_out;
240            int32x4_t wide_out2;
241
242            wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
243            wide_out = vorrq_s32(wide_out,
244            vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1)));
245
246            wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
247            wide_out2 = vorrq_s32(wide_out2,
248            vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1)));
249
250            vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
251            vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));
252
253            dst += 8;
254            fx += dx*8;
255            wide_fx = vaddq_s32(wide_fx, wide_dx8);
256            wide_fx2 = vaddq_s32(wide_fx2, wide_dx8);
257            count -= 8;
258        }
259    }
260#endif
261
262    if (count & 1)
263    {
264        SkASSERT((fx >> (16 + 14)) == 0);
265        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
266        fx += dx;
267    }
268    while ((count -= 2) >= 0)
269    {
270        SkASSERT((fx >> (16 + 14)) == 0);
271        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
272        fx += dx;
273
274        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
275        fx += dx;
276    }
277}
278
279///////////////////////////////////////////////////////////////////////////////
280// stores the same as SCALE, but is cheaper to compute. Also since there is no
281// scale, we don't need/have a FILTER version
282
283static void fill_sequential(uint16_t xptr[], int start, int count) {
284#if 1
285    if (reinterpret_cast<intptr_t>(xptr) & 0x2) {
286        *xptr++ = start++;
287        count -= 1;
288    }
289    if (count > 3) {
290        uint32_t* xxptr = reinterpret_cast<uint32_t*>(xptr);
291        uint32_t pattern0 = PACK_TWO_SHORTS(start + 0, start + 1);
292        uint32_t pattern1 = PACK_TWO_SHORTS(start + 2, start + 3);
293        start += count & ~3;
294        int qcount = count >> 2;
295        do {
296            *xxptr++ = pattern0;
297            pattern0 += 0x40004;
298            *xxptr++ = pattern1;
299            pattern1 += 0x40004;
300        } while (--qcount != 0);
301        xptr = reinterpret_cast<uint16_t*>(xxptr);
302        count &= 3;
303    }
304    while (--count >= 0) {
305        *xptr++ = start++;
306    }
307#else
308    for (int i = 0; i < count; i++) {
309        *xptr++ = start++;
310    }
311#endif
312}
313
314static int nofilter_trans_preamble(const SkBitmapProcState& s, uint32_t** xy,
315                                   int x, int y) {
316    SkPoint pt;
317    s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
318               SkIntToScalar(y) + SK_ScalarHalf, &pt);
319    **xy = s.fIntTileProcY(SkScalarToFixed(pt.fY) >> 16,
320                           s.fBitmap->height());
321    *xy += 1;   // bump the ptr
322    // return our starting X position
323    return SkScalarToFixed(pt.fX) >> 16;
324}
325
326static void clampx_nofilter_trans(const SkBitmapProcState& s,
327                                  uint32_t xy[], int count, int x, int y) {
328    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
329
330    int xpos = nofilter_trans_preamble(s, &xy, x, y);
331    const int width = s.fBitmap->width();
332    if (1 == width) {
333        // all of the following X values must be 0
334        memset(xy, 0, count * sizeof(uint16_t));
335        return;
336    }
337
338    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
339    int n;
340
341    // fill before 0 as needed
342    if (xpos < 0) {
343        n = -xpos;
344        if (n > count) {
345            n = count;
346        }
347        memset(xptr, 0, n * sizeof(uint16_t));
348        count -= n;
349        if (0 == count) {
350            return;
351        }
352        xptr += n;
353        xpos = 0;
354    }
355
356    // fill in 0..width-1 if needed
357    if (xpos < width) {
358        n = width - xpos;
359        if (n > count) {
360            n = count;
361        }
362        fill_sequential(xptr, xpos, n);
363        count -= n;
364        if (0 == count) {
365            return;
366        }
367        xptr += n;
368    }
369
370    // fill the remaining with the max value
371    sk_memset16(xptr, width - 1, count * sizeof(uint16_t));
372}
373
374static void repeatx_nofilter_trans(const SkBitmapProcState& s,
375                                   uint32_t xy[], int count, int x, int y) {
376    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
377
378    int xpos = nofilter_trans_preamble(s, &xy, x, y);
379    const int width = s.fBitmap->width();
380    if (1 == width) {
381        // all of the following X values must be 0
382        memset(xy, 0, count * sizeof(uint16_t));
383        return;
384    }
385
386    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
387    int start = sk_int_mod(xpos, width);
388    int n = width - start;
389    if (n > count) {
390        n = count;
391    }
392    fill_sequential(xptr, start, n);
393    xptr += n;
394    count -= n;
395
396    while (count >= width) {
397        fill_sequential(xptr, 0, width);
398        xptr += width;
399        count -= width;
400    }
401
402    if (count > 0) {
403        fill_sequential(xptr, 0, count);
404    }
405}
406
407static void fill_backwards(uint16_t xptr[], int pos, int count) {
408    for (int i = 0; i < count; i++) {
409        SkASSERT(pos >= 0);
410        xptr[i] = pos--;
411    }
412}
413
414static void mirrorx_nofilter_trans(const SkBitmapProcState& s,
415                                   uint32_t xy[], int count, int x, int y) {
416    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
417
418    int xpos = nofilter_trans_preamble(s, &xy, x, y);
419    const int width = s.fBitmap->width();
420    if (1 == width) {
421        // all of the following X values must be 0
422        memset(xy, 0, count * sizeof(uint16_t));
423        return;
424    }
425
426    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
427    // need to know our start, and our initial phase (forward or backward)
428    bool forward;
429    int n;
430    int start = sk_int_mod(xpos, 2 * width);
431    if (start >= width) {
432        start = width + ~(start - width);
433        forward = false;
434        n = start + 1;  // [start .. 0]
435    } else {
436        forward = true;
437        n = width - start;  // [start .. width)
438    }
439    if (n > count) {
440        n = count;
441    }
442    if (forward) {
443        fill_sequential(xptr, start, n);
444    } else {
445        fill_backwards(xptr, start, n);
446    }
447    forward = !forward;
448    xptr += n;
449    count -= n;
450
451    while (count >= width) {
452        if (forward) {
453            fill_sequential(xptr, 0, width);
454        } else {
455            fill_backwards(xptr, width - 1, width);
456        }
457        forward = !forward;
458        xptr += width;
459        count -= width;
460    }
461
462    if (count > 0) {
463        if (forward) {
464            fill_sequential(xptr, 0, count);
465        } else {
466            fill_backwards(xptr, width - 1, count);
467        }
468    }
469}
470
471///////////////////////////////////////////////////////////////////////////////
472
473SkBitmapProcState::MatrixProc
474SkBitmapProcState::chooseMatrixProc(bool trivial_matrix) {
475//    test_int_tileprocs();
476    // check for our special case when there is no scale/affine/perspective
477    if (trivial_matrix) {
478        SkASSERT(!fDoFilter);
479        fIntTileProcY = choose_int_tile_proc(fTileModeY);
480        switch (fTileModeX) {
481            case SkShader::kClamp_TileMode:
482                return clampx_nofilter_trans;
483            case SkShader::kRepeat_TileMode:
484                return repeatx_nofilter_trans;
485            case SkShader::kMirror_TileMode:
486                return mirrorx_nofilter_trans;
487        }
488    }
489
490    int index = 0;
491    if (fDoFilter) {
492        index = 1;
493    }
494    if (fInvType & SkMatrix::kPerspective_Mask) {
495        index += 4;
496    } else if (fInvType & SkMatrix::kAffine_Mask) {
497        index += 2;
498    }
499
500    if (SkShader::kClamp_TileMode == fTileModeX &&
501        SkShader::kClamp_TileMode == fTileModeY)
502    {
503        // clamp gets special version of filterOne
504        fFilterOneX = SK_Fixed1;
505        fFilterOneY = SK_Fixed1;
506        return ClampX_ClampY_Procs[index];
507    }
508
509    // all remaining procs use this form for filterOne
510    fFilterOneX = SK_Fixed1 / fBitmap->width();
511    fFilterOneY = SK_Fixed1 / fBitmap->height();
512
513    if (SkShader::kRepeat_TileMode == fTileModeX &&
514        SkShader::kRepeat_TileMode == fTileModeY)
515    {
516        return RepeatX_RepeatY_Procs[index];
517    }
518
519    fTileProcX = choose_tile_proc(fTileModeX);
520    fTileProcY = choose_tile_proc(fTileModeY);
521    return GeneralXY_Procs[index];
522}
523
524