SkBitmapProcState_matrixProcs.cpp revision 6c5bf8d9fe6fe263f583a4c22d04e3be879ecfeb
1/* NEON optimized code (C) COPYRIGHT 2009 Motorola
2 *
3 * Use of this source code is governed by a BSD-style license that can be
4 * found in the LICENSE file.
5 */
6
7#include "SkBitmapProcState.h"
8#include "SkPerspIter.h"
9#include "SkShader.h"
10#include "SkUtils.h"
11
12/*  returns 0...(n-1) given any x (positive or negative).
13
14    As an example, if n (which is always positive) is 5...
15
16          x: -8 -7 -6 -5 -4 -3 -2 -1  0  1  2  3  4  5  6  7  8
17    returns:  2  3  4  0  1  2  3  4  0  1  2  3  4  0  1  2  3
18 */
19static inline int sk_int_mod(int x, int n) {
20    SkASSERT(n > 0);
21    if ((unsigned)x >= (unsigned)n) {
22        if (x < 0) {
23            x = n + ~(~x % n);
24        } else {
25            x = x % n;
26        }
27    }
28    return x;
29}
30
// Prototypes for the two decal helpers that are defined further down in this
// file; they are declared here, ahead of the #include below.
// NOTE(review): presumably the included header calls them when
// CHECK_FOR_DECAL is defined -- confirm against the header.
31void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
32void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
33
// ---- Instantiation 1 of 3: clamp tiling in both X and Y ----
// The macros below parameterize the header included at the end of this
// section. chooseMatrixProc() later indexes ClampX_ClampY_Procs[], so the
// header is expected to emit that table using MAKENAME.
// NOTE(review): the same macros are redefined by the next section without an
// #undef here, so the header presumably #undefs them itself -- confirm.

// MAKENAME pastes the ClampX_ClampY prefix onto whatever suffix it is given.
34#define MAKENAME(suffix)        ClampX_ClampY ## suffix
// Tile procs: take the integer part of a 16.16 value (>> 16) and pass it
// through SkClampMax with max as the limit.
35#define TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max)
36#define TILEY_PROCF(fy, max)    SkClampMax((fy) >> 16, max)
// Low bits: the 4 bits directly below the integer part (bits 12..15); max is
// unused in the clamp variant.
37#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
38#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
// Defined (empty) only for this clamp/clamp instantiation.
39#define CHECK_FOR_DECAL
// NEON builds pull in a clamp-specific header instead of the generic one.
40#if	defined(__ARM_HAVE_NEON)
41    #include "SkBitmapProcState_matrix_clamp.h"
42#else
43    #include "SkBitmapProcState_matrix.h"
44#endif
45
// ---- Instantiation 2 of 3: repeat tiling in both X and Y ----
// chooseMatrixProc() later indexes RepeatX_RepeatY_Procs[], so the header
// included below is expected to emit that table using MAKENAME.
46#define MAKENAME(suffix)        RepeatX_RepeatY ## suffix
// Tile procs: keep only the low 16 (fractional) bits of the 16.16 value,
// scale by (max + 1) and take the integer part of the product, which yields
// an index in 0..max.
47#define TILEX_PROCF(fx, max)    (((fx) & 0xFFFF) * ((max) + 1) >> 16)
48#define TILEY_PROCF(fy, max)    (((fy) & 0xFFFF) * ((max) + 1) >> 16)
// Low bits: same scaled product, but the 4 bits just below the integer part
// (>> 12, & 0xF).
49#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
50#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
// NEON builds pull in a repeat-specific header instead of the generic one.
51#if	defined(__ARM_HAVE_NEON)
52    #include "SkBitmapProcState_matrix_repeat.h"
53#else
54    #include "SkBitmapProcState_matrix.h"
55#endif
56
// ---- Instantiation 3 of 3: general case, tile procs chosen at run time ----
// chooseMatrixProc() later indexes GeneralXY_Procs[] after storing the
// per-axis procs in fTileProcX / fTileProcY. This instantiation always uses
// the generic header (no NEON variant is selected here).
57#define MAKENAME(suffix)        GeneralXY ## suffix
// PREAMBLE copies the two tile-proc pointers out of the state into locals
// named tileProcX / tileProcY.
58#define PREAMBLE(state)         SkBitmapProcState::FixedTileProc tileProcX = (state).fTileProcX; \
59                                SkBitmapProcState::FixedTileProc tileProcY = (state).fTileProcY
// Extra parameter declarations and matching call arguments, each starting
// with a comma so they can be appended to an existing list.
60#define PREAMBLE_PARAM_X        , SkBitmapProcState::FixedTileProc tileProcX
61#define PREAMBLE_PARAM_Y        , SkBitmapProcState::FixedTileProc tileProcY
62#define PREAMBLE_ARG_X          , tileProcX
63#define PREAMBLE_ARG_Y          , tileProcY
// Tile procs: run the value through the selected proc, then scale by
// (max + 1) and take the integer part, as in the repeat instantiation.
64#define TILEX_PROCF(fx, max)    (tileProcX(fx) * ((max) + 1) >> 16)
65#define TILEY_PROCF(fy, max)    (tileProcY(fy) * ((max) + 1) >> 16)
// Low bits: the 4 bits just below the integer part of the same product.
66#define TILEX_LOW_BITS(fx, max) ((tileProcX(fx) * ((max) + 1) >> 12) & 0xF)
67#define TILEY_LOW_BITS(fy, max) ((tileProcY(fy) * ((max) + 1) >> 12) & 0xF)
68#include "SkBitmapProcState_matrix.h"
69
70static inline U16CPU fixed_clamp(SkFixed x)
71{
72#ifdef SK_CPU_HAS_CONDITIONAL_INSTR
73    if (x >> 16)
74        x = 0xFFFF;
75    if (x < 0)
76        x = 0;
77#else
78    if (x >> 16)
79    {
80        if (x < 0)
81            x = 0;
82        else
83            x = 0xFFFF;
84    }
85#endif
86    return x;
87}
88
89static inline U16CPU fixed_repeat(SkFixed x)
90{
91    return x & 0xFFFF;
92}
93
94static inline U16CPU fixed_mirror(SkFixed x)
95{
96    SkFixed s = x << 15 >> 31;
97    // s is FFFFFFFF if we're on an odd interval, or 0 if an even interval
98    return (x ^ s) & 0xFFFF;
99}
100
101static SkBitmapProcState::FixedTileProc choose_tile_proc(unsigned m)
102{
103    if (SkShader::kClamp_TileMode == m)
104        return fixed_clamp;
105    if (SkShader::kRepeat_TileMode == m)
106        return fixed_repeat;
107    SkASSERT(SkShader::kMirror_TileMode == m);
108    return fixed_mirror;
109}
110
111static inline U16CPU int_clamp(int x, int n) {
112#ifdef SK_CPU_HAS_CONDITIONAL_INSTR
113    if (x >= n)
114        x = n - 1;
115    if (x < 0)
116        x = 0;
117#else
118    if ((unsigned)x >= (unsigned)n) {
119        if (x < 0) {
120            x = 0;
121        } else {
122            x = n - 1;
123        }
124    }
125#endif
126    return x;
127}
128
129static inline U16CPU int_repeat(int x, int n) {
130    return sk_int_mod(x, n);
131}
132
133static inline U16CPU int_mirror(int x, int n) {
134    x = sk_int_mod(x, 2 * n);
135    if (x >= n) {
136        x = n + ~(x - n);
137    }
138    return x;
139}
140
#if 0
// Debug aid (compiled out): prints int_mirror(i, 3) for every i in [-8, 8].
static void test_int_tileprocs() {
    int i = -8;
    while (i <= 8) {
        SkDebugf(" int_mirror(%2d, 3) = %d\n", i, int_mirror(i, 3));
        ++i;
    }
}
#endif
148
149static SkBitmapProcState::IntTileProc choose_int_tile_proc(unsigned tm) {
150    if (SkShader::kClamp_TileMode == tm)
151        return int_clamp;
152    if (SkShader::kRepeat_TileMode == tm)
153        return int_repeat;
154    SkASSERT(SkShader::kMirror_TileMode == tm);
155    return int_mirror;
156}
157
158//////////////////////////////////////////////////////////////////////////////
159
160void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
161{
162    int i;
163
164#if	defined(__ARM_HAVE_NEON)
165    if (count >= 8) {
166        /* SkFixed is 16.16 fixed point */
167        SkFixed dx2 = dx+dx;
168        SkFixed dx4 = dx2+dx2;
169        SkFixed dx8 = dx4+dx4;
170
171        /* now build fx/fx+dx/fx+2dx/fx+3dx */
172        SkFixed fx1, fx2, fx3;
173        int32x2_t lower, upper;
174        int32x4_t lbase, hbase;
175        uint16_t *dst16 = (uint16_t *)dst;
176
177        fx1 = fx+dx;
178        fx2 = fx1+dx;
179        fx3 = fx2+dx;
180
181        /* avoid an 'lbase unitialized' warning */
182        lbase = vdupq_n_s32(fx);
183        lbase = vsetq_lane_s32(fx1, lbase, 1);
184        lbase = vsetq_lane_s32(fx2, lbase, 2);
185        lbase = vsetq_lane_s32(fx3, lbase, 3);
186        hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
187
188        /* take upper 16 of each, store, and bump everything */
189        do {
190            int32x4_t lout, hout;
191            uint16x8_t hi16;
192
193            lout = lbase;
194            hout = hbase;
195            /* gets hi's of all louts then hi's of all houts */
196            asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
197            hi16 = vreinterpretq_u16_s32(hout);
198            vst1q_u16(dst16, hi16);
199
200            /* on to the next */
201            lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
202            hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
203            dst16 += 8;
204            count -= 8;
205            fx += dx8;
206        } while (count >= 8);
207        dst = (uint32_t *) dst16;
208    }
209#else
210    for (i = (count >> 2); i > 0; --i)
211    {
212        *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
213        fx += dx+dx;
214        *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
215        fx += dx+dx;
216    }
217    count &= 3;
218#endif
219
220    uint16_t* xx = (uint16_t*)dst;
221    for (i = count; i > 0; --i) {
222        *xx++ = SkToU16(fx >> 16); fx += dx;
223    }
224}
225
226void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
227{
228
229#if	defined(__ARM_HAVE_NEON)
230    if (count >= 8) {
231        int32x4_t wide_fx;
232        int32x4_t wide_fx2;
233        int32x4_t wide_dx8 = vdupq_n_s32(dx*8);
234
235        wide_fx = vdupq_n_s32(fx);
236        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
237        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
238        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
239
240        wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx));
241
242        while (count >= 8) {
243            int32x4_t wide_out;
244            int32x4_t wide_out2;
245
246            wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
247            wide_out = vorrq_s32(wide_out,
248            vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1)));
249
250            wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
251            wide_out2 = vorrq_s32(wide_out2,
252            vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1)));
253
254            vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
255            vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));
256
257            dst += 8;
258            fx += dx*8;
259            wide_fx = vaddq_s32(wide_fx, wide_dx8);
260            wide_fx2 = vaddq_s32(wide_fx2, wide_dx8);
261            count -= 8;
262        }
263    }
264#endif
265
266    if (count & 1)
267    {
268        SkASSERT((fx >> (16 + 14)) == 0);
269        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
270        fx += dx;
271    }
272    while ((count -= 2) >= 0)
273    {
274        SkASSERT((fx >> (16 + 14)) == 0);
275        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
276        fx += dx;
277
278        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
279        fx += dx;
280    }
281}
282
283///////////////////////////////////////////////////////////////////////////////
// The translate-only procs below store their results in the same layout as
// the SCALE procs, but are cheaper to compute. Because there is no scale,
// there is no need for (and we do not have) a FILTER version.
286
287static void fill_sequential(uint16_t xptr[], int start, int count) {
288#if 1
289    if (reinterpret_cast<intptr_t>(xptr) & 0x2) {
290        *xptr++ = start++;
291        count -= 1;
292    }
293    if (count > 3) {
294        uint32_t* xxptr = reinterpret_cast<uint32_t*>(xptr);
295        uint32_t pattern0 = PACK_TWO_SHORTS(start + 0, start + 1);
296        uint32_t pattern1 = PACK_TWO_SHORTS(start + 2, start + 3);
297        start += count & ~3;
298        int qcount = count >> 2;
299        do {
300            *xxptr++ = pattern0;
301            pattern0 += 0x40004;
302            *xxptr++ = pattern1;
303            pattern1 += 0x40004;
304        } while (--qcount != 0);
305        xptr = reinterpret_cast<uint16_t*>(xxptr);
306        count &= 3;
307    }
308    while (--count >= 0) {
309        *xptr++ = start++;
310    }
311#else
312    for (int i = 0; i < count; i++) {
313        *xptr++ = start++;
314    }
315#endif
316}
317
318static int nofilter_trans_preamble(const SkBitmapProcState& s, uint32_t** xy,
319                                   int x, int y) {
320    SkPoint pt;
321    s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
322               SkIntToScalar(y) + SK_ScalarHalf, &pt);
323    **xy = s.fIntTileProcY(SkScalarToFixed(pt.fY) >> 16,
324                           s.fBitmap->height());
325    *xy += 1;   // bump the ptr
326    // return our starting X position
327    return SkScalarToFixed(pt.fX) >> 16;
328}
329
330static void clampx_nofilter_trans(const SkBitmapProcState& s,
331                                  uint32_t xy[], int count, int x, int y) {
332    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
333
334    int xpos = nofilter_trans_preamble(s, &xy, x, y);
335    const int width = s.fBitmap->width();
336    if (1 == width) {
337        // all of the following X values must be 0
338        memset(xy, 0, count * sizeof(uint16_t));
339        return;
340    }
341
342    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
343    int n;
344
345    // fill before 0 as needed
346    if (xpos < 0) {
347        n = -xpos;
348        if (n > count) {
349            n = count;
350        }
351        memset(xptr, 0, n * sizeof(uint16_t));
352        count -= n;
353        if (0 == count) {
354            return;
355        }
356        xptr += n;
357        xpos = 0;
358    }
359
360    // fill in 0..width-1 if needed
361    if (xpos < width) {
362        n = width - xpos;
363        if (n > count) {
364            n = count;
365        }
366        fill_sequential(xptr, xpos, n);
367        count -= n;
368        if (0 == count) {
369            return;
370        }
371        xptr += n;
372    }
373
374    // fill the remaining with the max value
375    sk_memset16(xptr, width - 1, count);
376}
377
378static void repeatx_nofilter_trans(const SkBitmapProcState& s,
379                                   uint32_t xy[], int count, int x, int y) {
380    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
381
382    int xpos = nofilter_trans_preamble(s, &xy, x, y);
383    const int width = s.fBitmap->width();
384    if (1 == width) {
385        // all of the following X values must be 0
386        memset(xy, 0, count * sizeof(uint16_t));
387        return;
388    }
389
390    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
391    int start = sk_int_mod(xpos, width);
392    int n = width - start;
393    if (n > count) {
394        n = count;
395    }
396    fill_sequential(xptr, start, n);
397    xptr += n;
398    count -= n;
399
400    while (count >= width) {
401        fill_sequential(xptr, 0, width);
402        xptr += width;
403        count -= width;
404    }
405
406    if (count > 0) {
407        fill_sequential(xptr, 0, count);
408    }
409}
410
411static void fill_backwards(uint16_t xptr[], int pos, int count) {
412    for (int i = 0; i < count; i++) {
413        SkASSERT(pos >= 0);
414        xptr[i] = pos--;
415    }
416}
417
418static void mirrorx_nofilter_trans(const SkBitmapProcState& s,
419                                   uint32_t xy[], int count, int x, int y) {
420    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);
421
422    int xpos = nofilter_trans_preamble(s, &xy, x, y);
423    const int width = s.fBitmap->width();
424    if (1 == width) {
425        // all of the following X values must be 0
426        memset(xy, 0, count * sizeof(uint16_t));
427        return;
428    }
429
430    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
431    // need to know our start, and our initial phase (forward or backward)
432    bool forward;
433    int n;
434    int start = sk_int_mod(xpos, 2 * width);
435    if (start >= width) {
436        start = width + ~(start - width);
437        forward = false;
438        n = start + 1;  // [start .. 0]
439    } else {
440        forward = true;
441        n = width - start;  // [start .. width)
442    }
443    if (n > count) {
444        n = count;
445    }
446    if (forward) {
447        fill_sequential(xptr, start, n);
448    } else {
449        fill_backwards(xptr, start, n);
450    }
451    forward = !forward;
452    xptr += n;
453    count -= n;
454
455    while (count >= width) {
456        if (forward) {
457            fill_sequential(xptr, 0, width);
458        } else {
459            fill_backwards(xptr, width - 1, width);
460        }
461        forward = !forward;
462        xptr += width;
463        count -= width;
464    }
465
466    if (count > 0) {
467        if (forward) {
468            fill_sequential(xptr, 0, count);
469        } else {
470            fill_backwards(xptr, width - 1, count);
471        }
472    }
473}
474
475///////////////////////////////////////////////////////////////////////////////
476
477SkBitmapProcState::MatrixProc
478SkBitmapProcState::chooseMatrixProc(bool trivial_matrix) {
479//    test_int_tileprocs();
480    // check for our special case when there is no scale/affine/perspective
481    if (trivial_matrix) {
482        SkASSERT(!fDoFilter);
483        fIntTileProcY = choose_int_tile_proc(fTileModeY);
484        switch (fTileModeX) {
485            case SkShader::kClamp_TileMode:
486                return clampx_nofilter_trans;
487            case SkShader::kRepeat_TileMode:
488                return repeatx_nofilter_trans;
489            case SkShader::kMirror_TileMode:
490                return mirrorx_nofilter_trans;
491        }
492    }
493
494    int index = 0;
495    if (fDoFilter) {
496        index = 1;
497    }
498    if (fInvType & SkMatrix::kPerspective_Mask) {
499        index += 4;
500    } else if (fInvType & SkMatrix::kAffine_Mask) {
501        index += 2;
502    }
503
504    if (SkShader::kClamp_TileMode == fTileModeX &&
505        SkShader::kClamp_TileMode == fTileModeY)
506    {
507        // clamp gets special version of filterOne
508        fFilterOneX = SK_Fixed1;
509        fFilterOneY = SK_Fixed1;
510        return ClampX_ClampY_Procs[index];
511    }
512
513    // all remaining procs use this form for filterOne
514    fFilterOneX = SK_Fixed1 / fBitmap->width();
515    fFilterOneY = SK_Fixed1 / fBitmap->height();
516
517    if (SkShader::kRepeat_TileMode == fTileModeX &&
518        SkShader::kRepeat_TileMode == fTileModeY)
519    {
520        return RepeatX_RepeatY_Procs[index];
521    }
522
523    fTileProcX = choose_tile_proc(fTileModeX);
524    fTileProcY = choose_tile_proc(fTileModeY);
525    return GeneralXY_Procs[index];
526}
527
528