SkBlitRow_opts_arm_neon.cpp revision 0060159457453ca45a47828648c8f29d5695983c
1/*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "SkBlitRow_opts_arm_neon.h"
9
10#include "SkBlitMask.h"
11#include "SkBlitRow.h"
12#include "SkColorPriv.h"
13#include "SkDither.h"
14#include "SkMathPriv.h"
15#include "SkUtils.h"
16
17#include "SkCachePreload_arm.h"
18#include "SkColor_opts_neon.h"
19#include <arm_neon.h>
20
21void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
22                           const SkPMColor* SK_RESTRICT src, int count,
23                           U8CPU alpha, int /*x*/, int /*y*/) {
24    SkASSERT(255 == alpha);
25
26    while (count >= 8) {
27        uint8x8x4_t vsrc;
28        uint16x8_t vdst;
29
30        // Load
31        vsrc = vld4_u8((uint8_t*)src);
32
33        // Convert src to 565
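        // vshll/vsri build RGB565 in place: red's top 5 bits land in [15:11],
        // then green's top 6 bits are inserted at [10:5], then blue's top 5
        // bits at [4:0], so no separate mask/OR steps are needed.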
34        vdst = vshll_n_u8(vsrc.val[NEON_R], 8);
35        vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_G], 8), 5);
36        vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_B], 8), 5+6);
37
38        // Store
39        vst1q_u16(dst, vdst);
40
41        // Prepare next iteration
42        dst += 8;
43        src += 8;
44        count -= 8;
45    }
46
47    // Leftovers
48    while (count > 0) {
49        SkPMColor c = *src++;
50        SkPMColorAssert(c);
51        *dst = SkPixel32ToPixel16_ToU16(c);
52        dst++;
53        count--;
54    }
55}
56
57void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
58                           const SkPMColor* SK_RESTRICT src, int count,
59                           U8CPU alpha, int /*x*/, int /*y*/) {
60    SkASSERT(255 == alpha);
61
62    if (count >= 8) {
63        uint16_t* SK_RESTRICT keep_dst = 0;
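        /* The asm below is software-pipelined: each iteration loads the next 8
         * src/dst pixels while storing the previous iteration's result through
         * keep_dst. The first pass advances the pointers by count % 8 (or by 8
         * if count is already a multiple of 8), so the remaining work divides
         * into whole groups of 8; overlapping pixels are simply recomputed.
         */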
64
65        asm volatile (
66                      "ands       ip, %[count], #7            \n\t"
67                      "vmov.u8    d31, #1<<7                  \n\t"
68                      "vld1.16    {q12}, [%[dst]]             \n\t"
69                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
70                      // Thumb does not support the standard ARM conditional
71                      // instructions but instead requires the 'it' instruction
72                      // to signal conditional execution
73                      "it eq                                  \n\t"
74                      "moveq      ip, #8                      \n\t"
75                      "mov        %[keep_dst], %[dst]         \n\t"
76
77                      "add        %[src], %[src], ip, LSL#2   \n\t"
78                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
79                      "subs       %[count], %[count], ip      \n\t"
80                      "b          9f                          \n\t"
81                      // LOOP
82                      "2:                                         \n\t"
83
84                      "vld1.16    {q12}, [%[dst]]!            \n\t"
85                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
86                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
87                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
88                      "subs       %[count], %[count], #8      \n\t"
89                      "9:                                         \n\t"
90                      "pld        [%[dst],#32]                \n\t"
91                      // expand 0565 q12 to 8888 {d4-d7}
92                      "vmovn.u16  d4, q12                     \n\t"
93                      "vshr.u16   q11, q12, #5                \n\t"
94                      "vshr.u16   q10, q12, #6+5              \n\t"
95                      "vmovn.u16  d5, q11                     \n\t"
96                      "vmovn.u16  d6, q10                     \n\t"
97                      "vshl.u8    d4, d4, #3                  \n\t"
98                      "vshl.u8    d5, d5, #2                  \n\t"
99                      "vshl.u8    d6, d6, #3                  \n\t"
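                      // now d4 = dst blue << 3, d5 = dst green << 2, d6 = dst red << 3
                      // (each 565 channel widened into its own 8-bit lane)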
100
101                      "vmovl.u8   q14, d31                    \n\t"
102                      "vmovl.u8   q13, d31                    \n\t"
103                      "vmovl.u8   q12, d31                    \n\t"
104
105                      // duplicate in 4/2/1 & 8pix vsns
106                      "vmvn.8     d30, d3                     \n\t"
107                      "vmlal.u8   q14, d30, d6                \n\t"
108                      "vmlal.u8   q13, d30, d5                \n\t"
109                      "vmlal.u8   q12, d30, d4                \n\t"
110                      "vshr.u16   q8, q14, #5                 \n\t"
111                      "vshr.u16   q9, q13, #6                 \n\t"
112                      "vaddhn.u16 d6, q14, q8                 \n\t"
113                      "vshr.u16   q8, q12, #5                 \n\t"
114                      "vaddhn.u16 d5, q13, q9                 \n\t"
115                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
116                      "vaddhn.u16 d4, q12, q8                 \n\t"
117                      // intentionally don't calculate alpha
118                      // result in d4-d6
119
120                      "vqadd.u8   d5, d5, d1                  \n\t"
121                      "vqadd.u8   d4, d4, d2                  \n\t"
122
123                      // pack 8888 {d4-d6} to 0565 q10
124                      "vshll.u8   q10, d6, #8                 \n\t"
125                      "vshll.u8   q3, d5, #8                  \n\t"
126                      "vshll.u8   q2, d4, #8                  \n\t"
127                      "vsri.u16   q10, q3, #5                 \n\t"
128                      "vsri.u16   q10, q2, #11                \n\t"
129
130                      "bne        2b                          \n\t"
131
132                      "1:                                         \n\t"
133                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
134                      : [count] "+r" (count)
135                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
136                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
137                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
138                      "d30","d31"
139                      );
140    }
141    else
142    {   // handle count < 8
143        uint16_t* SK_RESTRICT keep_dst = 0;
144
145        asm volatile (
146                      "vmov.u8    d31, #1<<7                  \n\t"
147                      "mov        %[keep_dst], %[dst]         \n\t"
148
149                      "tst        %[count], #4                \n\t"
150                      "beq        14f                         \n\t"
151                      "vld1.16    {d25}, [%[dst]]!            \n\t"
152                      "vld1.32    {q1}, [%[src]]!             \n\t"
153
154                      "14:                                        \n\t"
155                      "tst        %[count], #2                \n\t"
156                      "beq        12f                         \n\t"
157                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
158                      "vld1.32    {d1}, [%[src]]!             \n\t"
159
160                      "12:                                        \n\t"
161                      "tst        %[count], #1                \n\t"
162                      "beq        11f                         \n\t"
163                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
164                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"
165
166                      "11:                                        \n\t"
167                      // unzips achieve the same as a vld4 operation
168                      "vuzpq.u16  q0, q1                      \n\t"
169                      "vuzp.u8    d0, d1                      \n\t"
170                      "vuzp.u8    d2, d3                      \n\t"
171                      // expand 0565 q12 to 8888 {d4-d7}
172                      "vmovn.u16  d4, q12                     \n\t"
173                      "vshr.u16   q11, q12, #5                \n\t"
174                      "vshr.u16   q10, q12, #6+5              \n\t"
175                      "vmovn.u16  d5, q11                     \n\t"
176                      "vmovn.u16  d6, q10                     \n\t"
177                      "vshl.u8    d4, d4, #3                  \n\t"
178                      "vshl.u8    d5, d5, #2                  \n\t"
179                      "vshl.u8    d6, d6, #3                  \n\t"
180
181                      "vmovl.u8   q14, d31                    \n\t"
182                      "vmovl.u8   q13, d31                    \n\t"
183                      "vmovl.u8   q12, d31                    \n\t"
184
185                      // duplicate in 4/2/1 & 8pix vsns
186                      "vmvn.8     d30, d3                     \n\t"
187                      "vmlal.u8   q14, d30, d6                \n\t"
188                      "vmlal.u8   q13, d30, d5                \n\t"
189                      "vmlal.u8   q12, d30, d4                \n\t"
190                      "vshr.u16   q8, q14, #5                 \n\t"
191                      "vshr.u16   q9, q13, #6                 \n\t"
192                      "vaddhn.u16 d6, q14, q8                 \n\t"
193                      "vshr.u16   q8, q12, #5                 \n\t"
194                      "vaddhn.u16 d5, q13, q9                 \n\t"
195                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
196                      "vaddhn.u16 d4, q12, q8                 \n\t"
197                      // intentionally don't calculate alpha
198                      // result in d4-d6
199
200                      "vqadd.u8   d5, d5, d1                  \n\t"
201                      "vqadd.u8   d4, d4, d2                  \n\t"
202
203                      // pack 8888 {d4-d6} to 0565 q10
204                      "vshll.u8   q10, d6, #8                 \n\t"
205                      "vshll.u8   q3, d5, #8                  \n\t"
206                      "vshll.u8   q2, d4, #8                  \n\t"
207                      "vsri.u16   q10, q3, #5                 \n\t"
208                      "vsri.u16   q10, q2, #11                \n\t"
209
210                      // store
211                      "tst        %[count], #4                \n\t"
212                      "beq        24f                         \n\t"
213                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
214
215                      "24:                                        \n\t"
216                      "tst        %[count], #2                \n\t"
217                      "beq        22f                         \n\t"
218                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
219
220                      "22:                                        \n\t"
221                      "tst        %[count], #1                \n\t"
222                      "beq        21f                         \n\t"
223                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
224
225                      "21:                                        \n\t"
226                      : [count] "+r" (count)
227                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
228                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
229                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
230                      "d30","d31"
231                      );
232    }
233}
234
235void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
236                          const SkPMColor* SK_RESTRICT src, int count,
237                          U8CPU alpha, int /*x*/, int /*y*/) {
238
239    U8CPU alpha_for_asm = alpha;
240
241    asm volatile (
242    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
243     * the original in two respects:
244     *  1. The results have a few mismatches compared to the original code. These mismatches
245     *     never exceed 1. It's possible to improve accuracy vs. a floating point
246     *     implementation by introducing rounding right shifts (vrshr) for the final stage.
247     *     Rounding is not present in the code below, because although results would be closer
248     *     to a floating point implementation, the number of mismatches compared to the
249     *     original code would be far greater.
250     *  2. On certain inputs, the original code can overflow, causing colour channels to
251     *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
252     *     to affect another.
253     */
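     /* Per pixel this computes roughly (cf. the C leftovers loop below):
      *   dst_scale = 255 - ((src_alpha * src_scale) >> 8)
      *   channel   = (src_565 * src_scale + dst_565 * dst_scale) >> 8
      * where src_scale = alpha + 1; the >> 8 steps stand in for the
      * SkMulDiv255Round / SkDiv255Round calls used by the portable code.
      */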
254
255#if 1
256        /* reflects SkAlpha255To256()'s change from a + (a>>7) to a + 1 */
257                  "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
258#else
259                  "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
260#endif
261                  "vmov.u16   q3, #255                        \n\t"   // set up constant
262                  "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
263                  "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
264                  "beq        2f                              \n\t"   // if count8 == 0, exit
265                  "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask
266
267                  "1:                                             \n\t"
268                  "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
269                  "subs       r4, r4, #1                      \n\t"   // decrement loop counter
270                  "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
271                  //  and deinterleave
272
273                  "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
274                  "vand       q10, q0, q15                    \n\t"   // extract blue
275                  "vshr.u16   q8, q0, #11                     \n\t"   // extract red
276                  "vshr.u16   q9, q9, #10                     \n\t"   // extract green
277                  // dstrgb = {q8, q9, q10}
278
279                  "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
280                  "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
281                  "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range
282
283                  "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
284                  "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
285                  "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
286                  "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
287                  // srcrgba = {q11, q12, q13, q14}
288
289                  "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
290                  "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
291                  "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
292                  "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale
293
294                  "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
295                  "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
296                  // dst_scale = q2
297
298                  "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
299                  "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
300                  "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale
301
302#if 1
303    // trying for a better match with SkDiv255Round(a)
304    // C alg is:  a += 128; a = (a + (a>>8)) >> 8
305    // we'll use just a rounding shift [q2 is available for scratch]
306                  "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
307                  "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
308                  "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
309#else
310    // arm's original "truncating divide by 256"
311                  "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
312                  "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
313                  "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
314#endif
315
316                  "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
317                  "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
318                  "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr
319
320                  "bne        1b                              \n\t"   // if counter != 0, loop
321                  "2:                                             \n\t"   // exit
322
323                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
324                  :
325                  : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
326                  );
327
328    count &= 7;
329    if (count > 0) {
330        do {
331            SkPMColor sc = *src++;
332            if (sc) {
333                uint16_t dc = *dst;
334                unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
335                unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
336                unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
337                unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
338                *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
339            }
340            dst += 1;
341        } while (--count != 0);
342    }
343}
344
345/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
346 * each dither value is spaced out into byte lanes, and repeated
347 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
348 * start of each row.
349 */
350static const uint8_t gDitherMatrix_Neon[48] = {
351    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
352    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
353    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
354    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
355
356};
357
358void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
359                                int count, U8CPU alpha, int x, int y)
360{
361    /* select row and offset for dither array */
362    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
363
364    /* rescale alpha to range 0 - 256 */
365    int scale = SkAlpha255To256(alpha);
366
367    asm volatile (
368                  "vld1.8         {d31}, [%[dstart]]              \n\t"   // load dither values
369                  "vshr.u8        d30, d31, #1                    \n\t"   // calc. green dither values
370                  "vdup.16        d6, %[scale]                    \n\t"   // duplicate scale into neon reg
371                  "vmov.i8        d29, #0x3f                      \n\t"   // set up green mask
372                  "vmov.i8        d28, #0x1f                      \n\t"   // set up blue mask
373                  "1:                                                 \n\t"
374                  "vld4.8         {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
375                  "vshr.u8        d22, d0, #5                     \n\t"   // calc. red >> 5
376                  "vshr.u8        d23, d1, #6                     \n\t"   // calc. green >> 6
377                  "vshr.u8        d24, d2, #5                     \n\t"   // calc. blue >> 5
378                  "vaddl.u8       q8, d0, d31                     \n\t"   // add in dither to red and widen
379                  "vaddl.u8       q9, d1, d30                     \n\t"   // add in dither to green and widen
380                  "vaddl.u8       q10, d2, d31                    \n\t"   // add in dither to blue and widen
381                  "vsubw.u8       q8, q8, d22                     \n\t"   // sub shifted red from result
382                  "vsubw.u8       q9, q9, d23                     \n\t"   // sub shifted green from result
383                  "vsubw.u8       q10, q10, d24                   \n\t"   // sub shifted blue from result
384                  "vshrn.i16      d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
385                  "vshrn.i16      d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
386                  "vshrn.i16      d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
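                  // d22/d23/d24 now hold the dithered src channels -- the NEON
                  // counterpart of SkDITHER_R32To565 / _G32To565 / _B32To565
                  // used in the scalar residual loop below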
387                  // load 8 pixels from dst, extract rgb
388                  "vld1.16        {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
389                  "vshrn.i16      d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
390                  "vmovn.i16      d18, q0                         \n\t"   // narrow to get blue as bytes
391                  "vshr.u16       q0, q0, #11                     \n\t"   // shift down to extract red
392                  "vand           d17, d17, d29                   \n\t"   // and green with green mask
393                  "vand           d18, d18, d28                   \n\t"   // and blue with blue mask
394                  "vmovn.i16      d16, q0                         \n\t"   // narrow to get red as bytes
395                  // src = {d22 (r), d23 (g), d24 (b)}
396                  // dst = {d16 (r), d17 (g), d18 (b)}
397                  // subtract dst from src and widen
398                  "vsubl.s8       q0, d22, d16                    \n\t"   // red:   src - dst, widened
399                  "vsubl.s8       q1, d23, d17                    \n\t"   // green: src - dst, widened
400                  "vsubl.s8       q2, d24, d18                    \n\t"   // blue:  src - dst, widened
401                  // multiply diffs by scale and shift
402                  "vmul.i16       q0, q0, d6[0]                   \n\t"   // multiply red by scale
403                  "vmul.i16       q1, q1, d6[0]                   \n\t"   // multiply green by scale
404                  "vmul.i16       q2, q2, d6[0]                   \n\t"   // multiply blue by scale
405                  "subs           %[count], %[count], #8          \n\t"   // decrement loop counter
406                  "vshrn.i16      d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
407                  "vshrn.i16      d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
408                  "vshrn.i16      d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
409                  // add dst to result
410                  "vaddl.s8       q0, d0, d16                     \n\t"   // add dst to red
411                  "vaddl.s8       q1, d2, d17                     \n\t"   // add dst to green
412                  "vaddl.s8       q2, d4, d18                     \n\t"   // add dst to blue
413                  // put result into 565 format
414                  "vsli.i16       q2, q1, #5                      \n\t"   // shift up green and insert into blue
415                  "vsli.i16       q2, q0, #11                     \n\t"   // shift up red and insert into blue
416                  "vst1.16        {d4, d5}, [%[dst]]!             \n\t"   // store result
417                  "bgt            1b                              \n\t"   // loop if count > 0
418                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
419                  : [dstart] "r" (dstart), [scale] "r" (scale)
420                  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
421                  );
422
423    DITHER_565_SCAN(y);
424
425    while((count & 7) > 0)
426    {
427        SkPMColor c = *src++;
428
429        int dither = DITHER_VALUE(x);
430        int sr = SkGetPackedR32(c);
431        int sg = SkGetPackedG32(c);
432        int sb = SkGetPackedB32(c);
433        sr = SkDITHER_R32To565(sr, dither);
434        sg = SkDITHER_G32To565(sg, dither);
435        sb = SkDITHER_B32To565(sb, dither);
436
437        uint16_t d = *dst;
438        *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
439                             SkAlphaBlend(sg, SkGetPackedG16(d), scale),
440                             SkAlphaBlend(sb, SkGetPackedB16(d), scale));
441        DITHER_INC_X(x);
442        count--;
443    }
444}
445
446void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
447                                const SkPMColor* SK_RESTRICT src,
448                                int count, U8CPU alpha) {
449
450    SkASSERT(255 == alpha);
451    if (count > 0) {
452
453
454    uint8x8_t alpha_mask;
455
456    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
457    alpha_mask = vld1_u8(alpha_mask_setup);
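    /* vtbl1 with this mask replicates the byte at offset 3 of each 32-bit pixel
     * (the alpha byte when SK_A32_SHIFT == 24) across that pixel's four byte
     * lanes, so every channel below is scaled by its own pixel's (256 - alpha).
     */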
458
459    /* do the NEON unrolled code */
460#define    UNROLL    4
461    while (count >= UNROLL) {
462        uint8x8_t src_raw, dst_raw, dst_final;
463        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
464
465        /* The two prefetches below may make the code slightly
466         * slower for small values of count but are worth having
467         * in the general case.
468         */
469        __builtin_prefetch(src+32);
470        __builtin_prefetch(dst+32);
471
472        /* get the source */
473        src_raw = vreinterpret_u8_u32(vld1_u32(src));
474#if    UNROLL > 2
475        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
476#endif
477
478        /* get and hold the dst too */
479        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
480#if    UNROLL > 2
481        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
482#endif
483
484    /* 1st and 2nd bits of the unrolling */
485    {
486        uint8x8_t dst_cooked;
487        uint16x8_t dst_wide;
488        uint8x8_t alpha_narrow;
489        uint16x8_t alpha_wide;
490
491        /* get the alphas spread out properly */
492        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
493        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
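        /* 256 - alpha == SkAlpha255To256(255 - alpha); multiplying dst by this
         * and shifting right by 8 is the same math SkPMSrcOver() performs in
         * the residual loop below. */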
494
495        /* spread the dest */
496        dst_wide = vmovl_u8(dst_raw);
497
498        /* alpha mul the dest */
499        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
500        dst_cooked = vshrn_n_u16(dst_wide, 8);
501
502        /* sum -- ignoring any byte lane overflows */
503        dst_final = vadd_u8(src_raw, dst_cooked);
504    }
505
506#if    UNROLL > 2
507    /* the 3rd and 4th bits of our unrolling */
508    {
509        uint8x8_t dst_cooked;
510        uint16x8_t dst_wide;
511        uint8x8_t alpha_narrow;
512        uint16x8_t alpha_wide;
513
514        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
515        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
516
517        /* spread the dest */
518        dst_wide = vmovl_u8(dst_raw_2);
519
520        /* alpha mul the dest */
521        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
522        dst_cooked = vshrn_n_u16(dst_wide, 8);
523
524        /* sum -- ignoring any byte lane overflows */
525        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
526    }
527#endif
528
529        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
530#if    UNROLL > 2
531        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
532#endif
533
534        src += UNROLL;
535        dst += UNROLL;
536        count -= UNROLL;
537    }
538#undef    UNROLL
539
540    /* do any residual iterations */
541        while (--count >= 0) {
542            *dst = SkPMSrcOver(*src, *dst);
543            src += 1;
544            dst += 1;
545        }
546    }
547}
548
549void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
550                                const SkPMColor* SK_RESTRICT src,
551                                int count, U8CPU alpha) {
552    SkASSERT(255 == alpha);
553
554    if (count <= 0)
555    return;
556
557    /* Use these to check if src is transparent or opaque */
558    const unsigned int ALPHA_OPAQ  = 0xFF000000;
559    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
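    /* With SK_A32_SHIFT == 24 (the only configuration that selects this proc,
     * see the proc table at the bottom of this file), a pixel is fully
     * transparent iff it compares <= 0x00FFFFFF and fully opaque iff it
     * compares >= 0xFF000000. */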
560
561#define UNROLL  4
562    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
563    const SkPMColor* SK_RESTRICT src_temp = src;
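    /* Stop the unrolled loops UNROLL + 1 pixels early so the 4-pixel loads and
     * the src[0]/src[1] peek after each advance stay in bounds; TAIL adds the
     * UNROLL + 1 back to recover the true end. */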
564
565    /* set up the NEON variables */
566    uint8x8_t alpha_mask;
567    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
568    alpha_mask = vld1_u8(alpha_mask_setup);
569
570    uint8x8_t src_raw, dst_raw, dst_final;
571    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
572    uint8x8_t dst_cooked;
573    uint16x8_t dst_wide;
574    uint8x8_t alpha_narrow;
575    uint16x8_t alpha_wide;
576
577    /* choose the first processing type */
578    if( src >= src_end)
579        goto TAIL;
580    if(*src <= ALPHA_TRANS)
581        goto ALPHA_0;
582    if(*src >= ALPHA_OPAQ)
583        goto ALPHA_255;
584    /* fall-thru */
585
586ALPHA_1_TO_254:
587    do {
588
589        /* get the source */
590        src_raw = vreinterpret_u8_u32(vld1_u32(src));
591        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
592
593        /* get and hold the dst too */
594        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
595        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
596
597
598        /* get the alphas spread out properly */
599        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
600        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
601        /* we collapsed (255-a)+1 ... */
602        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
603
604        /* spread the dest */
605        dst_wide = vmovl_u8(dst_raw);
606
607        /* alpha mul the dest */
608        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
609        dst_cooked = vshrn_n_u16(dst_wide, 8);
610
611        /* sum -- ignoring any byte lane overflows */
612        dst_final = vadd_u8(src_raw, dst_cooked);
613
614        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
615        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
616        /* we collapsed (255-a)+1 ... */
617        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
618
619        /* spread the dest */
620        dst_wide = vmovl_u8(dst_raw_2);
621
622        /* alpha mul the dest */
623        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
624        dst_cooked = vshrn_n_u16(dst_wide, 8);
625
626        /* sum -- ignoring any byte lane overflows */
627        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
628
629        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
630        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
631
632        src += UNROLL;
633        dst += UNROLL;
634
635        /* if 2 of the next pixels aren't between 1 and 254
636        it might make sense to go to the optimized loops */
637        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
638            break;
639
640    } while(src < src_end);
641
642    if (src >= src_end)
643        goto TAIL;
644
645    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
646        goto ALPHA_255;
647
648    /*fall-thru*/
649
650ALPHA_0:
651
652    /* In this state, we know the current alpha is 0 and
653     we optimize for the next alpha also being zero. */
654    src_temp = src;  //so we don't have to increment dst every time
655    do {
656        if(*(++src) > ALPHA_TRANS)
657            break;
658        if(*(++src) > ALPHA_TRANS)
659            break;
660        if(*(++src) > ALPHA_TRANS)
661            break;
662        if(*(++src) > ALPHA_TRANS)
663            break;
664    } while(src < src_end);
665
666    dst += (src - src_temp);
667
668    /* no longer alpha 0, so determine where to go next. */
669    if( src >= src_end)
670        goto TAIL;
671    if(*src >= ALPHA_OPAQ)
672        goto ALPHA_255;
673    else
674        goto ALPHA_1_TO_254;
675
676ALPHA_255:
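    /* the bitwise AND of four pixels is >= 0xFF000000 only when all four have
     * alpha == 0xFF, so this copies runs of fully opaque pixels 4 at a time */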
677    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
678        dst[0]=src[0];
679        dst[1]=src[1];
680        dst[2]=src[2];
681        dst[3]=src[3];
682        src+=UNROLL;
683        dst+=UNROLL;
684        if(src >= src_end)
685            goto TAIL;
686    }
687
688    //Handle remainder.
689    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
690        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
691            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
692        }
693    }
694
695    if( src >= src_end)
696        goto TAIL;
697    if(*src <= ALPHA_TRANS)
698        goto ALPHA_0;
699    else
700        goto ALPHA_1_TO_254;
701
702TAIL:
703    /* do any residual iterations */
704    src_end += UNROLL + 1;  // go to the real end
705    while(src != src_end) {
706        if( *src != 0 ) {
707            if( *src >= ALPHA_OPAQ ) {
708                *dst = *src;
709            }
710            else {
711                *dst = SkPMSrcOver(*src, *dst);
712            }
713        }
714        src++;
715        dst++;
716    }
717
718#undef    UNROLL
719    return;
720}
721
722/* Neon version of S32_Blend_BlitRow32()
723 * portable version is in src/core/SkBlitRow_D32.cpp
724 */
725void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
726                              const SkPMColor* SK_RESTRICT src,
727                              int count, U8CPU alpha) {
728    SkASSERT(alpha <= 255);
729    if (count > 0) {
730        uint16_t src_scale = SkAlpha255To256(alpha);
731        uint16_t dst_scale = 256 - src_scale;
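        /* per pixel: result = (src * src_scale + dst * dst_scale) >> 8 on every
         * byte lane -- the NEON equivalent of the scalar
         * SkAlphaMulQ(src, src_scale) + SkAlphaMulQ(dst, dst_scale) fallback
         * below, except that the sum happens before the >> 8 */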
732
733    /* run them N at a time through the NEON unit */
734    /* note that each pixel is 4 bytes, and each byte is treated exactly the
735     * same, so we can work under that guise. We *do* know that src and dst
736     * are 32-bit aligned quantities, so we can specify that on
737     * the load/store ops and do a neon 'reinterpret' to get us to
738     * byte-sized (pun intended) pieces that we widen/multiply/shift.
739     * We're limited to 128 bits in the wide ops, which is 8x16 bits,
740     * or a pair of 32-bit src/dst pixels per iteration.
741     */
742    /* we *could* manually unroll this loop so that we load 128 bits
743     * (as a pair of 64s) from each of src and dst, processing them
744     * in pieces. This might give us a little better management of
745     * the memory latency, but my initial attempts here did not
746     * produce an instruction stream that looked all that nice.
747     */
748#define    UNROLL    2
749    while (count >= UNROLL) {
750        uint8x8_t  src_raw, dst_raw, dst_final;
751        uint16x8_t  src_wide, dst_wide;
752
753        /* get 64 bits of src, widen it, multiply by src_scale */
754        src_raw = vreinterpret_u8_u32(vld1_u32(src));
755        src_wide = vmovl_u8(src_raw);
756        /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
757        src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
758
759        /* ditto with dst */
760        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
761        dst_wide = vmovl_u8(dst_raw);
762
763        /* combine add with dst multiply into mul-accumulate */
764        dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
765
766        dst_final = vshrn_n_u16(dst_wide, 8);
767        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
768
769        src += UNROLL;
770        dst += UNROLL;
771        count -= UNROLL;
772    }
773    /* RBE: well, I don't like how gcc manages src/dst across the above
774     * loop: it's constantly calculating src+bias, dst+bias, and it only
775     * adjusts the real ones when we leave the loop. Not sure why
776     * it's "hoisting down" (hoisting implies above in my lexicon ;))
777     * the adjustments to src/dst/count, but it does...
778     * (might be SSA-style internal logic).
779     */
780
781#if    UNROLL == 2
782    if (count == 1) {
783            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
784    }
785#else
786    if (count > 0) {
787            do {
788                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
789                src += 1;
790                dst += 1;
791            } while (--count > 0);
792    }
793#endif
794
795#undef    UNROLL
796    }
797}
798
799void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
800                         const SkPMColor* SK_RESTRICT src,
801                         int count, U8CPU alpha) {
802
803    SkASSERT(255 >= alpha);
804
805    if (count <= 0) {
806        return;
807    }
808
809    unsigned alpha256 = SkAlpha255To256(alpha);
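    /* Per pixel: dst_scale = 256 - ((src_alpha * alpha256) >> 8), and each
     * channel becomes (src * alpha256) >> 8 + (dst * dst_scale) >> 8, with the
     * two products shifted separately below before being summed. */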
810
811    // First deal with odd counts
812    if (count & 1) {
813        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
814        uint16x8_t vdst_wide, vsrc_wide;
815        unsigned dst_scale;
816
817        // Load
818        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
819        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
820
821        // Calc dst_scale
822        dst_scale = vget_lane_u8(vsrc, 3);
823        dst_scale *= alpha256;
824        dst_scale >>= 8;
825        dst_scale = 256 - dst_scale;
826
827        // Process src
828        vsrc_wide = vmovl_u8(vsrc);
829        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
830
831        // Process dst
832        vdst_wide = vmovl_u8(vdst);
833        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
834
835        // Combine
836        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
837
838        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
839        dst++;
840        src++;
841        count--;
842    }
843
844    if (count) {
845        uint8x8_t alpha_mask;
846        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
847        alpha_mask = vld1_u8(alpha_mask_setup);
848
849        do {
850
851            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
852            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
853
854            __builtin_prefetch(src+32);
855            __builtin_prefetch(dst+32);
856
857            // Load
858            vsrc = vreinterpret_u8_u32(vld1_u32(src));
859            vdst = vreinterpret_u8_u32(vld1_u32(dst));
860
861            // Prepare src_scale
862            vsrc_scale = vdupq_n_u16(alpha256);
863
864            // Calc dst_scale
865            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
866            vdst_scale = vmovl_u8(vsrc_alphas);
867            vdst_scale *= vsrc_scale;
868            vdst_scale = vshrq_n_u16(vdst_scale, 8);
869            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
870
871            // Process src
872            vsrc_wide = vmovl_u8(vsrc);
873            vsrc_wide *= vsrc_scale;
874
875            // Process dst
876            vdst_wide = vmovl_u8(vdst);
877            vdst_wide *= vdst_scale;
878
879            // Combine
880            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
881
882            vst1_u32(dst, vreinterpret_u32_u8(vres));
883
884            src += 2;
885            dst += 2;
886            count -= 2;
887        } while(count);
888    }
889}
890
891///////////////////////////////////////////////////////////////////////////////
892
893#undef    DEBUG_OPAQUE_DITHER
894
895#if    defined(DEBUG_OPAQUE_DITHER)
896static void showme8(char *str, void *p, int len)
897{
898    static char buf[256];
899    char tbuf[32];
900    int i;
901    char *pc = (char*) p;
902    sprintf(buf,"%8s:", str);
903    for(i=0;i<len;i++) {
904        sprintf(tbuf, "   %02x", pc[i]);
905        strcat(buf, tbuf);
906    }
907    SkDebugf("%s\n", buf);
908}
909static void showme16(char *str, void *p, int len)
910{
911    static char buf[256];
912    char tbuf[32];
913    int i;
914    uint16_t *pc = (uint16_t*) p;
915    sprintf(buf,"%8s:", str);
916    len = (len / sizeof(uint16_t));    /* passed as bytes */
917    for(i=0;i<len;i++) {
918        sprintf(tbuf, " %04x", pc[i]);
919        strcat(buf, tbuf);
920    }
921    SkDebugf("%s\n", buf);
922}
923#endif
924
925void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
926                                   const SkPMColor* SK_RESTRICT src,
927                                   int count, U8CPU alpha, int x, int y) {
928    SkASSERT(255 == alpha);
929
930#define    UNROLL    8
931
932    if (count >= UNROLL) {
933    uint8x8_t dbase;
934
935#if    defined(DEBUG_OPAQUE_DITHER)
936    uint16_t tmpbuf[UNROLL];
937    int td[UNROLL];
938    int tdv[UNROLL];
939    int ta[UNROLL];
940    int tap[UNROLL];
941    uint16_t in_dst[UNROLL];
942    int offset = 0;
943    int noisy = 0;
944#endif
945
946    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
947    dbase = vld1_u8(dstart);
948
949        do {
950        uint8x8_t sr, sg, sb, sa, d;
951        uint16x8_t dst8, scale8, alpha8;
952        uint16x8_t dst_r, dst_g, dst_b;
953
954#if    defined(DEBUG_OPAQUE_DITHER)
955    /* calculate 8 elements worth into a temp buffer */
956    {
957      int my_y = y;
958      int my_x = x;
959      SkPMColor* my_src = (SkPMColor*)src;
960      uint16_t* my_dst = dst;
961      int i;
962
963          DITHER_565_SCAN(my_y);
964          for(i=0;i<UNROLL;i++) {
965            SkPMColor c = *my_src++;
966            SkPMColorAssert(c);
967            if (c) {
968                unsigned a = SkGetPackedA32(c);
969
970                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
971        tdv[i] = DITHER_VALUE(my_x);
972        ta[i] = a;
973        tap[i] = SkAlpha255To256(a);
974        td[i] = d;
975
976                unsigned sr = SkGetPackedR32(c);
977                unsigned sg = SkGetPackedG32(c);
978                unsigned sb = SkGetPackedB32(c);
979                sr = SkDITHER_R32_FOR_565(sr, d);
980                sg = SkDITHER_G32_FOR_565(sg, d);
981                sb = SkDITHER_B32_FOR_565(sb, d);
982
983                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
984                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
985                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
986                // now src and dst expanded are in g:11 r:10 x:1 b:10
987                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
988        td[i] = d;
989
990            } else {
991        tmpbuf[i] = *my_dst;
992        ta[i] = tdv[i] = td[i] = 0xbeef;
993        }
994        in_dst[i] = *my_dst;
995            my_dst += 1;
996            DITHER_INC_X(my_x);
997          }
998    }
999#endif
1000
1001        /* source is in ABGR */
1002        {
1003        register uint8x8_t d0 asm("d0");
1004        register uint8x8_t d1 asm("d1");
1005        register uint8x8_t d2 asm("d2");
1006        register uint8x8_t d3 asm("d3");
1007
1008        asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1009            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
1010            : "r" (src)
1011                    );
1012            sr = d0; sg = d1; sb = d2; sa = d3;
1013        }
1014
1015        /* calculate 'd', which will be 0..7 */
1016        /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
1017#if defined(SK_BUILD_FOR_ANDROID)
1018        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1019        alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
1020#else
1021        alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
1022#endif
1023        alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
1024        d = vshrn_n_u16(alpha8, 8);    /* narrowing too */
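        /* d == SkAlphaMul(dither, SkAlpha255To256(alpha)) per pixel, range 0..7,
         * matching the scalar residual loop at the bottom of this function */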
1025
1026        /* sr = sr - (sr>>5) + d */
1027        /* watching for 8-bit overflow.  d is 0..7; risky range of
1028         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1029         * safe as long as we do ((sr - (sr>>5)) + d) */
1030        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1031        sr = vadd_u8(sr, d);
1032
1033        /* sb = sb - (sb>>5) + d */
1034        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1035        sb = vadd_u8(sb, d);
1036
1037        /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
1038        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1039        sg = vadd_u8(sg, vshr_n_u8(d,1));
1040
1041        /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
1042        dst8 = vld1q_u16(dst);
1043        dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
1044        dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
1045        dst_r = vshrq_n_u16(dst8,11);    /* clearing hi bits */
1046
1047        /* blend */
1048#if 1
1049        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1050        /* originally 255-sa + 1 */
1051        scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1052#else
1053        scale8 = vsubw_u8(vdupq_n_u16(255), sa);
1054        scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
1055#endif
1056
1057#if 1
1058        /* combine the addq and mul, save 3 insns */
1059        scale8 = vshrq_n_u16(scale8, 3);
1060        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1061        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1062        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1063#else
1064        /* known correct, but +3 insns over above */
1065        scale8 = vshrq_n_u16(scale8, 3);
1066        dst_b = vmulq_u16(dst_b, scale8);
1067        dst_g = vmulq_u16(dst_g, scale8);
1068        dst_r = vmulq_u16(dst_r, scale8);
1069
1070        /* combine */
1071        /* NB: vshll widens, need to preserve those bits */
1072        dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
1073        dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
1074        dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
1075#endif
1076
1077        /* repack to store */
1078        dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
1079        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1080        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1081
1082        vst1q_u16(dst, dst8);
1083
1084#if    defined(DEBUG_OPAQUE_DITHER)
1085        /* verify my 8 elements match the temp buffer */
1086    {
1087       int i, bad=0;
1088       static int invocation;
1089
1090       for (i=0;i<UNROLL;i++)
1091        if (tmpbuf[i] != dst[i]) bad=1;
1092       if (bad) {
1093        SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1094            invocation, offset);
1095        SkDebugf("  alpha 0x%x\n", alpha);
1096        for (i=0;i<UNROLL;i++)
1097            SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1098            i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
1099            dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
1100
1101        showme16("alpha8", &alpha8, sizeof(alpha8));
1102        showme16("scale8", &scale8, sizeof(scale8));
1103        showme8("d", &d, sizeof(d));
1104        showme16("dst8", &dst8, sizeof(dst8));
1105        showme16("dst_b", &dst_b, sizeof(dst_b));
1106        showme16("dst_g", &dst_g, sizeof(dst_g));
1107        showme16("dst_r", &dst_r, sizeof(dst_r));
1108        showme8("sb", &sb, sizeof(sb));
1109        showme8("sg", &sg, sizeof(sg));
1110        showme8("sr", &sr, sizeof(sr));
1111
1112        /* cop out */
1113        return;
1114       }
1115       offset += UNROLL;
1116       invocation++;
1117    }
1118#endif
1119
1120            dst += UNROLL;
1121        src += UNROLL;
1122        count -= UNROLL;
1123        /* skip x += UNROLL, since it's unchanged mod-4 */
1124        } while (count >= UNROLL);
1125    }
1126#undef    UNROLL
1127
1128    /* residuals */
1129    if (count > 0) {
1130        DITHER_565_SCAN(y);
1131        do {
1132            SkPMColor c = *src++;
1133            SkPMColorAssert(c);
1134            if (c) {
1135                unsigned a = SkGetPackedA32(c);
1136
1137                // dither and alpha are just temporary variables to work around
1138                // an ICE in debug.
1139                unsigned dither = DITHER_VALUE(x);
1140                unsigned alpha = SkAlpha255To256(a);
1141                int d = SkAlphaMul(dither, alpha);
1142
1143                unsigned sr = SkGetPackedR32(c);
1144                unsigned sg = SkGetPackedG32(c);
1145                unsigned sb = SkGetPackedB32(c);
1146                sr = SkDITHER_R32_FOR_565(sr, d);
1147                sg = SkDITHER_G32_FOR_565(sg, d);
1148                sb = SkDITHER_B32_FOR_565(sb, d);
1149
1150                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1151                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1152                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1153                // now src and dst expanded are in g:11 r:10 x:1 b:10
1154                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1155            }
1156            dst += 1;
1157            DITHER_INC_X(x);
1158        } while (--count != 0);
1159    }
1160}
1161
1162///////////////////////////////////////////////////////////////////////////////
1163
1164#undef    DEBUG_S32_OPAQUE_DITHER
1165
1166void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1167                                 const SkPMColor* SK_RESTRICT src,
1168                                 int count, U8CPU alpha, int x, int y) {
1169    SkASSERT(255 == alpha);
1170
1171#define    UNROLL    8
1172    if (count >= UNROLL) {
1173    uint8x8_t d;
1174    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1175    d = vld1_u8(dstart);
1176
1177    while (count >= UNROLL) {
1178        uint8x8_t sr, sg, sb;
1179        uint16x8_t dr, dg, db;
1180        uint16x8_t dst8;
1181
1182        {
1183        register uint8x8_t d0 asm("d0");
1184        register uint8x8_t d1 asm("d1");
1185        register uint8x8_t d2 asm("d2");
1186        register uint8x8_t d3 asm("d3");
1187
1188        asm (
1189            "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1190            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1191            :
1192        );
1193        sg = d1;
1194#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
1195        sr = d2; sb = d0;
1196#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
1197        sr = d0; sb = d2;
1198#endif
1199        }
1200        /* XXX: if we want to prefetch, hide it in the above asm()
1201         * using the gcc __builtin_prefetch(), the prefetch will
1202         * fall to the bottom of the loop -- it won't stick up
1203         * at the top of the loop, just after the vld4.
1204         */
1205
1206        // sr = sr - (sr>>5) + d
1207        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1208        dr = vaddl_u8(sr, d);
1209
1210        // sb = sb - (sb>>5) + d
1211        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1212        db = vaddl_u8(sb, d);
1213
1214        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1215        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1216        dg = vaddl_u8(sg, vshr_n_u8(d, 1));
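        // dr/dg/db hold the dithered channels widened to 16 bits; taking
        // (dr >> 3, dg >> 2, db >> 3) below packs the same 565 value that
        // SkDitherRGB32To565() produces in the residual loop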
1217
1218        // pack high bits of each into 565 format  (rgb, b is lsb)
1219        dst8 = vshrq_n_u16(db, 3);
1220        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1221        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1222
1223        // store it
1224        vst1q_u16(dst, dst8);
1225
1226#if    defined(DEBUG_S32_OPAQUE_DITHER)
1227        // always good to know if we generated good results
1228        {
1229        int i, myx = x, myy = y;
1230        DITHER_565_SCAN(myy);
1231        for (i=0;i<UNROLL;i++) {
1232            // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
1233            SkPMColor c = src[i-8];
1234            unsigned dither = DITHER_VALUE(myx);
1235            uint16_t val = SkDitherRGB32To565(c, dither);
1236            if (val != dst[i]) {
1237            SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1238                c, dither, val, dst[i], dstart[i]);
1239            }
1240            DITHER_INC_X(myx);
1241        }
1242        }
1243#endif
1244
1245        dst += UNROLL;
1246        // we don't need to increment src as the asm above has already done it
1247        count -= UNROLL;
1248        x += UNROLL;        // probably superfluous
1249    }
1250    }
1251#undef    UNROLL
1252
1253    // residuals
1254    if (count > 0) {
1255        DITHER_565_SCAN(y);
1256        do {
1257            SkPMColor c = *src++;
1258            SkPMColorAssert(c);
1259            SkASSERT(SkGetPackedA32(c) == 255);
1260
1261            unsigned dither = DITHER_VALUE(x);
1262            *dst++ = SkDitherRGB32To565(c, dither);
1263            DITHER_INC_X(x);
1264        } while (--count != 0);
1265    }
1266}
1267
1268void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
1269                      SkPMColor color) {
1270    if (count <= 0) {
1271        return;
1272    }
1273
1274    if (0 == color) {
1275        if (src != dst) {
1276            memcpy(dst, src, count * sizeof(SkPMColor));
1277        }
1278        return;
1279    }
1280
1281    unsigned colorA = SkGetPackedA32(color);
1282    if (255 == colorA) {
1283        sk_memset32(dst, color, count);
1284    } else {
1285        unsigned scale = 256 - SkAlpha255To256(colorA);
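        // Per pixel: dst = color + SkAlphaMulQ(src, scale), i.e. each src byte
        // is multiplied by scale = 256 - SkAlpha255To256(colorA), shifted right
        // by 8, and the (premultiplied) color is then added -- exactly what the
        // scalar tail loop below does.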
1286
1287        if (count >= 8) {
1288            // at the end of this assembly, count will have been decremented
1289            // to a negative value. That is, if count mod 8 = x, it will be
1290            // -8 +x coming out.
1291            asm volatile (
1292                PLD128(src, 0)
1293
1294                "vdup.32    q0, %[color]                \n\t"
1295
1296                PLD128(src, 128)
1297
1298                // scale numerical interval [0-255], so load as 8 bits
1299                "vdup.8     d2, %[scale]                \n\t"
1300
1301                PLD128(src, 256)
1302
1303                "subs       %[count], %[count], #8      \n\t"
1304
1305                PLD128(src, 384)
1306
1307                "Loop_Color32:                          \n\t"
1308
1309                // load src color, 8 pixels, 4 64 bit registers
1310                // (and increment src).
1311                "vld1.32    {d4-d7}, [%[src]]!          \n\t"
1312
1313                PLD128(src, 384)
1314
1315                // multiply long by scale, 64 bits at a time,
1316                // destination into a 128 bit register.
1317                "vmull.u8   q4, d4, d2                  \n\t"
1318                "vmull.u8   q5, d5, d2                  \n\t"
1319                "vmull.u8   q6, d6, d2                  \n\t"
1320                "vmull.u8   q7, d7, d2                  \n\t"
1321
1322                // shift the 128 bit registers, containing the 16
1323                // bit scaled values back to 8 bits, narrowing the
1324                // results to 64 bit registers.
1325                "vshrn.i16  d8, q4, #8                  \n\t"
1326                "vshrn.i16  d9, q5, #8                  \n\t"
1327                "vshrn.i16  d10, q6, #8                 \n\t"
1328                "vshrn.i16  d11, q7, #8                 \n\t"
1329
1330                // adding back the color, using 128 bit registers.
1331                "vadd.i8    q6, q4, q0                  \n\t"
1332                "vadd.i8    q7, q5, q0                  \n\t"
1333
1334                // store back the 8 calculated pixels (2 128 bit
1335                // registers), and increment dst.
1336                "vst1.32    {d12-d15}, [%[dst]]!        \n\t"
1337
1338                "subs       %[count], %[count], #8      \n\t"
1339                "bge        Loop_Color32                \n\t"
1340                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
1341                : [color] "r" (color), [scale] "r" (scale)
1342                : "cc", "memory",
1343                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1344                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
1345                          );
1346            // At this point, if we went through the inline assembly, count is
1347            // a negative value:
1348            // if the value is -8, there is no pixel left to process.
1349            // if the value is -7, there is one pixel left to process
1350            // ...
1351            // And'ing it with 7 will give us the number of pixels
1352            // left to process.
1353            count = count & 0x7;
1354        }
1355
1356        while (count > 0) {
1357            *dst = color + SkAlphaMulQ(*src, scale);
1358            src += 1;
1359            dst += 1;
1360            count--;
1361        }
1362    }
1363}
1364
1365///////////////////////////////////////////////////////////////////////////////
1366
1367const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1368    // no dither
1369    // NOTE: For the S32_D565_Blend function below, we don't have a special
1370    //       version that assumes that each source pixel is opaque. But our
1371    //       S32A is still faster than the default, so use it.
1372    S32_D565_Opaque_neon,
1373    S32A_D565_Blend_neon,   // really S32_D565_Blend
1374    S32A_D565_Opaque_neon,
1375    S32A_D565_Blend_neon,
1376
1377    // dither
1378    S32_D565_Opaque_Dither_neon,
1379    S32_D565_Blend_Dither_neon,
1380    S32A_D565_Opaque_Dither_neon,
1381    NULL,   // S32A_D565_Blend_Dither
1382};
1383
1384const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1385    NULL,   // S32_Opaque,
1386    S32_Blend_BlitRow32_neon,        // S32_Blend,
1387    /*
1388     * We have two choices for S32A_Opaque procs. One reads the src alpha
1389     * value and attempts to optimize accordingly. The optimization is
1390     * sensitive to the source content and is not a win in all cases. For
1391     * example, if there are a lot of transitions between the alpha states,
1392     * the performance will almost certainly be worse.  However, for many
1393     * common cases the performance is equivalent or better than the standard
1394     * case where we do not inspect the src alpha.
1395     */
1396#if SK_A32_SHIFT == 24
1397    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
1398    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
1399#else
1400    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
1401#endif
1402    S32A_Blend_BlitRow32_neon        // S32A_Blend
1403};
1404