SkBlitRow_opts_arm_neon.cpp revision 3f55eed73f5af405909c2c10bff179d80526d423
/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkColor_opts_neon.h"
#include <arm_neon.h>

#ifdef SK_CPU_ARM64
static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;

    return vsrc;
}

static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        "mov    %[vsrc3].8b, v3.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
          [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;
    vsrc.val[3] = vsrc_3;

    return vsrc;
}
#endif
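// Note: both wrappers pin the deinterleaving load to a single
// post-incrementing "ld4 ..., #32", advancing src by 8 pixels (32 bytes) as
// a side effect, rather than relying on how the compiler lowers the
// vld4_u8() intrinsic. The _3 variant deliberately leaves val[3] (alpha)
// unset for callers that only need RGB.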

void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        vsrc = vld4_u8((uint8_t*)src);
        src += 8;
#endif

        // Convert src to 565
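        // (565 packing: r = R8 >> 3, g = G8 >> 2, b = B8 >> 3, then
        // pixel = (r << 11) | (g << 5) | b; e.g. R8=0xFF, G8=0x80, B8=0x40
        // packs to 0xFC08.)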
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        count -= 8;
    }

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    }
}

void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    uint16x8_t vmask_blue, vscale;

    // prepare constants
    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
    vmask_blue = vmovq_n_u16(0x1F);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
        uint16x8_t vres_r, vres_g, vres_b;

        // Load src
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]!"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        }
#endif

        // Load and unpack dst
        vdst = vld1q_u16(dst);
        vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
        vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
        vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
        vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green

        // Shift src to 565 range
        vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
        vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
        vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);

        // Scale src - dst
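        // Note: this is the lerp res = dst + (((src - dst) * scale) >> 8) in
        // unsigned 16-bit lanes. When src < dst the subtraction wraps, but
        // only the low 5 (red/blue) or 6 (green) bits of each lane survive
        // the vsli packing below, so the wrapped high bits are harmless.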
        vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
        vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
        vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;

        vres_r = vshrq_n_u16(vres_r * vscale, 8);
        vres_g = vshrq_n_u16(vres_g * vscale, 8);
        vres_b = vshrq_n_u16(vres_b * vscale, 8);

        vres_r += vdst_r;
        vres_g += vdst_g;
        vres_b += vdst_b;

        // Combine
        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue

        // Store
        vst1q_u16(dst, vres_b);
        dst += 8;
        count -= 8;
    }
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            uint16_t d = *dst;
            *dst++ = SkPackRGB16(
                    SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
                    SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
                    SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
        } while (--count != 0);
    }
}

#ifdef SK_CPU_ARM32
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

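        // The head of the row (count % 8 pixels, or a full 8 when count is a
        // multiple of 8) is handled by the first pass. Stores are delayed one
        // iteration through keep_dst, and each block's loads happen before
        // the previous block's store, so every vst1 can safely write a full
        // 8 pixels even for the partial head block.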
        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      // Thumb does not support the standard ARM conditional
                      // instructions but instead requires the 'it' instruction
                      // to signal conditional execution
                      "it eq                                  \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

            #ifdef SK_PMCOLOR_IS_RGBA
                      "vqadd.u8   d6, d6, d0                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"
            #else
                      "vqadd.u8   d6, d6, d2                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d0                  \n\t"
            #endif

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzp.u16   q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

            #ifdef SK_PMCOLOR_IS_RGBA
                      "vqadd.u8   d6, d6, d0                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"
            #else
                      "vqadd.u8   d6, d6, d2                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d0                  \n\t"
            #endif

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}

#else // #ifdef SK_CPU_ARM32

void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

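    // Each iteration below blends 16 pixels: dst 565 is widened to 8-bit
    // channels, scaled by the inverted src alpha (mvn v3) with the usual
    // shift/add-high-narrow rounding, saturating-added to the premultiplied
    // src channels, and repacked to 565.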
    if (count >= 16) {
        asm (
            "movi    v4.8h, #0x80                   \t\n"

            "1:                                     \t\n"
            "sub     %[count], %[count], #16        \t\n"
            "ld1     {v16.8h-v17.8h}, [%[dst]]      \t\n"
            "ld4     {v0.16b-v3.16b}, [%[src]], #64 \t\n"
            "prfm    pldl1keep, [%[src],#512]       \t\n"
            "prfm    pldl1keep, [%[dst],#256]       \t\n"
            "ushr    v20.8h, v17.8h, #5             \t\n"
            "ushr    v31.8h, v16.8h, #5             \t\n"
            "xtn     v6.8b, v31.8h                  \t\n"
            "xtn2    v6.16b, v20.8h                 \t\n"
            "ushr    v20.8h, v17.8h, #11            \t\n"
            "shl     v19.16b, v6.16b, #2            \t\n"
            "ushr    v31.8h, v16.8h, #11            \t\n"
            "xtn     v22.8b, v31.8h                 \t\n"
            "xtn2    v22.16b, v20.8h                \t\n"
            "shl     v18.16b, v22.16b, #3           \t\n"
            "mvn     v3.16b, v3.16b                 \t\n"
            "xtn     v16.8b, v16.8h                 \t\n"
            "mov     v7.16b, v4.16b                 \t\n"
            "xtn2    v16.16b, v17.8h                \t\n"
            "umlal   v7.8h, v3.8b, v19.8b           \t\n"
            "shl     v16.16b, v16.16b, #3           \t\n"
            "mov     v22.16b, v4.16b                \t\n"
            "ushr    v24.8h, v7.8h, #6              \t\n"
            "umlal   v22.8h, v3.8b, v18.8b          \t\n"
            "ushr    v20.8h, v22.8h, #5             \t\n"
            "addhn   v20.8b, v22.8h, v20.8h         \t\n"
            "cmp     %[count], #16                  \t\n"
            "mov     v6.16b, v4.16b                 \t\n"
            "mov     v5.16b, v4.16b                 \t\n"
            "umlal   v6.8h, v3.8b, v16.8b           \t\n"
            "umlal2  v5.8h, v3.16b, v19.16b         \t\n"
            "mov     v17.16b, v4.16b                \t\n"
            "ushr    v19.8h, v6.8h, #5              \t\n"
            "umlal2  v17.8h, v3.16b, v18.16b        \t\n"
            "addhn   v7.8b, v7.8h, v24.8h           \t\n"
            "ushr    v18.8h, v5.8h, #6              \t\n"
            "ushr    v21.8h, v17.8h, #5             \t\n"
            "addhn2  v7.16b, v5.8h, v18.8h          \t\n"
            "addhn2  v20.16b, v17.8h, v21.8h        \t\n"
            "mov     v22.16b, v4.16b                \t\n"
            "addhn   v6.8b, v6.8h, v19.8h           \t\n"
            "umlal2  v22.8h, v3.16b, v16.16b        \t\n"
            "ushr    v5.8h, v22.8h, #5              \t\n"
            "addhn2  v6.16b, v22.8h, v5.8h          \t\n"
            "uqadd   v7.16b, v1.16b, v7.16b         \t\n"
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
            "uqadd   v20.16b, v2.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v0.16b, v6.16b         \t\n"
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
            "uqadd   v20.16b, v0.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v2.16b, v6.16b         \t\n"
#else
#error "This function only supports BGRA and RGBA."
#endif
            "shll    v22.8h, v20.8b, #8             \t\n"
            "shll    v5.8h, v7.8b, #8               \t\n"
            "sri     v22.8h, v5.8h, #5              \t\n"
            "shll    v17.8h, v6.8b, #8              \t\n"
            "shll2   v23.8h, v20.16b, #8            \t\n"
            "shll2   v7.8h, v7.16b, #8              \t\n"
            "sri     v22.8h, v17.8h, #11            \t\n"
            "sri     v23.8h, v7.8h, #5              \t\n"
            "shll2   v6.8h, v6.16b, #8              \t\n"
            "st1     {v22.8h}, [%[dst]], #16        \t\n"
            "sri     v23.8h, v6.8h, #11             \t\n"
            "st1     {v23.8h}, [%[dst]], #16        \t\n"
            "b.ge    1b                             \t\n"
            : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
            :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
               "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
               "v31"
        );
    }
    // Leftovers
    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
        } while (--count != 0);
    }
}
#endif // #ifdef SK_CPU_ARM32

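// Spreads the 8-bit channels of an SkPMColor into the g:11 r:10 x:1 b:10
// layout that SkExpand_rgb_16() yields for a 565 pixel (pre-scaled by 32),
// so src can be added to a scale-multiplied expanded dst in one 32-bit
// operation; see Color32A_D565_neon() below.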
static uint32_t pmcolor_to_expand16(SkPMColor c) {
    unsigned r = SkGetPackedR32(c);
    unsigned g = SkGetPackedG32(c);
    unsigned b = SkGetPackedB32(c);
    return (g << 24) | (r << 13) | (b << 2);
}

void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {
    uint32_t src_expand;
    unsigned scale;
    uint16x8_t vmask_blue;

    if (count <= 0) return;
    SkASSERT(((size_t)dst & 0x01) == 0);

    /*
     * This preamble aligns dst to 8 bytes so that the multi-byte NEON
     * reads and writes below are aligned.
     */
    src_expand = pmcolor_to_expand16(src);
    scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;

#define DST_ALIGN 8

    /*
     * preamble_size is in bytes; the scalar loop below updates 2 bytes
     * (one pixel) at a time.
     */
    int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);

    for (int i = 0; i < preamble_size; i+=2, dst++) {
        uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
        *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
        if (--count == 0)
            break;
    }

    int count16 = count >> 4;
    vmask_blue = vmovq_n_u16(SK_B16_MASK);

    if (count16) {
        uint16x8_t wide_sr;
        uint16x8_t wide_sg;
        uint16x8_t wide_sb;
        uint16x8_t wide_256_sa;

        unsigned sr = SkGetPackedR32(src);
        unsigned sg = SkGetPackedG32(src);
        unsigned sb = SkGetPackedB32(src);
        unsigned sa = SkGetPackedA32(src);

        // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb
        // sr is 8-bit based and dr is 5-bit based, with dr x ((256-sa)>>3)
        // left-shifted by 5 bits; thus sr gets a 2-bit left shift so the
        // MSBs line up: (8 + 2 = 5 + 5)
        wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift

        // sg is 8-bit based and dg is 6-bit based, with dg x ((256-sa)>>3)
        // left-shifted by 5 bits; thus sg gets a 3-bit left shift so the
        // MSBs line up: (8 + 3 = 6 + 5)
        wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift

        // sb is 8-bit based and db is 5-bit based, with db x ((256-sa)>>3)
        // left-shifted by 5 bits; thus sb gets a 2-bit left shift so the
        // MSBs line up: (8 + 2 = 5 + 5)
        wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift

        wide_256_sa =
            vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3

        while (count16-- > 0) {
            uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;
            uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;
            vdst1 = vld1q_u16(dst);
            dst += 8;
            vdst2 = vld1q_u16(dst);
            dst -= 8;    // rewind; both results are stored below

            vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS);                 // shift green to top of lanes
            vdst1_b = vdst1 & vmask_blue;                              // extract blue
            vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT);                // extract red
            vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green

            vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS);                 // shift green to top of lanes
            vdst2_b = vdst2 & vmask_blue;                              // extract blue
            vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT);                // extract red
            vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green

            vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r);        // sr + (256-sa) x dr1
            vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g);        // sg + (256-sa) x dg1
            vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b);        // sb + (256-sa) x db1

            vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r);        // sr + (256-sa) x dr2
            vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g);        // sg + (256-sa) x dg2
            vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b);        // sb + (256-sa) x db2

            vdst1_r = vshrq_n_u16(vdst1_r, 5);                         // 5-bit right shift for 5-bit red
            vdst1_g = vshrq_n_u16(vdst1_g, 5);                         // 5-bit right shift for 6-bit green
            vdst1_b = vshrq_n_u16(vdst1_b, 5);                         // 5-bit right shift for 5-bit blue

            vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT);       // insert green into blue
            vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT);         // insert red into green/blue

            vdst2_r = vshrq_n_u16(vdst2_r, 5);                         // 5-bit right shift for 5-bit red
            vdst2_g = vshrq_n_u16(vdst2_g, 5);                         // 5-bit right shift for 6-bit green
            vdst2_b = vshrq_n_u16(vdst2_b, 5);                         // 5-bit right shift for 5-bit blue

            vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT);       // insert green into blue
            vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT);         // insert red into green/blue

            vst1q_u16(dst, vdst1);
            dst += 8;
            vst1q_u16(dst, vdst2);
            dst += 8;
        }
    }

    count &= 0xF;
    if (count > 0) {
        do {
            uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
            *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
            dst += 1;
        } while (--count != 0);
    }
}

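// Vector version of SkDiv255Round(): computes (x + 128 + ((x + 128) >> 8)) >> 8,
// i.e. the rounded quotient x/255, exact for any product of two 8-bit values
// and safely within the u16 lanes.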
static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);
    return vshrq_n_u16(prod, 8);
}

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        // prepare constants
        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);

        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#else
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8    {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif
#endif // #ifdef SK_CPU_ARM64

            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                     // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{
    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // halve the dither for the 6-bit green channel

        int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask

        do {
            uint8x8x4_t vsrc;
            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8    {d0-d3},[%[src]]! "
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            }
#endif
            vsrc_r = vsrc.val[NEON_R];
            vsrc_g = vsrc.val[NEON_G];
            vsrc_b = vsrc.val[NEON_B];

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(255 == alpha);
    if (count > 0) {

    uint8x8_t alpha_mask;

    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);
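    /* vtbl1 with {3,3,3,3,7,7,7,7} broadcasts byte 3 (the alpha of the first
     * pixel) into lanes 0-3 and byte 7 (the alpha of the second pixel) into
     * lanes 4-7, yielding a per-channel alpha vector for two pixels at once.
     */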

    /* do the NEON unrolled code */
#define    UNROLL    4
    while (count >= UNROLL) {
        uint8x8_t src_raw, dst_raw, dst_final;
        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

        /* The two prefetches below may make the code slightly
         * slower for small values of count but are worth having
         * in the general case.
         */
        __builtin_prefetch(src+32);
        __builtin_prefetch(dst+32);

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if    UNROLL > 2
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if    UNROLL > 2
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

    /* 1st and 2nd bits of the unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);
    }

#if    UNROLL > 2
    /* the 3rd and 4th bits of our unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
    }
#endif

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if    UNROLL > 2
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

        src += UNROLL;
        dst += UNROLL;
        count -= UNROLL;
    }
#undef    UNROLL

    /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}

void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;

#define UNROLL  4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;
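
    /* The code below is a small state machine: ALPHA_0 skips runs of fully
     * transparent pixels, ALPHA_255 copies runs of fully opaque pixels, and
     * ALPHA_1_TO_254 runs the full NEON blend, four pixels per iteration. */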
    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {
        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
        it might make sense to go to the optimized loops */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);

    if (src >= src_end)
        goto TAIL;

    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /* fall-thru */

ALPHA_0:

    /* In this state, we know the current alpha is 0 and
     we optimize for the next alpha also being zero. */
    src_temp = src;  // so we don't have to increment dst every time
    do {
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
    } while(src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end)
        goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0]=src[0];
        dst[1]=src[1];
        dst[2]=src[2];
        dst[3]=src[3];
        src+=UNROLL;
        dst+=UNROLL;
        if(src >= src_end)
            goto TAIL;
    }

    // Handle remainder.
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  // go back to the real end
    while(src != src_end) {
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef    UNROLL
    return;
}

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;
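    // Each channel is blended as (src*src_scale + dst*dst_scale) >> 8; the
    // two products are scaled and narrowed separately, matching the portable
    // SkAlphaMulQ()-based version.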

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}

#ifdef SK_CPU_ARM32
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src,
                         int count, U8CPU alpha) {
    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);
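    // Per pixel: src_scale = alpha256 and
    // dst_scale = 256 - ((srcA * alpha256) >> 8); each channel then becomes
    // (src*src_scale + dst*dst_scale) >> 8.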
1231
1232    // First deal with odd counts
1233    if (count & 1) {
1234        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
1235        uint16x8_t vdst_wide, vsrc_wide;
1236        unsigned dst_scale;
1237
1238        // Load
1239        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
1240        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
1241
1242        // Calc dst_scale
1243        dst_scale = vget_lane_u8(vsrc, 3);
1244        dst_scale *= alpha256;
1245        dst_scale >>= 8;
1246        dst_scale = 256 - dst_scale;
1247
1248        // Process src
1249        vsrc_wide = vmovl_u8(vsrc);
1250        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
1251
1252        // Process dst
1253        vdst_wide = vmovl_u8(vdst);
1254        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
1255
1256        // Combine
1257        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1258
1259        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
1260        dst++;
1261        src++;
1262        count--;
1263    }
1264
1265    if (count) {
1266        uint8x8_t alpha_mask;
1267        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
1268        alpha_mask = vld1_u8(alpha_mask_setup);
1269
1270        do {
1271
1272            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
1273            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
1274
1275            __builtin_prefetch(src+32);
1276            __builtin_prefetch(dst+32);
1277
1278            // Load
1279            vsrc = vreinterpret_u8_u32(vld1_u32(src));
1280            vdst = vreinterpret_u8_u32(vld1_u32(dst));
1281
1282            // Prepare src_scale
1283            vsrc_scale = vdupq_n_u16(alpha256);
1284
1285            // Calc dst_scale
1286            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
1287            vdst_scale = vmovl_u8(vsrc_alphas);
1288            vdst_scale *= vsrc_scale;
1289            vdst_scale = vshrq_n_u16(vdst_scale, 8);
1290            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
1291
1292            // Process src
1293            vsrc_wide = vmovl_u8(vsrc);
1294            vsrc_wide *= vsrc_scale;
1295
1296            // Process dst
1297            vdst_wide = vmovl_u8(vdst);
1298            vdst_wide *= vdst_scale;
1299
1300            // Combine
1301            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1302
1303            vst1_u32(dst, vreinterpret_u32_u8(vres));
1304
1305            src += 2;
1306            dst += 2;
1307            count -= 2;
1308        } while(count);
1309    }
1310}
1311
1312///////////////////////////////////////////////////////////////////////////////
1313
1314#undef    DEBUG_OPAQUE_DITHER
1315
1316#if    defined(DEBUG_OPAQUE_DITHER)
1317static void showme8(char *str, void *p, int len)
1318{
1319    static char buf[256];
1320    char tbuf[32];
1321    int i;
1322    char *pc = (char*) p;
1323    sprintf(buf,"%8s:", str);
1324    for(i=0;i<len;i++) {
1325        sprintf(tbuf, "   %02x", pc[i]);
1326        strcat(buf, tbuf);
1327    }
1328    SkDebugf("%s\n", buf);
1329}
1330static void showme16(char *str, void *p, int len)
1331{
1332    static char buf[256];
1333    char tbuf[32];
1334    int i;
1335    uint16_t *pc = (uint16_t*) p;
1336    sprintf(buf,"%8s:", str);
1337    len = (len / sizeof(uint16_t));    /* passed as bytes */
1338    for(i=0;i<len;i++) {
1339        sprintf(tbuf, " %04x", pc[i]);
1340        strcat(buf, tbuf);
1341    }
1342    SkDebugf("%s\n", buf);
1343}
1344#endif
1345#endif // #ifdef SK_CPU_ARM32
1346
1347void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
1348                                   const SkPMColor* SK_RESTRICT src,
1349                                   int count, U8CPU alpha, int x, int y) {
1350    SkASSERT(255 == alpha);
1351
1352#define    UNROLL    8
1353
1354    if (count >= UNROLL) {
1355
1356#if defined(DEBUG_OPAQUE_DITHER)
1357    uint16_t tmpbuf[UNROLL];
1358    int td[UNROLL];
1359    int tdv[UNROLL];
1360    int ta[UNROLL];
1361    int tap[UNROLL];
1362    uint16_t in_dst[UNROLL];
1363    int offset = 0;
1364    int noisy = 0;
1365#endif
1366
1367    uint8x8_t dbase;
1368    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1369    dbase = vld1_u8(dstart);
1370
1371        do {
1372        uint8x8x4_t vsrc;
1373        uint8x8_t sr, sg, sb, sa, d;
1374        uint16x8_t dst8, scale8, alpha8;
1375        uint16x8_t dst_r, dst_g, dst_b;
1376
1377#if defined(DEBUG_OPAQUE_DITHER)
1378        // calculate 8 elements worth into a temp buffer
1379        {
1380        int my_y = y;
1381        int my_x = x;
1382        SkPMColor* my_src = (SkPMColor*)src;
1383        uint16_t* my_dst = dst;
1384        int i;
1385
1386        DITHER_565_SCAN(my_y);
1387        for(i = 0; i < UNROLL; i++) {
1388            SkPMColor c = *my_src++;
1389            SkPMColorAssert(c);
1390            if (c) {
1391                unsigned a = SkGetPackedA32(c);
1392
1393                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1394                tdv[i] = DITHER_VALUE(my_x);
1395                ta[i] = a;
1396                tap[i] = SkAlpha255To256(a);
1397                td[i] = d;
1398
1399                unsigned sr = SkGetPackedR32(c);
1400                unsigned sg = SkGetPackedG32(c);
1401                unsigned sb = SkGetPackedB32(c);
1402                sr = SkDITHER_R32_FOR_565(sr, d);
1403                sg = SkDITHER_G32_FOR_565(sg, d);
1404                sb = SkDITHER_B32_FOR_565(sb, d);
1405
1406                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1407                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1408                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1409                // now src and dst expanded are in g:11 r:10 x:1 b:10
1410                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1411                td[i] = d;
1412            } else {
1413                tmpbuf[i] = *my_dst;
1414                ta[i] = tdv[i] = td[i] = 0xbeef;
1415            }
1416            in_dst[i] = *my_dst;
1417            my_dst += 1;
1418            DITHER_INC_X(my_x);
1419        }
1420        }
1421#endif
1422
1423#ifdef SK_CPU_ARM64
1424        vsrc = sk_vld4_u8_arm64_4(src);
1425#else
1426        {
1427        register uint8x8_t d0 asm("d0");
1428        register uint8x8_t d1 asm("d1");
1429        register uint8x8_t d2 asm("d2");
1430        register uint8x8_t d3 asm("d3");
1431
1432        asm ("vld4.8    {d0-d3},[%[src]]! "
1433            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
1434            :
1435        );
1436        vsrc.val[0] = d0;
1437        vsrc.val[1] = d1;
1438        vsrc.val[2] = d2;
1439        vsrc.val[3] = d3;
1440        }
1441#endif
1442        sa = vsrc.val[NEON_A];
1443        sr = vsrc.val[NEON_R];
1444        sg = vsrc.val[NEON_G];
1445        sb = vsrc.val[NEON_B];
1446
1447        /* calculate 'd', which will be 0..7
1448         * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
1449         */
1450        alpha8 = vmovl_u8(dbase);
1451        alpha8 = vmlal_u8(alpha8, sa, dbase);
1452        d = vshrn_n_u16(alpha8, 8);    // narrowing too
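        // Scalar picture of the three lines above, per lane (a sketch):
        //   d = (dbase * (sa + 1)) >> 8
        // e.g. dbase = 7: sa = 255 -> d = (7 * 256) >> 8 = 7 (full dither),
        //      sa = 0   -> d = (7 * 1) >> 8 = 0 (dither fades with alpha)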
1453
1454        // sr = sr - (sr>>5) + d
1455        /* watch for 8-bit overflow: d is 0..7, and the risky range of
1456         * sr is > 248, where (sr>>5) is 7 and exactly offsets 'd';
1457         * safe as long as we compute ((sr - (sr>>5)) + d)
1458         */
1459        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1460        sr = vadd_u8(sr, d);
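        // worked bound, per lane: sr <= 255 -> sr - (sr>>5) <= 248, and
        // d <= 7, so the vadd_u8 peaks at exactly 255 -- no u8 wraparound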
1461
1462        // sb = sb - (sb>>5) + d
1463        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1464        sb = vadd_u8(sb, d);
1465
1466        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1467        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1468        sg = vadd_u8(sg, vshr_n_u8(d,1));
1469
1470        // need to pick up 8 dst's -- at 16 bits each, 128 bits
1471        dst8 = vld1q_u16(dst);
1472        dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
1473        dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
1474        dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits
1475
1476        // blend
1477        scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1478
1479        // combine the addq and mul, save 3 insns
1480        scale8 = vshrq_n_u16(scale8, 3);
1481        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1482        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1483        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
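        // unfused, per lane (blue shown): dst_b = (sb << 2) + dst_b * scale8
        // with scale8 = (256 - sa) >> 3; the >> 5 in the repack below then
        // yields sb >> 3 when sa == 255 and the old dst_b when sa == 0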
1484
1485        // repack to store
1486        dst8 = vshrq_n_u16(dst_b, 5);
1487        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1488        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
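        // each lane is now standard RGB565: r in bits 15..11, g in 10..5,
        // b in 4..0 (the vsli inserts shifted the r and g fields into place)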
1489
1490        vst1q_u16(dst, dst8);
1491
1492#if defined(DEBUG_OPAQUE_DITHER)
1493        // verify my 8 elements match the temp buffer
1494        {
1495        int i, bad=0;
1496        static int invocation;
1497
1498        for (i = 0; i < UNROLL; i++) {
1499            if (tmpbuf[i] != dst[i]) {
1500                bad=1;
1501            }
1502        }
1503        if (bad) {
1504            SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1505                     invocation, offset);
1506            SkDebugf("  alpha 0x%x\n", alpha);
1507            for (i = 0; i < UNROLL; i++)
1508                SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1509                         i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
1510                         in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);
1511
1512            showme16("alpha8", &alpha8, sizeof(alpha8));
1513            showme16("scale8", &scale8, sizeof(scale8));
1514            showme8("d", &d, sizeof(d));
1515            showme16("dst8", &dst8, sizeof(dst8));
1516            showme16("dst_b", &dst_b, sizeof(dst_b));
1517            showme16("dst_g", &dst_g, sizeof(dst_g));
1518            showme16("dst_r", &dst_r, sizeof(dst_r));
1519            showme8("sb", &sb, sizeof(sb));
1520            showme8("sg", &sg, sizeof(sg));
1521            showme8("sr", &sr, sizeof(sr));
1522
1523            return;
1524        }
1525        offset += UNROLL;
1526        invocation++;
1527        }
1528#endif
1529        dst += UNROLL;
1530        count -= UNROLL;
1531        // skip x += UNROLL: x is only used mod 4 (dither matrix), and UNROLL is a multiple of 4
1532        } while (count >= UNROLL);
1533    }
1534#undef    UNROLL
1535
1536    // residuals
1537    if (count > 0) {
1538        DITHER_565_SCAN(y);
1539        do {
1540            SkPMColor c = *src++;
1541            SkPMColorAssert(c);
1542            if (c) {
1543                unsigned a = SkGetPackedA32(c);
1544
1545                // dither and alpha are temporary variables that work around
1546                // an internal compiler error (ICE) in debug builds.
1547                unsigned dither = DITHER_VALUE(x);
1548                unsigned alpha = SkAlpha255To256(a);
1549                int d = SkAlphaMul(dither, alpha);
1550
1551                unsigned sr = SkGetPackedR32(c);
1552                unsigned sg = SkGetPackedG32(c);
1553                unsigned sb = SkGetPackedB32(c);
1554                sr = SkDITHER_R32_FOR_565(sr, d);
1555                sg = SkDITHER_G32_FOR_565(sg, d);
1556                sb = SkDITHER_B32_FOR_565(sb, d);
1557
1558                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1559                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1560                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1561                // now src and dst expanded are in g:11 r:10 x:1 b:10
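                // the expanded fields leave 5 bits of headroom apiece, so
                // multiplying by the 5-bit scale cannot carry across fields,
                // and the >> 5 below removes the scale again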
1562                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1563            }
1564            dst += 1;
1565            DITHER_INC_X(x);
1566        } while (--count != 0);
1567    }
1568}
1569
1570///////////////////////////////////////////////////////////////////////////////
1571
1572#undef    DEBUG_S32_OPAQUE_DITHER
1573
1574void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1575                                 const SkPMColor* SK_RESTRICT src,
1576                                 int count, U8CPU alpha, int x, int y) {
1577    SkASSERT(255 == alpha);
1578
1579#define    UNROLL    8
1580    if (count >= UNROLL) {
1581    uint8x8_t d;
1582    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1583    d = vld1_u8(dstart);
1584
1585    while (count >= UNROLL) {
1586        uint8x8_t sr, sg, sb;
1587        uint16x8_t dr, dg, db;
1588        uint16x8_t dst8;
1589        uint8x8x4_t vsrc;
1590
1591#ifdef SK_CPU_ARM64
1592        vsrc = sk_vld4_u8_arm64_3(src);
1593#else
1594        {
1595        register uint8x8_t d0 asm("d0");
1596        register uint8x8_t d1 asm("d1");
1597        register uint8x8_t d2 asm("d2");
1598        register uint8x8_t d3 asm("d3");
1599
1600        asm (
1601            "vld4.8    {d0-d3},[%[src]]! "
1602            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1603            :
1604        );
1605        vsrc.val[0] = d0;
1606        vsrc.val[1] = d1;
1607        vsrc.val[2] = d2;
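        // d3 (src alpha) is loaded but deliberately unused: this is the
        // opaque blit, so every source pixel has alpha 255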
1608        }
1609#endif
1610        sr = vsrc.val[NEON_R];
1611        sg = vsrc.val[NEON_G];
1612        sb = vsrc.val[NEON_B];
1613
1614        /* XXX: if we want to prefetch, it has to be hidden in the asm()
1615         * above; with a plain gcc __builtin_prefetch() the compiler
1616         * sinks the prefetch to the bottom of the loop instead of
1617         * keeping it at the top, just after the vld4.
1618         */
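        // A hedged sketch of the unhidden form: __builtin_prefetch(src);
        // (valid here only because the asm's post-increment has already
        // advanced src to the next 8 pixels)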
1619
1620        // sr = sr - (sr>>5) + d
1621        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1622        dr = vaddl_u8(sr, d);
1623
1624        // sb = sb - (sb>>5) + d
1625        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1626        db = vaddl_u8(sb, d);
1627
1628        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1629        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1630        dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1631
1632        // pack high bits of each into 565 format  (rgb, b is lsb)
1633        dst8 = vshrq_n_u16(db, 3);
1634        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1635        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
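        // after the dither adjust, dr/dg/db still hold 8-bit values, so
        // dr>>3 and db>>3 are 5-bit fields and dg>>2 is 6 bits; the vsli
        // inserts place them at bits 11 and 5: r 15..11, g 10..5, b 4..0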
1636
1637        // store it
1638        vst1q_u16(dst, dst8);
1639
1640#if    defined(DEBUG_S32_OPAQUE_DITHER)
1641        // always good to know if we generated good results
1642        {
1643        int i, myx = x, myy = y;
1644        DITHER_565_SCAN(myy);
1645        for (i = 0; i < UNROLL; i++) {
1646            // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
1647            SkPMColor c = src[i-8];
1648            unsigned dither = DITHER_VALUE(myx);
1649            uint16_t val = SkDitherRGB32To565(c, dither);
1650            if (val != dst[i]) {
1651                SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dstart[i] %02x\n",
1652                         c, dither, val, dst[i], dstart[i]);
1653            }
1654            DITHER_INC_X(myx);
1655        }
1656        }
1657#endif
1658
1659        dst += UNROLL;
1660        // we don't need to increment src as the asm above has already done it
1661        count -= UNROLL;
1662        x += UNROLL;        // superfluous: x is only used mod 4 and UNROLL is a multiple of 4
1663    }
1664    }
1665#undef    UNROLL
1666
1667    // residuals
1668    if (count > 0) {
1669        DITHER_565_SCAN(y);
1670        do {
1671            SkPMColor c = *src++;
1672            SkPMColorAssert(c);
1673            SkASSERT(SkGetPackedA32(c) == 255);
1674
1675            unsigned dither = DITHER_VALUE(x);
1676            *dst++ = SkDitherRGB32To565(c, dither);
1677            DITHER_INC_X(x);
1678        } while (--count != 0);
1679    }
1680}
1681
1682void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
1683                      SkPMColor color) {
1684    if (count <= 0) {
1685        return;
1686    }
1687
1688    if (0 == color) {
1689        if (src != dst) {
1690            memcpy(dst, src, count * sizeof(SkPMColor));
1691        }
1692        return;
1693    }
1694
1695    unsigned colorA = SkGetPackedA32(color);
1696    if (255 == colorA) {
1697        sk_memset32(dst, color, count);
1698        return;
1699    }
1700
1701    unsigned scale = 256 - SkAlpha255To256(colorA);
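    // per channel, both the NEON loop and the scalar tail below compute
    //   dst = color + ((src * scale) >> 8),
    // i.e. 'color' composited src-over the src pixels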
1702
1703    if (count >= 8) {
1704        uint32x4_t vcolor;
1705        uint8x8_t vscale;
1706
1707        vcolor = vdupq_n_u32(color);
1708
1709        // scale is in the interval [0, 255], so it fits in 8 bits
1710        vscale = vdup_n_u8(scale);
1711
1712        do {
1713            // load the src colors: 8 pixels in four 64-bit registers
1714            // (and increment src).
1715            uint32x2x4_t vsrc;
1716#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
1717            asm (
1718                "vld1.32    %h[vsrc], [%[src]]!"
1719                : [vsrc] "=w" (vsrc), [src] "+r" (src)
1720                : :
1721            );
1722#else // 64bit targets and Clang
1723            vsrc.val[0] = vld1_u32(src);
1724            vsrc.val[1] = vld1_u32(src+2);
1725            vsrc.val[2] = vld1_u32(src+4);
1726            vsrc.val[3] = vld1_u32(src+6);
1727            src += 8;
1728#endif
1729
1730            // multiply long by scale, 64 bits at a time,
1731            // destination into a 128 bit register.
1732            uint16x8x4_t vtmp;
1733            vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
1734            vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
1735            vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
1736            vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);
1737
1738            // shift the 128-bit registers, containing the 16-bit
1739            // scaled values, back down to 8 bits, narrowing the
1740            // results into 64-bit registers.
1741            uint8x16x2_t vres;
1742            vres.val[0] = vcombine_u8(
1743                            vshrn_n_u16(vtmp.val[0], 8),
1744                            vshrn_n_u16(vtmp.val[1], 8));
1745            vres.val[1] = vcombine_u8(
1746                            vshrn_n_u16(vtmp.val[2], 8),
1747                            vshrn_n_u16(vtmp.val[3], 8));
1748
1749            // add back the color, using 128-bit registers.
1750            uint32x4x2_t vdst;
1751            vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
1752                                               vreinterpretq_u8_u32(vcolor));
1753            vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
1754                                               vreinterpretq_u8_u32(vcolor));
1755
1756            // store back the 8 calculated pixels (two 128-bit
1757            // registers), and increment dst.
1758#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
1759            asm (
1760                "vst1.32    %h[vdst], [%[dst]]!"
1761                : [dst] "+r" (dst)
1762                : [vdst] "w" (vdst)
1763                : "memory"
1764            );
1765#else // 64bit targets and Clang
1766            vst1q_u32(dst, vdst.val[0]);
1767            vst1q_u32(dst+4, vdst.val[1]);
1768            dst += 8;
1769#endif
1770            count -= 8;
1771
1772        } while (count >= 8);
1773    }
1774
1775    while (count > 0) {
1776        *dst = color + SkAlphaMulQ(*src, scale);
1777        src += 1;
1778        dst += 1;
1779        count--;
1780    }
1781}
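
// A minimal scalar reference for Color32_arm_neon's inner math (a sketch with
// a hypothetical name, not called anywhere): it uses only SkGetPackedA32,
// SkAlpha255To256 and SkAlphaMulQ from SkColorPriv.h, all already used above.
static inline SkPMColor color32_scalar_sketch(SkPMColor srcPixel, SkPMColor color) {
    unsigned scale = 256 - SkAlpha255To256(SkGetPackedA32(color));
    return color + SkAlphaMulQ(srcPixel, scale);
}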
1782
1783///////////////////////////////////////////////////////////////////////////////
1784
1785const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = {
1786    // no dither
1787    S32_D565_Opaque_neon,
1788    S32_D565_Blend_neon,
1789    S32A_D565_Opaque_neon,
1790#if 0
1791    S32A_D565_Blend_neon,
1792#else
1793    NULL,   // https://code.google.com/p/skia/issues/detail?id=2797
1794#endif
1795
1796    // dither
1797    S32_D565_Opaque_Dither_neon,
1798    S32_D565_Blend_Dither_neon,
1799    S32A_D565_Opaque_Dither_neon,
1800    NULL,   // S32A_D565_Blend_Dither
1801};
1802
1803const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
1804    Color32A_D565_neon,    // Color32_D565,
1805    Color32A_D565_neon,    // Color32A_D565,
1806    Color32A_D565_neon,    // Color32_D565_Dither,
1807    Color32A_D565_neon,    // Color32A_D565_Dither
1808};
1809
1810const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1811    NULL,   // S32_Opaque,
1812    S32_Blend_BlitRow32_neon,        // S32_Blend,
1813    /*
1814     * We have two choices of S32A_Opaque procs. The first reads the src
1815     * alpha value and attempts to optimize accordingly.  The optimization
1816     * is sensitive to the source content and is not a win in all cases: for
1817     * example, if there are many transitions between alpha states, the
1818     * performance will almost certainly be worse.  However, for many common
1819     * cases the performance is equivalent to or better than that of the
1820     * standard case, where we do not inspect the src alpha.
1821     */
1822#if SK_A32_SHIFT == 24
1823    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
1824    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
1825#else
1826    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
1827#endif
1828#ifdef SK_CPU_ARM32
1829    S32A_Blend_BlitRow32_neon        // S32A_Blend
1830#else
1831    NULL
1832#endif
1833};
1834