1/*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "SkBlitRow_opts_arm_neon.h"
9
10#include "SkBlitMask.h"
11#include "SkBlitRow.h"
12#include "SkColorPriv.h"
13#include "SkDither.h"
14#include "SkMathPriv.h"
15#include "SkUtils.h"
16
17#include "SkColor_opts_neon.h"
18#include <arm_neon.h>
19
20#ifdef SK_CPU_ARM64
21static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
22    uint8x8x4_t vsrc;
23    uint8x8_t vsrc_0, vsrc_1, vsrc_2;
24
25    asm (
26        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
27        "mov    %[vsrc0].8b, v0.8b             \t\n"
28        "mov    %[vsrc1].8b, v1.8b             \t\n"
29        "mov    %[vsrc2].8b, v2.8b             \t\n"
30        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
31          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
32        : : "v0", "v1", "v2", "v3"
33    );
34
35    vsrc.val[0] = vsrc_0;
36    vsrc.val[1] = vsrc_1;
37    vsrc.val[2] = vsrc_2;
38
39    return vsrc;
40}
41
42static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
43    uint8x8x4_t vsrc;
44    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
45
46    asm (
47        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
48        "mov    %[vsrc0].8b, v0.8b             \t\n"
49        "mov    %[vsrc1].8b, v1.8b             \t\n"
50        "mov    %[vsrc2].8b, v2.8b             \t\n"
51        "mov    %[vsrc3].8b, v3.8b             \t\n"
52        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
53          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
54          [src] "+&r" (src)
55        : : "v0", "v1", "v2", "v3"
56    );
57
58    vsrc.val[0] = vsrc_0;
59    vsrc.val[1] = vsrc_1;
60    vsrc.val[2] = vsrc_2;
61    vsrc.val[3] = vsrc_3;
62
63    return vsrc;
64}
65#endif
66
67void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
68                           const SkPMColor* SK_RESTRICT src, int count,
69                           U8CPU alpha, int /*x*/, int /*y*/) {
70    SkASSERT(255 == alpha);
71
72    while (count >= 8) {
73        uint8x8x4_t vsrc;
74        uint16x8_t vdst;
75
76        // Load
77#ifdef SK_CPU_ARM64
78        vsrc = sk_vld4_u8_arm64_3(src);
79#else
80        vsrc = vld4_u8((uint8_t*)src);
81        src += 8;
82#endif
83
84        // Convert src to 565
85        vdst = SkPixel32ToPixel16_neon8(vsrc);
86
87        // Store
88        vst1q_u16(dst, vdst);
89
90        // Prepare next iteration
91        dst += 8;
92        count -= 8;
93    };
94
95    // Leftovers
96    while (count > 0) {
97        SkPMColor c = *src++;
98        SkPMColorAssert(c);
99        *dst = SkPixel32ToPixel16_ToU16(c);
100        dst++;
101        count--;
102    };
103}
104
105void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
106                          const SkPMColor* SK_RESTRICT src, int count,
107                          U8CPU alpha, int /*x*/, int /*y*/) {
108    SkASSERT(255 > alpha);
109
110    uint16x8_t vmask_blue, vscale;
111
112    // prepare constants
113    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
114    vmask_blue = vmovq_n_u16(0x1F);
115
116    while (count >= 8) {
117        uint8x8x4_t vsrc;
118        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
119        uint16x8_t vres_r, vres_g, vres_b;
120
121        // Load src
122#ifdef SK_CPU_ARM64
123        vsrc = sk_vld4_u8_arm64_3(src);
124#else
125        {
126        register uint8x8_t d0 asm("d0");
127        register uint8x8_t d1 asm("d1");
128        register uint8x8_t d2 asm("d2");
129        register uint8x8_t d3 asm("d3");
130
131        asm (
132            "vld4.8    {d0-d3},[%[src]]!"
133            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
134            :
135        );
136        vsrc.val[0] = d0;
137        vsrc.val[1] = d1;
138        vsrc.val[2] = d2;
139        }
140#endif
141
142        // Load and unpack dst
143        vdst = vld1q_u16(dst);
144        vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
145        vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
146        vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
147        vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green
148
149        // Shift src to 565 range
150        vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
151        vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
152        vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);
153
154        // Scale src - dst
155        vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
156        vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
157        vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;
158
159        vres_r = vshrq_n_u16(vres_r * vscale, 8);
160        vres_g = vshrq_n_u16(vres_g * vscale, 8);
161        vres_b = vshrq_n_u16(vres_b * vscale, 8);
162
163        vres_r += vdst_r;
164        vres_g += vdst_g;
165        vres_b += vdst_b;
166
167        // Combine
168        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
169        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue
170
171        // Store
172        vst1q_u16(dst, vres_b);
173        dst += 8;
174        count -= 8;
175    }
176    if (count > 0) {
177        int scale = SkAlpha255To256(alpha);
178        do {
179            SkPMColor c = *src++;
180            SkPMColorAssert(c);
181            uint16_t d = *dst;
182            *dst++ = SkPackRGB16(
183                    SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
184                    SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
185                    SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
186        } while (--count != 0);
187    }
188}
189
190#ifdef SK_CPU_ARM32
191void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
192                           const SkPMColor* SK_RESTRICT src, int count,
193                           U8CPU alpha, int /*x*/, int /*y*/) {
194    SkASSERT(255 == alpha);
195
196    if (count >= 8) {
197        uint16_t* SK_RESTRICT keep_dst = 0;
198
199        asm volatile (
200                      "ands       ip, %[count], #7            \n\t"
201                      "vmov.u8    d31, #1<<7                  \n\t"
202                      "vld1.16    {q12}, [%[dst]]             \n\t"
203                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
204                      // Thumb does not support the standard ARM conditional
205                      // instructions but instead requires the 'it' instruction
206                      // to signal conditional execution
207                      "it eq                                  \n\t"
208                      "moveq      ip, #8                      \n\t"
209                      "mov        %[keep_dst], %[dst]         \n\t"
210
211                      "add        %[src], %[src], ip, LSL#2   \n\t"
212                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
213                      "subs       %[count], %[count], ip      \n\t"
214                      "b          9f                          \n\t"
215                      // LOOP
216                      "2:                                         \n\t"
217
218                      "vld1.16    {q12}, [%[dst]]!            \n\t"
219                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
220                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
221                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
222                      "subs       %[count], %[count], #8      \n\t"
223                      "9:                                         \n\t"
224                      "pld        [%[dst],#32]                \n\t"
225                      // expand 0565 q12 to 8888 {d4-d7}
226                      "vmovn.u16  d4, q12                     \n\t"
227                      "vshr.u16   q11, q12, #5                \n\t"
228                      "vshr.u16   q10, q12, #6+5              \n\t"
229                      "vmovn.u16  d5, q11                     \n\t"
230                      "vmovn.u16  d6, q10                     \n\t"
231                      "vshl.u8    d4, d4, #3                  \n\t"
232                      "vshl.u8    d5, d5, #2                  \n\t"
233                      "vshl.u8    d6, d6, #3                  \n\t"
234
235                      "vmovl.u8   q14, d31                    \n\t"
236                      "vmovl.u8   q13, d31                    \n\t"
237                      "vmovl.u8   q12, d31                    \n\t"
238
239                      // duplicate in 4/2/1 & 8pix vsns
240                      "vmvn.8     d30, d3                     \n\t"
241                      "vmlal.u8   q14, d30, d6                \n\t"
242                      "vmlal.u8   q13, d30, d5                \n\t"
243                      "vmlal.u8   q12, d30, d4                \n\t"
244                      "vshr.u16   q8, q14, #5                 \n\t"
245                      "vshr.u16   q9, q13, #6                 \n\t"
246                      "vaddhn.u16 d6, q14, q8                 \n\t"
247                      "vshr.u16   q8, q12, #5                 \n\t"
248                      "vaddhn.u16 d5, q13, q9                 \n\t"
249                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
250                      "vaddhn.u16 d4, q12, q8                 \n\t"
251                      // intentionally don't calculate alpha
252                      // result in d4-d6
253
254                      "vqadd.u8   d5, d5, d1                  \n\t"
255                      "vqadd.u8   d4, d4, d2                  \n\t"
256
257                      // pack 8888 {d4-d6} to 0565 q10
258                      "vshll.u8   q10, d6, #8                 \n\t"
259                      "vshll.u8   q3, d5, #8                  \n\t"
260                      "vshll.u8   q2, d4, #8                  \n\t"
261                      "vsri.u16   q10, q3, #5                 \n\t"
262                      "vsri.u16   q10, q2, #11                \n\t"
263
264                      "bne        2b                          \n\t"
265
266                      "1:                                         \n\t"
267                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
268                      : [count] "+r" (count)
269                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
270                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
271                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
272                      "d30","d31"
273                      );
274    }
275    else
276    {   // handle count < 8
277        uint16_t* SK_RESTRICT keep_dst = 0;
278
279        asm volatile (
280                      "vmov.u8    d31, #1<<7                  \n\t"
281                      "mov        %[keep_dst], %[dst]         \n\t"
282
283                      "tst        %[count], #4                \n\t"
284                      "beq        14f                         \n\t"
285                      "vld1.16    {d25}, [%[dst]]!            \n\t"
286                      "vld1.32    {q1}, [%[src]]!             \n\t"
287
288                      "14:                                        \n\t"
289                      "tst        %[count], #2                \n\t"
290                      "beq        12f                         \n\t"
291                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
292                      "vld1.32    {d1}, [%[src]]!             \n\t"
293
294                      "12:                                        \n\t"
295                      "tst        %[count], #1                \n\t"
296                      "beq        11f                         \n\t"
297                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
298                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"
299
300                      "11:                                        \n\t"
301                      // unzips achieve the same as a vld4 operation
302                      "vuzp.u16   q0, q1                      \n\t"
303                      "vuzp.u8    d0, d1                      \n\t"
304                      "vuzp.u8    d2, d3                      \n\t"
305                      // expand 0565 q12 to 8888 {d4-d7}
306                      "vmovn.u16  d4, q12                     \n\t"
307                      "vshr.u16   q11, q12, #5                \n\t"
308                      "vshr.u16   q10, q12, #6+5              \n\t"
309                      "vmovn.u16  d5, q11                     \n\t"
310                      "vmovn.u16  d6, q10                     \n\t"
311                      "vshl.u8    d4, d4, #3                  \n\t"
312                      "vshl.u8    d5, d5, #2                  \n\t"
313                      "vshl.u8    d6, d6, #3                  \n\t"
314
315                      "vmovl.u8   q14, d31                    \n\t"
316                      "vmovl.u8   q13, d31                    \n\t"
317                      "vmovl.u8   q12, d31                    \n\t"
318
319                      // duplicate in 4/2/1 & 8pix vsns
320                      "vmvn.8     d30, d3                     \n\t"
321                      "vmlal.u8   q14, d30, d6                \n\t"
322                      "vmlal.u8   q13, d30, d5                \n\t"
323                      "vmlal.u8   q12, d30, d4                \n\t"
324                      "vshr.u16   q8, q14, #5                 \n\t"
325                      "vshr.u16   q9, q13, #6                 \n\t"
326                      "vaddhn.u16 d6, q14, q8                 \n\t"
327                      "vshr.u16   q8, q12, #5                 \n\t"
328                      "vaddhn.u16 d5, q13, q9                 \n\t"
329                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
330                      "vaddhn.u16 d4, q12, q8                 \n\t"
331                      // intentionally don't calculate alpha
332                      // result in d4-d6
333
334                      "vqadd.u8   d5, d5, d1                  \n\t"
335                      "vqadd.u8   d4, d4, d2                  \n\t"
336
337                      // pack 8888 {d4-d6} to 0565 q10
338                      "vshll.u8   q10, d6, #8                 \n\t"
339                      "vshll.u8   q3, d5, #8                  \n\t"
340                      "vshll.u8   q2, d4, #8                  \n\t"
341                      "vsri.u16   q10, q3, #5                 \n\t"
342                      "vsri.u16   q10, q2, #11                \n\t"
343
344                      // store
345                      "tst        %[count], #4                \n\t"
346                      "beq        24f                         \n\t"
347                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
348
349                      "24:                                        \n\t"
350                      "tst        %[count], #2                \n\t"
351                      "beq        22f                         \n\t"
352                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
353
354                      "22:                                        \n\t"
355                      "tst        %[count], #1                \n\t"
356                      "beq        21f                         \n\t"
357                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
358
359                      "21:                                        \n\t"
360                      : [count] "+r" (count)
361                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
362                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
363                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
364                      "d30","d31"
365                      );
366    }
367}
368
369#else // #ifdef SK_CPU_ARM32
370
371void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
372                           const SkPMColor* SK_RESTRICT src, int count,
373                           U8CPU alpha, int /*x*/, int /*y*/) {
374    SkASSERT(255 == alpha);
375
376    if (count >= 16) {
377        asm (
378            "movi    v4.8h, #0x80                   \t\n"
379
380            "1:                                     \t\n"
381            "sub     %[count], %[count], #16        \t\n"
382            "ld1     {v16.8h-v17.8h}, [%[dst]]      \t\n"
383            "ld4     {v0.16b-v3.16b}, [%[src]], #64 \t\n"
384            "prfm    pldl1keep, [%[src],#512]       \t\n"
385            "prfm    pldl1keep, [%[dst],#256]       \t\n"
386            "ushr    v20.8h, v17.8h, #5             \t\n"
387            "ushr    v31.8h, v16.8h, #5             \t\n"
388            "xtn     v6.8b, v31.8h                  \t\n"
389            "xtn2    v6.16b, v20.8h                 \t\n"
390            "ushr    v20.8h, v17.8h, #11            \t\n"
391            "shl     v19.16b, v6.16b, #2            \t\n"
392            "ushr    v31.8h, v16.8h, #11            \t\n"
393            "xtn     v22.8b, v31.8h                 \t\n"
394            "xtn2    v22.16b, v20.8h                \t\n"
395            "shl     v18.16b, v22.16b, #3           \t\n"
396            "mvn     v3.16b, v3.16b                 \t\n"
397            "xtn     v16.8b, v16.8h                 \t\n"
398            "mov     v7.16b, v4.16b                 \t\n"
399            "xtn2    v16.16b, v17.8h                \t\n"
400            "umlal   v7.8h, v3.8b, v19.8b           \t\n"
401            "shl     v16.16b, v16.16b, #3           \t\n"
402            "mov     v22.16b, v4.16b                \t\n"
403            "ushr    v24.8h, v7.8h, #6              \t\n"
404            "umlal   v22.8h, v3.8b, v18.8b          \t\n"
405            "ushr    v20.8h, v22.8h, #5             \t\n"
406            "addhn   v20.8b, v22.8h, v20.8h         \t\n"
407            "cmp     %[count], #16                  \t\n"
408            "mov     v6.16b, v4.16b                 \t\n"
409            "mov     v5.16b, v4.16b                 \t\n"
410            "umlal   v6.8h, v3.8b, v16.8b           \t\n"
411            "umlal2  v5.8h, v3.16b, v19.16b         \t\n"
412            "mov     v17.16b, v4.16b                \t\n"
413            "ushr    v19.8h, v6.8h, #5              \t\n"
414            "umlal2  v17.8h, v3.16b, v18.16b        \t\n"
415            "addhn   v7.8b, v7.8h, v24.8h           \t\n"
416            "ushr    v18.8h, v5.8h, #6              \t\n"
417            "ushr    v21.8h, v17.8h, #5             \t\n"
418            "addhn2  v7.16b, v5.8h, v18.8h          \t\n"
419            "addhn2  v20.16b, v17.8h, v21.8h        \t\n"
420            "mov     v22.16b, v4.16b                \t\n"
421            "addhn   v6.8b, v6.8h, v19.8h           \t\n"
422            "umlal2  v22.8h, v3.16b, v16.16b        \t\n"
423            "ushr    v5.8h, v22.8h, #5              \t\n"
424            "addhn2  v6.16b, v22.8h, v5.8h          \t\n"
425            "uqadd   v7.16b, v1.16b, v7.16b         \t\n"
426#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
427            "uqadd   v20.16b, v2.16b, v20.16b       \t\n"
428            "uqadd   v6.16b, v0.16b, v6.16b         \t\n"
429#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
430            "uqadd   v20.16b, v0.16b, v20.16b       \t\n"
431            "uqadd   v6.16b, v2.16b, v6.16b         \t\n"
432#else
433#error "This function only supports BGRA and RGBA."
434#endif
435            "shll    v22.8h, v20.8b, #8             \t\n"
436            "shll    v5.8h, v7.8b, #8               \t\n"
437            "sri     v22.8h, v5.8h, #5              \t\n"
438            "shll    v17.8h, v6.8b, #8              \t\n"
439            "shll2   v23.8h, v20.16b, #8            \t\n"
440            "shll2   v7.8h, v7.16b, #8              \t\n"
441            "sri     v22.8h, v17.8h, #11            \t\n"
442            "sri     v23.8h, v7.8h, #5              \t\n"
443            "shll2   v6.8h, v6.16b, #8              \t\n"
444            "st1     {v22.8h}, [%[dst]], #16        \t\n"
445            "sri     v23.8h, v6.8h, #11             \t\n"
446            "st1     {v23.8h}, [%[dst]], #16        \t\n"
447            "b.ge    1b                             \t\n"
448            : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
449            :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
450               "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
451               "v31"
452        );
453    }
454        // Leftovers
455    if (count > 0) {
456        do {
457            SkPMColor c = *src++;
458            SkPMColorAssert(c);
459            if (c) {
460                *dst = SkSrcOver32To16(c, *dst);
461            }
462            dst += 1;
463        } while (--count != 0);
464    }
465}
466#endif // #ifdef SK_CPU_ARM32
467
468static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
469    prod += vdupq_n_u16(128);
470    prod += vshrq_n_u16(prod, 8);
471    return vshrq_n_u16(prod, 8);
472}
473
474void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
475                          const SkPMColor* SK_RESTRICT src, int count,
476                          U8CPU alpha, int /*x*/, int /*y*/) {
477   SkASSERT(255 > alpha);
478
479    /* This code implements a Neon version of S32A_D565_Blend. The results have
480     * a few mismatches compared to the original code. These mismatches never
481     * exceed 1.
482     */
483
484    if (count >= 8) {
485        uint16x8_t valpha_max, vmask_blue;
486        uint8x8_t valpha;
487
488        // prepare constants
489        valpha_max = vmovq_n_u16(255);
490        valpha = vdup_n_u8(alpha);
491        vmask_blue = vmovq_n_u16(SK_B16_MASK);
492
493        do {
494            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
495            uint16x8_t vres_a, vres_r, vres_g, vres_b;
496            uint8x8x4_t vsrc;
497
498            // load pixels
499            vdst = vld1q_u16(dst);
500#ifdef SK_CPU_ARM64
501            vsrc = sk_vld4_u8_arm64_4(src);
502#else
503#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
504            asm (
505                "vld4.u8 %h[vsrc], [%[src]]!"
506                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
507                : :
508            );
509#else
510            register uint8x8_t d0 asm("d0");
511            register uint8x8_t d1 asm("d1");
512            register uint8x8_t d2 asm("d2");
513            register uint8x8_t d3 asm("d3");
514
515            asm volatile (
516                "vld4.u8    {d0-d3},[%[src]]!;"
517                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
518                  [src] "+&r" (src)
519                : :
520            );
521            vsrc.val[0] = d0;
522            vsrc.val[1] = d1;
523            vsrc.val[2] = d2;
524            vsrc.val[3] = d3;
525#endif
526#endif // #ifdef SK_CPU_ARM64
527
528
529            // deinterleave dst
530            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
531            vdst_b = vdst & vmask_blue;                     // extract blue
532            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
533            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
534
535            // shift src to 565
536            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
537            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
538            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
539
540            // calc src * src_scale
541            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
542            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
543            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
544            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
545
546            // prepare dst_scale
547            vres_a = SkDiv255Round_neon8(vres_a);
548            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
549
550            // add dst * dst_scale to previous result
551            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
552            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
553            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
554
555#ifdef S32A_D565_BLEND_EXACT
556            // It is possible to get exact results with this but it is slow,
557            // even slower than C code in some cases
558            vres_r = SkDiv255Round_neon8(vres_r);
559            vres_g = SkDiv255Round_neon8(vres_g);
560            vres_b = SkDiv255Round_neon8(vres_b);
561#else
562            vres_r = vrshrq_n_u16(vres_r, 8);
563            vres_g = vrshrq_n_u16(vres_g, 8);
564            vres_b = vrshrq_n_u16(vres_b, 8);
565#endif
566            // pack result
567            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
568            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
569
570            // store
571            vst1q_u16(dst, vres_b);
572            dst += 8;
573            count -= 8;
574        } while (count >= 8);
575    }
576
577    // leftovers
578    while (count-- > 0) {
579        SkPMColor sc = *src++;
580        if (sc) {
581            uint16_t dc = *dst;
582            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
583            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
584            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
585            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
586            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
587        }
588        dst += 1;
589    }
590}
591
592/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
593 * each dither value is spaced out into byte lanes, and repeated
594 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
595 * start of each row.
596 */
597static const uint8_t gDitherMatrix_Neon[48] = {
598    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
599    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
600    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
601    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
602
603};
604
605void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
606                                int count, U8CPU alpha, int x, int y)
607{
608
609    SkASSERT(255 > alpha);
610
611    // rescale alpha to range 1 - 256
612    int scale = SkAlpha255To256(alpha);
613
614    if (count >= 8) {
615        /* select row and offset for dither array */
616        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
617
618        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
619        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
620
621        int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
622        uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask
623
624        do {
625
626            uint8x8x4_t vsrc;
627            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
628            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
629            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
630            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
631            uint16x8_t vdst;
632            uint16x8_t vdst_r, vdst_g, vdst_b;
633            int16x8_t vres_r, vres_g, vres_b;
634            int8x8_t vres8_r, vres8_g, vres8_b;
635
636            // Load source and add dither
637#ifdef SK_CPU_ARM64
638            vsrc = sk_vld4_u8_arm64_3(src);
639#else
640            {
641            register uint8x8_t d0 asm("d0");
642            register uint8x8_t d1 asm("d1");
643            register uint8x8_t d2 asm("d2");
644            register uint8x8_t d3 asm("d3");
645
646            asm (
647                "vld4.8    {d0-d3},[%[src]]! "
648                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
649                :
650            );
651            vsrc.val[0] = d0;
652            vsrc.val[1] = d1;
653            vsrc.val[2] = d2;
654            }
655#endif
656            vsrc_r = vsrc.val[NEON_R];
657            vsrc_g = vsrc.val[NEON_G];
658            vsrc_b = vsrc.val[NEON_B];
659
660            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
661            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
662            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
663
664            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
665            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
666            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen
667
668            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
669            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
670            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result
671
672            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
673            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
674            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
675
676            // Load dst and unpack
677            vdst = vld1q_u16(dst);
678            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
679            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
680            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue
681
682            // subtract dst from src and widen
683            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
684            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
685            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
686
687            // multiply diffs by scale and shift
688            vres_r = vmulq_s16(vres_r, vscale);
689            vres_g = vmulq_s16(vres_g, vscale);
690            vres_b = vmulq_s16(vres_b, vscale);
691
692            vres8_r = vshrn_n_s16(vres_r, 8);
693            vres8_g = vshrn_n_s16(vres_g, 8);
694            vres8_b = vshrn_n_s16(vres_b, 8);
695
696            // add dst to result
697            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
698            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
699            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
700
701            // put result into 565 format
702            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
703            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
704
705            // Store result
706            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
707
708            // Next iteration
709            dst += 8;
710            count -= 8;
711
712        } while (count >= 8);
713    }
714
715    // Leftovers
716    if (count > 0) {
717        int scale = SkAlpha255To256(alpha);
718        DITHER_565_SCAN(y);
719        do {
720            SkPMColor c = *src++;
721            SkPMColorAssert(c);
722
723            int dither = DITHER_VALUE(x);
724            int sr = SkGetPackedR32(c);
725            int sg = SkGetPackedG32(c);
726            int sb = SkGetPackedB32(c);
727            sr = SkDITHER_R32To565(sr, dither);
728            sg = SkDITHER_G32To565(sg, dither);
729            sb = SkDITHER_B32To565(sb, dither);
730
731            uint16_t d = *dst;
732            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
733                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
734                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
735            DITHER_INC_X(x);
736        } while (--count != 0);
737    }
738}
739
740void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
741                                const SkPMColor* SK_RESTRICT src,
742                                int count, U8CPU alpha) {
743
744    SkASSERT(255 == alpha);
745    if (count > 0) {
746
747
748    uint8x8_t alpha_mask;
749
750    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
751    alpha_mask = vld1_u8(alpha_mask_setup);
752
753    /* do the NEON unrolled code */
754#define    UNROLL    4
755    while (count >= UNROLL) {
756        uint8x8_t src_raw, dst_raw, dst_final;
757        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
758
759        /* The two prefetches below may make the code slighlty
760         * slower for small values of count but are worth having
761         * in the general case.
762         */
763        __builtin_prefetch(src+32);
764        __builtin_prefetch(dst+32);
765
766        /* get the source */
767        src_raw = vreinterpret_u8_u32(vld1_u32(src));
768#if    UNROLL > 2
769        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
770#endif
771
772        /* get and hold the dst too */
773        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
774#if    UNROLL > 2
775        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
776#endif
777
778    /* 1st and 2nd bits of the unrolling */
779    {
780        uint8x8_t dst_cooked;
781        uint16x8_t dst_wide;
782        uint8x8_t alpha_narrow;
783        uint16x8_t alpha_wide;
784
785        /* get the alphas spread out properly */
786        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
787        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
788
789        /* spread the dest */
790        dst_wide = vmovl_u8(dst_raw);
791
792        /* alpha mul the dest */
793        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
794        dst_cooked = vshrn_n_u16(dst_wide, 8);
795
796        /* sum -- ignoring any byte lane overflows */
797        dst_final = vadd_u8(src_raw, dst_cooked);
798    }
799
800#if    UNROLL > 2
801    /* the 3rd and 4th bits of our unrolling */
802    {
803        uint8x8_t dst_cooked;
804        uint16x8_t dst_wide;
805        uint8x8_t alpha_narrow;
806        uint16x8_t alpha_wide;
807
808        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
809        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
810
811        /* spread the dest */
812        dst_wide = vmovl_u8(dst_raw_2);
813
814        /* alpha mul the dest */
815        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
816        dst_cooked = vshrn_n_u16(dst_wide, 8);
817
818        /* sum -- ignoring any byte lane overflows */
819        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
820    }
821#endif
822
823        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
824#if    UNROLL > 2
825        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
826#endif
827
828        src += UNROLL;
829        dst += UNROLL;
830        count -= UNROLL;
831    }
832#undef    UNROLL
833
834    /* do any residual iterations */
835        while (--count >= 0) {
836            *dst = SkPMSrcOver(*src, *dst);
837            src += 1;
838            dst += 1;
839        }
840    }
841}
842
843void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
844                                const SkPMColor* SK_RESTRICT src,
845                                int count, U8CPU alpha) {
846    SkASSERT(255 == alpha);
847
848    if (count <= 0)
849    return;
850
851    /* Use these to check if src is transparent or opaque */
852    const unsigned int ALPHA_OPAQ  = 0xFF000000;
853    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
854
855#define UNROLL  4
856    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
857    const SkPMColor* SK_RESTRICT src_temp = src;
858
859    /* set up the NEON variables */
860    uint8x8_t alpha_mask;
861    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
862    alpha_mask = vld1_u8(alpha_mask_setup);
863
864    uint8x8_t src_raw, dst_raw, dst_final;
865    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
866    uint8x8_t dst_cooked;
867    uint16x8_t dst_wide;
868    uint8x8_t alpha_narrow;
869    uint16x8_t alpha_wide;
870
871    /* choose the first processing type */
872    if( src >= src_end)
873        goto TAIL;
874    if(*src <= ALPHA_TRANS)
875        goto ALPHA_0;
876    if(*src >= ALPHA_OPAQ)
877        goto ALPHA_255;
878    /* fall-thru */
879
880ALPHA_1_TO_254:
881    do {
882
883        /* get the source */
884        src_raw = vreinterpret_u8_u32(vld1_u32(src));
885        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
886
887        /* get and hold the dst too */
888        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
889        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
890
891
892        /* get the alphas spread out properly */
893        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
894        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
895        /* we collapsed (255-a)+1 ... */
896        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
897
898        /* spread the dest */
899        dst_wide = vmovl_u8(dst_raw);
900
901        /* alpha mul the dest */
902        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
903        dst_cooked = vshrn_n_u16(dst_wide, 8);
904
905        /* sum -- ignoring any byte lane overflows */
906        dst_final = vadd_u8(src_raw, dst_cooked);
907
908        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
909        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
910        /* we collapsed (255-a)+1 ... */
911        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
912
913        /* spread the dest */
914        dst_wide = vmovl_u8(dst_raw_2);
915
916        /* alpha mul the dest */
917        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
918        dst_cooked = vshrn_n_u16(dst_wide, 8);
919
920        /* sum -- ignoring any byte lane overflows */
921        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
922
923        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
924        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
925
926        src += UNROLL;
927        dst += UNROLL;
928
929        /* if 2 of the next pixels aren't between 1 and 254
930        it might make sense to go to the optimized loops */
931        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
932            break;
933
934    } while(src < src_end);
935
936    if (src >= src_end)
937        goto TAIL;
938
939    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
940        goto ALPHA_255;
941
942    /*fall-thru*/
943
944ALPHA_0:
945
946    /*In this state, we know the current alpha is 0 and
947     we optimize for the next alpha also being zero. */
948    src_temp = src;  //so we don't have to increment dst every time
949    do {
950        if(*(++src) > ALPHA_TRANS)
951            break;
952        if(*(++src) > ALPHA_TRANS)
953            break;
954        if(*(++src) > ALPHA_TRANS)
955            break;
956        if(*(++src) > ALPHA_TRANS)
957            break;
958    } while(src < src_end);
959
960    dst += (src - src_temp);
961
962    /* no longer alpha 0, so determine where to go next. */
963    if( src >= src_end)
964        goto TAIL;
965    if(*src >= ALPHA_OPAQ)
966        goto ALPHA_255;
967    else
968        goto ALPHA_1_TO_254;
969
970ALPHA_255:
971    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
972        dst[0]=src[0];
973        dst[1]=src[1];
974        dst[2]=src[2];
975        dst[3]=src[3];
976        src+=UNROLL;
977        dst+=UNROLL;
978        if(src >= src_end)
979            goto TAIL;
980    }
981
982    //Handle remainder.
983    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
984        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
985            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
986        }
987    }
988
989    if( src >= src_end)
990        goto TAIL;
991    if(*src <= ALPHA_TRANS)
992        goto ALPHA_0;
993    else
994        goto ALPHA_1_TO_254;
995
996TAIL:
997    /* do any residual iterations */
998    src_end += UNROLL + 1;  //goto the real end
999    while(src != src_end) {
1000        if( *src != 0 ) {
1001            if( *src >= ALPHA_OPAQ ) {
1002                *dst = *src;
1003            }
1004            else {
1005                *dst = SkPMSrcOver(*src, *dst);
1006            }
1007        }
1008        src++;
1009        dst++;
1010    }
1011
1012#undef    UNROLL
1013    return;
1014}
1015
1016/* Neon version of S32_Blend_BlitRow32()
1017 * portable version is in src/core/SkBlitRow_D32.cpp
1018 */
1019void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
1020                              const SkPMColor* SK_RESTRICT src,
1021                              int count, U8CPU alpha) {
1022    SkASSERT(alpha <= 255);
1023
1024    if (count <= 0) {
1025        return;
1026    }
1027
1028    uint16_t src_scale = SkAlpha255To256(alpha);
1029    uint16_t dst_scale = 256 - src_scale;
1030
1031    while (count >= 2) {
1032        uint8x8_t vsrc, vdst, vres;
1033        uint16x8_t vsrc_wide, vdst_wide;
1034
1035        /* These commented prefetches are a big win for count
1036         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
1037         * They also hurt a little (<5%) on an A15
1038         */
1039        //__builtin_prefetch(src+32);
1040        //__builtin_prefetch(dst+32);
1041
1042        // Load
1043        vsrc = vreinterpret_u8_u32(vld1_u32(src));
1044        vdst = vreinterpret_u8_u32(vld1_u32(dst));
1045
1046        // Process src
1047        vsrc_wide = vmovl_u8(vsrc);
1048        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
1049
1050        // Process dst
1051        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
1052
1053        // Combine
1054        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1055
1056        // Store
1057        vst1_u32(dst, vreinterpret_u32_u8(vres));
1058
1059        src += 2;
1060        dst += 2;
1061        count -= 2;
1062    }
1063
1064    if (count == 1) {
1065        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
1066        uint16x8_t vsrc_wide, vdst_wide;
1067
1068        // Load
1069        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
1070        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
1071
1072        // Process
1073        vsrc_wide = vmovl_u8(vsrc);
1074        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
1075        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
1076        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1077
1078        // Store
1079        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
1080    }
1081}
1082
1083#ifdef SK_CPU_ARM32
1084void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
1085                         const SkPMColor* SK_RESTRICT src,
1086                         int count, U8CPU alpha) {
1087
1088    SkASSERT(255 >= alpha);
1089
1090    if (count <= 0) {
1091        return;
1092    }
1093
1094    unsigned alpha256 = SkAlpha255To256(alpha);
1095
1096    // First deal with odd counts
1097    if (count & 1) {
1098        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
1099        uint16x8_t vdst_wide, vsrc_wide;
1100        unsigned dst_scale;
1101
1102        // Load
1103        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
1104        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
1105
1106        // Calc dst_scale
1107        dst_scale = vget_lane_u8(vsrc, 3);
1108        dst_scale *= alpha256;
1109        dst_scale >>= 8;
1110        dst_scale = 256 - dst_scale;
1111
1112        // Process src
1113        vsrc_wide = vmovl_u8(vsrc);
1114        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
1115
1116        // Process dst
1117        vdst_wide = vmovl_u8(vdst);
1118        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
1119
1120        // Combine
1121        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1122
1123        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
1124        dst++;
1125        src++;
1126        count--;
1127    }
1128
1129    if (count) {
1130        uint8x8_t alpha_mask;
1131        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
1132        alpha_mask = vld1_u8(alpha_mask_setup);
1133
1134        do {
1135
1136            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
1137            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
1138
1139            __builtin_prefetch(src+32);
1140            __builtin_prefetch(dst+32);
1141
1142            // Load
1143            vsrc = vreinterpret_u8_u32(vld1_u32(src));
1144            vdst = vreinterpret_u8_u32(vld1_u32(dst));
1145
1146            // Prepare src_scale
1147            vsrc_scale = vdupq_n_u16(alpha256);
1148
1149            // Calc dst_scale
1150            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
1151            vdst_scale = vmovl_u8(vsrc_alphas);
1152            vdst_scale *= vsrc_scale;
1153            vdst_scale = vshrq_n_u16(vdst_scale, 8);
1154            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
1155
1156            // Process src
1157            vsrc_wide = vmovl_u8(vsrc);
1158            vsrc_wide *= vsrc_scale;
1159
1160            // Process dst
1161            vdst_wide = vmovl_u8(vdst);
1162            vdst_wide *= vdst_scale;
1163
1164            // Combine
1165            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1166
1167            vst1_u32(dst, vreinterpret_u32_u8(vres));
1168
1169            src += 2;
1170            dst += 2;
1171            count -= 2;
1172        } while(count);
1173    }
1174}
1175
1176///////////////////////////////////////////////////////////////////////////////
1177
1178#undef    DEBUG_OPAQUE_DITHER
1179
1180#if    defined(DEBUG_OPAQUE_DITHER)
1181static void showme8(char *str, void *p, int len)
1182{
1183    static char buf[256];
1184    char tbuf[32];
1185    int i;
1186    char *pc = (char*) p;
1187    sprintf(buf,"%8s:", str);
1188    for(i=0;i<len;i++) {
1189        sprintf(tbuf, "   %02x", pc[i]);
1190        strcat(buf, tbuf);
1191    }
1192    SkDebugf("%s\n", buf);
1193}
1194static void showme16(char *str, void *p, int len)
1195{
1196    static char buf[256];
1197    char tbuf[32];
1198    int i;
1199    uint16_t *pc = (uint16_t*) p;
1200    sprintf(buf,"%8s:", str);
1201    len = (len / sizeof(uint16_t));    /* passed as bytes */
1202    for(i=0;i<len;i++) {
1203        sprintf(tbuf, " %04x", pc[i]);
1204        strcat(buf, tbuf);
1205    }
1206    SkDebugf("%s\n", buf);
1207}
1208#endif
1209#endif // #ifdef SK_CPU_ARM32
1210
1211void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
1212                                   const SkPMColor* SK_RESTRICT src,
1213                                   int count, U8CPU alpha, int x, int y) {
1214    SkASSERT(255 == alpha);
1215
1216#define    UNROLL    8
1217
1218    if (count >= UNROLL) {
1219
1220#if defined(DEBUG_OPAQUE_DITHER)
1221    uint16_t tmpbuf[UNROLL];
1222    int td[UNROLL];
1223    int tdv[UNROLL];
1224    int ta[UNROLL];
1225    int tap[UNROLL];
1226    uint16_t in_dst[UNROLL];
1227    int offset = 0;
1228    int noisy = 0;
1229#endif
1230
1231    uint8x8_t dbase;
1232    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1233    dbase = vld1_u8(dstart);
1234
1235        do {
1236        uint8x8x4_t vsrc;
1237        uint8x8_t sr, sg, sb, sa, d;
1238        uint16x8_t dst8, scale8, alpha8;
1239        uint16x8_t dst_r, dst_g, dst_b;
1240
1241#if defined(DEBUG_OPAQUE_DITHER)
1242        // calculate 8 elements worth into a temp buffer
1243        {
1244        int my_y = y;
1245        int my_x = x;
1246        SkPMColor* my_src = (SkPMColor*)src;
1247        uint16_t* my_dst = dst;
1248        int i;
1249
1250        DITHER_565_SCAN(my_y);
1251        for(i = 0; i < UNROLL; i++) {
1252            SkPMColor c = *my_src++;
1253            SkPMColorAssert(c);
1254            if (c) {
1255                unsigned a = SkGetPackedA32(c);
1256
1257                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1258                tdv[i] = DITHER_VALUE(my_x);
1259                ta[i] = a;
1260                tap[i] = SkAlpha255To256(a);
1261                td[i] = d;
1262
1263                unsigned sr = SkGetPackedR32(c);
1264                unsigned sg = SkGetPackedG32(c);
1265                unsigned sb = SkGetPackedB32(c);
1266                sr = SkDITHER_R32_FOR_565(sr, d);
1267                sg = SkDITHER_G32_FOR_565(sg, d);
1268                sb = SkDITHER_B32_FOR_565(sb, d);
1269
1270                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1271                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1272                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1273                // now src and dst expanded are in g:11 r:10 x:1 b:10
1274                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1275                td[i] = d;
1276            } else {
1277                tmpbuf[i] = *my_dst;
1278                ta[i] = tdv[i] = td[i] = 0xbeef;
1279            }
1280            in_dst[i] = *my_dst;
1281            my_dst += 1;
1282            DITHER_INC_X(my_x);
1283        }
1284        }
1285#endif
1286
1287#ifdef SK_CPU_ARM64
1288        vsrc = sk_vld4_u8_arm64_4(src);
1289#else
1290        {
1291        register uint8x8_t d0 asm("d0");
1292        register uint8x8_t d1 asm("d1");
1293        register uint8x8_t d2 asm("d2");
1294        register uint8x8_t d3 asm("d3");
1295
1296        asm ("vld4.8    {d0-d3},[%[src]]! "
1297            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
1298            :
1299        );
1300        vsrc.val[0] = d0;
1301        vsrc.val[1] = d1;
1302        vsrc.val[2] = d2;
1303        vsrc.val[3] = d3;
1304        }
1305#endif
1306        sa = vsrc.val[NEON_A];
1307        sr = vsrc.val[NEON_R];
1308        sg = vsrc.val[NEON_G];
1309        sb = vsrc.val[NEON_B];
1310
1311        /* calculate 'd', which will be 0..7
1312         * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
1313         */
1314        alpha8 = vmovl_u8(dbase);
1315        alpha8 = vmlal_u8(alpha8, sa, dbase);
1316        d = vshrn_n_u16(alpha8, 8);    // narrowing too
1317
1318        // sr = sr - (sr>>5) + d
1319        /* watching for 8-bit overflow.  d is 0..7; risky range of
1320         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1321         * safe  as long as we do ((sr-sr>>5) + d)
1322         */
1323        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1324        sr = vadd_u8(sr, d);
1325
1326        // sb = sb - (sb>>5) + d
1327        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1328        sb = vadd_u8(sb, d);
1329
1330        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1331        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1332        sg = vadd_u8(sg, vshr_n_u8(d,1));
1333
1334        // need to pick up 8 dst's -- at 16 bits each, 128 bits
1335        dst8 = vld1q_u16(dst);
1336        dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
1337        dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
1338        dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits
1339
1340        // blend
1341        scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1342
1343        // combine the addq and mul, save 3 insns
1344        scale8 = vshrq_n_u16(scale8, 3);
1345        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1346        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1347        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1348
1349        // repack to store
1350        dst8 = vshrq_n_u16(dst_b, 5);
1351        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1352        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1353
1354        vst1q_u16(dst, dst8);
1355
1356#if defined(DEBUG_OPAQUE_DITHER)
1357        // verify my 8 elements match the temp buffer
1358        {
1359        int i, bad=0;
1360        static int invocation;
1361
1362        for (i = 0; i < UNROLL; i++) {
1363            if (tmpbuf[i] != dst[i]) {
1364                bad=1;
1365            }
1366        }
1367        if (bad) {
1368            SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1369                     invocation, offset);
1370            SkDebugf("  alpha 0x%x\n", alpha);
1371            for (i = 0; i < UNROLL; i++)
1372                SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1373                         i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
1374                         in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);
1375
1376            showme16("alpha8", &alpha8, sizeof(alpha8));
1377            showme16("scale8", &scale8, sizeof(scale8));
1378            showme8("d", &d, sizeof(d));
1379            showme16("dst8", &dst8, sizeof(dst8));
1380            showme16("dst_b", &dst_b, sizeof(dst_b));
1381            showme16("dst_g", &dst_g, sizeof(dst_g));
1382            showme16("dst_r", &dst_r, sizeof(dst_r));
1383            showme8("sb", &sb, sizeof(sb));
1384            showme8("sg", &sg, sizeof(sg));
1385            showme8("sr", &sr, sizeof(sr));
1386
1387            return;
1388        }
1389        offset += UNROLL;
1390        invocation++;
1391        }
1392#endif
1393        dst += UNROLL;
1394        count -= UNROLL;
1395        // skip x += UNROLL, since it's unchanged mod-4
1396        } while (count >= UNROLL);
1397    }
1398#undef    UNROLL
1399
1400    // residuals
1401    if (count > 0) {
1402        DITHER_565_SCAN(y);
1403        do {
1404            SkPMColor c = *src++;
1405            SkPMColorAssert(c);
1406            if (c) {
1407                unsigned a = SkGetPackedA32(c);
1408
1409                // dither and alpha are just temporary variables to work-around
1410                // an ICE in debug.
1411                unsigned dither = DITHER_VALUE(x);
1412                unsigned alpha = SkAlpha255To256(a);
1413                int d = SkAlphaMul(dither, alpha);
1414
1415                unsigned sr = SkGetPackedR32(c);
1416                unsigned sg = SkGetPackedG32(c);
1417                unsigned sb = SkGetPackedB32(c);
1418                sr = SkDITHER_R32_FOR_565(sr, d);
1419                sg = SkDITHER_G32_FOR_565(sg, d);
1420                sb = SkDITHER_B32_FOR_565(sb, d);
1421
1422                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1423                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1424                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1425                // now src and dst expanded are in g:11 r:10 x:1 b:10
1426                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1427            }
1428            dst += 1;
1429            DITHER_INC_X(x);
1430        } while (--count != 0);
1431    }
1432}
1433
1434///////////////////////////////////////////////////////////////////////////////
1435
1436#undef    DEBUG_S32_OPAQUE_DITHER
1437
1438void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1439                                 const SkPMColor* SK_RESTRICT src,
1440                                 int count, U8CPU alpha, int x, int y) {
1441    SkASSERT(255 == alpha);
1442
1443#define    UNROLL    8
1444    if (count >= UNROLL) {
1445    uint8x8_t d;
1446    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1447    d = vld1_u8(dstart);
1448
1449    while (count >= UNROLL) {
1450        uint8x8_t sr, sg, sb;
1451        uint16x8_t dr, dg, db;
1452        uint16x8_t dst8;
1453        uint8x8x4_t vsrc;
1454
1455#ifdef SK_CPU_ARM64
1456        vsrc = sk_vld4_u8_arm64_3(src);
1457#else
1458        {
1459        register uint8x8_t d0 asm("d0");
1460        register uint8x8_t d1 asm("d1");
1461        register uint8x8_t d2 asm("d2");
1462        register uint8x8_t d3 asm("d3");
1463
1464        asm (
1465            "vld4.8    {d0-d3},[%[src]]! "
1466            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1467            :
1468        );
1469        vsrc.val[0] = d0;
1470        vsrc.val[1] = d1;
1471        vsrc.val[2] = d2;
1472        }
1473#endif
1474        sr = vsrc.val[NEON_R];
1475        sg = vsrc.val[NEON_G];
1476        sb = vsrc.val[NEON_B];
1477
1478        /* XXX: if we want to prefetch, hide it in the above asm()
1479         * using the gcc __builtin_prefetch(), the prefetch will
1480         * fall to the bottom of the loop -- it won't stick up
1481         * at the top of the loop, just after the vld4.
1482         */
1483
1484        // sr = sr - (sr>>5) + d
1485        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1486        dr = vaddl_u8(sr, d);
1487
1488        // sb = sb - (sb>>5) + d
1489        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1490        db = vaddl_u8(sb, d);
1491
1492        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1493        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1494        dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1495
1496        // pack high bits of each into 565 format  (rgb, b is lsb)
1497        dst8 = vshrq_n_u16(db, 3);
1498        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1499        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1500
1501        // store it
1502        vst1q_u16(dst, dst8);
1503
1504#if    defined(DEBUG_S32_OPAQUE_DITHER)
1505        // always good to know if we generated good results
1506        {
1507        int i, myx = x, myy = y;
1508        DITHER_565_SCAN(myy);
1509        for (i=0;i<UNROLL;i++) {
1510            // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
1511            SkPMColor c = src[i-8];
1512            unsigned dither = DITHER_VALUE(myx);
1513            uint16_t val = SkDitherRGB32To565(c, dither);
1514            if (val != dst[i]) {
1515            SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1516                c, dither, val, dst[i], dstart[i]);
1517            }
1518            DITHER_INC_X(myx);
1519        }
1520        }
1521#endif
1522
1523        dst += UNROLL;
1524        // we don't need to increment src as the asm above has already done it
1525        count -= UNROLL;
1526        x += UNROLL;        // probably superfluous
1527    }
1528    }
1529#undef    UNROLL
1530
1531    // residuals
1532    if (count > 0) {
1533        DITHER_565_SCAN(y);
1534        do {
1535            SkPMColor c = *src++;
1536            SkPMColorAssert(c);
1537            SkASSERT(SkGetPackedA32(c) == 255);
1538
1539            unsigned dither = DITHER_VALUE(x);
1540            *dst++ = SkDitherRGB32To565(c, dither);
1541            DITHER_INC_X(x);
1542        } while (--count != 0);
1543    }
1544}
1545
1546void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
1547                      SkPMColor color) {
1548    if (count <= 0) {
1549        return;
1550    }
1551
1552    if (0 == color) {
1553        if (src != dst) {
1554            memcpy(dst, src, count * sizeof(SkPMColor));
1555        }
1556        return;
1557    }
1558
1559    unsigned colorA = SkGetPackedA32(color);
1560    if (255 == colorA) {
1561        sk_memset32(dst, color, count);
1562        return;
1563    }
1564
1565    unsigned scale = 256 - SkAlpha255To256(colorA);
1566
1567    if (count >= 8) {
1568        uint32x4_t vcolor;
1569        uint8x8_t vscale;
1570
1571        vcolor = vdupq_n_u32(color);
1572
1573        // scale numerical interval [0-255], so load as 8 bits
1574        vscale = vdup_n_u8(scale);
1575
1576        do {
1577            // load src color, 8 pixels, 4 64 bit registers
1578            // (and increment src).
1579            uint32x2x4_t vsrc;
1580#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
1581            asm (
1582                "vld1.32    %h[vsrc], [%[src]]!"
1583                : [vsrc] "=w" (vsrc), [src] "+r" (src)
1584                : :
1585            );
1586#else // 64bit targets and Clang
1587            vsrc.val[0] = vld1_u32(src);
1588            vsrc.val[1] = vld1_u32(src+2);
1589            vsrc.val[2] = vld1_u32(src+4);
1590            vsrc.val[3] = vld1_u32(src+6);
1591            src += 8;
1592#endif
1593
1594            // multiply long by scale, 64 bits at a time,
1595            // destination into a 128 bit register.
1596            uint16x8x4_t vtmp;
1597            vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
1598            vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
1599            vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
1600            vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);
1601
1602            // shift the 128 bit registers, containing the 16
1603            // bit scaled values back to 8 bits, narrowing the
1604            // results to 64 bit registers.
1605            uint8x16x2_t vres;
1606            vres.val[0] = vcombine_u8(
1607                            vshrn_n_u16(vtmp.val[0], 8),
1608                            vshrn_n_u16(vtmp.val[1], 8));
1609            vres.val[1] = vcombine_u8(
1610                            vshrn_n_u16(vtmp.val[2], 8),
1611                            vshrn_n_u16(vtmp.val[3], 8));
1612
1613            // adding back the color, using 128 bit registers.
1614            uint32x4x2_t vdst;
1615            vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
1616                                               vreinterpretq_u8_u32(vcolor));
1617            vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
1618                                               vreinterpretq_u8_u32(vcolor));
1619
1620            // store back the 8 calculated pixels (2 128 bit
1621            // registers), and increment dst.
1622#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
1623            asm (
1624                "vst1.32    %h[vdst], [%[dst]]!"
1625                : [dst] "+r" (dst)
1626                : [vdst] "w" (vdst)
1627                : "memory"
1628            );
1629#else // 64bit targets and Clang
1630            vst1q_u32(dst, vdst.val[0]);
1631            vst1q_u32(dst+4, vdst.val[1]);
1632            dst += 8;
1633#endif
1634            count -= 8;
1635
1636        } while (count >= 8);
1637    }
1638
1639    while (count > 0) {
1640        *dst = color + SkAlphaMulQ(*src, scale);
1641        src += 1;
1642        dst += 1;
1643        count--;
1644    }
1645}
1646
1647///////////////////////////////////////////////////////////////////////////////
1648
1649const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1650    // no dither
1651    S32_D565_Opaque_neon,
1652    S32_D565_Blend_neon,
1653    S32A_D565_Opaque_neon,
1654#if 0
1655    S32A_D565_Blend_neon,
1656#else
1657    NULL,   // https://code.google.com/p/skia/issues/detail?id=2845
1658            // https://code.google.com/p/skia/issues/detail?id=2797
1659#endif
1660
1661    // dither
1662    S32_D565_Opaque_Dither_neon,
1663    S32_D565_Blend_Dither_neon,
1664    S32A_D565_Opaque_Dither_neon,
1665    NULL,   // S32A_D565_Blend_Dither
1666};
1667
1668const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1669    NULL,   // S32_Opaque,
1670    S32_Blend_BlitRow32_neon,        // S32_Blend,
1671    /*
1672     * We have two choices for S32A_Opaque procs. The one reads the src alpha
1673     * value and attempts to optimize accordingly.  The optimization is
1674     * sensitive to the source content and is not a win in all cases. For
1675     * example, if there are a lot of transitions between the alpha states,
1676     * the performance will almost certainly be worse.  However, for many
1677     * common cases the performance is equivalent or better than the standard
1678     * case where we do not inspect the src alpha.
1679     */
1680#if SK_A32_SHIFT == 24
1681    // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1682    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
1683#else
1684    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
1685#endif
1686#ifdef SK_CPU_ARM32
1687    S32A_Blend_BlitRow32_neon        // S32A_Blend
1688#else
1689    NULL
1690#endif
1691};
1692