// SkBlitRow_opts_arm_neon.cpp, revision f61088321c0906f62431c029173c1a7a70856ec7
/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkColor_opts_neon.h"
#include <arm_neon.h>

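/* Editorial note (not from the original source): on ARM64 the two helpers
 * below hand-roll the 4-way deinterleaving load in inline asm, presumably so
 * the post-indexed "ld4 ..., #32" form both loads 8 pixels and advances src
 * in a single instruction; the plain vld4_u8() intrinsic would need a
 * separate pointer update.
 */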
#ifdef SK_CPU_ARM64
static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;

    return vsrc;
}

static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        "mov    %[vsrc3].8b, v3.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
          [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;
    vsrc.val[3] = vsrc_3;

    return vsrc;
}
#endif

void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        vsrc = vld4_u8((uint8_t*)src);
        src += 8;
#endif

        // Convert src to 565
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        count -= 8;
    }

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    }
}

void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    uint16x8_t vmask_blue, vscale;

    // prepare constants
    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
    vmask_blue = vmovq_n_u16(0x1F);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
        uint16x8_t vres_r, vres_g, vres_b;

        // Load src
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]!"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        }
#endif

        // Load and unpack dst
        vdst = vld1q_u16(dst);
        vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
        vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
        vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
        vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green

        // Shift src to 565 range
        vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
        vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
        vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);

        // Scale src - dst
        vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
        vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
        vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;

        vres_r = vshrq_n_u16(vres_r * vscale, 8);
        vres_g = vshrq_n_u16(vres_g * vscale, 8);
        vres_b = vshrq_n_u16(vres_b * vscale, 8);

        vres_r += vdst_r;
        vres_g += vdst_g;
        vres_b += vdst_b;

        // Combine
        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue

        // Store
        vst1q_u16(dst, vres_b);
        dst += 8;
        count -= 8;
    }
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            uint16_t d = *dst;
            *dst++ = SkPackRGB16(
                    SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
                    SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
                    SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
        } while (--count != 0);
    }
}

#ifdef SK_CPU_ARM32
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

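        /* Editorial note (inferred from the asm below, not an original
         * comment): the first pass computes a full block of 8 pixels but
         * advances src/dst by only count%8 (or by 8 when count is already a
         * multiple of 8), so the leading remainder simply overlaps the first
         * full block.  Each pass stores the previous pass's result through
         * keep_dst, one iteration behind, and the final block is stored after
         * the loop at label 1:.
         */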
        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      // Thumb does not support the standard ARM conditional
                      // instructions but instead requires the 'it' instruction
                      // to signal conditional execution
                      "it eq                                  \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

            #ifdef SK_PMCOLOR_IS_RGBA
                      "vqadd.u8   d6, d6, d0                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"
            #else
                      "vqadd.u8   d6, d6, d2                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d0                  \n\t"
            #endif

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzp.u16   q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

            #ifdef SK_PMCOLOR_IS_RGBA
                      "vqadd.u8   d6, d6, d0                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"
            #else
                      "vqadd.u8   d6, d6, d2                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d0                  \n\t"
            #endif

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}

#else // #ifdef SK_CPU_ARM32

void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 16) {
        asm (
            "movi    v4.8h, #0x80                   \t\n"

            "1:                                     \t\n"
            "sub     %w[count], %w[count], #16      \t\n"
            "ld1     {v16.8h-v17.8h}, [%[dst]]      \t\n"
            "ld4     {v0.16b-v3.16b}, [%[src]], #64 \t\n"
            "prfm    pldl1keep, [%[src],#512]       \t\n"
            "prfm    pldl1keep, [%[dst],#256]       \t\n"
            "ushr    v20.8h, v17.8h, #5             \t\n"
            "ushr    v31.8h, v16.8h, #5             \t\n"
            "xtn     v6.8b, v31.8h                  \t\n"
            "xtn2    v6.16b, v20.8h                 \t\n"
            "ushr    v20.8h, v17.8h, #11            \t\n"
            "shl     v19.16b, v6.16b, #2            \t\n"
            "ushr    v31.8h, v16.8h, #11            \t\n"
            "xtn     v22.8b, v31.8h                 \t\n"
            "xtn2    v22.16b, v20.8h                \t\n"
            "shl     v18.16b, v22.16b, #3           \t\n"
            "mvn     v3.16b, v3.16b                 \t\n"
            "xtn     v16.8b, v16.8h                 \t\n"
            "mov     v7.16b, v4.16b                 \t\n"
            "xtn2    v16.16b, v17.8h                \t\n"
            "umlal   v7.8h, v3.8b, v19.8b           \t\n"
            "shl     v16.16b, v16.16b, #3           \t\n"
            "mov     v22.16b, v4.16b                \t\n"
            "ushr    v24.8h, v7.8h, #6              \t\n"
            "umlal   v22.8h, v3.8b, v18.8b          \t\n"
            "ushr    v20.8h, v22.8h, #5             \t\n"
            "addhn   v20.8b, v22.8h, v20.8h         \t\n"
            "cmp     %w[count], #16                 \t\n"
            "mov     v6.16b, v4.16b                 \t\n"
            "mov     v5.16b, v4.16b                 \t\n"
            "umlal   v6.8h, v3.8b, v16.8b           \t\n"
            "umlal2  v5.8h, v3.16b, v19.16b         \t\n"
            "mov     v17.16b, v4.16b                \t\n"
            "ushr    v19.8h, v6.8h, #5              \t\n"
            "umlal2  v17.8h, v3.16b, v18.16b        \t\n"
            "addhn   v7.8b, v7.8h, v24.8h           \t\n"
            "ushr    v18.8h, v5.8h, #6              \t\n"
            "ushr    v21.8h, v17.8h, #5             \t\n"
            "addhn2  v7.16b, v5.8h, v18.8h          \t\n"
            "addhn2  v20.16b, v17.8h, v21.8h        \t\n"
            "mov     v22.16b, v4.16b                \t\n"
            "addhn   v6.8b, v6.8h, v19.8h           \t\n"
            "umlal2  v22.8h, v3.16b, v16.16b        \t\n"
            "ushr    v5.8h, v22.8h, #5              \t\n"
            "addhn2  v6.16b, v22.8h, v5.8h          \t\n"
            "uqadd   v7.16b, v1.16b, v7.16b         \t\n"
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
            "uqadd   v20.16b, v2.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v0.16b, v6.16b         \t\n"
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
            "uqadd   v20.16b, v0.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v2.16b, v6.16b         \t\n"
#else
#error "This function only supports BGRA and RGBA."
#endif
            "shll    v22.8h, v20.8b, #8             \t\n"
            "shll    v5.8h, v7.8b, #8               \t\n"
            "sri     v22.8h, v5.8h, #5              \t\n"
            "shll    v17.8h, v6.8b, #8              \t\n"
            "shll2   v23.8h, v20.16b, #8            \t\n"
            "shll2   v7.8h, v7.16b, #8              \t\n"
            "sri     v22.8h, v17.8h, #11            \t\n"
            "sri     v23.8h, v7.8h, #5              \t\n"
            "shll2   v6.8h, v6.16b, #8              \t\n"
            "st1     {v22.8h}, [%[dst]], #16        \t\n"
            "sri     v23.8h, v6.8h, #11             \t\n"
            "st1     {v23.8h}, [%[dst]], #16        \t\n"
            "b.ge    1b                             \t\n"
            : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
            :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
               "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
               "v31"
        );
    }
    // Leftovers
    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
        } while (--count != 0);
    }
}
#endif // #ifdef SK_CPU_ARM32

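/* Editorial note: pmcolor_to_expand16() spreads the 8-bit source channels
 * into the same "expanded" layout that SkExpand_rgb_16() produces for a 565
 * dst (blue from bit 0/2, red from bit 11/13, green from bit 21/24 for
 * dst/src respectively), leaving headroom above each channel so the scale
 * multiply below cannot carry into a neighbour; a single >> 5 then drops
 * everything back to packable 5/6/5 positions.
 */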
static uint32_t pmcolor_to_expand16(SkPMColor c) {
    unsigned r = SkGetPackedR32(c);
    unsigned g = SkGetPackedG32(c);
    unsigned b = SkGetPackedB32(c);
    return (g << 24) | (r << 13) | (b << 2);
}

void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {
    uint32_t src_expand;
    unsigned scale;
    uint16x8_t vmask_blue;

    if (count <= 0) return;
    SkASSERT(((size_t)dst & 0x01) == 0);

496     * This preamble code is in order to make dst aligned to 8 bytes
497     * in the next mutiple bytes read & write access.
498     */
499    src_expand = pmcolor_to_expand16(src);
500    scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
501
502#define DST_ALIGN 8
503
504    /*
505     * preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 bytes at a time.
506     */
507    int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);
508
509    for (int i = 0; i < preamble_size; i+=2, dst++) {
510        uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
511        *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
512        if (--count == 0)
513            break;
514    }
515
516    int count16 = 0;
517    count16 = count >> 4;
518    vmask_blue = vmovq_n_u16(SK_B16_MASK);
519
520    if (count16) {
521        uint16x8_t wide_sr;
522        uint16x8_t wide_sg;
523        uint16x8_t wide_sb;
524        uint16x8_t wide_256_sa;
525
526        unsigned sr = SkGetPackedR32(src);
527        unsigned sg = SkGetPackedG32(src);
528        unsigned sb = SkGetPackedB32(src);
529        unsigned sa = SkGetPackedA32(src);
530
531        // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb
532        // sr: 8-bit based, dr: 5-bit based, with dr x ((256-sa)>>3), 5-bit left shifted,
533        //thus, for sr, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
534        wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift
535
536        // sg: 8-bit based, dg: 6-bit based, with dg x ((256-sa)>>3), 5-bit left shifted,
537        //thus, for sg, do 3-bit left shift to match MSB : (8 + 3 = 6 + 5)
538        wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift
539
540        // sb: 8-bit based, db: 5-bit based, with db x ((256-sa)>>3), 5-bit left shifted,
541        //thus, for sb, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
542        wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift
543
544        wide_256_sa =
545            vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3
546
547        while (count16-- > 0) {
548            uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;
549            uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;
550            vdst1 = vld1q_u16(dst);
551            dst += 8;
552            vdst2 = vld1q_u16(dst);
553            dst -= 8;    //to store dst again.
554
555            vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS);                 // shift green to top of lanes
556            vdst1_b = vdst1 & vmask_blue;                              // extract blue
557            vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT);                // extract red
558            vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green
559
560            vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS);                 // shift green to top of lanes
561            vdst2_b = vdst2 & vmask_blue;                              // extract blue
562            vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT);                // extract red
563            vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green
564
565            vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r);        // sr + (256-sa) x dr1
566            vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g);        // sg + (256-sa) x dg1
567            vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b);        // sb + (256-sa) x db1
568
569            vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r);        // sr + (256-sa) x dr2
570            vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g);        // sg + (256-sa) x dg2
571            vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b);        // sb + (256-sa) x db2
572
573            vdst1_r = vshrq_n_u16(vdst1_r, 5);                         // 5-bit right shift for 5-bit red
574            vdst1_g = vshrq_n_u16(vdst1_g, 5);                         // 5-bit right shift for 6-bit green
575            vdst1_b = vshrq_n_u16(vdst1_b, 5);                         // 5-bit right shift for 5-bit blue
576
577            vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT);       // insert green into blue
578            vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT);         // insert red into green/blue
579
580            vdst2_r = vshrq_n_u16(vdst2_r, 5);                         // 5-bit right shift for 5-bit red
581            vdst2_g = vshrq_n_u16(vdst2_g, 5);                         // 5-bit right shift for 6-bit green
582            vdst2_b = vshrq_n_u16(vdst2_b, 5);                         // 5-bit right shift for 5-bit blue
583
584            vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT);       // insert green into blue
585            vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT);         // insert red into green/blue
586
587            vst1q_u16(dst, vdst1);
588            dst += 8;
589            vst1q_u16(dst, vdst2);
590            dst += 8;
591        }
592    }
593
594    count &= 0xF;
595    if (count > 0) {
596        do {
597            uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
598            *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
599            dst += 1;
600        } while (--count != 0);
601    }
602}
603
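/* Rounded division by 255 via the usual identity
 *     ((x + 128) + ((x + 128) >> 8)) >> 8  ==  SkDiv255Round(x)
 * for 16-bit x, avoiding a real divide (editorial note).
 */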
static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);
    return vshrq_n_u16(prod, 8);
}

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        // prepare constants
        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);

        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#elif (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8    {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif


            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                     // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = (SkPacked32ToR16(sc) * alpha) + (SkGetPackedR16(dc) * dst_scale);
            unsigned dg = (SkPacked32ToG16(sc) * alpha) + (SkGetPackedG16(dc) * dst_scale);
            unsigned db = (SkPacked32ToB16(sc) * alpha) + (SkGetPackedB16(dc) * dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
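/* Editorial note: a row is selected with (y & 3) * 12 and a starting column
 * with (x & 3); because each 4-entry row is repeated three times, vld1_u8()
 * can always pull 8 consecutive dither values for pixels x .. x+7 without
 * wrapping.
 */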

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{

    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask

        do {

            uint8x8x4_t vsrc;
            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8    {d0-d3},[%[src]]! "
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            }
#endif
            vsrc_r = vsrc.val[NEON_R];
            vsrc_g = vsrc.val[NEON_G];
            vsrc_b = vsrc.val[NEON_B];

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;

        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {


    uint8x8_t alpha_mask;

    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    /* do the NEON unrolled code */
#define    UNROLL    4
    while (count >= UNROLL) {
        uint8x8_t src_raw, dst_raw, dst_final;
        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

        /* The two prefetches below may make the code slightly
         * slower for small values of count but are worth having
         * in the general case.
         */
        __builtin_prefetch(src+32);
        __builtin_prefetch(dst+32);

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if    UNROLL > 2
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if    UNROLL > 2
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

    /* 1st and 2nd bits of the unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);
    }

#if    UNROLL > 2
    /* the 3rd and 4th bits of our unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
    }
#endif

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if    UNROLL > 2
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

        src += UNROLL;
        dst += UNROLL;
        count -= UNROLL;
    }
#undef    UNROLL

    /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}

void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
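    /* Editorial note: comparing a whole SkPMColor against these constants
     * classifies its alpha: c >= ALPHA_OPAQ only when alpha == 0xFF, and
     * c <= ALPHA_TRANS only when alpha == 0.  This relies on alpha living in
     * the top byte, which is why the dispatch table below only selects this
     * proc when SK_A32_SHIFT == 24.
     */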

#define UNROLL  4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));


        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
        it might make sense to go to the optimized loops */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);

    if (src >= src_end)
        goto TAIL;

    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /*fall-thru*/

ALPHA_0:

    /* In this state, we know the current alpha is 0 and
     we optimize for the next alpha also being zero. */
    src_temp = src;  //so we don't have to increment dst every time
    do {
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
    } while(src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end)
        goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0]=src[0];
        dst[1]=src[1];
        dst[2]=src[2];
        dst[3]=src[3];
        src+=UNROLL;
        dst+=UNROLL;
        if(src >= src_end)
            goto TAIL;
    }

    //Handle remainder.
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  //go to the real end
    while(src != src_end) {
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef    UNROLL
    return;
}

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}

#ifdef SK_CPU_ARM32
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src,
                         int count, U8CPU alpha) {

    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale *= alpha256;
        dst_scale >>= 8;
        dst_scale = 256 - dst_scale;

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {

            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while(count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#endif // #ifdef SK_CPU_ARM32

void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8

    if (count >= UNROLL) {

    uint8x8_t dbase;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    dbase = vld1_u8(dstart);

        do {
        uint8x8x4_t vsrc;
        uint8x8_t sr, sg, sb, sa, d;
        uint16x8_t dst8, scale8, alpha8;
        uint16x8_t dst_r, dst_g, dst_b;

#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_4(src);
#else
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%[src]]! "
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        vsrc.val[3] = d3;
        }
#endif
        sa = vsrc.val[NEON_A];
        sr = vsrc.val[NEON_R];
        sg = vsrc.val[NEON_G];
        sb = vsrc.val[NEON_B];

        /* calculate 'd', which will be 0..7
         * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
         */
        alpha8 = vmovl_u8(dbase);
        alpha8 = vmlal_u8(alpha8, sa, dbase);
        d = vshrn_n_u16(alpha8, 8);    // narrowing too

        // sr = sr - (sr>>5) + d
        /* watching for 8-bit overflow.  d is 0..7; risky range of
         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
         * safe  as long as we do ((sr-sr>>5) + d)
         */
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        sr = vadd_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        sb = vadd_u8(sb, d);

        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        sg = vadd_u8(sg, vshr_n_u8(d,1));

        // need to pick up 8 dst's -- at 16 bits each, 128 bits
        dst8 = vld1q_u16(dst);
        dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
        dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
        dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits

        // blend
        scale8 = vsubw_u8(vdupq_n_u16(256), sa);

        // combine the addq and mul, save 3 insns
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
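        /* Editorial note: with scale8 = (256 - sa) >> 3 and the sources
         * pre-shifted left by 2/3/2 bits, each accumulator now holds
         * (src_chan << k) + dst_chan * scale8; the single >> 5 in the repack
         * below then yields roughly src_565 + dst_565 * (256 - sa) / 256 for
         * every channel.
         */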

        // repack to store
        dst8 = vshrq_n_u16(dst_b, 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

        vst1q_u16(dst, dst8);

        dst += UNROLL;
        count -= UNROLL;
        // skip x += UNROLL, since it's unchanged mod-4
        } while (count >= UNROLL);
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8
    if (count >= UNROLL) {
    uint8x8_t d;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    d = vld1_u8(dstart);

    while (count >= UNROLL) {
        uint8x8_t sr, sg, sb;
        uint16x8_t dr, dg, db;
        uint16x8_t dst8;
        uint8x8x4_t vsrc;

#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]! "
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        }
#endif
        sr = vsrc.val[NEON_R];
        sg = vsrc.val[NEON_G];
        sb = vsrc.val[NEON_B];

        /* XXX: if we want to prefetch, hide it in the above asm()
         * using the gcc __builtin_prefetch(), the prefetch will
         * fall to the bottom of the loop -- it won't stick up
         * at the top of the loop, just after the vld4.
         */

        // sr = sr - (sr>>5) + d
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        dr = vaddl_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        db = vaddl_u8(sb, d);

        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        dg = vaddl_u8(sg, vshr_n_u8(d, 1));

        // pack high bits of each into 565 format  (rgb, b is lsb)
        dst8 = vshrq_n_u16(db, 3);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

        // store it
        vst1q_u16(dst, dst8);

        dst += UNROLL;
        // we don't need to increment src as the asm above has already done it
        count -= UNROLL;
        x += UNROLL;        // probably superfluous
    }
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    S32_D565_Opaque_neon,
    S32_D565_Blend_neon,
    S32A_D565_Opaque_neon,
#if 0
    S32A_D565_Blend_neon,
#else
    nullptr,   // https://code.google.com/p/skia/issues/detail?id=2797
#endif

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    nullptr,   // S32A_D565_Blend_Dither
};

const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
    Color32A_D565_neon,    // Color32_D565,
    Color32A_D565_neon,    // Color32A_D565,
    Color32A_D565_neon,    // Color32_D565_Dither,
    Color32A_D565_neon,    // Color32A_D565_Dither
};

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    nullptr,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,        // S32_Blend,
    /*
     * We have two choices for the S32A_Opaque proc. One reads the src alpha
     * value and attempts to optimize accordingly.  The optimization is
     * sensitive to the source content and is not a win in all cases. For
     * example, if there are a lot of transitions between the alpha states,
     * the performance will almost certainly be worse.  However, for many
     * common cases the performance is equivalent or better than the standard
     * case where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies the top byte (bits 24-31) of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
#endif
#ifdef SK_CPU_ARM32
    S32A_Blend_BlitRow32_neon        // S32A_Blend
#else
    nullptr
#endif
};