/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkColor_opts_neon.h"
#include <arm_neon.h>

#ifdef SK_CPU_ARM64
static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;

    return vsrc;
}

static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        "mov    %[vsrc3].8b, v3.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
          [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;
    vsrc.val[3] = vsrc_3;

    return vsrc;
}
#endif
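
/* These wrappers keep the deinterleaving load as a single ld4 with
 * post-increment. A plain-intrinsics sketch of what they do (assuming the
 * compiler's own vld4_u8 lowering is acceptable) would be:
 *
 *     uint8x8x4_t vsrc = vld4_u8((const uint8_t*)src);
 *     src += 8;
 *
 * which is the form S32_D565_Opaque_neon() below uses on 32-bit ARM.
 */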

void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        vsrc = vld4_u8((uint8_t*)src);
        src += 8;
#endif

        // Convert src to 565
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        count -= 8;
    }

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    }
}
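
/* For reference, the scalar equivalent of the 8888 -> 565 conversion done by
 * SkPixel32ToPixel16_neon8 above (and SkPixel32ToPixel16_ToU16 in the tail)
 * is, per pixel:
 *
 *     r5 = r8 >> 3;  g6 = g8 >> 2;  b5 = b8 >> 3;
 *     d  = (r5 << 11) | (g6 << 5) | b5;
 *
 * e.g. r8 = 128, g8 = 128, b8 = 64 -> r5 = 16, g6 = 32, b5 = 8 -> d = 0x8408.
 */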

void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    uint16x8_t vmask_blue, vscale;

    // prepare constants
    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
    vmask_blue = vmovq_n_u16(0x1F);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
        uint16x8_t vres_r, vres_g, vres_b;

        // Load src
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]!"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        }
#endif

        // Load and unpack dst
        vdst = vld1q_u16(dst);
        vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
        vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
        vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
        vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green

        // Shift src to 565 range
        vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
        vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
        vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);

        // Scale src - dst
        vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
        vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
        vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;

        vres_r = vshrq_n_u16(vres_r * vscale, 8);
        vres_g = vshrq_n_u16(vres_g * vscale, 8);
        vres_b = vshrq_n_u16(vres_b * vscale, 8);

        vres_r += vdst_r;
        vres_g += vdst_g;
        vres_b += vdst_b;

        // Combine
        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue

        // Store
        vst1q_u16(dst, vres_b);
        dst += 8;
        count -= 8;
    }
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            uint16_t d = *dst;
            *dst++ = SkPackRGB16(
                    SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
                    SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
                    SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
        } while (--count != 0);
    }
}
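
/* Per 565 component, the vector loop above performs the same lerp as the
 * scalar tail's SkAlphaBlend: with scale = SkAlpha255To256(alpha) in [1, 256],
 *
 *     res = dst + (((src - dst) * scale) >> 8)
 *
 * e.g. src_r = 31, dst_r = 0, alpha = 127 -> scale = 128 ->
 * res = 0 + ((31 * 128) >> 8) = 15. The u16 subtraction can wrap when
 * src < dst, but the resulting excess stays above the field bits kept by the
 * final vsli packing.
 */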

#ifdef SK_CPU_ARM32
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        int32_t tmp = 0;

        asm volatile (
                      "ands       %[tmp], %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                      \n\t"
                      "vld1.16    {q12}, [%[dst]]                 \n\t"
                      "vld4.8     {d0-d3}, [%[src]]               \n\t"
                      // Thumb does not support the standard ARM conditional
                      // instructions but instead requires the 'it' instruction
                      // to signal conditional execution
                      "it eq                                      \n\t"
                      "moveq      %[tmp], #8                      \n\t"
                      "mov        ip, %[dst]                      \n\t"

                      "add        %[src], %[src], %[tmp], LSL#2   \n\t"
                      "add        %[dst], %[dst], %[tmp], LSL#1   \n\t"
                      "subs       %[count], %[count], %[tmp]      \n\t"
                      "b          9f                              \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!                \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!              \n\t"
                      "vst1.16    {q10}, [ip]                     \n\t"
                      "sub        ip, %[dst], #8*2                \n\t"
                      "subs       %[count], %[count], #8          \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                    \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                         \n\t"
                      "vshr.u16   q11, q12, #5                    \n\t"
                      "vshr.u16   q10, q12, #6+5                  \n\t"
                      "vmovn.u16  d5, q11                         \n\t"
                      "vmovn.u16  d6, q10                         \n\t"
                      "vshl.u8    d4, d4, #3                      \n\t"
                      "vshl.u8    d5, d5, #2                      \n\t"
                      "vshl.u8    d6, d6, #3                      \n\t"

                      "vmovl.u8   q14, d31                        \n\t"
                      "vmovl.u8   q13, d31                        \n\t"
                      "vmovl.u8   q12, d31                        \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                         \n\t"
                      "vmlal.u8   q14, d30, d6                    \n\t"
                      "vmlal.u8   q13, d30, d5                    \n\t"
                      "vmlal.u8   q12, d30, d4                    \n\t"
                      "vshr.u16   q8, q14, #5                     \n\t"
                      "vshr.u16   q9, q13, #6                     \n\t"
                      "vaddhn.u16 d6, q14, q8                     \n\t"
                      "vshr.u16   q8, q12, #5                     \n\t"
                      "vaddhn.u16 d5, q13, q9                     \n\t"
                      "vaddhn.u16 d4, q12, q8                     \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

            #ifdef SK_PMCOLOR_IS_RGBA
                      "vqadd.u8   d6, d6, d0                      \n\t"
                      "vqadd.u8   d5, d5, d1                      \n\t"
                      "vqadd.u8   d4, d4, d2                      \n\t"
            #else
                      "vqadd.u8   d6, d6, d2                      \n\t"
                      "vqadd.u8   d5, d5, d1                      \n\t"
                      "vqadd.u8   d4, d4, d0                      \n\t"
            #endif

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                     \n\t"
                      "vshll.u8   q3, d5, #8                      \n\t"
                      "vshll.u8   q2, d4, #8                      \n\t"
                      "vsri.u16   q10, q3, #5                     \n\t"
                      "vsri.u16   q10, q2, #11                    \n\t"

                      "bne        2b                              \n\t"

                      "1:                                         \n\t"
                      "vst1.16    {q10}, [ip]                     \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [src] "r" (src), [tmp] "r"(tmp)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    } else {
        // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzp.u16   q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

            #ifdef SK_PMCOLOR_IS_RGBA
                      "vqadd.u8   d6, d6, d0                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"
            #else
                      "vqadd.u8   d6, d6, d2                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d0                  \n\t"
            #endif

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}

#else // #ifdef SK_CPU_ARM32

void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 16) {
        asm (
            "movi    v4.8h, #0x80                   \t\n"

            "1:                                     \t\n"
            "sub     %w[count], %w[count], #16      \t\n"
            "ld1     {v16.8h-v17.8h}, [%[dst]]      \t\n"
            "ld4     {v0.16b-v3.16b}, [%[src]], #64 \t\n"
            "prfm    pldl1keep, [%[src],#512]       \t\n"
            "prfm    pldl1keep, [%[dst],#256]       \t\n"
            "ushr    v20.8h, v17.8h, #5             \t\n"
            "ushr    v31.8h, v16.8h, #5             \t\n"
            "xtn     v6.8b, v31.8h                  \t\n"
            "xtn2    v6.16b, v20.8h                 \t\n"
            "ushr    v20.8h, v17.8h, #11            \t\n"
            "shl     v19.16b, v6.16b, #2            \t\n"
            "ushr    v31.8h, v16.8h, #11            \t\n"
            "xtn     v22.8b, v31.8h                 \t\n"
            "xtn2    v22.16b, v20.8h                \t\n"
            "shl     v18.16b, v22.16b, #3           \t\n"
            "mvn     v3.16b, v3.16b                 \t\n"
            "xtn     v16.8b, v16.8h                 \t\n"
            "mov     v7.16b, v4.16b                 \t\n"
            "xtn2    v16.16b, v17.8h                \t\n"
            "umlal   v7.8h, v3.8b, v19.8b           \t\n"
            "shl     v16.16b, v16.16b, #3           \t\n"
            "mov     v22.16b, v4.16b                \t\n"
            "ushr    v24.8h, v7.8h, #6              \t\n"
            "umlal   v22.8h, v3.8b, v18.8b          \t\n"
            "ushr    v20.8h, v22.8h, #5             \t\n"
            "addhn   v20.8b, v22.8h, v20.8h         \t\n"
            "cmp     %w[count], #16                 \t\n"
            "mov     v6.16b, v4.16b                 \t\n"
            "mov     v5.16b, v4.16b                 \t\n"
            "umlal   v6.8h, v3.8b, v16.8b           \t\n"
            "umlal2  v5.8h, v3.16b, v19.16b         \t\n"
            "mov     v17.16b, v4.16b                \t\n"
            "ushr    v19.8h, v6.8h, #5              \t\n"
            "umlal2  v17.8h, v3.16b, v18.16b        \t\n"
            "addhn   v7.8b, v7.8h, v24.8h           \t\n"
            "ushr    v18.8h, v5.8h, #6              \t\n"
            "ushr    v21.8h, v17.8h, #5             \t\n"
            "addhn2  v7.16b, v5.8h, v18.8h          \t\n"
            "addhn2  v20.16b, v17.8h, v21.8h        \t\n"
            "mov     v22.16b, v4.16b                \t\n"
            "addhn   v6.8b, v6.8h, v19.8h           \t\n"
            "umlal2  v22.8h, v3.16b, v16.16b        \t\n"
            "ushr    v5.8h, v22.8h, #5              \t\n"
            "addhn2  v6.16b, v22.8h, v5.8h          \t\n"
            "uqadd   v7.16b, v1.16b, v7.16b         \t\n"
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
            "uqadd   v20.16b, v2.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v0.16b, v6.16b         \t\n"
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
            "uqadd   v20.16b, v0.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v2.16b, v6.16b         \t\n"
#else
#error "This function only supports BGRA and RGBA."
#endif
            "shll    v22.8h, v20.8b, #8             \t\n"
            "shll    v5.8h, v7.8b, #8               \t\n"
            "sri     v22.8h, v5.8h, #5              \t\n"
            "shll    v17.8h, v6.8b, #8              \t\n"
            "shll2   v23.8h, v20.16b, #8            \t\n"
            "shll2   v7.8h, v7.16b, #8              \t\n"
            "sri     v22.8h, v17.8h, #11            \t\n"
            "sri     v23.8h, v7.8h, #5              \t\n"
            "shll2   v6.8h, v6.16b, #8              \t\n"
            "st1     {v22.8h}, [%[dst]], #16        \t\n"
            "sri     v23.8h, v6.8h, #11             \t\n"
            "st1     {v23.8h}, [%[dst]], #16        \t\n"
            "b.ge    1b                             \t\n"
            : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
            :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
               "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
               "v31"
        );
    }

    // Leftovers
    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
        } while (--count != 0);
    }
}
#endif // #ifdef SK_CPU_ARM32

static uint32_t pmcolor_to_expand16(SkPMColor c) {
    unsigned r = SkGetPackedR32(c);
    unsigned g = SkGetPackedG32(c);
    unsigned b = SkGetPackedB32(c);
    return (g << 24) | (r << 13) | (b << 2);
}
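
/* For reference, a scalar sketch of the per-pixel blend Color32A_D565_neon()
 * performs with this expanded value (illustrative only; the helper below is
 * not part of the original code and is kept out of the build). The expanded
 * layout is g:11 | r:10 | x:1 | b:10 (MSB to LSB), which leaves enough
 * headroom to scale all three channels by a 5-bit factor with one 32-bit
 * multiply:
 */
#if 0   // exposition only
static uint16_t color32a_d565_one_pixel(uint16_t dst, SkPMColor src) {
    uint32_t src_expand = pmcolor_to_expand16(src);
    unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; // [0, 32]
    uint32_t dst_expand = SkExpand_rgb_16(dst) * scale;                // scale dst
    return SkCompact_rgb_16((src_expand + dst_expand) >> 5);          // renormalize
}
#endif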

void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {
    uint32_t src_expand;
    unsigned scale;
    uint16x8_t vmask_blue;

    if (count <= 0) return;
    SkASSERT(((size_t)dst & 0x01) == 0);

    /*
     * This preamble aligns dst to 8 bytes so that the wide NEON loads and
     * stores below are done on aligned addresses.
     */
    src_expand = pmcolor_to_expand16(src);
    scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;

#define DST_ALIGN 8

    /*
     * preamble_size is in bytes; this loop consumes one 2-byte pixel per
     * iteration.
     */
    int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);

    for (int i = 0; i < preamble_size; i += 2, dst++) {
        uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
        *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
        if (--count == 0)
            break;
    }

    int count16 = count >> 4;
    vmask_blue = vmovq_n_u16(SK_B16_MASK);

    if (count16) {
        uint16x8_t wide_sr;
        uint16x8_t wide_sg;
        uint16x8_t wide_sb;
        uint16x8_t wide_256_sa;

        unsigned sr = SkGetPackedR32(src);
        unsigned sg = SkGetPackedG32(src);
        unsigned sb = SkGetPackedB32(src);
        unsigned sa = SkGetPackedA32(src);

        // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb
        // sr is 8-bit and dr is 5-bit; dr x ((256-sa)>>3) is left-shifted by 5,
        // so shift sr left by 2 to line up the MSBs: 8 + 2 = 5 + 5.
        wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift

        // sg is 8-bit and dg is 6-bit; dg x ((256-sa)>>3) is left-shifted by 5,
        // so shift sg left by 3 to line up the MSBs: 8 + 3 = 6 + 5.
        wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift

        // sb is 8-bit and db is 5-bit; db x ((256-sa)>>3) is left-shifted by 5,
        // so shift sb left by 2 to line up the MSBs: 8 + 2 = 5 + 5.
        wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift

        wide_256_sa =
            vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3

        while (count16-- > 0) {
            uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;
            uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;
            vdst1 = vld1q_u16(dst);
            dst += 8;
            vdst2 = vld1q_u16(dst);
            dst -= 8;    // rewind so the results are stored over the pixels just read

            vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS);                 // shift green to top of lanes
            vdst1_b = vdst1 & vmask_blue;                              // extract blue
            vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT);                // extract red
            vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green

            vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS);                 // shift green to top of lanes
            vdst2_b = vdst2 & vmask_blue;                              // extract blue
            vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT);                // extract red
            vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green

            vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r);        // sr + (256-sa) x dr1
            vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g);        // sg + (256-sa) x dg1
            vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b);        // sb + (256-sa) x db1

            vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r);        // sr + (256-sa) x dr2
            vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g);        // sg + (256-sa) x dg2
            vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b);        // sb + (256-sa) x db2

            vdst1_r = vshrq_n_u16(vdst1_r, 5);                         // 5-bit right shift for 5-bit red
            vdst1_g = vshrq_n_u16(vdst1_g, 5);                         // 5-bit right shift for 6-bit green
            vdst1_b = vshrq_n_u16(vdst1_b, 5);                         // 5-bit right shift for 5-bit blue

            vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT);       // insert green into blue
            vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT);         // insert red into green/blue

            vdst2_r = vshrq_n_u16(vdst2_r, 5);                         // 5-bit right shift for 5-bit red
            vdst2_g = vshrq_n_u16(vdst2_g, 5);                         // 5-bit right shift for 6-bit green
            vdst2_b = vshrq_n_u16(vdst2_b, 5);                         // 5-bit right shift for 5-bit blue

            vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT);       // insert green into blue
            vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT);         // insert red into green/blue

            vst1q_u16(dst, vdst1);
            dst += 8;
            vst1q_u16(dst, vdst2);
            dst += 8;
        }
    }

    count &= 0xF;
    if (count > 0) {
        do {
            uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
            *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
            dst += 1;
        } while (--count != 0);
    }
}

static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);
    return vshrq_n_u16(prod, 8);
}
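
/* SkDiv255Round_neon8 is the standard exact rounding division by 255 for a
 * product of two 8-bit values:
 *
 *     div255round(x) = (x + 128 + ((x + 128) >> 8)) >> 8,  0 <= x <= 255*255
 *
 * e.g. x = 255*128 = 32640: 32640 + 128 = 32768; 32768 >> 8 = 128;
 * (32768 + 128) >> 8 = 128, which matches round(32640 / 255) = 128.
 */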

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        // prepare constants
        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);

        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#elif (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8    {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif

            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                     // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = (SkPacked32ToR16(sc) * alpha) + (SkGetPackedR16(dc) * dst_scale);
            unsigned dg = (SkPacked32ToG16(sc) * alpha) + (SkGetPackedG16(dc) * dst_scale);
            unsigned db = (SkPacked32ToB16(sc) * alpha) + (SkGetPackedB16(dc) * dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}
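
/* Per component, the vector loop above computes
 *
 *     res = (src_565 * alpha + dst_565 * (255 - div255round(sa * alpha))) / 255
 *
 * except that the final division by 255 is approximated by
 * vrshrq_n_u16(x, 8), i.e. round(x / 256). That substitution is what causes
 * the off-by-at-most-1 mismatches noted at the top of the function; defining
 * S32A_D565_BLEND_EXACT swaps the exact SkDiv255Round_neon8 back in, at a
 * measurable cost in speed.
 */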

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
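
/* Each 12-byte row above is one 4-entry dither row repeated three times, so
 * &gDitherMatrix_Neon[(y&3)*12 + (x&3)] always yields 8 contiguous values
 * with the correct phase. e.g. y = 2, x = 5: row 2 starts at offset 24 and
 * (x&3) = 1, so the 8 bytes loaded are { 5, 0, 4, 1, 5, 0, 4, 1 }.
 */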

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{
    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);   // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);  // set up blue mask

        do {
            uint8x8x4_t vsrc;
            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8    {d0-d3},[%[src]]! "
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            }
#endif
            vsrc_r = vsrc.val[NEON_R];
            vsrc_g = vsrc.val[NEON_G];
            vsrc_b = vsrc.val[NEON_B];

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract green
            vdst_r = vshrq_n_u16(vdst, 6+5);                 // shift down to get red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
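
/* Per component, the dither-and-narrow step above computes (matching the
 * scalar SkDITHER_*32To565 macros used in the leftover loop):
 *
 *     r5 = (r8 + d      - (r8 >> 5)) >> 3
 *     g6 = (g8 + (d>>1) - (g8 >> 6)) >> 2
 *     b5 = (b8 + d      - (b8 >> 5)) >> 3
 *
 * Subtracting the component's own top bits before adding the dither keeps the
 * sum in range at the extremes: r8 = 255, d = 7 gives (255 + 7 - 7) >> 3 = 31.
 */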

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard), but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15.
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
#else
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);
#endif

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
#else
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);
#endif

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}
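
/* The SK_SUPPORT_LEGACY_BROKEN_LERP variant above shifts src and dst down
 * before adding, which can come out 1 low because the two discarded fraction
 * halves may sum to a whole unit. e.g. src = 1, dst = 1, alpha = 127
 * (src_scale = dst_scale = 128):
 *
 *     broken: ((1*128) >> 8) + ((1*128) >> 8) = 0 + 0 = 0
 *     fixed:  (1*128 + 1*128) >> 8            = 256 >> 8 = 1
 */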

#ifdef SK_CPU_ARM32
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src,
                         int count, U8CPU alpha) {
    SkASSERT(255 > alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale = SkAlphaMulInv256(dst_scale, alpha256);

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
#else
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);
#endif

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {
            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
#else
            // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
            // A 16-bit lane would overflow if we used 0xFFFF here,
            // so use an approximation with 0xFF00 that is off by 1,
            // and add back 1 after to get the correct value.
            // This is valid if alpha256 <= 255.
            vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
            vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
            vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
#endif

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
#else
            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);
#endif

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while (count);
    }
}
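
/* Worked lane trace for the 0xFF00 approximation above, with sa = source
 * alpha byte and s = alpha256 (s <= 255 since alpha < 255):
 *
 *     t = 0xFF00 - sa*s;  t += t >> 8;  dst_scale = 1 + (t >> 8)
 *
 *     sa = 0:            t = 0xFF00 -> 0xFFFF -> dst_scale = 1 + 255 = 256
 *     sa = 255, s = 255: t = 0x00FF -> 0x00FF -> dst_scale = 1 + 0   = 1
 *
 * consistent with the scalar path's SkAlphaMulInv256: a zero-alpha source
 * leaves dst at full weight, and a fully covered one nearly removes it.
 */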

///////////////////////////////////////////////////////////////////////////////

#endif // #ifdef SK_CPU_ARM32

void S32A_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8

    if (count >= UNROLL) {
        uint8x8_t dbase;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        dbase = vld1_u8(dstart);

        do {
            uint8x8x4_t vsrc;
            uint8x8_t sr, sg, sb, sa, d;
            uint16x8_t dst8, scale8, alpha8;
            uint16x8_t dst_r, dst_g, dst_b;

#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#else
            {
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm ("vld4.8    {d0-d3},[%[src]]! "
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
                :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
            }
#endif
            sa = vsrc.val[NEON_A];
            sr = vsrc.val[NEON_R];
            sg = vsrc.val[NEON_G];
            sb = vsrc.val[NEON_B];

            /* calculate 'd', which will be 0..7
             * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
             */
            alpha8 = vmovl_u8(dbase);
            alpha8 = vmlal_u8(alpha8, sa, dbase);
            d = vshrn_n_u16(alpha8, 8);    // narrowing too

            // sr = sr - (sr>>5) + d
            /* watching for 8-bit overflow: d is 0..7, and the risky range of
             * sr is > 248, where (sr>>5) is 7 and cancels out 'd'; this is
             * safe as long as we compute ((sr - (sr>>5)) + d)
             */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            sr = vadd_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            sb = vadd_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            sg = vadd_u8(sg, vshr_n_u8(d, 1));

            // need to pick up 8 dst's -- at 16 bits each, 128 bits
            dst8 = vld1q_u16(dst);
            dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
            dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
            dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits

            // blend
            scale8 = vsubw_u8(vdupq_n_u16(256), sa);

            // combine the addq and mul, save 3 insns
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmlaq_u16(vshll_n_u8(sb, 2), dst_b, scale8);
            dst_g = vmlaq_u16(vshll_n_u8(sg, 3), dst_g, scale8);
            dst_r = vmlaq_u16(vshll_n_u8(sr, 2), dst_r, scale8);

            // repack to store
            dst8 = vshrq_n_u16(dst_b, 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r, 5), 11);

            vst1q_u16(dst, dst8);

            dst += UNROLL;
            count -= UNROLL;
            // skip x += UNROLL, since it's unchanged mod-4
        } while (count >= UNROLL);
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
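
/* Both paths above scale the dither value by source alpha before applying it:
 * the vector code computes d = (dbase * (sa + 1)) >> 8 via vmovl/vmlal/vshrn,
 * and the scalar tail computes d = SkAlphaMul(dither, SkAlpha255To256(a)),
 * i.e. (dither * (a + 1)) >> 8. e.g. dither = 7, a = 128: (7 * 129) >> 8 = 3;
 * a fully opaque pixel keeps the whole dither value, and a transparent one
 * drops it to 0.
 */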

///////////////////////////////////////////////////////////////////////////////

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8
    if (count >= UNROLL) {
        uint8x8_t d;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        d = vld1_u8(dstart);

        while (count >= UNROLL) {
            uint8x8_t sr, sg, sb;
            uint16x8_t dr, dg, db;
            uint16x8_t dst8;
            uint8x8x4_t vsrc;

#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8    {d0-d3},[%[src]]! "
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            }
#endif
            sr = vsrc.val[NEON_R];
            sg = vsrc.val[NEON_G];
            sb = vsrc.val[NEON_B];

            /* XXX: if we want to prefetch, hide it in the asm() above; with a
             * plain gcc __builtin_prefetch() the prefetch gets scheduled at
             * the bottom of the loop rather than at the top, just after the
             * vld4.
             */

            // sr = sr - (sr>>5) + d
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            dr = vaddl_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            db = vaddl_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            dg = vaddl_u8(sg, vshr_n_u8(d, 1));

            // pack high bits of each into 565 format  (rgb, b is lsb)
            dst8 = vshrq_n_u16(db, 3);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

            // store it
            vst1q_u16(dst, dst8);

            dst += UNROLL;
            // we don't need to increment src as the asm above has already done it
            count -= UNROLL;
            x += UNROLL;        // probably superfluous
        }
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    S32_D565_Opaque_neon,
    S32_D565_Blend_neon,
    S32A_D565_Opaque_neon,
#if 0
    S32A_D565_Blend_neon,
#else
    nullptr,   // https://code.google.com/p/skia/issues/detail?id=2797
#endif

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    nullptr,   // S32A_D565_Blend_Dither
};

const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
    Color32A_D565_neon,    // Color32_D565,
    Color32A_D565_neon,    // Color32A_D565,
    Color32A_D565_neon,    // Color32_D565_Dither,
    Color32A_D565_neon,    // Color32A_D565_Dither
};

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    nullptr,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,        // S32_Blend,
    nullptr,   // Ported to SkOpts
#ifdef SK_CPU_ARM32
    S32A_Blend_BlitRow32_neon        // S32A_Blend
#else
    nullptr
#endif
};