1/*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "SkBlitRow_opts_arm_neon.h"
9
10#include "SkBlitMask.h"
11#include "SkBlitRow.h"
12#include "SkColorPriv.h"
13#include "SkDither.h"
14#include "SkMathPriv.h"
15#include "SkUtils.h"
16
17#include "SkColor_opts_neon.h"
18#include <arm_neon.h>
19
20#ifdef SK_CPU_ARM64
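/* AArch64 helpers for the blitters below: each performs one post-indexed LD4
 * that loads 8 interleaved SkPMColors (32 bytes) and advances 'src' through
 * the reference parameter. The "_3" variant copies out only the first three
 * de-interleaved planes (for callers that do not need alpha), while "_4"
 * copies all four; the inline asm presumably keeps the load and the pointer
 * update fused into a single instruction.
 */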
21static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
22    uint8x8x4_t vsrc;
23    uint8x8_t vsrc_0, vsrc_1, vsrc_2;
24
25    asm (
26        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
27        "mov    %[vsrc0].8b, v0.8b             \t\n"
28        "mov    %[vsrc1].8b, v1.8b             \t\n"
29        "mov    %[vsrc2].8b, v2.8b             \t\n"
30        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
31          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
32        : : "v0", "v1", "v2", "v3"
33    );
34
35    vsrc.val[0] = vsrc_0;
36    vsrc.val[1] = vsrc_1;
37    vsrc.val[2] = vsrc_2;
38
39    return vsrc;
40}
41
42static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
43    uint8x8x4_t vsrc;
44    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
45
46    asm (
47        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
48        "mov    %[vsrc0].8b, v0.8b             \t\n"
49        "mov    %[vsrc1].8b, v1.8b             \t\n"
50        "mov    %[vsrc2].8b, v2.8b             \t\n"
51        "mov    %[vsrc3].8b, v3.8b             \t\n"
52        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
53          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
54          [src] "+&r" (src)
55        : : "v0", "v1", "v2", "v3"
56    );
57
58    vsrc.val[0] = vsrc_0;
59    vsrc.val[1] = vsrc_1;
60    vsrc.val[2] = vsrc_2;
61    vsrc.val[3] = vsrc_3;
62
63    return vsrc;
64}
65#endif
66
67void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
68                           const SkPMColor* SK_RESTRICT src, int count,
69                           U8CPU alpha, int /*x*/, int /*y*/) {
70    SkASSERT(255 == alpha);
71
72    while (count >= 8) {
73        uint8x8x4_t vsrc;
74        uint16x8_t vdst;
75
76        // Load
77#ifdef SK_CPU_ARM64
78        vsrc = sk_vld4_u8_arm64_3(src);
79#else
80        vsrc = vld4_u8((uint8_t*)src);
81        src += 8;
82#endif
83
84        // Convert src to 565
85        vdst = SkPixel32ToPixel16_neon8(vsrc);
86
87        // Store
88        vst1q_u16(dst, vdst);
89
90        // Prepare next iteration
91        dst += 8;
92        count -= 8;
    }
94
95    // Leftovers
96    while (count > 0) {
97        SkPMColor c = *src++;
98        SkPMColorAssert(c);
99        *dst = SkPixel32ToPixel16_ToU16(c);
100        dst++;
101        count--;
    }
103}
104
105void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
106                          const SkPMColor* SK_RESTRICT src, int count,
107                          U8CPU alpha, int /*x*/, int /*y*/) {
108    SkASSERT(255 > alpha);
109
110    uint16x8_t vmask_blue, vscale;
111
112    // prepare constants
113    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
114    vmask_blue = vmovq_n_u16(0x1F);
115
116    while (count >= 8) {
117        uint8x8x4_t vsrc;
118        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
119        uint16x8_t vres_r, vres_g, vres_b;
120
121        // Load src
122#ifdef SK_CPU_ARM64
123        vsrc = sk_vld4_u8_arm64_3(src);
124#else
125        {
126        register uint8x8_t d0 asm("d0");
127        register uint8x8_t d1 asm("d1");
128        register uint8x8_t d2 asm("d2");
129        register uint8x8_t d3 asm("d3");
130
131        asm (
132            "vld4.8    {d0-d3},[%[src]]!"
133            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
134            :
135        );
136        vsrc.val[0] = d0;
137        vsrc.val[1] = d1;
138        vsrc.val[2] = d2;
139        }
140#endif
141
142        // Load and unpack dst
143        vdst = vld1q_u16(dst);
144        vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
145        vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
146        vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
147        vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green
148
149        // Shift src to 565 range
150        vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
151        vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
152        vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);
153
154        // Scale src - dst
155        vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
156        vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
157        vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;
158
159        vres_r = vshrq_n_u16(vres_r * vscale, 8);
160        vres_g = vshrq_n_u16(vres_g * vscale, 8);
161        vres_b = vshrq_n_u16(vres_b * vscale, 8);
162
163        vres_r += vdst_r;
164        vres_g += vdst_g;
165        vres_b += vdst_b;
166
167        // Combine
168        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
169        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue
170
171        // Store
172        vst1q_u16(dst, vres_b);
173        dst += 8;
174        count -= 8;
175    }
176    if (count > 0) {
177        int scale = SkAlpha255To256(alpha);
178        do {
179            SkPMColor c = *src++;
180            SkPMColorAssert(c);
181            uint16_t d = *dst;
182            *dst++ = SkPackRGB16(
183                    SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
184                    SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
185                    SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
186        } while (--count != 0);
187    }
188}
189
190#ifdef SK_CPU_ARM32
191void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
192                           const SkPMColor* SK_RESTRICT src, int count,
193                           U8CPU alpha, int /*x*/, int /*y*/) {
194    SkASSERT(255 == alpha);
195
196    if (count >= 8) {
197        uint16_t* SK_RESTRICT keep_dst = 0;
198
199        asm volatile (
200                      "ands       ip, %[count], #7            \n\t"
201                      "vmov.u8    d31, #1<<7                  \n\t"
202                      "vld1.16    {q12}, [%[dst]]             \n\t"
203                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
204                      // Thumb does not support the standard ARM conditional
205                      // instructions but instead requires the 'it' instruction
206                      // to signal conditional execution
207                      "it eq                                  \n\t"
208                      "moveq      ip, #8                      \n\t"
209                      "mov        %[keep_dst], %[dst]         \n\t"
210
211                      "add        %[src], %[src], ip, LSL#2   \n\t"
212                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
213                      "subs       %[count], %[count], ip      \n\t"
214                      "b          9f                          \n\t"
215                      // LOOP
216                      "2:                                         \n\t"
217
218                      "vld1.16    {q12}, [%[dst]]!            \n\t"
219                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
220                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
221                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
222                      "subs       %[count], %[count], #8      \n\t"
223                      "9:                                         \n\t"
224                      "pld        [%[dst],#32]                \n\t"
225                      // expand 0565 q12 to 8888 {d4-d7}
226                      "vmovn.u16  d4, q12                     \n\t"
227                      "vshr.u16   q11, q12, #5                \n\t"
228                      "vshr.u16   q10, q12, #6+5              \n\t"
229                      "vmovn.u16  d5, q11                     \n\t"
230                      "vmovn.u16  d6, q10                     \n\t"
231                      "vshl.u8    d4, d4, #3                  \n\t"
232                      "vshl.u8    d5, d5, #2                  \n\t"
233                      "vshl.u8    d6, d6, #3                  \n\t"
234
235                      "vmovl.u8   q14, d31                    \n\t"
236                      "vmovl.u8   q13, d31                    \n\t"
237                      "vmovl.u8   q12, d31                    \n\t"
238
239                      // duplicate in 4/2/1 & 8pix vsns
240                      "vmvn.8     d30, d3                     \n\t"
241                      "vmlal.u8   q14, d30, d6                \n\t"
242                      "vmlal.u8   q13, d30, d5                \n\t"
243                      "vmlal.u8   q12, d30, d4                \n\t"
244                      "vshr.u16   q8, q14, #5                 \n\t"
245                      "vshr.u16   q9, q13, #6                 \n\t"
246                      "vaddhn.u16 d6, q14, q8                 \n\t"
247                      "vshr.u16   q8, q12, #5                 \n\t"
248                      "vaddhn.u16 d5, q13, q9                 \n\t"
249                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
250                      "vaddhn.u16 d4, q12, q8                 \n\t"
251                      // intentionally don't calculate alpha
252                      // result in d4-d6
253
254                      "vqadd.u8   d5, d5, d1                  \n\t"
255                      "vqadd.u8   d4, d4, d2                  \n\t"
256
257                      // pack 8888 {d4-d6} to 0565 q10
258                      "vshll.u8   q10, d6, #8                 \n\t"
259                      "vshll.u8   q3, d5, #8                  \n\t"
260                      "vshll.u8   q2, d4, #8                  \n\t"
261                      "vsri.u16   q10, q3, #5                 \n\t"
262                      "vsri.u16   q10, q2, #11                \n\t"
263
264                      "bne        2b                          \n\t"
265
266                      "1:                                         \n\t"
267                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
268                      : [count] "+r" (count)
269                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
270                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
271                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
272                      "d30","d31"
273                      );
274    }
275    else
276    {   // handle count < 8
277        uint16_t* SK_RESTRICT keep_dst = 0;
278
279        asm volatile (
280                      "vmov.u8    d31, #1<<7                  \n\t"
281                      "mov        %[keep_dst], %[dst]         \n\t"
282
283                      "tst        %[count], #4                \n\t"
284                      "beq        14f                         \n\t"
285                      "vld1.16    {d25}, [%[dst]]!            \n\t"
286                      "vld1.32    {q1}, [%[src]]!             \n\t"
287
288                      "14:                                        \n\t"
289                      "tst        %[count], #2                \n\t"
290                      "beq        12f                         \n\t"
291                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
292                      "vld1.32    {d1}, [%[src]]!             \n\t"
293
294                      "12:                                        \n\t"
295                      "tst        %[count], #1                \n\t"
296                      "beq        11f                         \n\t"
297                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
298                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"
299
300                      "11:                                        \n\t"
301                      // unzips achieve the same as a vld4 operation
                      "vuzp.u16   q0, q1                      \n\t"
303                      "vuzp.u8    d0, d1                      \n\t"
304                      "vuzp.u8    d2, d3                      \n\t"
305                      // expand 0565 q12 to 8888 {d4-d7}
306                      "vmovn.u16  d4, q12                     \n\t"
307                      "vshr.u16   q11, q12, #5                \n\t"
308                      "vshr.u16   q10, q12, #6+5              \n\t"
309                      "vmovn.u16  d5, q11                     \n\t"
310                      "vmovn.u16  d6, q10                     \n\t"
311                      "vshl.u8    d4, d4, #3                  \n\t"
312                      "vshl.u8    d5, d5, #2                  \n\t"
313                      "vshl.u8    d6, d6, #3                  \n\t"
314
315                      "vmovl.u8   q14, d31                    \n\t"
316                      "vmovl.u8   q13, d31                    \n\t"
317                      "vmovl.u8   q12, d31                    \n\t"
318
319                      // duplicate in 4/2/1 & 8pix vsns
320                      "vmvn.8     d30, d3                     \n\t"
321                      "vmlal.u8   q14, d30, d6                \n\t"
322                      "vmlal.u8   q13, d30, d5                \n\t"
323                      "vmlal.u8   q12, d30, d4                \n\t"
324                      "vshr.u16   q8, q14, #5                 \n\t"
325                      "vshr.u16   q9, q13, #6                 \n\t"
326                      "vaddhn.u16 d6, q14, q8                 \n\t"
327                      "vshr.u16   q8, q12, #5                 \n\t"
328                      "vaddhn.u16 d5, q13, q9                 \n\t"
329                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
330                      "vaddhn.u16 d4, q12, q8                 \n\t"
331                      // intentionally don't calculate alpha
332                      // result in d4-d6
333
334                      "vqadd.u8   d5, d5, d1                  \n\t"
335                      "vqadd.u8   d4, d4, d2                  \n\t"
336
337                      // pack 8888 {d4-d6} to 0565 q10
338                      "vshll.u8   q10, d6, #8                 \n\t"
339                      "vshll.u8   q3, d5, #8                  \n\t"
340                      "vshll.u8   q2, d4, #8                  \n\t"
341                      "vsri.u16   q10, q3, #5                 \n\t"
342                      "vsri.u16   q10, q2, #11                \n\t"
343
344                      // store
345                      "tst        %[count], #4                \n\t"
346                      "beq        24f                         \n\t"
347                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
348
349                      "24:                                        \n\t"
350                      "tst        %[count], #2                \n\t"
351                      "beq        22f                         \n\t"
352                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
353
354                      "22:                                        \n\t"
355                      "tst        %[count], #1                \n\t"
356                      "beq        21f                         \n\t"
357                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
358
359                      "21:                                        \n\t"
360                      : [count] "+r" (count)
361                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
362                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
363                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
364                      "d30","d31"
365                      );
366    }
367}
368#endif
369
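/* Lane-wise equivalent of SkDiv255Round(): adding 128, adding the value
 * shifted down by 8, and shifting down by 8 again rounds prod/255 to the
 * nearest integer for any prod up to 255*255, so the callers need no extra
 * correction step.
 */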
370static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
371    prod += vdupq_n_u16(128);
372    prod += vshrq_n_u16(prod, 8);
373    return vshrq_n_u16(prod, 8);
374}
375
376void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
377                          const SkPMColor* SK_RESTRICT src, int count,
378                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);
380
381    /* This code implements a Neon version of S32A_D565_Blend. The results have
382     * a few mismatches compared to the original code. These mismatches never
383     * exceed 1.
384     */
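    /* The ±1 differences come from the default path below, which divides the
     * blended channels by 256 with rounding (vrshrq_n_u16(x, 8)) instead of
     * by 255; building with S32A_D565_BLEND_EXACT switches to the exact
     * divide-by-255 at a noticeable speed cost.
     */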
385
386    if (count >= 8) {
387        uint16x8_t valpha_max, vmask_blue;
388        uint8x8_t valpha;
389
390        // prepare constants
391        valpha_max = vmovq_n_u16(255);
392        valpha = vdup_n_u8(alpha);
393        vmask_blue = vmovq_n_u16(SK_B16_MASK);
394
395        do {
396            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
397            uint16x8_t vres_a, vres_r, vres_g, vres_b;
398            uint8x8x4_t vsrc;
399
400            // load pixels
401            vdst = vld1q_u16(dst);
402#ifdef SK_CPU_ARM64
403            vsrc = sk_vld4_u8_arm64_4(src);
404#else
405#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
406            asm (
407                "vld4.u8 %h[vsrc], [%[src]]!"
408                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
409                : :
410            );
411#else
412            register uint8x8_t d0 asm("d0");
413            register uint8x8_t d1 asm("d1");
414            register uint8x8_t d2 asm("d2");
415            register uint8x8_t d3 asm("d3");
416
417            asm volatile (
418                "vld4.u8    {d0-d3},[%[src]]!;"
419                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
420                  [src] "+&r" (src)
421                : :
422            );
423            vsrc.val[0] = d0;
424            vsrc.val[1] = d1;
425            vsrc.val[2] = d2;
426            vsrc.val[3] = d3;
427#endif
428#endif // #ifdef SK_CPU_ARM64
429
430
431            // deinterleave dst
432            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
433            vdst_b = vdst & vmask_blue;                     // extract blue
434            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
435            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
436
437            // shift src to 565
438            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
439            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
440            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
441
442            // calc src * src_scale
443            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
444            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
445            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
446            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
447
448            // prepare dst_scale
449            vres_a = SkDiv255Round_neon8(vres_a);
450            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
451
452            // add dst * dst_scale to previous result
453            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
454            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
455            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
456
457#ifdef S32A_D565_BLEND_EXACT
458            // It is possible to get exact results with this but it is slow,
459            // even slower than C code in some cases
460            vres_r = SkDiv255Round_neon8(vres_r);
461            vres_g = SkDiv255Round_neon8(vres_g);
462            vres_b = SkDiv255Round_neon8(vres_b);
463#else
464            vres_r = vrshrq_n_u16(vres_r, 8);
465            vres_g = vrshrq_n_u16(vres_g, 8);
466            vres_b = vrshrq_n_u16(vres_b, 8);
467#endif
468            // pack result
469            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
470            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
471
472            // store
473            vst1q_u16(dst, vres_b);
474            dst += 8;
475            count -= 8;
476        } while (count >= 8);
477    }
478
479    // leftovers
480    while (count-- > 0) {
481        SkPMColor sc = *src++;
482        if (sc) {
483            uint16_t dc = *dst;
484            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
485            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
486            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
487            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
488            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
489        }
490        dst += 1;
491    }
492}
493
494/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
495 * each dither value is spaced out into byte lanes, and repeated
496 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
497 * start of each row.
498 */
499static const uint8_t gDitherMatrix_Neon[48] = {
500    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
501    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
502    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
503    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
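/* For example, a run starting at x == 2 on a row with y == 1 uses
 * &gDitherMatrix_Neon[(1&3)*12 + (2&3)] == &gDitherMatrix_Neon[14], and the
 * 8-byte load picks up { 7, 3, 6, 2, 7, 3, 6, 2 } -- the dither values for
 * eight consecutive pixels of that row, with no per-pixel wrap-around needed.
 */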
506
507void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
508                                int count, U8CPU alpha, int x, int y)
509{
510
511    SkASSERT(255 > alpha);
512
513    // rescale alpha to range 1 - 256
514    int scale = SkAlpha255To256(alpha);
515
516    if (count >= 8) {
517        /* select row and offset for dither array */
518        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
519
520        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
521        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
522
523        int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
524        uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask
525
526        do {
527
528            uint8x8x4_t vsrc;
529            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
530            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
531            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
532            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
533            uint16x8_t vdst;
534            uint16x8_t vdst_r, vdst_g, vdst_b;
535            int16x8_t vres_r, vres_g, vres_b;
536            int8x8_t vres8_r, vres8_g, vres8_b;
537
538            // Load source and add dither
539#ifdef SK_CPU_ARM64
540            vsrc = sk_vld4_u8_arm64_3(src);
541#else
542            {
543            register uint8x8_t d0 asm("d0");
544            register uint8x8_t d1 asm("d1");
545            register uint8x8_t d2 asm("d2");
546            register uint8x8_t d3 asm("d3");
547
548            asm (
549                "vld4.8    {d0-d3},[%[src]]! "
550                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
551                :
552            );
553            vsrc.val[0] = d0;
554            vsrc.val[1] = d1;
555            vsrc.val[2] = d2;
556            }
557#endif
558            vsrc_r = vsrc.val[NEON_R];
559            vsrc_g = vsrc.val[NEON_G];
560            vsrc_b = vsrc.val[NEON_B];
561
562            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
563            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
564            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
565
566            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
567            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
568            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen
569
570            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
571            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
572            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result
573
574            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
575            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
576            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
577
578            // Load dst and unpack
579            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract green
            vdst_r = vshrq_n_u16(vdst, 5+6);                 // shift down to get red
582            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue
583
584            // subtract dst from src and widen
585            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
586            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
587            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
588
589            // multiply diffs by scale and shift
590            vres_r = vmulq_s16(vres_r, vscale);
591            vres_g = vmulq_s16(vres_g, vscale);
592            vres_b = vmulq_s16(vres_b, vscale);
593
594            vres8_r = vshrn_n_s16(vres_r, 8);
595            vres8_g = vshrn_n_s16(vres_g, 8);
596            vres8_b = vshrn_n_s16(vres_b, 8);
597
598            // add dst to result
599            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
600            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
601            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
602
603            // put result into 565 format
604            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
605            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
606
607            // Store result
608            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
609
610            // Next iteration
611            dst += 8;
612            count -= 8;
613
614        } while (count >= 8);
615    }
616
617    // Leftovers
618    if (count > 0) {
619        int scale = SkAlpha255To256(alpha);
620        DITHER_565_SCAN(y);
621        do {
622            SkPMColor c = *src++;
623            SkPMColorAssert(c);
624
625            int dither = DITHER_VALUE(x);
626            int sr = SkGetPackedR32(c);
627            int sg = SkGetPackedG32(c);
628            int sb = SkGetPackedB32(c);
629            sr = SkDITHER_R32To565(sr, dither);
630            sg = SkDITHER_G32To565(sg, dither);
631            sb = SkDITHER_B32To565(sb, dither);
632
633            uint16_t d = *dst;
634            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
635                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
636                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
637            DITHER_INC_X(x);
638        } while (--count != 0);
639    }
640}
641
642void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
643                                const SkPMColor* SK_RESTRICT src,
644                                int count, U8CPU alpha) {
645
646    SkASSERT(255 == alpha);
647    if (count > 0) {
648
649
650    uint8x8_t alpha_mask;
651
652    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
653    alpha_mask = vld1_u8(alpha_mask_setup);
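    /* Bytes 3 and 7 of the 64-bit load are the alpha bytes of the two pixels
     * when alpha lives in bits 24-31 of an SkPMColor, so vtbl1_u8 with this
     * table broadcasts each pixel's alpha across its four colour lanes.
     */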
654
655    /* do the NEON unrolled code */
656#define    UNROLL    4
657    while (count >= UNROLL) {
658        uint8x8_t src_raw, dst_raw, dst_final;
659        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
660
        /* The two prefetches below may make the code slightly
662         * slower for small values of count but are worth having
663         * in the general case.
664         */
665        __builtin_prefetch(src+32);
666        __builtin_prefetch(dst+32);
667
668        /* get the source */
669        src_raw = vreinterpret_u8_u32(vld1_u32(src));
670#if    UNROLL > 2
671        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
672#endif
673
674        /* get and hold the dst too */
675        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
676#if    UNROLL > 2
677        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
678#endif
679
680    /* 1st and 2nd bits of the unrolling */
681    {
682        uint8x8_t dst_cooked;
683        uint16x8_t dst_wide;
684        uint8x8_t alpha_narrow;
685        uint16x8_t alpha_wide;
686
687        /* get the alphas spread out properly */
688        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
689        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
690
691        /* spread the dest */
692        dst_wide = vmovl_u8(dst_raw);
693
694        /* alpha mul the dest */
695        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
696        dst_cooked = vshrn_n_u16(dst_wide, 8);
697
698        /* sum -- ignoring any byte lane overflows */
699        dst_final = vadd_u8(src_raw, dst_cooked);
700    }
701
702#if    UNROLL > 2
703    /* the 3rd and 4th bits of our unrolling */
704    {
705        uint8x8_t dst_cooked;
706        uint16x8_t dst_wide;
707        uint8x8_t alpha_narrow;
708        uint16x8_t alpha_wide;
709
710        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
711        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
712
713        /* spread the dest */
714        dst_wide = vmovl_u8(dst_raw_2);
715
716        /* alpha mul the dest */
717        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
718        dst_cooked = vshrn_n_u16(dst_wide, 8);
719
720        /* sum -- ignoring any byte lane overflows */
721        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
722    }
723#endif
724
725        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
726#if    UNROLL > 2
727        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
728#endif
729
730        src += UNROLL;
731        dst += UNROLL;
732        count -= UNROLL;
733    }
734#undef    UNROLL
735
736    /* do any residual iterations */
737        while (--count >= 0) {
738            *dst = SkPMSrcOver(*src, *dst);
739            src += 1;
740            dst += 1;
741        }
742    }
743}
744
745void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
746                                const SkPMColor* SK_RESTRICT src,
747                                int count, U8CPU alpha) {
748    SkASSERT(255 == alpha);
749
750    if (count <= 0)
751    return;
752
753    /* Use these to check if src is transparent or opaque */
754    const unsigned int ALPHA_OPAQ  = 0xFF000000;
755    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
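    /* What follows is a small state machine keyed on the source alpha:
     * ALPHA_0 skips runs of fully transparent pixels, ALPHA_255 plain-copies
     * runs of fully opaque pixels, and ALPHA_1_TO_254 does the NEON blend four
     * pixels per pass. A pixel compares <= ALPHA_TRANS when its top (alpha)
     * byte is 0 and >= ALPHA_OPAQ when that byte is 0xFF, which is what drives
     * the transitions between the three states.
     */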
756
757#define UNROLL  4
758    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
759    const SkPMColor* SK_RESTRICT src_temp = src;
760
761    /* set up the NEON variables */
762    uint8x8_t alpha_mask;
763    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
764    alpha_mask = vld1_u8(alpha_mask_setup);
765
766    uint8x8_t src_raw, dst_raw, dst_final;
767    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
768    uint8x8_t dst_cooked;
769    uint16x8_t dst_wide;
770    uint8x8_t alpha_narrow;
771    uint16x8_t alpha_wide;
772
773    /* choose the first processing type */
774    if( src >= src_end)
775        goto TAIL;
776    if(*src <= ALPHA_TRANS)
777        goto ALPHA_0;
778    if(*src >= ALPHA_OPAQ)
779        goto ALPHA_255;
780    /* fall-thru */
781
782ALPHA_1_TO_254:
783    do {
784
785        /* get the source */
786        src_raw = vreinterpret_u8_u32(vld1_u32(src));
787        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
788
789        /* get and hold the dst too */
790        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
791        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
792
793
794        /* get the alphas spread out properly */
795        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
796        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
797        /* we collapsed (255-a)+1 ... */
798        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
799
800        /* spread the dest */
801        dst_wide = vmovl_u8(dst_raw);
802
803        /* alpha mul the dest */
804        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
805        dst_cooked = vshrn_n_u16(dst_wide, 8);
806
807        /* sum -- ignoring any byte lane overflows */
808        dst_final = vadd_u8(src_raw, dst_cooked);
809
810        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
811        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
812        /* we collapsed (255-a)+1 ... */
813        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
814
815        /* spread the dest */
816        dst_wide = vmovl_u8(dst_raw_2);
817
818        /* alpha mul the dest */
819        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
820        dst_cooked = vshrn_n_u16(dst_wide, 8);
821
822        /* sum -- ignoring any byte lane overflows */
823        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
824
825        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
826        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
827
828        src += UNROLL;
829        dst += UNROLL;
830
831        /* if 2 of the next pixels aren't between 1 and 254
832        it might make sense to go to the optimized loops */
833        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
834            break;
835
836    } while(src < src_end);
837
838    if (src >= src_end)
839        goto TAIL;
840
841    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
842        goto ALPHA_255;
843
844    /*fall-thru*/
845
846ALPHA_0:
847
    /* In this state, we know the current alpha is 0 and
849     we optimize for the next alpha also being zero. */
850    src_temp = src;  //so we don't have to increment dst every time
851    do {
852        if(*(++src) > ALPHA_TRANS)
853            break;
854        if(*(++src) > ALPHA_TRANS)
855            break;
856        if(*(++src) > ALPHA_TRANS)
857            break;
858        if(*(++src) > ALPHA_TRANS)
859            break;
860    } while(src < src_end);
861
862    dst += (src - src_temp);
863
864    /* no longer alpha 0, so determine where to go next. */
865    if( src >= src_end)
866        goto TAIL;
867    if(*src >= ALPHA_OPAQ)
868        goto ALPHA_255;
869    else
870        goto ALPHA_1_TO_254;
871
872ALPHA_255:
873    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
874        dst[0]=src[0];
875        dst[1]=src[1];
876        dst[2]=src[2];
877        dst[3]=src[3];
878        src+=UNROLL;
879        dst+=UNROLL;
880        if(src >= src_end)
881            goto TAIL;
882    }
883
884    //Handle remainder.
885    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
886        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
887            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
888        }
889    }
890
891    if( src >= src_end)
892        goto TAIL;
893    if(*src <= ALPHA_TRANS)
894        goto ALPHA_0;
895    else
896        goto ALPHA_1_TO_254;
897
898TAIL:
899    /* do any residual iterations */
900    src_end += UNROLL + 1;  //goto the real end
901    while(src != src_end) {
902        if( *src != 0 ) {
903            if( *src >= ALPHA_OPAQ ) {
904                *dst = *src;
905            }
906            else {
907                *dst = SkPMSrcOver(*src, *dst);
908            }
909        }
910        src++;
911        dst++;
912    }
913
914#undef    UNROLL
915    return;
916}
917
918/* Neon version of S32_Blend_BlitRow32()
919 * portable version is in src/core/SkBlitRow_D32.cpp
920 */
921void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
922                              const SkPMColor* SK_RESTRICT src,
923                              int count, U8CPU alpha) {
924    SkASSERT(alpha <= 255);
925
926    if (count <= 0) {
927        return;
928    }
929
930    uint16_t src_scale = SkAlpha255To256(alpha);
931    uint16_t dst_scale = 256 - src_scale;
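    // Per channel this computes ((src * src_scale) >> 8) + ((dst * dst_scale) >> 8),
    // the same blend as the portable S32_Blend_BlitRow32(), first two pixels
    // at a time and then once more for an odd trailing pixel.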
932
933    while (count >= 2) {
934        uint8x8_t vsrc, vdst, vres;
935        uint16x8_t vsrc_wide, vdst_wide;
936
937        /* These commented prefetches are a big win for count
938         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
939         * They also hurt a little (<5%) on an A15
940         */
941        //__builtin_prefetch(src+32);
942        //__builtin_prefetch(dst+32);
943
944        // Load
945        vsrc = vreinterpret_u8_u32(vld1_u32(src));
946        vdst = vreinterpret_u8_u32(vld1_u32(dst));
947
948        // Process src
949        vsrc_wide = vmovl_u8(vsrc);
950        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
951
952        // Process dst
953        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
954
955        // Combine
956        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
957
958        // Store
959        vst1_u32(dst, vreinterpret_u32_u8(vres));
960
961        src += 2;
962        dst += 2;
963        count -= 2;
964    }
965
966    if (count == 1) {
967        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
968        uint16x8_t vsrc_wide, vdst_wide;
969
970        // Load
971        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
972        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
973
974        // Process
975        vsrc_wide = vmovl_u8(vsrc);
976        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
977        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
978        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
979
980        // Store
981        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
982    }
983}
984
985#ifdef SK_CPU_ARM32
986void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
987                         const SkPMColor* SK_RESTRICT src,
988                         int count, U8CPU alpha) {
989
990    SkASSERT(255 >= alpha);
991
992    if (count <= 0) {
993        return;
994    }
995
996    unsigned alpha256 = SkAlpha255To256(alpha);
997
998    // First deal with odd counts
999    if (count & 1) {
1000        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
1001        uint16x8_t vdst_wide, vsrc_wide;
1002        unsigned dst_scale;
1003
1004        // Load
1005        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
1006        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
1007
1008        // Calc dst_scale
1009        dst_scale = vget_lane_u8(vsrc, 3);
1010        dst_scale *= alpha256;
1011        dst_scale >>= 8;
1012        dst_scale = 256 - dst_scale;
1013
1014        // Process src
1015        vsrc_wide = vmovl_u8(vsrc);
1016        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
1017
1018        // Process dst
1019        vdst_wide = vmovl_u8(vdst);
1020        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
1021
1022        // Combine
1023        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1024
1025        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
1026        dst++;
1027        src++;
1028        count--;
1029    }
1030
1031    if (count) {
1032        uint8x8_t alpha_mask;
1033        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
1034        alpha_mask = vld1_u8(alpha_mask_setup);
1035
1036        do {
1037
1038            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
1039            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
1040
1041            __builtin_prefetch(src+32);
1042            __builtin_prefetch(dst+32);
1043
1044            // Load
1045            vsrc = vreinterpret_u8_u32(vld1_u32(src));
1046            vdst = vreinterpret_u8_u32(vld1_u32(dst));
1047
1048            // Prepare src_scale
1049            vsrc_scale = vdupq_n_u16(alpha256);
1050
1051            // Calc dst_scale
1052            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
1053            vdst_scale = vmovl_u8(vsrc_alphas);
1054            vdst_scale *= vsrc_scale;
1055            vdst_scale = vshrq_n_u16(vdst_scale, 8);
1056            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
1057
1058            // Process src
1059            vsrc_wide = vmovl_u8(vsrc);
1060            vsrc_wide *= vsrc_scale;
1061
1062            // Process dst
1063            vdst_wide = vmovl_u8(vdst);
1064            vdst_wide *= vdst_scale;
1065
1066            // Combine
1067            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1068
1069            vst1_u32(dst, vreinterpret_u32_u8(vres));
1070
1071            src += 2;
1072            dst += 2;
1073            count -= 2;
1074        } while(count);
1075    }
1076}
1077
1078///////////////////////////////////////////////////////////////////////////////
1079
1080#undef    DEBUG_OPAQUE_DITHER
1081
1082#if    defined(DEBUG_OPAQUE_DITHER)
1083static void showme8(char *str, void *p, int len)
1084{
1085    static char buf[256];
1086    char tbuf[32];
1087    int i;
1088    char *pc = (char*) p;
1089    sprintf(buf,"%8s:", str);
1090    for(i=0;i<len;i++) {
1091        sprintf(tbuf, "   %02x", pc[i]);
1092        strcat(buf, tbuf);
1093    }
1094    SkDebugf("%s\n", buf);
1095}
1096static void showme16(char *str, void *p, int len)
1097{
1098    static char buf[256];
1099    char tbuf[32];
1100    int i;
1101    uint16_t *pc = (uint16_t*) p;
1102    sprintf(buf,"%8s:", str);
1103    len = (len / sizeof(uint16_t));    /* passed as bytes */
1104    for(i=0;i<len;i++) {
1105        sprintf(tbuf, " %04x", pc[i]);
1106        strcat(buf, tbuf);
1107    }
1108    SkDebugf("%s\n", buf);
1109}
1110#endif
1111#endif // #ifdef SK_CPU_ARM32
1112
1113void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
1114                                   const SkPMColor* SK_RESTRICT src,
1115                                   int count, U8CPU alpha, int x, int y) {
1116    SkASSERT(255 == alpha);
1117
1118#define    UNROLL    8
1119
1120    if (count >= UNROLL) {
1121
1122#if defined(DEBUG_OPAQUE_DITHER)
1123    uint16_t tmpbuf[UNROLL];
1124    int td[UNROLL];
1125    int tdv[UNROLL];
1126    int ta[UNROLL];
1127    int tap[UNROLL];
1128    uint16_t in_dst[UNROLL];
1129    int offset = 0;
1130    int noisy = 0;
1131#endif
1132
1133    uint8x8_t dbase;
1134    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1135    dbase = vld1_u8(dstart);
1136
1137        do {
1138        uint8x8x4_t vsrc;
1139        uint8x8_t sr, sg, sb, sa, d;
1140        uint16x8_t dst8, scale8, alpha8;
1141        uint16x8_t dst_r, dst_g, dst_b;
1142
1143#if defined(DEBUG_OPAQUE_DITHER)
1144        // calculate 8 elements worth into a temp buffer
1145        {
1146        int my_y = y;
1147        int my_x = x;
1148        SkPMColor* my_src = (SkPMColor*)src;
1149        uint16_t* my_dst = dst;
1150        int i;
1151
1152        DITHER_565_SCAN(my_y);
1153        for(i = 0; i < UNROLL; i++) {
1154            SkPMColor c = *my_src++;
1155            SkPMColorAssert(c);
1156            if (c) {
1157                unsigned a = SkGetPackedA32(c);
1158
1159                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1160                tdv[i] = DITHER_VALUE(my_x);
1161                ta[i] = a;
1162                tap[i] = SkAlpha255To256(a);
1163                td[i] = d;
1164
1165                unsigned sr = SkGetPackedR32(c);
1166                unsigned sg = SkGetPackedG32(c);
1167                unsigned sb = SkGetPackedB32(c);
1168                sr = SkDITHER_R32_FOR_565(sr, d);
1169                sg = SkDITHER_G32_FOR_565(sg, d);
1170                sb = SkDITHER_B32_FOR_565(sb, d);
1171
1172                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1173                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1174                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1175                // now src and dst expanded are in g:11 r:10 x:1 b:10
1176                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1177                td[i] = d;
1178            } else {
1179                tmpbuf[i] = *my_dst;
1180                ta[i] = tdv[i] = td[i] = 0xbeef;
1181            }
1182            in_dst[i] = *my_dst;
1183            my_dst += 1;
1184            DITHER_INC_X(my_x);
1185        }
1186        }
1187#endif
1188
1189#ifdef SK_CPU_ARM64
1190        vsrc = sk_vld4_u8_arm64_4(src);
1191#else
1192        {
1193        register uint8x8_t d0 asm("d0");
1194        register uint8x8_t d1 asm("d1");
1195        register uint8x8_t d2 asm("d2");
1196        register uint8x8_t d3 asm("d3");
1197
1198        asm ("vld4.8    {d0-d3},[%[src]]! "
1199            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
1200            :
1201        );
1202        vsrc.val[0] = d0;
1203        vsrc.val[1] = d1;
1204        vsrc.val[2] = d2;
1205        vsrc.val[3] = d3;
1206        }
1207#endif
1208        sa = vsrc.val[NEON_A];
1209        sr = vsrc.val[NEON_R];
1210        sg = vsrc.val[NEON_G];
1211        sb = vsrc.val[NEON_B];
1212
1213        /* calculate 'd', which will be 0..7
1214         * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
1215         */
1216        alpha8 = vmovl_u8(dbase);
1217        alpha8 = vmlal_u8(alpha8, sa, dbase);
1218        d = vshrn_n_u16(alpha8, 8);    // narrowing too
1219
1220        // sr = sr - (sr>>5) + d
1221        /* watching for 8-bit overflow.  d is 0..7; risky range of
1222         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1223         * safe  as long as we do ((sr-sr>>5) + d)
1224         */
1225        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1226        sr = vadd_u8(sr, d);
1227
1228        // sb = sb - (sb>>5) + d
1229        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1230        sb = vadd_u8(sb, d);
1231
1232        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1233        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1234        sg = vadd_u8(sg, vshr_n_u8(d,1));
1235
1236        // need to pick up 8 dst's -- at 16 bits each, 128 bits
1237        dst8 = vld1q_u16(dst);
1238        dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
1239        dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
1240        dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits
1241
1242        // blend
1243        scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1244
1245        // combine the addq and mul, save 3 insns
1246        scale8 = vshrq_n_u16(scale8, 3);
1247        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1248        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1249        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1250
1251        // repack to store
1252        dst8 = vshrq_n_u16(dst_b, 5);
1253        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1254        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1255
1256        vst1q_u16(dst, dst8);
1257
1258#if defined(DEBUG_OPAQUE_DITHER)
1259        // verify my 8 elements match the temp buffer
1260        {
1261        int i, bad=0;
1262        static int invocation;
1263
1264        for (i = 0; i < UNROLL; i++) {
1265            if (tmpbuf[i] != dst[i]) {
1266                bad=1;
1267            }
1268        }
1269        if (bad) {
1270            SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1271                     invocation, offset);
1272            SkDebugf("  alpha 0x%x\n", alpha);
1273            for (i = 0; i < UNROLL; i++)
1274                SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1275                         i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
1276                         in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);
1277
1278            showme16("alpha8", &alpha8, sizeof(alpha8));
1279            showme16("scale8", &scale8, sizeof(scale8));
1280            showme8("d", &d, sizeof(d));
1281            showme16("dst8", &dst8, sizeof(dst8));
1282            showme16("dst_b", &dst_b, sizeof(dst_b));
1283            showme16("dst_g", &dst_g, sizeof(dst_g));
1284            showme16("dst_r", &dst_r, sizeof(dst_r));
1285            showme8("sb", &sb, sizeof(sb));
1286            showme8("sg", &sg, sizeof(sg));
1287            showme8("sr", &sr, sizeof(sr));
1288
1289            return;
1290        }
1291        offset += UNROLL;
1292        invocation++;
1293        }
1294#endif
1295        dst += UNROLL;
1296        count -= UNROLL;
1297        // skip x += UNROLL, since it's unchanged mod-4
1298        } while (count >= UNROLL);
1299    }
1300#undef    UNROLL
1301
1302    // residuals
1303    if (count > 0) {
1304        DITHER_565_SCAN(y);
1305        do {
1306            SkPMColor c = *src++;
1307            SkPMColorAssert(c);
1308            if (c) {
1309                unsigned a = SkGetPackedA32(c);
1310
                // dither and alpha are just temporary variables to work around
                // an internal compiler error (ICE) in debug builds.
1313                unsigned dither = DITHER_VALUE(x);
1314                unsigned alpha = SkAlpha255To256(a);
1315                int d = SkAlphaMul(dither, alpha);
1316
1317                unsigned sr = SkGetPackedR32(c);
1318                unsigned sg = SkGetPackedG32(c);
1319                unsigned sb = SkGetPackedB32(c);
1320                sr = SkDITHER_R32_FOR_565(sr, d);
1321                sg = SkDITHER_G32_FOR_565(sg, d);
1322                sb = SkDITHER_B32_FOR_565(sb, d);
1323
1324                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1325                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1326                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1327                // now src and dst expanded are in g:11 r:10 x:1 b:10
1328                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1329            }
1330            dst += 1;
1331            DITHER_INC_X(x);
1332        } while (--count != 0);
1333    }
1334}
1335
1336///////////////////////////////////////////////////////////////////////////////
1337
1338#undef    DEBUG_S32_OPAQUE_DITHER
1339
1340void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1341                                 const SkPMColor* SK_RESTRICT src,
1342                                 int count, U8CPU alpha, int x, int y) {
1343    SkASSERT(255 == alpha);
1344
1345#define    UNROLL    8
1346    if (count >= UNROLL) {
1347    uint8x8_t d;
1348    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1349    d = vld1_u8(dstart);
1350
1351    while (count >= UNROLL) {
1352        uint8x8_t sr, sg, sb;
1353        uint16x8_t dr, dg, db;
1354        uint16x8_t dst8;
1355        uint8x8x4_t vsrc;
1356
1357#ifdef SK_CPU_ARM64
1358        vsrc = sk_vld4_u8_arm64_3(src);
1359#else
1360        {
1361        register uint8x8_t d0 asm("d0");
1362        register uint8x8_t d1 asm("d1");
1363        register uint8x8_t d2 asm("d2");
1364        register uint8x8_t d3 asm("d3");
1365
1366        asm (
1367            "vld4.8    {d0-d3},[%[src]]! "
1368            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1369            :
1370        );
1371        vsrc.val[0] = d0;
1372        vsrc.val[1] = d1;
1373        vsrc.val[2] = d2;
1374        }
1375#endif
1376        sr = vsrc.val[NEON_R];
1377        sg = vsrc.val[NEON_G];
1378        sb = vsrc.val[NEON_B];
1379
1380        /* XXX: if we want to prefetch, hide it in the above asm()
1381         * using the gcc __builtin_prefetch(), the prefetch will
1382         * fall to the bottom of the loop -- it won't stick up
1383         * at the top of the loop, just after the vld4.
1384         */
1385
1386        // sr = sr - (sr>>5) + d
1387        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1388        dr = vaddl_u8(sr, d);
1389
1390        // sb = sb - (sb>>5) + d
1391        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1392        db = vaddl_u8(sb, d);
1393
1394        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1395        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1396        dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1397
1398        // pack high bits of each into 565 format  (rgb, b is lsb)
1399        dst8 = vshrq_n_u16(db, 3);
1400        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1401        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1402
1403        // store it
1404        vst1q_u16(dst, dst8);
1405
1406#if    defined(DEBUG_S32_OPAQUE_DITHER)
1407        // always good to know if we generated good results
1408        {
1409        int i, myx = x, myy = y;
1410        DITHER_565_SCAN(myy);
1411        for (i=0;i<UNROLL;i++) {
1412            // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
1413            SkPMColor c = src[i-8];
1414            unsigned dither = DITHER_VALUE(myx);
1415            uint16_t val = SkDitherRGB32To565(c, dither);
1416            if (val != dst[i]) {
1417            SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1418                c, dither, val, dst[i], dstart[i]);
1419            }
1420            DITHER_INC_X(myx);
1421        }
1422        }
1423#endif
1424
1425        dst += UNROLL;
1426        // we don't need to increment src as the asm above has already done it
1427        count -= UNROLL;
1428        x += UNROLL;        // probably superfluous
1429    }
1430    }
1431#undef    UNROLL
1432
1433    // residuals
1434    if (count > 0) {
1435        DITHER_565_SCAN(y);
1436        do {
1437            SkPMColor c = *src++;
1438            SkPMColorAssert(c);
1439            SkASSERT(SkGetPackedA32(c) == 255);
1440
1441            unsigned dither = DITHER_VALUE(x);
1442            *dst++ = SkDitherRGB32To565(c, dither);
1443            DITHER_INC_X(x);
1444        } while (--count != 0);
1445    }
1446}
1447
1448void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
1449                      SkPMColor color) {
1450    if (count <= 0) {
1451        return;
1452    }
1453
1454    if (0 == color) {
1455        if (src != dst) {
1456            memcpy(dst, src, count * sizeof(SkPMColor));
1457        }
1458        return;
1459    }
1460
1461    unsigned colorA = SkGetPackedA32(color);
1462    if (255 == colorA) {
1463        sk_memset32(dst, color, count);
1464        return;
1465    }
1466
1467    unsigned scale = 256 - SkAlpha255To256(colorA);
1468
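    // Every pixel below becomes, per channel, dst = color + ((src * scale) >> 8):
    // the NEON block handles 8 pixels per pass and the scalar loop at the end
    // finishes whatever remains.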
1469    if (count >= 8) {
1470        uint32x4_t vcolor;
1471        uint8x8_t vscale;
1472
1473        vcolor = vdupq_n_u32(color);
1474
1475        // scale numerical interval [0-255], so load as 8 bits
1476        vscale = vdup_n_u8(scale);
1477
1478        do {
1479            // load src color, 8 pixels, 4 64 bit registers
1480            // (and increment src).
1481            uint32x2x4_t vsrc;
1482#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
1483            asm (
1484                "vld1.32    %h[vsrc], [%[src]]!"
1485                : [vsrc] "=w" (vsrc), [src] "+r" (src)
1486                : :
1487            );
1488#else // 64bit targets and Clang
1489            vsrc.val[0] = vld1_u32(src);
1490            vsrc.val[1] = vld1_u32(src+2);
1491            vsrc.val[2] = vld1_u32(src+4);
1492            vsrc.val[3] = vld1_u32(src+6);
1493            src += 8;
1494#endif
1495
1496            // multiply long by scale, 64 bits at a time,
1497            // destination into a 128 bit register.
1498            uint16x8x4_t vtmp;
1499            vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
1500            vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
1501            vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
1502            vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);
1503
1504            // shift the 128 bit registers, containing the 16
1505            // bit scaled values back to 8 bits, narrowing the
1506            // results to 64 bit registers.
1507            uint8x16x2_t vres;
1508            vres.val[0] = vcombine_u8(
1509                            vshrn_n_u16(vtmp.val[0], 8),
1510                            vshrn_n_u16(vtmp.val[1], 8));
1511            vres.val[1] = vcombine_u8(
1512                            vshrn_n_u16(vtmp.val[2], 8),
1513                            vshrn_n_u16(vtmp.val[3], 8));
1514
1515            // adding back the color, using 128 bit registers.
1516            uint32x4x2_t vdst;
1517            vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
1518                                               vreinterpretq_u8_u32(vcolor));
1519            vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
1520                                               vreinterpretq_u8_u32(vcolor));
1521
1522            // store back the 8 calculated pixels (2 128 bit
1523            // registers), and increment dst.
1524#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
1525            asm (
1526                "vst1.32    %h[vdst], [%[dst]]!"
1527                : [dst] "+r" (dst)
1528                : [vdst] "w" (vdst)
1529                : "memory"
1530            );
1531#else // 64bit targets and Clang
1532            vst1q_u32(dst, vdst.val[0]);
1533            vst1q_u32(dst+4, vdst.val[1]);
1534            dst += 8;
1535#endif
1536            count -= 8;
1537
1538        } while (count >= 8);
1539    }
1540
1541    while (count > 0) {
1542        *dst = color + SkAlphaMulQ(*src, scale);
1543        src += 1;
1544        dst += 1;
1545        count--;
1546    }
1547}
1548
1549///////////////////////////////////////////////////////////////////////////////
1550
1551const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1552    // no dither
1553    S32_D565_Opaque_neon,
1554    S32_D565_Blend_neon,
1555#ifdef SK_CPU_ARM32
1556    S32A_D565_Opaque_neon,
1557#else
1558    NULL,
1559#endif
1560    S32A_D565_Blend_neon,
1561
1562    // dither
1563    S32_D565_Opaque_Dither_neon,
1564    S32_D565_Blend_Dither_neon,
1565    S32A_D565_Opaque_Dither_neon,
1566    NULL,   // S32A_D565_Blend_Dither
1567};
1568
1569const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1570    NULL,   // S32_Opaque,
1571    S32_Blend_BlitRow32_neon,        // S32_Blend,
1572    /*
     * We have two choices for S32A_Opaque procs. One of them reads the src alpha
1574     * value and attempts to optimize accordingly.  The optimization is
1575     * sensitive to the source content and is not a win in all cases. For
1576     * example, if there are a lot of transitions between the alpha states,
1577     * the performance will almost certainly be worse.  However, for many
1578     * common cases the performance is equivalent or better than the standard
1579     * case where we do not inspect the src alpha.
1580     */
1581#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
1583    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
1584#else
1585    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
1586#endif
1587#ifdef SK_CPU_ARM32
1588    S32A_Blend_BlitRow32_neon        // S32A_Blend
1589#else
1590    NULL
1591#endif
1592};
1593