// SkBlitRow_opts_arm_neon.cpp, revision 95c2e5532b094add82b007bdfcd4c64050b6b366
/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkCachePreload_arm.h"
#include "SkColor_opts_neon.h"
#include <arm_neon.h>

void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load
        vsrc = vld4_u8((uint8_t*)src);

        // Convert src to 565
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        src += 8;
        count -= 8;
    }

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    }
}
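
/* For reference, a scalar sketch of the 32 -> 565 conversion done eight pixels
 * at a time above; this is the same packing SkPixel32ToPixel16_ToU16() applies
 * in the leftover loop (blue in the low five bits):
 *
 *     uint16_t r = (SkGetPackedR32(c) >> 3) << 11;  // keep top 5 bits of red
 *     uint16_t g = (SkGetPackedG32(c) >> 2) << 5;   // keep top 6 bits of green
 *     uint16_t b =  SkGetPackedB32(c) >> 3;         // keep top 5 bits of blue
 *     uint16_t pixel565 = r | g | b;
 */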

void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    uint16x8_t vmask_blue, vscale;

    // prepare constants
    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
    vmask_blue = vmovq_n_u16(0x1F);

    while (count >= 8) {
        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
        uint16x8_t vres_r, vres_g, vres_b;
        uint8x8_t vsrc_r, vsrc_g, vsrc_b;

        // Load src
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]!"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        vsrc_g = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
        vsrc_r = d2; vsrc_b = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
        vsrc_r = d0; vsrc_b = d2;
#endif
        }

        // Load and unpack dst
        vdst = vld1q_u16(dst);
        vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
        vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
        vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
        vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green

        // Shift src to 565
        vsrc_r = vshr_n_u8(vsrc_r, 3);    // shift red to 565 range
        vsrc_g = vshr_n_u8(vsrc_g, 2);    // shift green to 565 range
        vsrc_b = vshr_n_u8(vsrc_b, 3);    // shift blue to 565 range

        // Scale src - dst
        vres_r = vmovl_u8(vsrc_r) - vdst_r;
        vres_g = vmovl_u8(vsrc_g) - vdst_g;
        vres_b = vmovl_u8(vsrc_b) - vdst_b;

        vres_r = vshrq_n_u16(vres_r * vscale, 8);
        vres_g = vshrq_n_u16(vres_g * vscale, 8);
        vres_b = vshrq_n_u16(vres_b * vscale, 8);

        vres_r += vdst_r;
        vres_g += vdst_g;
        vres_b += vdst_b;

        // Combine
        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue

        // Store
        vst1q_u16(dst, vres_b);
        dst += 8;
        count -= 8;
    }
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            uint16_t d = *dst;
            *dst++ = SkPackRGB16(
                    SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
                    SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
                    SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
        } while (--count != 0);
    }
}
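
/* A scalar sketch of what the vector loop above computes per channel; it is
 * the same arithmetic SkAlphaBlend() performs in the leftover loop:
 *
 *     int scale = SkAlpha255To256(alpha);   // maps 0..255 to 1..256
 *     result    = dst + (((src565 - dst) * scale) >> 8);
 *
 * The 16-bit subtract can wrap when src < dst, but only the low 5 or 6 bits
 * of each channel survive the final vsli packing, so the wraparound cancels.
 */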

void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      // Thumb does not support the standard ARM conditional
                      // instructions but instead requires the 'it' instruction
                      // to signal conditional execution
                      "it eq                                  \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8 pixel versions
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzpq.u16  q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8 pixel versions
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}
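
/* A scalar sketch of the blend both assembly paths above implement, per pixel
 * (destination alpha is deliberately never computed, since 565 has none):
 *
 *     expand dst 565 -> 888 by shifting each channel up (low bits left zero)
 *     result = sat8(src + dst * (255 - src_alpha) / 255)    // per channel
 *     repack result 888 -> 565, keeping the top 5/6/5 bits (the vsri pair)
 *
 * The division by 255 is approximated by the vshr/vaddhn pairs, which add the
 * product to a shifted copy of itself and keep the high byte, with a rounding
 * bias of 128 preloaded from d31.
 */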

static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);
    return vshrq_n_u16(prod, 8);
}
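
/* SkDiv255Round_neon8 is the usual exact round-to-nearest division by 255:
 *
 *     SkDiv255Round(x) == (x + 128 + ((x + 128) >> 8)) >> 8   for x in [0, 255*255]
 *
 * Worked example at the top of the range, x = 255*255 = 65025:
 * 65025 + 128 = 65153; 65153 >> 8 = 254; (65153 + 254) >> 8 = 65407 >> 8 = 255.
 * The intermediate 65407 still fits in the 16-bit lanes, so nothing overflows.
 */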

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        // prepare constants
        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);

        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8    {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif

            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                     // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
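
/* Indexing example: for a span starting at (x, y) with (y & 3) == 1 and
 * (x & 3) == 2, dstart points into row 1 at offset 2, so an 8-byte load picks
 * up 7,3,6,2,7,3,6,2. The repetition within each 12-byte row is what lets any
 * offset 0..3 yield a full 8 values without running past the end of the row.
 */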

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{
    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);       // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);      // set up blue mask

        do {
            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
            {
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
            );
            vsrc_g = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
            vsrc_r = d2; vsrc_b = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
            vsrc_r = d0; vsrc_b = d2;
#endif
            }

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
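
/* The dither arithmetic in the vector loop above, written out for red as a
 * scalar sketch (green uses >> 6 and >> 2, with the dither value halved):
 *
 *     r565 = (r + dither - (r >> 5)) >> 3;
 *
 * Subtracting r >> 5 keeps the dithered value in range when r is near 255;
 * it is the same arithmetic the SkDITHER_*32To565() macros perform in the
 * leftover loop.
 */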

void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {

    uint8x8_t alpha_mask;

    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    /* do the NEON unrolled code */
#define    UNROLL    4
    while (count >= UNROLL) {
        uint8x8_t src_raw, dst_raw, dst_final;
        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

        /* The two prefetches below may make the code slightly
         * slower for small values of count but are worth having
         * in the general case.
         */
        __builtin_prefetch(src+32);
        __builtin_prefetch(dst+32);

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if    UNROLL > 2
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if    UNROLL > 2
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

    /* 1st and 2nd pixels of the unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);
    }

#if    UNROLL > 2
    /* the 3rd and 4th pixels of our unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
    }
#endif

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if    UNROLL > 2
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

        src += UNROLL;
        dst += UNROLL;
        count -= UNROLL;
    }
#undef    UNROLL

    /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}
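
/* Scalar form of the blend in each unrolled iteration above, i.e. what
 * SkPMSrcOver() does in the residual loop:
 *
 *     unsigned a = SkGetPackedA32(src);
 *     result     = src + ((dst * (256 - a)) >> 8);   // per byte lane
 *
 * The {3,3,3,3,7,7,7,7} table feeds vtbl1 so that pixel 0's alpha (byte 3 on
 * a little-endian core with SK_A32_SHIFT == 24) and pixel 1's alpha (byte 7)
 * are broadcast across their pixel's four lanes before the widening multiply.
 */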

void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;

#define UNROLL  4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if (src >= src_end)
        goto TAIL;
    if (*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if (*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {
        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
         * it might make sense to go to the optimized loops */
        if ((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while (src < src_end);

    if (src >= src_end)
        goto TAIL;

    if (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /* fall-thru */

ALPHA_0:

    /* In this state, we know the current alpha is 0 and
     * we optimize for the next alpha also being zero. */
    src_temp = src;  // so we don't have to increment dst every time
    do {
        if (*(++src) > ALPHA_TRANS)
            break;
        if (*(++src) > ALPHA_TRANS)
            break;
        if (*(++src) > ALPHA_TRANS)
            break;
        if (*(++src) > ALPHA_TRANS)
            break;
    } while (src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if (src >= src_end)
        goto TAIL;
    if (*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    while ((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0] = src[0];
        dst[1] = src[1];
        dst[2] = src[2];
        dst[3] = src[3];
        src += UNROLL;
        dst += UNROLL;
        if (src >= src_end)
            goto TAIL;
    }

    // Handle remainder.
    if (*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if (*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if (*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if (src >= src_end)
        goto TAIL;
    if (*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  // go to the real end
    while (src != src_end) {
        if (*src != 0) {
            if (*src >= ALPHA_OPAQ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef    UNROLL
    return;
}
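
/* The control flow above is a small three-state machine keyed on source
 * alpha: runs of transparent pixels (ALPHA_0) only advance the pointers, runs
 * of opaque pixels (ALPHA_255) degenerate into plain copies, and only mixed
 * runs (ALPHA_1_TO_254) pay for the full NEON blend. A single compare against
 * ALPHA_TRANS/ALPHA_OPAQ classifies a whole SkPMColor at once, which is why
 * this proc is only selected when SK_A32_SHIFT == 24 (see the proc table at
 * the bottom of this file).
 */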

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}
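
/* Per pixel, the loop above computes the following (a sketch of the same
 * arithmetic as the portable version referenced above):
 *
 *     src_scale = SkAlpha255To256(alpha);    // 1..256
 *     dst_scale = 256 - src_scale;
 *     result    = ((src * src_scale) >> 8) + ((dst * dst_scale) >> 8);  // per byte
 */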

void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src,
                         int count, U8CPU alpha) {

    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale *= alpha256;
        dst_scale >>= 8;
        dst_scale = 256 - dst_scale;

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {
            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while (count);
    }
}
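
/* Sketch of the per-pixel scales used above:
 *
 *     src_scale = SkAlpha255To256(alpha);                  // global, 1..256
 *     dst_scale = 256 - ((src_alpha * src_scale) >> 8);    // per pixel
 *     result    = ((src * src_scale) >> 8) + ((dst * dst_scale) >> 8);
 *
 * i.e. the source is faded by the global alpha first, and the destination
 * keeps whatever coverage the faded source leaves uncovered.
 */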

///////////////////////////////////////////////////////////////////////////////

#undef    DEBUG_OPAQUE_DITHER

#if    defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, "   %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t));    /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif

void S32A_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8

    if (count >= UNROLL) {

#if defined(DEBUG_OPAQUE_DITHER)
    uint16_t tmpbuf[UNROLL];
    int td[UNROLL];
    int tdv[UNROLL];
    int ta[UNROLL];
    int tap[UNROLL];
    uint16_t in_dst[UNROLL];
    int offset = 0;
    int noisy = 0;
#endif

    uint8x8_t dbase;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    dbase = vld1_u8(dstart);

        do {
        uint8x8_t sr, sg, sb, sa, d;
        uint16x8_t dst8, scale8, alpha8;
        uint16x8_t dst_r, dst_g, dst_b;

#if defined(DEBUG_OPAQUE_DITHER)
        // calculate 8 elements worth into a temp buffer
        {
        int my_y = y;
        int my_x = x;
        SkPMColor* my_src = (SkPMColor*)src;
        uint16_t* my_dst = dst;
        int i;

        DITHER_565_SCAN(my_y);
        for(i = 0; i < UNROLL; i++) {
            SkPMColor c = *my_src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
                tdv[i] = DITHER_VALUE(my_x);
                ta[i] = a;
                tap[i] = SkAlpha255To256(a);
                td[i] = d;

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
                td[i] = d;
            } else {
                tmpbuf[i] = *my_dst;
                ta[i] = tdv[i] = td[i] = 0xbeef;
            }
            in_dst[i] = *my_dst;
            my_dst += 1;
            DITHER_INC_X(my_x);
        }
        }
#endif

        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
            :
        );
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
            sr = d2; sg = d1; sb = d0; sa = d3;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
            sr = d0; sg = d1; sb = d2; sa = d3;
#endif
        }

        /* calculate 'd', which will be 0..7
         * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
         */
        alpha8 = vmovl_u8(dbase);
        alpha8 = vmlal_u8(alpha8, sa, dbase);
        d = vshrn_n_u16(alpha8, 8);    // narrowing too

        // sr = sr - (sr>>5) + d
        /* watching for 8-bit overflow.  d is 0..7; risky range of
         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
         * safe as long as we do ((sr-sr>>5) + d)
         */
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        sr = vadd_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        sb = vadd_u8(sb, d);

        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        sg = vadd_u8(sg, vshr_n_u8(d,1));

        // need to pick up 8 dst's -- at 16 bits each, 128 bits
        dst8 = vld1q_u16(dst);
        dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
        dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
        dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits

        // blend
        scale8 = vsubw_u8(vdupq_n_u16(256), sa);

        // combine the addq and mul, save 3 insns
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);

        // repack to store
        dst8 = vshrq_n_u16(dst_b, 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

        vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
        // verify my 8 elements match the temp buffer
        {
        int i, bad=0;
        static int invocation;

        for (i = 0; i < UNROLL; i++) {
            if (tmpbuf[i] != dst[i]) {
                bad=1;
            }
        }
        if (bad) {
            SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                     invocation, offset);
            SkDebugf("  alpha 0x%x\n", alpha);
            for (i = 0; i < UNROLL; i++)
                SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                         i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
                         in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);

            showme16("alpha8", &alpha8, sizeof(alpha8));
            showme16("scale8", &scale8, sizeof(scale8));
            showme8("d", &d, sizeof(d));
            showme16("dst8", &dst8, sizeof(dst8));
            showme16("dst_b", &dst_b, sizeof(dst_b));
            showme16("dst_g", &dst_g, sizeof(dst_g));
            showme16("dst_r", &dst_r, sizeof(dst_r));
            showme8("sb", &sb, sizeof(sb));
            showme8("sg", &sg, sizeof(sg));
            showme8("sr", &sr, sizeof(sr));

            return;
        }
        offset += UNROLL;
        invocation++;
        }
#endif
        dst += UNROLL;
        count -= UNROLL;
        // skip x += UNROLL, since it's unchanged mod-4
        } while (count >= UNROLL);
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
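
/* Note on the "combine the addq and mul" trick above: scale8 is
 * (256 - alpha) >> 3, which fits alongside the 5/6-bit dst channels in 16-bit
 * lanes, so a single vmlaq per channel forms src_shifted + dst * scale8.
 * After the final shift, blue for example works out to roughly
 *
 *     ((sb << 2) + dst_b * ((256 - sa) >> 3)) >> 5
 *         == (sb >> 3) + dst_b * (256 - sa) / 256
 *
 * which is the 565 form of src + dst * (1 - src_alpha).
 */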

///////////////////////////////////////////////////////////////////////////////

#undef    DEBUG_S32_OPAQUE_DITHER

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8
    if (count >= UNROLL) {
    uint8x8_t d;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    d = vld1_u8(dstart);

    while (count >= UNROLL) {
        uint8x8_t sr, sg, sb;
        uint16x8_t dr, dg, db;
        uint16x8_t dst8;

        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        sg = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
        sr = d2; sb = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
        sr = d0; sb = d2;
#endif
        }
        /* XXX: if we want to prefetch, hide it in the above asm()
         * using the gcc __builtin_prefetch(), the prefetch will
         * fall to the bottom of the loop -- it won't stick up
         * at the top of the loop, just after the vld4.
         */

        // sr = sr - (sr>>5) + d
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        dr = vaddl_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        db = vaddl_u8(sb, d);

        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        dg = vaddl_u8(sg, vshr_n_u8(d, 1));

        // pack high bits of each into 565 format  (rgb, b is lsb)
        dst8 = vshrq_n_u16(db, 3);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

        // store it
        vst1q_u16(dst, dst8);

#if    defined(DEBUG_S32_OPAQUE_DITHER)
        // always good to know if we generated good results
        {
        int i, myx = x, myy = y;
        DITHER_565_SCAN(myy);
        for (i=0;i<UNROLL;i++) {
            // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
            SkPMColor c = src[i-8];
            unsigned dither = DITHER_VALUE(myx);
            uint16_t val = SkDitherRGB32To565(c, dither);
            if (val != dst[i]) {
            SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dstart[i] %02x\n",
                c, dither, val, dst[i], dstart[i]);
            }
            DITHER_INC_X(myx);
        }
        }
#endif

        dst += UNROLL;
        // we don't need to increment src as the asm above has already done it
        count -= UNROLL;
        x += UNROLL;        // probably superfluous
    }
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        sk_memset32(dst, color, count);
    } else {
        unsigned scale = 256 - SkAlpha255To256(colorA);

        if (count >= 8) {
            // at the end of this assembly, count will have been decremented
            // to a negative value. That is, if count mod 8 = x, it will be
            // -8 + x coming out.
            asm volatile (
                PLD128(src, 0)

                "vdup.32    q0, %[color]                \n\t"

                PLD128(src, 128)

                // scale numerical interval [0-255], so load as 8 bits
                "vdup.8     d2, %[scale]                \n\t"

                PLD128(src, 256)

                "subs       %[count], %[count], #8      \n\t"

                PLD128(src, 384)

                "Loop_Color32:                          \n\t"

                // load src color, 8 pixels, 4 64 bit registers
                // (and increment src).
                "vld1.32    {d4-d7}, [%[src]]!          \n\t"

                PLD128(src, 384)

                // multiply long by scale, 64 bits at a time,
                // destination into a 128 bit register.
                "vmull.u8   q4, d4, d2                  \n\t"
                "vmull.u8   q5, d5, d2                  \n\t"
                "vmull.u8   q6, d6, d2                  \n\t"
                "vmull.u8   q7, d7, d2                  \n\t"

                // shift the 128 bit registers, containing the 16
                // bit scaled values back to 8 bits, narrowing the
                // results to 64 bit registers.
                "vshrn.i16  d8, q4, #8                  \n\t"
                "vshrn.i16  d9, q5, #8                  \n\t"
                "vshrn.i16  d10, q6, #8                 \n\t"
                "vshrn.i16  d11, q7, #8                 \n\t"

                // adding back the color, using 128 bit registers.
                "vadd.i8    q6, q4, q0                  \n\t"
                "vadd.i8    q7, q5, q0                  \n\t"

                // store back the 8 calculated pixels (2 128 bit
                // registers), and increment dst.
                "vst1.32    {d12-d15}, [%[dst]]!        \n\t"

                "subs       %[count], %[count], #8      \n\t"
                "bge        Loop_Color32                \n\t"
                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                : [color] "r" (color), [scale] "r" (scale)
                : "cc", "memory",
                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
            );
            // At this point, if we went through the inline assembly, count is
            // a negative value:
            // if the value is -8, there is no pixel left to process.
            // if the value is -7, there is one pixel left to process
            // ...
            // And'ing it with 7 will give us the number of pixels
            // left to process.
            count = count & 0x7;
        }

        while (count > 0) {
            *dst = color + SkAlphaMulQ(*src, scale);
            src += 1;
            dst += 1;
            count--;
        }
    }
}
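
/* What Color32 computes per pixel, in both the assembly loop and the scalar
 * tail (the fully transparent and fully opaque color cases are
 * short-circuited above):
 *
 *     scale  = 256 - SkAlpha255To256(SkGetPackedA32(color));
 *     result = color + ((src * scale) >> 8);   // per byte lane
 */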

///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    S32_D565_Opaque_neon,
    S32_D565_Blend_neon,
    S32A_D565_Opaque_neon,
    S32A_D565_Blend_neon,

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,   // S32A_D565_Blend_Dither
};

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,        // S32_Blend,
    /*
     * We have two choices for S32A_Opaque procs. One reads the src alpha
     * value and attempts to optimize accordingly. The optimization is
     * sensitive to the source content and is not a win in all cases. For
     * example, if there are a lot of transitions between the alpha states,
     * the performance will almost certainly be worse. However, for many
     * common cases the performance is equivalent or better than the standard
     * case where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
#endif
    S32A_Blend_BlitRow32_neon        // S32A_Blend
};