SkBlitRow_opts_arm_neon.cpp revision 4cc26324e3be5258fae9dc102aa6a3af7d1c96ea
1/*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "SkBlitRow_opts_arm_neon.h"
9
10#include "SkBlitMask.h"
11#include "SkBlitRow.h"
12#include "SkColorPriv.h"
13#include "SkDither.h"
14#include "SkMathPriv.h"
15#include "SkUtils.h"
16
17#include "SkCachePreload_arm.h"
18#include "SkColor_opts_neon.h"
19#include <arm_neon.h>
20
21void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
22                           const SkPMColor* SK_RESTRICT src, int count,
23                           U8CPU alpha, int /*x*/, int /*y*/) {
24    SkASSERT(255 == alpha);
25
26    while (count >= 8) {
27        uint8x8x4_t vsrc;
28        uint16x8_t vdst;
29
30        // Load
31        vsrc = vld4_u8((uint8_t*)src);
32
33        // Convert src to 565
34        vdst = vshll_n_u8(vsrc.val[NEON_R], 8);
35        vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_G], 8), 5);
36        vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_B], 8), 5+6);
37
38        // Store
39        vst1q_u16(dst, vdst);
40
41        // Prepare next iteration
42        dst += 8;
43        src += 8;
44        count -= 8;
45    }
46
47    // Leftovers
48    while (count > 0) {
49        SkPMColor c = *src++;
50        SkPMColorAssert(c);
51        *dst = SkPixel32ToPixel16_ToU16(c);
52        dst++;
53        count--;
54    }
55}
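
/* A scalar sketch of what the vshll/vsri packing above computes per pixel:
 * vshll.u8 #8 moves each channel into the top byte of a 16-bit lane and each
 * vsri then inserts the next channel directly below it, so the result keeps
 * the top 5/6/5 bits of r/g/b. Hypothetical helper, guarded out, for
 * illustration only.
 */
#if 0
static inline uint16_t pack_8888_channels_to_565_sketch(unsigned r, unsigned g, unsigned b) {
    // same value the NEON store above produces for one pixel
    return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}
#endif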
56
57void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
58                           const SkPMColor* SK_RESTRICT src, int count,
59                           U8CPU alpha, int /*x*/, int /*y*/) {
60    SkASSERT(255 == alpha);
61
62    if (count >= 8) {
63        uint16_t* SK_RESTRICT keep_dst = 0;
64
65        asm volatile (
66                      "ands       ip, %[count], #7            \n\t"
67                      "vmov.u8    d31, #1<<7                  \n\t"
68                      "vld1.16    {q12}, [%[dst]]             \n\t"
69                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
70                      // Thumb does not support the standard ARM conditional
71                      // instructions but instead requires the 'it' instruction
72                      // to signal conditional execution
73                      "it eq                                  \n\t"
74                      "moveq      ip, #8                      \n\t"
75                      "mov        %[keep_dst], %[dst]         \n\t"
76
77                      "add        %[src], %[src], ip, LSL#2   \n\t"
78                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
79                      "subs       %[count], %[count], ip      \n\t"
80                      "b          9f                          \n\t"
81                      // LOOP
82                      "2:                                         \n\t"
83
84                      "vld1.16    {q12}, [%[dst]]!            \n\t"
85                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
86                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
87                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
88                      "subs       %[count], %[count], #8      \n\t"
89                      "9:                                         \n\t"
90                      "pld        [%[dst],#32]                \n\t"
91                      // expand 0565 q12 to 8888 {d4-d7}
92                      "vmovn.u16  d4, q12                     \n\t"
93                      "vshr.u16   q11, q12, #5                \n\t"
94                      "vshr.u16   q10, q12, #6+5              \n\t"
95                      "vmovn.u16  d5, q11                     \n\t"
96                      "vmovn.u16  d6, q10                     \n\t"
97                      "vshl.u8    d4, d4, #3                  \n\t"
98                      "vshl.u8    d5, d5, #2                  \n\t"
99                      "vshl.u8    d6, d6, #3                  \n\t"
100
101                      "vmovl.u8   q14, d31                    \n\t"
102                      "vmovl.u8   q13, d31                    \n\t"
103                      "vmovl.u8   q12, d31                    \n\t"
104
105                      // duplicate in 4/2/1 & 8pix vsns
106                      "vmvn.8     d30, d3                     \n\t"
107                      "vmlal.u8   q14, d30, d6                \n\t"
108                      "vmlal.u8   q13, d30, d5                \n\t"
109                      "vmlal.u8   q12, d30, d4                \n\t"
110                      "vshr.u16   q8, q14, #5                 \n\t"
111                      "vshr.u16   q9, q13, #6                 \n\t"
112                      "vaddhn.u16 d6, q14, q8                 \n\t"
113                      "vshr.u16   q8, q12, #5                 \n\t"
114                      "vaddhn.u16 d5, q13, q9                 \n\t"
115                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
116                      "vaddhn.u16 d4, q12, q8                 \n\t"
117                      // intentionally don't calculate alpha
118                      // result in d4-d6
119
120                      "vqadd.u8   d5, d5, d1                  \n\t"
121                      "vqadd.u8   d4, d4, d2                  \n\t"
122
123                      // pack 8888 {d4-d6} to 0565 q10
124                      "vshll.u8   q10, d6, #8                 \n\t"
125                      "vshll.u8   q3, d5, #8                  \n\t"
126                      "vshll.u8   q2, d4, #8                  \n\t"
127                      "vsri.u16   q10, q3, #5                 \n\t"
128                      "vsri.u16   q10, q2, #11                \n\t"
129
130                      "bne        2b                          \n\t"
131
132                      "1:                                         \n\t"
133                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
134                      : [count] "+r" (count)
135                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
136                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
137                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
138                      "d30","d31"
139                      );
140    }
141    else
142    {   // handle count < 8
143        uint16_t* SK_RESTRICT keep_dst = 0;
144
145        asm volatile (
146                      "vmov.u8    d31, #1<<7                  \n\t"
147                      "mov        %[keep_dst], %[dst]         \n\t"
148
149                      "tst        %[count], #4                \n\t"
150                      "beq        14f                         \n\t"
151                      "vld1.16    {d25}, [%[dst]]!            \n\t"
152                      "vld1.32    {q1}, [%[src]]!             \n\t"
153
154                      "14:                                        \n\t"
155                      "tst        %[count], #2                \n\t"
156                      "beq        12f                         \n\t"
157                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
158                      "vld1.32    {d1}, [%[src]]!             \n\t"
159
160                      "12:                                        \n\t"
161                      "tst        %[count], #1                \n\t"
162                      "beq        11f                         \n\t"
163                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
164                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"
165
166                      "11:                                        \n\t"
167                      // unzips achieve the same as a vld4 operation
168                      "vuzpq.u16  q0, q1                      \n\t"
169                      "vuzp.u8    d0, d1                      \n\t"
170                      "vuzp.u8    d2, d3                      \n\t"
171                      // expand 0565 q12 to 8888 {d4-d7}
172                      "vmovn.u16  d4, q12                     \n\t"
173                      "vshr.u16   q11, q12, #5                \n\t"
174                      "vshr.u16   q10, q12, #6+5              \n\t"
175                      "vmovn.u16  d5, q11                     \n\t"
176                      "vmovn.u16  d6, q10                     \n\t"
177                      "vshl.u8    d4, d4, #3                  \n\t"
178                      "vshl.u8    d5, d5, #2                  \n\t"
179                      "vshl.u8    d6, d6, #3                  \n\t"
180
181                      "vmovl.u8   q14, d31                    \n\t"
182                      "vmovl.u8   q13, d31                    \n\t"
183                      "vmovl.u8   q12, d31                    \n\t"
184
185                      // duplicate in 4/2/1 & 8pix vsns
186                      "vmvn.8     d30, d3                     \n\t"
187                      "vmlal.u8   q14, d30, d6                \n\t"
188                      "vmlal.u8   q13, d30, d5                \n\t"
189                      "vmlal.u8   q12, d30, d4                \n\t"
190                      "vshr.u16   q8, q14, #5                 \n\t"
191                      "vshr.u16   q9, q13, #6                 \n\t"
192                      "vaddhn.u16 d6, q14, q8                 \n\t"
193                      "vshr.u16   q8, q12, #5                 \n\t"
194                      "vaddhn.u16 d5, q13, q9                 \n\t"
195                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
196                      "vaddhn.u16 d4, q12, q8                 \n\t"
197                      // intentionally don't calculate alpha
198                      // result in d4-d6
199
200                      "vqadd.u8   d5, d5, d1                  \n\t"
201                      "vqadd.u8   d4, d4, d2                  \n\t"
202
203                      // pack 8888 {d4-d6} to 0565 q10
204                      "vshll.u8   q10, d6, #8                 \n\t"
205                      "vshll.u8   q3, d5, #8                  \n\t"
206                      "vshll.u8   q2, d4, #8                  \n\t"
207                      "vsri.u16   q10, q3, #5                 \n\t"
208                      "vsri.u16   q10, q2, #11                \n\t"
209
210                      // store
211                      "tst        %[count], #4                \n\t"
212                      "beq        24f                         \n\t"
213                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
214
215                      "24:                                        \n\t"
216                      "tst        %[count], #2                \n\t"
217                      "beq        22f                         \n\t"
218                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
219
220                      "22:                                        \n\t"
221                      "tst        %[count], #1                \n\t"
222                      "beq        21f                         \n\t"
223                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
224
225                      "21:                                        \n\t"
226                      : [count] "+r" (count)
227                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
228                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
229                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
230                      "d30","d31"
231                      );
232    }
233}
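
/* Roughly, each pixel in the assembly above is handled as below: expand the
 * 565 destination to 8 bits per channel by shifting, scale it by (255 - sa),
 * add the source channel with saturation (vqadd), and repack to 565. This is
 * a hypothetical, guarded-out sketch; the real code approximates the divide
 * by 255 with shift/add-high-narrow tricks rather than the rounded shift used
 * here, so results can differ in the low bit.
 */
#if 0
static inline uint16_t srcover_8888_over_565_sketch(unsigned sr, unsigned sg, unsigned sb,
                                                    unsigned sa, uint16_t dst) {
    unsigned dr = ((dst >> 11) & 0x1F) << 3;   // expand dst red to 8 bits (no replication)
    unsigned dg = ((dst >>  5) & 0x3F) << 2;
    unsigned db = ( dst        & 0x1F) << 3;
    unsigned isa = 255 - sa;
    unsigned r = sr + ((dr * isa + 128) >> 8); if (r > 255) r = 255;
    unsigned g = sg + ((dg * isa + 128) >> 8); if (g > 255) g = 255;
    unsigned b = sb + ((db * isa + 128) >> 8); if (b > 255) b = 255;
    return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}
#endif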
234
235void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
236                          const SkPMColor* SK_RESTRICT src, int count,
237                          U8CPU alpha, int /*x*/, int /*y*/) {
238
239    U8CPU alpha_for_asm = alpha;
240
241    asm volatile (
242    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
243     * the original in two respects:
244     *  1. The results have a few mismatches compared to the original code. These mismatches
245     *     never exceed 1. It's possible to improve accuracy vs. a floating point
246     *     implementation by introducing rounding right shifts (vrshr) for the final stage.
247     *     Rounding is not present in the code below, because although results would be closer
248     *     to a floating point implementation, the number of mismatches compared to the
249     *     original code would be far greater.
250     *  2. On certain inputs, the original code can overflow, causing colour channels to
251     *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
252     *     to affect another.
253     */
254
255#if 1
256        /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
257                  "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
258#else
259                  "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
260#endif
261                  "vmov.u16   q3, #255                        \n\t"   // set up constant
262                  "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
263                  "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
264                  "beq        2f                              \n\t"   // if count8 == 0, exit
265                  "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask
266
267                  "1:                                             \n\t"
268                  "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
269                  "subs       r4, r4, #1                      \n\t"   // decrement loop counter
270                  "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
271                  //  and deinterleave
272
273                  "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
274                  "vand       q10, q0, q15                    \n\t"   // extract blue
275                  "vshr.u16   q8, q0, #11                     \n\t"   // extract red
276                  "vshr.u16   q9, q9, #10                     \n\t"   // extract green
277                  // dstrgb = {q8, q9, q10}
278
279                  "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
280                  "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
281                  "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range
282
283                  "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
284                  "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
285                  "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
286                  "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
287                  // srcrgba = {q11, q12, q13, q14}
288
289                  "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
290                  "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
291                  "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
292                  "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale
293
294                  "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
295                  "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
296                  // dst_scale = q2
297
298                  "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
299                  "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
300                  "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale
301
302#if 1
303    // trying for a better match with SkDiv255Round(a)
304    // C alg is:  a+=128; (a+a>>8)>>8
305    // we'll use just a rounding shift [q2 is available for scratch]
306                  "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
307                  "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
308                  "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
309#else
310    // arm's original "truncating divide by 256"
311                  "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
312                  "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
313                  "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
314#endif
315
316                  "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
317                  "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
318                  "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr
319
320                  "bne        1b                              \n\t"   // if counter != 0, loop
321                  "2:                                             \n\t"   // exit
322
323                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
324                  :
325                  : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
326                  );
327
328    count &= 7;
329    if (count > 0) {
330        do {
331            SkPMColor sc = *src++;
332            if (sc) {
333                uint16_t dc = *dst;
334                unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
335                unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
336                unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
337                unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
338                *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
339            }
340            dst += 1;
341        } while (--count != 0);
342    }
343}
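
/* The rounding trade-off described at the top of this function, in scalar
 * form: SkDiv255Round() is an exact rounded divide by 255, while vrshr.u16 #8
 * is a rounded divide by 256; for the products that occur here the two differ
 * by at most 1, which is where the off-by-one mismatches come from.
 * Hypothetical helpers, guarded out.
 */
#if 0
static inline unsigned div255_round_sketch(unsigned x) {  // what SkDiv255Round(x) computes
    x += 128;
    return (x + (x >> 8)) >> 8;
}
static inline unsigned div256_round_sketch(unsigned x) {  // one lane of "vrshr.u16 #8"
    return (x + 128) >> 8;
}
#endif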
344
345/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
346 * each dither value is spaced out into byte lanes, and repeated
347 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
348 * start of each row.
349 */
350static const uint8_t gDitherMatrix_Neon[48] = {
351    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
352    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
353    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
354    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
355
356};
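
/* How the table is indexed: the callers below load eight consecutive dither
 * values starting at column (x & 3) of row (y & 3); because each 4-entry row
 * pattern is stored three times (12 bytes), any starting offset 0..3 still
 * leaves 8 valid bytes for the vld1. Hypothetical helper, guarded out.
 */
#if 0
static inline const uint8_t* dither_row_for_sketch(int x, int y) {
    // e.g. x = 2, y = 1 selects {7, 3, 6, 2, 7, 3, 6, 2}
    return &gDitherMatrix_Neon[(y & 3) * 12 + (x & 3)];
}
#endif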
357
358void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
359                                int count, U8CPU alpha, int x, int y)
360{
361
362    SkASSERT(255 > alpha);
363
364    // rescale alpha to range 1 - 256
365    int scale = SkAlpha255To256(alpha);
366
367    if (count >= 8) {
368        /* select row and offset for dither array */
369        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
370
371        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
372        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
373
374        int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
375        uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask
376
377        do {
378
379            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
380            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
381            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
382            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
383            uint16x8_t vdst;
384            uint16x8_t vdst_r, vdst_g, vdst_b;
385            int16x8_t vres_r, vres_g, vres_b;
386            int8x8_t vres8_r, vres8_g, vres8_b;
387
388            // Load source and add dither
389            {
390            register uint8x8_t d0 asm("d0");
391            register uint8x8_t d1 asm("d1");
392            register uint8x8_t d2 asm("d2");
393            register uint8x8_t d3 asm("d3");
394
395            asm (
396                "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
397                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
398                :
399            );
400            vsrc_g = d1;
401#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
402            vsrc_r = d2; vsrc_b = d0;
403#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
404            vsrc_r = d0; vsrc_b = d2;
405#endif
406            }
407
408            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
409            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
410            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
411
412            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
413            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
414            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen
415
416            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
417            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
418            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result
419
420            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
421            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
422            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
423
424            // Load dst and unpack
425            vdst = vld1q_u16(dst);
426            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
427            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
428            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue
429
430            // subtract dst from src and widen
431            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
432            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
433            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
434
435            // multiply diffs by scale and shift
436            vres_r = vmulq_s16(vres_r, vscale);
437            vres_g = vmulq_s16(vres_g, vscale);
438            vres_b = vmulq_s16(vres_b, vscale);
439
440            vres8_r = vshrn_n_s16(vres_r, 8);
441            vres8_g = vshrn_n_s16(vres_g, 8);
442            vres8_b = vshrn_n_s16(vres_b, 8);
443
444            // add dst to result
445            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
446            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
447            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
448
449            // put result into 565 format
450            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
451            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
452
453            // Store result
454            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
455
456            // Next iteration
457            dst += 8;
458            count -= 8;
459
460        } while (count >= 8);
461    }
462
463    // Leftovers
464    if (count > 0) {
465        int scale = SkAlpha255To256(alpha);
466        DITHER_565_SCAN(y);
467        do {
468            SkPMColor c = *src++;
469            SkPMColorAssert(c);
470
471            int dither = DITHER_VALUE(x);
472            int sr = SkGetPackedR32(c);
473            int sg = SkGetPackedG32(c);
474            int sb = SkGetPackedB32(c);
475            sr = SkDITHER_R32To565(sr, dither);
476            sg = SkDITHER_G32To565(sg, dither);
477            sb = SkDITHER_B32To565(sb, dither);
478
479            uint16_t d = *dst;
480            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
481                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
482                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
483            DITHER_INC_X(x);
484        } while (--count != 0);
485    }
486}
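
/* Per channel, the vector loop above mirrors the scalar residual path: dither
 * the 8-bit source channel down to 565 precision, then interpolate from dst
 * toward it by scale/256. A hypothetical, guarded-out sketch for red (green
 * uses dither/2, a >>6 correction, and 6 bits of precision).
 */
#if 0
static inline unsigned blend_dither_red_sketch(unsigned sr8, unsigned dr5,
                                               unsigned dither, unsigned scale) {
    int sr5 = (int)((sr8 + dither - (sr8 >> 5)) >> 3);    // dither red down to 5 bits
    return dr5 + (((sr5 - (int)dr5) * (int)scale) >> 8);  // SkAlphaBlend(sr5, dr5, scale)
}
#endif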
487
488void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
489                                const SkPMColor* SK_RESTRICT src,
490                                int count, U8CPU alpha) {
491
492    SkASSERT(255 == alpha);
493    if (count > 0) {
494
495
496    uint8x8_t alpha_mask;
497
498    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
499    alpha_mask = vld1_u8(alpha_mask_setup);
500
501    /* do the NEON unrolled code */
502#define    UNROLL    4
503    while (count >= UNROLL) {
504        uint8x8_t src_raw, dst_raw, dst_final;
505        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
506
507        /* The two prefetches below may make the code slightly
508         * slower for small values of count but are worth having
509         * in the general case.
510         */
511        __builtin_prefetch(src+32);
512        __builtin_prefetch(dst+32);
513
514        /* get the source */
515        src_raw = vreinterpret_u8_u32(vld1_u32(src));
516#if    UNROLL > 2
517        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
518#endif
519
520        /* get and hold the dst too */
521        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
522#if    UNROLL > 2
523        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
524#endif
525
526    /* 1st and 2nd bits of the unrolling */
527    {
528        uint8x8_t dst_cooked;
529        uint16x8_t dst_wide;
530        uint8x8_t alpha_narrow;
531        uint16x8_t alpha_wide;
532
533        /* get the alphas spread out properly */
534        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
535        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
536
537        /* spread the dest */
538        dst_wide = vmovl_u8(dst_raw);
539
540        /* alpha mul the dest */
541        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
542        dst_cooked = vshrn_n_u16(dst_wide, 8);
543
544        /* sum -- ignoring any byte lane overflows */
545        dst_final = vadd_u8(src_raw, dst_cooked);
546    }
547
548#if    UNROLL > 2
549    /* the 3rd and 4th bits of our unrolling */
550    {
551        uint8x8_t dst_cooked;
552        uint16x8_t dst_wide;
553        uint8x8_t alpha_narrow;
554        uint16x8_t alpha_wide;
555
556        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
557        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
558
559        /* spread the dest */
560        dst_wide = vmovl_u8(dst_raw_2);
561
562        /* alpha mul the dest */
563        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
564        dst_cooked = vshrn_n_u16(dst_wide, 8);
565
566        /* sum -- ignoring any byte lane overflows */
567        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
568    }
569#endif
570
571        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
572#if    UNROLL > 2
573        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
574#endif
575
576        src += UNROLL;
577        dst += UNROLL;
578        count -= UNROLL;
579    }
580#undef    UNROLL
581
582    /* do any residual iterations */
583        while (--count >= 0) {
584            *dst = SkPMSrcOver(*src, *dst);
585            src += 1;
586            dst += 1;
587        }
588    }
589}
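
/* For reference, each lane of the unrolled loop above computes the usual
 * premultiplied src-over: widen dst, scale it by (256 - src alpha), narrow
 * with a >>8 and add the source bytes, ignoring byte-lane overflow as the
 * comments note. Hypothetical helper, guarded out.
 */
#if 0
static inline unsigned srcover_channel_sketch(unsigned src_c, unsigned dst_c, unsigned src_a) {
    return src_c + ((dst_c * (256 - src_a)) >> 8);  // vsubw/vmul/vshrn/vadd above
}
#endif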
590
591void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
592                                const SkPMColor* SK_RESTRICT src,
593                                int count, U8CPU alpha) {
594    SkASSERT(255 == alpha);
595
596    if (count <= 0)
597    return;
598
599    /* Use these to check if src is transparent or opaque */
600    const unsigned int ALPHA_OPAQ  = 0xFF000000;
601    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
602
603#define UNROLL  4
604    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
605    const SkPMColor* SK_RESTRICT src_temp = src;
606
607    /* set up the NEON variables */
608    uint8x8_t alpha_mask;
609    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
610    alpha_mask = vld1_u8(alpha_mask_setup);
611
612    uint8x8_t src_raw, dst_raw, dst_final;
613    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
614    uint8x8_t dst_cooked;
615    uint16x8_t dst_wide;
616    uint8x8_t alpha_narrow;
617    uint16x8_t alpha_wide;
618
619    /* choose the first processing type */
620    if( src >= src_end)
621        goto TAIL;
622    if(*src <= ALPHA_TRANS)
623        goto ALPHA_0;
624    if(*src >= ALPHA_OPAQ)
625        goto ALPHA_255;
626    /* fall-thru */
627
628ALPHA_1_TO_254:
629    do {
630
631        /* get the source */
632        src_raw = vreinterpret_u8_u32(vld1_u32(src));
633        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
634
635        /* get and hold the dst too */
636        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
637        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
638
639
640        /* get the alphas spread out properly */
641        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
642        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
643        /* we collapsed (255-a)+1 ... */
644        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
645
646        /* spread the dest */
647        dst_wide = vmovl_u8(dst_raw);
648
649        /* alpha mul the dest */
650        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
651        dst_cooked = vshrn_n_u16(dst_wide, 8);
652
653        /* sum -- ignoring any byte lane overflows */
654        dst_final = vadd_u8(src_raw, dst_cooked);
655
656        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
657        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
658        /* we collapsed (255-a)+1 ... */
659        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
660
661        /* spread the dest */
662        dst_wide = vmovl_u8(dst_raw_2);
663
664        /* alpha mul the dest */
665        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
666        dst_cooked = vshrn_n_u16(dst_wide, 8);
667
668        /* sum -- ignoring any byte lane overflows */
669        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
670
671        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
672        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
673
674        src += UNROLL;
675        dst += UNROLL;
676
677        /* if the next two pixels are both transparent or both opaque,
678        it might make sense to go to the optimized loops */
679        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
680            break;
681
682    } while(src < src_end);
683
684    if (src >= src_end)
685        goto TAIL;
686
687    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
688        goto ALPHA_255;
689
690    /*fall-thru*/
691
692ALPHA_0:
693
694    /*In this state, we know the current alpha is 0 and
695     we optimize for the next alpha also being zero. */
696    src_temp = src;  //so we don't have to increment dst every time
697    do {
698        if(*(++src) > ALPHA_TRANS)
699            break;
700        if(*(++src) > ALPHA_TRANS)
701            break;
702        if(*(++src) > ALPHA_TRANS)
703            break;
704        if(*(++src) > ALPHA_TRANS)
705            break;
706    } while(src < src_end);
707
708    dst += (src - src_temp);
709
710    /* no longer alpha 0, so determine where to go next. */
711    if( src >= src_end)
712        goto TAIL;
713    if(*src >= ALPHA_OPAQ)
714        goto ALPHA_255;
715    else
716        goto ALPHA_1_TO_254;
717
718ALPHA_255:
719    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
720        dst[0]=src[0];
721        dst[1]=src[1];
722        dst[2]=src[2];
723        dst[3]=src[3];
724        src+=UNROLL;
725        dst+=UNROLL;
726        if(src >= src_end)
727            goto TAIL;
728    }
729
730    //Handle remainder.
731    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
732        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
733            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
734        }
735    }
736
737    if( src >= src_end)
738        goto TAIL;
739    if(*src <= ALPHA_TRANS)
740        goto ALPHA_0;
741    else
742        goto ALPHA_1_TO_254;
743
744TAIL:
745    /* do any residual iterations */
746    src_end += UNROLL + 1;  // advance src_end to the real end
747    while(src != src_end) {
748        if( *src != 0 ) {
749            if( *src >= ALPHA_OPAQ ) {
750                *dst = *src;
751            }
752            else {
753                *dst = SkPMSrcOver(*src, *dst);
754            }
755        }
756        src++;
757        dst++;
758    }
759
760#undef    UNROLL
761    return;
762}
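
/* The state machine above depends on the alpha byte being the most
 * significant byte of an SkPMColor (SK_A32_SHIFT == 24; see the proc table at
 * the bottom of this file), so whole-word compares classify pixels.
 * Hypothetical helpers, guarded out.
 */
#if 0
static inline int is_fully_transparent_sketch(SkPMColor c) { return c <= 0x00FFFFFF; } // alpha == 0
static inline int is_fully_opaque_sketch(SkPMColor c)      { return c >= 0xFF000000; } // alpha == 255
#endif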
763
764/* Neon version of S32_Blend_BlitRow32()
765 * portable version is in src/core/SkBlitRow_D32.cpp
766 */
767void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
768                              const SkPMColor* SK_RESTRICT src,
769                              int count, U8CPU alpha) {
770    SkASSERT(alpha <= 255);
771    if (count > 0) {
772        uint16_t src_scale = SkAlpha255To256(alpha);
773        uint16_t dst_scale = 256 - src_scale;
774
775    /* run them N at a time through the NEON unit */
776    /* note that each 1 is 4 bytes, each treated exactly the same,
777     * so we can work under that guise. We *do* know that the src&dst
778     * will be 32-bit aligned quantities, so we can specify that on
779     * the load/store ops and do a neon 'reinterpret' to get us to
780     * byte-sized (pun intended) pieces that we widen/multiply/shift
781     * we're limited at 128 bits in the wide ops, which is 8x16bits
782     * or a pair of 32 bit src/dsts.
783     */
784    /* we *could* manually unroll this loop so that we load 128 bits
785     * (as a pair of 64s) from each of src and dst, processing them
786     * in pieces. This might give us a little better management of
787     * the memory latency, but my initial attempts here did not
788     * produce an instruction stream that looked all that nice.
789     */
790#define    UNROLL    2
791    while (count >= UNROLL) {
792        uint8x8_t  src_raw, dst_raw, dst_final;
793        uint16x8_t  src_wide, dst_wide;
794
795        /* get 64 bits of src, widen it, multiply by src_scale */
796        src_raw = vreinterpret_u8_u32(vld1_u32(src));
797        src_wide = vmovl_u8(src_raw);
798        /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
799        src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
800
801        /* ditto with dst */
802        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
803        dst_wide = vmovl_u8(dst_raw);
804
805        /* combine add with dst multiply into mul-accumulate */
806        dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
807
808        dst_final = vshrn_n_u16(dst_wide, 8);
809        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
810
811        src += UNROLL;
812        dst += UNROLL;
813        count -= UNROLL;
814    }
815    /* RBE: well, I don't like how gcc manages src/dst across the above
816     * loop: it's constantly calculating src+bias, dst+bias and it only
817     * adjusts the real ones when we leave the loop. Not sure why
818     * it's "hoisting down" (hoisting implies above in my lexicon ;))
819     * the adjustments to src/dst/count, but it does...
820     * (might be SSA-style internal logic...)
821     */
822
823#if    UNROLL == 2
824    if (count == 1) {
825            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
826    }
827#else
828    if (count > 0) {
829            do {
830                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
831                src += 1;
832                dst += 1;
833            } while (--count > 0);
834    }
835#endif
836
837#undef    UNROLL
838    }
839}
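
/* Per 8-bit lane, the NEON loop above computes the same value as the scalar
 * tail: (src * src_scale + dst * dst_scale) >> 8, with
 * src_scale = SkAlpha255To256(alpha), i.e. alpha + 1, and
 * dst_scale = 256 - src_scale. Hypothetical helper, guarded out.
 */
#if 0
static inline unsigned blend32_channel_sketch(unsigned src_c, unsigned dst_c,
                                              unsigned src_scale, unsigned dst_scale) {
    return (src_c * src_scale + dst_c * dst_scale) >> 8;  // vmul + vmla + vshrn above
}
#endif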
840
841void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
842                         const SkPMColor* SK_RESTRICT src,
843                         int count, U8CPU alpha) {
844
845    SkASSERT(255 >= alpha);
846
847    if (count <= 0) {
848        return;
849    }
850
851    unsigned alpha256 = SkAlpha255To256(alpha);
852
853    // First deal with odd counts
854    if (count & 1) {
855        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
856        uint16x8_t vdst_wide, vsrc_wide;
857        unsigned dst_scale;
858
859        // Load
860        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
861        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
862
863        // Calc dst_scale
864        dst_scale = vget_lane_u8(vsrc, 3);
865        dst_scale *= alpha256;
866        dst_scale >>= 8;
867        dst_scale = 256 - dst_scale;
868
869        // Process src
870        vsrc_wide = vmovl_u8(vsrc);
871        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
872
873        // Process dst
874        vdst_wide = vmovl_u8(vdst);
875        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
876
877        // Combine
878        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
879
880        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
881        dst++;
882        src++;
883        count--;
884    }
885
886    if (count) {
887        uint8x8_t alpha_mask;
888        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
889        alpha_mask = vld1_u8(alpha_mask_setup);
890
891        do {
892
893            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
894            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
895
896            __builtin_prefetch(src+32);
897            __builtin_prefetch(dst+32);
898
899            // Load
900            vsrc = vreinterpret_u8_u32(vld1_u32(src));
901            vdst = vreinterpret_u8_u32(vld1_u32(dst));
902
903            // Prepare src_scale
904            vsrc_scale = vdupq_n_u16(alpha256);
905
906            // Calc dst_scale
907            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
908            vdst_scale = vmovl_u8(vsrc_alphas);
909            vdst_scale *= vsrc_scale;
910            vdst_scale = vshrq_n_u16(vdst_scale, 8);
911            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
912
913            // Process src
914            vsrc_wide = vmovl_u8(vsrc);
915            vsrc_wide *= vsrc_scale;
916
917            // Process dst
918            vdst_wide = vmovl_u8(vdst);
919            vdst_wide *= vdst_scale;
920
921            // Combine
922            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
923
924            vst1_u32(dst, vreinterpret_u32_u8(vres));
925
926            src += 2;
927            dst += 2;
928            count -= 2;
929        } while(count);
930    }
931}
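
/* Scalar view of one pixel from the loops above: the source is scaled by the
 * global alpha (as a 0..256 factor) and the destination by whatever the
 * scaled source alpha leaves over. Hypothetical helper, guarded out.
 */
#if 0
static inline unsigned blend32a_channel_sketch(unsigned src_c, unsigned dst_c,
                                               unsigned src_a, unsigned alpha256) {
    unsigned dst_scale = 256 - ((src_a * alpha256) >> 8);  // same as the vdst_scale calculation
    return ((src_c * alpha256) >> 8) + ((dst_c * dst_scale) >> 8);
}
#endif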
932
933///////////////////////////////////////////////////////////////////////////////
934
935#undef    DEBUG_OPAQUE_DITHER
936
937#if    defined(DEBUG_OPAQUE_DITHER)
938static void showme8(char *str, void *p, int len)
939{
940    static char buf[256];
941    char tbuf[32];
942    int i;
943    char *pc = (char*) p;
944    sprintf(buf,"%8s:", str);
945    for(i=0;i<len;i++) {
946        sprintf(tbuf, "   %02x", pc[i]);
947        strcat(buf, tbuf);
948    }
949    SkDebugf("%s\n", buf);
950}
951static void showme16(char *str, void *p, int len)
952{
953    static char buf[256];
954    char tbuf[32];
955    int i;
956    uint16_t *pc = (uint16_t*) p;
957    sprintf(buf,"%8s:", str);
958    len = (len / sizeof(uint16_t));    /* passed as bytes */
959    for(i=0;i<len;i++) {
960        sprintf(tbuf, " %04x", pc[i]);
961        strcat(buf, tbuf);
962    }
963    SkDebugf("%s\n", buf);
964}
965#endif
966
967void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
968                                   const SkPMColor* SK_RESTRICT src,
969                                   int count, U8CPU alpha, int x, int y) {
970    SkASSERT(255 == alpha);
971
972#define    UNROLL    8
973
974    if (count >= UNROLL) {
975    uint8x8_t dbase;
976
977#if    defined(DEBUG_OPAQUE_DITHER)
978    uint16_t tmpbuf[UNROLL];
979    int td[UNROLL];
980    int tdv[UNROLL];
981    int ta[UNROLL];
982    int tap[UNROLL];
983    uint16_t in_dst[UNROLL];
984    int offset = 0;
985    int noisy = 0;
986#endif
987
988    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
989    dbase = vld1_u8(dstart);
990
991        do {
992        uint8x8_t sr, sg, sb, sa, d;
993        uint16x8_t dst8, scale8, alpha8;
994        uint16x8_t dst_r, dst_g, dst_b;
995
996#if    defined(DEBUG_OPAQUE_DITHER)
997    /* calculate 8 elements worth into a temp buffer */
998    {
999      int my_y = y;
1000      int my_x = x;
1001      SkPMColor* my_src = (SkPMColor*)src;
1002      uint16_t* my_dst = dst;
1003      int i;
1004
1005          DITHER_565_SCAN(my_y);
1006          for(i=0;i<UNROLL;i++) {
1007            SkPMColor c = *my_src++;
1008            SkPMColorAssert(c);
1009            if (c) {
1010                unsigned a = SkGetPackedA32(c);
1011
1012                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1013        tdv[i] = DITHER_VALUE(my_x);
1014        ta[i] = a;
1015        tap[i] = SkAlpha255To256(a);
1016        td[i] = d;
1017
1018                unsigned sr = SkGetPackedR32(c);
1019                unsigned sg = SkGetPackedG32(c);
1020                unsigned sb = SkGetPackedB32(c);
1021                sr = SkDITHER_R32_FOR_565(sr, d);
1022                sg = SkDITHER_G32_FOR_565(sg, d);
1023                sb = SkDITHER_B32_FOR_565(sb, d);
1024
1025                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1026                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1027                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1028                // now src and dst expanded are in g:11 r:10 x:1 b:10
1029                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1030        td[i] = d;
1031
1032            } else {
1033        tmpbuf[i] = *my_dst;
1034        ta[i] = tdv[i] = td[i] = 0xbeef;
1035        }
1036        in_dst[i] = *my_dst;
1037            my_dst += 1;
1038            DITHER_INC_X(my_x);
1039          }
1040    }
1041#endif
1042
1043        /* source is in ABGR */
1044        {
1045        register uint8x8_t d0 asm("d0");
1046        register uint8x8_t d1 asm("d1");
1047        register uint8x8_t d2 asm("d2");
1048        register uint8x8_t d3 asm("d3");
1049
1050        asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1051            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
1052            : "r" (src)
1053                    );
1054            sr = d0; sg = d1; sb = d2; sa = d3;
1055        }
1056
1057        /* calculate 'd', which will be 0..7 */
1058        /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
1059#if defined(SK_BUILD_FOR_ANDROID)
1060        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1061        alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
1062#else
1063        alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
1064#endif
1065        alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
1066        d = vshrn_n_u16(alpha8, 8);    /* narrowing too */
1067
1068        /* sr = sr - (sr>>5) + d */
1069        /* watching for 8-bit overflow.  d is 0..7; risky range of
1070         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1071         * safe  as long as we do ((sr-sr>>5) + d) */
1072        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1073        sr = vadd_u8(sr, d);
1074
1075        /* sb = sb - (sb>>5) + d */
1076        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1077        sb = vadd_u8(sb, d);
1078
1079        /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
1080        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1081        sg = vadd_u8(sg, vshr_n_u8(d,1));
1082
1083        /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
1084        dst8 = vld1q_u16(dst);
1085        dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
1086        dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
1087        dst_r = vshrq_n_u16(dst8,11);    /* clearing hi bits */
1088
1089        /* blend */
1090#if 1
1091        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1092        /* originally 255-sa + 1 */
1093        scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1094#else
1095        scale8 = vsubw_u8(vdupq_n_u16(255), sa);
1096        scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
1097#endif
1098
1099#if 1
1100        /* combine the addq and mul, save 3 insns */
1101        scale8 = vshrq_n_u16(scale8, 3);
1102        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1103        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1104        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1105#else
1106        /* known correct, but +3 insns over above */
1107        scale8 = vshrq_n_u16(scale8, 3);
1108        dst_b = vmulq_u16(dst_b, scale8);
1109        dst_g = vmulq_u16(dst_g, scale8);
1110        dst_r = vmulq_u16(dst_r, scale8);
1111
1112        /* combine */
1113        /* NB: vshll widens, need to preserve those bits */
1114        dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
1115        dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
1116        dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
1117#endif
1118
1119        /* repack to store */
1120        dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
1121        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1122        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1123
1124        vst1q_u16(dst, dst8);
1125
1126#if    defined(DEBUG_OPAQUE_DITHER)
1127        /* verify my 8 elements match the temp buffer */
1128    {
1129       int i, bad=0;
1130       static int invocation;
1131
1132       for (i=0;i<UNROLL;i++)
1133        if (tmpbuf[i] != dst[i]) bad=1;
1134       if (bad) {
1135        SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1136            invocation, offset);
1137        SkDebugf("  alpha 0x%x\n", alpha);
1138        for (i=0;i<UNROLL;i++)
1139            SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1140            i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
1141            dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
1142
1143        showme16("alpha8", &alpha8, sizeof(alpha8));
1144        showme16("scale8", &scale8, sizeof(scale8));
1145        showme8("d", &d, sizeof(d));
1146        showme16("dst8", &dst8, sizeof(dst8));
1147        showme16("dst_b", &dst_b, sizeof(dst_b));
1148        showme16("dst_g", &dst_g, sizeof(dst_g));
1149        showme16("dst_r", &dst_r, sizeof(dst_r));
1150        showme8("sb", &sb, sizeof(sb));
1151        showme8("sg", &sg, sizeof(sg));
1152        showme8("sr", &sr, sizeof(sr));
1153
1154        /* cop out */
1155        return;
1156       }
1157       offset += UNROLL;
1158       invocation++;
1159    }
1160#endif
1161
1162            dst += UNROLL;
1163        src += UNROLL;
1164        count -= UNROLL;
1165        /* skip x += UNROLL, since it's unchanged mod-4 */
1166        } while (count >= UNROLL);
1167    }
1168#undef    UNROLL
1169
1170    /* residuals */
1171    if (count > 0) {
1172        DITHER_565_SCAN(y);
1173        do {
1174            SkPMColor c = *src++;
1175            SkPMColorAssert(c);
1176            if (c) {
1177                unsigned a = SkGetPackedA32(c);
1178
1179                // dither and alpha are just temporary variables to work around
1180                // an ICE in debug builds.
1181                unsigned dither = DITHER_VALUE(x);
1182                unsigned alpha = SkAlpha255To256(a);
1183                int d = SkAlphaMul(dither, alpha);
1184
1185                unsigned sr = SkGetPackedR32(c);
1186                unsigned sg = SkGetPackedG32(c);
1187                unsigned sb = SkGetPackedB32(c);
1188                sr = SkDITHER_R32_FOR_565(sr, d);
1189                sg = SkDITHER_G32_FOR_565(sg, d);
1190                sb = SkDITHER_B32_FOR_565(sb, d);
1191
1192                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1193                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1194                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1195                // now src and dst expanded are in g:11 r:10 x:1 b:10
1196                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1197            }
1198            dst += 1;
1199            DITHER_INC_X(x);
1200        } while (--count != 0);
1201    }
1202}
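
/* The dither adjustment used above in scalar form, with the overflow argument
 * from the inline comments made explicit: d is 0..7 and subtracting (c >> 5)
 * first keeps everything within 8 bits. Green subtracts (c >> 6) and adds
 * (d >> 1) instead. Hypothetical helper, guarded out.
 */
#if 0
static inline unsigned dither_channel_sketch(unsigned c8, unsigned d /* 0..7 */) {
    return (c8 - (c8 >> 5)) + d;   // at most 248 + 7, so no 8-bit overflow
}
#endif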
1203
1204///////////////////////////////////////////////////////////////////////////////
1205
1206#undef    DEBUG_S32_OPAQUE_DITHER
1207
1208void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1209                                 const SkPMColor* SK_RESTRICT src,
1210                                 int count, U8CPU alpha, int x, int y) {
1211    SkASSERT(255 == alpha);
1212
1213#define    UNROLL    8
1214    if (count >= UNROLL) {
1215    uint8x8_t d;
1216    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1217    d = vld1_u8(dstart);
1218
1219    while (count >= UNROLL) {
1220        uint8x8_t sr, sg, sb;
1221        uint16x8_t dr, dg, db;
1222        uint16x8_t dst8;
1223
1224        {
1225        register uint8x8_t d0 asm("d0");
1226        register uint8x8_t d1 asm("d1");
1227        register uint8x8_t d2 asm("d2");
1228        register uint8x8_t d3 asm("d3");
1229
1230        asm (
1231            "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1232            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1233            :
1234        );
1235        sg = d1;
1236#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
1237        sr = d2; sb = d0;
1238#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
1239        sr = d0; sb = d2;
1240#endif
1241        }
1242        /* XXX: if we want to prefetch, hide it in the above asm().
1243         * Using the gcc __builtin_prefetch(), the prefetch will
1244         * fall to the bottom of the loop -- it won't stick up
1245         * at the top of the loop, just after the vld4.
1246         */
1247
1248        // sr = sr - (sr>>5) + d
1249        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1250        dr = vaddl_u8(sr, d);
1251
1252        // sb = sb - (sb>>5) + d
1253        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1254        db = vaddl_u8(sb, d);
1255
1256        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1257        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1258        dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1259
1260        // pack high bits of each into 565 format  (rgb, b is lsb)
1261        dst8 = vshrq_n_u16(db, 3);
1262        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1263        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1264
1265        // store it
1266        vst1q_u16(dst, dst8);
1267
1268#if    defined(DEBUG_S32_OPAQUE_DITHER)
1269        // always good to know if we generated good results
1270        {
1271        int i, myx = x, myy = y;
1272        DITHER_565_SCAN(myy);
1273        for (i=0;i<UNROLL;i++) {
1274            // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
1275            SkPMColor c = src[i-8];
1276            unsigned dither = DITHER_VALUE(myx);
1277            uint16_t val = SkDitherRGB32To565(c, dither);
1278            if (val != dst[i]) {
1279            SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1280                c, dither, val, dst[i], dstart[i]);
1281            }
1282            DITHER_INC_X(myx);
1283        }
1284        }
1285#endif
1286
1287        dst += UNROLL;
1288        // we don't need to increment src as the asm above has already done it
1289        count -= UNROLL;
1290        x += UNROLL;        // probably superfluous
1291    }
1292    }
1293#undef    UNROLL
1294
1295    // residuals
1296    if (count > 0) {
1297        DITHER_565_SCAN(y);
1298        do {
1299            SkPMColor c = *src++;
1300            SkPMColorAssert(c);
1301            SkASSERT(SkGetPackedA32(c) == 255);
1302
1303            unsigned dither = DITHER_VALUE(x);
1304            *dst++ = SkDitherRGB32To565(c, dither);
1305            DITHER_INC_X(x);
1306        } while (--count != 0);
1307    }
1308}
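
/* One lane of the vector loop above performs the same arithmetic as the
 * SkDitherRGB32To565() call in the residual loop: dither each channel, then
 * keep its top 5/6/5 bits. Hypothetical, guarded-out sketch.
 */
#if 0
static inline uint16_t dither_32_to_565_sketch(unsigned r, unsigned g, unsigned b, unsigned d) {
    r = r - (r >> 5) + d;          // widened to 16 bits by vaddl in the NEON code
    g = g - (g >> 6) + (d >> 1);
    b = b - (b >> 5) + d;
    return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}
#endif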
1309
1310void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
1311                      SkPMColor color) {
1312    if (count <= 0) {
1313        return;
1314    }
1315
1316    if (0 == color) {
1317        if (src != dst) {
1318            memcpy(dst, src, count * sizeof(SkPMColor));
1319        }
1320        return;
1321    }
1322
1323    unsigned colorA = SkGetPackedA32(color);
1324    if (255 == colorA) {
1325        sk_memset32(dst, color, count);
1326    } else {
1327        unsigned scale = 256 - SkAlpha255To256(colorA);
1328
1329        if (count >= 8) {
1330            // at the end of this assembly, count will have been decremented
1331            // to a negative value. That is, if count mod 8 = x, it will be
1332            // -8 +x coming out.
1333            asm volatile (
1334                PLD128(src, 0)
1335
1336                "vdup.32    q0, %[color]                \n\t"
1337
1338                PLD128(src, 128)
1339
1340                // scale is in the numerical interval [0, 255], so load it as 8 bits
1341                "vdup.8     d2, %[scale]                \n\t"
1342
1343                PLD128(src, 256)
1344
1345                "subs       %[count], %[count], #8      \n\t"
1346
1347                PLD128(src, 384)
1348
1349                "Loop_Color32:                          \n\t"
1350
1351                // load src color, 8 pixels, 4 64 bit registers
1352                // (and increment src).
1353                "vld1.32    {d4-d7}, [%[src]]!          \n\t"
1354
1355                PLD128(src, 384)
1356
1357                // multiply long by scale, 64 bits at a time,
1358                // destination into a 128 bit register.
1359                "vmull.u8   q4, d4, d2                  \n\t"
1360                "vmull.u8   q5, d5, d2                  \n\t"
1361                "vmull.u8   q6, d6, d2                  \n\t"
1362                "vmull.u8   q7, d7, d2                  \n\t"
1363
1364                // shift the 128 bit registers, containing the 16
1365                // bit scaled values back to 8 bits, narrowing the
1366                // results to 64 bit registers.
1367                "vshrn.i16  d8, q4, #8                  \n\t"
1368                "vshrn.i16  d9, q5, #8                  \n\t"
1369                "vshrn.i16  d10, q6, #8                 \n\t"
1370                "vshrn.i16  d11, q7, #8                 \n\t"
1371
1372                // adding back the color, using 128 bit registers.
1373                "vadd.i8    q6, q4, q0                  \n\t"
1374                "vadd.i8    q7, q5, q0                  \n\t"
1375
1376                // store back the 8 calculated pixels (2 128 bit
1377                // registers), and increment dst.
1378                "vst1.32    {d12-d15}, [%[dst]]!        \n\t"
1379
1380                "subs       %[count], %[count], #8      \n\t"
1381                "bge        Loop_Color32                \n\t"
1382                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
1383                : [color] "r" (color), [scale] "r" (scale)
1384                : "cc", "memory",
1385                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1386                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
1387                          );
1388            // At this point, if we went through the inline assembly, count is
1389            // a negative value:
1390            // if the value is -8, there is no pixel left to process.
1391            // if the value is -7, there is one pixel left to process
1392            // ...
1393            // And'ing it with 7 will give us the number of pixels
1394            // left to process.
1395            count = count & 0x7;
1396        }
1397
1398        while (count > 0) {
1399            *dst = color + SkAlphaMulQ(*src, scale);
1400            src += 1;
1401            dst += 1;
1402            count--;
1403        }
1404    }
1405}
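
/* For 0 < colorA < 255 each pixel above ends up as
 * color + ((src * scale) >> 8) per 8-bit lane, with scale = 256 - (colorA + 1);
 * the plain (wrapping) vadd.i8 is enough because the color is premultiplied by
 * its alpha, so the per-lane sum cannot exceed 255. Hypothetical helper,
 * guarded out.
 */
#if 0
static inline unsigned color32_channel_sketch(unsigned color_c, unsigned src_c, unsigned scale) {
    return color_c + ((src_c * scale) >> 8);   // vmull.u8 + vshrn + vadd.i8 above
}
#endif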
1406
1407///////////////////////////////////////////////////////////////////////////////
1408
1409const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1410    // no dither
1411    // NOTE: For the S32_D565_Blend function below, we don't have a special
1412    //       version that assumes that each source pixel is opaque. But our
1413    //       S32A is still faster than the default, so use it.
1414    S32_D565_Opaque_neon,
1415    S32A_D565_Blend_neon,   // really S32_D565_Blend
1416    S32A_D565_Opaque_neon,
1417    S32A_D565_Blend_neon,
1418
1419    // dither
1420    S32_D565_Opaque_Dither_neon,
1421    S32_D565_Blend_Dither_neon,
1422    S32A_D565_Opaque_Dither_neon,
1423    NULL,   // S32A_D565_Blend_Dither
1424};
1425
1426const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1427    NULL,   // S32_Opaque,
1428    S32_Blend_BlitRow32_neon,        // S32_Blend,
1429    /*
1430     * We have two choices for S32A_Opaque procs. One of them reads the src alpha
1431     * value and attempts to optimize accordingly.  The optimization is
1432     * sensitive to the source content and is not a win in all cases. For
1433     * example, if there are a lot of transitions between the alpha states,
1434     * the performance will almost certainly be worse.  However, for many
1435     * common cases the performance is equivalent or better than the standard
1436     * case where we do not inspect the src alpha.
1437     */
1438#if SK_A32_SHIFT == 24
1439    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
1440    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
1441#else
1442    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
1443#endif
1444    S32A_Blend_BlitRow32_neon        // S32A_Blend
1445};
1446