// SkBlitRow_opts_arm_neon.cpp revision c2050e3a3ecfb8738b36e2add15c526e8e0f21fe
/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkCachePreload_arm.h"

#include <arm_neon.h>

void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      // Thumb does not support the standard ARM conditional
                      // instructions but instead requires the 'it' instruction
                      // to signal conditional execution
                      "it eq                                  \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzpq.u16  q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}
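
/* A rough scalar sketch (kept out of the build) of what each lane of the NEON
 * loops above computes: expand the 565 destination pixel to 888, scale it by
 * (255 - src alpha), add the premultiplied source channels, and repack to 565.
 * The helper name is illustrative only, and SkMulDiv255Round stands in for the
 * vaddhn-based divide-by-255 and the saturation done by vqadd, so results may
 * differ slightly from the assembly above.
 */
#if 0
static inline uint16_t S32A_D565_Opaque_pixel_sketch(SkPMColor c, uint16_t d) {
    unsigned invA = 255 - SkGetPackedA32(c);
    // expand each 565 channel to 8 bits, scale by (255 - a), then add the src channel
    unsigned r = SkGetPackedR32(c) + SkMulDiv255Round(SkGetPackedR16(d) << 3, invA);
    unsigned g = SkGetPackedG32(c) + SkMulDiv255Round(SkGetPackedG16(d) << 2, invA);
    unsigned b = SkGetPackedB32(c) + SkMulDiv255Round(SkGetPackedB16(d) << 3, invA);
    return SkPackRGB16(r >> 3, g >> 2, b >> 3);
}
#endif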

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {

    U8CPU alpha_for_asm = alpha;

    asm volatile (
    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
     * the original in two respects:
     *  1. The results have a few mismatches compared to the original code. These mismatches
     *     never exceed 1. It's possible to improve accuracy vs. a floating point
     *     implementation by introducing rounding right shifts (vrshr) for the final stage.
     *     Rounding is not present in the code below, because although results would be closer
     *     to a floating point implementation, the number of mismatches compared to the
     *     original code would be far greater.
     *  2. On certain inputs, the original code can overflow, causing colour channels to
     *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
     *     to affect another.
     */

#if 1
        /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
                  "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
#else
                  "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
#endif
                  "vmov.u16   q3, #255                        \n\t"   // set up constant
                  "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
                  "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
                  "beq        2f                              \n\t"   // if count8 == 0, exit
                  "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask

                  "1:                                             \n\t"
                  "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
                  "subs       r4, r4, #1                      \n\t"   // decrement loop counter
                  "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
                  //  and deinterleave

                  "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
                  "vand       q10, q0, q15                    \n\t"   // extract blue
                  "vshr.u16   q8, q0, #11                     \n\t"   // extract red
                  "vshr.u16   q9, q9, #10                     \n\t"   // extract green
                  // dstrgb = {q8, q9, q10}

                  "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
                  "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
                  "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range

                  "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
                  "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
                  "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
                  "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
                  // srcrgba = {q11, q12, q13, q14}

                  "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
                  "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
                  "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
                  "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale

                  "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
                  "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
                  // dst_scale = q2

                  "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
                  "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
                  "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale

#if 1
    // trying for a better match with SkDiv255Round(a)
    // C alg is:  a+=128; (a+a>>8)>>8
    // we'll use just a rounding shift [q2 is available for scratch]
                  "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
                  "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
                  "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#else
    // arm's original "truncating divide by 256"
                  "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
                  "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
                  "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#endif

                  "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
                  "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
                  "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr

                  "bne        1b                              \n\t"   // if counter != 0, loop
                  "2:                                             \n\t"   // exit

                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
                  :
                  : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
                  );

    count &= 7;
    if (count > 0) {
        do {
            SkPMColor sc = *src++;
            if (sc) {
                uint16_t dc = *dst;
                unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
                unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
                unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
                unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
                *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
            }
            dst += 1;
        } while (--count != 0);
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
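/* For example, with (x, y) == (2, 1) the blit functions below start at row 1,
 * offset 2, and the 8-byte NEON load picks up {7, 3, 6, 2, 7, 3, 6, 2}; the
 * repeated columns are what keep any offset 0..3 entirely within one row.
 */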

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{
    /* select row and offset for dither array */
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

    /* rescale alpha to range 0 - 256 */
    int scale = SkAlpha255To256(alpha);

    asm volatile (
                  "vld1.8         {d31}, [%[dstart]]              \n\t"   // load dither values
                  "vshr.u8        d30, d31, #1                    \n\t"   // calc. green dither values
                  "vdup.16        d6, %[scale]                    \n\t"   // duplicate scale into neon reg
                  "vmov.i8        d29, #0x3f                      \n\t"   // set up green mask
                  "vmov.i8        d28, #0x1f                      \n\t"   // set up blue mask
                  "1:                                                 \n\t"
                  "vld4.8         {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
                  "vshr.u8        d22, d0, #5                     \n\t"   // calc. red >> 5
                  "vshr.u8        d23, d1, #6                     \n\t"   // calc. green >> 6
                  "vshr.u8        d24, d2, #5                     \n\t"   // calc. blue >> 5
                  "vaddl.u8       q8, d0, d31                     \n\t"   // add in dither to red and widen
                  "vaddl.u8       q9, d1, d30                     \n\t"   // add in dither to green and widen
                  "vaddl.u8       q10, d2, d31                    \n\t"   // add in dither to blue and widen
                  "vsubw.u8       q8, q8, d22                     \n\t"   // sub shifted red from result
                  "vsubw.u8       q9, q9, d23                     \n\t"   // sub shifted green from result
                  "vsubw.u8       q10, q10, d24                   \n\t"   // sub shifted blue from result
                  "vshrn.i16      d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
                  "vshrn.i16      d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
                  "vshrn.i16      d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
                  // load 8 pixels from dst, extract rgb
                  "vld1.16        {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
                  "vshrn.i16      d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
                  "vmovn.i16      d18, q0                         \n\t"   // narrow to get blue as bytes
                  "vshr.u16       q0, q0, #11                     \n\t"   // shift down to extract red
                  "vand           d17, d17, d29                   \n\t"   // and green with green mask
                  "vand           d18, d18, d28                   \n\t"   // and blue with blue mask
                  "vmovn.i16      d16, q0                         \n\t"   // narrow to get red as bytes
                  // src = {d22 (r), d23 (g), d24 (b)}
                  // dst = {d16 (r), d17 (g), d18 (b)}
                  // subtract dst from src and widen
                  "vsubl.s8       q0, d22, d16                    \n\t"   // red:   src - dst
                  "vsubl.s8       q1, d23, d17                    \n\t"   // green: src - dst
                  "vsubl.s8       q2, d24, d18                    \n\t"   // blue:  src - dst
                  // multiply diffs by scale and shift
                  "vmul.i16       q0, q0, d6[0]                   \n\t"   // multiply red by scale
                  "vmul.i16       q1, q1, d6[0]                   \n\t"   // multiply green by scale
                  "vmul.i16       q2, q2, d6[0]                   \n\t"   // multiply blue by scale
                  "subs           %[count], %[count], #8          \n\t"   // decrement loop counter
                  "vshrn.i16      d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
                  "vshrn.i16      d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
                  "vshrn.i16      d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
                  // add dst to result
                  "vaddl.s8       q0, d0, d16                     \n\t"   // add dst to red
                  "vaddl.s8       q1, d2, d17                     \n\t"   // add dst to green
                  "vaddl.s8       q2, d4, d18                     \n\t"   // add dst to blue
                  // put result into 565 format
                  "vsli.i16       q2, q1, #5                      \n\t"   // shift up green and insert into blue
                  "vsli.i16       q2, q0, #11                     \n\t"   // shift up red and insert into blue
                  "vst1.16        {d4, d5}, [%[dst]]!             \n\t"   // store result
                  "bgt            1b                              \n\t"   // loop if count > 0
                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                  : [dstart] "r" (dstart), [scale] "r" (scale)
                  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                  );

    DITHER_565_SCAN(y);

    while((count & 7) > 0)
    {
        SkPMColor c = *src++;

        int dither = DITHER_VALUE(x);
        int sr = SkGetPackedR32(c);
        int sg = SkGetPackedG32(c);
        int sb = SkGetPackedB32(c);
        sr = SkDITHER_R32To565(sr, dither);
        sg = SkDITHER_G32To565(sg, dither);
        sb = SkDITHER_B32To565(sb, dither);

        uint16_t d = *dst;
        *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                             SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                             SkAlphaBlend(sb, SkGetPackedB16(d), scale));
        DITHER_INC_X(x);
        count--;
    }
}

void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {

    uint8x8_t alpha_mask;

    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    /* do the NEON unrolled code */
#define    UNROLL    4
    while (count >= UNROLL) {
        uint8x8_t src_raw, dst_raw, dst_final;
        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

        /* The two prefetches below may make the code slightly
         * slower for small values of count but are worth having
         * in the general case.
         */
        __builtin_prefetch(src+32);
        __builtin_prefetch(dst+32);

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if    UNROLL > 2
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if    UNROLL > 2
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

    /* 1st and 2nd bits of the unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);
    }

#if    UNROLL > 2
    /* the 3rd and 4th bits of our unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
    }
#endif

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if    UNROLL > 2
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

        src += UNROLL;
        dst += UNROLL;
        count -= UNROLL;
    }
#undef    UNROLL

    /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}

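/* A rough scalar sketch (kept out of the build) of the per-byte-lane math
 * performed by the intrinsics above and reused by the src-alpha variant that
 * follows: each destination byte is scaled by (256 - src alpha) and the
 * premultiplied source byte is added, ignoring byte-lane overflow just as
 * vadd_u8 does. The helper name is illustrative; it assumes SK_A32_SHIFT == 24
 * so that the {3,3,3,3,7,7,7,7} vtbl mask really does broadcast each pixel's
 * alpha byte across its four lanes.
 */
#if 0
static inline SkPMColor srcover_lane_sketch(SkPMColor s, SkPMColor d) {
    unsigned scale = 256 - SkGetPackedA32(s);  // matches vsubw_u8(vdupq_n_u16(256), alpha)
    SkPMColor result = 0;
    for (int shift = 0; shift < 32; shift += 8) {
        unsigned dst_lane = (((d >> shift) & 0xFF) * scale) >> 8;  // vmulq_u16 + vshrn_n_u16
        unsigned src_lane = (s >> shift) & 0xFF;
        result |= ((src_lane + dst_lane) & 0xFF) << shift;         // vadd_u8, overflow ignored
    }
    return result;
}
#endif
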
void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
    return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;

#define UNROLL  4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
        it might make sense to go to the optimized loops */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);

    if (src >= src_end)
        goto TAIL;

    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /* fall-thru */

ALPHA_0:

    /* In this state, we know the current alpha is 0 and
     we optimize for the next alpha also being zero. */
    src_temp = src;  //so we don't have to increment dst every time
    do {
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
    } while(src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end)
        goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0]=src[0];
        dst[1]=src[1];
        dst[2]=src[2];
        dst[3]=src[3];
        src+=UNROLL;
        dst+=UNROLL;
        if(src >= src_end)
            goto TAIL;
    }

    //Handle remainder.
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  //goto the real end
    while(src != src_end) {
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef    UNROLL
    return;
}

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    if (count > 0) {
        uint16_t src_scale = SkAlpha255To256(alpha);
        uint16_t dst_scale = 256 - src_scale;

    /* run them N at a time through the NEON unit */
    /* note that each 1 is 4 bytes, each treated exactly the same,
     * so we can work under that guise. We *do* know that the src&dst
     * will be 32-bit aligned quantities, so we can specify that on
     * the load/store ops and do a neon 'reinterpret' to get us to
     * byte-sized (pun intended) pieces that we widen/multiply/shift.
     * We're limited to 128 bits in the wide ops, which is 8x16 bits
     * or a pair of 32 bit src/dsts.
     */
    /* we *could* manually unroll this loop so that we load 128 bits
     * (as a pair of 64s) from each of src and dst, processing them
     * in pieces. This might give us a little better management of
     * the memory latency, but my initial attempts here did not
     * produce an instruction stream that looked all that nice.
     */
#define    UNROLL    2
    while (count >= UNROLL) {
        uint8x8_t  src_raw, dst_raw, dst_final;
        uint16x8_t  src_wide, dst_wide;

        /* get 64 bits of src, widen it, multiply by src_scale */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_wide = vmovl_u8(src_raw);
        /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
        src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));

        /* ditto with dst */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_wide = vmovl_u8(dst_raw);

        /* combine add with dst multiply into mul-accumulate */
        dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));

        dst_final = vshrn_n_u16(dst_wide, 8);
        vst1_u32(dst, vreinterpret_u32_u8(dst_final));

        src += UNROLL;
        dst += UNROLL;
        count -= UNROLL;
    }
    /* RBE: well, I don't like how gcc manages src/dst across the above
     * loop; it's constantly calculating src+bias, dst+bias and it only
     * adjusts the real ones when we leave the loop. Not sure why
     * it's "hoisting down" (hoisting implies above in my lexicon ;))
     * the adjustments to src/dst/count, but it does...
     * (might be SSA-style internal logic...)
     */

#if    UNROLL == 2
    if (count == 1) {
            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
    }
#else
    if (count > 0) {
            do {
                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
                src += 1;
                dst += 1;
            } while (--count > 0);
    }
#endif

#undef    UNROLL
    }
}
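
/* A rough scalar sketch (kept out of the build) of the byte-lane arithmetic
 * in the loop above: every byte of the 32-bit pixel, alpha included, is
 * blended identically, which is why the pixels can be treated as plain byte
 * lanes. The helper name is illustrative only.
 */
#if 0
static inline uint8_t blend_lane_sketch(uint8_t s, uint8_t d,
                                        uint16_t src_scale, uint16_t dst_scale) {
    // src_scale + dst_scale == 256, so the >>8 result fits back in 8 bits
    return (uint8_t)((s * src_scale + d * dst_scale) >> 8);
}
#endif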

///////////////////////////////////////////////////////////////////////////////

#undef    DEBUG_OPAQUE_DITHER

#if    defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, "   %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t));    /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif

void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8

    if (count >= UNROLL) {
    uint8x8_t dbase;

#if    defined(DEBUG_OPAQUE_DITHER)
    uint16_t tmpbuf[UNROLL];
    int td[UNROLL];
    int tdv[UNROLL];
    int ta[UNROLL];
    int tap[UNROLL];
    uint16_t in_dst[UNROLL];
    int offset = 0;
    int noisy = 0;
#endif

    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    dbase = vld1_u8(dstart);

        do {
        uint8x8_t sr, sg, sb, sa, d;
        uint16x8_t dst8, scale8, alpha8;
        uint16x8_t dst_r, dst_g, dst_b;

#if    defined(DEBUG_OPAQUE_DITHER)
    /* calculate 8 elements worth into a temp buffer */
    {
      int my_y = y;
      int my_x = x;
      SkPMColor* my_src = (SkPMColor*)src;
      uint16_t* my_dst = dst;
      int i;

          DITHER_565_SCAN(my_y);
          for(i=0;i<UNROLL;i++) {
            SkPMColor c = *my_src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
        tdv[i] = DITHER_VALUE(my_x);
        ta[i] = a;
        tap[i] = SkAlpha255To256(a);
        td[i] = d;

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
        td[i] = d;

            } else {
        tmpbuf[i] = *my_dst;
        ta[i] = tdv[i] = td[i] = 0xbeef;
        }
        in_dst[i] = *my_dst;
            my_dst += 1;
            DITHER_INC_X(my_x);
          }
    }
#endif

        /* source is in ABGR */
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
            : "r" (src)
                    );
            sr = d0; sg = d1; sb = d2; sa = d3;
        }

        /* calculate 'd', which will be 0..7 */
        /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
#if defined(SK_BUILD_FOR_ANDROID)
        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
        alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
#else
        alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
#endif
        alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
        d = vshrn_n_u16(alpha8, 8);    /* narrowing too */

        /* sr = sr - (sr>>5) + d */
        /* watching for 8-bit overflow.  d is 0..7; risky range of
         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
         * safe as long as we do ((sr-sr>>5) + d) */
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        sr = vadd_u8(sr, d);

        /* sb = sb - (sb>>5) + d */
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        sb = vadd_u8(sb, d);

        /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        sg = vadd_u8(sg, vshr_n_u8(d,1));

        /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
        dst8 = vld1q_u16(dst);
        dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
        dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
        dst_r = vshrq_n_u16(dst8,11);    /* clearing hi bits */

        /* blend */
#if 1
        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
        /* originally 255-sa + 1 */
        scale8 = vsubw_u8(vdupq_n_u16(256), sa);
#else
        scale8 = vsubw_u8(vdupq_n_u16(255), sa);
        scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
#endif

#if 1
        /* combine the addq and mul, save 3 insns */
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
#else
        /* known correct, but +3 insns over above */
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmulq_u16(dst_b, scale8);
        dst_g = vmulq_u16(dst_g, scale8);
        dst_r = vmulq_u16(dst_r, scale8);

        /* combine */
        /* NB: vshll widens, need to preserve those bits */
        dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
        dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
        dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
#endif

        /* repack to store */
        dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

        vst1q_u16(dst, dst8);

#if    defined(DEBUG_OPAQUE_DITHER)
        /* verify my 8 elements match the temp buffer */
    {
       int i, bad=0;
       static int invocation;

       for (i=0;i<UNROLL;i++)
        if (tmpbuf[i] != dst[i]) bad=1;
       if (bad) {
        SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
            invocation, offset);
        SkDebugf("  alpha 0x%x\n", alpha);
        for (i=0;i<UNROLL;i++)
            SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
            i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
            dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);

        showme16("alpha8", &alpha8, sizeof(alpha8));
        showme16("scale8", &scale8, sizeof(scale8));
        showme8("d", &d, sizeof(d));
        showme16("dst8", &dst8, sizeof(dst8));
        showme16("dst_b", &dst_b, sizeof(dst_b));
        showme16("dst_g", &dst_g, sizeof(dst_g));
        showme16("dst_r", &dst_r, sizeof(dst_r));
        showme8("sb", &sb, sizeof(sb));
        showme8("sg", &sg, sizeof(sg));
        showme8("sr", &sr, sizeof(sr));

        /* cop out */
        return;
       }
       offset += UNROLL;
       invocation++;
    }
#endif

            dst += UNROLL;
        src += UNROLL;
        count -= UNROLL;
        /* skip x += UNROLL, since it's unchanged mod-4 */
        } while (count >= UNROLL);
    }
#undef    UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

/* 2009/10/27: RBE says "a work in progress"; debugging says ok;
 * speedup untested, but ARM version is 26 insns/iteration and
 * this NEON version is 21 insns/iteration-of-8 (2.62 insns/element),
 * i.e. roughly a tenth of the native version's instructions per element;
 * that's pure instruction counts, not accounting for any instruction
 * or memory latencies.
 */

#undef    DEBUG_S32_OPAQUE_DITHER

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8
    if (count >= UNROLL) {
    uint8x8_t d;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    d = vld1_u8(dstart);

    while (count >= UNROLL) {
        uint8x8_t sr, sg, sb;
        uint16x8_t dr, dg, db;
        uint16x8_t dst8;

        /* source is in ABGR ordering (R == lsb) */
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
            : "r" (src)
                    );
            sr = d0; sg = d1; sb = d2;
        }
        /* XXX: if we want to prefetch, hide it in the above asm()
         * using the gcc __builtin_prefetch(), the prefetch will
         * fall to the bottom of the loop -- it won't stick up
         * at the top of the loop, just after the vld4.
         */

        /* sr = sr - (sr>>5) + d */
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        dr = vaddl_u8(sr, d);

        /* sb = sb - (sb>>5) + d */
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        db = vaddl_u8(sb, d);

        /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        dg = vaddl_u8(sg, vshr_n_u8(d,1));
        /* XXX: check that the "d>>1" here is hoisted */

        /* pack high bits of each into 565 format  (rgb, b is lsb) */
        dst8 = vshrq_n_u16(db, 3);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);

        /* store it */
        vst1q_u16(dst, dst8);

#if    defined(DEBUG_S32_OPAQUE_DITHER)
        /* always good to know if we generated good results */
        {
        int i, myx = x, myy = y;
        DITHER_565_SCAN(myy);
        for (i=0;i<UNROLL;i++) {
            SkPMColor c = src[i];
            unsigned dither = DITHER_VALUE(myx);
            uint16_t val = SkDitherRGB32To565(c, dither);
            if (val != dst[i]) {
            SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                c, dither, val, dst[i], dstart[i]);
            }
            DITHER_INC_X(myx);
        }
        }
#endif

        dst += UNROLL;
        src += UNROLL;
        count -= UNROLL;
        x += UNROLL;        /* probably superfluous */
    }
    }
#undef    UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
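
/* A rough scalar sketch (kept out of the build) of the per-pixel dithering in
 * the NEON loop above, using the same "x - (x >> 5) + d" adjustment (d is the
 * 0..7 dither value, halved for the 6-bit green channel). The helper name is
 * illustrative; the real scalar path uses SkDitherRGB32To565 instead.
 */
#if 0
static inline uint16_t dither_8888_to_565_sketch(unsigned r, unsigned g,
                                                 unsigned b, unsigned d) {
    r = r - (r >> 5) + d;         // widened, so the 9-bit intermediate is safe
    g = g - (g >> 6) + (d >> 1);
    b = b - (b >> 5) + d;
    return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}
#endif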

void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        sk_memset32(dst, color, count);
    } else {
        unsigned scale = 256 - SkAlpha255To256(colorA);

        if (count >= 8) {
            // at the end of this assembly, count will have been decremented
            // to a negative value. That is, if count mod 8 = x, it will be
            // -8 +x coming out.
            asm volatile (
                PLD128(src, 0)

                "vdup.32    q0, %[color]                \n\t"

                PLD128(src, 128)

                // scale numerical interval [0-255], so load as 8 bits
                "vdup.8     d2, %[scale]                \n\t"

                PLD128(src, 256)

                "subs       %[count], %[count], #8      \n\t"

                PLD128(src, 384)

                "Loop_Color32:                          \n\t"

                // load src color, 8 pixels, 4 64 bit registers
                // (and increment src).
                "vld1.32    {d4-d7}, [%[src]]!          \n\t"

                PLD128(src, 384)

                // multiply long by scale, 64 bits at a time,
                // destination into a 128 bit register.
                "vmull.u8   q4, d4, d2                  \n\t"
                "vmull.u8   q5, d5, d2                  \n\t"
                "vmull.u8   q6, d6, d2                  \n\t"
                "vmull.u8   q7, d7, d2                  \n\t"

                // shift the 128 bit registers, containing the 16
                // bit scaled values back to 8 bits, narrowing the
                // results to 64 bit registers.
                "vshrn.i16  d8, q4, #8                  \n\t"
                "vshrn.i16  d9, q5, #8                  \n\t"
                "vshrn.i16  d10, q6, #8                 \n\t"
                "vshrn.i16  d11, q7, #8                 \n\t"

                // adding back the color, using 128 bit registers.
                "vadd.i8    q6, q4, q0                  \n\t"
                "vadd.i8    q7, q5, q0                  \n\t"

                // store back the 8 calculated pixels (2 128 bit
                // registers), and increment dst.
                "vst1.32    {d12-d15}, [%[dst]]!        \n\t"

                "subs       %[count], %[count], #8      \n\t"
                "bge        Loop_Color32                \n\t"
                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                : [color] "r" (color), [scale] "r" (scale)
                : "cc", "memory",
                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
                          );
            // At this point, if we went through the inline assembly, count is
            // a negative value:
            // if the value is -8, there is no pixel left to process.
            // if the value is -7, there is one pixel left to process
            // ...
            // And'ing it with 7 will give us the number of pixels
            // left to process.
            count = count & 0x7;
        }

        while (count > 0) {
            *dst = color + SkAlphaMulQ(*src, scale);
            src += 1;
            dst += 1;
            count--;
        }
    }
}

///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    // NOTE: For the two functions below, we don't have a special version
    //       that assumes that each source pixel is opaque. But our S32A is
    //       still faster than the default, so use it.
    S32A_D565_Opaque_neon,  // really S32_D565_Opaque
    S32A_D565_Blend_neon,   // really S32_D565_Blend
    S32A_D565_Opaque_neon,
    S32A_D565_Blend_neon,

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,   // S32A_D565_Blend_Dither
};
1258const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1259    NULL,   // S32_Opaque,
1260    S32_Blend_BlitRow32_neon,        // S32_Blend,
1261    /*
1262     * We have two choices for S32A_Opaque procs. The one reads the src alpha
1263     * value and attempts to optimize accordingly.  The optimization is
1264     * sensitive to the source content and is not a win in all cases. For
1265     * example, if there are a lot of transitions between the alpha states,
1266     * the performance will almost certainly be worse.  However, for many
1267     * common cases the performance is equivalent or better than the standard
1268     * case where we do not inspect the src alpha.
1269     */
1270#if SK_A32_SHIFT == 24
1271    // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1272    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
1273#else
1274    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
1275#endif
1276    S32A_Blend_BlitRow32_arm        // S32A_Blend
1277};
1278