SkBlitRow_opts_arm_neon.cpp revision a111e492b84c312a6bd5d5d9ef100dca48f4941d
1/*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "SkBlitRow_opts_arm_neon.h"
9
10#include "SkBlitMask.h"
11#include "SkBlitRow.h"
12#include "SkColorPriv.h"
13#include "SkDither.h"
14#include "SkMathPriv.h"
15#include "SkUtils.h"
16
17#include "SkCachePreload_arm.h"
18
19#include <arm_neon.h>
20
21void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
22                           const SkPMColor* SK_RESTRICT src, int count,
23                           U8CPU alpha, int /*x*/, int /*y*/) {
24    SkASSERT(255 == alpha);
25
26    if (count >= 8) {
27        uint16_t* SK_RESTRICT keep_dst = 0;
28
29        asm volatile (
30                      "ands       ip, %[count], #7            \n\t"
31                      "vmov.u8    d31, #1<<7                  \n\t"
32                      "vld1.16    {q12}, [%[dst]]             \n\t"
33                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
34                      // Thumb does not support the standard ARM conditional
35                      // instructions but instead requires the 'it' instruction
36                      // to signal conditional execution
37                      "it eq                                  \n\t"
38                      "moveq      ip, #8                      \n\t"
39                      "mov        %[keep_dst], %[dst]         \n\t"
40
41                      "add        %[src], %[src], ip, LSL#2   \n\t"
42                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
43                      "subs       %[count], %[count], ip      \n\t"
44                      "b          9f                          \n\t"
45                      // LOOP
46                      "2:                                         \n\t"
47
48                      "vld1.16    {q12}, [%[dst]]!            \n\t"
49                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
50                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
51                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
52                      "subs       %[count], %[count], #8      \n\t"
53                      "9:                                         \n\t"
54                      "pld        [%[dst],#32]                \n\t"
55                      // expand 0565 q12 to 8888 {d4-d7}
56                      "vmovn.u16  d4, q12                     \n\t"
57                      "vshr.u16   q11, q12, #5                \n\t"
58                      "vshr.u16   q10, q12, #6+5              \n\t"
59                      "vmovn.u16  d5, q11                     \n\t"
60                      "vmovn.u16  d6, q10                     \n\t"
61                      "vshl.u8    d4, d4, #3                  \n\t"
62                      "vshl.u8    d5, d5, #2                  \n\t"
63                      "vshl.u8    d6, d6, #3                  \n\t"
64
65                      "vmovl.u8   q14, d31                    \n\t"
66                      "vmovl.u8   q13, d31                    \n\t"
67                      "vmovl.u8   q12, d31                    \n\t"
68
69                      // duplicate in 4/2/1 & 8pix vsns
70                      "vmvn.8     d30, d3                     \n\t"
71                      "vmlal.u8   q14, d30, d6                \n\t"
72                      "vmlal.u8   q13, d30, d5                \n\t"
73                      "vmlal.u8   q12, d30, d4                \n\t"
74                      "vshr.u16   q8, q14, #5                 \n\t"
75                      "vshr.u16   q9, q13, #6                 \n\t"
76                      "vaddhn.u16 d6, q14, q8                 \n\t"
77                      "vshr.u16   q8, q12, #5                 \n\t"
78                      "vaddhn.u16 d5, q13, q9                 \n\t"
79                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
80                      "vaddhn.u16 d4, q12, q8                 \n\t"
81                      // intentionally don't calculate alpha
82                      // result in d4-d6
83
84                      "vqadd.u8   d5, d5, d1                  \n\t"
85                      "vqadd.u8   d4, d4, d2                  \n\t"
86
87                      // pack 8888 {d4-d6} to 0565 q10
88                      "vshll.u8   q10, d6, #8                 \n\t"
89                      "vshll.u8   q3, d5, #8                  \n\t"
90                      "vshll.u8   q2, d4, #8                  \n\t"
91                      "vsri.u16   q10, q3, #5                 \n\t"
92                      "vsri.u16   q10, q2, #11                \n\t"
93
94                      "bne        2b                          \n\t"
95
96                      "1:                                         \n\t"
97                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
98                      : [count] "+r" (count)
99                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
100                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
101                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
102                      "d30","d31"
103                      );
104    }
105    else
106    {   // handle count < 8
107        uint16_t* SK_RESTRICT keep_dst = 0;
108
109        asm volatile (
110                      "vmov.u8    d31, #1<<7                  \n\t"
111                      "mov        %[keep_dst], %[dst]         \n\t"
112
113                      "tst        %[count], #4                \n\t"
114                      "beq        14f                         \n\t"
115                      "vld1.16    {d25}, [%[dst]]!            \n\t"
116                      "vld1.32    {q1}, [%[src]]!             \n\t"
117
118                      "14:                                        \n\t"
119                      "tst        %[count], #2                \n\t"
120                      "beq        12f                         \n\t"
121                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
122                      "vld1.32    {d1}, [%[src]]!             \n\t"
123
124                      "12:                                        \n\t"
125                      "tst        %[count], #1                \n\t"
126                      "beq        11f                         \n\t"
127                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
128                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"
129
130                      "11:                                        \n\t"
131                      // unzips achieve the same as a vld4 operation
132                      "vuzpq.u16  q0, q1                      \n\t"
133                      "vuzp.u8    d0, d1                      \n\t"
134                      "vuzp.u8    d2, d3                      \n\t"
135                      // expand 0565 q12 to 8888 {d4-d7}
136                      "vmovn.u16  d4, q12                     \n\t"
137                      "vshr.u16   q11, q12, #5                \n\t"
138                      "vshr.u16   q10, q12, #6+5              \n\t"
139                      "vmovn.u16  d5, q11                     \n\t"
140                      "vmovn.u16  d6, q10                     \n\t"
141                      "vshl.u8    d4, d4, #3                  \n\t"
142                      "vshl.u8    d5, d5, #2                  \n\t"
143                      "vshl.u8    d6, d6, #3                  \n\t"
144
145                      "vmovl.u8   q14, d31                    \n\t"
146                      "vmovl.u8   q13, d31                    \n\t"
147                      "vmovl.u8   q12, d31                    \n\t"
148
149                      // duplicate in 4/2/1 & 8pix vsns
150                      "vmvn.8     d30, d3                     \n\t"
151                      "vmlal.u8   q14, d30, d6                \n\t"
152                      "vmlal.u8   q13, d30, d5                \n\t"
153                      "vmlal.u8   q12, d30, d4                \n\t"
154                      "vshr.u16   q8, q14, #5                 \n\t"
155                      "vshr.u16   q9, q13, #6                 \n\t"
156                      "vaddhn.u16 d6, q14, q8                 \n\t"
157                      "vshr.u16   q8, q12, #5                 \n\t"
158                      "vaddhn.u16 d5, q13, q9                 \n\t"
159                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
160                      "vaddhn.u16 d4, q12, q8                 \n\t"
161                      // intentionally don't calculate alpha
162                      // result in d4-d6
163
164                      "vqadd.u8   d5, d5, d1                  \n\t"
165                      "vqadd.u8   d4, d4, d2                  \n\t"
166
167                      // pack 8888 {d4-d6} to 0565 q10
168                      "vshll.u8   q10, d6, #8                 \n\t"
169                      "vshll.u8   q3, d5, #8                  \n\t"
170                      "vshll.u8   q2, d4, #8                  \n\t"
171                      "vsri.u16   q10, q3, #5                 \n\t"
172                      "vsri.u16   q10, q2, #11                \n\t"
173
174                      // store
175                      "tst        %[count], #4                \n\t"
176                      "beq        24f                         \n\t"
177                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
178
179                      "24:                                        \n\t"
180                      "tst        %[count], #2                \n\t"
181                      "beq        22f                         \n\t"
182                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
183
184                      "22:                                        \n\t"
185                      "tst        %[count], #1                \n\t"
186                      "beq        21f                         \n\t"
187                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
188
189                      "21:                                        \n\t"
190                      : [count] "+r" (count)
191                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
192                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
193                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
194                      "d30","d31"
195                      );
196    }
197}
198
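/* Editor's note: the helper below is an illustrative scalar sketch added for
 * clarity; it is not part of the original file, is not called by any blitter,
 * and its name is editorial. It shows, for a single pixel, the RGB565 -> 8-bit
 * expansion (the vmovn/vshr/vshl sequence above) and the 8-bit -> RGB565
 * repack (the vshll/vsri sequence above) that S32A_D565_Opaque_neon performs
 * on eight pixels per iteration.
 */
static inline uint16_t expand_repack_565_sketch(uint16_t d565) {
    unsigned r8 = ((d565 >> 11) & 0x1F) << 3;   // red:   5 -> 8 bits (vshl.u8 #3)
    unsigned g8 = ((d565 >>  5) & 0x3F) << 2;   // green: 6 -> 8 bits (vshl.u8 #2)
    unsigned b8 = ( d565        & 0x1F) << 3;   // blue:  5 -> 8 bits (vshl.u8 #3)
    /* the per-channel blend with (255 - src_alpha) happens between these two
     * steps in the NEON code; the repack below mirrors vshll.u8 #8 + vsri.u16 */
    return (uint16_t)(((r8 >> 3) << 11) | ((g8 >> 2) << 5) | (b8 >> 3));
}
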
199void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
200                          const SkPMColor* SK_RESTRICT src, int count,
201                          U8CPU alpha, int /*x*/, int /*y*/) {
202
203    U8CPU alpha_for_asm = alpha;
204
205    asm volatile (
206    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
207     * the original in two respects:
208     *  1. The results have a few mismatches compared to the original code. These mismatches
209     *     never exceed 1. It's possible to improve accuracy vs. a floating point
210     *     implementation by introducing rounding right shifts (vrshr) for the final stage.
211     *     Rounding was not present in ARM's original code (the '#else' truncating-shift
212     *     path below); the active '#if 1' path now applies vrshr in the final stage to get
213     *     closer to SkDiv255Round(), which may increase mismatches against the original code.
214     *  2. On certain inputs, the original code can overflow, causing colour channels to
215     *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
216     *     to affect another.
217     */
218
219#if 1
220        /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
221                  "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
222#else
223                  "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
224#endif
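                  /* Editor's note (worked example): the two forms agree for alpha >= 128
                   * (e.g. 0xFF -> 256 either way) and differ by one for alpha < 128
                   * (e.g. 0x40 -> 65 with a+1, but 64 with a + (a>>7)). */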
225                  "vmov.u16   q3, #255                        \n\t"   // set up constant
226                  "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
227                  "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
228                  "beq        2f                              \n\t"   // if count8 == 0, exit
229                  "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask
230
231                  "1:                                             \n\t"
232                  "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
233                  "subs       r4, r4, #1                      \n\t"   // decrement loop counter
234                  "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
235                  //  and deinterleave
236
237                  "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
238                  "vand       q10, q0, q15                    \n\t"   // extract blue
239                  "vshr.u16   q8, q0, #11                     \n\t"   // extract red
240                  "vshr.u16   q9, q9, #10                     \n\t"   // extract green
241                  // dstrgb = {q8, q9, q10}
242
243                  "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
244                  "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
245                  "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range
246
247                  "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
248                  "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
249                  "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
250                  "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
251                  // srcrgba = {q11, q12, q13, q14}
252
253                  "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
254                  "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
255                  "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
256                  "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale
257
258                  "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
259                  "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
260                  // dst_scale = q2
261
262                  "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
263                  "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
264                  "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale
265
266#if 1
267    // trying for a better match with SkDiv255Round(a)
268    // C alg is:  a += 128; a = (a + (a>>8)) >> 8
269    // we'll use just a rounding shift [q2 is available for scratch]
270                  "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
271                  "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
272                  "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
273#else
274    // arm's original "truncating divide by 256"
275                  "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
276                  "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
277                  "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
278#endif
279
280                  "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
281                  "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
282                  "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr
283
284                  "bne        1b                              \n\t"   // if counter != 0, loop
285                  "2:                                             \n\t"   // exit
286
287                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
288                  :
289                  : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
290                  );
291
292    count &= 7;
293    if (count > 0) {
294        do {
295            SkPMColor sc = *src++;
296            if (sc) {
297                uint16_t dc = *dst;
298                unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
299                unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
300                unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
301                unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
302                *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
303            }
304            dst += 1;
305        } while (--count != 0);
306    }
307}
308
309/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
310 * each dither value is spaced out into byte lanes, and repeated
311 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
312 * start of each row.
313 */
314static const uint8_t gDitherMatrix_Neon[48] = {
315    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
316    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
317    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
318    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
319
320};
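
/* Editor's note (illustrative): for example, with y & 3 == 0 and x & 3 == 2,
 * dstart points at &gDitherMatrix_Neon[2], so an 8-byte load picks up
 * { 1, 5, 0, 4, 1, 5, 0, 4 } -- row 0's dither values rotated to start at
 * column 2. Storing each row 12 bytes wide is what makes offsets 0..3 safe.
 */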
321
322void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
323                                int count, U8CPU alpha, int x, int y)
324{
325    /* select row and offset for dither array */
326    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
327
328    /* rescale alpha to range 0 - 256 */
329    int scale = SkAlpha255To256(alpha);
330
331    asm volatile (
332                  "vld1.8         {d31}, [%[dstart]]              \n\t"   // load dither values
333                  "vshr.u8        d30, d31, #1                    \n\t"   // calc. green dither values
334                  "vdup.16        d6, %[scale]                    \n\t"   // duplicate scale into neon reg
335                  "vmov.i8        d29, #0x3f                      \n\t"   // set up green mask
336                  "vmov.i8        d28, #0x1f                      \n\t"   // set up blue mask
337                  "1:                                                 \n\t"
338                  "vld4.8         {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
339                  "vshr.u8        d22, d0, #5                     \n\t"   // calc. red >> 5
340                  "vshr.u8        d23, d1, #6                     \n\t"   // calc. green >> 6
341                  "vshr.u8        d24, d2, #5                     \n\t"   // calc. blue >> 5
342                  "vaddl.u8       q8, d0, d31                     \n\t"   // add in dither to red and widen
343                  "vaddl.u8       q9, d1, d30                     \n\t"   // add in dither to green and widen
344                  "vaddl.u8       q10, d2, d31                    \n\t"   // add in dither to blue and widen
345                  "vsubw.u8       q8, q8, d22                     \n\t"   // sub shifted red from result
346                  "vsubw.u8       q9, q9, d23                     \n\t"   // sub shifted green from result
347                  "vsubw.u8       q10, q10, d24                   \n\t"   // sub shifted blue from result
348                  "vshrn.i16      d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
349                  "vshrn.i16      d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
350                  "vshrn.i16      d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
351                  // load 8 pixels from dst, extract rgb
352                  "vld1.16        {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
353                  "vshrn.i16      d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
354                  "vmovn.i16      d18, q0                         \n\t"   // narrow to get blue as bytes
355                  "vshr.u16       q0, q0, #11                     \n\t"   // shift down to extract red
356                  "vand           d17, d17, d29                   \n\t"   // and green with green mask
357                  "vand           d18, d18, d28                   \n\t"   // and blue with blue mask
358                  "vmovn.i16      d16, q0                         \n\t"   // narrow to get red as bytes
359                  // src = {d22 (r), d23 (g), d24 (b)}
360                  // dst = {d16 (r), d17 (g), d18 (b)}
361                  // subtract dst from src and widen
362                  "vsubl.s8       q0, d22, d16                    \n\t"   // red:   src - dst, widened
363                  "vsubl.s8       q1, d23, d17                    \n\t"   // green: src - dst, widened
364                  "vsubl.s8       q2, d24, d18                    \n\t"   // blue:  src - dst, widened
365                  // multiply diffs by scale and shift
366                  "vmul.i16       q0, q0, d6[0]                   \n\t"   // multiply red diff by scale
367                  "vmul.i16       q1, q1, d6[0]                   \n\t"   // multiply green diff by scale
368                  "vmul.i16       q2, q2, d6[0]                   \n\t"   // multiply blue diff by scale
369                  "subs           %[count], %[count], #8          \n\t"   // decrement loop counter
370                  "vshrn.i16      d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
371                  "vshrn.i16      d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
372                  "vshrn.i16      d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
373                  // add dst to result
374                  "vaddl.s8       q0, d0, d16                     \n\t"   // add dst to red
375                  "vaddl.s8       q1, d2, d17                     \n\t"   // add dst to green
376                  "vaddl.s8       q2, d4, d18                     \n\t"   // add dst to blue
377                  // put result into 565 format
378                  "vsli.i16       q2, q1, #5                      \n\t"   // shift up green and insert into blue
379                  "vsli.i16       q2, q0, #11                     \n\t"   // shift up red and insert into blue
380                  "vst1.16        {d4, d5}, [%[dst]]!             \n\t"   // store result
381                  "bgt            1b                              \n\t"   // loop if count > 0
382                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
383                  : [dstart] "r" (dstart), [scale] "r" (scale)
384                  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
385                  );
386
387    DITHER_565_SCAN(y);
388
389    while((count & 7) > 0)
390    {
391        SkPMColor c = *src++;
392
393        int dither = DITHER_VALUE(x);
394        int sr = SkGetPackedR32(c);
395        int sg = SkGetPackedG32(c);
396        int sb = SkGetPackedB32(c);
397        sr = SkDITHER_R32To565(sr, dither);
398        sg = SkDITHER_G32To565(sg, dither);
399        sb = SkDITHER_B32To565(sb, dither);
400
401        uint16_t d = *dst;
402        *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
403                             SkAlphaBlend(sg, SkGetPackedG16(d), scale),
404                             SkAlphaBlend(sb, SkGetPackedB16(d), scale));
405        DITHER_INC_X(x);
406        count--;
407    }
408}
409
410void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
411                                const SkPMColor* SK_RESTRICT src,
412                                int count, U8CPU alpha) {
413
414    SkASSERT(255 == alpha);
415    if (count > 0) {
416
417
418    uint8x8_t alpha_mask;
419
420    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
421    alpha_mask = vld1_u8(alpha_mask_setup);
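    /* Editor's note: assuming alpha sits in the top byte of each SkPMColor
     * (SK_A32_SHIFT == 24), the vtbl1_u8 lookups below use this mask to
     * broadcast pixel 0's alpha byte (index 3) into lanes 0-3 and pixel 1's
     * alpha byte (index 7) into lanes 4-7, pairing every colour byte with
     * its own pixel's alpha. */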
422
423    /* do the NEON unrolled code */
424#define    UNROLL    4
425    while (count >= UNROLL) {
426        uint8x8_t src_raw, dst_raw, dst_final;
427        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
428
429        /* The two prefetches below may make the code slightly
430         * slower for small values of count but are worth having
431         * in the general case.
432         */
433        __builtin_prefetch(src+32);
434        __builtin_prefetch(dst+32);
435
436        /* get the source */
437        src_raw = vreinterpret_u8_u32(vld1_u32(src));
438#if    UNROLL > 2
439        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
440#endif
441
442        /* get and hold the dst too */
443        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
444#if    UNROLL > 2
445        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
446#endif
447
448    /* 1st and 2nd pixels of the unrolling */
449    {
450        uint8x8_t dst_cooked;
451        uint16x8_t dst_wide;
452        uint8x8_t alpha_narrow;
453        uint16x8_t alpha_wide;
454
455        /* get the alphas spread out properly */
456        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
457        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
458
459        /* spread the dest */
460        dst_wide = vmovl_u8(dst_raw);
461
462        /* alpha mul the dest */
463        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
464        dst_cooked = vshrn_n_u16(dst_wide, 8);
465
466        /* sum -- ignoring any byte lane overflows */
467        dst_final = vadd_u8(src_raw, dst_cooked);
468    }
469
470#if    UNROLL > 2
471    /* the 3rd and 4th pixels of our unrolling */
472    {
473        uint8x8_t dst_cooked;
474        uint16x8_t dst_wide;
475        uint8x8_t alpha_narrow;
476        uint16x8_t alpha_wide;
477
478        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
479        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
480
481        /* spread the dest */
482        dst_wide = vmovl_u8(dst_raw_2);
483
484        /* alpha mul the dest */
485        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
486        dst_cooked = vshrn_n_u16(dst_wide, 8);
487
488        /* sum -- ignoring any byte lane overflows */
489        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
490    }
491#endif
492
493        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
494#if    UNROLL > 2
495        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
496#endif
497
498        src += UNROLL;
499        dst += UNROLL;
500        count -= UNROLL;
501    }
502#undef    UNROLL
503
504    /* do any residual iterations */
505        while (--count >= 0) {
506            *dst = SkPMSrcOver(*src, *dst);
507            src += 1;
508            dst += 1;
509        }
510    }
511}
512
513void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
514                                const SkPMColor* SK_RESTRICT src,
515                                int count, U8CPU alpha) {
516    SkASSERT(255 == alpha);
517
518    if (count <= 0)
519    return;
520
521    /* Use these to check if src is transparent or opaque */
522    const unsigned int ALPHA_OPAQ  = 0xFF000000;
523    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
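    /* Editor's note: these comparisons rely on alpha living in the top byte
     * (SK_A32_SHIFT == 24): an SkPMColor is >= 0xFF000000 exactly when its
     * alpha is 0xFF, and <= 0x00FFFFFF exactly when its alpha is 0x00. */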
524
525#define UNROLL  4
526    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
527    const SkPMColor* SK_RESTRICT src_temp = src;
528
529    /* set up the NEON variables */
530    uint8x8_t alpha_mask;
531    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
532    alpha_mask = vld1_u8(alpha_mask_setup);
533
534    uint8x8_t src_raw, dst_raw, dst_final;
535    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
536    uint8x8_t dst_cooked;
537    uint16x8_t dst_wide;
538    uint8x8_t alpha_narrow;
539    uint16x8_t alpha_wide;
540
541    /* choose the first processing type */
542    if( src >= src_end)
543        goto TAIL;
544    if(*src <= ALPHA_TRANS)
545        goto ALPHA_0;
546    if(*src >= ALPHA_OPAQ)
547        goto ALPHA_255;
548    /* fall-thru */
549
550ALPHA_1_TO_254:
551    do {
552
553        /* get the source */
554        src_raw = vreinterpret_u8_u32(vld1_u32(src));
555        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
556
557        /* get and hold the dst too */
558        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
559        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
560
561
562        /* get the alphas spread out properly */
563        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
564        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
565        /* we collapsed (255-a)+1 ... */
566        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
567
568        /* spread the dest */
569        dst_wide = vmovl_u8(dst_raw);
570
571        /* alpha mul the dest */
572        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
573        dst_cooked = vshrn_n_u16(dst_wide, 8);
574
575        /* sum -- ignoring any byte lane overflows */
576        dst_final = vadd_u8(src_raw, dst_cooked);
577
578        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
579        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
580        /* we collapsed (255-a)+1 ... */
581        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
582
583        /* spread the dest */
584        dst_wide = vmovl_u8(dst_raw_2);
585
586        /* alpha mul the dest */
587        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
588        dst_cooked = vshrn_n_u16(dst_wide, 8);
589
590        /* sum -- ignoring any byte lane overflows */
591        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
592
593        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
594        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
595
596        src += UNROLL;
597        dst += UNROLL;
598
599        /* if the next 2 pixels are both transparent or both opaque,
600        it makes sense to switch to the optimized loops below */
601        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
602            break;
603
604    } while(src < src_end);
605
606    if (src >= src_end)
607        goto TAIL;
608
609    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
610        goto ALPHA_255;
611
612    /*fall-thru*/
613
614ALPHA_0:
615
616    /* In this state, we know the current alpha is 0 and
617     * we optimize for the next alpha also being zero. */
618    src_temp = src;  //so we don't have to increment dst every time
619    do {
620        if(*(++src) > ALPHA_TRANS)
621            break;
622        if(*(++src) > ALPHA_TRANS)
623            break;
624        if(*(++src) > ALPHA_TRANS)
625            break;
626        if(*(++src) > ALPHA_TRANS)
627            break;
628    } while(src < src_end);
629
630    dst += (src - src_temp);
631
632    /* no longer alpha 0, so determine where to go next. */
633    if( src >= src_end)
634        goto TAIL;
635    if(*src >= ALPHA_OPAQ)
636        goto ALPHA_255;
637    else
638        goto ALPHA_1_TO_254;
639
640ALPHA_255:
641    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
642        dst[0]=src[0];
643        dst[1]=src[1];
644        dst[2]=src[2];
645        dst[3]=src[3];
646        src+=UNROLL;
647        dst+=UNROLL;
648        if(src >= src_end)
649            goto TAIL;
650    }
651
652    //Handle remainder.
653    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
654        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
655            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
656        }
657    }
658
659    if( src >= src_end)
660        goto TAIL;
661    if(*src <= ALPHA_TRANS)
662        goto ALPHA_0;
663    else
664        goto ALPHA_1_TO_254;
665
666TAIL:
667    /* do any residual iterations */
668    src_end += UNROLL + 1;  //goto the real end
669    while(src != src_end) {
670        if( *src != 0 ) {
671            if( *src >= ALPHA_OPAQ ) {
672                *dst = *src;
673            }
674            else {
675                *dst = SkPMSrcOver(*src, *dst);
676            }
677        }
678        src++;
679        dst++;
680    }
681
682#undef    UNROLL
683    return;
684}
685
686/* Neon version of S32_Blend_BlitRow32()
687 * portable version is in src/core/SkBlitRow_D32.cpp
688 */
689void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
690                              const SkPMColor* SK_RESTRICT src,
691                              int count, U8CPU alpha) {
692    SkASSERT(alpha <= 255);
693    if (count > 0) {
694        uint16_t src_scale = SkAlpha255To256(alpha);
695        uint16_t dst_scale = 256 - src_scale;
696
697    /* run them N at a time through the NEON unit */
698    /* note that each pixel is 4 bytes, each byte treated exactly the same,
699     * so we can work under that guise. We *do* know that the src & dst
700     * will be 32-bit aligned quantities, so we can specify that on
701     * the load/store ops and do a neon 'reinterpret' to get us to
702     * byte-sized (pun intended) pieces that we widen/multiply/shift.
703     * We're limited to 128 bits in the wide ops, which is 8x16 bits,
704     * i.e. a pair of 32-bit src/dst pixels per iteration.
705     */
706    /* we *could* manually unroll this loop so that we load 128 bits
707     * (as a pair of 64s) from each of src and dst, processing them
708     * in pieces. This might give us a little better management of
709     * the memory latency, but my initial attempts here did not
710     * produce an instruction stream that looked all that nice.
711     */
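    /* Editor's note: per 8-bit channel the loop below effectively computes
     *     out = (src * src_scale + dst * dst_scale) >> 8
     * with src_scale = SkAlpha255To256(alpha) and dst_scale = 256 - src_scale;
     * the widen / vmul / vmla / vshrn sequence is just that, two pixels at a time. */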
712#define    UNROLL    2
713    while (count >= UNROLL) {
714        uint8x8_t  src_raw, dst_raw, dst_final;
715        uint16x8_t  src_wide, dst_wide;
716
717        /* get 64 bits of src, widen it, multiply by src_scale */
718        src_raw = vreinterpret_u8_u32(vld1_u32(src));
719        src_wide = vmovl_u8(src_raw);
720        /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
721        src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
722
723        /* ditto with dst */
724        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
725        dst_wide = vmovl_u8(dst_raw);
726
727        /* combine add with dst multiply into mul-accumulate */
728        dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
729
730        dst_final = vshrn_n_u16(dst_wide, 8);
731        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
732
733        src += UNROLL;
734        dst += UNROLL;
735        count -= UNROLL;
736    }
737    /* RBE: well, I don't like how gcc manages src/dst across the above
738     * loop; it's constantly calculating src+bias, dst+bias and it only
739     * adjusts the real ones when we leave the loop. Not sure why
740     * it's "hoisting down" (hoisting implies above in my lexicon ;))
741     * the adjustments to src/dst/count, but it does...
742     * (might be SSA-style internal logic...)
743     */
744
745#if    UNROLL == 2
746    if (count == 1) {
747            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
748    }
749#else
750    if (count > 0) {
751            do {
752                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
753                src += 1;
754                dst += 1;
755            } while (--count > 0);
756    }
757#endif
758
759#undef    UNROLL
760    }
761}
762
763void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
764                         const SkPMColor* SK_RESTRICT src,
765                         int count, U8CPU alpha) {
766
767    SkASSERT(255 >= alpha);
768
769    if (count <= 0) {
770        return;
771    }
772
773    unsigned alpha256 = SkAlpha255To256(alpha);
774
775    // First deal with odd counts
776    if (count & 1) {
777        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
778        uint16x8_t vdst_wide, vsrc_wide;
779        unsigned dst_scale;
780
781        // Load
782        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
783        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
784
785        // Calc dst_scale
786        dst_scale = vget_lane_u8(vsrc, 3);
787        dst_scale *= alpha256;
788        dst_scale >>= 8;
789        dst_scale = 256 - dst_scale;
790
791        // Process src
792        vsrc_wide = vmovl_u8(vsrc);
793        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
794
795        // Process dst
796        vdst_wide = vmovl_u8(vdst);
797        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
798
799        // Combine
800        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
801
802        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
803        dst++;
804        src++;
805        count--;
806    }
807
808    if (count) {
809        uint8x8_t alpha_mask;
810        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
811        alpha_mask = vld1_u8(alpha_mask_setup);
812
813        do {
814
815            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
816            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
817
818            __builtin_prefetch(src+32);
819            __builtin_prefetch(dst+32);
820
821            // Load
822            vsrc = vreinterpret_u8_u32(vld1_u32(src));
823            vdst = vreinterpret_u8_u32(vld1_u32(dst));
824
825            // Prepare src_scale
826            vsrc_scale = vdupq_n_u16(alpha256);
827
828            // Calc dst_scale
829            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
830            vdst_scale = vmovl_u8(vsrc_alphas);
831            vdst_scale *= vsrc_scale;
832            vdst_scale = vshrq_n_u16(vdst_scale, 8);
833            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
834
835            // Process src
836            vsrc_wide = vmovl_u8(vsrc);
837            vsrc_wide *= vsrc_scale;
838
839            // Process dst
840            vdst_wide = vmovl_u8(vdst);
841            vdst_wide *= vdst_scale;
842
843            // Combine
844            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
845
846            vst1_u32(dst, vreinterpret_u32_u8(vres));
847
848            src += 2;
849            dst += 2;
850            count -= 2;
851        } while(count);
852    }
853}
854
855///////////////////////////////////////////////////////////////////////////////
856
857#undef    DEBUG_OPAQUE_DITHER
858
859#if    defined(DEBUG_OPAQUE_DITHER)
860static void showme8(char *str, void *p, int len)
861{
862    static char buf[256];
863    char tbuf[32];
864    int i;
865    char *pc = (char*) p;
866    sprintf(buf,"%8s:", str);
867    for(i=0;i<len;i++) {
868        sprintf(tbuf, "   %02x", pc[i]);
869        strcat(buf, tbuf);
870    }
871    SkDebugf("%s\n", buf);
872}
873static void showme16(char *str, void *p, int len)
874{
875    static char buf[256];
876    char tbuf[32];
877    int i;
878    uint16_t *pc = (uint16_t*) p;
879    sprintf(buf,"%8s:", str);
880    len = (len / sizeof(uint16_t));    /* passed as bytes */
881    for(i=0;i<len;i++) {
882        sprintf(tbuf, " %04x", pc[i]);
883        strcat(buf, tbuf);
884    }
885    SkDebugf("%s\n", buf);
886}
887#endif
888
889void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
890                                   const SkPMColor* SK_RESTRICT src,
891                                   int count, U8CPU alpha, int x, int y) {
892    SkASSERT(255 == alpha);
893
894#define    UNROLL    8
895
896    if (count >= UNROLL) {
897    uint8x8_t dbase;
898
899#if    defined(DEBUG_OPAQUE_DITHER)
900    uint16_t tmpbuf[UNROLL];
901    int td[UNROLL];
902    int tdv[UNROLL];
903    int ta[UNROLL];
904    int tap[UNROLL];
905    uint16_t in_dst[UNROLL];
906    int offset = 0;
907    int noisy = 0;
908#endif
909
910    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
911    dbase = vld1_u8(dstart);
912
913        do {
914        uint8x8_t sr, sg, sb, sa, d;
915        uint16x8_t dst8, scale8, alpha8;
916        uint16x8_t dst_r, dst_g, dst_b;
917
918#if    defined(DEBUG_OPAQUE_DITHER)
919    /* calculate 8 elements worth into a temp buffer */
920    {
921      int my_y = y;
922      int my_x = x;
923      SkPMColor* my_src = (SkPMColor*)src;
924      uint16_t* my_dst = dst;
925      int i;
926
927          DITHER_565_SCAN(my_y);
928          for(i=0;i<UNROLL;i++) {
929            SkPMColor c = *my_src++;
930            SkPMColorAssert(c);
931            if (c) {
932                unsigned a = SkGetPackedA32(c);
933
934                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
935        tdv[i] = DITHER_VALUE(my_x);
936        ta[i] = a;
937        tap[i] = SkAlpha255To256(a);
938        td[i] = d;
939
940                unsigned sr = SkGetPackedR32(c);
941                unsigned sg = SkGetPackedG32(c);
942                unsigned sb = SkGetPackedB32(c);
943                sr = SkDITHER_R32_FOR_565(sr, d);
944                sg = SkDITHER_G32_FOR_565(sg, d);
945                sb = SkDITHER_B32_FOR_565(sb, d);
946
947                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
948                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
949                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
950                // now src and dst expanded are in g:11 r:10 x:1 b:10
951                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
952        td[i] = d;
953
954            } else {
955        tmpbuf[i] = *my_dst;
956        ta[i] = tdv[i] = td[i] = 0xbeef;
957        }
958        in_dst[i] = *my_dst;
959            my_dst += 1;
960            DITHER_INC_X(my_x);
961          }
962    }
963#endif
964
965        /* source is in ABGR */
966        {
967        register uint8x8_t d0 asm("d0");
968        register uint8x8_t d1 asm("d1");
969        register uint8x8_t d2 asm("d2");
970        register uint8x8_t d3 asm("d3");
971
972        asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
973            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
974            : "r" (src)
975                    );
976            sr = d0; sg = d1; sb = d2; sa = d3;
977        }
978
979        /* calculate 'd', which will be 0..7 */
980        /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
981#if defined(SK_BUILD_FOR_ANDROID)
982        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
983        alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
984#else
985        alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
986#endif
987        alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
988        d = vshrn_n_u16(alpha8, 8);    /* narrowing too */
989
990        /* sr = sr - (sr>>5) + d */
991        /* watching for 8-bit overflow.  d is 0..7; risky range of
992         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
993         * safe as long as we compute ((sr - (sr>>5)) + d) */
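        /* editor's note, concretely: sr = 255 gives (255 - (255>>5)) + 7
         * = 248 + 7 = 255, so the 8-bit lanes cannot wrap. */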
994        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
995        sr = vadd_u8(sr, d);
996
997        /* sb = sb - (sb>>5) + d */
998        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
999        sb = vadd_u8(sb, d);
1000
1001        /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
1002        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1003        sg = vadd_u8(sg, vshr_n_u8(d,1));
1004
1005        /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
1006        dst8 = vld1q_u16(dst);
1007        dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
1008        dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
1009        dst_r = vshrq_n_u16(dst8,11);    /* clearing hi bits */
1010
1011        /* blend */
1012#if 1
1013        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1014        /* originally 255-sa + 1 */
1015        scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1016#else
1017        scale8 = vsubw_u8(vdupq_n_u16(255), sa);
1018        scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
1019#endif
1020
1021#if 1
1022        /* combine the addq and mul, save 3 insns */
1023        scale8 = vshrq_n_u16(scale8, 3);
1024        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1025        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1026        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1027#else
1028        /* known correct, but +3 insns over above */
1029        scale8 = vshrq_n_u16(scale8, 3);
1030        dst_b = vmulq_u16(dst_b, scale8);
1031        dst_g = vmulq_u16(dst_g, scale8);
1032        dst_r = vmulq_u16(dst_r, scale8);
1033
1034        /* combine */
1035        /* NB: vshll widens, need to preserve those bits */
1036        dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
1037        dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
1038        dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
1039#endif
1040
1041        /* repack to store */
1042        dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
1043        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1044        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1045
1046        vst1q_u16(dst, dst8);
1047
1048#if    defined(DEBUG_OPAQUE_DITHER)
1049        /* verify my 8 elements match the temp buffer */
1050    {
1051       int i, bad=0;
1052       static int invocation;
1053
1054       for (i=0;i<UNROLL;i++)
1055        if (tmpbuf[i] != dst[i]) bad=1;
1056       if (bad) {
1057        SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1058            invocation, offset);
1059        SkDebugf("  alpha 0x%x\n", alpha);
1060        for (i=0;i<UNROLL;i++)
1061            SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1062            i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
1063            dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
1064
1065        showme16("alpha8", &alpha8, sizeof(alpha8));
1066        showme16("scale8", &scale8, sizeof(scale8));
1067        showme8("d", &d, sizeof(d));
1068        showme16("dst8", &dst8, sizeof(dst8));
1069        showme16("dst_b", &dst_b, sizeof(dst_b));
1070        showme16("dst_g", &dst_g, sizeof(dst_g));
1071        showme16("dst_r", &dst_r, sizeof(dst_r));
1072        showme8("sb", &sb, sizeof(sb));
1073        showme8("sg", &sg, sizeof(sg));
1074        showme8("sr", &sr, sizeof(sr));
1075
1076        /* cop out */
1077        return;
1078       }
1079       offset += UNROLL;
1080       invocation++;
1081    }
1082#endif
1083
1084            dst += UNROLL;
1085        src += UNROLL;
1086        count -= UNROLL;
1087        /* skip x += UNROLL, since it's unchanged mod-4 */
1088        } while (count >= UNROLL);
1089    }
1090#undef    UNROLL
1091
1092    /* residuals */
1093    if (count > 0) {
1094        DITHER_565_SCAN(y);
1095        do {
1096            SkPMColor c = *src++;
1097            SkPMColorAssert(c);
1098            if (c) {
1099                unsigned a = SkGetPackedA32(c);
1100
1101                // dither and alpha are just temporary variables to work-around
1102                // an ICE in debug.
1103                unsigned dither = DITHER_VALUE(x);
1104                unsigned alpha = SkAlpha255To256(a);
1105                int d = SkAlphaMul(dither, alpha);
1106
1107                unsigned sr = SkGetPackedR32(c);
1108                unsigned sg = SkGetPackedG32(c);
1109                unsigned sb = SkGetPackedB32(c);
1110                sr = SkDITHER_R32_FOR_565(sr, d);
1111                sg = SkDITHER_G32_FOR_565(sg, d);
1112                sb = SkDITHER_B32_FOR_565(sb, d);
1113
1114                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1115                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1116                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1117                // now src and dst expanded are in g:11 r:10 x:1 b:10
1118                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1119            }
1120            dst += 1;
1121            DITHER_INC_X(x);
1122        } while (--count != 0);
1123    }
1124}
1125
1126///////////////////////////////////////////////////////////////////////////////
1127
1128/* 2009/10/27: RBE says "a work in progress"; debugging says ok;
1129 * speedup untested, but the ARM version is 26 insns/iteration and
1130 * this NEON version is 21 insns per iteration of 8 pixels (2.62 insns/element),
1131 * roughly 10x fewer instructions per element than the native version; that's
1132 * pure instruction counts, not accounting for any instruction or memory latencies.
1133 */
1134
1135#undef    DEBUG_S32_OPAQUE_DITHER
1136
1137void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1138                                 const SkPMColor* SK_RESTRICT src,
1139                                 int count, U8CPU alpha, int x, int y) {
1140    SkASSERT(255 == alpha);
1141
1142#define    UNROLL    8
1143    if (count >= UNROLL) {
1144    uint8x8_t d;
1145    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1146    d = vld1_u8(dstart);
1147
1148    while (count >= UNROLL) {
1149        uint8x8_t sr, sg, sb;
1150        uint16x8_t dr, dg, db;
1151        uint16x8_t dst8;
1152
1153        /* source is in ABGR ordering (R == lsb) */
1154        {
1155        register uint8x8_t d0 asm("d0");
1156        register uint8x8_t d1 asm("d1");
1157        register uint8x8_t d2 asm("d2");
1158        register uint8x8_t d3 asm("d3");
1159
1160        asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1161            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
1162            : "r" (src)
1163                    );
1164            sr = d0; sg = d1; sb = d2;
1165        }
1166        /* XXX: if we want to prefetch, hide it in the above asm()
1167         * using the gcc __builtin_prefetch(), the prefetch will
1168         * fall to the bottom of the loop -- it won't stick up
1169         * at the top of the loop, just after the vld4.
1170         */
1171
1172        /* sr = sr - (sr>>5) + d */
1173        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1174        dr = vaddl_u8(sr, d);
1175
1176        /* sb = sb - (sb>>5) + d */
1177        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1178        db = vaddl_u8(sb, d);
1179
1180        /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
1181        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1182        dg = vaddl_u8(sg, vshr_n_u8(d,1));
1183        /* XXX: check that the "d>>1" here is hoisted */
1184
1185        /* pack high bits of each into 565 format  (rgb, b is lsb) */
1186        dst8 = vshrq_n_u16(db, 3);
1187        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1188        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);
1189
1190        /* store it */
1191        vst1q_u16(dst, dst8);
1192
1193#if    defined(DEBUG_S32_OPAQUE_DITHER)
1194        /* always good to know if we generated good results */
1195        {
1196        int i, myx = x, myy = y;
1197        DITHER_565_SCAN(myy);
1198        for (i=0;i<UNROLL;i++) {
1199            SkPMColor c = src[i];
1200            unsigned dither = DITHER_VALUE(myx);
1201            uint16_t val = SkDitherRGB32To565(c, dither);
1202            if (val != dst[i]) {
1203            SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1204                c, dither, val, dst[i], dstart[i]);
1205            }
1206            DITHER_INC_X(myx);
1207        }
1208        }
1209#endif
1210
1211        dst += UNROLL;
1212        src += UNROLL;
1213        count -= UNROLL;
1214        x += UNROLL;        /* probably superfluous */
1215    }
1216    }
1217#undef    UNROLL
1218
1219    /* residuals */
1220    if (count > 0) {
1221        DITHER_565_SCAN(y);
1222        do {
1223            SkPMColor c = *src++;
1224            SkPMColorAssert(c);
1225            SkASSERT(SkGetPackedA32(c) == 255);
1226
1227            unsigned dither = DITHER_VALUE(x);
1228            *dst++ = SkDitherRGB32To565(c, dither);
1229            DITHER_INC_X(x);
1230        } while (--count != 0);
1231    }
1232}
1233
1234void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
1235                      SkPMColor color) {
1236    if (count <= 0) {
1237        return;
1238    }
1239
1240    if (0 == color) {
1241        if (src != dst) {
1242            memcpy(dst, src, count * sizeof(SkPMColor));
1243        }
1244        return;
1245    }
1246
1247    unsigned colorA = SkGetPackedA32(color);
1248    if (255 == colorA) {
1249        sk_memset32(dst, color, count);
1250    } else {
1251        unsigned scale = 256 - SkAlpha255To256(colorA);
1252
1253        if (count >= 8) {
1254            // at the end of this assembly, count will have been decremented
1255            // to a negative value. That is, if count mod 8 = x, it will be
1256            // -8 +x coming out.
1257            asm volatile (
1258                PLD128(src, 0)
1259
1260                "vdup.32    q0, %[color]                \n\t"
1261
1262                PLD128(src, 128)
1263
1264                // scale numerical interval [0-255], so load as 8 bits
1265                "vdup.8     d2, %[scale]                \n\t"
1266
1267                PLD128(src, 256)
1268
1269                "subs       %[count], %[count], #8      \n\t"
1270
1271                PLD128(src, 384)
1272
1273                "Loop_Color32:                          \n\t"
1274
1275                // load src color, 8 pixels, 4 64 bit registers
1276                // (and increment src).
1277                "vld1.32    {d4-d7}, [%[src]]!          \n\t"
1278
1279                PLD128(src, 384)
1280
1281                // multiply long by scale, 64 bits at a time,
1282                // destination into a 128 bit register.
1283                "vmull.u8   q4, d4, d2                  \n\t"
1284                "vmull.u8   q5, d5, d2                  \n\t"
1285                "vmull.u8   q6, d6, d2                  \n\t"
1286                "vmull.u8   q7, d7, d2                  \n\t"
1287
1288                // shift the 128 bit registers, containing the 16
1289                // bit scaled values back to 8 bits, narrowing the
1290                // results to 64 bit registers.
1291                "vshrn.i16  d8, q4, #8                  \n\t"
1292                "vshrn.i16  d9, q5, #8                  \n\t"
1293                "vshrn.i16  d10, q6, #8                 \n\t"
1294                "vshrn.i16  d11, q7, #8                 \n\t"
1295
1296                // adding back the color, using 128 bit registers.
1297                "vadd.i8    q6, q4, q0                  \n\t"
1298                "vadd.i8    q7, q5, q0                  \n\t"
1299
1300                // store back the 8 calculated pixels (2 128 bit
1301                // registers), and increment dst.
1302                "vst1.32    {d12-d15}, [%[dst]]!        \n\t"
1303
1304                "subs       %[count], %[count], #8      \n\t"
1305                "bge        Loop_Color32                \n\t"
1306                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
1307                : [color] "r" (color), [scale] "r" (scale)
1308                : "cc", "memory",
1309                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1310                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
1311                          );
1312            // At this point, if we went through the inline assembly, count is
1313            // a negative value:
1314            // if the value is -8, there is no pixel left to process.
1315            // if the value is -7, there is one pixel left to process
1316            // ...
1317            // And'ing it with 7 will give us the number of pixels
1318            // left to process.
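            // Editor's note (worked example): count = 13 enters the asm, the
            // first "subs" leaves 5, one pass of Loop_Color32 blends 8 pixels,
            // the second "subs" leaves -3 so "bge" falls through, and
            // -3 & 7 == 5 pixels remain for the scalar loop below.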
1319            count = count & 0x7;
1320        }
1321
1322        while (count > 0) {
1323            *dst = color + SkAlphaMulQ(*src, scale);
1324            src += 1;
1325            dst += 1;
1326            count--;
1327        }
1328    }
1329}
1330
1331///////////////////////////////////////////////////////////////////////////////
1332
1333const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1334    // no dither
1335    // NOTE: For the two functions below, we don't have a special version
1336    //       that assumes that each source pixel is opaque. But our S32A is
1337    //       still faster than the default, so use it.
1338    S32A_D565_Opaque_neon,  // really S32_D565_Opaque
1339    S32A_D565_Blend_neon,   // really S32_D565_Blend
1340    S32A_D565_Opaque_neon,
1341    S32A_D565_Blend_neon,
1342
1343    // dither
1344    S32_D565_Opaque_Dither_neon,
1345    S32_D565_Blend_Dither_neon,
1346    S32A_D565_Opaque_Dither_neon,
1347    NULL,   // S32A_D565_Blend_Dither
1348};
1349
1350const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1351    NULL,   // S32_Opaque,
1352    S32_Blend_BlitRow32_neon,        // S32_Blend,
1353    /*
1354     * We have two choices for S32A_Opaque procs. One reads the src alpha
1355     * value and attempts to optimize accordingly.  The optimization is
1356     * sensitive to the source content and is not a win in all cases. For
1357     * example, if there are a lot of transitions between the alpha states,
1358     * the performance will almost certainly be worse.  However, for many
1359     * common cases the performance is equivalent or better than the standard
1360     * case where we do not inspect the src alpha.
1361     */
1362#if SK_A32_SHIFT == 24
1363    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
1364    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
1365#else
1366    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
1367#endif
1368    S32A_Blend_BlitRow32_neon        // S32A_Blend
1369};
1370