// SkBlitRow_opts_arm_neon.cpp, revision fbfcd5602128ec010c82cb733c9cdc0a3254f9f3
/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkCachePreload_arm.h"

#include <arm_neon.h>

void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst;

        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      // Thumb does not support the standard ARM conditional
                      // instructions but instead requires the 'it' instruction
                      // to signal conditional execution
                      "it eq                                  \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
                      // dst, src and keep_dst are all written inside the asm,
                      // so they must be outputs rather than read-only inputs.
                      : [count] "+r" (count), [dst] "+r" (dst), [src] "+r" (src),
                        [keep_dst] "=&r" (keep_dst)
                      :
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzpq.u16  q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      // dst, src and keep_dst are again modified by the asm,
                      // so list them as outputs here too.
                      : [count] "+r" (count), [dst] "+r" (dst), [src] "+r" (src),
                        [keep_dst] "=&r" (keep_dst)
                      :
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}
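
/* Reference sketch (not in the original source): the per-pixel math both asm
 * paths above implement, written with scalar helpers already used in this
 * file. The (x + (x >> 5)) >> 8 step mirrors the vshr/vaddhn pair: because
 * the 565 channels are widened by plain shifts (no bit replication), the
 * effective divide is by 248 (252 for green) rather than 255.
 */
static inline uint16_t S32A_D565_Opaque_pixel_sketch(uint16_t d, SkPMColor s) {
    unsigned invA = 255 - SkGetPackedA32(s);
    unsigned r = SkGetPackedR16(d) << 3;            // 5 -> 8 bits, low bits zero
    unsigned g = SkGetPackedG16(d) << 2;            // 6 -> 8 bits
    unsigned b = SkGetPackedB16(d) << 3;
    r = 128 + invA * r;  r = (r + (r >> 5)) >> 8;   // vmlal.u8 then vaddhn.u16
    g = 128 + invA * g;  g = (g + (g >> 6)) >> 8;
    b = 128 + invA * b;  b = (b + (b >> 5)) >> 8;
    r += SkGetPackedR32(s);  if (r > 255) { r = 255; }  // vqadd.u8 saturates
    g += SkGetPackedG32(s);  if (g > 255) { g = 255; }
    b += SkGetPackedB32(s);  if (b > 255) { b = 255; }
    return SkPackRGB16(r >> 3, g >> 2, b >> 3);     // vshll/vsri repack
}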

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {

    U8CPU alpha_for_asm = alpha;

    asm volatile (
    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
     * the original in two respects:
     *  1. The results have a few mismatches compared to the original code. These mismatches
     *     never exceed 1. It's possible to improve accuracy vs. a floating point
     *     implementation by introducing rounding right shifts (vrshr) for the final stage.
     *     Rounding is not present in the code below, because although results would be closer
     *     to a floating point implementation, the number of mismatches compared to the
     *     original code would be far greater.
     *  2. On certain inputs, the original code can overflow, causing colour channels to
     *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
     *     to affect another.
     */

#if 1
        /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
                  "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
#else
                  "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
#endif
                  "vmov.u16   q3, #255                        \n\t"   // set up constant
                  "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
                  "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
                  "beq        2f                              \n\t"   // if count8 == 0, exit
                  "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask

                  "1:                                             \n\t"
                  "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
                  "subs       r4, r4, #1                      \n\t"   // decrement loop counter
                  "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
                  //  and deinterleave

                  "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
                  "vand       q10, q0, q15                    \n\t"   // extract blue
                  "vshr.u16   q8, q0, #11                     \n\t"   // extract red
                  "vshr.u16   q9, q9, #10                     \n\t"   // extract green
                  // dstrgb = {q8, q9, q10}

                  "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
                  "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
                  "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range

                  "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
                  "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
                  "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
                  "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
                  // srcrgba = {q11, q12, q13, q14}

                  "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
                  "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
                  "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
                  "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale

                  "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
                  "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
                  // dst_scale = q2

                  "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
                  "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
                  "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale

#if 1
    // trying for a better match with SkDiv255Round(a)
    // C alg is:  a+=128; (a+a>>8)>>8
    // we'll use just a rounding shift [q2 is available for scratch]
                  "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
                  "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
                  "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#else
    // arm's original "truncating divide by 256"
                  "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
                  "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
                  "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#endif

                  "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
                  "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
                  "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr

                  "bne        1b                              \n\t"   // if counter != 0, loop
                  "2:                                             \n\t"   // exit

                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
                  :
                  : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
                  );

    count &= 7;
    if (count > 0) {
        do {
            SkPMColor sc = *src++;
            if (sc) {
                uint16_t dc = *dst;
                unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
                unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
                unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
                unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
                *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
            }
            dst += 1;
        } while (--count != 0);
    }
}
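
/* For reference (not in the original): the divide variants discussed in the
 * comments above, as scalar expressions on a 16-bit product p:
 *   vshr.u16  #8 :  p >> 8                                truncating /256
 *   vrshr.u16 #8 :  (p + 128) >> 8                        rounding /256
 *   SkDiv255Round:  p += 128; p = (p + (p >> 8)) >> 8     rounded /255
 * e.g. for p = 255 * 128 the three give 127, 128 and 128 respectively.
 */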

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
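
/* Illustrative note (not in the original): each 12-byte row above repeats a
 * 4-entry pattern three times, so with o = x & 3 and 0 <= i < 8,
 * row[o + i] == row[(x + i) & 3]; one unaligned 8-byte vld1 therefore
 * fetches the dither value for 8 consecutive x positions.
 */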

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{
    /* select row and offset for dither array */
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

    /* rescale alpha to range 0 - 256 */
    int scale = SkAlpha255To256(alpha);

    asm volatile (
                  "vld1.8         {d31}, [%[dstart]]              \n\t"   // load dither values
                  "vshr.u8        d30, d31, #1                    \n\t"   // calc. green dither values
                  "vdup.16        d6, %[scale]                    \n\t"   // duplicate scale into neon reg
                  "vmov.i8        d29, #0x3f                      \n\t"   // set up green mask
                  "vmov.i8        d28, #0x1f                      \n\t"   // set up blue mask
                  "1:                                                 \n\t"
                  "vld4.8         {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
                  "vshr.u8        d22, d0, #5                     \n\t"   // calc. red >> 5
                  "vshr.u8        d23, d1, #6                     \n\t"   // calc. green >> 6
                  "vshr.u8        d24, d2, #5                     \n\t"   // calc. blue >> 5
                  "vaddl.u8       q8, d0, d31                     \n\t"   // add in dither to red and widen
                  "vaddl.u8       q9, d1, d30                     \n\t"   // add in dither to green and widen
                  "vaddl.u8       q10, d2, d31                    \n\t"   // add in dither to blue and widen
                  "vsubw.u8       q8, q8, d22                     \n\t"   // sub shifted red from result
                  "vsubw.u8       q9, q9, d23                     \n\t"   // sub shifted green from result
                  "vsubw.u8       q10, q10, d24                   \n\t"   // sub shifted blue from result
                  "vshrn.i16      d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
                  "vshrn.i16      d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
                  "vshrn.i16      d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
                  // load 8 pixels from dst, extract rgb
                  "vld1.16        {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
                  "vshrn.i16      d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
                  "vmovn.i16      d18, q0                         \n\t"   // narrow to get blue as bytes
                  "vshr.u16       q0, q0, #11                     \n\t"   // shift down to extract red
                  "vand           d17, d17, d29                   \n\t"   // and green with green mask
                  "vand           d18, d18, d28                   \n\t"   // and blue with blue mask
                  "vmovn.i16      d16, q0                         \n\t"   // narrow to get red as bytes
                  // src = {d22 (r), d23 (g), d24 (b)}
                  // dst = {d16 (r), d17 (g), d18 (b)}
                  // per-channel difference, src - dst, widened to 16 bits
                  "vsubl.s8       q0, d22, d16                    \n\t"   // red:   src - dst
                  "vsubl.s8       q1, d23, d17                    \n\t"   // green: src - dst
                  "vsubl.s8       q2, d24, d18                    \n\t"   // blue:  src - dst
                  // multiply diffs by scale and shift
                  "vmul.i16       q0, q0, d6[0]                   \n\t"   // multiply red by scale
                  "vmul.i16       q1, q1, d6[0]                   \n\t"   // multiply green by scale
                  "vmul.i16       q2, q2, d6[0]                   \n\t"   // multiply blue by scale
                  "subs           %[count], %[count], #8          \n\t"   // decrement loop counter
                  "vshrn.i16      d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
                  "vshrn.i16      d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
                  "vshrn.i16      d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
                  // add dst to result
                  "vaddl.s8       q0, d0, d16                     \n\t"   // add dst to red
                  "vaddl.s8       q1, d2, d17                     \n\t"   // add dst to green
                  "vaddl.s8       q2, d4, d18                     \n\t"   // add dst to blue
                  // put result into 565 format
                  "vsli.i16       q2, q1, #5                      \n\t"   // shift up green and insert into blue
                  "vsli.i16       q2, q0, #11                     \n\t"   // shift up red and insert into blue
                  "vst1.16        {d4, d5}, [%[dst]]!             \n\t"   // store result
                  "bgt            1b                              \n\t"   // loop if count > 0
                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                  : [dstart] "r" (dstart), [scale] "r" (scale)
                  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                  );

    DITHER_565_SCAN(y);

    while((count & 7) > 0)
    {
        SkPMColor c = *src++;

        int dither = DITHER_VALUE(x);
        int sr = SkGetPackedR32(c);
        int sg = SkGetPackedG32(c);
        int sb = SkGetPackedB32(c);
        sr = SkDITHER_R32To565(sr, dither);
        sg = SkDITHER_G32To565(sg, dither);
        sb = SkDITHER_B32To565(sb, dither);

        uint16_t d = *dst;
        *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                             SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                             SkAlphaBlend(sb, SkGetPackedB16(d), scale));
        DITHER_INC_X(x);
        count--;
    }
}
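
/* Reference sketch (not in the original): the per-channel interpolation the
 * NEON loop above performs with vsubl/vmul/vshrn/vaddl; it matches the
 * SkAlphaBlend() calls in the scalar tail, with scale in 0..256.
 */
static inline int blend_565_channel_sketch(int src, int dst, int scale) {
    return dst + (((src - dst) * scale) >> 8);
}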

void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {

    uint8x8_t alpha_mask;

    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    /* do the NEON unrolled code */
#define    UNROLL    4
    while (count >= UNROLL) {
        uint8x8_t src_raw, dst_raw, dst_final;
        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if    UNROLL > 2
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if    UNROLL > 2
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

    /* 1st and 2nd bits of the unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
#if 1
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
        alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
        alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);
    }

#if    UNROLL > 2
    /* the 3rd and 4th bits of our unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
#if 1
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
        alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
        alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
    }
#endif

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if    UNROLL > 2
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

        src += UNROLL;
        dst += UNROLL;
        count -= UNROLL;
    }
#undef    UNROLL

    /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}
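
/* Reference note (not in the original): the identity the loop above leans on.
 * Since SkAlpha255To256(a) == a + 1, the dst scale SkAlpha255To256(255 - a)
 * collapses to 256 - a, so one vsubw_u8 from a vector of 256 yields the
 * per-lane scale directly:
 *   SkPMSrcOver(s, d) == s + SkAlphaMulQ(d, 256 - SkGetPackedA32(s))
 */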


/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    if (count > 0) {
        uint16_t src_scale = SkAlpha255To256(alpha);
        uint16_t dst_scale = 256 - src_scale;
    /* run them N at a time through the NEON unit */
    /* note that each pixel is 4 bytes, and the four bytes are treated
     * exactly the same, so we can work under that guise. We *do* know
     * that the src & dst will be 32-bit aligned quantities, so we can
     * specify that on the load/store ops and do a neon 'reinterpret' to
     * get us to byte-sized (pun intended) pieces that we widen/multiply/
     * shift. We're limited to 128 bits in the wide ops, which is 8x16 bits
     * or a pair of 32-bit src/dsts.
     */
    /* we *could* manually unroll this loop so that we load 128 bits
     * (as a pair of 64s) from each of src and dst, processing them
     * in pieces. This might give us a little better management of
     * the memory latency, but my initial attempts here did not
     * produce an instruction stream that looked all that nice.
     */
#define    UNROLL    2
    while (count >= UNROLL) {
        uint8x8_t  src_raw, dst_raw, dst_final;
        uint16x8_t  src_wide, dst_wide;

        /* get 64 bits of src, widen it, multiply by src_scale */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_wide = vmovl_u8(src_raw);
        /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
        src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));

        /* ditto with dst */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_wide = vmovl_u8(dst_raw);

        /* combine add with dst multiply into mul-accumulate */
        dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));

        dst_final = vshrn_n_u16(dst_wide, 8);
        vst1_u32(dst, vreinterpret_u32_u8(dst_final));

        src += UNROLL;
        dst += UNROLL;
        count -= UNROLL;
    }
    /* RBE: well, i don't like how gcc manages src/dst across the above
     * loop; it's constantly calculating src+bias, dst+bias and it only
     * adjusts the real ones when we leave the loop. Not sure why
     * it's "hoisting down" (hoisting implies above in my lexicon ;))
     * the adjustments to src/dst/count, but it does...
     * (might be SSA-style internal logic...)
     */

#if    UNROLL == 2
    if (count == 1) {
            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
    }
#else
    if (count > 0) {
            do {
                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
                src += 1;
                dst += 1;
            } while (--count > 0);
    }
#endif

#undef    UNROLL
    }
}
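
/* Reference sketch (not in the original): what each byte lane of the loop
 * above computes; vmlaq_u16 fuses the dst multiply and the add. Because
 * src_scale + dst_scale == 256, the 16-bit accumulator cannot overflow.
 */
static inline uint8_t blend32_lane_sketch(uint8_t s, uint8_t d,
                                          uint16_t src_scale, uint16_t dst_scale) {
    return (uint8_t)((s * src_scale + d * dst_scale) >> 8);
}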

///////////////////////////////////////////////////////////////////////////////

#undef    DEBUG_OPAQUE_DITHER

#if    defined(DEBUG_OPAQUE_DITHER)
static void showme8(const char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, "   %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(const char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t));    /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif

void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8

    if (count >= UNROLL) {
    uint8x8_t dbase;

#if    defined(DEBUG_OPAQUE_DITHER)
    uint16_t tmpbuf[UNROLL];
    int td[UNROLL];
    int tdv[UNROLL];
    int ta[UNROLL];
    int tap[UNROLL];
    uint16_t in_dst[UNROLL];
    int offset = 0;
    int noisy = 0;
#endif

    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    dbase = vld1_u8(dstart);

        do {
        uint8x8_t sr, sg, sb, sa, d;
        uint16x8_t dst8, scale8, alpha8;
        uint16x8_t dst_r, dst_g, dst_b;

#if    defined(DEBUG_OPAQUE_DITHER)
    /* calculate 8 elements worth into a temp buffer */
    {
      int my_y = y;
      int my_x = x;
      SkPMColor* my_src = (SkPMColor*)src;
      uint16_t* my_dst = dst;
      int i;

          DITHER_565_SCAN(my_y);
          for(i=0;i<UNROLL;i++) {
            SkPMColor c = *my_src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
        tdv[i] = DITHER_VALUE(my_x);
        ta[i] = a;
        tap[i] = SkAlpha255To256(a);
        td[i] = d;

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
        td[i] = d;

            } else {
        tmpbuf[i] = *my_dst;
        ta[i] = tdv[i] = td[i] = 0xbeef;
        }
        in_dst[i] = *my_dst;
            my_dst += 1;
            DITHER_INC_X(my_x);
          }
    }
#endif

        /* source is in ABGR */
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
            : "r" (src)
                    );
            sr = d0; sg = d1; sb = d2; sa = d3;
        }

        /* calculate 'd', which will be 0..7 */
        /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
#if defined(SK_BUILD_FOR_ANDROID)
        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
        alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
#else
        alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
#endif
        alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
        d = vshrn_n_u16(alpha8, 8);    /* narrowing too */

        /* sr = sr - (sr>>5) + d */
        /* watching for 8-bit overflow.  d is 0..7; risky range of
         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
         * safe as long as we do ((sr-sr>>5) + d) */
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        sr = vadd_u8(sr, d);

        /* sb = sb - (sb>>5) + d */
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        sb = vadd_u8(sb, d);

        /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        sg = vadd_u8(sg, vshr_n_u8(d,1));

        /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
        dst8 = vld1q_u16(dst);
        dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
        dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
        dst_r = vshrq_n_u16(dst8,11);    /* clearing hi bits */

        /* blend */
#if 1
        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
        /* originally 255-sa + 1 */
        scale8 = vsubw_u8(vdupq_n_u16(256), sa);
#else
        scale8 = vsubw_u8(vdupq_n_u16(255), sa);
        scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
#endif

#if 1
        /* combine the addq and mul, save 3 insns */
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
#else
        /* known correct, but +3 insns over above */
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmulq_u16(dst_b, scale8);
        dst_g = vmulq_u16(dst_g, scale8);
        dst_r = vmulq_u16(dst_r, scale8);

        /* combine */
        /* NB: vshll widens, need to preserve those bits */
        dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
        dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
        dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
#endif

        /* repack to store */
        dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

        vst1q_u16(dst, dst8);

#if    defined(DEBUG_OPAQUE_DITHER)
        /* verify my 8 elements match the temp buffer */
    {
       int i, bad=0;
       static int invocation;

       for (i=0;i<UNROLL;i++)
        if (tmpbuf[i] != dst[i]) bad=1;
       if (bad) {
        SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
            invocation, offset);
        SkDebugf("  alpha 0x%x\n", alpha);
        for (i=0;i<UNROLL;i++)
            SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
            i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
            dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);

        showme16("alpha8", &alpha8, sizeof(alpha8));
        showme16("scale8", &scale8, sizeof(scale8));
        showme8("d", &d, sizeof(d));
        showme16("dst8", &dst8, sizeof(dst8));
        showme16("dst_b", &dst_b, sizeof(dst_b));
        showme16("dst_g", &dst_g, sizeof(dst_g));
        showme16("dst_r", &dst_r, sizeof(dst_r));
        showme8("sb", &sb, sizeof(sb));
        showme8("sg", &sg, sizeof(sg));
        showme8("sr", &sr, sizeof(sr));

        /* cop out */
        return;
       }
       offset += UNROLL;
       invocation++;
    }
#endif

            dst += UNROLL;
        src += UNROLL;
        count -= UNROLL;
        /* skip x += UNROLL, since it's unchanged mod-4 */
        } while (count >= UNROLL);
    }
#undef    UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

/* 2009/10/27: RBE says "a work in progress"; debugging says ok;
 * speedup untested, but the ARM version is 26 insns/element and this
 * NEON version is 21 insns per iteration-of-8 (2.62 insns/element),
 * about a tenth of the per-element instruction count of the ARM
 * version; that's pure instruction counts, not accounting for any
 * instruction or memory latencies.
 */

#undef    DEBUG_S32_OPAQUE_DITHER

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8
    if (count >= UNROLL) {
    uint8x8_t d;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    d = vld1_u8(dstart);

    while (count >= UNROLL) {
        uint8x8_t sr, sg, sb, sa;
        uint16x8_t dr, dg, db, da;
        uint16x8_t dst8;

        /* source is in ABGR ordering (R == lsb) */
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
            : "r" (src)
                    );
            sr = d0; sg = d1; sb = d2; sa = d3;
        }
        /* XXX: if we want to prefetch, hide it in the above asm()
         * using the gcc __builtin_prefetch(), the prefetch will
         * fall to the bottom of the loop -- it won't stick up
         * at the top of the loop, just after the vld4.
         */

        /* sr = sr - (sr>>5) + d */
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        dr = vaddl_u8(sr, d);

        /* sb = sb - (sb>>5) + d */
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        db = vaddl_u8(sb, d);

        /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        dg = vaddl_u8(sg, vshr_n_u8(d,1));
        /* XXX: check that the "d>>1" here is hoisted */

        /* pack high bits of each into 565 format  (rgb, b is lsb) */
        dst8 = vshrq_n_u16(db, 3);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);

        /* store it */
        vst1q_u16(dst, dst8);

#if    defined(DEBUG_S32_OPAQUE_DITHER)
        /* always good to know if we generated good results */
        {
        int i, myx = x, myy = y;
        DITHER_565_SCAN(myy);
        for (i=0;i<UNROLL;i++) {
            SkPMColor c = src[i];
            unsigned dither = DITHER_VALUE(myx);
            uint16_t val = SkDitherRGB32To565(c, dither);
            if (val != dst[i]) {
            SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                c, dither, val, dst[i], dstart[i]);
            }
            DITHER_INC_X(myx);
        }
        }
#endif

        dst += UNROLL;
        src += UNROLL;
        count -= UNROLL;
        x += UNROLL;        /* probably superfluous */
    }
    }
#undef    UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
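
/* Reference sketch (not in the original): the dithered 8 -> 5 bit rounding
 * used above, per channel, with d in 0..7 (green uses >> 6, >> 2 and d >> 1
 * for its 6 bits). Subtracting c >> 5 rescales 0..255 to 0..248 first, so
 * adding the dither cannot carry out of the byte.
 */
static inline unsigned dither_8to5_sketch(unsigned c, unsigned d) {
    return ((c - (c >> 5)) + d) >> 3;
}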

void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        sk_memset32(dst, color, count);
    } else {
        unsigned scale = 256 - SkAlpha255To256(colorA);

        if (count >= 8) {
            // at the end of this assembly, count will have been decremented
            // to a negative value. That is, if count mod 8 = x, it will be
            // -8 + x coming out.
            asm volatile (
                PLD128(src, 0)

                "vdup.32    q0, %[color]                \n\t"

                PLD128(src, 128)

                // scale numerical interval [0-255], so load as 8 bits
                "vdup.8     d2, %[scale]                \n\t"

                PLD128(src, 256)

                "subs       %[count], %[count], #8      \n\t"

                PLD128(src, 384)

                "Loop_Color32:                          \n\t"

                // load src color, 8 pixels, 4 64 bit registers
                // (and increment src).
                "vld1.32    {d4-d7}, [%[src]]!          \n\t"

                PLD128(src, 384)

                // multiply long by scale, 64 bits at a time,
                // destination into a 128 bit register.
                "vmull.u8   q4, d4, d2                  \n\t"
                "vmull.u8   q5, d5, d2                  \n\t"
                "vmull.u8   q6, d6, d2                  \n\t"
                "vmull.u8   q7, d7, d2                  \n\t"

                // shift the 128 bit registers, containing the 16
                // bit scaled values back to 8 bits, narrowing the
                // results to 64 bit registers.
                "vshrn.i16  d8, q4, #8                  \n\t"
                "vshrn.i16  d9, q5, #8                  \n\t"
                "vshrn.i16  d10, q6, #8                 \n\t"
                "vshrn.i16  d11, q7, #8                 \n\t"

                // adding back the color, using 128 bit registers.
                "vadd.i8    q6, q4, q0                  \n\t"
                "vadd.i8    q7, q5, q0                  \n\t"

                // store back the 8 calculated pixels (2 128 bit
                // registers), and increment dst.
                "vst1.32    {d12-d15}, [%[dst]]!        \n\t"

                "subs       %[count], %[count], #8      \n\t"
                "bge        Loop_Color32                \n\t"
                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                : [color] "r" (color), [scale] "r" (scale)
                : "cc", "memory",
                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
                          );
            // At this point, if we went through the inline assembly, count is
            // a negative value:
            // if the value is -8, there is no pixel left to process.
            // if the value is -7, there is one pixel left to process
            // ...
            // And'ing it with 7 will give us the number of pixels
            // left to process.
            count = count & 0x7;
        }

        while (count > 0) {
            *dst = color + SkAlphaMulQ(*src, scale);
            src += 1;
            dst += 1;
            count--;
        }
    }
}
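
/* Reference note (not in the original): the per-pixel operation above. With
 * colorA < 255, scale == 256 - SkAlpha255To256(colorA) == 255 - colorA, and
 * each output pixel is color + SkAlphaMulQ(src, scale), exactly as in the
 * scalar tail; the NEON body computes the same thing on all four byte lanes
 * of eight pixels per iteration.
 */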

///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    // NOTE: For the two functions below, we don't have a special version
    //       that assumes that each source pixel is opaque. But our S32A is
    //       still faster than the default, so use it.
    S32A_D565_Opaque_neon,  // really S32_D565_Opaque
    S32A_D565_Blend_neon,   // really S32_D565_Blend
    S32A_D565_Opaque_neon,
    S32A_D565_Blend_neon,

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,   // S32A_D565_Blend_Dither
};

const SkBlitRow::Proc sk_blitrow_platform_4444_procs_arm_neon[] = {
    // no dither
    NULL,   // S32_D4444_Opaque,
    NULL,   // S32_D4444_Blend,
    NULL,   // S32A_D4444_Opaque,
    NULL,   // S32A_D4444_Blend,

    // dither
    NULL,   // S32_D4444_Opaque_Dither,
    NULL,   // S32_D4444_Blend_Dither,
    NULL,   // S32A_D4444_Opaque_Dither,
    NULL,   // S32A_D4444_Blend_Dither
};

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,                       // S32_Opaque,
    S32_Blend_BlitRow32_neon,   // S32_Blend,
    S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
    S32A_Blend_BlitRow32_arm    // S32A_Blend
};