SkBlitRow_opts_arm.cpp revision 568468094af358ce981f7319b2bc2b4996ac4bce
1/*
2 * Copyright 2009 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8
9#include "SkBlitRow.h"
10#include "SkBlitMask.h"
11#include "SkColorPriv.h"
12#include "SkDither.h"
13
14#if defined(__ARM_HAVE_NEON)
15#include <arm_neon.h>
16#endif
17
18#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
19static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
20                                  const SkPMColor* SK_RESTRICT src, int count,
21                                  U8CPU alpha, int /*x*/, int /*y*/) {
22    SkASSERT(255 == alpha);
23
24    if (count >= 8) {
25        uint16_t* SK_RESTRICT keep_dst;
26
27        asm volatile (
28                      "ands       ip, %[count], #7            \n\t"
29                      "vmov.u8    d31, #1<<7                  \n\t"
30                      "vld1.16    {q12}, [%[dst]]             \n\t"
31                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
32                      "moveq      ip, #8                      \n\t"
33                      "mov        %[keep_dst], %[dst]         \n\t"
34
35                      "add        %[src], %[src], ip, LSL#2   \n\t"
36                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
37                      "subs       %[count], %[count], ip      \n\t"
38                      "b          9f                          \n\t"
39                      // LOOP
40                      "2:                                         \n\t"
41
42                      "vld1.16    {q12}, [%[dst]]!            \n\t"
43                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
44                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
45                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
46                      "subs       %[count], %[count], #8      \n\t"
47                      "9:                                         \n\t"
48                      "pld        [%[dst],#32]                \n\t"
49                      // expand 0565 q12 to 8888 {d4-d7}
50                      "vmovn.u16  d4, q12                     \n\t"
51                      "vshr.u16   q11, q12, #5                \n\t"
52                      "vshr.u16   q10, q12, #6+5              \n\t"
53                      "vmovn.u16  d5, q11                     \n\t"
54                      "vmovn.u16  d6, q10                     \n\t"
55                      "vshl.u8    d4, d4, #3                  \n\t"
56                      "vshl.u8    d5, d5, #2                  \n\t"
57                      "vshl.u8    d6, d6, #3                  \n\t"
58
59                      "vmovl.u8   q14, d31                    \n\t"
60                      "vmovl.u8   q13, d31                    \n\t"
61                      "vmovl.u8   q12, d31                    \n\t"
62
63                      // duplicate in 4/2/1 & 8pix vsns
64                      "vmvn.8     d30, d3                     \n\t"
65                      "vmlal.u8   q14, d30, d6                \n\t"
66                      "vmlal.u8   q13, d30, d5                \n\t"
67                      "vmlal.u8   q12, d30, d4                \n\t"
68                      "vshr.u16   q8, q14, #5                 \n\t"
69                      "vshr.u16   q9, q13, #6                 \n\t"
70                      "vaddhn.u16 d6, q14, q8                 \n\t"
71                      "vshr.u16   q8, q12, #5                 \n\t"
72                      "vaddhn.u16 d5, q13, q9                 \n\t"
73                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
74                      "vaddhn.u16 d4, q12, q8                 \n\t"
75                      // intentionally don't calculate alpha
76                      // result in d4-d6
77
78                      "vqadd.u8   d5, d5, d1                  \n\t"
79                      "vqadd.u8   d4, d4, d2                  \n\t"
80
81                      // pack 8888 {d4-d6} to 0565 q10
82                      "vshll.u8   q10, d6, #8                 \n\t"
83                      "vshll.u8   q3, d5, #8                  \n\t"
84                      "vshll.u8   q2, d4, #8                  \n\t"
85                      "vsri.u16   q10, q3, #5                 \n\t"
86                      "vsri.u16   q10, q2, #11                \n\t"
87
88                      "bne        2b                          \n\t"
89
90                      "1:                                         \n\t"
91                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
92                      : [count] "+r" (count), [dst] "+r" (dst), [src] "+r" (src), [keep_dst] "=&r" (keep_dst)
93                      :
94                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
95                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
96                      "d30","d31"
97                      );
98    }
99    else
100    {   // handle count < 8
101        uint16_t* SK_RESTRICT keep_dst;
102
103        asm volatile (
104                      "vmov.u8    d31, #1<<7                  \n\t"
105                      "mov        %[keep_dst], %[dst]         \n\t"
106
107                      "tst        %[count], #4                \n\t"
108                      "beq        14f                         \n\t"
109                      "vld1.16    {d25}, [%[dst]]!            \n\t"
110                      "vld1.32    {q1}, [%[src]]!             \n\t"
111
112                      "14:                                        \n\t"
113                      "tst        %[count], #2                \n\t"
114                      "beq        12f                         \n\t"
115                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
116                      "vld1.32    {d1}, [%[src]]!             \n\t"
117
118                      "12:                                        \n\t"
119                      "tst        %[count], #1                \n\t"
120                      "beq        11f                         \n\t"
121                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
122                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"
123
124                      "11:                                        \n\t"
125                      // unzips achieve the same as a vld4 operation
126                      "vuzp.u16   q0, q1                      \n\t"
127                      "vuzp.u8    d0, d1                      \n\t"
128                      "vuzp.u8    d2, d3                      \n\t"
129                      // expand 0565 q12 to 8888 {d4-d7}
130                      "vmovn.u16  d4, q12                     \n\t"
131                      "vshr.u16   q11, q12, #5                \n\t"
132                      "vshr.u16   q10, q12, #6+5              \n\t"
133                      "vmovn.u16  d5, q11                     \n\t"
134                      "vmovn.u16  d6, q10                     \n\t"
135                      "vshl.u8    d4, d4, #3                  \n\t"
136                      "vshl.u8    d5, d5, #2                  \n\t"
137                      "vshl.u8    d6, d6, #3                  \n\t"
138
139                      "vmovl.u8   q14, d31                    \n\t"
140                      "vmovl.u8   q13, d31                    \n\t"
141                      "vmovl.u8   q12, d31                    \n\t"
142
143                      // duplicate in 4/2/1 & 8pix vsns
144                      "vmvn.8     d30, d3                     \n\t"
145                      "vmlal.u8   q14, d30, d6                \n\t"
146                      "vmlal.u8   q13, d30, d5                \n\t"
147                      "vmlal.u8   q12, d30, d4                \n\t"
148                      "vshr.u16   q8, q14, #5                 \n\t"
149                      "vshr.u16   q9, q13, #6                 \n\t"
150                      "vaddhn.u16 d6, q14, q8                 \n\t"
151                      "vshr.u16   q8, q12, #5                 \n\t"
152                      "vaddhn.u16 d5, q13, q9                 \n\t"
153                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
154                      "vaddhn.u16 d4, q12, q8                 \n\t"
155                      // intentionally don't calculate alpha
156                      // result in d4-d6
157
158                      "vqadd.u8   d5, d5, d1                  \n\t"
159                      "vqadd.u8   d4, d4, d2                  \n\t"
160
161                      // pack 8888 {d4-d6} to 0565 q10
162                      "vshll.u8   q10, d6, #8                 \n\t"
163                      "vshll.u8   q3, d5, #8                  \n\t"
164                      "vshll.u8   q2, d4, #8                  \n\t"
165                      "vsri.u16   q10, q3, #5                 \n\t"
166                      "vsri.u16   q10, q2, #11                \n\t"
167
168                      // store
169                      "tst        %[count], #4                \n\t"
170                      "beq        24f                         \n\t"
171                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
172
173                      "24:                                        \n\t"
174                      "tst        %[count], #2                \n\t"
175                      "beq        22f                         \n\t"
176                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
177
178                      "22:                                        \n\t"
179                      "tst        %[count], #1                \n\t"
180                      "beq        21f                         \n\t"
181                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
182
183                      "21:                                        \n\t"
184                      : [count] "+r" (count), [dst] "+r" (dst), [src] "+r" (src), [keep_dst] "=&r" (keep_dst)
185                      :
186                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
187                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
188                      "d30","d31"
189                      );
190    }
191}
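
/* For reference only -- a minimal scalar sketch of what the NEON path above computes
 * per pixel. This is an assumption modelled on the portable S32A_D565_Opaque in
 * src/core/SkBlitRow_D16.cpp and on SkSrcOver32To16() from SkColorPriv.h; it is not
 * part of this file's build (kept under #if 0), and the name is hypothetical.
 */
#if 0
static void S32A_D565_Opaque_ref(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src, int count,
                                 U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);
    while (--count >= 0) {
        SkPMColor c = *src++;
        if (c) {
            // src over dst: scale the expanded dst by (255 - srcA), add src, repack to 565
            *dst = SkSrcOver32To16(c, *dst);
        }
        dst += 1;
    }
}
#endif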
192
193static void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
194                                 const SkPMColor* SK_RESTRICT src, int count,
195                                 U8CPU alpha, int /*x*/, int /*y*/) {
196
197    U8CPU alpha_for_asm = alpha;
198
199    asm volatile (
200    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
201     * the original in two respects:
202     *  1. The results have a few mismatches compared to the original code. These mismatches
203     *     never exceed 1. It's possible to improve accuracy vs. a floating point
204     *     implementation by introducing rounding right shifts (vrshr) for the final stage.
205     *     Rounding is not present in the code below, because although results would be closer
206     *     to a floating point implementation, the number of mismatches compared to the
207     *     original code would be far greater.
208     *  2. On certain inputs, the original code can overflow, causing colour channels to
209     *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
210     *     to affect another.
211     */
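    /* In outline, the per-channel arithmetic below (compare the scalar tail loop at
     * the end of this function) is:
     *     src_scale = alpha + 1                                        (0..256)
     *     dst_scale = 255 - ((src_alpha * src_scale) >> 8)
     *     result_c  = (src_c * src_scale + dst_c * dst_scale) >> 8     (rounding shift)
     * with the src channels first reduced to 5/6/5-bit widths and the dst channels
     * unpacked from 565.
     */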
212
213#if 1
214		/* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
215                  "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
216#else
217                  "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
218#endif
219                  "vmov.u16   q3, #255                        \n\t"   // set up constant
220                  "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
221                  "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
222                  "beq        2f                              \n\t"   // if count8 == 0, exit
223                  "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask
224
225                  "1:                                             \n\t"
226                  "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
227                  "subs       r4, r4, #1                      \n\t"   // decrement loop counter
228                  "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
229                  //  and deinterleave
230
231                  "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
232                  "vand       q10, q0, q15                    \n\t"   // extract blue
233                  "vshr.u16   q8, q0, #11                     \n\t"   // extract red
234                  "vshr.u16   q9, q9, #10                     \n\t"   // extract green
235                  // dstrgb = {q8, q9, q10}
236
237                  "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
238                  "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
239                  "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range
240
241                  "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
242                  "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
243                  "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
244                  "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
245                  // srcrgba = {q11, q12, q13, q14}
246
247                  "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
248                  "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
249                  "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
250                  "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale
251
252                  "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
253                  "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
254                  // dst_scale = q2
255
256                  "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
257                  "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
258                  "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale
259
260#if 1
261	// trying for a better match with SkDiv255Round(a)
262	// C alg is:  a+=128; (a+a>>8)>>8
263	// we'll use just a rounding shift [q2 is available for scratch]
264                  "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
265                  "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
266                  "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
267#else
268	// arm's original "truncating divide by 256"
269                  "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
270                  "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
271                  "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
272#endif
273
274                  "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
275                  "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
276                  "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr
277
278                  "bne        1b                              \n\t"   // if counter != 0, loop
279                  "2:                                             \n\t"   // exit
280
281                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
282                  :
283                  : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
284                  );
285
286    count &= 7;
287    if (count > 0) {
288        do {
289            SkPMColor sc = *src++;
290            if (sc) {
291                uint16_t dc = *dst;
292                unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
293                unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
294                unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
295                unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
296                *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
297            }
298            dst += 1;
299        } while (--count != 0);
300    }
301}
302
303/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
304 * each dither value is spaced out into byte lanes, and repeated
305 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
306 * start of each row.
307 */
308static const uint8_t gDitherMatrix_Neon[48] = {
309    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
310    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
311    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
312    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
313
314};
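
/* Illustrative sketch (not used by the blitters below, which inline this addressing
 * directly): fetching the eight dither values for a span starting at pixel (x, y).
 * Each row above repeats with period 4, so a single 8-byte load from offset (x & 3)
 * yields the dither value for each of the next eight pixels in order; e.g. x = 0,
 * y = 0 loads {0, 4, 1, 5, 0, 4, 1, 5}. The helper name is hypothetical.
 */
static inline uint8x8_t load_dither_values(int x, int y) {
    const uint8_t* dstart = &gDitherMatrix_Neon[(y & 3) * 12 + (x & 3)];
    return vld1_u8(dstart);
}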
315
316static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
317                                       int count, U8CPU alpha, int x, int y)
318{
319    /* select row and offset for dither array */
320    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
321
322    /* rescale alpha to range 0 - 256 */
323    int scale = SkAlpha255To256(alpha);
324
325    asm volatile (
326                  "vld1.8         {d31}, [%[dstart]]              \n\t"   // load dither values
327                  "vshr.u8        d30, d31, #1                    \n\t"   // calc. green dither values
328                  "vdup.16        d6, %[scale]                    \n\t"   // duplicate scale into neon reg
329                  "vmov.i8        d29, #0x3f                      \n\t"   // set up green mask
330                  "vmov.i8        d28, #0x1f                      \n\t"   // set up blue mask
331                  "1:                                                 \n\t"
332                  "vld4.8         {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
333                  "vshr.u8        d22, d0, #5                     \n\t"   // calc. red >> 5
334                  "vshr.u8        d23, d1, #6                     \n\t"   // calc. green >> 6
335                  "vshr.u8        d24, d2, #5                     \n\t"   // calc. blue >> 5
336                  "vaddl.u8       q8, d0, d31                     \n\t"   // add in dither to red and widen
337                  "vaddl.u8       q9, d1, d30                     \n\t"   // add in dither to green and widen
338                  "vaddl.u8       q10, d2, d31                    \n\t"   // add in dither to blue and widen
339                  "vsubw.u8       q8, q8, d22                     \n\t"   // sub shifted red from result
340                  "vsubw.u8       q9, q9, d23                     \n\t"   // sub shifted green from result
341                  "vsubw.u8       q10, q10, d24                   \n\t"   // sub shifted blue from result
342                  "vshrn.i16      d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
343                  "vshrn.i16      d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
344                  "vshrn.i16      d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
345                  // load 8 pixels from dst, extract rgb
346                  "vld1.16        {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
347                  "vshrn.i16      d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
348                  "vmovn.i16      d18, q0                         \n\t"   // narrow to get blue as bytes
349                  "vshr.u16       q0, q0, #11                     \n\t"   // shift down to extract red
350                  "vand           d17, d17, d29                   \n\t"   // and green with green mask
351                  "vand           d18, d18, d28                   \n\t"   // and blue with blue mask
352                  "vmovn.i16      d16, q0                         \n\t"   // narrow to get red as bytes
353                  // src = {d22 (r), d23 (g), d24 (b)}
354                  // dst = {d16 (r), d17 (g), d18 (b)}
355                  // subtract dst from src and widen
356                  "vsubl.s8       q0, d22, d16                    \n\t"   // red diff = src - dst, widened
357                  "vsubl.s8       q1, d23, d17                    \n\t"   // green diff = src - dst, widened
358                  "vsubl.s8       q2, d24, d18                    \n\t"   // blue diff = src - dst, widened
359                  // multiply diffs by scale and shift
360                  "vmul.i16       q0, q0, d6[0]                   \n\t"   // multiply red by scale
361                  "vmul.i16       q1, q1, d6[0]                   \n\t"   // multiply green by scale
362                  "vmul.i16       q2, q2, d6[0]                   \n\t"   // multiply blue by scale
363                  "subs           %[count], %[count], #8          \n\t"   // decrement loop counter
364                  "vshrn.i16      d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
365                  "vshrn.i16      d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
366                  "vshrn.i16      d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
367                  // add dst to result
368                  "vaddl.s8       q0, d0, d16                     \n\t"   // add dst to red
369                  "vaddl.s8       q1, d2, d17                     \n\t"   // add dst to green
370                  "vaddl.s8       q2, d4, d18                     \n\t"   // add dst to blue
371                  // put result into 565 format
372                  "vsli.i16       q2, q1, #5                      \n\t"   // shift up green and insert into blue
373                  "vsli.i16       q2, q0, #11                     \n\t"   // shift up red and insert into blue
374                  "vst1.16        {d4, d5}, [%[dst]]!             \n\t"   // store result
375                  "bgt            1b                              \n\t"   // loop if count > 0
376                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
377                  : [dstart] "r" (dstart), [scale] "r" (scale)
378                  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
379                  );
380
381    DITHER_565_SCAN(y);
382
383    while((count & 7) > 0)
384    {
385        SkPMColor c = *src++;
386
387        int dither = DITHER_VALUE(x);
388        int sr = SkGetPackedR32(c);
389        int sg = SkGetPackedG32(c);
390        int sb = SkGetPackedB32(c);
391        sr = SkDITHER_R32To565(sr, dither);
392        sg = SkDITHER_G32To565(sg, dither);
393        sb = SkDITHER_B32To565(sb, dither);
394
395        uint16_t d = *dst;
396        *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
397                             SkAlphaBlend(sg, SkGetPackedG16(d), scale),
398                             SkAlphaBlend(sb, SkGetPackedB16(d), scale));
399        DITHER_INC_X(x);
400        count--;
401    }
402}
403
404#define S32A_D565_Opaque_PROC       S32A_D565_Opaque_neon
405#define S32A_D565_Blend_PROC        S32A_D565_Blend_neon
406#define S32_D565_Blend_Dither_PROC  S32_D565_Blend_Dither_neon
407#else
408#define S32A_D565_Opaque_PROC       NULL
409#define S32A_D565_Blend_PROC        NULL
410#define S32_D565_Blend_Dither_PROC  NULL
411#endif
412
413/* Don't have a special version that assumes each src is opaque, but our S32A
414    is still faster than the default, so use it here
415 */
416#define S32_D565_Opaque_PROC    S32A_D565_Opaque_PROC
417#define S32_D565_Blend_PROC     S32A_D565_Blend_PROC
418
419///////////////////////////////////////////////////////////////////////////////
420
421#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
422
423static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
424                                  const SkPMColor* SK_RESTRICT src,
425                                  int count, U8CPU alpha) {
426
427    SkASSERT(255 == alpha);
428    if (count > 0) {
429
430
431	uint8x8_t alpha_mask;
432
433	static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
434	alpha_mask = vld1_u8(alpha_mask_setup);
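	/* Used with vtbl1_u8 below: this mask copies byte 3 (the alpha of the first
	 * pixel) into lanes 0-3 and byte 7 (the alpha of the second pixel) into lanes
	 * 4-7, so every byte lane ends up holding its own pixel's alpha (alpha sits in
	 * the top byte of an SkPMColor on this little-endian configuration). */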
435
436	/* do the NEON unrolled code */
437#define	UNROLL	4
438	while (count >= UNROLL) {
439	    uint8x8_t src_raw, dst_raw, dst_final;
440	    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
441
442	    /* get the source */
443	    src_raw = vreinterpret_u8_u32(vld1_u32(src));
444#if	UNROLL > 2
445	    src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
446#endif
447
448	    /* get and hold the dst too */
449	    dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
450#if	UNROLL > 2
451	    dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
452#endif
453
454	/* 1st and 2nd bits of the unrolling */
455	{
456	    uint8x8_t dst_cooked;
457	    uint16x8_t dst_wide;
458	    uint8x8_t alpha_narrow;
459	    uint16x8_t alpha_wide;
460
461	    /* get the alphas spread out properly */
462	    alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
463#if 1
464	    /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
465	    /* we collapsed (255-a)+1 ... */
466	    alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
467#else
468	    alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
469	    alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
470#endif
471
472	    /* spread the dest */
473	    dst_wide = vmovl_u8(dst_raw);
474
475	    /* alpha mul the dest */
476	    dst_wide = vmulq_u16 (dst_wide, alpha_wide);
477	    dst_cooked = vshrn_n_u16(dst_wide, 8);
478
479	    /* sum -- ignoring any byte lane overflows */
480	    dst_final = vadd_u8(src_raw, dst_cooked);
481	}
482
483#if	UNROLL > 2
484	/* the 3rd and 4th bits of our unrolling */
485	{
486	    uint8x8_t dst_cooked;
487	    uint16x8_t dst_wide;
488	    uint8x8_t alpha_narrow;
489	    uint16x8_t alpha_wide;
490
491	    alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
492#if 1
493	    /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
494	    /* we collapsed (255-a)+1 ... */
495	    alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
496#else
497	    alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
498	    alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
499#endif
500
501	    /* spread the dest */
502	    dst_wide = vmovl_u8(dst_raw_2);
503
504	    /* alpha mul the dest */
505	    dst_wide = vmulq_u16 (dst_wide, alpha_wide);
506	    dst_cooked = vshrn_n_u16(dst_wide, 8);
507
508	    /* sum -- ignoring any byte lane overflows */
509	    dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
510	}
511#endif
512
513	    vst1_u32(dst, vreinterpret_u32_u8(dst_final));
514#if	UNROLL > 2
515	    vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
516#endif
517
518	    src += UNROLL;
519	    dst += UNROLL;
520	    count -= UNROLL;
521	}
522#undef	UNROLL
523
524	/* do any residual iterations */
525        while (--count >= 0) {
526#ifdef TEST_SRC_ALPHA
527            SkPMColor sc = *src;
528            if (sc) {
529                unsigned srcA = SkGetPackedA32(sc);
530                SkPMColor result = sc;
531                if (srcA != 255) {
532                    result = SkPMSrcOver(sc, *dst);
533                }
534                *dst = result;
535            }
536#else
537            *dst = SkPMSrcOver(*src, *dst);
538#endif
539            src += 1;
540            dst += 1;
541        }
542    }
543}
544
545#define	S32A_Opaque_BlitRow32_PROC	S32A_Opaque_BlitRow32_neon
546
547#else
548
549#ifdef TEST_SRC_ALPHA
550#error The ARM asm version of S32A_Opaque_BlitRow32 does not support TEST_SRC_ALPHA
551#endif
552
553static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
554                                  const SkPMColor* SK_RESTRICT src,
555                                  int count, U8CPU alpha) {
556
557    SkASSERT(255 == alpha);
558
559    /* Does not support the TEST_SRC_ALPHA case */
560    asm volatile (
561                  "cmp    %[count], #0               \n\t" /* comparing count with 0 */
562                  "beq    3f                         \n\t" /* if zero exit */
563
564                  "mov    ip, #0xff                  \n\t" /* load the 0xff mask in ip */
565                  "orr    ip, ip, ip, lsl #16        \n\t" /* convert it to 0xff00ff in ip */
566
567                  "cmp    %[count], #2               \n\t" /* compare count with 2 */
568                  "blt    2f                         \n\t" /* if less than 2 -> single loop */
569
570                  /* Double Loop */
571                  "1:                                \n\t" /* <double loop> */
572                  "ldm    %[src]!, {r5,r6}           \n\t" /* load the src(s) at r5-r6 */
573                  "ldm    %[dst], {r7,r8}            \n\t" /* loading dst(s) into r7-r8 */
574                  "lsr    r4, r5, #24                \n\t" /* extracting the alpha from source and storing it to r4 */
575
576                  /* ----------- */
577                  "and    r9, ip, r7                 \n\t" /* r9 = br masked by ip */
578                  "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
579                  "and    r10, ip, r7, lsr #8        \n\t" /* r10 = ag masked by ip */
580
581                  "mul    r9, r9, r4                 \n\t" /* br = br * scale */
582                  "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
583                  "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */
584
585                  "and    r10, r10, ip, lsl #8       \n\t" /* mask ag with reverse mask */
586                  "lsr    r4, r6, #24                \n\t" /* extracting the alpha from source and storing it to r4 */
587                  "orr    r7, r9, r10                \n\t" /* br | ag*/
588
589                  "add    r7, r5, r7                 \n\t" /* dst = src + calc dest(r7) */
590                  "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
591
592                  /* ----------- */
593                  "and    r9, ip, r8                 \n\t" /* r9 = br masked by ip */
594
595                  "and    r10, ip, r8, lsr #8        \n\t" /* r10 = ag masked by ip */
596                  "mul    r9, r9, r4                 \n\t" /* br = br * scale */
597                  "sub    %[count], %[count], #2     \n\t"
598                  "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
599
600                  "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */
601                  "and    r10, r10, ip, lsl #8       \n\t" /* mask ag with reverse mask */
602                  "cmp    %[count], #1               \n\t" /* comparing count with 1 */
603                  "orr    r8, r9, r10                \n\t" /* br | ag */
604
605                  "add    r8, r6, r8                 \n\t" /* dst = src + calc dest(r8) */
606
607                  /* ----------------- */
608                  "stm    %[dst]!, {r7,r8}           \n\t" /* *dst = r7, increment dst by two (each times 4) */
609                  /* ----------------- */
610
611                  "bgt    1b                         \n\t" /* if greater than 1 -> reloop */
612                  "blt    3f                         \n\t" /* if less than 1 -> exit */
613
614                  /* Single Loop */
615                  "2:                                \n\t" /* <single loop> */
616                  "ldr    r5, [%[src]], #4           \n\t" /* load the src pixel into r5, post-increment src */
617                  "ldr    r7, [%[dst]]               \n\t" /* loading dst into r7 */
618                  "lsr    r4, r5, #24                \n\t" /* extracting the alpha from source and storing it to r4 */
619
620                  /* ----------- */
621                  "and    r9, ip, r7                 \n\t" /* r9 = br masked by ip */
622                  "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
623
624                  "and    r10, ip, r7, lsr #8        \n\t" /* r10 = ag masked by ip */
625                  "mul    r9, r9, r4                 \n\t" /* br = br * scale */
626                  "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
627                  "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */
628
629                  "and    r10, r10, ip, lsl #8       \n\t" /* mask ag */
630                  "orr    r7, r9, r10                \n\t" /* br | ag */
631
632                  "add    r7, r5, r7                 \n\t" /* *dst = src + calc dest(r7) */
633
634                  /* ----------------- */
635                  "str    r7, [%[dst]], #4           \n\t" /* *dst = r7, increment dst by one (times 4) */
636                  /* ----------------- */
637
638                  "3:                                \n\t" /* <exit> */
639                  : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
640                  :
641                  : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory"
642                  );
643}
644#define	S32A_Opaque_BlitRow32_PROC	S32A_Opaque_BlitRow32_arm
645#endif
646
647/*
648 * ARM asm version of S32A_Blend_BlitRow32
649 */
650static void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
651                                 const SkPMColor* SK_RESTRICT src,
652                                 int count, U8CPU alpha) {
653    asm volatile (
654                  "cmp    %[count], #0               \n\t" /* comparing count with 0 */
655                  "beq    3f                         \n\t" /* if zero exit */
656
657                  "mov    r12, #0xff                 \n\t" /* load the 0xff mask in r12 */
658                  "orr    r12, r12, r12, lsl #16     \n\t" /* convert it to 0xff00ff in r12 */
659
660                  /* src1,2_scale */
661                  "add    %[alpha], %[alpha], #1     \n\t" /* loading %[alpha]=src_scale=alpha+1 */
662
663                  "cmp    %[count], #2               \n\t" /* comparing count with 2 */
664                  "blt    2f                         \n\t" /* if less than 2 -> single loop */
665
666                  /* Double Loop */
667                  "1:                                \n\t" /* <double loop> */
668                  "ldm    %[src]!, {r5, r6}          \n\t" /* loading src pointers into r5 and r6 */
669                  "ldm    %[dst], {r7, r8}           \n\t" /* loading dst pointers into r7 and r8 */
670
671                  /* dst1_scale and dst2_scale*/
672                  "lsr    r9, r5, #24                \n\t" /* src >> 24 */
673                  "lsr    r10, r6, #24               \n\t" /* src >> 24 */
674                  "smulbb r9, r9, %[alpha]           \n\t" /* r9 = SkMulS16 r9 with src_scale */
675                  "smulbb r10, r10, %[alpha]         \n\t" /* r10 = SkMulS16 r10 with src_scale */
676                  "lsr    r9, r9, #8                 \n\t" /* r9 >> 8 */
677                  "lsr    r10, r10, #8               \n\t" /* r10 >> 8 */
678                  "rsb    r9, r9, #256               \n\t" /* dst1_scale = r9 = 255 - r9 + 1 */
679                  "rsb    r10, r10, #256             \n\t" /* dst2_scale = r10 = 255 - r10 + 1 */
680
681                  /* ---------------------- */
682
683                  /* src1, src1_scale */
684                  "and    r11, r12, r5, lsr #8       \n\t" /* ag = r11 = r5 masked by r12 lsr by #8 */
685                  "and    r4, r12, r5                \n\t" /* rb = r4 = r5 masked by r12 */
686                  "mul    r11, r11, %[alpha]         \n\t" /* ag = r11 times src_scale */
687                  "mul    r4, r4, %[alpha]           \n\t" /* rb = r4 times src_scale */
688                  "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
689                  "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
690                  "orr    r5, r11, r4                \n\t" /* r5 = (src1, src_scale) */
691
692                  /* dst1, dst1_scale */
693                  "and    r11, r12, r7, lsr #8       \n\t" /* ag = r11 = r7 masked by r12 lsr by #8 */
694                  "and    r4, r12, r7                \n\t" /* rb = r4 = r7 masked by r12 */
695                  "mul    r11, r11, r9               \n\t" /* ag = r11 times dst_scale (r9) */
696                  "mul    r4, r4, r9                 \n\t" /* rb = r4 times dst_scale (r9) */
697                  "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
698                  "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
699                  "orr    r9, r11, r4                \n\t" /* r9 = (dst1, dst_scale) */
700
701                  /* ---------------------- */
702                  "add    r9, r5, r9                 \n\t" /* *dst = src plus dst both scaled */
703                  /* ---------------------- */
704
705                  /* ====================== */
706
707                  /* src2, src2_scale */
708                  "and    r11, r12, r6, lsr #8       \n\t" /* ag = r11 = r6 masked by r12 lsr by #8 */
709                  "and    r4, r12, r6                \n\t" /* rb = r4 = r6 masked by r12 */
710                  "mul    r11, r11, %[alpha]         \n\t" /* ag = r11 times src_scale */
711                  "mul    r4, r4, %[alpha]           \n\t" /* rb = r4 times src_scale */
712                  "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
713                  "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
714                  "orr    r6, r11, r4                \n\t" /* r6 = (src2, src_scale) */
715
716                  /* dst2, dst2_scale */
717                  "and    r11, r12, r8, lsr #8       \n\t" /* ag = r11 = r8 masked by r12 lsr by #8 */
718                  "and    r4, r12, r8                \n\t" /* rb = r4 = r8 masked by r12 */
719                  "mul    r11, r11, r10              \n\t" /* ag = r11 times dst_scale (r10) */
720                  "mul    r4, r4, r10                \n\t" /* rb = r4 times dst_scale (r10) */
721                  "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
722                  "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
723                  "orr    r10, r11, r4               \n\t" /* r10 = (dst2, dst_scale) */
724
725                  "sub    %[count], %[count], #2     \n\t" /* decrease count by 2 */
726                  /* ---------------------- */
727                  "add    r10, r6, r10               \n\t" /* *dst = src plus dst both scaled */
728                  /* ---------------------- */
729                  "cmp    %[count], #1               \n\t" /* compare count with 1 */
730                  /* ----------------- */
731                  "stm    %[dst]!, {r9, r10}         \n\t" /* store r9 and r10 to *dst, increment dst by two pixels */
732                  /* ----------------- */
733
734                  "bgt    1b                         \n\t" /* if %[count] greater than 1 reloop */
735                  "blt    3f                         \n\t" /* if %[count] less than 1 exit */
736                                                           /* else get into the single loop */
737                  /* Single Loop */
738                  "2:                                \n\t" /* <single loop> */
739                  "ldr    r5, [%[src]], #4           \n\t" /* load the src pixel into r5, post-increment src */
740                  "ldr    r7, [%[dst]]               \n\t" /* load the dst pixel into r7 */
741
742                  "lsr    r6, r5, #24                \n\t" /* src >> 24 */
743                  "and    r8, r12, r5, lsr #8        \n\t" /* ag = r8 = r5 masked by r12 lsr by #8 */
744                  "smulbb r6, r6, %[alpha]           \n\t" /* r6 = SkMulS16 with src_scale */
745                  "and    r9, r12, r5                \n\t" /* rb = r9 = r5 masked by r12 */
746                  "lsr    r6, r6, #8                 \n\t" /* r6 >> 8 */
747                  "mul    r8, r8, %[alpha]           \n\t" /* ag = r8 times scale */
748                  "rsb    r6, r6, #256               \n\t" /* r6 = 255 - r6 + 1 */
749
750                  /* src, src_scale */
751                  "mul    r9, r9, %[alpha]           \n\t" /* rb = r9 times scale */
752                  "and    r8, r8, r12, lsl #8        \n\t" /* ag masked by reverse mask (r12) */
753                  "orr    r10, r8, r9                \n\t" /* r10 = (src, src_scale) */
754                  "orr    r10, r8, r9                \n\t" /* r10 = (scr, src_scale) */
755
756                  /* dst, dst_scale */
757                  "and    r8, r12, r7, lsr #8        \n\t" /* ag = r8 = r7 masked by r12 lsr by #8 */
758                  "and    r9, r12, r7                \n\t" /* rb = r9 = r7 masked by r12 */
759                  "mul    r8, r8, r6                 \n\t" /* ag = r8 times scale (r6) */
760                  "mul    r9, r9, r6                 \n\t" /* rb = r9 times scale (r6) */
761                  "and    r8, r8, r12, lsl #8        \n\t" /* ag masked by reverse mask (r12) */
762                  "and    r9, r12, r9, lsr #8        \n\t" /* rb masked by mask (r12) */
763                  "orr    r7, r8, r9                 \n\t" /* r7 = (dst, dst_scale) */
764
765                  "add    r10, r7, r10               \n\t" /* *dst = src plus dst both scaled */
766
767                  /* ----------------- */
768                  "str    r10, [%[dst]], #4          \n\t" /* *dst = r10, postincrement dst by one (times 4) */
769                  /* ----------------- */
770
771                  "3:                                \n\t" /* <exit> */
772                  : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count), [alpha] "+r" (alpha)
773                  :
774                  : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "memory"
775                  );
776
777}
778#define	S32A_Blend_BlitRow32_PROC	S32A_Blend_BlitRow32_arm
779
780/* Neon version of S32_Blend_BlitRow32()
781 * portable version is in src/core/SkBlitRow_D32.cpp
782 */
783#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
784static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
785                                const SkPMColor* SK_RESTRICT src,
786                                int count, U8CPU alpha) {
787    SkASSERT(alpha <= 255);
788    if (count > 0) {
789        uint16_t src_scale = SkAlpha255To256(alpha);
790        uint16_t dst_scale = 256 - src_scale;
791
792	/* run them N at a time through the NEON unit */
793	/* note that each pixel is 4 bytes, with every byte treated exactly the same,
794	 * so we can work under that guise. We *do* know that the src&dst
795	 * will be 32-bit aligned quantities, so we can specify that on
796	 * the load/store ops and do a neon 'reinterpret' to get us to
797	 * byte-sized (pun intended) pieces that we widen/multiply/shift
798	 * we're limited at 128 bits in the wide ops, which is 8x16bits
799	 * or a pair of 32 bit src/dsts.
800	 */
801	/* we *could* manually unroll this loop so that we load 128 bits
802	 * (as a pair of 64s) from each of src and dst, processing them
803	 * in pieces. This might give us a little better management of
804	 * the memory latency, but my initial attempts here did not
805	 * produce an instruction stream that looked all that nice.
806	 */
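	/* Per byte lane (a, r, g, b of both pixels alike), the loop below computes
	 *     result = (src * src_scale + dst * dst_scale) >> 8
	 * which is essentially the same arithmetic as the scalar fallback
	 * SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale), differing only
	 * in where the >>8 truncation happens.
	 */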
807#define	UNROLL	2
808	while (count >= UNROLL) {
809	    uint8x8_t  src_raw, dst_raw, dst_final;
810	    uint16x8_t  src_wide, dst_wide;
811
812	    /* get 64 bits of src, widen it, multiply by src_scale */
813	    src_raw = vreinterpret_u8_u32(vld1_u32(src));
814	    src_wide = vmovl_u8(src_raw);
815	    /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
816	    src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
817
818	    /* ditto with dst */
819	    dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
820	    dst_wide = vmovl_u8(dst_raw);
821
822	    /* combine add with dst multiply into mul-accumulate */
823	    dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
824
825	    dst_final = vshrn_n_u16(dst_wide, 8);
826	    vst1_u32(dst, vreinterpret_u32_u8(dst_final));
827
828	    src += UNROLL;
829	    dst += UNROLL;
830	    count -= UNROLL;
831	}
832	/* RBE: well, i don't like how gcc manages src/dst across the above
833	 * loop: it's constantly calculating src+bias, dst+bias and it only
834	 * adjusts the real ones when we leave the loop. Not sure why
835	 * it's "hoisting down" (hoisting implies above in my lexicon ;))
836	 * the adjustments to src/dst/count, but it does...
837	 * (might be SSA-style internal logic...
838	 */
839
840#if	UNROLL == 2
841	if (count == 1) {
842            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
843	}
844#else
845	if (count > 0) {
846            do {
847                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
848                src += 1;
849                dst += 1;
850            } while (--count > 0);
851	}
852#endif
853
854#undef	UNROLL
855    }
856}
857
858#define	S32_Blend_BlitRow32_PROC	S32_Blend_BlitRow32_neon
859#else
860#define	S32_Blend_BlitRow32_PROC	NULL
861#endif
862
863///////////////////////////////////////////////////////////////////////////////
864
865#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
866
867#undef	DEBUG_OPAQUE_DITHER
868
869#if	defined(DEBUG_OPAQUE_DITHER)
870static void showme8(char *str, void *p, int len)
871{
872	static char buf[256];
873	char tbuf[32];
874	int i;
875	char *pc = (char*) p;
876	sprintf(buf,"%8s:", str);
877	for(i=0;i<len;i++) {
878	    sprintf(tbuf, "   %02x", pc[i]);
879	    strcat(buf, tbuf);
880	}
881	SkDebugf("%s\n", buf);
882}
883static void showme16(char *str, void *p, int len)
884{
885	static char buf[256];
886	char tbuf[32];
887	int i;
888	uint16_t *pc = (uint16_t*) p;
889	sprintf(buf,"%8s:", str);
890	len = (len / sizeof(uint16_t));	/* passed as bytes */
891	for(i=0;i<len;i++) {
892	    sprintf(tbuf, " %04x", pc[i]);
893	    strcat(buf, tbuf);
894	}
895	SkDebugf("%s\n", buf);
896}
897#endif
898
899static void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
900                                      const SkPMColor* SK_RESTRICT src,
901                                      int count, U8CPU alpha, int x, int y) {
902    SkASSERT(255 == alpha);
903
904#define	UNROLL	8
905
906    if (count >= UNROLL) {
907	uint8x8_t dbase;
908
909#if	defined(DEBUG_OPAQUE_DITHER)
910	uint16_t tmpbuf[UNROLL];
911	int td[UNROLL];
912	int tdv[UNROLL];
913	int ta[UNROLL];
914	int tap[UNROLL];
915	uint16_t in_dst[UNROLL];
916	int offset = 0;
917	int noisy = 0;
918#endif
919
920	const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
921	dbase = vld1_u8(dstart);
922
923        do {
924	    uint8x8_t sr, sg, sb, sa, d;
925	    uint16x8_t dst8, scale8, alpha8;
926	    uint16x8_t dst_r, dst_g, dst_b;
927
928#if	defined(DEBUG_OPAQUE_DITHER)
929	/* calculate 8 elements worth into a temp buffer */
930	{
931	  int my_y = y;
932	  int my_x = x;
933	  SkPMColor* my_src = (SkPMColor*)src;
934	  uint16_t* my_dst = dst;
935	  int i;
936
937          DITHER_565_SCAN(my_y);
938          for(i=0;i<UNROLL;i++) {
939            SkPMColor c = *my_src++;
940            SkPMColorAssert(c);
941            if (c) {
942                unsigned a = SkGetPackedA32(c);
943
944                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
945		tdv[i] = DITHER_VALUE(my_x);
946		ta[i] = a;
947		tap[i] = SkAlpha255To256(a);
948		td[i] = d;
949
950                unsigned sr = SkGetPackedR32(c);
951                unsigned sg = SkGetPackedG32(c);
952                unsigned sb = SkGetPackedB32(c);
953                sr = SkDITHER_R32_FOR_565(sr, d);
954                sg = SkDITHER_G32_FOR_565(sg, d);
955                sb = SkDITHER_B32_FOR_565(sb, d);
956
957                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
958                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
959                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
960                // now src and dst expanded are in g:11 r:10 x:1 b:10
961                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
962		td[i] = d;
963
964            } else {
965		tmpbuf[i] = *my_dst;
966		ta[i] = tdv[i] = td[i] = 0xbeef;
967	    }
968	    in_dst[i] = *my_dst;
969            my_dst += 1;
970            DITHER_INC_X(my_x);
971          }
972	}
973#endif
974
975	    /* source is in ABGR */
976	    {
977		register uint8x8_t d0 asm("d0");
978		register uint8x8_t d1 asm("d1");
979		register uint8x8_t d2 asm("d2");
980		register uint8x8_t d3 asm("d3");
981
982		asm ("vld4.8	{d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
983		    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
984		    : "r" (src)
985                    );
986		    sr = d0; sg = d1; sb = d2; sa = d3;
987	    }
988
989	    /* calculate 'd', which will be 0..7 */
990	    /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
991#if SK_BUILD_FOR_ANDROID
992	    /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
993	    alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
994#else
995	    alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
996#endif
997	    alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
998	    d = vshrn_n_u16(alpha8, 8);	/* narrowing too */
999
1000	    /* sr = sr - (sr>>5) + d */
1001	    /* watching for 8-bit overflow.  d is 0..7; risky range of
1002	     * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1003	     * safe  as long as we do ((sr-sr>>5) + d) */
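	    /* worked example at the risky end: sr = 255, d = 7 gives
	     * (255 - (255 >> 5)) + 7 = (255 - 7) + 7 = 255, so the u8 lanes cannot wrap. */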
1004	    sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1005	    sr = vadd_u8(sr, d);
1006
1007	    /* sb = sb - (sb>>5) + d */
1008	    sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1009	    sb = vadd_u8(sb, d);
1010
1011	    /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
1012	    sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1013	    sg = vadd_u8(sg, vshr_n_u8(d,1));
1014
1015	    /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
1016	    dst8 = vld1q_u16(dst);
1017	    dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
1018	    dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
1019	    dst_r = vshrq_n_u16(dst8,11);	/* clearing hi bits */
1020
1021	    /* blend */
1022#if 1
1023	    /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1024	    /* originally 255-sa + 1 */
1025	    scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1026#else
1027	    scale8 = vsubw_u8(vdupq_n_u16(255), sa);
1028	    scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
1029#endif
1030
1031#if 1
1032	    /* combine the addq and mul, save 3 insns */
1033	    scale8 = vshrq_n_u16(scale8, 3);
1034	    dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1035	    dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1036	    dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1037#else
1038	    /* known correct, but +3 insns over above */
1039	    scale8 = vshrq_n_u16(scale8, 3);
1040	    dst_b = vmulq_u16(dst_b, scale8);
1041	    dst_g = vmulq_u16(dst_g, scale8);
1042	    dst_r = vmulq_u16(dst_r, scale8);
1043
1044	    /* combine */
1045	    /* NB: vshll widens, need to preserve those bits */
1046	    dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
1047	    dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
1048	    dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
1049#endif
1050
1051	    /* repack to store */
1052	    dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
1053	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1054	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1055
1056	    vst1q_u16(dst, dst8);
1057
1058#if	defined(DEBUG_OPAQUE_DITHER)
1059	    /* verify my 8 elements match the temp buffer */
1060	{
1061	   int i, bad=0;
1062	   static int invocation;
1063
1064	   for (i=0;i<UNROLL;i++)
1065		if (tmpbuf[i] != dst[i]) bad=1;
1066	   if (bad) {
1067		SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1068			invocation, offset);
1069		SkDebugf("  alpha 0x%x\n", alpha);
1070		for (i=0;i<UNROLL;i++)
1071		    SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1072			i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
1073			dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
1074
1075		showme16("alpha8", &alpha8, sizeof(alpha8));
1076		showme16("scale8", &scale8, sizeof(scale8));
1077		showme8("d", &d, sizeof(d));
1078		showme16("dst8", &dst8, sizeof(dst8));
1079		showme16("dst_b", &dst_b, sizeof(dst_b));
1080		showme16("dst_g", &dst_g, sizeof(dst_g));
1081		showme16("dst_r", &dst_r, sizeof(dst_r));
1082		showme8("sb", &sb, sizeof(sb));
1083		showme8("sg", &sg, sizeof(sg));
1084		showme8("sr", &sr, sizeof(sr));
1085
1086		/* cop out */
1087		return;
1088	   }
1089	   offset += UNROLL;
1090	   invocation++;
1091	}
1092#endif
1093
1094            dst += UNROLL;
1095	    src += UNROLL;
1096	    count -= UNROLL;
1097	    /* skip x += UNROLL, since it's unchanged mod-4 */
1098        } while (count >= UNROLL);
1099    }
1100#undef	UNROLL
1101
1102    /* residuals */
1103    if (count > 0) {
1104        DITHER_565_SCAN(y);
1105        do {
1106            SkPMColor c = *src++;
1107            SkPMColorAssert(c);
1108            if (c) {
1109                unsigned a = SkGetPackedA32(c);
1110
1111                // dither and alpha are just temporary variables to work-around
1112                // an ICE in debug.
1113                unsigned dither = DITHER_VALUE(x);
1114                unsigned alpha = SkAlpha255To256(a);
1115                int d = SkAlphaMul(dither, alpha);
1116
1117                unsigned sr = SkGetPackedR32(c);
1118                unsigned sg = SkGetPackedG32(c);
1119                unsigned sb = SkGetPackedB32(c);
1120                sr = SkDITHER_R32_FOR_565(sr, d);
1121                sg = SkDITHER_G32_FOR_565(sg, d);
1122                sb = SkDITHER_B32_FOR_565(sb, d);
1123
1124                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1125                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1126                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1127                // now src and dst expanded are in g:11 r:10 x:1 b:10
1128                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1129            }
1130            dst += 1;
1131            DITHER_INC_X(x);
1132        } while (--count != 0);
1133    }
1134}
1135
1136#define	S32A_D565_Opaque_Dither_PROC S32A_D565_Opaque_Dither_neon
1137#else
1138#define	S32A_D565_Opaque_Dither_PROC NULL
1139#endif
1140
1141///////////////////////////////////////////////////////////////////////////////
1142
1143#if	defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
1144/* 2009/10/27: RBE says "a work in progress"; debugging says ok;
1145 * speedup untested, but ARM version is 26 insns/iteration and
1146 * this NEON version is 21 insns/iteration-of-8 (2.62 insns/element),
1147 * i.e. roughly 10x fewer instructions per element than the native version; these are pure instruction counts,
1148 * not accounting for any instruction or memory latencies.
1149 */
1150
1151#undef	DEBUG_S32_OPAQUE_DITHER
1152
1153static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1154                                     const SkPMColor* SK_RESTRICT src,
1155                                     int count, U8CPU alpha, int x, int y) {
1156    SkASSERT(255 == alpha);
1157
1158#define	UNROLL	8
1159    if (count >= UNROLL) {
1160	uint8x8_t d;
1161	const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1162	d = vld1_u8(dstart);
1163
1164	while (count >= UNROLL) {
1165	    uint8x8_t sr, sg, sb, sa;
1166	    uint16x8_t dr, dg, db, da;
1167	    uint16x8_t dst8;
1168
1169	    /* source is in ABGR ordering (R == lsb) */
1170	    {
1171		register uint8x8_t d0 asm("d0");
1172		register uint8x8_t d1 asm("d1");
1173		register uint8x8_t d2 asm("d2");
1174		register uint8x8_t d3 asm("d3");
1175
1176		asm ("vld4.8	{d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1177		    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
1178		    : "r" (src)
1179                    );
1180		    sr = d0; sg = d1; sb = d2; sa = d3;
1181	    }
1182	    /* XXX: if we want to prefetch, hide it in the above asm()
1183	     * using the gcc __builtin_prefetch(), the prefetch will
1184	     * fall to the bottom of the loop -- it won't stick up
1185	     * at the top of the loop, just after the vld4.
1186	     */
1187
1188	    /* sr = sr - (sr>>5) + d */
1189	    sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1190	    dr = vaddl_u8(sr, d);
1191
1192	    /* sb = sb - (sb>>5) + d */
1193	    sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1194	    db = vaddl_u8(sb, d);
1195
1196	    /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
1197	    sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1198	    dg = vaddl_u8(sg, vshr_n_u8(d,1));
1199	    /* XXX: check that the "d>>1" here is hoisted */
1200
1201	    /* pack high bits of each into 565 format  (rgb, b is lsb) */
1202	    dst8 = vshrq_n_u16(db, 3);
1203	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1204	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);
1205
1206	    /* store it */
1207	    vst1q_u16(dst, dst8);
1208
1209#if	defined(DEBUG_S32_OPAQUE_DITHER)
1210	    /* always good to know if we generated good results */
1211	    {
1212		int i, myx = x, myy = y;
1213		DITHER_565_SCAN(myy);
1214		for (i=0;i<UNROLL;i++) {
1215		    SkPMColor c = src[i];
1216		    unsigned dither = DITHER_VALUE(myx);
1217		    uint16_t val = SkDitherRGB32To565(c, dither);
1218		    if (val != dst[i]) {
1219			SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1220			    c, dither, val, dst[i], dstart[i]);
1221		    }
1222		    DITHER_INC_X(myx);
1223		}
1224	    }
1225#endif
1226
1227	    dst += UNROLL;
1228	    src += UNROLL;
1229	    count -= UNROLL;
1230	    x += UNROLL;		/* probably superfluous */
1231	}
1232    }
1233#undef	UNROLL
1234
1235    /* residuals */
1236    if (count > 0) {
1237        DITHER_565_SCAN(y);
1238        do {
1239            SkPMColor c = *src++;
1240            SkPMColorAssert(c);
1241            SkASSERT(SkGetPackedA32(c) == 255);
1242
1243            unsigned dither = DITHER_VALUE(x);
1244            *dst++ = SkDitherRGB32To565(c, dither);
1245            DITHER_INC_X(x);
1246        } while (--count != 0);
1247    }
1248}
1249
1250#define	S32_D565_Opaque_Dither_PROC S32_D565_Opaque_Dither_neon
1251#else
1252#define	S32_D565_Opaque_Dither_PROC NULL
1253#endif
1254
1255///////////////////////////////////////////////////////////////////////////////
1256
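/* The tables below are indexed directly by the SkBlitRow flags (assuming the usual
 * bit assignments: global alpha = bit 0, per-pixel src alpha = bit 1, dither = bit 2),
 * which is why the entries appear in this order. */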
1257static const SkBlitRow::Proc platform_565_procs[] = {
1258    // no dither
1259    S32_D565_Opaque_PROC,
1260    S32_D565_Blend_PROC,
1261    S32A_D565_Opaque_PROC,
1262    S32A_D565_Blend_PROC,
1263
1264    // dither
1265    S32_D565_Opaque_Dither_PROC,
1266    S32_D565_Blend_Dither_PROC,
1267    S32A_D565_Opaque_Dither_PROC,
1268    NULL,   // S32A_D565_Blend_Dither
1269};
1270
1271static const SkBlitRow::Proc platform_4444_procs[] = {
1272    // no dither
1273    NULL,   // S32_D4444_Opaque,
1274    NULL,   // S32_D4444_Blend,
1275    NULL,   // S32A_D4444_Opaque,
1276    NULL,   // S32A_D4444_Blend,
1277
1278    // dither
1279    NULL,   // S32_D4444_Opaque_Dither,
1280    NULL,   // S32_D4444_Blend_Dither,
1281    NULL,   // S32A_D4444_Opaque_Dither,
1282    NULL,   // S32A_D4444_Blend_Dither
1283};
1284
1285static const SkBlitRow::Proc32 platform_32_procs[] = {
1286    NULL,   // S32_Opaque,
1287    S32_Blend_BlitRow32_PROC,		// S32_Blend,
1288    S32A_Opaque_BlitRow32_PROC,		// S32A_Opaque,
1289    S32A_Blend_BlitRow32_PROC		// S32A_Blend
1290};
1291
1292SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) {
1293    return platform_4444_procs[flags];
1294}
1295
1296SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
1297    return platform_565_procs[flags];
1298}
1299
1300SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
1301    return platform_32_procs[flags];
1302}
1303
1304SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
1305    return NULL;
1306}
1307
1308///////////////////////////////////////////////////////////////////////////////
1309
1310SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
1311                                                     SkMask::Format maskFormat,
1312                                                     SkColor color) {
1313    return NULL;
1314}
1315
1316SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
1317                                                 SkMask::Format maskFormat,
1318                                                 RowFlags flags) {
1319    return NULL;
1320}
1321