/* SkBlitRow_opts_arm.cpp, revision a40390c08b42226a0f459a7d1363f33b35b4741a */
/*
 **
 ** Copyright 2009, The Android Open Source Project
 **
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 **
 **     http://www.apache.org/licenses/LICENSE-2.0
 **
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 */

#ifdef ANDROID
    #include <machine/cpu-features.h>
#endif

#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"

#if defined(__ARM_HAVE_NEON)
#include <arm_neon.h>
#endif

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src, int count,
                                  U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst;

        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
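                      // scalar view of the expansion below:
                      //   b8 = (p & 0x1f) << 3           (d4)
                      //   g8 = ((p >> 5) & 0x3f) << 2    (d5)
                      //   r8 = (p >> 11) << 3            (d6)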
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicated in the 8-pixel and 4/2/1-pixel versions
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
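                      // scalar equivalent of the pack:
                      //   p = ((r8 >> 3) << 11) | ((g8 >> 2) << 5) | (b8 >> 3)
                      // vshll positions each channel, vsri merges the top bits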
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
                      : [count] "+r" (count), [dst] "+r" (dst), [src] "+r" (src),
                        [keep_dst] "=&r" (keep_dst)
                      :
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzp.u16   q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicated in the 8-pixel and 4/2/1-pixel versions
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [count] "+r" (count), [dst] "+r" (dst), [src] "+r" (src),
                        [keep_dst] "=&r" (keep_dst)
                      :
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}
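#if 0
/* A scalar sketch of what the routine above computes, for reference only
 * (not compiled in). It assumes SkSrcOver32To16() from SkColorPriv.h, which
 * blends one premultiplied 32-bit src pixel over one 565 dst pixel. */
static void S32A_D565_Opaque_ref(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src, int count) {
    while (--count >= 0) {
        *dst = SkSrcOver32To16(*src, *dst);
        src += 1;
        dst += 1;
    }
}
#endif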

static void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src, int count,
                                 U8CPU alpha, int /*x*/, int /*y*/) {

    U8CPU alpha_for_asm = alpha;

    asm volatile (
    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
     * the original in two respects:
     *  1. The results have a few mismatches compared to the original code. These mismatches
     *     never exceed 1. It's possible to improve accuracy vs. a floating point
     *     implementation by introducing rounding right shifts (vrshr) for the final stage.
     *     Rounding is not present in the code below, because although results would be closer
     *     to a floating point implementation, the number of mismatches compared to the
     *     original code would be far greater.
     *  2. On certain inputs, the original code can overflow, causing colour channels to
     *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
     *     to affect another.
     */
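    /* per channel, at 565 precision, the loop below computes
     *   dst_scale = 255 - ((sa * src_scale) >> 8)
     *   result    = (src_c * src_scale + dst_c * dst_scale) >> 8  (rounded)
     * where src_scale = alpha + 1, in the range 1..256 */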

#if 1
    /* reflects SkAlpha255To256()'s change from a+(a>>7) to a+1 */
                  "add        %[alpha], %[alpha], #1          \n\t"   // adjust range of alpha 0-256
#else
                  "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
#endif
                  "vmov.u16   q3, #255                        \n\t"   // set up constant
                  "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
                  "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
                  "beq        2f                              \n\t"   // if count>>3 == 0, exit
                  "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask

                  "1:                                             \n\t"
                  "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
                  "subs       r4, r4, #1                      \n\t"   // decrement loop counter
                  "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
                  //  and deinterleave

                  "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
                  "vand       q10, q0, q15                    \n\t"   // extract blue
                  "vshr.u16   q8, q0, #11                     \n\t"   // extract red
                  "vshr.u16   q9, q9, #10                     \n\t"   // extract green
                  // dstrgb = {q8, q9, q10}

                  "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
                  "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
                  "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range

                  "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
                  "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
                  "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
                  "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
                  // srcrgba = {q11, q12, q13, q14}

                  "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
                  "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
                  "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
                  "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale

                  "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
                  "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
                  // dst_scale = q2

                  "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
                  "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
                  "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale

#if 1
    // trying for a better match with SkDiv255Round(a)
    // C alg is:  a += 128; (a + (a >> 8)) >> 8
    // we'll use just a rounding shift [q2 is available for scratch]
                  "vrshr.u16  q11, q11, #8                    \n\t"   // shift down red
                  "vrshr.u16  q12, q12, #8                    \n\t"   // shift down green
                  "vrshr.u16  q13, q13, #8                    \n\t"   // shift down blue
#else
    // arm's original "truncating divide by 256"
                  "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
                  "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
                  "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#endif

                  "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
                  "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
                  "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr

                  "bne        1b                              \n\t"   // if counter != 0, loop
                  "2:                                             \n\t"   // exit

                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
                  :
                  : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
                  );

    count &= 7;
    if (count > 0) {
        do {
            SkPMColor sc = *src++;
            if (sc) {
                uint16_t dc = *dst;
                unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
                unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
                unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
                unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
                *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
            }
            dst += 1;
        } while (--count != 0);
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
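
#if 0
/* illustrative only (not compiled in): because each 12-byte row stores the
 * 4-entry dither pattern three times, an 8-byte vld1 that starts at column
 * (x & 3) of row (y & 3) yields the dither values for pixels x .. x+7. */
static inline const uint8_t* dither_row_for(int x, int y) {
    return &gDitherMatrix_Neon[(y & 3) * 12 + (x & 3)];
}
#endif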

static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                       int count, U8CPU alpha, int x, int y)
{
    /* select row and offset for dither array */
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

    /* rescale alpha to range 0 - 256 */
    int scale = SkAlpha255To256(alpha);

    asm volatile (
                  "subs           %[count], %[count], #8          \n\t"   // guard: need at least 8 pixels
                  "blt            2f                              \n\t"   // skip the NEON loop if not
                  "vld1.8         {d31}, [%[dstart]]              \n\t"   // load dither values
                  "vshr.u8        d30, d31, #1                    \n\t"   // calc. green dither values
                  "vdup.16        d6, %[scale]                    \n\t"   // duplicate scale into neon reg
                  "vmov.i8        d29, #0x3f                      \n\t"   // set up green mask
                  "vmov.i8        d28, #0x1f                      \n\t"   // set up blue mask
                  "1:                                                 \n\t"
                  "vld4.8         {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
                  "vshr.u8        d22, d0, #5                     \n\t"   // calc. red >> 5
                  "vshr.u8        d23, d1, #6                     \n\t"   // calc. green >> 6
                  "vshr.u8        d24, d2, #5                     \n\t"   // calc. blue >> 5
                  "vaddl.u8       q8, d0, d31                     \n\t"   // add in dither to red and widen
                  "vaddl.u8       q9, d1, d30                     \n\t"   // add in dither to green and widen
                  "vaddl.u8       q10, d2, d31                    \n\t"   // add in dither to blue and widen
                  "vsubw.u8       q8, q8, d22                     \n\t"   // sub shifted red from result
                  "vsubw.u8       q9, q9, d23                     \n\t"   // sub shifted green from result
                  "vsubw.u8       q10, q10, d24                   \n\t"   // sub shifted blue from result
                  "vshrn.i16      d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
                  "vshrn.i16      d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
                  "vshrn.i16      d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
                  // load 8 pixels from dst, extract rgb
                  "vld1.16        {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
                  "vshrn.i16      d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
                  "vmovn.i16      d18, q0                         \n\t"   // narrow to get blue as bytes
                  "vshr.u16       q0, q0, #11                     \n\t"   // shift down to extract red
                  "vand           d17, d17, d29                   \n\t"   // and green with green mask
                  "vand           d18, d18, d28                   \n\t"   // and blue with blue mask
                  "vmovn.i16      d16, q0                         \n\t"   // narrow to get red as bytes
                  // src = {d22 (r), d23 (g), d24 (b)}
                  // dst = {d16 (r), d17 (g), d18 (b)}
                  // subtract dst from src and widen
                  "vsubl.s8       q0, d22, d16                    \n\t"   // red:   src - dst, widened
                  "vsubl.s8       q1, d23, d17                    \n\t"   // green: src - dst, widened
                  "vsubl.s8       q2, d24, d18                    \n\t"   // blue:  src - dst, widened
                  // multiply diffs by scale and shift
                  "vmul.i16       q0, q0, d6[0]                   \n\t"   // multiply red by scale
                  "vmul.i16       q1, q1, d6[0]                   \n\t"   // multiply green by scale
                  "vmul.i16       q2, q2, d6[0]                   \n\t"   // multiply blue by scale
                  "subs           %[count], %[count], #8          \n\t"   // decrement loop counter
                  "vshrn.i16      d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
                  "vshrn.i16      d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
                  "vshrn.i16      d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
                  // add dst to result
                  "vaddl.s8       q0, d0, d16                     \n\t"   // add dst to red
                  "vaddl.s8       q1, d2, d17                     \n\t"   // add dst to green
                  "vaddl.s8       q2, d4, d18                     \n\t"   // add dst to blue
                  // put result into 565 format
                  "vsli.i16       q2, q1, #5                      \n\t"   // shift up green and insert into blue
                  "vsli.i16       q2, q0, #11                     \n\t"   // shift up red and insert into blue
                  "vst1.16        {d4, d5}, [%[dst]]!             \n\t"   // store result
                  "bge            1b                              \n\t"   // loop while 8 or more pixels remain
                  "2:                                                 \n\t"
                  "add            %[count], %[count], #8          \n\t"   // restore residual count (0..7)
                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                  : [dstart] "r" (dstart), [scale] "r" (scale)
                  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                  );

    DITHER_565_SCAN(y);

    while (count > 0)
    {
        SkPMColor c = *src++;

        int dither = DITHER_VALUE(x);
        int sr = SkGetPackedR32(c);
        int sg = SkGetPackedG32(c);
        int sb = SkGetPackedB32(c);
        sr = SkDITHER_R32To565(sr, dither);
        sg = SkDITHER_G32To565(sg, dither);
        sb = SkDITHER_B32To565(sb, dither);

        uint16_t d = *dst;
        *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                             SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                             SkAlphaBlend(sb, SkGetPackedB16(d), scale));
        DITHER_INC_X(x);
        count--;
    }
}

#define S32A_D565_Opaque_PROC       S32A_D565_Opaque_neon
#define S32A_D565_Blend_PROC        S32A_D565_Blend_neon
#define S32_D565_Blend_Dither_PROC  S32_D565_Blend_Dither_neon
#else
#define S32A_D565_Opaque_PROC       NULL
#define S32A_D565_Blend_PROC        NULL
#define S32_D565_Blend_Dither_PROC  NULL
#endif

/* We don't have a special version that assumes each src pixel is opaque,
 * but our S32A routine is still faster than the default, so use it here.
 */
#define S32_D565_Opaque_PROC    S32A_D565_Opaque_PROC
#define S32_D565_Blend_PROC     S32A_D565_Blend_PROC

///////////////////////////////////////////////////////////////////////////////

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)

static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                       const SkPMColor* SK_RESTRICT src,
                                       int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {

	uint8x8_t alpha_mask;
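	/* vtbl1_u8 with the {3,3,3,3,7,7,7,7} index pattern below broadcasts
	 * each pixel's alpha byte (lane 3 of pixel 0, lane 7 of pixel 1)
	 * across all four lanes of its pixel: {a0,a0,a0,a0,a1,a1,a1,a1}. */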

	static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
	alpha_mask = vld1_u8(alpha_mask_setup);

	/* do the NEON unrolled code */
#define	UNROLL	4
	while (count >= UNROLL) {
	    uint8x8_t src_raw, dst_raw, dst_final;
	    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

	    /* get the source */
	    src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if	UNROLL > 2
	    src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

	    /* get and hold the dst too */
	    dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if	UNROLL > 2
	    dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

	/* 1st and 2nd bits of the unrolling */
	{
	    uint8x8_t dst_cooked;
	    uint16x8_t dst_wide;
	    uint8x8_t alpha_narrow;
	    uint16x8_t alpha_wide;

	    /* get the alphas spread out properly */
	    alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
#if 1
	    /* reflect SkAlpha255To256() semantics a+1 vs a+(a>>7) */
	    /* we collapsed (255-a)+1 ... */
	    alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
	    alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
	    alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

	    /* spread the dest */
	    dst_wide = vmovl_u8(dst_raw);

	    /* alpha mul the dest */
	    dst_wide = vmulq_u16(dst_wide, alpha_wide);
	    dst_cooked = vshrn_n_u16(dst_wide, 8);

	    /* sum -- ignoring any byte lane overflows */
	    dst_final = vadd_u8(src_raw, dst_cooked);
	}

#if	UNROLL > 2
	/* the 3rd and 4th bits of our unrolling */
	{
	    uint8x8_t dst_cooked;
	    uint16x8_t dst_wide;
	    uint8x8_t alpha_narrow;
	    uint16x8_t alpha_wide;

	    alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
#if 1
	    /* reflect SkAlpha255To256() semantics a+1 vs a+(a>>7) */
	    /* we collapsed (255-a)+1 ... */
	    alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
	    alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
	    alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

	    /* spread the dest */
	    dst_wide = vmovl_u8(dst_raw_2);

	    /* alpha mul the dest */
	    dst_wide = vmulq_u16(dst_wide, alpha_wide);
	    dst_cooked = vshrn_n_u16(dst_wide, 8);

	    /* sum -- ignoring any byte lane overflows */
	    dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
	}
#endif

	    vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if	UNROLL > 2
	    vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

	    src += UNROLL;
	    dst += UNROLL;
	    count -= UNROLL;
	}
#undef	UNROLL

	/* do any residual iterations */
        while (--count >= 0) {
#ifdef TEST_SRC_ALPHA
            SkPMColor sc = *src;
            if (sc) {
                unsigned srcA = SkGetPackedA32(sc);
                SkPMColor result = sc;
                if (srcA != 255) {
                    result = SkPMSrcOver(sc, *dst);
                }
                *dst = result;
            }
#else
            *dst = SkPMSrcOver(*src, *dst);
#endif
            src += 1;
            dst += 1;
        }
    }
}

#define	S32A_Opaque_BlitRow32_PROC	S32A_Opaque_BlitRow32_neon
#else
#define	S32A_Opaque_BlitRow32_PROC	NULL
#endif

/* Neon version of S32_Blend_BlitRow32();
 * the portable version is in src/core/SkBlitRow_D32.cpp
 */
#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                     const SkPMColor* SK_RESTRICT src,
                                     int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    if (count > 0) {
        uint16_t src_scale = SkAlpha255To256(alpha);
        uint16_t dst_scale = 256 - src_scale;
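        /* per byte lane, the unrolled loop below computes
         *   result = (src * src_scale + dst * dst_scale) >> 8
         * matching the scalar SkAlphaMulQ() pair in the residual path */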

	/* run them N at a time through the NEON unit */
	/* note that each 1 is 4 bytes, each treated exactly the same,
	 * so we can work under that guise. We *do* know that the src&dst
	 * will be 32-bit aligned quantities, so we can specify that on
	 * the load/store ops and do a neon 'reinterpret' to get us to
	 * byte-sized (pun intended) pieces that we widen/multiply/shift.
	 * we're limited to 128 bits in the wide ops, which is 8x16 bits
	 * or a pair of 32 bit src/dsts.
	 */
	/* we *could* manually unroll this loop so that we load 128 bits
	 * (as a pair of 64s) from each of src and dst, processing them
	 * in pieces. This might give us a little better management of
	 * the memory latency, but my initial attempts here did not
	 * produce an instruction stream that looked all that nice.
	 */
#define	UNROLL	2
	while (count >= UNROLL) {
	    uint8x8_t  src_raw, dst_raw, dst_final;
	    uint16x8_t  src_wide, dst_wide;

	    /* get 64 bits of src, widen it, multiply by src_scale */
	    src_raw = vreinterpret_u8_u32(vld1_u32(src));
	    src_wide = vmovl_u8(src_raw);
	    /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
	    src_wide = vmulq_u16(src_wide, vdupq_n_u16(src_scale));

	    /* ditto with dst */
	    dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
	    dst_wide = vmovl_u8(dst_raw);

	    /* combine add with dst multiply into mul-accumulate */
	    dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));

	    dst_final = vshrn_n_u16(dst_wide, 8);
	    vst1_u32(dst, vreinterpret_u32_u8(dst_final));

	    src += UNROLL;
	    dst += UNROLL;
	    count -= UNROLL;
	}
	/* RBE: well, I don't like how gcc manages src/dst across the above
	 * loop: it's constantly calculating src+bias and dst+bias, and only
	 * adjusts the real ones when we leave the loop. Not sure why
	 * it's "hoisting down" (hoisting implies above, in my lexicon ;))
	 * the adjustments to src/dst/count, but it does...
	 * (might be SSA-style internal logic...)
	 */

#if	UNROLL == 2
	if (count == 1) {
            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
	}
#else
	if (count > 0) {
            do {
                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
                src += 1;
                dst += 1;
            } while (--count > 0);
	}
#endif

#undef	UNROLL
    }
}

#define	S32_Blend_BlitRow32_PROC	S32_Blend_BlitRow32_neon
#else
#define	S32_Blend_BlitRow32_PROC	NULL
#endif

///////////////////////////////////////////////////////////////////////////////

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)

#undef	DEBUG_OPAQUE_DITHER

#if	defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
	static char buf[256];
	char tbuf[32];
	int i;
	char *pc = (char*) p;
	sprintf(buf,"%8s:", str);
	for(i=0;i<len;i++) {
	    sprintf(tbuf, "   %02x", pc[i]);
	    strcat(buf, tbuf);
	}
	SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
	static char buf[256];
	char tbuf[32];
	int i;
	uint16_t *pc = (uint16_t*) p;
	sprintf(buf,"%8s:", str);
	len = (len / sizeof(uint16_t));	/* passed as bytes */
	for(i=0;i<len;i++) {
	    sprintf(tbuf, " %04x", pc[i]);
	    strcat(buf, tbuf);
	}
	SkDebugf("%s\n", buf);
}
#endif

static void S32A_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                         const SkPMColor* SK_RESTRICT src,
                                         int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define	UNROLL	8

    if (count >= UNROLL) {
	uint8x8_t dbase;

#if	defined(DEBUG_OPAQUE_DITHER)
	uint16_t tmpbuf[UNROLL];
	int td[UNROLL];
	int tdv[UNROLL];
	int ta[UNROLL];
	int tap[UNROLL];
	uint16_t in_dst[UNROLL];
	int offset = 0;
	int noisy = 0;
#endif

	const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
	dbase = vld1_u8(dstart);

        do {
	    uint8x8_t sr, sg, sb, sa, d;
	    uint16x8_t dst8, scale8, alpha8;
	    uint16x8_t dst_r, dst_g, dst_b;

#if	defined(DEBUG_OPAQUE_DITHER)
	/* calculate 8 elements worth into a temp buffer */
	{
	  int my_y = y;
	  int my_x = x;
	  SkPMColor* my_src = (SkPMColor*)src;
	  uint16_t* my_dst = dst;
	  int i;

          DITHER_565_SCAN(my_y);
          for(i=0;i<UNROLL;i++) {
            SkPMColor c = *my_src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
		tdv[i] = DITHER_VALUE(my_x);
		ta[i] = a;
		tap[i] = SkAlpha255To256(a);
		td[i] = d;

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
		td[i] = d;

            } else {
		tmpbuf[i] = *my_dst;
		ta[i] = tdv[i] = td[i] = 0xbeef;
	    }
	    in_dst[i] = *my_dst;
            my_dst += 1;
            DITHER_INC_X(my_x);
          }
	}
#endif

	    /* source is in ABGR */
	    {
		register uint8x8_t d0 asm("d0");
		register uint8x8_t d1 asm("d1");
		register uint8x8_t d2 asm("d2");
		register uint8x8_t d3 asm("d3");

		asm ("vld4.8	{d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
		    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
		    : "r" (src)
                    );
		    sr = d0; sg = d1; sb = d2; sa = d3;
	    }

	    /* calculate 'd', which will be 0..7 */
	    /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
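	    /* i.e. d = (dither * SkAlpha255To256(a)) >> 8, matching the
	     * SkAlphaMul() in the scalar residual loop below */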
#ifdef ANDROID
	    /* SkAlpha255To256() semantic a+1 vs a+(a>>7) */
	    alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
#else
	    alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
#endif
	    alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
	    d = vshrn_n_u16(alpha8, 8);	/* narrowing too */

	    /* sr = sr - (sr>>5) + d */
	    /* watching for 8-bit overflow.  d is 0..7; risky range of
	     * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
	     * safe as long as we do ((sr - (sr>>5)) + d) */
	    sr = vsub_u8(sr, vshr_n_u8(sr, 5));
	    sr = vadd_u8(sr, d);

	    /* sb = sb - (sb>>5) + d */
	    sb = vsub_u8(sb, vshr_n_u8(sb, 5));
	    sb = vadd_u8(sb, d);

	    /* sg = sg - (sg>>6) + (d>>1); similar logic for overflows */
	    sg = vsub_u8(sg, vshr_n_u8(sg, 6));
	    sg = vadd_u8(sg, vshr_n_u8(d,1));

	    /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
	    dst8 = vld1q_u16(dst);
	    dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
	    dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
	    dst_r = vshrq_n_u16(dst8,11);	/* clearing hi bits */
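	    /* note: unlike S32A_D565_Opaque_neon above, dst stays at 5/6-bit
	     * precision here; src is aligned to it with vshll (<<2 or <<3)
	     * and the scale is folded down to 0..32 before the multiply */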

	    /* blend */
#if 1
	    /* SkAlpha255To256() semantic a+1 vs a+(a>>7) */
	    /* originally 255-sa + 1 */
	    scale8 = vsubw_u8(vdupq_n_u16(256), sa);
#else
	    scale8 = vsubw_u8(vdupq_n_u16(255), sa);
	    scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
#endif

#if 1
	    /* combine the addq and mul, save 3 insns */
	    scale8 = vshrq_n_u16(scale8, 3);
	    dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
	    dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
	    dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
#else
	    /* known correct, but +3 insns over above */
	    scale8 = vshrq_n_u16(scale8, 3);
	    dst_b = vmulq_u16(dst_b, scale8);
	    dst_g = vmulq_u16(dst_g, scale8);
	    dst_r = vmulq_u16(dst_r, scale8);

	    /* combine */
	    /* NB: vshll widens, need to preserve those bits */
	    dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
	    dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
	    dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
#endif

	    /* repack to store */
	    dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r, 5), 11);

	    vst1q_u16(dst, dst8);

#if	defined(DEBUG_OPAQUE_DITHER)
	    /* verify my 8 elements match the temp buffer */
	{
	   int i, bad=0;
	   static int invocation;

	   for (i=0;i<UNROLL;i++)
		if (tmpbuf[i] != dst[i]) bad=1;
	   if (bad) {
		SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
			invocation, offset);
		SkDebugf("  alpha 0x%x\n", alpha);
		for (i=0;i<UNROLL;i++)
		    SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
			i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
			dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);

		showme16("alpha8", &alpha8, sizeof(alpha8));
		showme16("scale8", &scale8, sizeof(scale8));
		showme8("d", &d, sizeof(d));
		showme16("dst8", &dst8, sizeof(dst8));
		showme16("dst_b", &dst_b, sizeof(dst_b));
		showme16("dst_g", &dst_g, sizeof(dst_g));
		showme16("dst_r", &dst_r, sizeof(dst_r));
		showme8("sb", &sb, sizeof(sb));
		showme8("sg", &sg, sizeof(sg));
		showme8("sr", &sr, sizeof(sr));

		/* cop out */
		return;
	   }
	   offset += UNROLL;
	   invocation++;
	}
#endif

            dst += UNROLL;
	    src += UNROLL;
	    count -= UNROLL;
	    /* skip x += UNROLL, since it's unchanged mod-4 */
        } while (count >= UNROLL);
    }
#undef	UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

#define	S32A_D565_Opaque_Dither_PROC S32A_D565_Opaque_Dither_neon
#else
#define	S32A_D565_Opaque_Dither_PROC NULL
#endif

///////////////////////////////////////////////////////////////////////////////

#if	defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
/* 2009/10/27: RBE says "a work in progress"; debugging says ok;
 * speedup untested, but the ARM version is 26 insns/iteration and
 * this NEON version is 21 insns per iteration-of-8 (2.62 insns/element),
 * roughly a tenth of the native version's instruction count; that's
 * pure instruction counting, not accounting for any instruction or
 * memory latencies.
 */

#undef	DEBUG_S32_OPAQUE_DITHER

static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                        const SkPMColor* SK_RESTRICT src,
                                        int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define	UNROLL	8
    if (count >= UNROLL) {
	uint8x8_t d;
	const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
	d = vld1_u8(dstart);

	while (count >= UNROLL) {
	    uint8x8_t sr, sg, sb, sa;
	    uint16x8_t dr, dg, db, da;
	    uint16x8_t dst8;

	    /* source is in ABGR ordering (R == lsb) */
	    {
		register uint8x8_t d0 asm("d0");
		register uint8x8_t d1 asm("d1");
		register uint8x8_t d2 asm("d2");
		register uint8x8_t d3 asm("d3");

		asm ("vld4.8	{d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
		    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
		    : "r" (src)
                    );
		    sr = d0; sg = d1; sb = d2; sa = d3;
	    }
	    /* XXX: if we want to prefetch, hide it in the above asm();
	     * with gcc's __builtin_prefetch() the prefetch falls to the
	     * bottom of the loop -- it won't stay up at the top of the
	     * loop, just after the vld4.
	     */

	    /* sr = sr - (sr>>5) + d */
	    sr = vsub_u8(sr, vshr_n_u8(sr, 5));
	    dr = vaddl_u8(sr, d);

	    /* sb = sb - (sb>>5) + d */
	    sb = vsub_u8(sb, vshr_n_u8(sb, 5));
	    db = vaddl_u8(sb, d);

	    /* sg = sg - (sg>>6) + (d>>1); similar logic for overflows */
	    sg = vsub_u8(sg, vshr_n_u8(sg, 6));
	    dg = vaddl_u8(sg, vshr_n_u8(d,1));
	    /* XXX: check that the "d>>1" above is hoisted */

	    /* pack high bits of each into 565 format  (rgb, b is lsb) */
	    dst8 = vshrq_n_u16(db, 3);
	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

	    /* store it */
	    vst1q_u16(dst, dst8);

#if	defined(DEBUG_S32_OPAQUE_DITHER)
	    /* always good to know if we generated good results */
	    {
		int i, myx = x, myy = y;
		DITHER_565_SCAN(myy);
		for (i=0;i<UNROLL;i++) {
		    SkPMColor c = src[i];
		    unsigned dither = DITHER_VALUE(myx);
		    uint16_t val = SkDitherRGB32To565(c, dither);
		    if (val != dst[i]) {
			SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dstart[i] %02x\n",
			    c, dither, val, dst[i], dstart[i]);
		    }
		    DITHER_INC_X(myx);
		}
	    }
#endif

	    dst += UNROLL;
	    src += UNROLL;
	    count -= UNROLL;
	    x += UNROLL;		/* probably superfluous */
	}
    }
#undef	UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

#define	S32_D565_Opaque_Dither_PROC S32_D565_Opaque_Dither_neon
#else
#define	S32_D565_Opaque_Dither_PROC NULL
#endif

///////////////////////////////////////////////////////////////////////////////
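/* The tables below are indexed directly by the SkBlitRow flag bits
 * (global alpha, per-pixel alpha, dither), so entry order must match
 * the flag encoding in SkBlitRow.h. */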

static const SkBlitRow::Proc platform_565_procs[] = {
    // no dither
    S32_D565_Opaque_PROC,
    S32_D565_Blend_PROC,
    S32A_D565_Opaque_PROC,
    S32A_D565_Blend_PROC,

    // dither
    S32_D565_Opaque_Dither_PROC,
    S32_D565_Blend_Dither_PROC,
    S32A_D565_Opaque_Dither_PROC,
    NULL,   // S32A_D565_Blend_Dither
};

static const SkBlitRow::Proc platform_4444_procs[] = {
    // no dither
    NULL,   // S32_D4444_Opaque,
    NULL,   // S32_D4444_Blend,
    NULL,   // S32A_D4444_Opaque,
    NULL,   // S32A_D4444_Blend,

    // dither
    NULL,   // S32_D4444_Opaque_Dither,
    NULL,   // S32_D4444_Blend_Dither,
    NULL,   // S32A_D4444_Opaque_Dither,
    NULL,   // S32A_D4444_Blend_Dither
};

static const SkBlitRow::Proc32 platform_32_procs[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_PROC,		// S32_Blend,
    S32A_Opaque_BlitRow32_PROC,		// S32A_Opaque,
    NULL,   // S32A_Blend,
};

SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) {
    return platform_4444_procs[flags];
}

SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
    return platform_565_procs[flags];
}

SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
    return platform_32_procs[flags];
}