SkBlitRow_opts_arm_neon.cpp revision 5b92499f8ff96760ba54fdd76f48a8af2088b3f5
1/*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "SkBlitRow_opts_arm_neon.h"
9
10#include "SkBlitMask.h"
11#include "SkBlitRow.h"
12#include "SkColorPriv.h"
13#include "SkDither.h"
14#include "SkMathPriv.h"
15#include "SkUtils.h"
16
17#include "SkCachePreload_arm.h"
18#include "SkColor_opts_neon.h"
19#include <arm_neon.h>
20
21void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
22                           const SkPMColor* SK_RESTRICT src, int count,
23                           U8CPU alpha, int /*x*/, int /*y*/) {
24    SkASSERT(255 == alpha);
25
26    while (count >= 8) {
27        uint8x8x4_t vsrc;
28        uint16x8_t vdst;
29
30        // Load
31        vsrc = vld4_u8((uint8_t*)src);
32
33        // Convert src to 565
34        vdst = SkPixel32ToPixel16_neon8(vsrc);
35
36        // Store
37        vst1q_u16(dst, vdst);
38
39        // Prepare next iteration
40        dst += 8;
41        src += 8;
42        count -= 8;
43    }
44
45    // Leftovers
46    while (count > 0) {
47        SkPMColor c = *src++;
48        SkPMColorAssert(c);
49        *dst = SkPixel32ToPixel16_ToU16(c);
50        dst++;
51        count--;
52    }
53}
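
// For reference, a scalar sketch of the per-pixel conversion done by the loop
// above. The helper name is ours (illustrative only, never called); the NEON
// path uses SkPixel32ToPixel16_neon8 from SkColor_opts_neon.h and the leftover
// loop uses SkPixel32ToPixel16_ToU16. Each 8-bit channel is truncated to
// 5/6/5 bits and packed with red in the high bits of the 16-bit result.
static inline uint16_t S32_D565_one_pixel_sketch(SkPMColor c) {
    return SkPackRGB16(SkGetPackedR32(c) >> (8 - SK_R16_BITS),   // 8 -> 5 bits
                       SkGetPackedG32(c) >> (8 - SK_G16_BITS),   // 8 -> 6 bits
                       SkGetPackedB32(c) >> (8 - SK_B16_BITS));  // 8 -> 5 bits
}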
54
55void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
56                           const SkPMColor* SK_RESTRICT src, int count,
57                           U8CPU alpha, int /*x*/, int /*y*/) {
58    SkASSERT(255 == alpha);
59
60    if (count >= 8) {
61        uint16_t* SK_RESTRICT keep_dst = 0;
62
63        asm volatile (
64                      "ands       ip, %[count], #7            \n\t"
65                      "vmov.u8    d31, #1<<7                  \n\t"
66                      "vld1.16    {q12}, [%[dst]]             \n\t"
67                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
68                      // Thumb does not support the standard ARM conditional
69                      // instructions but instead requires the 'it' instruction
70                      // to signal conditional execution
71                      "it eq                                  \n\t"
72                      "moveq      ip, #8                      \n\t"
73                      "mov        %[keep_dst], %[dst]         \n\t"
74
75                      "add        %[src], %[src], ip, LSL#2   \n\t"
76                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
77                      "subs       %[count], %[count], ip      \n\t"
78                      "b          9f                          \n\t"
79                      // LOOP
80                      "2:                                         \n\t"
81
82                      "vld1.16    {q12}, [%[dst]]!            \n\t"
83                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
84                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
85                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
86                      "subs       %[count], %[count], #8      \n\t"
87                      "9:                                         \n\t"
88                      "pld        [%[dst],#32]                \n\t"
89                      // expand 0565 q12 to 8888 {d4-d7}
90                      "vmovn.u16  d4, q12                     \n\t"
91                      "vshr.u16   q11, q12, #5                \n\t"
92                      "vshr.u16   q10, q12, #6+5              \n\t"
93                      "vmovn.u16  d5, q11                     \n\t"
94                      "vmovn.u16  d6, q10                     \n\t"
95                      "vshl.u8    d4, d4, #3                  \n\t"
96                      "vshl.u8    d5, d5, #2                  \n\t"
97                      "vshl.u8    d6, d6, #3                  \n\t"
98
99                      "vmovl.u8   q14, d31                    \n\t"
100                      "vmovl.u8   q13, d31                    \n\t"
101                      "vmovl.u8   q12, d31                    \n\t"
102
103                      // duplicate in 4/2/1 & 8pix vsns
104                      "vmvn.8     d30, d3                     \n\t"
105                      "vmlal.u8   q14, d30, d6                \n\t"
106                      "vmlal.u8   q13, d30, d5                \n\t"
107                      "vmlal.u8   q12, d30, d4                \n\t"
108                      "vshr.u16   q8, q14, #5                 \n\t"
109                      "vshr.u16   q9, q13, #6                 \n\t"
110                      "vaddhn.u16 d6, q14, q8                 \n\t"
111                      "vshr.u16   q8, q12, #5                 \n\t"
112                      "vaddhn.u16 d5, q13, q9                 \n\t"
113                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
114                      "vaddhn.u16 d4, q12, q8                 \n\t"
115                      // intentionally don't calculate alpha
116                      // result in d4-d6
117
118                      "vqadd.u8   d5, d5, d1                  \n\t"
119                      "vqadd.u8   d4, d4, d2                  \n\t"
120
121                      // pack 8888 {d4-d6} to 0565 q10
122                      "vshll.u8   q10, d6, #8                 \n\t"
123                      "vshll.u8   q3, d5, #8                  \n\t"
124                      "vshll.u8   q2, d4, #8                  \n\t"
125                      "vsri.u16   q10, q3, #5                 \n\t"
126                      "vsri.u16   q10, q2, #11                \n\t"
127
128                      "bne        2b                          \n\t"
129
130                      "1:                                         \n\t"
131                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
132                      : [count] "+r" (count)
133                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
134                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
135                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
136                      "d30","d31"
137                      );
138    }
139    else
140    {   // handle count < 8
141        uint16_t* SK_RESTRICT keep_dst = 0;
142
143        asm volatile (
144                      "vmov.u8    d31, #1<<7                  \n\t"
145                      "mov        %[keep_dst], %[dst]         \n\t"
146
147                      "tst        %[count], #4                \n\t"
148                      "beq        14f                         \n\t"
149                      "vld1.16    {d25}, [%[dst]]!            \n\t"
150                      "vld1.32    {q1}, [%[src]]!             \n\t"
151
152                      "14:                                        \n\t"
153                      "tst        %[count], #2                \n\t"
154                      "beq        12f                         \n\t"
155                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
156                      "vld1.32    {d1}, [%[src]]!             \n\t"
157
158                      "12:                                        \n\t"
159                      "tst        %[count], #1                \n\t"
160                      "beq        11f                         \n\t"
161                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
162                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"
163
164                      "11:                                        \n\t"
165                      // unzips achieve the same as a vld4 operation
166                      "vuzpq.u16  q0, q1                      \n\t"
167                      "vuzp.u8    d0, d1                      \n\t"
168                      "vuzp.u8    d2, d3                      \n\t"
169                      // expand 0565 q12 to 8888 {d4-d7}
170                      "vmovn.u16  d4, q12                     \n\t"
171                      "vshr.u16   q11, q12, #5                \n\t"
172                      "vshr.u16   q10, q12, #6+5              \n\t"
173                      "vmovn.u16  d5, q11                     \n\t"
174                      "vmovn.u16  d6, q10                     \n\t"
175                      "vshl.u8    d4, d4, #3                  \n\t"
176                      "vshl.u8    d5, d5, #2                  \n\t"
177                      "vshl.u8    d6, d6, #3                  \n\t"
178
179                      "vmovl.u8   q14, d31                    \n\t"
180                      "vmovl.u8   q13, d31                    \n\t"
181                      "vmovl.u8   q12, d31                    \n\t"
182
183                      // duplicate in 4/2/1 & 8pix vsns
184                      "vmvn.8     d30, d3                     \n\t"
185                      "vmlal.u8   q14, d30, d6                \n\t"
186                      "vmlal.u8   q13, d30, d5                \n\t"
187                      "vmlal.u8   q12, d30, d4                \n\t"
188                      "vshr.u16   q8, q14, #5                 \n\t"
189                      "vshr.u16   q9, q13, #6                 \n\t"
190                      "vaddhn.u16 d6, q14, q8                 \n\t"
191                      "vshr.u16   q8, q12, #5                 \n\t"
192                      "vaddhn.u16 d5, q13, q9                 \n\t"
193                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
194                      "vaddhn.u16 d4, q12, q8                 \n\t"
195                      // intentionally don't calculate alpha
196                      // result in d4-d6
197
198                      "vqadd.u8   d5, d5, d1                  \n\t"
199                      "vqadd.u8   d4, d4, d2                  \n\t"
200
201                      // pack 8888 {d4-d6} to 0565 q10
202                      "vshll.u8   q10, d6, #8                 \n\t"
203                      "vshll.u8   q3, d5, #8                  \n\t"
204                      "vshll.u8   q2, d4, #8                  \n\t"
205                      "vsri.u16   q10, q3, #5                 \n\t"
206                      "vsri.u16   q10, q2, #11                \n\t"
207
208                      // store
209                      "tst        %[count], #4                \n\t"
210                      "beq        24f                         \n\t"
211                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
212
213                      "24:                                        \n\t"
214                      "tst        %[count], #2                \n\t"
215                      "beq        22f                         \n\t"
216                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
217
218                      "22:                                        \n\t"
219                      "tst        %[count], #1                \n\t"
220                      "beq        21f                         \n\t"
221                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
222
223                      "21:                                        \n\t"
224                      : [count] "+r" (count)
225                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
226                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
227                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
228                      "d30","d31"
229                      );
230    }
231}
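
// In outline, each pixel processed by the asm blocks above is standard
// src-over onto 565: the 565 destination is expanded to 8 bits per channel,
// scaled by (255 - src alpha) (the shift/add-high-narrow sequences approximate
// the required divide), the premultiplied source channel is added with a
// saturating vqadd.u8, and the three channels are repacked to 565 with vsri.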
232
233static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
234    prod += vdupq_n_u16(128);
235    prod += vshrq_n_u16(prod, 8);
236    return vshrq_n_u16(prod, 8);
237}
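
// The helper above is the standard exact "divide by 255 with rounding" trick
// using only adds and shifts: for any product x in [0, 255*255],
//     ((x + 128) + ((x + 128) >> 8)) >> 8  ==  SkDiv255Round(x)
// e.g. x = 255*255 = 65025:  65025 + 128 = 65153,  65153 >> 8 = 254,
//      (65153 + 254) >> 8 = 255,  matching 65025 / 255 = 255.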
238
239void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
240                          const SkPMColor* SK_RESTRICT src, int count,
241                          U8CPU alpha, int /*x*/, int /*y*/) {
242    SkASSERT(255 > alpha);
243
244    /* This code implements a Neon version of S32A_D565_Blend. The results have
245     * a few mismatches compared to the original code. These mismatches never
246     * exceed 1.
247     */
248
249    if (count >= 8) {
250        uint16x8_t valpha_max, vmask_blue;
251        uint8x8_t valpha;
252
253        // prepare constants
254        valpha_max = vmovq_n_u16(255);
255        valpha = vdup_n_u8(alpha);
256        vmask_blue = vmovq_n_u16(SK_B16_MASK);
257
258        do {
259            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
260            uint16x8_t vres_a, vres_r, vres_g, vres_b;
261            uint8x8x4_t vsrc;
262
263            // load pixels
264            vdst = vld1q_u16(dst);
265#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
266            asm (
267                "vld4.u8 %h[vsrc], [%[src]]!"
268                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
269                : :
270            );
271#else
272            register uint8x8_t d0 asm("d0");
273            register uint8x8_t d1 asm("d1");
274            register uint8x8_t d2 asm("d2");
275            register uint8x8_t d3 asm("d3");
276
277            asm volatile (
278                "vld4.u8    {d0-d3},[%[src]]!;"
279                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
280                  [src] "+&r" (src)
281                : :
282            );
283            vsrc.val[0] = d0;
284            vsrc.val[1] = d1;
285            vsrc.val[2] = d2;
286            vsrc.val[3] = d3;
287#endif
288
289
290            // deinterleave dst
291            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
292            vdst_b = vdst & vmask_blue;                     // extract blue
293            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
294            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
295
296            // shift src to 565
297            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
298            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
299            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
300
301            // calc src * src_scale
302            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
303            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
304            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
305            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
306
307            // prepare dst_scale
308            vres_a = SkDiv255Round_neon8(vres_a);
309            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
310
311            // add dst * dst_scale to previous result
312            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
313            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
314            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
315
316#ifdef S32A_D565_BLEND_EXACT
317            // It is possible to get exact results with this but it is slow,
318            // even slower than C code in some cases
319            vres_r = SkDiv255Round_neon8(vres_r);
320            vres_g = SkDiv255Round_neon8(vres_g);
321            vres_b = SkDiv255Round_neon8(vres_b);
322#else
323            vres_r = vrshrq_n_u16(vres_r, 8);
324            vres_g = vrshrq_n_u16(vres_g, 8);
325            vres_b = vrshrq_n_u16(vres_b, 8);
326#endif
327            // pack result
328            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
329            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
330
331            // store
332            vst1q_u16(dst, vres_b);
333            dst += 8;
334            count -= 8;
335        } while (count >= 8);
336    }
337
338    // leftovers
339    while (count-- > 0) {
340        SkPMColor sc = *src++;
341        if (sc) {
342            uint16_t dc = *dst;
343            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
344            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
345            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
346            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
347            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
348        }
349        dst += 1;
350    }
351}
352
353/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
354 * each dither value is spaced out into byte lanes, and repeated
355 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
356 * start of each row.
357 */
358static const uint8_t gDitherMatrix_Neon[48] = {
359    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
360    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
361    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
362    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
363
364};
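
/* For example, with (y & 3) == 0 and (x & 3) == 2, the 8-byte load picks up
 * { 1, 5, 0, 4, 1, 5, 0, 4 } from row 0 -- the same sequence of dither values
 * the scalar DITHER_VALUE(x) macro walks through for successive x.
 */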
365
366void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
367                                int count, U8CPU alpha, int x, int y)
368{
369
370    SkASSERT(255 > alpha);
371
372    // rescale alpha to range 1 - 256
373    int scale = SkAlpha255To256(alpha);
374
375    if (count >= 8) {
376        /* select row and offset for dither array */
377        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
378
379        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
380        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
381
382        int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
383        uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask
384
385        do {
386
387            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
388            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
389            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
390            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
391            uint16x8_t vdst;
392            uint16x8_t vdst_r, vdst_g, vdst_b;
393            int16x8_t vres_r, vres_g, vres_b;
394            int8x8_t vres8_r, vres8_g, vres8_b;
395
396            // Load source and add dither
397            {
398            register uint8x8_t d0 asm("d0");
399            register uint8x8_t d1 asm("d1");
400            register uint8x8_t d2 asm("d2");
401            register uint8x8_t d3 asm("d3");
402
403            asm (
404                "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
405                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
406                :
407            );
408            vsrc_g = d1;
409#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
410            vsrc_r = d2; vsrc_b = d0;
411#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
412            vsrc_r = d0; vsrc_b = d2;
413#endif
414            }
415
416            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
417            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
418            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
419
420            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
421            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
422            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen
423
424            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
425            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
426            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result
427
428            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
429            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
430            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
431
432            // Load dst and unpack
433            vdst = vld1q_u16(dst);
434            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
435            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
436            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue
437
438            // subtract dst from src and widen
439            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
440            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
441            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
442
443            // multiply diffs by scale and shift
444            vres_r = vmulq_s16(vres_r, vscale);
445            vres_g = vmulq_s16(vres_g, vscale);
446            vres_b = vmulq_s16(vres_b, vscale);
447
448            vres8_r = vshrn_n_s16(vres_r, 8);
449            vres8_g = vshrn_n_s16(vres_g, 8);
450            vres8_b = vshrn_n_s16(vres_b, 8);
451
452            // add dst to result
453            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
454            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
455            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
456
457            // put result into 565 format
458            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
459            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
460
461            // Store result
462            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
463
464            // Next iteration
465            dst += 8;
466            count -= 8;
467
468        } while (count >= 8);
469    }
470
471    // Leftovers
472    if (count > 0) {
473        int scale = SkAlpha255To256(alpha);
474        DITHER_565_SCAN(y);
475        do {
476            SkPMColor c = *src++;
477            SkPMColorAssert(c);
478
479            int dither = DITHER_VALUE(x);
480            int sr = SkGetPackedR32(c);
481            int sg = SkGetPackedG32(c);
482            int sb = SkGetPackedB32(c);
483            sr = SkDITHER_R32To565(sr, dither);
484            sg = SkDITHER_G32To565(sg, dither);
485            sb = SkDITHER_B32To565(sb, dither);
486
487            uint16_t d = *dst;
488            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
489                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
490                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
491            DITHER_INC_X(x);
492        } while (--count != 0);
493    }
494}
495
496void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
497                                const SkPMColor* SK_RESTRICT src,
498                                int count, U8CPU alpha) {
499
500    SkASSERT(255 == alpha);
501    if (count > 0) {
502
503
504    uint8x8_t alpha_mask;
505
506    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
507    alpha_mask = vld1_u8(alpha_mask_setup);
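    /* With alpha stored in the top byte of each 32-bit pixel, byte 3 of a
     * loaded pixel pair is pixel 0's alpha and byte 7 is pixel 1's; vtbl1_u8
     * with this index vector broadcasts each pixel's alpha across its own
     * four byte lanes, ready for the widening multiply below. */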
508
509    /* do the NEON unrolled code */
510#define    UNROLL    4
511    while (count >= UNROLL) {
512        uint8x8_t src_raw, dst_raw, dst_final;
513        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
514
515        /* The two prefetches below may make the code slightly
516         * slower for small values of count but are worth having
517         * in the general case.
518         */
519        __builtin_prefetch(src+32);
520        __builtin_prefetch(dst+32);
521
522        /* get the source */
523        src_raw = vreinterpret_u8_u32(vld1_u32(src));
524#if    UNROLL > 2
525        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
526#endif
527
528        /* get and hold the dst too */
529        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
530#if    UNROLL > 2
531        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
532#endif
533
534    /* 1st and 2nd bits of the unrolling */
535    {
536        uint8x8_t dst_cooked;
537        uint16x8_t dst_wide;
538        uint8x8_t alpha_narrow;
539        uint16x8_t alpha_wide;
540
541        /* get the alphas spread out properly */
542        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
543        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
544
545        /* spread the dest */
546        dst_wide = vmovl_u8(dst_raw);
547
548        /* alpha mul the dest */
549        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
550        dst_cooked = vshrn_n_u16(dst_wide, 8);
551
552        /* sum -- ignoring any byte lane overflows */
553        dst_final = vadd_u8(src_raw, dst_cooked);
554    }
555
556#if    UNROLL > 2
557    /* the 3rd and 4th bits of our unrolling */
558    {
559        uint8x8_t dst_cooked;
560        uint16x8_t dst_wide;
561        uint8x8_t alpha_narrow;
562        uint16x8_t alpha_wide;
563
564        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
565        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
566
567        /* spread the dest */
568        dst_wide = vmovl_u8(dst_raw_2);
569
570        /* alpha mul the dest */
571        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
572        dst_cooked = vshrn_n_u16(dst_wide, 8);
573
574        /* sum -- ignoring any byte lane overflows */
575        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
576    }
577#endif
578
579        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
580#if    UNROLL > 2
581        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
582#endif
583
584        src += UNROLL;
585        dst += UNROLL;
586        count -= UNROLL;
587    }
588#undef    UNROLL
589
590    /* do any residual iterations */
591        while (--count >= 0) {
592            *dst = SkPMSrcOver(*src, *dst);
593            src += 1;
594            dst += 1;
595        }
596    }
597}
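
// Note on the arithmetic above: per byte lane the unrolled loop computes
//     dst' = src + ((dst * (256 - src_alpha)) >> 8)
// which is the same scaling SkPMSrcOver() applies, since
// SkAlpha255To256(255 - a) == 256 - a.  E.g. for src_alpha = 128 and a dst
// lane of 200, the dst contribution is (200 * 128) >> 8 = 100.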
598
599void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
600                                const SkPMColor* SK_RESTRICT src,
601                                int count, U8CPU alpha) {
602    SkASSERT(255 == alpha);
603
604    if (count <= 0)
605        return;
606
607    /* Use these to check if src is transparent or opaque */
608    const unsigned int ALPHA_OPAQ  = 0xFF000000;
609    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
610
611#define UNROLL  4
612    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
613    const SkPMColor* SK_RESTRICT src_temp = src;
614
615    /* set up the NEON variables */
616    uint8x8_t alpha_mask;
617    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
618    alpha_mask = vld1_u8(alpha_mask_setup);
619
620    uint8x8_t src_raw, dst_raw, dst_final;
621    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
622    uint8x8_t dst_cooked;
623    uint16x8_t dst_wide;
624    uint8x8_t alpha_narrow;
625    uint16x8_t alpha_wide;
626
627    /* choose the first processing type */
628    if( src >= src_end)
629        goto TAIL;
630    if(*src <= ALPHA_TRANS)
631        goto ALPHA_0;
632    if(*src >= ALPHA_OPAQ)
633        goto ALPHA_255;
634    /* fall-thru */
635
636ALPHA_1_TO_254:
637    do {
638
639        /* get the source */
640        src_raw = vreinterpret_u8_u32(vld1_u32(src));
641        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
642
643        /* get and hold the dst too */
644        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
645        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
646
647
648        /* get the alphas spread out properly */
649        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
650        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
651        /* we collapsed (255-a)+1 ... */
652        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
653
654        /* spread the dest */
655        dst_wide = vmovl_u8(dst_raw);
656
657        /* alpha mul the dest */
658        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
659        dst_cooked = vshrn_n_u16(dst_wide, 8);
660
661        /* sum -- ignoring any byte lane overflows */
662        dst_final = vadd_u8(src_raw, dst_cooked);
663
664        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
665        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
666        /* we collapsed (255-a)+1 ... */
667        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
668
669        /* spread the dest */
670        dst_wide = vmovl_u8(dst_raw_2);
671
672        /* alpha mul the dest */
673        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
674        dst_cooked = vshrn_n_u16(dst_wide, 8);
675
676        /* sum -- ignoring any byte lane overflows */
677        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
678
679        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
680        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
681
682        src += UNROLL;
683        dst += UNROLL;
684
685        /* if the next 2 pixels aren't between 1 and 254,
686         * it might make sense to go to the optimized loops */
687        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
688            break;
689
690    } while(src < src_end);
691
692    if (src >= src_end)
693        goto TAIL;
694
695    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
696        goto ALPHA_255;
697
698    /*fall-thru*/
699
700ALPHA_0:
701
702    /* In this state, we know the current alpha is 0 and
703     * we optimize for the next alpha also being zero. */
704    src_temp = src;  //so we don't have to increment dst every time
705    do {
706        if(*(++src) > ALPHA_TRANS)
707            break;
708        if(*(++src) > ALPHA_TRANS)
709            break;
710        if(*(++src) > ALPHA_TRANS)
711            break;
712        if(*(++src) > ALPHA_TRANS)
713            break;
714    } while(src < src_end);
715
716    dst += (src - src_temp);
717
718    /* no longer alpha 0, so determine where to go next. */
719    if( src >= src_end)
720        goto TAIL;
721    if(*src >= ALPHA_OPAQ)
722        goto ALPHA_255;
723    else
724        goto ALPHA_1_TO_254;
725
726ALPHA_255:
727    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
728        dst[0]=src[0];
729        dst[1]=src[1];
730        dst[2]=src[2];
731        dst[3]=src[3];
732        src+=UNROLL;
733        dst+=UNROLL;
734        if(src >= src_end)
735            goto TAIL;
736    }
737
738    //Handle remainder.
739    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
740        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
741            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
742        }
743    }
744
745    if( src >= src_end)
746        goto TAIL;
747    if(*src <= ALPHA_TRANS)
748        goto ALPHA_0;
749    else
750        goto ALPHA_1_TO_254;
751
752TAIL:
753    /* do any residual iterations */
754    src_end += UNROLL + 1;  // go to the real end
755    while(src != src_end) {
756        if( *src != 0 ) {
757            if( *src >= ALPHA_OPAQ ) {
758                *dst = *src;
759            }
760            else {
761                *dst = SkPMSrcOver(*src, *dst);
762            }
763        }
764        src++;
765        dst++;
766    }
767
768#undef    UNROLL
769    return;
770}
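
// The routine above is in effect a small state machine over runs of source
// pixels: fully transparent runs (alpha == 0) only advance the pointers,
// fully opaque runs (alpha == 255) are plain copies, and everything else goes
// through the NEON blend loop. The ALPHA_TRANS / ALPHA_OPAQ comparisons peek
// at upcoming pixels to decide when to switch states, and TAIL finishes the
// last few pixels with scalar SkPMSrcOver().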
771
772/* Neon version of S32_Blend_BlitRow32()
773 * portable version is in src/core/SkBlitRow_D32.cpp
774 */
775void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
776                              const SkPMColor* SK_RESTRICT src,
777                              int count, U8CPU alpha) {
778    SkASSERT(alpha <= 255);
779    if (count > 0) {
780        uint16_t src_scale = SkAlpha255To256(alpha);
781        uint16_t dst_scale = 256 - src_scale;
782
783    /* run them N at a time through the NEON unit */
784    /* note that each pixel is 4 bytes, with each byte treated exactly the
785     * same, so we can work under that guise. We *do* know that the src & dst
786     * will be 32-bit aligned quantities, so we can specify that on
787     * the load/store ops and do a neon 'reinterpret' to get us to
788     * byte-sized (pun intended) pieces that we widen/multiply/shift.
789     * We're limited to 128 bits in the wide ops, which is 8x16 bits,
790     * or a pair of 32-bit src/dsts.
791     */
792    /* we *could* manually unroll this loop so that we load 128 bits
793     * (as a pair of 64s) from each of src and dst, processing them
794     * in pieces. This might give us a little better management of
795     * the memory latency, but my initial attempts here did not
796     * produce an instruction stream that looked all that nice.
797     */
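    /* Per byte lane, the loop below computes
     *     dst' = (src * src_scale + dst * dst_scale) >> 8
     * with src_scale = alpha + 1 and dst_scale = 256 - src_scale, matching the
     * portable SkAlphaMulQ() pair used for the leftover pixel. E.g. alpha = 127
     * gives src_scale = dst_scale = 128: an even 50/50 blend.
     */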
798#define    UNROLL    2
799    while (count >= UNROLL) {
800        uint8x8_t  src_raw, dst_raw, dst_final;
801        uint16x8_t  src_wide, dst_wide;
802
803        /* get 64 bits of src, widen it, multiply by src_scale */
804        src_raw = vreinterpret_u8_u32(vld1_u32(src));
805        src_wide = vmovl_u8(src_raw);
806        /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
807        src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
808
809        /* ditto with dst */
810        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
811        dst_wide = vmovl_u8(dst_raw);
812
813        /* combine add with dst multiply into mul-accumulate */
814        dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
815
816        dst_final = vshrn_n_u16(dst_wide, 8);
817        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
818
819        src += UNROLL;
820        dst += UNROLL;
821        count -= UNROLL;
822    }
823    /* RBE: well, I don't like how gcc manages src/dst across the above
824     * loop; it's constantly calculating src+bias, dst+bias and it only
825     * adjusts the real ones when we leave the loop. Not sure why
826     * it's "hoisting down" (hoisting implies above in my lexicon ;))
827     * the adjustments to src/dst/count, but it does...
828     * (might be SSA-style internal logic...)
829     */
830
831#if    UNROLL == 2
832    if (count == 1) {
833            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
834    }
835#else
836    if (count > 0) {
837            do {
838                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
839                src += 1;
840                dst += 1;
841            } while (--count > 0);
842    }
843#endif
844
845#undef    UNROLL
846    }
847}
848
849void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
850                         const SkPMColor* SK_RESTRICT src,
851                         int count, U8CPU alpha) {
852
853    SkASSERT(255 >= alpha);
854
855    if (count <= 0) {
856        return;
857    }
858
859    unsigned alpha256 = SkAlpha255To256(alpha);
860
861    // First deal with odd counts
862    if (count & 1) {
863        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
864        uint16x8_t vdst_wide, vsrc_wide;
865        unsigned dst_scale;
866
867        // Load
868        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
869        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
870
871        // Calc dst_scale
872        dst_scale = vget_lane_u8(vsrc, 3);
873        dst_scale *= alpha256;
874        dst_scale >>= 8;
875        dst_scale = 256 - dst_scale;
876
877        // Process src
878        vsrc_wide = vmovl_u8(vsrc);
879        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
880
881        // Process dst
882        vdst_wide = vmovl_u8(vdst);
883        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
884
885        // Combine
886        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
887
888        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
889        dst++;
890        src++;
891        count--;
892    }
893
894    if (count) {
895        uint8x8_t alpha_mask;
896        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
897        alpha_mask = vld1_u8(alpha_mask_setup);
898
899        do {
900
901            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
902            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
903
904            __builtin_prefetch(src+32);
905            __builtin_prefetch(dst+32);
906
907            // Load
908            vsrc = vreinterpret_u8_u32(vld1_u32(src));
909            vdst = vreinterpret_u8_u32(vld1_u32(dst));
910
911            // Prepare src_scale
912            vsrc_scale = vdupq_n_u16(alpha256);
913
914            // Calc dst_scale
915            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
916            vdst_scale = vmovl_u8(vsrc_alphas);
917            vdst_scale *= vsrc_scale;
918            vdst_scale = vshrq_n_u16(vdst_scale, 8);
919            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
920
921            // Process src
922            vsrc_wide = vmovl_u8(vsrc);
923            vsrc_wide *= vsrc_scale;
924
925            // Process dst
926            vdst_wide = vmovl_u8(vdst);
927            vdst_wide *= vdst_scale;
928
929            // Combine
930            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
931
932            vst1_u32(dst, vreinterpret_u32_u8(vres));
933
934            src += 2;
935            dst += 2;
936            count -= 2;
937        } while(count);
938    }
939}
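
// Per byte lane, both the odd-count case and the two-pixel loop above compute
//     dst' = ((src * src_scale) >> 8) + ((dst * dst_scale) >> 8)
// with src_scale = alpha + 1 and dst_scale = 256 - ((src_alpha * src_scale) >> 8).
// For alpha = 255 this reduces to dst_scale = 256 - src_alpha, i.e. the opaque
// src-over case handled by S32A_Opaque_BlitRow32_neon above.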
940
941///////////////////////////////////////////////////////////////////////////////
942
943#undef    DEBUG_OPAQUE_DITHER
944
945#if    defined(DEBUG_OPAQUE_DITHER)
946static void showme8(char *str, void *p, int len)
947{
948    static char buf[256];
949    char tbuf[32];
950    int i;
951    char *pc = (char*) p;
952    sprintf(buf,"%8s:", str);
953    for(i=0;i<len;i++) {
954        sprintf(tbuf, "   %02x", pc[i]);
955        strcat(buf, tbuf);
956    }
957    SkDebugf("%s\n", buf);
958}
959static void showme16(char *str, void *p, int len)
960{
961    static char buf[256];
962    char tbuf[32];
963    int i;
964    uint16_t *pc = (uint16_t*) p;
965    sprintf(buf,"%8s:", str);
966    len = (len / sizeof(uint16_t));    /* passed as bytes */
967    for(i=0;i<len;i++) {
968        sprintf(tbuf, " %04x", pc[i]);
969        strcat(buf, tbuf);
970    }
971    SkDebugf("%s\n", buf);
972}
973#endif
974
975void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
976                                   const SkPMColor* SK_RESTRICT src,
977                                   int count, U8CPU alpha, int x, int y) {
978    SkASSERT(255 == alpha);
979
980#define    UNROLL    8
981
982    if (count >= UNROLL) {
983    uint8x8_t dbase;
984
985#if    defined(DEBUG_OPAQUE_DITHER)
986    uint16_t tmpbuf[UNROLL];
987    int td[UNROLL];
988    int tdv[UNROLL];
989    int ta[UNROLL];
990    int tap[UNROLL];
991    uint16_t in_dst[UNROLL];
992    int offset = 0;
993    int noisy = 0;
994#endif
995
996    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
997    dbase = vld1_u8(dstart);
998
999        do {
1000        uint8x8_t sr, sg, sb, sa, d;
1001        uint16x8_t dst8, scale8, alpha8;
1002        uint16x8_t dst_r, dst_g, dst_b;
1003
1004#if    defined(DEBUG_OPAQUE_DITHER)
1005    /* calculate 8 elements worth into a temp buffer */
1006    {
1007      int my_y = y;
1008      int my_x = x;
1009      SkPMColor* my_src = (SkPMColor*)src;
1010      uint16_t* my_dst = dst;
1011      int i;
1012
1013          DITHER_565_SCAN(my_y);
1014          for(i=0;i<UNROLL;i++) {
1015            SkPMColor c = *my_src++;
1016            SkPMColorAssert(c);
1017            if (c) {
1018                unsigned a = SkGetPackedA32(c);
1019
1020                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1021        tdv[i] = DITHER_VALUE(my_x);
1022        ta[i] = a;
1023        tap[i] = SkAlpha255To256(a);
1024        td[i] = d;
1025
1026                unsigned sr = SkGetPackedR32(c);
1027                unsigned sg = SkGetPackedG32(c);
1028                unsigned sb = SkGetPackedB32(c);
1029                sr = SkDITHER_R32_FOR_565(sr, d);
1030                sg = SkDITHER_G32_FOR_565(sg, d);
1031                sb = SkDITHER_B32_FOR_565(sb, d);
1032
1033                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1034                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1035                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1036                // now src and dst expanded are in g:11 r:10 x:1 b:10
1037                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1038        td[i] = d;
1039
1040            } else {
1041        tmpbuf[i] = *my_dst;
1042        ta[i] = tdv[i] = td[i] = 0xbeef;
1043        }
1044        in_dst[i] = *my_dst;
1045            my_dst += 1;
1046            DITHER_INC_X(my_x);
1047          }
1048    }
1049#endif
1050
1051        /* source is in ABGR */
1052        {
1053        register uint8x8_t d0 asm("d0");
1054        register uint8x8_t d1 asm("d1");
1055        register uint8x8_t d2 asm("d2");
1056        register uint8x8_t d3 asm("d3");
1057
1058        asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1059            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
1060            : "r" (src)
1061                    );
1062            sr = d0; sg = d1; sb = d2; sa = d3;
1063        }
1064
1065        /* calculate 'd', which will be 0..7 */
1066        /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
1067#if defined(SK_BUILD_FOR_ANDROID)
1068        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1069        alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
1070#else
1071        alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
1072#endif
1073        alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
1074        d = vshrn_n_u16(alpha8, 8);    /* narrowing too */
1075
1076        /* sr = sr - (sr>>5) + d */
1077        /* watching for 8-bit overflow.  d is 0..7; risky range of
1078         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1079         * safe  as long as we do ((sr-sr>>5) + d) */
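        /* e.g. sr = 255, d = 7: (255 - (255 >> 5)) + 7 = 248 + 7 = 255,
         * so the sum still fits in the 8-bit lane */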
1080        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1081        sr = vadd_u8(sr, d);
1082
1083        /* sb = sb - (sb>>5) + d */
1084        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1085        sb = vadd_u8(sb, d);
1086
1087        /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
1088        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1089        sg = vadd_u8(sg, vshr_n_u8(d,1));
1090
1091        /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
1092        dst8 = vld1q_u16(dst);
1093        dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
1094        dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
1095        dst_r = vshrq_n_u16(dst8,11);    /* clearing hi bits */
1096
1097        /* blend */
1098#if 1
1099        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1100        /* originally 255-sa + 1 */
1101        scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1102#else
1103        scale8 = vsubw_u8(vdupq_n_u16(255), sa);
1104        scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
1105#endif
1106
1107#if 1
1108        /* combine the addq and mul, save 3 insns */
1109        scale8 = vshrq_n_u16(scale8, 3);
1110        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1111        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1112        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1113#else
1114        /* known correct, but +3 insns over above */
1115        scale8 = vshrq_n_u16(scale8, 3);
1116        dst_b = vmulq_u16(dst_b, scale8);
1117        dst_g = vmulq_u16(dst_g, scale8);
1118        dst_r = vmulq_u16(dst_r, scale8);
1119
1120        /* combine */
1121        /* NB: vshll widens, need to preserve those bits */
1122        dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
1123        dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
1124        dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
1125#endif
1126
1127        /* repack to store */
1128        dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
1129        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1130        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1131
1132        vst1q_u16(dst, dst8);
1133
1134#if    defined(DEBUG_OPAQUE_DITHER)
1135        /* verify my 8 elements match the temp buffer */
1136    {
1137       int i, bad=0;
1138       static int invocation;
1139
1140       for (i=0;i<UNROLL;i++)
1141        if (tmpbuf[i] != dst[i]) bad=1;
1142       if (bad) {
1143        SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1144            invocation, offset);
1145        SkDebugf("  alpha 0x%x\n", alpha);
1146        for (i=0;i<UNROLL;i++)
1147            SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1148            i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
1149            dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
1150
1151        showme16("alpha8", &alpha8, sizeof(alpha8));
1152        showme16("scale8", &scale8, sizeof(scale8));
1153        showme8("d", &d, sizeof(d));
1154        showme16("dst8", &dst8, sizeof(dst8));
1155        showme16("dst_b", &dst_b, sizeof(dst_b));
1156        showme16("dst_g", &dst_g, sizeof(dst_g));
1157        showme16("dst_r", &dst_r, sizeof(dst_r));
1158        showme8("sb", &sb, sizeof(sb));
1159        showme8("sg", &sg, sizeof(sg));
1160        showme8("sr", &sr, sizeof(sr));
1161
1162        /* cop out */
1163        return;
1164       }
1165       offset += UNROLL;
1166       invocation++;
1167    }
1168#endif
1169
1170            dst += UNROLL;
1171        src += UNROLL;
1172        count -= UNROLL;
1173        /* skip x += UNROLL, since it's unchanged mod-4 */
1174        } while (count >= UNROLL);
1175    }
1176#undef    UNROLL
1177
1178    /* residuals */
1179    if (count > 0) {
1180        DITHER_565_SCAN(y);
1181        do {
1182            SkPMColor c = *src++;
1183            SkPMColorAssert(c);
1184            if (c) {
1185                unsigned a = SkGetPackedA32(c);
1186
1187                // dither and alpha are just temporary variables to work around
1188                // an ICE in debug.
1189                unsigned dither = DITHER_VALUE(x);
1190                unsigned alpha = SkAlpha255To256(a);
1191                int d = SkAlphaMul(dither, alpha);
1192
1193                unsigned sr = SkGetPackedR32(c);
1194                unsigned sg = SkGetPackedG32(c);
1195                unsigned sb = SkGetPackedB32(c);
1196                sr = SkDITHER_R32_FOR_565(sr, d);
1197                sg = SkDITHER_G32_FOR_565(sg, d);
1198                sb = SkDITHER_B32_FOR_565(sb, d);
1199
1200                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1201                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1202                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1203                // now src and dst expanded are in g:11 r:10 x:1 b:10
1204                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1205            }
1206            dst += 1;
1207            DITHER_INC_X(x);
1208        } while (--count != 0);
1209    }
1210}
1211
1212///////////////////////////////////////////////////////////////////////////////
1213
1214#undef    DEBUG_S32_OPAQUE_DITHER
1215
1216void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1217                                 const SkPMColor* SK_RESTRICT src,
1218                                 int count, U8CPU alpha, int x, int y) {
1219    SkASSERT(255 == alpha);
1220
1221#define    UNROLL    8
1222    if (count >= UNROLL) {
1223    uint8x8_t d;
1224    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1225    d = vld1_u8(dstart);
1226
1227    while (count >= UNROLL) {
1228        uint8x8_t sr, sg, sb;
1229        uint16x8_t dr, dg, db;
1230        uint16x8_t dst8;
1231
1232        {
1233        register uint8x8_t d0 asm("d0");
1234        register uint8x8_t d1 asm("d1");
1235        register uint8x8_t d2 asm("d2");
1236        register uint8x8_t d3 asm("d3");
1237
1238        asm (
1239            "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1240            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1241            :
1242        );
1243        sg = d1;
1244#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
1245        sr = d2; sb = d0;
1246#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
1247        sr = d0; sb = d2;
1248#endif
1249        }
1250        /* XXX: if we want to prefetch, hide it in the above asm()
1251         * using the gcc __builtin_prefetch(), the prefetch will
1252         * fall to the bottom of the loop -- it won't stick up
1253         * at the top of the loop, just after the vld4.
1254         */
1255
1256        // sr = sr - (sr>>5) + d
1257        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1258        dr = vaddl_u8(sr, d);
1259
1260        // sb = sb - (sb>>5) + d
1261        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1262        db = vaddl_u8(sb, d);
1263
1264        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1265        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1266        dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1267
1268        // pack high bits of each into 565 format  (rgb, b is lsb)
1269        dst8 = vshrq_n_u16(db, 3);
1270        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1271        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1272
1273        // store it
1274        vst1q_u16(dst, dst8);
1275
1276#if    defined(DEBUG_S32_OPAQUE_DITHER)
1277        // always good to know if we generated good results
1278        {
1279        int i, myx = x, myy = y;
1280        DITHER_565_SCAN(myy);
1281        for (i=0;i<UNROLL;i++) {
1282            // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
1283            SkPMColor c = src[i-8];
1284            unsigned dither = DITHER_VALUE(myx);
1285            uint16_t val = SkDitherRGB32To565(c, dither);
1286            if (val != dst[i]) {
1287            SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1288                c, dither, val, dst[i], dstart[i]);
1289            }
1290            DITHER_INC_X(myx);
1291        }
1292        }
1293#endif
1294
1295        dst += UNROLL;
1296        // we don't need to increment src as the asm above has already done it
1297        count -= UNROLL;
1298        x += UNROLL;        // probably superfluous
1299    }
1300    }
1301#undef    UNROLL
1302
1303    // residuals
1304    if (count > 0) {
1305        DITHER_565_SCAN(y);
1306        do {
1307            SkPMColor c = *src++;
1308            SkPMColorAssert(c);
1309            SkASSERT(SkGetPackedA32(c) == 255);
1310
1311            unsigned dither = DITHER_VALUE(x);
1312            *dst++ = SkDitherRGB32To565(c, dither);
1313            DITHER_INC_X(x);
1314        } while (--count != 0);
1315    }
1316}
1317
1318void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
1319                      SkPMColor color) {
1320    if (count <= 0) {
1321        return;
1322    }
1323
1324    if (0 == color) {
1325        if (src != dst) {
1326            memcpy(dst, src, count * sizeof(SkPMColor));
1327        }
1328        return;
1329    }
1330
1331    unsigned colorA = SkGetPackedA32(color);
1332    if (255 == colorA) {
1333        sk_memset32(dst, color, count);
1334    } else {
1335        unsigned scale = 256 - SkAlpha255To256(colorA);
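        // Per byte lane the code below computes dst = color + ((src * scale) >> 8);
        // scale here equals 255 - colorA, so each src channel is faded by the
        // color's alpha before the constant color is added back. E.g. colorA = 128
        // gives scale = 127, so src channels keep roughly half their value.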
1336
1337        if (count >= 8) {
1338            // at the end of this assembly, count will have been decremented
1339            // to a negative value. That is, if count mod 8 = x, it will be
1340            // -8 +x coming out.
1341            asm volatile (
1342                PLD128(src, 0)
1343
1344                "vdup.32    q0, %[color]                \n\t"
1345
1346                PLD128(src, 128)
1347
1348                // scale numerical interval [0-255], so load as 8 bits
1349                "vdup.8     d2, %[scale]                \n\t"
1350
1351                PLD128(src, 256)
1352
1353                "subs       %[count], %[count], #8      \n\t"
1354
1355                PLD128(src, 384)
1356
1357                "Loop_Color32:                          \n\t"
1358
1359                // load src color, 8 pixels, 4 64 bit registers
1360                // (and increment src).
1361                "vld1.32    {d4-d7}, [%[src]]!          \n\t"
1362
1363                PLD128(src, 384)
1364
1365                // multiply long by scale, 64 bits at a time,
1366                // destination into a 128 bit register.
1367                "vmull.u8   q4, d4, d2                  \n\t"
1368                "vmull.u8   q5, d5, d2                  \n\t"
1369                "vmull.u8   q6, d6, d2                  \n\t"
1370                "vmull.u8   q7, d7, d2                  \n\t"
1371
1372                // shift the 128 bit registers, containing the 16
1373                // bit scaled values back to 8 bits, narrowing the
1374                // results to 64 bit registers.
1375                "vshrn.i16  d8, q4, #8                  \n\t"
1376                "vshrn.i16  d9, q5, #8                  \n\t"
1377                "vshrn.i16  d10, q6, #8                 \n\t"
1378                "vshrn.i16  d11, q7, #8                 \n\t"
1379
1380                // adding back the color, using 128 bit registers.
1381                "vadd.i8    q6, q4, q0                  \n\t"
1382                "vadd.i8    q7, q5, q0                  \n\t"
1383
1384                // store back the 8 calculated pixels (2 128 bit
1385                // registers), and increment dst.
1386                "vst1.32    {d12-d15}, [%[dst]]!        \n\t"
1387
1388                "subs       %[count], %[count], #8      \n\t"
1389                "bge        Loop_Color32                \n\t"
1390                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
1391                : [color] "r" (color), [scale] "r" (scale)
1392                : "cc", "memory",
1393                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1394                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
1395                          );
1396            // At this point, if we went through the inline assembly, count is
1397            // a negative value:
1398            // if the value is -8, there is no pixel left to process.
1399            // if the value is -7, there is one pixel left to process
1400            // ...
1401            // And'ing it with 7 will give us the number of pixels
1402            // left to process.
1403            count = count & 0x7;
1404        }
1405
1406        while (count > 0) {
1407            *dst = color + SkAlphaMulQ(*src, scale);
1408            src += 1;
1409            dst += 1;
1410            count--;
1411        }
1412    }
1413}
1414
1415///////////////////////////////////////////////////////////////////////////////
1416
1417const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1418    // no dither
1419    // NOTE: For the S32_D565_Blend function below, we don't have a special
1420    //       version that assumes that each source pixel is opaque. But our
1421    //       S32A is still faster than the default, so use it.
1422    S32_D565_Opaque_neon,
1423    S32A_D565_Blend_neon,   // really S32_D565_Blend
1424    S32A_D565_Opaque_neon,
1425    S32A_D565_Blend_neon,
1426
1427    // dither
1428    S32_D565_Opaque_Dither_neon,
1429    S32_D565_Blend_Dither_neon,
1430    S32A_D565_Opaque_Dither_neon,
1431    NULL,   // S32A_D565_Blend_Dither
1432};
1433
1434const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1435    NULL,   // S32_Opaque,
1436    S32_Blend_BlitRow32_neon,        // S32_Blend,
1437    /*
1438     * We have two choices for S32A_Opaque procs. One of them reads the src alpha
1439     * value and attempts to optimize accordingly.  The optimization is
1440     * sensitive to the source content and is not a win in all cases. For
1441     * example, if there are a lot of transitions between the alpha states,
1442     * the performance will almost certainly be worse.  However, for many
1443     * common cases the performance is equivalent or better than the standard
1444     * case where we do not inspect the src alpha.
1445     */
1446#if SK_A32_SHIFT == 24
1447    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
1448    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
1449#else
1450    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
1451#endif
1452    S32A_Blend_BlitRow32_neon        // S32A_Blend
1453};
1454