/*
 * Copyright 2009 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
88e048c19870a898cecdde3b3c0d2d512e6f372c0Mike Reed
996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#include "SkBlitRow.h"
101cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger#include "SkBlitMask.h"
117cc0a6ac7b817b217dd614cba96fc533b32d505dMike Reed#include "SkColorPriv.h"
127cc0a6ac7b817b217dd614cba96fc533b32d505dMike Reed#include "SkDither.h"
1396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
1454e0f955c21365271661cd92a29d06a847a18554Mike Reed#if defined(__ARM_HAVE_NEON)
1554e0f955c21365271661cd92a29d06a847a18554Mike Reed#include <arm_neon.h>
1654e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif
1754e0f955c21365271661cd92a29d06a847a18554Mike Reed
1854e0f955c21365271661cd92a29d06a847a18554Mike Reed#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
1996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reedstatic void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
2096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                                  const SkPMColor* SK_RESTRICT src, int count,
2196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                                  U8CPU alpha, int /*x*/, int /*y*/) {
2296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    SkASSERT(255 == alpha);
23bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed
2496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    if (count >= 8) {
2596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        uint16_t* SK_RESTRICT keep_dst;
2696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
2796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        asm volatile (
2896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "ands       ip, %[count], #7            \n\t"
2996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmov.u8    d31, #1<<7                  \n\t"
3096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vld1.16    {q12}, [%[dst]]             \n\t"
3196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
3296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "moveq      ip, #8                      \n\t"
3396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "mov        %[keep_dst], %[dst]         \n\t"
3496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
3596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "add        %[src], %[src], ip, LSL#2   \n\t"
3696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
3796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "subs       %[count], %[count], ip      \n\t"
3896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "b          9f                          \n\t"
3996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      // LOOP
4096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "2:                                         \n\t"
4196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
4296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vld1.16    {q12}, [%[dst]]!            \n\t"
4396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
4496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
4596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
4696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "subs       %[count], %[count], #8      \n\t"
4796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "9:                                         \n\t"
4896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "pld        [%[dst],#32]                \n\t"
4996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      // expand 0565 q12 to 8888 {d4-d7}
5096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmovn.u16  d4, q12                     \n\t"
5196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshr.u16   q11, q12, #5                \n\t"
5296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshr.u16   q10, q12, #6+5              \n\t"
5396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmovn.u16  d5, q11                     \n\t"
5496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmovn.u16  d6, q10                     \n\t"
5596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshl.u8    d4, d4, #3                  \n\t"
5696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshl.u8    d5, d5, #2                  \n\t"
5796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshl.u8    d6, d6, #3                  \n\t"
5896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
5996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmovl.u8   q14, d31                    \n\t"
6096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmovl.u8   q13, d31                    \n\t"
6196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmovl.u8   q12, d31                    \n\t"
6296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
6396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      // duplicate in 4/2/1 & 8pix vsns
6496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmvn.8     d30, d3                     \n\t"
6596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmlal.u8   q14, d30, d6                \n\t"
6696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmlal.u8   q13, d30, d5                \n\t"
6796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmlal.u8   q12, d30, d4                \n\t"
6896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshr.u16   q8, q14, #5                 \n\t"
6996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshr.u16   q9, q13, #6                 \n\t"
7096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vaddhn.u16 d6, q14, q8                 \n\t"
7196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshr.u16   q8, q12, #5                 \n\t"
7296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vaddhn.u16 d5, q13, q9                 \n\t"
7396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
7496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vaddhn.u16 d4, q12, q8                 \n\t"
7596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      // intentionally don't calculate alpha
7696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      // result in d4-d6
7796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
7896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vqadd.u8   d5, d5, d1                  \n\t"
7996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vqadd.u8   d4, d4, d2                  \n\t"
8096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
8196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      // pack 8888 {d4-d6} to 0565 q10
8296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshll.u8   q10, d6, #8                 \n\t"
8396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshll.u8   q3, d5, #8                  \n\t"
8496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshll.u8   q2, d4, #8                  \n\t"
8596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vsri.u16   q10, q3, #5                 \n\t"
8696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vsri.u16   q10, q2, #11                \n\t"
8796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
8896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "bne        2b                          \n\t"
8996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
9096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "1:                                         \n\t"
9196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
9296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      : [count] "+r" (count)
9396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
9496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
9596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
9696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "d30","d31"
9796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      );
982274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed    }
992274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed    else
1002274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed    {   // handle count < 8
10196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        uint16_t* SK_RESTRICT keep_dst;
10296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
10396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        asm volatile (
10496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmov.u8    d31, #1<<7                  \n\t"
10596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "mov        %[keep_dst], %[dst]         \n\t"
10696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
10796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "tst        %[count], #4                \n\t"
10896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "beq        14f                         \n\t"
10996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vld1.16    {d25}, [%[dst]]!            \n\t"
11096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vld1.32    {q1}, [%[src]]!             \n\t"
11196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
11296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "14:                                        \n\t"
11396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "tst        %[count], #2                \n\t"
11496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "beq        12f                         \n\t"
11596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
11696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vld1.32    {d1}, [%[src]]!             \n\t"
11796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
11896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "12:                                        \n\t"
11996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "tst        %[count], #1                \n\t"
12096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "beq        11f                         \n\t"
12196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
12296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"
12396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
12496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "11:                                        \n\t"
12596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      // unzips achieve the same as a vld4 operation
12696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vuzpq.u16  q0, q1                      \n\t"
12796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vuzp.u8    d0, d1                      \n\t"
12896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vuzp.u8    d2, d3                      \n\t"
12996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      // expand 0565 q12 to 8888 {d4-d7}
13096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmovn.u16  d4, q12                     \n\t"
13196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshr.u16   q11, q12, #5                \n\t"
13296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshr.u16   q10, q12, #6+5              \n\t"
13396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmovn.u16  d5, q11                     \n\t"
13496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmovn.u16  d6, q10                     \n\t"
13596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshl.u8    d4, d4, #3                  \n\t"
13696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshl.u8    d5, d5, #2                  \n\t"
13796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshl.u8    d6, d6, #3                  \n\t"
13896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
13996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmovl.u8   q14, d31                    \n\t"
14096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmovl.u8   q13, d31                    \n\t"
14196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmovl.u8   q12, d31                    \n\t"
14296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
14396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      // duplicate in 4/2/1 & 8pix vsns
14496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmvn.8     d30, d3                     \n\t"
14596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmlal.u8   q14, d30, d6                \n\t"
14696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmlal.u8   q13, d30, d5                \n\t"
14796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vmlal.u8   q12, d30, d4                \n\t"
14896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshr.u16   q8, q14, #5                 \n\t"
14996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshr.u16   q9, q13, #6                 \n\t"
15096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vaddhn.u16 d6, q14, q8                 \n\t"
15196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshr.u16   q8, q12, #5                 \n\t"
15296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vaddhn.u16 d5, q13, q9                 \n\t"
15396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
15496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vaddhn.u16 d4, q12, q8                 \n\t"
15596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      // intentionally don't calculate alpha
15696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      // result in d4-d6
15796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
15896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vqadd.u8   d5, d5, d1                  \n\t"
15996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vqadd.u8   d4, d4, d2                  \n\t"
16096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
16196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      // pack 8888 {d4-d6} to 0565 q10
16296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshll.u8   q10, d6, #8                 \n\t"
16396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshll.u8   q3, d5, #8                  \n\t"
16496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vshll.u8   q2, d4, #8                  \n\t"
16596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vsri.u16   q10, q3, #5                 \n\t"
16696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vsri.u16   q10, q2, #11                \n\t"
16796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
16896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      // store
16996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "tst        %[count], #4                \n\t"
17096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "beq        24f                         \n\t"
17196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
17296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
17396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "24:                                        \n\t"
17496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "tst        %[count], #2                \n\t"
17596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "beq        22f                         \n\t"
17696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
17796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
17896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "22:                                        \n\t"
17996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "tst        %[count], #1                \n\t"
18096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "beq        21f                         \n\t"
18196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
18296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
18396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "21:                                        \n\t"
18496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      : [count] "+r" (count)
18596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
18696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
18796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
18896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      "d30","d31"
18996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                      );
19096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    }
19196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed}
19296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
19396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reedstatic void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
19496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                                 const SkPMColor* SK_RESTRICT src, int count,
19596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                                 U8CPU alpha, int /*x*/, int /*y*/) {
196bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed
197bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed    U8CPU alpha_for_asm = alpha;
198bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed
19996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    asm volatile (
20096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
20196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed     * the original in two respects:
20296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed     *  1. The results have a few mismatches compared to the original code. These mismatches
20396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed     *     never exceed 1. It's possible to improve accuracy vs. a floating point
20496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed     *     implementation by introducing rounding right shifts (vrshr) for the final stage.
20596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed     *     Rounding is not present in the code below, because although results would be closer
20696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed     *     to a floating point implementation, the number of mismatches compared to the
20796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed     *     original code would be far greater.
20896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed     *  2. On certain inputs, the original code can overflow, causing colour channels to
20996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed     *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
21096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed     *     to affect another.
21196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed     */
21296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
213bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#if 1
214bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed		/* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
215bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed                  "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
216bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#else
21796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
218bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#endif
21996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmov.u16   q3, #255                        \n\t"   // set up constant
22096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
22196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
22296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "beq        2f                              \n\t"   // if count8 == 0, exit
22396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask
22496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
22596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "1:                                             \n\t"
22696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
22796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "subs       r4, r4, #1                      \n\t"   // decrement loop counter
22896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
22996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  //  and deinterleave
23096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
23196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
23296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vand       q10, q0, q15                    \n\t"   // extract blue
23396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshr.u16   q8, q0, #11                     \n\t"   // extract red
23496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshr.u16   q9, q9, #10                     \n\t"   // extract green
23596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  // dstrgb = {q8, q9, q10}
23696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
23796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
23896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
23996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range
24096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
24196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
24296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
24396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
24496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
24596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  // srcrgba = {q11, q12, q13, q14}
24696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
24796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
24896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
24996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
25096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale
25196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
25296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
25396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
25496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  // dst_scale = q2
25596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
25696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
25796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
25896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale
259bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed
260bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#if 1
261bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	// trying for a better match with SkDiv255Round(a)
262bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	// C alg is:  a+=128; (a+a>>8)>>8
263bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	// we'll use just a rounding shift [q2 is available for scratch]
264bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed                  "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
265bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed                  "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
266bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed                  "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
267bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#else
268bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	// arm's original "truncating divide by 256"
26996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
27096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
27196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
272bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#endif
27396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
27496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
27596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
27696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr
27796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
27896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "bne        1b                              \n\t"   // if counter != 0, loop
27996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "2:                                             \n\t"   // exit
28096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
281bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
28296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  :
28396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
28496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  );
285bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed
28696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    count &= 7;
28796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    if (count > 0) {
28896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        do {
28996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed            SkPMColor sc = *src++;
290215473cea1702d8acc0316da3e5a9bf4ce0130efMike Reed            if (sc) {
29196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                uint16_t dc = *dst;
292215473cea1702d8acc0316da3e5a9bf4ce0130efMike Reed                unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
293215473cea1702d8acc0316da3e5a9bf4ce0130efMike Reed                unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
294215473cea1702d8acc0316da3e5a9bf4ce0130efMike Reed                unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
295215473cea1702d8acc0316da3e5a9bf4ce0130efMike Reed                unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
296215473cea1702d8acc0316da3e5a9bf4ce0130efMike Reed                *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
29796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed            }
29896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed            dst += 1;
29996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        } while (--count != 0);
30096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    }
30196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed}
30296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
/* Dither matrix for the NEON code path, derived from gDitherMatrix_3Bit_16.
 *
 * One dither value per byte. Each of the 4 rows holds its 4-entry pattern
 * three times (12 bytes), so that an 8-byte vector load starting at column
 * offset 0, 1, 2 or 3 stays inside the row.
 */
static const uint8_t gDitherMatrix_Neon[4 * 12] = {
    /* row 0 */ 0, 4, 1, 5,   0, 4, 1, 5,   0, 4, 1, 5,
    /* row 1 */ 6, 2, 7, 3,   6, 2, 7, 3,   6, 2, 7, 3,
    /* row 2 */ 1, 5, 0, 4,   1, 5, 0, 4,   1, 5, 0, 4,
    /* row 3 */ 7, 3, 6, 2,   7, 3, 6, 2,   7, 3, 6, 2,
};
31596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
31696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reedstatic void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
31796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                                       int count, U8CPU alpha, int x, int y)
31896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed{
31996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    /* select row and offset for dither array */
32096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
32196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
32296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    /* rescale alpha to range 0 - 256 */
32396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    int scale = SkAlpha255To256(alpha);
32496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
32596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    asm volatile (
32696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vld1.8         {d31}, [%[dstart]]              \n\t"   // load dither values
32796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshr.u8        d30, d31, #1                    \n\t"   // calc. green dither values
32896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vdup.16        d6, %[scale]                    \n\t"   // duplicate scale into neon reg
32996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmov.i8        d29, #0x3f                      \n\t"   // set up green mask
33096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmov.i8        d28, #0x1f                      \n\t"   // set up blue mask
33196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "1:                                                 \n\t"
33296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vld4.8         {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
33396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshr.u8        d22, d0, #5                     \n\t"   // calc. red >> 5
33496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshr.u8        d23, d1, #6                     \n\t"   // calc. green >> 6
33596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshr.u8        d24, d2, #5                     \n\t"   // calc. blue >> 5
33696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vaddl.u8       q8, d0, d31                     \n\t"   // add in dither to red and widen
33796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vaddl.u8       q9, d1, d30                     \n\t"   // add in dither to green and widen
33896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vaddl.u8       q10, d2, d31                    \n\t"   // add in dither to blue and widen
33996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vsubw.u8       q8, q8, d22                     \n\t"   // sub shifted red from result
34096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vsubw.u8       q9, q9, d23                     \n\t"   // sub shifted green from result
34196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vsubw.u8       q10, q10, d24                   \n\t"   // sub shifted blue from result
34296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshrn.i16      d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
34396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshrn.i16      d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
34496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshrn.i16      d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
34596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  // load 8 pixels from dst, extract rgb
34696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vld1.16        {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
34796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshrn.i16      d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
34896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmovn.i16      d18, q0                         \n\t"   // narrow to get blue as bytes
34996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshr.u16       q0, q0, #11                     \n\t"   // shift down to extract red
35096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vand           d17, d17, d29                   \n\t"   // and green with green mask
35196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vand           d18, d18, d28                   \n\t"   // and blue with blue mask
35296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmovn.i16      d16, q0                         \n\t"   // narrow to get red as bytes
35396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  // src = {d22 (r), d23 (g), d24 (b)}
35496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  // dst = {d16 (r), d17 (g), d18 (b)}
35596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  // subtract dst from src and widen
35696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vsubl.s8       q0, d22, d16                    \n\t"   // subtract red src from dst
35796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vsubl.s8       q1, d23, d17                    \n\t"   // subtract green src from dst
35896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vsubl.s8       q2, d24, d18                    \n\t"   // subtract blue src from dst
35996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  // multiply diffs by scale and shift
36096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmul.i16       q0, q0, d6[0]                   \n\t"   // multiply red by scale
36196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmul.i16       q1, q1, d6[0]                   \n\t"   // multiply blue by scale
36296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vmul.i16       q2, q2, d6[0]                   \n\t"   // multiply green by scale
36396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "subs           %[count], %[count], #8          \n\t"   // decrement loop counter
36496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshrn.i16      d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
36596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshrn.i16      d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
36696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vshrn.i16      d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
36796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  // add dst to result
36896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vaddl.s8       q0, d0, d16                     \n\t"   // add dst to red
36996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vaddl.s8       q1, d2, d17                     \n\t"   // add dst to green
37096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vaddl.s8       q2, d4, d18                     \n\t"   // add dst to blue
37196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  // put result into 565 format
37296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vsli.i16       q2, q1, #5                      \n\t"   // shift up green and insert into blue
37396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vsli.i16       q2, q0, #11                     \n\t"   // shift up red and insert into blue
37496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "vst1.16        {d4, d5}, [%[dst]]!             \n\t"   // store result
37596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  "bgt            1b                              \n\t"   // loop if count > 0
37696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
37796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  : [dstart] "r" (dstart), [scale] "r" (scale)
37896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
37996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                  );
38096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
38196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    DITHER_565_SCAN(y);
38296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
38396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    while((count & 7) > 0)
38496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    {
38596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        SkPMColor c = *src++;
38696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
38796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        int dither = DITHER_VALUE(x);
38896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        int sr = SkGetPackedR32(c);
38996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        int sg = SkGetPackedG32(c);
39096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        int sb = SkGetPackedB32(c);
39196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        sr = SkDITHER_R32To565(sr, dither);
39296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        sg = SkDITHER_G32To565(sg, dither);
39396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        sb = SkDITHER_B32To565(sb, dither);
39496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
39596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        uint16_t d = *dst;
39696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
39796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                             SkAlphaBlend(sg, SkGetPackedG16(d), scale),
39896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed                             SkAlphaBlend(sb, SkGetPackedB16(d), scale));
39996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        DITHER_INC_X(x);
40096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed        count--;
40196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    }
40296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed}
40396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
40496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#define S32A_D565_Opaque_PROC       S32A_D565_Opaque_neon
40596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#define S32A_D565_Blend_PROC        S32A_D565_Blend_neon
40696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#define S32_D565_Blend_Dither_PROC  S32_D565_Blend_Dither_neon
407c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#elif __ARM_ARCH__ >= 7 && !defined(SK_CPU_BENDIAN)
408c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenbergerstatic void S32A_D565_Opaque_v7(uint16_t* SK_RESTRICT dst,
409c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                  const SkPMColor* SK_RESTRICT src, int count,
410c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                  U8CPU alpha, int /*x*/, int /*y*/) {
411c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    SkASSERT(255 == alpha);
412c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
413c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    asm volatile (
414c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "1:                                   \n\t"
415c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "ldr     r3, [%[src]], #4             \n\t"
416c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "cmp     r3, #0xff000000              \n\t"
417c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "blo     2f                           \n\t"
418c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "and     r4, r3, #0x0000f8            \n\t"
419c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "and     r5, r3, #0x00fc00            \n\t"
420c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "and     r6, r3, #0xf80000            \n\t"
421c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "pld     [r1, #32]                    \n\t"
422c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "lsl     r3, r4, #8                   \n\t"
423c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "orr     r3, r3, r5, lsr #5           \n\t"
424c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "orr     r3, r3, r6, lsr #19          \n\t"
425c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "subs    %[count], %[count], #1       \n\t"
426c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "strh    r3, [%[dst]], #2             \n\t"
427c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "bne     1b                           \n\t"
428c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "b       4f                           \n\t"
429c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "2:                                   \n\t"
430c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "lsrs    r7, r3, #24                  \n\t"
431c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "beq     3f                           \n\t"
432c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "ldrh    r4, [%[dst]]                 \n\t"
433c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "rsb     r7, r7, #255                 \n\t"
434c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "and     r6, r4, #0x001f              \n\t"
435c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "ubfx    r5, r4, #5, #6               \n\t"
436c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "pld     [r0, #16]                    \n\t"
437c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "lsr     r4, r4, #11                  \n\t"
438c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "smulbb  r6, r6, r7                   \n\t"
439c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "smulbb  r5, r5, r7                   \n\t"
440c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "smulbb  r4, r4, r7                   \n\t"
441c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "ubfx    r7, r3, #16, #8              \n\t"
442c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "ubfx    ip, r3, #8, #8               \n\t"
443c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "and     r3, r3, #0xff                \n\t"
444c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "add     r6, r6, #16                  \n\t"
445c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "add     r5, r5, #32                  \n\t"
446c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "add     r4, r4, #16                  \n\t"
447c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "add     r6, r6, r6, lsr #5           \n\t"
448c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "add     r5, r5, r5, lsr #6           \n\t"
449c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "add     r4, r4, r4, lsr #5           \n\t"
450c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "add     r6, r7, r6, lsr #5           \n\t"
451c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "add     r5, ip, r5, lsr #6           \n\t"
452c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "add     r4, r3, r4, lsr #5           \n\t"
453c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "lsr     r6, r6, #3                   \n\t"
454c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "and     r5, r5, #0xfc                \n\t"
455c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "and     r4, r4, #0xf8                \n\t"
456c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "orr     r6, r6, r5, lsl #3           \n\t"
457c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "orr     r4, r6, r4, lsl #8           \n\t"
458c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "strh    r4, [%[dst]], #2             \n\t"
459c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "pld     [r1, #32]                    \n\t"
460c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "subs    %[count], %[count], #1       \n\t"
461c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "bne     1b                           \n\t"
462c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "b       4f                           \n\t"
463c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "3:                                   \n\t"
464c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "subs    %[count], %[count], #1       \n\t"
465c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "add     %[dst], %[dst], #2           \n\t"
466c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "bne     1b                           \n\t"
467c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  "4:                                   \n\t"
468c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
469c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  :
470c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "ip"
471c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                  );
472c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger}
473c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#define S32A_D565_Opaque_PROC       S32A_D565_Opaque_v7
474c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#define S32A_D565_Blend_PROC        NULL
475c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#define S32_D565_Blend_Dither_PROC  NULL
47696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#else
47796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#define S32A_D565_Opaque_PROC       NULL
47896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#define S32A_D565_Blend_PROC        NULL
47996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#define S32_D565_Blend_Dither_PROC  NULL
48096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#endif
48196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
/* There is no dedicated variant for sources known to be fully opaque (S32),
    but the S32A procs selected above are still faster than the default, so
    reuse them for the S32 cases.
 */
48596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#define S32_D565_Opaque_PROC    S32A_D565_Opaque_PROC
48696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#define S32_D565_Blend_PROC     S32A_D565_Blend_PROC
48796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
48896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed///////////////////////////////////////////////////////////////////////////////
48996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
490c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) && defined(TEST_SRC_ALPHA)
491c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
492c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenbergerstatic void S32A_Opaque_BlitRow32_neon_test_alpha(SkPMColor* SK_RESTRICT dst,
493c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                  const SkPMColor* SK_RESTRICT src,
494c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                  int count, U8CPU alpha) {
495c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	SkASSERT(255 == alpha);
496c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	if (count <= 0)
497c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	return;
498c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
499c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	/* Use these to check if src is transparent or opaque */
500c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	const unsigned int ALPHA_OPAQ  = 0xFF000000;
501c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	const unsigned int ALPHA_TRANS = 0x00FFFFFF;
502c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
503c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#define UNROLL  4
504c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
505c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	const SkPMColor* SK_RESTRICT src_temp = src;
506c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
507c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	/* set up the NEON variables */
508c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	uint8x8_t alpha_mask;
509c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
510c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	alpha_mask = vld1_u8(alpha_mask_setup);
511c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
512c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	uint8x8_t src_raw, dst_raw, dst_final;
513c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
514c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	uint8x8_t dst_cooked;
515c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	uint16x8_t dst_wide;
516c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	uint8x8_t alpha_narrow;
517c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	uint16x8_t alpha_wide;
518c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
519c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	/* choose the first processing type */
520c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	if( src >= src_end)
521c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		goto TAIL;
522c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	if(*src <= ALPHA_TRANS)
523c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		goto ALPHA_0;
524c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	if(*src >= ALPHA_OPAQ)
525c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		goto ALPHA_255;
526c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	/* fall-thru */
527c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
528c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek SollenbergerALPHA_1_TO_254:
529c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	do {
530c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
531c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		/* get the source */
532c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		src_raw = vreinterpret_u8_u32(vld1_u32(src));
533c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
534c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
535c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		/* get and hold the dst too */
536c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
537c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
538c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
539c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
540c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		/* get the alphas spread out properly */
541c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
542c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		/* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
543c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		/* we collapsed (255-a)+1 ... */
544c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
545c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
546c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		/* spread the dest */
547c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		dst_wide = vmovl_u8(dst_raw);
548c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
549c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		/* alpha mul the dest */
550c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		dst_wide = vmulq_u16 (dst_wide, alpha_wide);
551c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		dst_cooked = vshrn_n_u16(dst_wide, 8);
552c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
553c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		/* sum -- ignoring any byte lane overflows */
554c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		dst_final = vadd_u8(src_raw, dst_cooked);
555c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
556c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
557c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		/* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
558c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		/* we collapsed (255-a)+1 ... */
559c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
560c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
561c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		/* spread the dest */
562c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		dst_wide = vmovl_u8(dst_raw_2);
563c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
564c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		/* alpha mul the dest */
565c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		dst_wide = vmulq_u16 (dst_wide, alpha_wide);
566c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		dst_cooked = vshrn_n_u16(dst_wide, 8);
567c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
568c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		/* sum -- ignoring any byte lane overflows */
569c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
570c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
571c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		vst1_u32(dst, vreinterpret_u32_u8(dst_final));
572c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
573c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
574c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		src += UNROLL;
575c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		dst += UNROLL;
576c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
577c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		/* if 2 of the next pixels aren't between 1 and 254
578c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		it might make sense to go to the optimized loops */
579c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
580c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger			break;
581c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
582c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	} while(src < src_end);
583c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
584c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	if (src >= src_end)
585c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		goto TAIL;
586c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
587c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
588c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		goto ALPHA_255;
589c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
590c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	/*fall-thru*/
591c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
592c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek SollenbergerALPHA_0:
593c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
594c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	/*In this state, we know the current alpha is 0 and
595c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	 we optimize for the next alpha also being zero. */
596c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	src_temp = src;  //so we don't have to increment dst every time
597c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	do {
598c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		if(*(++src) > ALPHA_TRANS)
599c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger			break;
600c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		if(*(++src) > ALPHA_TRANS)
601c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger			break;
602c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		if(*(++src) > ALPHA_TRANS)
603c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger			break;
604c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		if(*(++src) > ALPHA_TRANS)
605c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger			break;
606c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	} while(src < src_end);
607c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
608c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	dst += (src - src_temp);
609c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
610c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	/* no longer alpha 0, so determine where to go next. */
611c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	if( src >= src_end)
612c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		goto TAIL;
613c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	if(*src >= ALPHA_OPAQ)
614c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		goto ALPHA_255;
615c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	else
616c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		goto ALPHA_1_TO_254;
617c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
618c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek SollenbergerALPHA_255:
619c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
620c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		dst[0]=src[0];
621c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		dst[1]=src[1];
622c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		dst[2]=src[2];
623c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		dst[3]=src[3];
624c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		src+=UNROLL;
625c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		dst+=UNROLL;
626c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		if(src >= src_end)
627c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger			goto TAIL;
628c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	}
629c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
630c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	//Handle remainder.
631c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
632c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
633c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger			if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
634c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		}
635c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	}
636c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
637c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	if( src >= src_end)
638c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		goto TAIL;
639c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	if(*src <= ALPHA_TRANS)
640c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		goto ALPHA_0;
641c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	else
642c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		goto ALPHA_1_TO_254;
643c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
644c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek SollenbergerTAIL:
645c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	/* do any residual iterations */
646c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	src_end += UNROLL + 1;  //goto the real end
647c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	while(src != src_end) {
648c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		if( *src != 0 ) {
649c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger			if( *src >= ALPHA_OPAQ ) {
650c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger				*dst = *src;
651c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger			}
652c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger			else {
653c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger				*dst = SkPMSrcOver(*src, *dst);
654c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger			}
655c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		}
656c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		src++;
657c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger		dst++;
658c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	}
659c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger	return;
660c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger}
661c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
662c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#define S32A_Opaque_BlitRow32_PROC  S32A_Opaque_BlitRow32_neon_test_alpha
663c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
664c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#elif defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
66554e0f955c21365271661cd92a29d06a847a18554Mike Reed
/*
 * S32A_Opaque_BlitRow32_neon
 *
 * Composites `count` 32-bit source pixels over `dst`, in place, for the case
 * where the global alpha is fully opaque (255).
 *
 *   dst   - destination pixels; each is read, blended and written back.
 *   src   - source pixels; only read.
 *   count - number of pixels to process; nothing happens unless count > 0.
 *   alpha - must be 255; it is only checked by SkASSERT and not used otherwise.
 *
 * The main loop handles UNROLL (4) pixels per iteration as two 64-bit NEON
 * vectors of two pixels each, computing for every byte lane
 *
 *     dst = src + ((dst * (256 - srcAlpha)) >> 8)
 *
 * where srcAlpha is byte 3 of the owning source pixel (see alpha_mask_setup).
 * The 0..3 pixels left over are blended one at a time with SkPMSrcOver().
 *
 * NOTE(review): SkPMColor is presumably a premultiplied color, which is what
 * makes the plain "src + scaled dst" sum valid -- confirm in SkColorPriv.h.
 */
66654e0f955c21365271661cd92a29d06a847a18554Mike Reedstatic void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
66754e0f955c21365271661cd92a29d06a847a18554Mike Reed                                  const SkPMColor* SK_RESTRICT src,
66854e0f955c21365271661cd92a29d06a847a18554Mike Reed                                  int count, U8CPU alpha) {
66954e0f955c21365271661cd92a29d06a847a18554Mike Reed
    /* this routine only implements the opaque-global-alpha case */
67054e0f955c21365271661cd92a29d06a847a18554Mike Reed    SkASSERT(255 == alpha);
67154e0f955c21365271661cd92a29d06a847a18554Mike Reed    if (count > 0) {
67254e0f955c21365271661cd92a29d06a847a18554Mike Reed
6732274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
6742274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	uint8x8_t alpha_mask;
6752274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
	/* table-lookup indices for vtbl1_u8: lanes 0-3 all pick byte 3 (alpha of
	 * the first pixel in a vector), lanes 4-7 all pick byte 7 (alpha of the
	 * second pixel), so every color byte gets its own pixel's alpha */
6762274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
6772274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	alpha_mask = vld1_u8(alpha_mask_setup);
6782274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
67954e0f955c21365271661cd92a29d06a847a18554Mike Reed	/* do the NEON unrolled code: UNROLL pixels per loop iteration */
68054e0f955c21365271661cd92a29d06a847a18554Mike Reed#define	UNROLL	4
68154e0f955c21365271661cd92a29d06a847a18554Mike Reed	while (count >= UNROLL) {
	    /* the *_2 variables hold the second pair of pixels and are only
	     * used when UNROLL > 2 */
68254e0f955c21365271661cd92a29d06a847a18554Mike Reed	    uint8x8_t src_raw, dst_raw, dst_final;
68354e0f955c21365271661cd92a29d06a847a18554Mike Reed	    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
68454e0f955c21365271661cd92a29d06a847a18554Mike Reed
68554e0f955c21365271661cd92a29d06a847a18554Mike Reed	    /* get the source: two 32-bit pixels per load, viewed as 8 bytes */
68654e0f955c21365271661cd92a29d06a847a18554Mike Reed	    src_raw = vreinterpret_u8_u32(vld1_u32(src));
68754e0f955c21365271661cd92a29d06a847a18554Mike Reed#if	UNROLL > 2
68854e0f955c21365271661cd92a29d06a847a18554Mike Reed	    src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
68954e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif
69054e0f955c21365271661cd92a29d06a847a18554Mike Reed
69154e0f955c21365271661cd92a29d06a847a18554Mike Reed	    /* get and hold the dst too */
69254e0f955c21365271661cd92a29d06a847a18554Mike Reed	    dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
69354e0f955c21365271661cd92a29d06a847a18554Mike Reed#if	UNROLL > 2
69454e0f955c21365271661cd92a29d06a847a18554Mike Reed	    dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
69554e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif
69654e0f955c21365271661cd92a29d06a847a18554Mike Reed
69754e0f955c21365271661cd92a29d06a847a18554Mike Reed	/* 1st and 2nd pixels of the unrolled group */
69854e0f955c21365271661cd92a29d06a847a18554Mike Reed	{
69954e0f955c21365271661cd92a29d06a847a18554Mike Reed	    uint8x8_t dst_cooked;
70054e0f955c21365271661cd92a29d06a847a18554Mike Reed	    uint16x8_t dst_wide;
70154e0f955c21365271661cd92a29d06a847a18554Mike Reed	    uint8x8_t alpha_narrow;
70254e0f955c21365271661cd92a29d06a847a18554Mike Reed	    uint16x8_t alpha_wide;
70354e0f955c21365271661cd92a29d06a847a18554Mike Reed
70454e0f955c21365271661cd92a29d06a847a18554Mike Reed	    /* get the alphas spread out properly: one alpha copy per byte lane */
70554e0f955c21365271661cd92a29d06a847a18554Mike Reed	    alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
706bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#if 1
707bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
708bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    /* we collapsed (255-a)+1 ... into a single 256-a per 16-bit lane */
709bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
710bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#else
	    /* disabled alternative: x = 255-a, then x + (x >> 7) */
7112274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
71254e0f955c21365271661cd92a29d06a847a18554Mike Reed	    alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
713bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#endif
71454e0f955c21365271661cd92a29d06a847a18554Mike Reed
7152274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    /* spread the dest: widen each byte to a 16-bit lane */
71654e0f955c21365271661cd92a29d06a847a18554Mike Reed	    dst_wide = vmovl_u8(dst_raw);
71754e0f955c21365271661cd92a29d06a847a18554Mike Reed
71854e0f955c21365271661cd92a29d06a847a18554Mike Reed	    /* alpha mul the dest; with the 256-a scale each product is at most
	     * 255*256, so it fits a 16-bit lane. The narrowing shift then keeps
	     * the high byte, i.e. (dst * scale) >> 8 */
71954e0f955c21365271661cd92a29d06a847a18554Mike Reed	    dst_wide = vmulq_u16 (dst_wide, alpha_wide);
72054e0f955c21365271661cd92a29d06a847a18554Mike Reed	    dst_cooked = vshrn_n_u16(dst_wide, 8);
72154e0f955c21365271661cd92a29d06a847a18554Mike Reed
72254e0f955c21365271661cd92a29d06a847a18554Mike Reed	    /* sum -- ignoring any byte lane overflows */
72354e0f955c21365271661cd92a29d06a847a18554Mike Reed	    dst_final = vadd_u8(src_raw, dst_cooked);
72454e0f955c21365271661cd92a29d06a847a18554Mike Reed	}
72554e0f955c21365271661cd92a29d06a847a18554Mike Reed
72654e0f955c21365271661cd92a29d06a847a18554Mike Reed#if	UNROLL > 2
72754e0f955c21365271661cd92a29d06a847a18554Mike Reed	/* the 3rd and 4th pixels of the unrolled group; same steps as above,
	 * applied to src_raw_2 / dst_raw_2 */
72854e0f955c21365271661cd92a29d06a847a18554Mike Reed	{
72954e0f955c21365271661cd92a29d06a847a18554Mike Reed	    uint8x8_t dst_cooked;
73054e0f955c21365271661cd92a29d06a847a18554Mike Reed	    uint16x8_t dst_wide;
73154e0f955c21365271661cd92a29d06a847a18554Mike Reed	    uint8x8_t alpha_narrow;
73254e0f955c21365271661cd92a29d06a847a18554Mike Reed	    uint16x8_t alpha_wide;
73354e0f955c21365271661cd92a29d06a847a18554Mike Reed
73454e0f955c21365271661cd92a29d06a847a18554Mike Reed	    alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
735bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#if 1
736bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
737bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    /* we collapsed (255-a)+1 ... into a single 256-a per 16-bit lane */
738bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
739bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#else
	    /* disabled alternative: x = 255-a, then x + (x >> 7) */
7402274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
74154e0f955c21365271661cd92a29d06a847a18554Mike Reed	    alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
742bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#endif
74354e0f955c21365271661cd92a29d06a847a18554Mike Reed
7442274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    /* spread the dest */
74554e0f955c21365271661cd92a29d06a847a18554Mike Reed	    dst_wide = vmovl_u8(dst_raw_2);
74654e0f955c21365271661cd92a29d06a847a18554Mike Reed
74754e0f955c21365271661cd92a29d06a847a18554Mike Reed	    /* alpha mul the dest */
74854e0f955c21365271661cd92a29d06a847a18554Mike Reed	    dst_wide = vmulq_u16 (dst_wide, alpha_wide);
74954e0f955c21365271661cd92a29d06a847a18554Mike Reed	    dst_cooked = vshrn_n_u16(dst_wide, 8);
75054e0f955c21365271661cd92a29d06a847a18554Mike Reed
75154e0f955c21365271661cd92a29d06a847a18554Mike Reed	    /* sum -- ignoring any byte lane overflows */
75254e0f955c21365271661cd92a29d06a847a18554Mike Reed	    dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
75354e0f955c21365271661cd92a29d06a847a18554Mike Reed	}
75454e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif
75554e0f955c21365271661cd92a29d06a847a18554Mike Reed
	    /* write the blended pixels back over the destination */
75654e0f955c21365271661cd92a29d06a847a18554Mike Reed	    vst1_u32(dst, vreinterpret_u32_u8(dst_final));
75754e0f955c21365271661cd92a29d06a847a18554Mike Reed#if	UNROLL > 2
75854e0f955c21365271661cd92a29d06a847a18554Mike Reed	    vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
75954e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif
76054e0f955c21365271661cd92a29d06a847a18554Mike Reed
76154e0f955c21365271661cd92a29d06a847a18554Mike Reed	    src += UNROLL;
76254e0f955c21365271661cd92a29d06a847a18554Mike Reed	    dst += UNROLL;
76354e0f955c21365271661cd92a29d06a847a18554Mike Reed	    count -= UNROLL;
76454e0f955c21365271661cd92a29d06a847a18554Mike Reed	}
76554e0f955c21365271661cd92a29d06a847a18554Mike Reed#undef	UNROLL
76654e0f955c21365271661cd92a29d06a847a18554Mike Reed
76754e0f955c21365271661cd92a29d06a847a18554Mike Reed	/* do any residual iterations: count is below 4 here, and the
	 * pre-decrement test runs the body exactly `count` more times */
76854e0f955c21365271661cd92a29d06a847a18554Mike Reed        while (--count >= 0) {
76954e0f955c21365271661cd92a29d06a847a18554Mike Reed#ifdef TEST_SRC_ALPHA
            /* optional fast paths: an all-zero source pixel leaves dst
             * untouched, a source alpha of 255 is copied straight through,
             * and anything else goes through the general blend */
77054e0f955c21365271661cd92a29d06a847a18554Mike Reed            SkPMColor sc = *src;
77154e0f955c21365271661cd92a29d06a847a18554Mike Reed            if (sc) {
77254e0f955c21365271661cd92a29d06a847a18554Mike Reed                unsigned srcA = SkGetPackedA32(sc);
77354e0f955c21365271661cd92a29d06a847a18554Mike Reed                SkPMColor result = sc;
77454e0f955c21365271661cd92a29d06a847a18554Mike Reed                if (srcA != 255) {
77554e0f955c21365271661cd92a29d06a847a18554Mike Reed                    result = SkPMSrcOver(sc, *dst);
77654e0f955c21365271661cd92a29d06a847a18554Mike Reed                }
77754e0f955c21365271661cd92a29d06a847a18554Mike Reed                *dst = result;
77854e0f955c21365271661cd92a29d06a847a18554Mike Reed            }
77954e0f955c21365271661cd92a29d06a847a18554Mike Reed#else
            /* without TEST_SRC_ALPHA every leftover pixel is blended
             * unconditionally */
78054e0f955c21365271661cd92a29d06a847a18554Mike Reed            *dst = SkPMSrcOver(*src, *dst);
78154e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif
78254e0f955c21365271661cd92a29d06a847a18554Mike Reed            src += 1;
78354e0f955c21365271661cd92a29d06a847a18554Mike Reed            dst += 1;
78454e0f955c21365271661cd92a29d06a847a18554Mike Reed        }
78554e0f955c21365271661cd92a29d06a847a18554Mike Reed    }
78654e0f955c21365271661cd92a29d06a847a18554Mike Reed}
78754e0f955c21365271661cd92a29d06a847a18554Mike Reed
78854e0f955c21365271661cd92a29d06a847a18554Mike Reed#define	S32A_Opaque_BlitRow32_PROC	S32A_Opaque_BlitRow32_neon
78935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
790c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#elif defined (__ARM_ARCH__) /* #if defined(__ARM_HAVE_NEON) && defined... */
79135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
792c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#if defined(TEST_SRC_ALPHA)
793c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
794c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenbergerstatic void __attribute__((naked)) S32A_Opaque_BlitRow32_arm_test_alpha
795c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                        (SkPMColor* SK_RESTRICT dst,
796c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                         const SkPMColor* SK_RESTRICT src,
797c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                         int count, U8CPU alpha) {
798c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
799c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger/* Optimizes for alpha == 0, alpha == 255, and 1 < alpha < 255 cases individually */
800c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger/* Predicts that the next pixel will have the same alpha type as the current pixel */
801c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
802c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenbergerasm volatile (
803c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
804c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tSTMDB  r13!, {r4-r12, r14}        \n" /* saving r4-r12, lr on the stack */
805c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* we should not save r0-r3 according to ABI */
806c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
807c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tCMP    r2, #0                     \n" /* if (count == 0) */
808c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBEQ    9f                         \n" /* go to EXIT */
809c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
810c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tMOV    r12, #0xff                 \n" /* load the 0xff mask in r12 */
811c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tORR    r12, r12, r12, LSL #16     \n" /* convert it to 0xff00ff in r12 */
812c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
813c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tMOV    r14, #255                  \n" /* r14 = 255 */
814c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* will be used later for left-side comparison */
815c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
816c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tADD    r2, %[src], r2, LSL #2     \n" /* r2 points to last array element which can be used */
817c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tSUB    r2, r2, #16                \n" /* as a base for 4-way processing algorithm */
818c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
819c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer is bigger than */
820c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBGT    8f                         \n" /* calculated marker for 4-way -> */
821c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* use simple one-by-one processing */
822c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
823c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* START OF DISPATCHING BLOCK */
824c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
825c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\t0:                                \n"
826c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
827c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLDM    %[src]!, {r3, r4, r5, r6}  \n" /* 4-way loading of source values to r3-r6 */
828c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
829c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLSR    r7, r3, #24                \n" /* if not all src alphas of 4-way block are equal -> */
830c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tCMP    r7, r4, LSR #24            \n"
831c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tCMPEQ  r7, r5, LSR #24            \n"
832c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tCMPEQ  r7, r6, LSR #24            \n"
833c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBNE    1f                         \n" /* -> go to general 4-way processing routine */
834c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
835c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tCMP    r14, r7                    \n" /* if all src alphas are equal to 255 */
836c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBEQ    3f                         \n" /* go to alpha == 255 optimized routine */
837c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
838c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tCMP    r7,  #0                    \n" /* if all src alphas are equal to 0 */
839c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBEQ    6f                         \n" /* go to alpha == 0 optimized routine */
840c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
841c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* END OF DISPATCHING BLOCK */
842c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
843c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* START OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */
844c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
845c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\t1:                                \n"
846c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* we do not have enough registers to make */
847c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* 4-way [dst] loading -> we are using 2 * 2-way */
848c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
849c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLDM    %[dst], {r7, r8}           \n" /* 1st 2-way loading of dst values to r7-r8 */
850c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
851c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* PROCESSING BLOCK 1 */
852c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* r3 = src, r7 = dst */
853c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
854c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLSR    r11, r3,  #24              \n" /* extracting alpha from source and storing to r11 */
855c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r9,  r12, r7               \n" /* r9 = br masked by r12 (0xff00ff) */
856c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tRSB    r11, r11, #256             \n" /* subtracting the alpha from 255 -> r11 = scale */
857c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r10, r12, r7, LSR #8       \n" /* r10 = ag masked by r12 (0xff00ff) */
858c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tMUL    r9,  r9,  r11              \n" /* br = br * scale */
859c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r9,  r12, r9, LSR #8       \n" /* lsr br by 8 and mask it */
860c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tMUL    r10, r10, r11              \n" /* ag = ag * scale */
861c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r10, r10, r12, LSL #8      \n" /* mask ag with reverse mask */
862c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tORR    r7,  r9,  r10              \n" /* br | ag */
863c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tADD    r7,  r3,  r7               \n" /* dst = src + calc dest(r8) */
864c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
865c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* PROCESSING BLOCK 2 */
866c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* r4 = src, r8 = dst */
867c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
868c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLSR    r11, r4,  #24              \n" /* see PROCESSING BLOCK 1 */
869c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r9,  r12, r8               \n"
870c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tRSB    r11, r11, #256             \n"
871c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r10, r12, r8, LSR #8       \n"
872c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tMUL    r9,  r9,  r11              \n"
873c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r9,  r12, r9, LSR #8       \n"
874c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tMUL    r10, r10, r11              \n"
875c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r10, r10, r12, LSL #8      \n"
876c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tORR    r8,  r9,  r10              \n"
877c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tADD    r8,  r4,  r8               \n"
878c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
879c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tSTM    %[dst]!, {r7, r8}          \n" /* 1st 2-way storing of processed dst values */
880c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
881c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLDM    %[dst], {r9, r10}          \n" /* 2nd 2-way loading of dst values to r9-r10 */
882c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
883c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* PROCESSING BLOCK 3 */
884c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* r5 = src, r9 = dst */
885c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
886c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLSR    r11, r5,  #24              \n" /* see PROCESSING BLOCK 1 */
887c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r7,  r12, r9               \n"
888c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tRSB    r11, r11, #256             \n"
889c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r8,  r12, r9, LSR #8       \n"
890c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tMUL    r7,  r7,  r11              \n"
891c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r7,  r12, r7, LSR #8       \n"
892c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tMUL    r8,  r8,  r11              \n"
893c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r8,  r8,  r12, LSL #8      \n"
894c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tORR    r9,  r7,  r8               \n"
895c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tADD    r9,  r5,  r9               \n"
896c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
897c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* PROCESSING BLOCK 4 */
898c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* r6 = src, r10 = dst */
899c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
900c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLSR    r11, r6,  #24              \n" /* see PROCESSING BLOCK 1 */
901c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r7,  r12, r10              \n"
902c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tRSB    r11, r11, #256             \n"
903c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r8,  r12, r10, LSR #8      \n"
904c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tMUL    r7,  r7,  r11              \n"
905c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r7,  r12, r7, LSR #8       \n"
906c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tMUL    r8,  r8,  r11              \n"
907c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r8,  r8,  r12, LSL #8      \n"
908c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tORR    r10, r7,  r8               \n"
909c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tADD    r10, r6,  r10              \n"
910c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
911c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tSTM    %[dst]!, {r9, r10}         \n" /* 2nd 2-way storing of processed dst values */
912c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
913c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tCMP    %[src], r2                 \n" /* if our current [src] pointer <= calculated marker */
914c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBLE    0b                         \n" /* we could run 4-way processing -> go to dispatcher */
915c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBGT    8f                         \n" /* else -> use simple one-by-one processing */
916c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
917c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* END OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */
918c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
919c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* START OF BLOCK OPTIMIZED FOR ALPHA == 255 */
920c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
921c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\t2:                                \n" /* ENTRY 1: LOADING [src] to registers */
922c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
923c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLDM    %[src]!, {r3, r4, r5, r6}  \n" /* 4-way loading of source values to r3-r6 */
924c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
925c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r7, r3, r4                 \n" /* if not all alphas == 255 -> */
926c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r8, r5, r6                 \n"
927c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r9, r7, r8                 \n"
928c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tCMP    r14, r9, LSR #24           \n"
929c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBNE    4f                         \n" /* -> go to alpha == 0 check */
930c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
931c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\t3:                                \n" /* ENTRY 2: [src] already loaded by DISPATCHER */
932c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
933c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tSTM    %[dst]!, {r3, r4, r5, r6}  \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */
934c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
935c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
936c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBLE    2b                         \n" /* we could run 4-way processing */
937c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* because now we're in ALPHA == 255 state */
938c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* run next cycle with priority alpha == 255 checks */
939c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
940c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
941c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* use simple one-by-one processing */
942c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
943c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\t4:                                \n"
944c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
945c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tORR    r7, r3, r4                 \n" /* if not all alphas == 0 -> */
946c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tORR    r8, r5, r6                 \n"
947c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tORR    r9, r7, r8                 \n"
948c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLSRS   r9, #24                    \n"
949c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBNE    1b                         \n" /* -> go to general processing mode */
950c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* (we already checked for alpha == 255) */
951c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
952c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tADD    %[dst], %[dst], #16        \n" /* all src alphas == 0 -> do not change dst values */
953c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
954c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
955c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBLE    5f                         \n" /* we could run 4-way processing one more time */
956c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* because now we're in ALPHA == 0 state */
957c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* run next cycle with priority alpha == 0 checks */
958c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
959c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
960c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* use simple one-by-one processing */
961c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
962c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* END OF BLOCK OPTIMIZED FOR ALPHA == 255 */
963c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
964c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* START OF BLOCK OPTIMIZED FOR ALPHA == 0 */
965c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
966c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\t5:                                \n" /* ENTRY 1: LOADING [src] to registers */
967c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
968c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLDM    %[src]!, {r3, r4, r5, r6}  \n" /* 4-way loading of source values to r3-r6 */
969c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
970c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tORR    r7, r3, r4                 \n" /* if not all alphas == 0 -> */
971c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tORR    r8, r5, r6                 \n"
972c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tORR    r9, r7, r8                 \n"
973c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLSRS   r9, #24                    \n"
974c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBNE    7f                         \n" /* -> go to alpha == 255 check */
975c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
976c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\t6:                                \n" /* ENTRY 2: [src] already loaded by DISPATCHER */
977c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
978c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tADD    %[dst], %[dst], #16        \n" /* all src alphas == 0 -> do not change dst values */
979c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
980c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
981c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBLE    5b                         \n" /* we could run 4-way processing one more time */
982c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* because now we're in ALPHA == 0 state */
983c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* run next cycle with priority alpha == 0 checks */
984c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
985c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
986c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* use simple one-by-one processing */
987c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\t7:                                \n"
988c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
989c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r7, r3, r4                 \n" /* if not all alphas == 255 -> */
990c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r8, r5, r6                 \n"
991c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r9, r7, r8                 \n"
992c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tCMP    r14, r9, LSR #24           \n"
993c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBNE    1b                         \n" /* -> go to general processing mode */
994c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* (we already checked for alpha == 0) */
995c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
996c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tSTM    %[dst]!, {r3, r4, r5, r6}  \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */
997c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
998c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
999c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBLE    2b                         \n" /* we could run 4-way processing one more time */
1000c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* because now we're in ALPHA == 255 state */
1001c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* run next cycle with priority alpha == 255 checks */
1002c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1003c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
1004c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* use simple one-by-one processing */
1005c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1006c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* END OF BLOCK OPTIMIZED FOR ALPHA == 0 */
1007c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1008c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* START OF TAIL BLOCK */
1009c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* (used when array is too small to be processed with 4-way algorithm)*/
1010c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1011c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\t8:                                \n"
1012c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1013c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tADD    r2, r2, #16                \n" /* now r2 points to the element just after array */
1014c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger                                             /* we've done r2 = r2 - 16 at procedure start */
1015c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1016c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer > final marker */
1017c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBEQ    9f                         \n" /* goto EXIT */
1018c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1019c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* TAIL PROCESSING BLOCK 1 */
1020c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1021c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLDR    r3, [%[src]], #4           \n" /* r3 = *src, src++ */
1022c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLDR    r7, [%[dst]]               \n" /* r7 = *dst */
1023c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1024c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLSR    r11, r3,  #24              \n" /* extracting alpha from source */
1025c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r9,  r12, r7               \n" /* r9 = br masked by r12 (0xff00ff) */
1026c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tRSB    r11, r11, #256             \n" /* subtracting the alpha from 255 -> r11 = scale */
1027c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r10, r12, r7, LSR #8       \n" /* r10 = ag masked by r12 (0xff00ff) */
1028c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tMUL    r9,  r9,  r11              \n" /* br = br * scale */
1029c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r9,  r12, r9, LSR #8       \n" /* lsr br by 8 and mask it */
1030c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tMUL    r10, r10, r11              \n" /* ag = ag * scale */
1031c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r10, r10, r12, LSL #8      \n" /* mask ag with reverse mask */
1032c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tORR    r7,  r9,  r10              \n" /* br | ag */
1033c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tADD    r7,  r3,  r7               \n" /* dst = src + calc dest(r8) */
1034c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1035c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tSTR    r7, [%[dst]], #4           \n" /* *dst = r7; dst++ */
1036c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1037c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer > final marker */
1038c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBEQ    9f                         \n" /* goto EXIT */
1039c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1040c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* TAIL PROCESSING BLOCK 2 */
1041c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1042c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLDR    r3, [%[src]], #4           \n" /* see TAIL PROCESSING BLOCK 1 */
1043c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLDR    r7, [%[dst]]               \n"
1044c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1045c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLSR    r11, r3,  #24              \n"
1046c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r9,  r12, r7               \n"
1047c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tRSB    r11, r11, #256             \n"
1048c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r10, r12, r7, LSR #8       \n"
1049c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tMUL    r9,  r9,  r11              \n"
1050c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r9,  r12, r9, LSR #8       \n"
1051c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tMUL    r10, r10, r11              \n"
1052c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r10, r10, r12, LSL #8      \n"
1053c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tORR    r7,  r9,  r10              \n"
1054c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tADD    r7,  r3,  r7               \n"
1055c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1056c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tSTR    r7, [%[dst]], #4           \n"
1057c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1058c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tCMP    %[src], r2                 \n"
1059c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBEQ    9f                         \n"
1060c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1061c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* TAIL PROCESSING BLOCK 3 */
1062c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1063c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLDR    r3, [%[src]], #4           \n" /* see TAIL PROCESSING BLOCK 1 */
1064c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLDR    r7, [%[dst]]               \n"
1065c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1066c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLSR    r11, r3,  #24              \n"
1067c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r9,  r12, r7               \n"
1068c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tRSB    r11, r11, #256             \n"
1069c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r10, r12, r7, LSR #8       \n"
1070c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tMUL    r9,  r9,  r11              \n"
1071c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r9,  r12, r9, LSR #8       \n"
1072c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tMUL    r10, r10, r11              \n"
1073c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tAND    r10, r10, r12, LSL #8      \n"
1074c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tORR    r7,  r9,  r10              \n"
1075c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tADD    r7,  r3,  r7               \n"
1076c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1077c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tSTR    r7, [%[dst]], #4           \n"
1078c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1079c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    /* END OF TAIL BLOCK */
1080c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1081c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\t9:                                \n" /* EXIT */
1082c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1083c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tLDMIA  r13!, {r4-r12, r14}        \n" /* restoring r4-r12, lr from stack */
1084c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    "\tBX     lr                         \n" /* return */
1085c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1086c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    : [dst] "+r" (dst), [src] "+r" (src)
1087c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    :
1088c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    : "cc", "r2", "r3", "memory"
1089c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1090c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger    );
1091c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
1092c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger}
1093c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger
/*
 * S32A_Opaque_BlitRow32_PROC is set to the alpha-testing routine in this
 * branch. NOTE(review): the opening #if is outside this view; going by the
 * comment on the #else below, this branch is the defined(TEST_SRC_ALPHA)
 * case - confirm.
 */
1094c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#define	S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_arm_test_alpha
1095c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#else /* !defined(TEST_SRC_ALPHA) */
109635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
/*
 * ARM asm blit of a row of 32-bit src pixels onto 32-bit dst pixels using
 * the per-pixel src alpha. For every pixel:
 *
 *     scale = 256 - (src >> 24)
 *     dst   = src + ( (((dst & 0xff00ff) * scale) >> 8) & 0x00ff00ff
 *                   | (((dst >> 8) & 0xff00ff) * scale) & 0xff00ff00 )
 *
 * dst   - destination pixels, read and overwritten in place
 * src   - source pixels; the top byte (bits 24-31) is used as alpha.
 *         The final add is a plain 32-bit add with no per-channel clamp,
 *         so this presumably relies on src being premultiplied (as the
 *         SkPMColor name suggests) - TODO confirm.
 * count - number of pixels. 0 returns without touching memory.
 *         NOTE(review): a negative count is not rejected: it fails the
 *         "== 0" test, takes the "< 2" branch and one pixel is processed.
 * alpha - only checked by SkASSERT to be 255; not used by the assembly.
 *
 * Pixels are processed two per iteration while more than one remains; a
 * final odd pixel is processed by the single-pixel code at label 2.
 * The asm statement declares r4-r10, ip, the condition flags and memory
 * as clobbered; dst, src and count are read-write operands.
 */
109735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenbergerstatic void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
109835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                                  const SkPMColor* SK_RESTRICT src,
109935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                                  int count, U8CPU alpha) {
110035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
110135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger    SkASSERT(255 == alpha);
110235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
110335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger    /* Does not support the TEST_SRC_ALPHA case */
110435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger    asm volatile (
110535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "cmp    %[count], #0               \n\t" /* comparing count with 0 */
110635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "beq    3f                         \n\t" /* if zero exit (only an exact 0 exits here) */
110735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
110835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mov    ip, #0xff                  \n\t" /* load the 0xff mask in ip */
110935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "orr    ip, ip, ip, lsl #16        \n\t" /* convert it to 0xff00ff in ip */
111035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
111135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "cmp    %[count], #2               \n\t" /* compare count with 2 */
111235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "blt    2f                         \n\t" /* if less than 2 -> single-pixel code at label 2 */
111335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
111435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* Double Loop: two pixels per iteration; the work for the two pixels is interleaved */
111535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "1:                                \n\t" /* <double loop> */
111635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "ldm    %[src]!, {r5,r6}           \n\t" /* load two src pixels into r5-r6, src advances by two pixels */
111735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "ldm    %[dst], {r7,r8}            \n\t" /* load two dst pixels into r7-r8 (no writeback; dst advances at the stm) */
111835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "lsr    r4, r5, #24                \n\t" /* extracting the alpha from the first src pixel and storing it to r4 */
111935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
112035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* ----------- first pixel: r5 (src), r7 (dst) */
112135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r9, ip, r7                 \n\t" /* r9 = br masked by ip */
112235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
112335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r10, ip, r7, lsr #8        \n\t" /* r10 = ag masked by ip */
112435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
112535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r9, r9, r4                 \n\t" /* br = br * scale */
112635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
112735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */
112835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
112935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r10, r10, ip, lsl #8       \n\t" /* mask ag with reverse mask (0xff00ff00) */
113035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "lsr    r4, r6, #24                \n\t" /* extracting the alpha from the second src pixel and storing it to r4 */
113135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "orr    r7, r9, r10                \n\t" /* br | ag*/
113235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
113335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "add    r7, r5, r7                 \n\t" /* dst = src + calc dest(r7) */
113435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
113535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
113635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* ----------- second pixel: r6 (src), r8 (dst) */
113735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r9, ip, r8                 \n\t" /* r9 = br masked by ip */
113835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
113935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r10, ip, r8, lsr #8        \n\t" /* r10 = ag masked by ip */
114035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r9, r9, r4                 \n\t" /* br = br * scale */
114135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "sub    %[count], %[count], #2     \n\t" /* two pixels consumed this iteration */
114235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
114335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
114435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */
114535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r10, r10, ip, lsl #8       \n\t" /* mask ag with reverse mask (0xff00ff00) */
114635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "cmp    %[count], #1               \n\t" /* comparing remaining count with 1; these flags feed the bgt/blt after the stm (orr/add/stm below carry no flag-setting suffix) */
114735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "orr    r8, r9, r10                \n\t" /* br | ag */
114835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
114935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "add    r8, r6, r8                 \n\t" /* dst = src + calc dest(r8) */
115035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
115135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* ----------------- */
115235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "stm    %[dst]!, {r7,r8}           \n\t" /* dst[0] = r7, dst[1] = r8, increment dst by two (each times 4) */
115335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* ----------------- */
115435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
115535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "bgt    1b                         \n\t" /* if greater than 1 -> reloop */
115635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "blt    3f                         \n\t" /* if less than 1 -> exit; exactly 1 falls through to label 2 */
115735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
115835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* Single Loop: despite the name this runs once - nothing branches back to label 2 */
115935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "2:                                \n\t" /* <single loop> */
116035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "ldr    r5, [%[src]], #4           \n\t" /* load the src pixel into r5, increment src by one (times 4) */
116135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "ldr    r7, [%[dst]]               \n\t" /* loading dst into r7 */
116235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "lsr    r4, r5, #24                \n\t" /* extracting the alpha from source and storing it to r4 */
116335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
116435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* ----------- */
116535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r9, ip, r7                 \n\t" /* r9 = br masked by ip */
116635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
116735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
116835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r10, ip, r7, lsr #8        \n\t" /* r10 = ag masked by ip */
116935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r9, r9, r4                 \n\t" /* br = br * scale */
117035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
117135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */
117235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
117335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r10, r10, ip, lsl #8       \n\t" /* mask ag with reverse mask (0xff00ff00) */
117435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "orr    r7, r9, r10                \n\t" /* br | ag */
117535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
117635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "add    r7, r5, r7                 \n\t" /* *dst = src + calc dest(r7) */
117735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
117835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* ----------------- */
117935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "str    r7, [%[dst]], #4           \n\t" /* *dst = r7, increment dst by one (times 4) */
118035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* ----------------- */
118135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
118235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "3:                                \n\t" /* <exit> */
118335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
118435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  :
118535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory"
118635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  );
118735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger}
/* Without TEST_SRC_ALPHA, S32A_Opaque_BlitRow32_PROC names S32A_Opaque_BlitRow32_arm. */
118835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger#define	S32A_Opaque_BlitRow32_PROC	S32A_Opaque_BlitRow32_arm
1189c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#endif /* !defined(TEST_SRC_ALPHA) */
1190c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#else /* ... #elif defined (__ARM_ARCH__) */
/*
 * Fallback branch: the macro is NULL here, so no ARM asm routine is named.
 * NOTE(review): the #if this #else pairs with is outside this view - confirm
 * which configuration reaches this branch.
 */
1191c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#define	S32A_Opaque_BlitRow32_PROC	NULL
119254e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif
119354e0f955c21365271661cd92a29d06a847a18554Mike Reed
119435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger/*
119535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger * ARM asm version of S32A_Blend_BlitRow32
119635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger */
119735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenbergerstatic void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
119835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                                 const SkPMColor* SK_RESTRICT src,
119935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                                 int count, U8CPU alpha) {
120035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger    asm volatile (
120135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "cmp    %[count], #0               \n\t" /* comparing count with 0 */
120235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "beq    3f                         \n\t" /* if zero exit */
120335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
120435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mov    r12, #0xff                 \n\t" /* load the 0xff mask in r12 */
120535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "orr    r12, r12, r12, lsl #16     \n\t" /* convert it to 0xff00ff in r12 */
120635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
120735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* src1,2_scale */
120835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "add    %[alpha], %[alpha], #1     \n\t" /* loading %[alpha]=src_scale=alpha+1 */
120935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
121035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "cmp    %[count], #2               \n\t" /* comparing count with 2 */
121135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "blt    2f                         \n\t" /* if less than 2 -> single loop */
121235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
121335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* Double Loop */
121435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "1:                                \n\t" /* <double loop> */
121535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "ldm    %[src]!, {r5, r6}          \n\t" /* loading src pointers into r5 and r6 */
121635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "ldm    %[dst], {r7, r8}           \n\t" /* loading dst pointers into r7 and r8 */
121735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
121835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* dst1_scale and dst2_scale*/
121935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "lsr    r9, r5, #24                \n\t" /* src >> 24 */
122035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "lsr    r10, r6, #24               \n\t" /* src >> 24 */
122135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "smulbb r9, r9, %[alpha]           \n\t" /* r9 = SkMulS16 r9 with src_scale */
122235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "smulbb r10, r10, %[alpha]         \n\t" /* r10 = SkMulS16 r10 with src_scale */
122335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "lsr    r9, r9, #8                 \n\t" /* r9 >> 8 */
122435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "lsr    r10, r10, #8               \n\t" /* r10 >> 8 */
122535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "rsb    r9, r9, #256               \n\t" /* dst1_scale = r9 = 255 - r9 + 1 */
122635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "rsb    r10, r10, #256             \n\t" /* dst2_scale = r10 = 255 - r10 + 1 */
122735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
122835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* ---------------------- */
122935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
123035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* src1, src1_scale */
123135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r11, r12, r5, lsr #8       \n\t" /* ag = r11 = r5 masked by r12 lsr by #8 */
123235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r4, r12, r5                \n\t" /* rb = r4 = r5 masked by r12 */
123335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r11, r11, %[alpha]         \n\t" /* ag = r11 times src_scale */
123435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r4, r4, %[alpha]           \n\t" /* rb = r4 times src_scale */
123535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
123635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
123735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "orr    r5, r11, r4                \n\t" /* r5 = (src1, src_scale) */
123835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
123935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* dst1, dst1_scale */
124035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r11, r12, r7, lsr #8       \n\t" /* ag = r11 = r7 masked by r12 lsr by #8 */
124135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r4, r12, r7                \n\t" /* rb = r4 = r7 masked by r12 */
124235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r11, r11, r9               \n\t" /* ag = r11 times dst_scale (r9) */
124335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r4, r4, r9                 \n\t" /* rb = r4 times dst_scale (r9) */
124435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
124535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
124635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "orr    r9, r11, r4                \n\t" /* r9 = (dst1, dst_scale) */
124735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
124835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* ---------------------- */
124935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "add    r9, r5, r9                 \n\t" /* *dst = src plus dst both scaled */
125035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* ---------------------- */
125135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
125235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* ====================== */
125335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
125435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* src2, src2_scale */
125535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r11, r12, r6, lsr #8       \n\t" /* ag = r11 = r6 masked by r12 lsr by #8 */
125635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r4, r12, r6                \n\t" /* rb = r4 = r6 masked by r12 */
125735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r11, r11, %[alpha]         \n\t" /* ag = r11 times src_scale */
125835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r4, r4, %[alpha]           \n\t" /* rb = r4 times src_scale */
125935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
126035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
126135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "orr    r6, r11, r4                \n\t" /* r6 = (src2, src_scale) */
126235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
126335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* dst2, dst2_scale */
126435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r11, r12, r8, lsr #8       \n\t" /* ag = r11 = r8 masked by r12 lsr by #8 */
126535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r4, r12, r8                \n\t" /* rb = r4 = r8 masked by r12 */
126635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r11, r11, r10              \n\t" /* ag = r11 times dst_scale (r10) */
126735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r4, r4, r10                \n\t" /* rb = r4 times dst_scale (r6) */
126835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
126935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
127035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "orr    r10, r11, r4               \n\t" /* r10 = (dst2, dst_scale) */
127135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
127235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "sub    %[count], %[count], #2     \n\t" /* decrease count by 2 */
127335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* ---------------------- */
127435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "add    r10, r6, r10               \n\t" /* *dst = src plus dst both scaled */
127535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* ---------------------- */
127635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "cmp    %[count], #1               \n\t" /* compare count with 1 */
127735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* ----------------- */
127835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "stm    %[dst]!, {r9, r10}         \n\t" /* copy r9 and r10 to r7 and r8 respectively */
127935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* ----------------- */
128035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
128135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "bgt    1b                         \n\t" /* if %[count] greater than 1 reloop */
128235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "blt    3f                         \n\t" /* if %[count] less than 1 exit */
128335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                                                           /* else get into the single loop */
128435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* Single Loop */
128535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "2:                                \n\t" /* <single loop> */
128635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "ldr    r5, [%[src]], #4           \n\t" /* loading src pointer into r5: r5=src */
128735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "ldr    r7, [%[dst]]               \n\t" /* loading dst pointer into r7: r7=dst */
128835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
128935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "lsr    r6, r5, #24                \n\t" /* src >> 24 */
129035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r8, r12, r5, lsr #8        \n\t" /* ag = r8 = r5 masked by r12 lsr by #8 */
129135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "smulbb r6, r6, %[alpha]           \n\t" /* r6 = SkMulS16 with src_scale */
129235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r9, r12, r5                \n\t" /* rb = r9 = r5 masked by r12 */
129335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "lsr    r6, r6, #8                 \n\t" /* r6 >> 8 */
129435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r8, r8, %[alpha]           \n\t" /* ag = r8 times scale */
129535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "rsb    r6, r6, #256               \n\t" /* r6 = 255 - r6 + 1 */
129635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
129735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* src, src_scale */
129835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r9, r9, %[alpha]           \n\t" /* rb = r9 times scale */
129935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r8, r8, r12, lsl #8        \n\t" /* ag masked by reverse mask (r12) */
130035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r9, r12, r9, lsr #8        \n\t" /* rb masked by mask (r12) */
130135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "orr    r10, r8, r9                \n\t" /* r10 = (scr, src_scale) */
130235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
130335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* dst, dst_scale */
130435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r8, r12, r7, lsr #8        \n\t" /* ag = r8 = r7 masked by r12 lsr by #8 */
130535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r9, r12, r7                \n\t" /* rb = r9 = r7 masked by r12 */
130635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r8, r8, r6                 \n\t" /* ag = r8 times scale (r6) */
130735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "mul    r9, r9, r6                 \n\t" /* rb = r9 times scale (r6) */
130835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r8, r8, r12, lsl #8        \n\t" /* ag masked by reverse mask (r12) */
130935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "and    r9, r12, r9, lsr #8        \n\t" /* rb masked by mask (r12) */
131035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "orr    r7, r8, r9                 \n\t" /* r7 = (dst, dst_scale) */
131135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
131235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "add    r10, r7, r10               \n\t" /* *dst = src plus dst both scaled */
131335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
131435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* ----------------- */
131535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "str    r10, [%[dst]], #4          \n\t" /* *dst = r10, postincrement dst by one (times 4) */
131635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  /* ----------------- */
131735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
131835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  "3:                                \n\t" /* <exit> */
131935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count), [alpha] "+r" (alpha)
132035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  :
132135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "memory"
132235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger                  );
132335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
132435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger}
132535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger#define	S32A_Blend_BlitRow32_PROC	S32A_Blend_BlitRow32_arm
132635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger
/* NEON implementation of S32_Blend_BlitRow32().
 * The portable version is in src/core/SkBlitRow_D32.cpp.
 *
 * Blends `count` 32-bit SkPMColor pixels from `src` into `dst`, weighting
 * the source by a scale derived from `alpha` and the destination by
 * 256 minus that scale.  Only compiled for little-endian NEON targets; on
 * every other target S32_Blend_BlitRow32_PROC is NULL.
 */
#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                     const SkPMColor* SK_RESTRICT src,
                                     int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    /* nothing to blend */
    if (count <= 0) {
        return;
    }

    const uint16_t src_scale = SkAlpha255To256(alpha);
    const uint16_t dst_scale = 256 - src_scale;

    /* One 64-bit load holds two pixels; widening its eight bytes to
     * 16 bits fills a 128-bit register, so two pixels per pass is the
     * most a single wide multiply can take.  Every byte is scaled the
     * same way, so the pixels are simply handled as eight channels.
     */
    const int kPixelsPerPass = 2;

    /* splat the two scales once, outside the loop */
    const uint16x8_t src_scale_x8 = vdupq_n_u16(src_scale);
    const uint16x8_t dst_scale_x8 = vdupq_n_u16(dst_scale);

    for (; count >= kPixelsPerPass; count -= kPixelsPerPass) {
        /* load two pixels from each row and widen every byte to 16 bits */
        const uint16x8_t src_x8 = vmovl_u8(vreinterpret_u8_u32(vld1_u32(src)));
        const uint16x8_t dst_x8 = vmovl_u8(vreinterpret_u8_u32(vld1_u32(dst)));

        /* src * src_scale, then multiply-accumulate dst * dst_scale on top.
         * The two scales sum to 256, so (assuming SkAlpha255To256() yields
         * 0..256 -- TODO confirm) each lane stays within 16 bits.
         */
        const uint16x8_t sum_x8 =
                vmlaq_u16(vmulq_u16(src_x8, src_scale_x8), dst_x8, dst_scale_x8);

        /* drop the low 8 bits while narrowing back to bytes, then store */
        vst1_u32(dst, vreinterpret_u32_u8(vshrn_n_u16(sum_x8, 8)));

        src += kPixelsPerPass;
        dst += kPixelsPerPass;
    }

    /* an odd count leaves exactly one pixel for the scalar helpers */
    if (count == 1) {
        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
    }
}

#define S32_Blend_BlitRow32_PROC    S32_Blend_BlitRow32_neon
#else
#define S32_Blend_BlitRow32_PROC    NULL
#endif
140954e0f955c21365271661cd92a29d06a847a18554Mike Reed
141054e0f955c21365271661cd92a29d06a847a18554Mike Reed///////////////////////////////////////////////////////////////////////////////
141154e0f955c21365271661cd92a29d06a847a18554Mike Reed
141254e0f955c21365271661cd92a29d06a847a18554Mike Reed#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
14132274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
14142274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#undef	DEBUG_OPAQUE_DITHER
14152274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
14162274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#if	defined(DEBUG_OPAQUE_DITHER)
14172274ddecab6242780c010b52ae90b2c06ce38d66Mike Reedstatic void showme8(char *str, void *p, int len)
14182274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed{
14192274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	static char buf[256];
14202274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	char tbuf[32];
14212274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	int i;
14222274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	char *pc = (char*) p;
14232274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	sprintf(buf,"%8s:", str);
14242274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	for(i=0;i<len;i++) {
14252274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    sprintf(tbuf, "   %02x", pc[i]);
14262274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    strcat(buf, tbuf);
14272274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	}
14282274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	SkDebugf("%s\n", buf);
14292274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed}
14302274ddecab6242780c010b52ae90b2c06ce38d66Mike Reedstatic void showme16(char *str, void *p, int len)
14312274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed{
14322274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	static char buf[256];
14332274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	char tbuf[32];
14342274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	int i;
14352274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	uint16_t *pc = (uint16_t*) p;
14362274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	sprintf(buf,"%8s:", str);
14372274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	len = (len / sizeof(uint16_t));	/* passed as bytes */
14382274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	for(i=0;i<len;i++) {
14392274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    sprintf(tbuf, " %04x", pc[i]);
14402274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    strcat(buf, tbuf);
14412274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	}
14422274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	SkDebugf("%s\n", buf);
14432274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed}
14442274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#endif
14452274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
14462274ddecab6242780c010b52ae90b2c06ce38d66Mike Reedstatic void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
144754e0f955c21365271661cd92a29d06a847a18554Mike Reed                                      const SkPMColor* SK_RESTRICT src,
144854e0f955c21365271661cd92a29d06a847a18554Mike Reed                                      int count, U8CPU alpha, int x, int y) {
144954e0f955c21365271661cd92a29d06a847a18554Mike Reed    SkASSERT(255 == alpha);
14502274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
14512274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#define	UNROLL	8
14522274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
14532274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed    if (count >= UNROLL) {
14542274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	uint8x8_t dbase;
14552274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
14562274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#if	defined(DEBUG_OPAQUE_DITHER)
14572274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	uint16_t tmpbuf[UNROLL];
14582274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	int td[UNROLL];
14592274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	int tdv[UNROLL];
14602274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	int ta[UNROLL];
14612274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	int tap[UNROLL];
14622274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	uint16_t in_dst[UNROLL];
14632274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	int offset = 0;
14642274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	int noisy = 0;
14652274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#endif
14662274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
14672274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
14682274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	dbase = vld1_u8(dstart);
14692274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
14702274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed        do {
14712274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    uint8x8_t sr, sg, sb, sa, d;
1472bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    uint16x8_t dst8, scale8, alpha8;
14732274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    uint16x8_t dst_r, dst_g, dst_b;
14742274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
14752274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#if	defined(DEBUG_OPAQUE_DITHER)
14762274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	/* calculate 8 elements worth into a temp buffer */
14772274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	{
14782274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	  int my_y = y;
14792274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	  int my_x = x;
14802274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	  SkPMColor* my_src = (SkPMColor*)src;
14812274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	  uint16_t* my_dst = dst;
14822274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	  int i;
14832274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
14842274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed          DITHER_565_SCAN(my_y);
14852274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed          for(i=0;i<UNROLL;i++) {
14862274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed            SkPMColor c = *my_src++;
14872274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed            SkPMColorAssert(c);
14882274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed            if (c) {
14892274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed                unsigned a = SkGetPackedA32(c);
14902274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
14912274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
14922274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		tdv[i] = DITHER_VALUE(my_x);
14932274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		ta[i] = a;
14942274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		tap[i] = SkAlpha255To256(a);
14952274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		td[i] = d;
14962274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
14972274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed                unsigned sr = SkGetPackedR32(c);
14982274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed                unsigned sg = SkGetPackedG32(c);
14992274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed                unsigned sb = SkGetPackedB32(c);
15002274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed                sr = SkDITHER_R32_FOR_565(sr, d);
15012274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed                sg = SkDITHER_G32_FOR_565(sg, d);
15022274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed                sb = SkDITHER_B32_FOR_565(sb, d);
15032274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
15042274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
15052274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
15062274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
15072274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed                // now src and dst expanded are in g:11 r:10 x:1 b:10
15082274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
15092274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		td[i] = d;
15102274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
15112274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed            } else {
15122274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		tmpbuf[i] = *my_dst;
15132274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		ta[i] = tdv[i] = td[i] = 0xbeef;
15142274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    }
15152274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    in_dst[i] = *my_dst;
15162274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed            my_dst += 1;
15172274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed            DITHER_INC_X(my_x);
15182274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed          }
15192274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	}
15202274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#endif
15212274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
15222274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    /* source is in ABGR */
15232274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    {
15242274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		register uint8x8_t d0 asm("d0");
15252274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		register uint8x8_t d1 asm("d1");
15262274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		register uint8x8_t d2 asm("d2");
15272274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		register uint8x8_t d3 asm("d3");
15282274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
15292274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		asm ("vld4.8	{d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
15302274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
15312274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		    : "r" (src)
15322274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed                    );
15332274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		    sr = d0; sg = d1; sb = d2; sa = d3;
15342274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    }
15352274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
15362274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    /* calculate 'd', which will be 0..7 */
15372274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
15381cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger#if defined(SK_BUILD_FOR_ANDROID)
1539bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1540bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
1541bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#else
1542bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
1543bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#endif
1544bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
1545bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    d = vshrn_n_u16(alpha8, 8);	/* narrowing too */
15462274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
15472274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    /* sr = sr - (sr>>5) + d */
15482274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    /* watching for 8-bit overflow.  d is 0..7; risky range of
15492274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	     * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
15502274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	     * safe  as long as we do ((sr-sr>>5) + d) */
15512274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    sr = vsub_u8(sr, vshr_n_u8(sr, 5));
15522274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    sr = vadd_u8(sr, d);
15532274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
15542274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    /* sb = sb - (sb>>5) + d */
15552274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    sb = vsub_u8(sb, vshr_n_u8(sb, 5));
15562274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    sb = vadd_u8(sb, d);
15572274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
15582274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
15592274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    sg = vsub_u8(sg, vshr_n_u8(sg, 6));
15602274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    sg = vadd_u8(sg, vshr_n_u8(d,1));
15612274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
15622274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
15632274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    dst8 = vld1q_u16(dst);
15642274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
15652274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
15662274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    dst_r = vshrq_n_u16(dst8,11);	/* clearing hi bits */
15672274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
15682274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    /* blend */
1569bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#if 1
1570bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
1571bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    /* originally 255-sa + 1 */
1572bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1573bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#else
15742274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    scale8 = vsubw_u8(vdupq_n_u16(255), sa);
15752274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
1576bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#endif
1577bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed
1578bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#if 1
1579bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    /* combine the addq and mul, save 3 insns */
1580bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    scale8 = vshrq_n_u16(scale8, 3);
1581bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1582bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1583bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1584bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#else
1585bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	    /* known correct, but +3 insns over above */
15862274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    scale8 = vshrq_n_u16(scale8, 3);
15872274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    dst_b = vmulq_u16(dst_b, scale8);
15882274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    dst_g = vmulq_u16(dst_g, scale8);
15892274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    dst_r = vmulq_u16(dst_r, scale8);
15902274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
15912274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    /* combine */
15922274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    /* NB: vshll widens, need to preserve those bits */
15932274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
15942274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
15952274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
1596bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#endif
15972274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
15982274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    /* repack to store */
15992274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
16002274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
16012274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
16022274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
16032274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    vst1q_u16(dst, dst8);
16042274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
16052274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#if	defined(DEBUG_OPAQUE_DITHER)
16062274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    /* verify my 8 elements match the temp buffer */
16072274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	{
16082274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	   int i, bad=0;
16092274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	   static int invocation;
16102274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
16112274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	   for (i=0;i<UNROLL;i++)
16122274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		if (tmpbuf[i] != dst[i]) bad=1;
1613bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed	   if (bad) {
16142274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
16152274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed			invocation, offset);
1616bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed		SkDebugf("  alpha 0x%x\n", alpha);
16172274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		for (i=0;i<UNROLL;i++)
16182274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		    SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
16192274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed			i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
16202274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed			dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
16212274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
1622bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed		showme16("alpha8", &alpha8, sizeof(alpha8));
1623bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed		showme16("scale8", &scale8, sizeof(scale8));
1624bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed		showme8("d", &d, sizeof(d));
1625bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed		showme16("dst8", &dst8, sizeof(dst8));
1626bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed		showme16("dst_b", &dst_b, sizeof(dst_b));
1627bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed		showme16("dst_g", &dst_g, sizeof(dst_g));
1628bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed		showme16("dst_r", &dst_r, sizeof(dst_r));
1629bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed		showme8("sb", &sb, sizeof(sb));
1630bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed		showme8("sg", &sg, sizeof(sg));
1631bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed		showme8("sr", &sr, sizeof(sr));
1632bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed
16332274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		/* cop out */
16342274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed		return;
16352274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	   }
16362274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	   offset += UNROLL;
16372274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	   invocation++;
16382274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	}
16392274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#endif
16402274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
16412274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed            dst += UNROLL;
16422274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    src += UNROLL;
16432274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    count -= UNROLL;
16442274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed	    /* skip x += UNROLL, since it's unchanged mod-4 */
16452274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed        } while (count >= UNROLL);
16462274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed    }
16472274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#undef	UNROLL
16482274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed
16492274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed    /* residuals */
165054e0f955c21365271661cd92a29d06a847a18554Mike Reed    if (count > 0) {
165154e0f955c21365271661cd92a29d06a847a18554Mike Reed        DITHER_565_SCAN(y);
165254e0f955c21365271661cd92a29d06a847a18554Mike Reed        do {
165354e0f955c21365271661cd92a29d06a847a18554Mike Reed            SkPMColor c = *src++;
165454e0f955c21365271661cd92a29d06a847a18554Mike Reed            SkPMColorAssert(c);
165554e0f955c21365271661cd92a29d06a847a18554Mike Reed            if (c) {
165654e0f955c21365271661cd92a29d06a847a18554Mike Reed                unsigned a = SkGetPackedA32(c);
165754e0f955c21365271661cd92a29d06a847a18554Mike Reed
165887a2a317c4e99c547ecbfa81f40f7bd7f2932433Mike Reed                // dither and alpha are just temporary variables to work-around
165987a2a317c4e99c547ecbfa81f40f7bd7f2932433Mike Reed                // an ICE in debug.
166087a2a317c4e99c547ecbfa81f40f7bd7f2932433Mike Reed                unsigned dither = DITHER_VALUE(x);
166187a2a317c4e99c547ecbfa81f40f7bd7f2932433Mike Reed                unsigned alpha = SkAlpha255To256(a);
166287a2a317c4e99c547ecbfa81f40f7bd7f2932433Mike Reed                int d = SkAlphaMul(dither, alpha);
166354e0f955c21365271661cd92a29d06a847a18554Mike Reed
166454e0f955c21365271661cd92a29d06a847a18554Mike Reed                unsigned sr = SkGetPackedR32(c);
166554e0f955c21365271661cd92a29d06a847a18554Mike Reed                unsigned sg = SkGetPackedG32(c);
166654e0f955c21365271661cd92a29d06a847a18554Mike Reed                unsigned sb = SkGetPackedB32(c);
166754e0f955c21365271661cd92a29d06a847a18554Mike Reed                sr = SkDITHER_R32_FOR_565(sr, d);
166854e0f955c21365271661cd92a29d06a847a18554Mike Reed                sg = SkDITHER_G32_FOR_565(sg, d);
166954e0f955c21365271661cd92a29d06a847a18554Mike Reed                sb = SkDITHER_B32_FOR_565(sb, d);
167054e0f955c21365271661cd92a29d06a847a18554Mike Reed
167154e0f955c21365271661cd92a29d06a847a18554Mike Reed                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
167254e0f955c21365271661cd92a29d06a847a18554Mike Reed                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
167354e0f955c21365271661cd92a29d06a847a18554Mike Reed                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
167454e0f955c21365271661cd92a29d06a847a18554Mike Reed                // now src and dst expanded are in g:11 r:10 x:1 b:10
167554e0f955c21365271661cd92a29d06a847a18554Mike Reed                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
167654e0f955c21365271661cd92a29d06a847a18554Mike Reed            }
167754e0f955c21365271661cd92a29d06a847a18554Mike Reed            dst += 1;
167854e0f955c21365271661cd92a29d06a847a18554Mike Reed            DITHER_INC_X(x);
167954e0f955c21365271661cd92a29d06a847a18554Mike Reed        } while (--count != 0);
168054e0f955c21365271661cd92a29d06a847a18554Mike Reed    }
168154e0f955c21365271661cd92a29d06a847a18554Mike Reed}
168254e0f955c21365271661cd92a29d06a847a18554Mike Reed
168354e0f955c21365271661cd92a29d06a847a18554Mike Reed#define	S32A_D565_Opaque_Dither_PROC S32A_D565_Opaque_Dither_neon
168454e0f955c21365271661cd92a29d06a847a18554Mike Reed#else
168554e0f955c21365271661cd92a29d06a847a18554Mike Reed#define	S32A_D565_Opaque_Dither_PROC NULL
168654e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif
168754e0f955c21365271661cd92a29d06a847a18554Mike Reed
168854e0f955c21365271661cd92a29d06a847a18554Mike Reed///////////////////////////////////////////////////////////////////////////////
168954e0f955c21365271661cd92a29d06a847a18554Mike Reed
16901ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#if	defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
16911ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed/* 2009/10/27: RBE says "a work in progress"; debugging says ok.
16921ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed * Speedup is untested, but the ARM version is 26 insns/iteration and
16931ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed * this NEON version is 21 insns/iteration-of-8 (2.62 insns/element),
16941ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed * i.e. roughly 10x fewer instructions per element; that's pure instruction
16951ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed * counts, not accounting for any instruction or memory latencies.
16961ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed */
16971ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
16981ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#undef	DEBUG_S32_OPAQUE_DITHER
16991ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
/* Converts a row of 32-bit SkPMColor pixels to dithered RGB565.
 *
 *   dst   - output row of 16-bit 565 pixels; `count` entries are written
 *   src   - input row of SkPMColor pixels; `count` entries are read
 *   count - number of pixels to convert
 *   alpha - asserted to be 255; not otherwise used
 *   x, y  - position used to pick the starting entry in the dither pattern
 *
 * Full groups of 8 pixels are converted in the NEON loop; whatever is left
 * (fewer than 8 pixels) is handled by the scalar loop at the bottom.
 * The source alpha channel is never consulted by the NEON loop, and the
 * scalar loop asserts that it is 255.
 */
17001ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reedstatic void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
17011ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed                                     const SkPMColor* SK_RESTRICT src,
17021ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed                                     int count, U8CPU alpha, int x, int y) {
17031ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed    SkASSERT(255 == alpha);
17041ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
17051ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#define	UNROLL	8
17061ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed    if (count >= UNROLL) {
	/* Load the 8 dither bytes once, before the loop.  The table row is
	 * chosen by (y&3) using a stride of 12 bytes and the start column by
	 * (x&3).  x only ever advances by UNROLL (8) inside the loop, so
	 * (x&3) does not change and the same d applies to every iteration.
	 * NOTE(review): presumably each 12-byte row repeats a 4-entry pattern
	 * so that any start column 0..3 yields 8 meaningful bytes -- confirm
	 * against the definition of gDitherMatrix_Neon.
	 */
17071ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	uint8x8_t d;
17081ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
17091ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	d = vld1_u8(dstart);
17101ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
17111ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	while (count >= UNROLL) {
	    /* sa is loaded below but never read afterwards, and da is
	     * declared but never used. */
17121ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    uint8x8_t sr, sg, sb, sa;
17131ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    uint16x8_t dr, dg, db, da;
17141ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    uint16x8_t dst8;
17151ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
17161ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    /* source is in ABGR ordering (R == lsb) */
	    /* vld4.8 reads 32 bytes at src and de-interleaves them, so each
	     * of d0..d3 receives one byte channel of 8 consecutive pixels.
	     * The local register variables are pinned to d0-d3 because the
	     * instruction text hard-codes that register list.  %4 is the src
	     * pointer (the single input, numbered after the four outputs).
	     * The instruction does not advance src; that happens at the end
	     * of the loop body.
	     */
17171ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    {
17181ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		register uint8x8_t d0 asm("d0");
17191ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		register uint8x8_t d1 asm("d1");
17201ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		register uint8x8_t d2 asm("d2");
17211ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		register uint8x8_t d3 asm("d3");
17221ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
17231ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		asm ("vld4.8	{d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
17241ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
17251ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		    : "r" (src)
17261ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed                    );
17271ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		    sr = d0; sg = d1; sb = d2; sa = d3;
17281ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    }
17291ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    /* XXX: if we want to prefetch, hide it in the above asm()
17301ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	     * using the gcc __builtin_prefetch(), the prefetch will
17311ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	     * fall to the bottom of the loop -- it won't stick up
17321ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	     * at the top of the loop, just after the vld4.
17331ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	     */
17341ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
	    /* For each channel the subtraction is done in 8 bits, then
	     * vaddl_u8 adds the dither value while widening to 16 bits, so
	     * the sum itself cannot wrap.
	     */
17351ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    /* sr = sr - (sr>>5) + d */
17361ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    sr = vsub_u8(sr, vshr_n_u8(sr, 5));
17371ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    dr = vaddl_u8(sr, d);
17381ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
17391ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    /* sb = sb - (sb>>5) + d */
17401ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    sb = vsub_u8(sb, vshr_n_u8(sb, 5));
17411ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    db = vaddl_u8(sb, d);
17421ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
17431ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
17441ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    sg = vsub_u8(sg, vshr_n_u8(sg, 6));
17451ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    dg = vaddl_u8(sg, vshr_n_u8(d,1));
17461ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    /* XXX: check that the "d>>1" here is hoisted */
17471ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
17481ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    /* pack high bits of each into 565 format  (rgb, b is lsb) */
	    /* blue>>3 forms the low field; vsliq_n_u16 then inserts green>>2
	     * starting at bit 5 and red>>3 starting at bit 11, leaving the
	     * bits below each insertion point untouched. */
17491ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    dst8 = vshrq_n_u16(db, 3);
17501ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
17511ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);
17521ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
17531ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    /* store it */
17541ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    vst1q_u16(dst, dst8);
17551ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
17561ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#if	defined(DEBUG_S32_OPAQUE_DITHER)
17571ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    /* always good to know if we generated good results */
	    /* Recomputes the same 8 pixels with the scalar macros and logs
	     * every mismatch against what was just stored in dst.  Compiled
	     * out by default because of the #undef above the function. */
17581ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    {
17591ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		int i, myx = x, myy = y;
17601ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		DITHER_565_SCAN(myy);
17611ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		for (i=0;i<UNROLL;i++) {
17621ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		    SkPMColor c = src[i];
17631ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		    unsigned dither = DITHER_VALUE(myx);
17641ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		    uint16_t val = SkDitherRGB32To565(c, dither);
17651ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		    if (val != dst[i]) {
17661ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed			SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
17671ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed			    c, dither, val, dst[i], dstart[i]);
17681ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		    }
17691ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		    DITHER_INC_X(myx);
17701ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed		}
17711ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    }
17721ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#endif
17731ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
17741ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    dst += UNROLL;
17751ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    src += UNROLL;
17761ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    count -= UNROLL;
17771ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	    x += UNROLL;		/* probably superfluous */
17781ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed	}
17791ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed    }
17801ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#undef	UNROLL
17811ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
17821ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed    /* residuals */
    /* Scalar path for the pixels the NEON loop did not consume; it also
     * handles the whole row when count was below 8 to begin with. */
17831ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed    if (count > 0) {
17841ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed        DITHER_565_SCAN(y);
17851ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed        do {
17861ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed            SkPMColor c = *src++;
17871ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed            SkPMColorAssert(c);
17881ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed            SkASSERT(SkGetPackedA32(c) == 255);
17891ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
17901ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed            unsigned dither = DITHER_VALUE(x);
17911ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed            *dst++ = SkDitherRGB32To565(c, dither);
17921ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed            DITHER_INC_X(x);
17931ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed        } while (--count != 0);
17941ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed    }
17951ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed}
17961ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
/* Name consumed by the platform_565_procs table: the NEON routine when this
 * section is compiled in, NULL otherwise. */
17971ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#define	S32_D565_Opaque_Dither_PROC S32_D565_Opaque_Dither_neon
17981ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#else
17991ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#define	S32_D565_Opaque_Dither_PROC NULL
18001ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#endif
18011ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
18021ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed///////////////////////////////////////////////////////////////////////////////
18031ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed
/* Platform-specific row procs for 565 destinations.  The table is indexed
 * directly by the `flags` argument of SkBlitRow::PlatformProcs565() and has
 * 8 entries: four non-dither variants followed by the same four with dither.
 * The *_PROC names are preprocessor macros; for example
 * S32_D565_Opaque_Dither_PROC is either the NEON routine or NULL depending
 * on the build.
 * NOTE(review): presumably a NULL entry tells the caller to use its portable
 * implementation, and the entry order has to match the flag encoding declared
 * in SkBlitRow.h -- confirm both.
 */
180424fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reedstatic const SkBlitRow::Proc platform_565_procs[] = {
180596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    // no dither
180696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    S32_D565_Opaque_PROC,
180796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    S32_D565_Blend_PROC,
180896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    S32A_D565_Opaque_PROC,
18093d54018fa5ed403ecff0e5ef6177fbf660d6025bMike Reed    S32A_D565_Blend_PROC,
181096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
181196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    // dither
18121ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed    S32_D565_Opaque_Dither_PROC,
181396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    S32_D565_Blend_Dither_PROC,
18143d54018fa5ed403ecff0e5ef6177fbf660d6025bMike Reed    S32A_D565_Opaque_Dither_PROC,
181596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    NULL,   // S32A_D565_Blend_Dither
181696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed};
181796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
/* Platform-specific row procs for 4444 destinations, indexed directly by the
 * `flags` argument of SkBlitRow::PlatformProcs4444().  All 8 entries are
 * NULL; the trailing comment on each line names the variant that slot
 * stands for.
 * NOTE(review): presumably NULL means "no platform override, use the
 * portable proc" -- confirm against the caller.
 */
181824fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reedstatic const SkBlitRow::Proc platform_4444_procs[] = {
181996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    // no dither
182096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    NULL,   // S32_D4444_Opaque,
182196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    NULL,   // S32_D4444_Blend,
182296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    NULL,   // S32A_D4444_Opaque,
182396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    NULL,   // S32A_D4444_Blend,
182496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
182596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    // dither
182696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    NULL,   // S32_D4444_Opaque_Dither,
182796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    NULL,   // S32_D4444_Blend_Dither,
182896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    NULL,   // S32A_D4444_Opaque_Dither,
182996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed    NULL,   // S32A_D4444_Blend_Dither
183096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed};
183196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed
/* Platform-specific row procs for 32-bit destinations, indexed directly by
 * the `flags` argument of SkBlitRow::PlatformProcs32().  There are 4 entries
 * and no dither variants; the S32_Opaque slot is always NULL and the other
 * three come from *_BlitRow32_PROC macros.
 * NOTE(review): presumably a NULL slot means the caller keeps its portable
 * proc -- confirm against the caller.
 */
183224fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reedstatic const SkBlitRow::Proc32 platform_32_procs[] = {
1833d0195f840fa964da51f7a1192b432954794e660cMike Reed    NULL,   // S32_Opaque,
183454e0f955c21365271661cd92a29d06a847a18554Mike Reed    S32_Blend_BlitRow32_PROC,		// S32_Blend,
183554e0f955c21365271661cd92a29d06a847a18554Mike Reed    S32A_Opaque_BlitRow32_PROC,		// S32A_Opaque,
183635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger    S32A_Blend_BlitRow32_PROC		// S32A_Blend
1837d0195f840fa964da51f7a1192b432954794e660cMike Reed};
1838d0195f840fa964da51f7a1192b432954794e660cMike Reed
/* Returns the platform_4444_procs entry at index `flags` (may be NULL).
 * The index is not range-checked; the table has 8 entries, so callers must
 * pass a value in 0..7. */
183924fb8c7cc7b76134a25914d8f6346c89c359c621Mike ReedSkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) {
184024fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reed    return platform_4444_procs[flags];
184124fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reed}
184224fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reed
/* Returns the platform_565_procs entry at index `flags` (may be NULL).
 * The index is not range-checked; the table has 8 entries, so callers must
 * pass a value in 0..7. */
184324fb8c7cc7b76134a25914d8f6346c89c359c621Mike ReedSkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
184424fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reed    return platform_565_procs[flags];
184524fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reed}
184624fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reed
/* Returns the platform_32_procs entry at index `flags` (may be NULL).
 * The index is not range-checked; the table has 4 entries, so callers must
 * pass a value in 0..3. */
184724fb8c7cc7b76134a25914d8f6346c89c359c621Mike ReedSkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
184824fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reed    return platform_32_procs[flags];
184924fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reed}
1850bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed
/* This file supplies no platform-specific ColorProc: always returns NULL.
 * NOTE(review): presumably the caller then uses its portable color proc --
 * confirm against the caller. */
185140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek SollenbergerSkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
185240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger    return NULL;
185340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger}
185405b6b4d746867a9fb02e14edfe1bf3685abeb813Derek Sollenberger
18551cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger///////////////////////////////////////////////////////////////////////////////
185605b6b4d746867a9fb02e14edfe1bf3685abeb813Derek Sollenberger
/* This file supplies no platform-specific mask ColorProc: always returns
 * NULL.  dstConfig, maskFormat and color are accepted but ignored. */
18571cab2921ab279367f8206cdadc9259d12e603548Derek SollenbergerSkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
18581cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger                                                     SkMask::Format maskFormat,
18591cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger                                                     SkColor color) {
18601cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger    return NULL;
18611cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger}
18621cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger
/* This file supplies no platform-specific LCD16 row proc: always returns
 * NULL.  isOpaque is accepted but ignored. */
18634f1dae40e24d57d647db01443b8bf2410514b8b5Derek SollenbergerSkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
18644f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger    return NULL;
18654f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger}
18664f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger
/* This file supplies no platform-specific mask RowProc: always returns NULL.
 * dstConfig, maskFormat and flags are accepted but ignored. */
18671cab2921ab279367f8206cdadc9259d12e603548Derek SollenbergerSkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
18681cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger                                                 SkMask::Format maskFormat,
18691cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger                                                 RowFlags flags) {
18701cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger    return NULL;
187105b6b4d746867a9fb02e14edfe1bf3685abeb813Derek Sollenberger}
1872