196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed/* 21cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger * Copyright 2009 The Android Open Source Project 31cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger * 41cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger * Use of this source code is governed by a BSD-style license that can be 51cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger * found in the LICENSE file. 696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed */ 796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 88e048c19870a898cecdde3b3c0d2d512e6f372c0Mike Reed 996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#include "SkBlitRow.h" 101cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger#include "SkBlitMask.h" 117cc0a6ac7b817b217dd614cba96fc533b32d505dMike Reed#include "SkColorPriv.h" 127cc0a6ac7b817b217dd614cba96fc533b32d505dMike Reed#include "SkDither.h" 1396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 1454e0f955c21365271661cd92a29d06a847a18554Mike Reed#if defined(__ARM_HAVE_NEON) 1554e0f955c21365271661cd92a29d06a847a18554Mike Reed#include <arm_neon.h> 1654e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif 1754e0f955c21365271661cd92a29d06a847a18554Mike Reed 1854e0f955c21365271661cd92a29d06a847a18554Mike Reed#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) 1996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reedstatic void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 2096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed const SkPMColor* SK_RESTRICT src, int count, 2196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed U8CPU alpha, int /*x*/, int /*y*/) { 2296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed SkASSERT(255 == alpha); 23bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed 2496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed if (count >= 8) { 2596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed uint16_t* SK_RESTRICT keep_dst; 2696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 2796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed asm volatile ( 2896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "ands ip, %[count], #7 \n\t" 2996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmov.u8 d31, #1<<7 \n\t" 3096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vld1.16 {q12}, [%[dst]] \n\t" 3196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vld4.8 {d0-d3}, [%[src]] \n\t" 3296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "moveq ip, #8 \n\t" 3396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "mov %[keep_dst], %[dst] \n\t" 3496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 3596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "add %[src], %[src], ip, LSL#2 \n\t" 3696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "add %[dst], %[dst], ip, LSL#1 \n\t" 3796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "subs %[count], %[count], ip \n\t" 3896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "b 9f \n\t" 3996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // LOOP 4096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "2: \n\t" 4196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 4296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vld1.16 {q12}, [%[dst]]! \n\t" 4396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vld4.8 {d0-d3}, [%[src]]! \n\t" 4496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vst1.16 {q10}, [%[keep_dst]] \n\t" 4596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "sub %[keep_dst], %[dst], #8*2 \n\t" 4696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "subs %[count], %[count], #8 \n\t" 4796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "9: \n\t" 4896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "pld [%[dst],#32] \n\t" 4996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // expand 0565 q12 to 8888 {d4-d7} 5096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovn.u16 d4, q12 \n\t" 5196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u16 q11, q12, #5 \n\t" 5296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u16 q10, q12, #6+5 \n\t" 5396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovn.u16 d5, q11 \n\t" 5496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovn.u16 d6, q10 \n\t" 5596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshl.u8 d4, d4, #3 \n\t" 5696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshl.u8 d5, d5, #2 \n\t" 5796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshl.u8 d6, d6, #3 \n\t" 5896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 5996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovl.u8 q14, d31 \n\t" 6096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovl.u8 q13, d31 \n\t" 6196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovl.u8 q12, d31 \n\t" 6296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 6396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // duplicate in 4/2/1 & 8pix vsns 6496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmvn.8 d30, d3 \n\t" 6596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmlal.u8 q14, d30, d6 \n\t" 6696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmlal.u8 q13, d30, d5 \n\t" 6796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmlal.u8 q12, d30, d4 \n\t" 6896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u16 q8, q14, #5 \n\t" 6996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u16 q9, q13, #6 \n\t" 7096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vaddhn.u16 d6, q14, q8 \n\t" 7196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u16 q8, q12, #5 \n\t" 7296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vaddhn.u16 d5, q13, q9 \n\t" 7396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vqadd.u8 d6, d6, d0 \n\t" // moved up 7496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vaddhn.u16 d4, q12, q8 \n\t" 7596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // intentionally don't calculate alpha 7696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // result in d4-d6 7796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 7896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vqadd.u8 d5, d5, d1 \n\t" 7996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vqadd.u8 d4, d4, d2 \n\t" 8096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 8196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // pack 8888 {d4-d6} to 0565 q10 8296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshll.u8 q10, d6, #8 \n\t" 8396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshll.u8 q3, d5, #8 \n\t" 8496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshll.u8 q2, d4, #8 \n\t" 8596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vsri.u16 q10, q3, #5 \n\t" 8696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vsri.u16 q10, q2, #11 \n\t" 8796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 8896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "bne 2b \n\t" 8996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 9096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "1: \n\t" 9196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vst1.16 {q10}, [%[keep_dst]] \n\t" 9296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed : [count] "+r" (count) 9396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 9496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 9596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 9696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "d30","d31" 9796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed ); 982274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed } 992274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed else 1002274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed { // handle count < 8 10196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed uint16_t* SK_RESTRICT keep_dst; 10296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 10396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed asm volatile ( 10496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmov.u8 d31, #1<<7 \n\t" 10596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "mov %[keep_dst], %[dst] \n\t" 10696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 10796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "tst %[count], #4 \n\t" 10896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "beq 14f \n\t" 10996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vld1.16 {d25}, [%[dst]]! \n\t" 11096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vld1.32 {q1}, [%[src]]! \n\t" 11196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 11296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "14: \n\t" 11396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "tst %[count], #2 \n\t" 11496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "beq 12f \n\t" 11596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vld1.32 {d24[1]}, [%[dst]]! \n\t" 11696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vld1.32 {d1}, [%[src]]! \n\t" 11796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 11896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "12: \n\t" 11996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "tst %[count], #1 \n\t" 12096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "beq 11f \n\t" 12196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vld1.16 {d24[1]}, [%[dst]]! \n\t" 12296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vld1.32 {d0[1]}, [%[src]]! \n\t" 12396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 12496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "11: \n\t" 12596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // unzips achieve the same as a vld4 operation 12696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vuzpq.u16 q0, q1 \n\t" 12796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vuzp.u8 d0, d1 \n\t" 12896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vuzp.u8 d2, d3 \n\t" 12996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // expand 0565 q12 to 8888 {d4-d7} 13096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovn.u16 d4, q12 \n\t" 13196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u16 q11, q12, #5 \n\t" 13296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u16 q10, q12, #6+5 \n\t" 13396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovn.u16 d5, q11 \n\t" 13496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovn.u16 d6, q10 \n\t" 13596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshl.u8 d4, d4, #3 \n\t" 13696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshl.u8 d5, d5, #2 \n\t" 13796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshl.u8 d6, d6, #3 \n\t" 13896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 13996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovl.u8 q14, d31 \n\t" 14096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovl.u8 q13, d31 \n\t" 14196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovl.u8 q12, d31 \n\t" 14296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 14396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // duplicate in 4/2/1 & 8pix vsns 14496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmvn.8 d30, d3 \n\t" 14596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmlal.u8 q14, d30, d6 \n\t" 14696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmlal.u8 q13, d30, d5 \n\t" 14796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmlal.u8 q12, d30, d4 \n\t" 14896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u16 q8, q14, #5 \n\t" 14996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u16 q9, q13, #6 \n\t" 15096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vaddhn.u16 d6, q14, q8 \n\t" 15196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u16 q8, q12, #5 \n\t" 15296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vaddhn.u16 d5, q13, q9 \n\t" 15396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vqadd.u8 d6, d6, d0 \n\t" // moved up 15496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vaddhn.u16 d4, q12, q8 \n\t" 15596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // intentionally don't calculate alpha 15696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // result in d4-d6 15796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 15896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vqadd.u8 d5, d5, d1 \n\t" 15996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vqadd.u8 d4, d4, d2 \n\t" 16096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 16196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // pack 8888 {d4-d6} to 0565 q10 16296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshll.u8 q10, d6, #8 \n\t" 16396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshll.u8 q3, d5, #8 \n\t" 16496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshll.u8 q2, d4, #8 \n\t" 16596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vsri.u16 q10, q3, #5 \n\t" 16696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vsri.u16 q10, q2, #11 \n\t" 16796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 16896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // store 16996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "tst %[count], #4 \n\t" 17096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "beq 24f \n\t" 17196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vst1.16 {d21}, [%[keep_dst]]! \n\t" 17296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 17396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "24: \n\t" 17496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "tst %[count], #2 \n\t" 17596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "beq 22f \n\t" 17696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t" 17796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 17896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "22: \n\t" 17996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "tst %[count], #1 \n\t" 18096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "beq 21f \n\t" 18196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t" 18296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 18396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "21: \n\t" 18496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed : [count] "+r" (count) 18596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 18696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 18796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 18896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "d30","d31" 18996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed ); 19096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed } 19196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed} 19296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 19396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reedstatic void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 19496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed const SkPMColor* SK_RESTRICT src, int count, 19596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed U8CPU alpha, int /*x*/, int /*y*/) { 196bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed 197bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed U8CPU alpha_for_asm = alpha; 198bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed 19996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed asm volatile ( 20096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed /* This code implements a Neon version of S32A_D565_Blend. The output differs from 20196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed * the original in two respects: 20296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed * 1. The results have a few mismatches compared to the original code. These mismatches 20396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed * never exceed 1. It's possible to improve accuracy vs. a floating point 20496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed * implementation by introducing rounding right shifts (vrshr) for the final stage. 20596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed * Rounding is not present in the code below, because although results would be closer 20696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed * to a floating point implementation, the number of mismatches compared to the 20796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed * original code would be far greater. 20896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed * 2. On certain inputs, the original code can overflow, causing colour channels to 20996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed * mix. Although the Neon code can also overflow, it doesn't allow one colour channel 21096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed * to affect another. 21196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed */ 21296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 213bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#if 1 214bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */ 215bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed "add %[alpha], %[alpha], #1 \n\t" // adjust range of alpha 0-256 216bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#else 21796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" // adjust range of alpha 0-256 218bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#endif 21996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmov.u16 q3, #255 \n\t" // set up constant 22096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "movs r4, %[count], lsr #3 \n\t" // calc. count>>3 22196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmov.u16 d2[0], %[alpha] \n\t" // move alpha to Neon 22296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "beq 2f \n\t" // if count8 == 0, exit 22396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmov.u16 q15, #0x1f \n\t" // set up blue mask 22496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 22596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "1: \n\t" 22696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load eight dst RGB565 pixels 22796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "subs r4, r4, #1 \n\t" // decrement loop counter 22896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load eight src ABGR32 pixels 22996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // and deinterleave 23096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 23196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshl.u16 q9, q0, #5 \n\t" // shift green to top of lanes 23296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vand q10, q0, q15 \n\t" // extract blue 23396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u16 q8, q0, #11 \n\t" // extract red 23496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u16 q9, q9, #10 \n\t" // extract green 23596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // dstrgb = {q8, q9, q10} 23696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 23796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u8 d24, d24, #3 \n\t" // shift red to 565 range 23896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u8 d25, d25, #2 \n\t" // shift green to 565 range 23996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u8 d26, d26, #3 \n\t" // shift blue to 565 range 24096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 24196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovl.u8 q11, d24 \n\t" // widen red to 16 bits 24296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovl.u8 q12, d25 \n\t" // widen green to 16 bits 24396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovl.u8 q14, d27 \n\t" // widen alpha to 16 bits 24496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovl.u8 q13, d26 \n\t" // widen blue to 16 bits 24596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // srcrgba = {q11, q12, q13, q14} 24696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 24796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmul.u16 q2, q14, d2[0] \n\t" // sa * src_scale 24896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmul.u16 q11, q11, d2[0] \n\t" // red result = src_red * src_scale 24996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmul.u16 q12, q12, d2[0] \n\t" // grn result = src_grn * src_scale 25096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmul.u16 q13, q13, d2[0] \n\t" // blu result = src_blu * src_scale 25196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 25296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u16 q2, q2, #8 \n\t" // sa * src_scale >> 8 25396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vsub.u16 q2, q3, q2 \n\t" // 255 - (sa * src_scale >> 8) 25496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // dst_scale = q2 25596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 25696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmla.u16 q11, q8, q2 \n\t" // red result += dst_red * dst_scale 25796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmla.u16 q12, q9, q2 \n\t" // grn result += dst_grn * dst_scale 25896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmla.u16 q13, q10, q2 \n\t" // blu result += dst_blu * dst_scale 259bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed 260bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#if 1 261bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed // trying for a better match with SkDiv255Round(a) 262bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed // C alg is: a+=128; (a+a>>8)>>8 263bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed // we'll use just a rounding shift [q2 is available for scratch] 264bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed "vrshr.u16 q11, q11, #8 \n\t" // shift down red 265bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed "vrshr.u16 q12, q12, #8 \n\t" // shift down green 266bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed "vrshr.u16 q13, q13, #8 \n\t" // shift down blue 267bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#else 268bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed // arm's original "truncating divide by 256" 26996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u16 q11, q11, #8 \n\t" // shift down red 27096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u16 q12, q12, #8 \n\t" // shift down green 27196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u16 q13, q13, #8 \n\t" // shift down blue 272bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#endif 27396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 27496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vsli.u16 q13, q12, #5 \n\t" // insert green into blue 27596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vsli.u16 q13, q11, #11 \n\t" // insert red into green/blue 27696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vst1.16 {d26, d27}, [%[dst]]! \n\t" // write pixel back to dst, update ptr 27796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 27896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "bne 1b \n\t" // if counter != 0, loop 27996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "2: \n\t" // exit 28096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 281bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm) 28296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed : 28396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" 28496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed ); 285bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed 28696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed count &= 7; 28796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed if (count > 0) { 28896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed do { 28996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed SkPMColor sc = *src++; 290215473cea1702d8acc0316da3e5a9bf4ce0130efMike Reed if (sc) { 29196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed uint16_t dc = *dst; 292215473cea1702d8acc0316da3e5a9bf4ce0130efMike Reed unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); 293215473cea1702d8acc0316da3e5a9bf4ce0130efMike Reed unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale); 294215473cea1702d8acc0316da3e5a9bf4ce0130efMike Reed unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale); 295215473cea1702d8acc0316da3e5a9bf4ce0130efMike Reed unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale); 296215473cea1702d8acc0316da3e5a9bf4ce0130efMike Reed *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db)); 29796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed } 29896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed dst += 1; 29996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed } while (--count != 0); 30096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed } 30196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed} 30296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 30396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16. 30496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed * each dither value is spaced out into byte lanes, and repeated 30596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the 30696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed * start of each row. 30796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed */ 30896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reedstatic const uint8_t gDitherMatrix_Neon[48] = { 30996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 31096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 31196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 31296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 31396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 31496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed}; 31596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 31696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reedstatic void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, 31796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed int count, U8CPU alpha, int x, int y) 31896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed{ 31996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed /* select row and offset for dither array */ 32096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 32196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 32296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed /* rescale alpha to range 0 - 256 */ 32396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed int scale = SkAlpha255To256(alpha); 32496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 32596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed asm volatile ( 32696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vld1.8 {d31}, [%[dstart]] \n\t" // load dither values 32796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u8 d30, d31, #1 \n\t" // calc. green dither values 32896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vdup.16 d6, %[scale] \n\t" // duplicate scale into neon reg 32996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmov.i8 d29, #0x3f \n\t" // set up green mask 33096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmov.i8 d28, #0x1f \n\t" // set up blue mask 33196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "1: \n\t" 33296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vld4.8 {d0, d1, d2, d3}, [%[src]]! \n\t" // load 8 pixels and split into argb 33396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u8 d22, d0, #5 \n\t" // calc. red >> 5 33496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u8 d23, d1, #6 \n\t" // calc. green >> 6 33596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u8 d24, d2, #5 \n\t" // calc. blue >> 5 33696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vaddl.u8 q8, d0, d31 \n\t" // add in dither to red and widen 33796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vaddl.u8 q9, d1, d30 \n\t" // add in dither to green and widen 33896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vaddl.u8 q10, d2, d31 \n\t" // add in dither to blue and widen 33996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vsubw.u8 q8, q8, d22 \n\t" // sub shifted red from result 34096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vsubw.u8 q9, q9, d23 \n\t" // sub shifted green from result 34196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vsubw.u8 q10, q10, d24 \n\t" // sub shifted blue from result 34296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshrn.i16 d22, q8, #3 \n\t" // shift right and narrow to 5 bits 34396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshrn.i16 d23, q9, #2 \n\t" // shift right and narrow to 6 bits 34496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshrn.i16 d24, q10, #3 \n\t" // shift right and narrow to 5 bits 34596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // load 8 pixels from dst, extract rgb 34696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vld1.16 {d0, d1}, [%[dst]] \n\t" // load 8 pixels 34796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshrn.i16 d17, q0, #5 \n\t" // shift green down to bottom 6 bits 34896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovn.i16 d18, q0 \n\t" // narrow to get blue as bytes 34996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshr.u16 q0, q0, #11 \n\t" // shift down to extract red 35096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vand d17, d17, d29 \n\t" // and green with green mask 35196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vand d18, d18, d28 \n\t" // and blue with blue mask 35296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmovn.i16 d16, q0 \n\t" // narrow to get red as bytes 35396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // src = {d22 (r), d23 (g), d24 (b)} 35496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // dst = {d16 (r), d17 (g), d18 (b)} 35596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // subtract dst from src and widen 35696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vsubl.s8 q0, d22, d16 \n\t" // subtract red src from dst 35796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vsubl.s8 q1, d23, d17 \n\t" // subtract green src from dst 35896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vsubl.s8 q2, d24, d18 \n\t" // subtract blue src from dst 35996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // multiply diffs by scale and shift 36096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmul.i16 q0, q0, d6[0] \n\t" // multiply red by scale 36196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmul.i16 q1, q1, d6[0] \n\t" // multiply blue by scale 36296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vmul.i16 q2, q2, d6[0] \n\t" // multiply green by scale 36396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "subs %[count], %[count], #8 \n\t" // decrement loop counter 36496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshrn.i16 d0, q0, #8 \n\t" // shift down red by 8 and narrow 36596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshrn.i16 d2, q1, #8 \n\t" // shift down green by 8 and narrow 36696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vshrn.i16 d4, q2, #8 \n\t" // shift down blue by 8 and narrow 36796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // add dst to result 36896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vaddl.s8 q0, d0, d16 \n\t" // add dst to red 36996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vaddl.s8 q1, d2, d17 \n\t" // add dst to green 37096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vaddl.s8 q2, d4, d18 \n\t" // add dst to blue 37196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // put result into 565 format 37296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vsli.i16 q2, q1, #5 \n\t" // shift up green and insert into blue 37396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vsli.i16 q2, q0, #11 \n\t" // shift up red and insert into blue 37496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "vst1.16 {d4, d5}, [%[dst]]! \n\t" // store result 37596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed "bgt 1b \n\t" // loop if count > 0 37696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) 37796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed : [dstart] "r" (dstart), [scale] "r" (scale) 37896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31" 37996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed ); 38096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 38196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed DITHER_565_SCAN(y); 38296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 38396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed while((count & 7) > 0) 38496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed { 38596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed SkPMColor c = *src++; 38696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 38796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed int dither = DITHER_VALUE(x); 38896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed int sr = SkGetPackedR32(c); 38996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed int sg = SkGetPackedG32(c); 39096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed int sb = SkGetPackedB32(c); 39196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed sr = SkDITHER_R32To565(sr, dither); 39296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed sg = SkDITHER_G32To565(sg, dither); 39396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed sb = SkDITHER_B32To565(sb, dither); 39496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 39596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed uint16_t d = *dst; 39696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), 39796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed SkAlphaBlend(sg, SkGetPackedG16(d), scale), 39896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed SkAlphaBlend(sb, SkGetPackedB16(d), scale)); 39996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed DITHER_INC_X(x); 40096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed count--; 40196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed } 40296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed} 40396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 40496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#define S32A_D565_Opaque_PROC S32A_D565_Opaque_neon 40596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#define S32A_D565_Blend_PROC S32A_D565_Blend_neon 40696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#define S32_D565_Blend_Dither_PROC S32_D565_Blend_Dither_neon 407c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#elif __ARM_ARCH__ >= 7 && !defined(SK_CPU_BENDIAN) 408c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenbergerstatic void S32A_D565_Opaque_v7(uint16_t* SK_RESTRICT dst, 409c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger const SkPMColor* SK_RESTRICT src, int count, 410c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger U8CPU alpha, int /*x*/, int /*y*/) { 411c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger SkASSERT(255 == alpha); 412c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 413c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger asm volatile ( 414c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "1: \n\t" 415c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "ldr r3, [%[src]], #4 \n\t" 416c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "cmp r3, #0xff000000 \n\t" 417c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "blo 2f \n\t" 418c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "and r4, r3, #0x0000f8 \n\t" 419c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "and r5, r3, #0x00fc00 \n\t" 420c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "and r6, r3, #0xf80000 \n\t" 421c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "pld [r1, #32] \n\t" 422c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "lsl r3, r4, #8 \n\t" 423c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "orr r3, r3, r5, lsr #5 \n\t" 424c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "orr r3, r3, r6, lsr #19 \n\t" 425c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "subs %[count], %[count], #1 \n\t" 426c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "strh r3, [%[dst]], #2 \n\t" 427c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "bne 1b \n\t" 428c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "b 4f \n\t" 429c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "2: \n\t" 430c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "lsrs r7, r3, #24 \n\t" 431c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "beq 3f \n\t" 432c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "ldrh r4, [%[dst]] \n\t" 433c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "rsb r7, r7, #255 \n\t" 434c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "and r6, r4, #0x001f \n\t" 435c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "ubfx r5, r4, #5, #6 \n\t" 436c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "pld [r0, #16] \n\t" 437c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "lsr r4, r4, #11 \n\t" 438c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "smulbb r6, r6, r7 \n\t" 439c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "smulbb r5, r5, r7 \n\t" 440c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "smulbb r4, r4, r7 \n\t" 441c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "ubfx r7, r3, #16, #8 \n\t" 442c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "ubfx ip, r3, #8, #8 \n\t" 443c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "and r3, r3, #0xff \n\t" 444c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "add r6, r6, #16 \n\t" 445c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "add r5, r5, #32 \n\t" 446c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "add r4, r4, #16 \n\t" 447c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "add r6, r6, r6, lsr #5 \n\t" 448c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "add r5, r5, r5, lsr #6 \n\t" 449c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "add r4, r4, r4, lsr #5 \n\t" 450c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "add r6, r7, r6, lsr #5 \n\t" 451c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "add r5, ip, r5, lsr #6 \n\t" 452c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "add r4, r3, r4, lsr #5 \n\t" 453c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "lsr r6, r6, #3 \n\t" 454c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "and r5, r5, #0xfc \n\t" 455c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "and r4, r4, #0xf8 \n\t" 456c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "orr r6, r6, r5, lsl #3 \n\t" 457c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "orr r4, r6, r4, lsl #8 \n\t" 458c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "strh r4, [%[dst]], #2 \n\t" 459c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "pld [r1, #32] \n\t" 460c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "subs %[count], %[count], #1 \n\t" 461c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "bne 1b \n\t" 462c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "b 4f \n\t" 463c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "3: \n\t" 464c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "subs %[count], %[count], #1 \n\t" 465c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "add %[dst], %[dst], #2 \n\t" 466c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "bne 1b \n\t" 467c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "4: \n\t" 468c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count) 469c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger : 470c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "ip" 471c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger ); 472c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger} 473c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#define S32A_D565_Opaque_PROC S32A_D565_Opaque_v7 474c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#define S32A_D565_Blend_PROC NULL 475c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#define S32_D565_Blend_Dither_PROC NULL 47696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#else 47796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#define S32A_D565_Opaque_PROC NULL 47896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#define S32A_D565_Blend_PROC NULL 47996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#define S32_D565_Blend_Dither_PROC NULL 48096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#endif 48196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 48296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed/* Don't have a special version that assumes each src is opaque, but our S32A 48396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed is still faster than the default, so use it here 48496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed */ 48596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#define S32_D565_Opaque_PROC S32A_D565_Opaque_PROC 48696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed#define S32_D565_Blend_PROC S32A_D565_Blend_PROC 48796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 48896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed/////////////////////////////////////////////////////////////////////////////// 48996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 490c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) && defined(TEST_SRC_ALPHA) 491c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 492c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenbergerstatic void S32A_Opaque_BlitRow32_neon_test_alpha(SkPMColor* SK_RESTRICT dst, 493c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger const SkPMColor* SK_RESTRICT src, 494c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger int count, U8CPU alpha) { 495c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger SkASSERT(255 == alpha); 496c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if (count <= 0) 497c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger return; 498c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 499c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* Use these to check if src is transparent or opaque */ 500c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger const unsigned int ALPHA_OPAQ = 0xFF000000; 501c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger const unsigned int ALPHA_TRANS = 0x00FFFFFF; 502c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 503c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#define UNROLL 4 504c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1); 505c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger const SkPMColor* SK_RESTRICT src_temp = src; 506c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 507c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* set up the NEON variables */ 508c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger uint8x8_t alpha_mask; 509c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 510c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger alpha_mask = vld1_u8(alpha_mask_setup); 511c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 512c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger uint8x8_t src_raw, dst_raw, dst_final; 513c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger uint8x8_t src_raw_2, dst_raw_2, dst_final_2; 514c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger uint8x8_t dst_cooked; 515c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger uint16x8_t dst_wide; 516c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger uint8x8_t alpha_narrow; 517c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger uint16x8_t alpha_wide; 518c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 519c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* choose the first processing type */ 520c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if( src >= src_end) 521c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger goto TAIL; 522c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if(*src <= ALPHA_TRANS) 523c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger goto ALPHA_0; 524c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if(*src >= ALPHA_OPAQ) 525c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger goto ALPHA_255; 526c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* fall-thru */ 527c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 528c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek SollenbergerALPHA_1_TO_254: 529c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger do { 530c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 531c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* get the source */ 532c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger src_raw = vreinterpret_u8_u32(vld1_u32(src)); 533c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); 534c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 535c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* get and hold the dst too */ 536c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 537c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); 538c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 539c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 540c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* get the alphas spread out properly */ 541c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger alpha_narrow = vtbl1_u8(src_raw, alpha_mask); 542c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ 543c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* we collapsed (255-a)+1 ... */ 544c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 545c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 546c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* spread the dest */ 547c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst_wide = vmovl_u8(dst_raw); 548c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 549c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* alpha mul the dest */ 550c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst_wide = vmulq_u16 (dst_wide, alpha_wide); 551c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst_cooked = vshrn_n_u16(dst_wide, 8); 552c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 553c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* sum -- ignoring any byte lane overflows */ 554c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst_final = vadd_u8(src_raw, dst_cooked); 555c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 556c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); 557c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ 558c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* we collapsed (255-a)+1 ... */ 559c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 560c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 561c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* spread the dest */ 562c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst_wide = vmovl_u8(dst_raw_2); 563c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 564c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* alpha mul the dest */ 565c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst_wide = vmulq_u16 (dst_wide, alpha_wide); 566c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst_cooked = vshrn_n_u16(dst_wide, 8); 567c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 568c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* sum -- ignoring any byte lane overflows */ 569c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst_final_2 = vadd_u8(src_raw_2, dst_cooked); 570c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 571c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 572c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); 573c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 574c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger src += UNROLL; 575c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst += UNROLL; 576c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 577c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* if 2 of the next pixels aren't between 1 and 254 578c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger it might make sense to go to the optimized loops */ 579c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)) 580c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger break; 581c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 582c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger } while(src < src_end); 583c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 584c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if (src >= src_end) 585c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger goto TAIL; 586c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 587c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ) 588c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger goto ALPHA_255; 589c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 590c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /*fall-thru*/ 591c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 592c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek SollenbergerALPHA_0: 593c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 594c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /*In this state, we know the current alpha is 0 and 595c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger we optimize for the next alpha also being zero. */ 596c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger src_temp = src; //so we don't have to increment dst every time 597c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger do { 598c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if(*(++src) > ALPHA_TRANS) 599c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger break; 600c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if(*(++src) > ALPHA_TRANS) 601c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger break; 602c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if(*(++src) > ALPHA_TRANS) 603c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger break; 604c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if(*(++src) > ALPHA_TRANS) 605c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger break; 606c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger } while(src < src_end); 607c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 608c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst += (src - src_temp); 609c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 610c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* no longer alpha 0, so determine where to go next. */ 611c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if( src >= src_end) 612c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger goto TAIL; 613c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if(*src >= ALPHA_OPAQ) 614c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger goto ALPHA_255; 615c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger else 616c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger goto ALPHA_1_TO_254; 617c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 618c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek SollenbergerALPHA_255: 619c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) { 620c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst[0]=src[0]; 621c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst[1]=src[1]; 622c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst[2]=src[2]; 623c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst[3]=src[3]; 624c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger src+=UNROLL; 625c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst+=UNROLL; 626c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if(src >= src_end) 627c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger goto TAIL; 628c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger } 629c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 630c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger //Handle remainder. 631c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if(*src >= ALPHA_OPAQ) { *dst++ = *src++; 632c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if(*src >= ALPHA_OPAQ) { *dst++ = *src++; 633c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if(*src >= ALPHA_OPAQ) { *dst++ = *src++; } 634c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger } 635c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger } 636c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 637c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if( src >= src_end) 638c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger goto TAIL; 639c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if(*src <= ALPHA_TRANS) 640c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger goto ALPHA_0; 641c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger else 642c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger goto ALPHA_1_TO_254; 643c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 644c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek SollenbergerTAIL: 645c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* do any residual iterations */ 646c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger src_end += UNROLL + 1; //goto the real end 647c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger while(src != src_end) { 648c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if( *src != 0 ) { 649c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger if( *src >= ALPHA_OPAQ ) { 650c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger *dst = *src; 651c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger } 652c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger else { 653c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger *dst = SkPMSrcOver(*src, *dst); 654c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger } 655c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger } 656c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger src++; 657c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger dst++; 658c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger } 659c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger return; 660c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger} 661c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 662c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_neon_test_alpha 663c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 664c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#elif defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) 66554e0f955c21365271661cd92a29d06a847a18554Mike Reed 66654e0f955c21365271661cd92a29d06a847a18554Mike Reedstatic void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 66754e0f955c21365271661cd92a29d06a847a18554Mike Reed const SkPMColor* SK_RESTRICT src, 66854e0f955c21365271661cd92a29d06a847a18554Mike Reed int count, U8CPU alpha) { 66954e0f955c21365271661cd92a29d06a847a18554Mike Reed 67054e0f955c21365271661cd92a29d06a847a18554Mike Reed SkASSERT(255 == alpha); 67154e0f955c21365271661cd92a29d06a847a18554Mike Reed if (count > 0) { 67254e0f955c21365271661cd92a29d06a847a18554Mike Reed 6732274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 6742274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed uint8x8_t alpha_mask; 6752274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 6762274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 6772274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed alpha_mask = vld1_u8(alpha_mask_setup); 6782274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 67954e0f955c21365271661cd92a29d06a847a18554Mike Reed /* do the NEON unrolled code */ 68054e0f955c21365271661cd92a29d06a847a18554Mike Reed#define UNROLL 4 68154e0f955c21365271661cd92a29d06a847a18554Mike Reed while (count >= UNROLL) { 68254e0f955c21365271661cd92a29d06a847a18554Mike Reed uint8x8_t src_raw, dst_raw, dst_final; 68354e0f955c21365271661cd92a29d06a847a18554Mike Reed uint8x8_t src_raw_2, dst_raw_2, dst_final_2; 68454e0f955c21365271661cd92a29d06a847a18554Mike Reed 68554e0f955c21365271661cd92a29d06a847a18554Mike Reed /* get the source */ 68654e0f955c21365271661cd92a29d06a847a18554Mike Reed src_raw = vreinterpret_u8_u32(vld1_u32(src)); 68754e0f955c21365271661cd92a29d06a847a18554Mike Reed#if UNROLL > 2 68854e0f955c21365271661cd92a29d06a847a18554Mike Reed src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); 68954e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif 69054e0f955c21365271661cd92a29d06a847a18554Mike Reed 69154e0f955c21365271661cd92a29d06a847a18554Mike Reed /* get and hold the dst too */ 69254e0f955c21365271661cd92a29d06a847a18554Mike Reed dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 69354e0f955c21365271661cd92a29d06a847a18554Mike Reed#if UNROLL > 2 69454e0f955c21365271661cd92a29d06a847a18554Mike Reed dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); 69554e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif 69654e0f955c21365271661cd92a29d06a847a18554Mike Reed 69754e0f955c21365271661cd92a29d06a847a18554Mike Reed /* 1st and 2nd bits of the unrolling */ 69854e0f955c21365271661cd92a29d06a847a18554Mike Reed { 69954e0f955c21365271661cd92a29d06a847a18554Mike Reed uint8x8_t dst_cooked; 70054e0f955c21365271661cd92a29d06a847a18554Mike Reed uint16x8_t dst_wide; 70154e0f955c21365271661cd92a29d06a847a18554Mike Reed uint8x8_t alpha_narrow; 70254e0f955c21365271661cd92a29d06a847a18554Mike Reed uint16x8_t alpha_wide; 70354e0f955c21365271661cd92a29d06a847a18554Mike Reed 70454e0f955c21365271661cd92a29d06a847a18554Mike Reed /* get the alphas spread out properly */ 70554e0f955c21365271661cd92a29d06a847a18554Mike Reed alpha_narrow = vtbl1_u8(src_raw, alpha_mask); 706bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#if 1 707bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ 708bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed /* we collapsed (255-a)+1 ... */ 709bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 710bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#else 7112274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow); 71254e0f955c21365271661cd92a29d06a847a18554Mike Reed alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); 713bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#endif 71454e0f955c21365271661cd92a29d06a847a18554Mike Reed 7152274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* spread the dest */ 71654e0f955c21365271661cd92a29d06a847a18554Mike Reed dst_wide = vmovl_u8(dst_raw); 71754e0f955c21365271661cd92a29d06a847a18554Mike Reed 71854e0f955c21365271661cd92a29d06a847a18554Mike Reed /* alpha mul the dest */ 71954e0f955c21365271661cd92a29d06a847a18554Mike Reed dst_wide = vmulq_u16 (dst_wide, alpha_wide); 72054e0f955c21365271661cd92a29d06a847a18554Mike Reed dst_cooked = vshrn_n_u16(dst_wide, 8); 72154e0f955c21365271661cd92a29d06a847a18554Mike Reed 72254e0f955c21365271661cd92a29d06a847a18554Mike Reed /* sum -- ignoring any byte lane overflows */ 72354e0f955c21365271661cd92a29d06a847a18554Mike Reed dst_final = vadd_u8(src_raw, dst_cooked); 72454e0f955c21365271661cd92a29d06a847a18554Mike Reed } 72554e0f955c21365271661cd92a29d06a847a18554Mike Reed 72654e0f955c21365271661cd92a29d06a847a18554Mike Reed#if UNROLL > 2 72754e0f955c21365271661cd92a29d06a847a18554Mike Reed /* the 3rd and 4th bits of our unrolling */ 72854e0f955c21365271661cd92a29d06a847a18554Mike Reed { 72954e0f955c21365271661cd92a29d06a847a18554Mike Reed uint8x8_t dst_cooked; 73054e0f955c21365271661cd92a29d06a847a18554Mike Reed uint16x8_t dst_wide; 73154e0f955c21365271661cd92a29d06a847a18554Mike Reed uint8x8_t alpha_narrow; 73254e0f955c21365271661cd92a29d06a847a18554Mike Reed uint16x8_t alpha_wide; 73354e0f955c21365271661cd92a29d06a847a18554Mike Reed 73454e0f955c21365271661cd92a29d06a847a18554Mike Reed alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); 735bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#if 1 736bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ 737bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed /* we collapsed (255-a)+1 ... */ 738bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 739bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#else 7402274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow); 74154e0f955c21365271661cd92a29d06a847a18554Mike Reed alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); 742bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#endif 74354e0f955c21365271661cd92a29d06a847a18554Mike Reed 7442274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* spread the dest */ 74554e0f955c21365271661cd92a29d06a847a18554Mike Reed dst_wide = vmovl_u8(dst_raw_2); 74654e0f955c21365271661cd92a29d06a847a18554Mike Reed 74754e0f955c21365271661cd92a29d06a847a18554Mike Reed /* alpha mul the dest */ 74854e0f955c21365271661cd92a29d06a847a18554Mike Reed dst_wide = vmulq_u16 (dst_wide, alpha_wide); 74954e0f955c21365271661cd92a29d06a847a18554Mike Reed dst_cooked = vshrn_n_u16(dst_wide, 8); 75054e0f955c21365271661cd92a29d06a847a18554Mike Reed 75154e0f955c21365271661cd92a29d06a847a18554Mike Reed /* sum -- ignoring any byte lane overflows */ 75254e0f955c21365271661cd92a29d06a847a18554Mike Reed dst_final_2 = vadd_u8(src_raw_2, dst_cooked); 75354e0f955c21365271661cd92a29d06a847a18554Mike Reed } 75454e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif 75554e0f955c21365271661cd92a29d06a847a18554Mike Reed 75654e0f955c21365271661cd92a29d06a847a18554Mike Reed vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 75754e0f955c21365271661cd92a29d06a847a18554Mike Reed#if UNROLL > 2 75854e0f955c21365271661cd92a29d06a847a18554Mike Reed vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); 75954e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif 76054e0f955c21365271661cd92a29d06a847a18554Mike Reed 76154e0f955c21365271661cd92a29d06a847a18554Mike Reed src += UNROLL; 76254e0f955c21365271661cd92a29d06a847a18554Mike Reed dst += UNROLL; 76354e0f955c21365271661cd92a29d06a847a18554Mike Reed count -= UNROLL; 76454e0f955c21365271661cd92a29d06a847a18554Mike Reed } 76554e0f955c21365271661cd92a29d06a847a18554Mike Reed#undef UNROLL 76654e0f955c21365271661cd92a29d06a847a18554Mike Reed 76754e0f955c21365271661cd92a29d06a847a18554Mike Reed /* do any residual iterations */ 76854e0f955c21365271661cd92a29d06a847a18554Mike Reed while (--count >= 0) { 76954e0f955c21365271661cd92a29d06a847a18554Mike Reed#ifdef TEST_SRC_ALPHA 77054e0f955c21365271661cd92a29d06a847a18554Mike Reed SkPMColor sc = *src; 77154e0f955c21365271661cd92a29d06a847a18554Mike Reed if (sc) { 77254e0f955c21365271661cd92a29d06a847a18554Mike Reed unsigned srcA = SkGetPackedA32(sc); 77354e0f955c21365271661cd92a29d06a847a18554Mike Reed SkPMColor result = sc; 77454e0f955c21365271661cd92a29d06a847a18554Mike Reed if (srcA != 255) { 77554e0f955c21365271661cd92a29d06a847a18554Mike Reed result = SkPMSrcOver(sc, *dst); 77654e0f955c21365271661cd92a29d06a847a18554Mike Reed } 77754e0f955c21365271661cd92a29d06a847a18554Mike Reed *dst = result; 77854e0f955c21365271661cd92a29d06a847a18554Mike Reed } 77954e0f955c21365271661cd92a29d06a847a18554Mike Reed#else 78054e0f955c21365271661cd92a29d06a847a18554Mike Reed *dst = SkPMSrcOver(*src, *dst); 78154e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif 78254e0f955c21365271661cd92a29d06a847a18554Mike Reed src += 1; 78354e0f955c21365271661cd92a29d06a847a18554Mike Reed dst += 1; 78454e0f955c21365271661cd92a29d06a847a18554Mike Reed } 78554e0f955c21365271661cd92a29d06a847a18554Mike Reed } 78654e0f955c21365271661cd92a29d06a847a18554Mike Reed} 78754e0f955c21365271661cd92a29d06a847a18554Mike Reed 78854e0f955c21365271661cd92a29d06a847a18554Mike Reed#define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_neon 78935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 790c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#elif defined (__ARM_ARCH__) /* #if defined(__ARM_HAVE_NEON) && defined... */ 79135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 792c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#if defined(TEST_SRC_ALPHA) 793c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 794c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenbergerstatic void __attribute__((naked)) S32A_Opaque_BlitRow32_arm_test_alpha 795c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger (SkPMColor* SK_RESTRICT dst, 796c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger const SkPMColor* SK_RESTRICT src, 797c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger int count, U8CPU alpha) { 798c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 799c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger/* Optimizes for alpha == 0, alpha == 255, and 1 < alpha < 255 cases individually */ 800c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger/* Predicts that the next pixel will have the same alpha type as the current pixel */ 801c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 802c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenbergerasm volatile ( 803c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 804c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tSTMDB r13!, {r4-r12, r14} \n" /* saving r4-r12, lr on the stack */ 805c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* we should not save r0-r3 according to ABI */ 806c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 807c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tCMP r2, #0 \n" /* if (count == 0) */ 808c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBEQ 9f \n" /* go to EXIT */ 809c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 810c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tMOV r12, #0xff \n" /* load the 0xff mask in r12 */ 811c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tORR r12, r12, r12, LSL #16 \n" /* convert it to 0xff00ff in r12 */ 812c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 813c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tMOV r14, #255 \n" /* r14 = 255 */ 814c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* will be used later for left-side comparison */ 815c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 816c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tADD r2, %[src], r2, LSL #2 \n" /* r2 points to last array element which can be used */ 817c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tSUB r2, r2, #16 \n" /* as a base for 4-way processing algorithm */ 818c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 819c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tCMP %[src], r2 \n" /* if our current [src] array pointer is bigger than */ 820c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBGT 8f \n" /* calculated marker for 4-way -> */ 821c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* use simple one-by-one processing */ 822c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 823c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* START OF DISPATCHING BLOCK */ 824c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 825c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\t0: \n" 826c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 827c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLDM %[src]!, {r3, r4, r5, r6} \n" /* 4-way loading of source values to r3-r6 */ 828c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 829c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLSR r7, r3, #24 \n" /* if not all src alphas of 4-way block are equal -> */ 830c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tCMP r7, r4, LSR #24 \n" 831c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tCMPEQ r7, r5, LSR #24 \n" 832c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tCMPEQ r7, r6, LSR #24 \n" 833c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBNE 1f \n" /* -> go to general 4-way processing routine */ 834c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 835c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tCMP r14, r7 \n" /* if all src alphas are equal to 255 */ 836c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBEQ 3f \n" /* go to alpha == 255 optimized routine */ 837c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 838c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tCMP r7, #0 \n" /* if all src alphas are equal to 0 */ 839c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBEQ 6f \n" /* go to alpha == 0 optimized routine */ 840c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 841c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* END OF DISPATCHING BLOCK */ 842c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 843c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* START OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */ 844c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 845c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\t1: \n" 846c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* we do not have enough registers to make */ 847c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* 4-way [dst] loading -> we are using 2 * 2-way */ 848c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 849c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLDM %[dst], {r7, r8} \n" /* 1st 2-way loading of dst values to r7-r8 */ 850c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 851c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* PROCESSING BLOCK 1 */ 852c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* r3 = src, r7 = dst */ 853c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 854c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLSR r11, r3, #24 \n" /* extracting alpha from source and storing to r11 */ 855c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r9, r12, r7 \n" /* r9 = br masked by r12 (0xff00ff) */ 856c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tRSB r11, r11, #256 \n" /* subtracting the alpha from 255 -> r11 = scale */ 857c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r10, r12, r7, LSR #8 \n" /* r10 = ag masked by r12 (0xff00ff) */ 858c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tMUL r9, r9, r11 \n" /* br = br * scale */ 859c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r9, r12, r9, LSR #8 \n" /* lsr br by 8 and mask it */ 860c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tMUL r10, r10, r11 \n" /* ag = ag * scale */ 861c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r10, r10, r12, LSL #8 \n" /* mask ag with reverse mask */ 862c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tORR r7, r9, r10 \n" /* br | ag */ 863c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tADD r7, r3, r7 \n" /* dst = src + calc dest(r8) */ 864c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 865c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* PROCESSING BLOCK 2 */ 866c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* r4 = src, r8 = dst */ 867c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 868c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLSR r11, r4, #24 \n" /* see PROCESSING BLOCK 1 */ 869c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r9, r12, r8 \n" 870c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tRSB r11, r11, #256 \n" 871c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r10, r12, r8, LSR #8 \n" 872c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tMUL r9, r9, r11 \n" 873c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r9, r12, r9, LSR #8 \n" 874c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tMUL r10, r10, r11 \n" 875c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r10, r10, r12, LSL #8 \n" 876c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tORR r8, r9, r10 \n" 877c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tADD r8, r4, r8 \n" 878c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 879c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tSTM %[dst]!, {r7, r8} \n" /* 1st 2-way storing of processed dst values */ 880c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 881c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLDM %[dst], {r9, r10} \n" /* 2nd 2-way loading of dst values to r9-r10 */ 882c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 883c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* PROCESSING BLOCK 3 */ 884c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* r5 = src, r9 = dst */ 885c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 886c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLSR r11, r5, #24 \n" /* see PROCESSING BLOCK 1 */ 887c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r7, r12, r9 \n" 888c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tRSB r11, r11, #256 \n" 889c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r8, r12, r9, LSR #8 \n" 890c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tMUL r7, r7, r11 \n" 891c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r7, r12, r7, LSR #8 \n" 892c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tMUL r8, r8, r11 \n" 893c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r8, r8, r12, LSL #8 \n" 894c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tORR r9, r7, r8 \n" 895c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tADD r9, r5, r9 \n" 896c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 897c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* PROCESSING BLOCK 4 */ 898c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* r6 = src, r10 = dst */ 899c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 900c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLSR r11, r6, #24 \n" /* see PROCESSING BLOCK 1 */ 901c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r7, r12, r10 \n" 902c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tRSB r11, r11, #256 \n" 903c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r8, r12, r10, LSR #8 \n" 904c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tMUL r7, r7, r11 \n" 905c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r7, r12, r7, LSR #8 \n" 906c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tMUL r8, r8, r11 \n" 907c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r8, r8, r12, LSL #8 \n" 908c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tORR r10, r7, r8 \n" 909c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tADD r10, r6, r10 \n" 910c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 911c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tSTM %[dst]!, {r9, r10} \n" /* 2nd 2-way storing of processed dst values */ 912c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 913c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tCMP %[src], r2 \n" /* if our current [src] pointer <= calculated marker */ 914c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBLE 0b \n" /* we could run 4-way processing -> go to dispatcher */ 915c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBGT 8f \n" /* else -> use simple one-by-one processing */ 916c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 917c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* END OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */ 918c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 919c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* START OF BLOCK OPTIMIZED FOR ALPHA == 255 */ 920c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 921c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\t2: \n" /* ENTRY 1: LOADING [src] to registers */ 922c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 923c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLDM %[src]!, {r3, r4, r5, r6} \n" /* 4-way loading of source values to r3-r6 */ 924c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 925c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r7, r3, r4 \n" /* if not all alphas == 255 -> */ 926c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r8, r5, r6 \n" 927c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r9, r7, r8 \n" 928c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tCMP r14, r9, LSR #24 \n" 929c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBNE 4f \n" /* -> go to alpha == 0 check */ 930c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 931c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\t3: \n" /* ENTRY 2: [src] already loaded by DISPATCHER */ 932c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 933c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tSTM %[dst]!, {r3, r4, r5, r6} \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */ 934c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 935c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */ 936c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBLE 2b \n" /* we could run 4-way processing */ 937c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* because now we're in ALPHA == 255 state */ 938c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* run next cycle with priority alpha == 255 checks */ 939c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 940c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBGT 8f \n" /* if our current [src] array pointer > marker */ 941c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* use simple one-by-one processing */ 942c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 943c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\t4: \n" 944c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 945c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tORR r7, r3, r4 \n" /* if not all alphas == 0 -> */ 946c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tORR r8, r5, r6 \n" 947c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tORR r9, r7, r8 \n" 948c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLSRS r9, #24 \n" 949c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBNE 1b \n" /* -> go to general processing mode */ 950c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* (we already checked for alpha == 255) */ 951c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 952c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tADD %[dst], %[dst], #16 \n" /* all src alphas == 0 -> do not change dst values */ 953c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 954c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */ 955c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBLE 5f \n" /* we could run 4-way processing one more time */ 956c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* because now we're in ALPHA == 0 state */ 957c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* run next cycle with priority alpha == 0 checks */ 958c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 959c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBGT 8f \n" /* if our current [src] array pointer > marker */ 960c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* use simple one-by-one processing */ 961c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 962c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* END OF BLOCK OPTIMIZED FOR ALPHA == 255 */ 963c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 964c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* START OF BLOCK OPTIMIZED FOR ALPHA == 0 */ 965c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 966c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\t5: \n" /* ENTRY 1: LOADING [src] to registers */ 967c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 968c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLDM %[src]!, {r3, r4, r5, r6} \n" /* 4-way loading of source values to r3-r6 */ 969c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 970c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tORR r7, r3, r4 \n" /* if not all alphas == 0 -> */ 971c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tORR r8, r5, r6 \n" 972c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tORR r9, r7, r8 \n" 973c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLSRS r9, #24 \n" 974c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBNE 7f \n" /* -> go to alpha == 255 check */ 975c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 976c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\t6: \n" /* ENTRY 2: [src] already loaded by DISPATCHER */ 977c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 978c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tADD %[dst], %[dst], #16 \n" /* all src alphas == 0 -> do not change dst values */ 979c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 980c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */ 981c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBLE 5b \n" /* we could run 4-way processing one more time */ 982c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* because now we're in ALPHA == 0 state */ 983c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* run next cycle with priority alpha == 0 checks */ 984c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 985c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBGT 8f \n" /* if our current [src] array pointer > marker */ 986c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* use simple one-by-one processing */ 987c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\t7: \n" 988c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 989c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r7, r3, r4 \n" /* if not all alphas == 255 -> */ 990c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r8, r5, r6 \n" 991c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r9, r7, r8 \n" 992c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tCMP r14, r9, LSR #24 \n" 993c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBNE 1b \n" /* -> go to general processing mode */ 994c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* (we already checked for alpha == 0) */ 995c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 996c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tSTM %[dst]!, {r3, r4, r5, r6} \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */ 997c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 998c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tCMP %[src], r2 \n" /* if our current [src] array pointer <= marker */ 999c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBLE 2b \n" /* we could run 4-way processing one more time */ 1000c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* because now we're in ALPHA == 255 state */ 1001c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* run next cycle with priority alpha == 255 checks */ 1002c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1003c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBGT 8f \n" /* if our current [src] array pointer > marker */ 1004c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* use simple one-by-one processing */ 1005c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1006c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* END OF BLOCK OPTIMIZED FOR ALPHA == 0 */ 1007c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1008c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* START OF TAIL BLOCK */ 1009c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* (used when array is too small to be processed with 4-way algorithm)*/ 1010c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1011c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\t8: \n" 1012c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1013c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tADD r2, r2, #16 \n" /* now r2 points to the element just after array */ 1014c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* we've done r2 = r2 - 16 at procedure start */ 1015c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1016c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tCMP %[src], r2 \n" /* if our current [src] array pointer > final marker */ 1017c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBEQ 9f \n" /* goto EXIT */ 1018c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1019c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* TAIL PROCESSING BLOCK 1 */ 1020c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1021c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLDR r3, [%[src]], #4 \n" /* r3 = *src, src++ */ 1022c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLDR r7, [%[dst]] \n" /* r7 = *dst */ 1023c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1024c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLSR r11, r3, #24 \n" /* extracting alpha from source */ 1025c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r9, r12, r7 \n" /* r9 = br masked by r12 (0xff00ff) */ 1026c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tRSB r11, r11, #256 \n" /* subtracting the alpha from 255 -> r11 = scale */ 1027c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r10, r12, r7, LSR #8 \n" /* r10 = ag masked by r12 (0xff00ff) */ 1028c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tMUL r9, r9, r11 \n" /* br = br * scale */ 1029c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r9, r12, r9, LSR #8 \n" /* lsr br by 8 and mask it */ 1030c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tMUL r10, r10, r11 \n" /* ag = ag * scale */ 1031c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r10, r10, r12, LSL #8 \n" /* mask ag with reverse mask */ 1032c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tORR r7, r9, r10 \n" /* br | ag */ 1033c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tADD r7, r3, r7 \n" /* dst = src + calc dest(r8) */ 1034c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1035c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tSTR r7, [%[dst]], #4 \n" /* *dst = r7; dst++ */ 1036c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1037c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tCMP %[src], r2 \n" /* if our current [src] array pointer > final marker */ 1038c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBEQ 9f \n" /* goto EXIT */ 1039c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1040c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* TAIL PROCESSING BLOCK 2 */ 1041c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1042c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLDR r3, [%[src]], #4 \n" /* see TAIL PROCESSING BLOCK 1 */ 1043c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLDR r7, [%[dst]] \n" 1044c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1045c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLSR r11, r3, #24 \n" 1046c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r9, r12, r7 \n" 1047c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tRSB r11, r11, #256 \n" 1048c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r10, r12, r7, LSR #8 \n" 1049c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tMUL r9, r9, r11 \n" 1050c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r9, r12, r9, LSR #8 \n" 1051c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tMUL r10, r10, r11 \n" 1052c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r10, r10, r12, LSL #8 \n" 1053c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tORR r7, r9, r10 \n" 1054c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tADD r7, r3, r7 \n" 1055c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1056c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tSTR r7, [%[dst]], #4 \n" 1057c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1058c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tCMP %[src], r2 \n" 1059c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBEQ 9f \n" 1060c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1061c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* TAIL PROCESSING BLOCK 3 */ 1062c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1063c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLDR r3, [%[src]], #4 \n" /* see TAIL PROCESSING BLOCK 1 */ 1064c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLDR r7, [%[dst]] \n" 1065c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1066c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLSR r11, r3, #24 \n" 1067c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r9, r12, r7 \n" 1068c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tRSB r11, r11, #256 \n" 1069c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r10, r12, r7, LSR #8 \n" 1070c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tMUL r9, r9, r11 \n" 1071c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r9, r12, r9, LSR #8 \n" 1072c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tMUL r10, r10, r11 \n" 1073c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tAND r10, r10, r12, LSL #8 \n" 1074c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tORR r7, r9, r10 \n" 1075c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tADD r7, r3, r7 \n" 1076c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1077c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tSTR r7, [%[dst]], #4 \n" 1078c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1079c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger /* END OF TAIL BLOCK */ 1080c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1081c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\t9: \n" /* EXIT */ 1082c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1083c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tLDMIA r13!, {r4-r12, r14} \n" /* restoring r4-r12, lr from stack */ 1084c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger "\tBX lr \n" /* return */ 1085c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1086c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger : [dst] "+r" (dst), [src] "+r" (src) 1087c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger : 1088c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger : "cc", "r2", "r3", "memory" 1089c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1090c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger ); 1091c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1092c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger} 1093c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger 1094c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_arm_test_alpha 1095c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#else /* !defined(TEST_SRC_ALPHA) */ 109635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 109735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenbergerstatic void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst, 109835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger const SkPMColor* SK_RESTRICT src, 109935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger int count, U8CPU alpha) { 110035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 110135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger SkASSERT(255 == alpha); 110235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 110335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* Does not support the TEST_SRC_ALPHA case */ 110435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger asm volatile ( 110535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "cmp %[count], #0 \n\t" /* comparing count with 0 */ 110635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "beq 3f \n\t" /* if zero exit */ 110735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 110835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mov ip, #0xff \n\t" /* load the 0xff mask in ip */ 110935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "orr ip, ip, ip, lsl #16 \n\t" /* convert it to 0xff00ff in ip */ 111035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 111135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "cmp %[count], #2 \n\t" /* compare count with 2 */ 111235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "blt 2f \n\t" /* if less than 2 -> single loop */ 111335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 111435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* Double Loop */ 111535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "1: \n\t" /* <double loop> */ 111635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "ldm %[src]!, {r5,r6} \n\t" /* load the src(s) at r5-r6 */ 111735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "ldm %[dst], {r7,r8} \n\t" /* loading dst(s) into r7-r8 */ 111835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "lsr r4, r5, #24 \n\t" /* extracting the alpha from source and storing it to r4 */ 111935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 112035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* ----------- */ 112135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r9, ip, r7 \n\t" /* r9 = br masked by ip */ 112235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 256 -> r4=scale */ 112335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r10, ip, r7, lsr #8 \n\t" /* r10 = ag masked by ip */ 112435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 112535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r9, r9, r4 \n\t" /* br = br * scale */ 112635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r10, r10, r4 \n\t" /* ag = ag * scale */ 112735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */ 112835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 112935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r10, r10, ip, lsl #8 \n\t" /* mask ag with reverse mask */ 113035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "lsr r4, r6, #24 \n\t" /* extracting the alpha from source and storing it to r4 */ 113135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "orr r7, r9, r10 \n\t" /* br | ag*/ 113235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 113335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "add r7, r5, r7 \n\t" /* dst = src + calc dest(r7) */ 113435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 255 -> r4=scale */ 113535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 113635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* ----------- */ 113735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r9, ip, r8 \n\t" /* r9 = br masked by ip */ 113835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 113935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r10, ip, r8, lsr #8 \n\t" /* r10 = ag masked by ip */ 114035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r9, r9, r4 \n\t" /* br = br * scale */ 114135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "sub %[count], %[count], #2 \n\t" 114235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r10, r10, r4 \n\t" /* ag = ag * scale */ 114335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 114435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */ 114535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r10, r10, ip, lsl #8 \n\t" /* mask ag with reverse mask */ 114635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "cmp %[count], #1 \n\t" /* comparing count with 1 */ 114735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "orr r8, r9, r10 \n\t" /* br | ag */ 114835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 114935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "add r8, r6, r8 \n\t" /* dst = src + calc dest(r8) */ 115035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 115135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* ----------------- */ 115235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "stm %[dst]!, {r7,r8} \n\t" /* *dst = r7, increment dst by two (each times 4) */ 115335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* ----------------- */ 115435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 115535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "bgt 1b \n\t" /* if greater than 1 -> reloop */ 115635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "blt 3f \n\t" /* if less than 1 -> exit */ 115735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 115835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* Single Loop */ 115935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "2: \n\t" /* <single loop> */ 116035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "ldr r5, [%[src]], #4 \n\t" /* load the src pointer into r5 r5=src */ 116135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "ldr r7, [%[dst]] \n\t" /* loading dst into r7 */ 116235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "lsr r4, r5, #24 \n\t" /* extracting the alpha from source and storing it to r4 */ 116335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 116435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* ----------- */ 116535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r9, ip, r7 \n\t" /* r9 = br masked by ip */ 116635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 256 -> r4=scale */ 116735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 116835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r10, ip, r7, lsr #8 \n\t" /* r10 = ag masked by ip */ 116935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r9, r9, r4 \n\t" /* br = br * scale */ 117035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r10, r10, r4 \n\t" /* ag = ag * scale */ 117135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */ 117235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 117335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r10, r10, ip, lsl #8 \n\t" /* mask ag */ 117435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "orr r7, r9, r10 \n\t" /* br | ag */ 117535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 117635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "add r7, r5, r7 \n\t" /* *dst = src + calc dest(r7) */ 117735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 117835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* ----------------- */ 117935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "str r7, [%[dst]], #4 \n\t" /* *dst = r7, increment dst by one (times 4) */ 118035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* ----------------- */ 118135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 118235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "3: \n\t" /* <exit> */ 118335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count) 118435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger : 118535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory" 118635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger ); 118735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger} 118835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger#define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_arm 1189c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#endif /* !defined(TEST_SRC_ALPHA) */ 1190c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#else /* ... #elif defined (__ARM_ARCH__) */ 1191c7cd3e0c090c34b165ff6d1113bdc13f4b917b9bDerek Sollenberger#define S32A_Opaque_BlitRow32_PROC NULL 119254e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif 119354e0f955c21365271661cd92a29d06a847a18554Mike Reed 119435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger/* 119535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger * ARM asm version of S32A_Blend_BlitRow32 119635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger */ 119735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenbergerstatic void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst, 119835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger const SkPMColor* SK_RESTRICT src, 119935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger int count, U8CPU alpha) { 120035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger asm volatile ( 120135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "cmp %[count], #0 \n\t" /* comparing count with 0 */ 120235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "beq 3f \n\t" /* if zero exit */ 120335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 120435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mov r12, #0xff \n\t" /* load the 0xff mask in r12 */ 120535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "orr r12, r12, r12, lsl #16 \n\t" /* convert it to 0xff00ff in r12 */ 120635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 120735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* src1,2_scale */ 120835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "add %[alpha], %[alpha], #1 \n\t" /* loading %[alpha]=src_scale=alpha+1 */ 120935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 121035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "cmp %[count], #2 \n\t" /* comparing count with 2 */ 121135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "blt 2f \n\t" /* if less than 2 -> single loop */ 121235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 121335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* Double Loop */ 121435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "1: \n\t" /* <double loop> */ 121535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "ldm %[src]!, {r5, r6} \n\t" /* loading src pointers into r5 and r6 */ 121635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "ldm %[dst], {r7, r8} \n\t" /* loading dst pointers into r7 and r8 */ 121735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 121835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* dst1_scale and dst2_scale*/ 121935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "lsr r9, r5, #24 \n\t" /* src >> 24 */ 122035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "lsr r10, r6, #24 \n\t" /* src >> 24 */ 122135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "smulbb r9, r9, %[alpha] \n\t" /* r9 = SkMulS16 r9 with src_scale */ 122235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "smulbb r10, r10, %[alpha] \n\t" /* r10 = SkMulS16 r10 with src_scale */ 122335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "lsr r9, r9, #8 \n\t" /* r9 >> 8 */ 122435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "lsr r10, r10, #8 \n\t" /* r10 >> 8 */ 122535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "rsb r9, r9, #256 \n\t" /* dst1_scale = r9 = 255 - r9 + 1 */ 122635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "rsb r10, r10, #256 \n\t" /* dst2_scale = r10 = 255 - r10 + 1 */ 122735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 122835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* ---------------------- */ 122935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 123035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* src1, src1_scale */ 123135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r11, r12, r5, lsr #8 \n\t" /* ag = r11 = r5 masked by r12 lsr by #8 */ 123235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r4, r12, r5 \n\t" /* rb = r4 = r5 masked by r12 */ 123335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r11, r11, %[alpha] \n\t" /* ag = r11 times src_scale */ 123435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r4, r4, %[alpha] \n\t" /* rb = r4 times src_scale */ 123535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ 123635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */ 123735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "orr r5, r11, r4 \n\t" /* r5 = (src1, src_scale) */ 123835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 123935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* dst1, dst1_scale */ 124035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r11, r12, r7, lsr #8 \n\t" /* ag = r11 = r7 masked by r12 lsr by #8 */ 124135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r4, r12, r7 \n\t" /* rb = r4 = r7 masked by r12 */ 124235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r11, r11, r9 \n\t" /* ag = r11 times dst_scale (r9) */ 124335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r4, r4, r9 \n\t" /* rb = r4 times dst_scale (r9) */ 124435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ 124535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */ 124635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "orr r9, r11, r4 \n\t" /* r9 = (dst1, dst_scale) */ 124735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 124835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* ---------------------- */ 124935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "add r9, r5, r9 \n\t" /* *dst = src plus dst both scaled */ 125035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* ---------------------- */ 125135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 125235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* ====================== */ 125335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 125435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* src2, src2_scale */ 125535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r11, r12, r6, lsr #8 \n\t" /* ag = r11 = r6 masked by r12 lsr by #8 */ 125635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r4, r12, r6 \n\t" /* rb = r4 = r6 masked by r12 */ 125735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r11, r11, %[alpha] \n\t" /* ag = r11 times src_scale */ 125835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r4, r4, %[alpha] \n\t" /* rb = r4 times src_scale */ 125935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ 126035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */ 126135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "orr r6, r11, r4 \n\t" /* r6 = (src2, src_scale) */ 126235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 126335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* dst2, dst2_scale */ 126435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r11, r12, r8, lsr #8 \n\t" /* ag = r11 = r8 masked by r12 lsr by #8 */ 126535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r4, r12, r8 \n\t" /* rb = r4 = r8 masked by r12 */ 126635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r11, r11, r10 \n\t" /* ag = r11 times dst_scale (r10) */ 126735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r4, r4, r10 \n\t" /* rb = r4 times dst_scale (r6) */ 126835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ 126935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */ 127035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "orr r10, r11, r4 \n\t" /* r10 = (dst2, dst_scale) */ 127135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 127235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "sub %[count], %[count], #2 \n\t" /* decrease count by 2 */ 127335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* ---------------------- */ 127435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "add r10, r6, r10 \n\t" /* *dst = src plus dst both scaled */ 127535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* ---------------------- */ 127635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "cmp %[count], #1 \n\t" /* compare count with 1 */ 127735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* ----------------- */ 127835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "stm %[dst]!, {r9, r10} \n\t" /* copy r9 and r10 to r7 and r8 respectively */ 127935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* ----------------- */ 128035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 128135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "bgt 1b \n\t" /* if %[count] greater than 1 reloop */ 128235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "blt 3f \n\t" /* if %[count] less than 1 exit */ 128335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* else get into the single loop */ 128435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* Single Loop */ 128535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "2: \n\t" /* <single loop> */ 128635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "ldr r5, [%[src]], #4 \n\t" /* loading src pointer into r5: r5=src */ 128735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "ldr r7, [%[dst]] \n\t" /* loading dst pointer into r7: r7=dst */ 128835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 128935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "lsr r6, r5, #24 \n\t" /* src >> 24 */ 129035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r8, r12, r5, lsr #8 \n\t" /* ag = r8 = r5 masked by r12 lsr by #8 */ 129135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "smulbb r6, r6, %[alpha] \n\t" /* r6 = SkMulS16 with src_scale */ 129235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r9, r12, r5 \n\t" /* rb = r9 = r5 masked by r12 */ 129335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "lsr r6, r6, #8 \n\t" /* r6 >> 8 */ 129435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r8, r8, %[alpha] \n\t" /* ag = r8 times scale */ 129535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "rsb r6, r6, #256 \n\t" /* r6 = 255 - r6 + 1 */ 129635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 129735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* src, src_scale */ 129835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r9, r9, %[alpha] \n\t" /* rb = r9 times scale */ 129935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r8, r8, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ 130035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r9, r12, r9, lsr #8 \n\t" /* rb masked by mask (r12) */ 130135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "orr r10, r8, r9 \n\t" /* r10 = (scr, src_scale) */ 130235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 130335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* dst, dst_scale */ 130435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r8, r12, r7, lsr #8 \n\t" /* ag = r8 = r7 masked by r12 lsr by #8 */ 130535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r9, r12, r7 \n\t" /* rb = r9 = r7 masked by r12 */ 130635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r8, r8, r6 \n\t" /* ag = r8 times scale (r6) */ 130735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "mul r9, r9, r6 \n\t" /* rb = r9 times scale (r6) */ 130835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r8, r8, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ 130935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "and r9, r12, r9, lsr #8 \n\t" /* rb masked by mask (r12) */ 131035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "orr r7, r8, r9 \n\t" /* r7 = (dst, dst_scale) */ 131135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 131235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "add r10, r7, r10 \n\t" /* *dst = src plus dst both scaled */ 131335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 131435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* ----------------- */ 131535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "str r10, [%[dst]], #4 \n\t" /* *dst = r10, postincrement dst by one (times 4) */ 131635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger /* ----------------- */ 131735e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 131835e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger "3: \n\t" /* <exit> */ 131935e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count), [alpha] "+r" (alpha) 132035e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger : 132135e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "memory" 132235e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger ); 132335e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 132435e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger} 132535e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger#define S32A_Blend_BlitRow32_PROC S32A_Blend_BlitRow32_arm 132635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger 132754e0f955c21365271661cd92a29d06a847a18554Mike Reed/* Neon version of S32_Blend_BlitRow32() 13282274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed * portable version is in src/core/SkBlitRow_D32.cpp 132954e0f955c21365271661cd92a29d06a847a18554Mike Reed */ 133054e0f955c21365271661cd92a29d06a847a18554Mike Reed#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) 133154e0f955c21365271661cd92a29d06a847a18554Mike Reedstatic void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 133254e0f955c21365271661cd92a29d06a847a18554Mike Reed const SkPMColor* SK_RESTRICT src, 133354e0f955c21365271661cd92a29d06a847a18554Mike Reed int count, U8CPU alpha) { 133454e0f955c21365271661cd92a29d06a847a18554Mike Reed SkASSERT(alpha <= 255); 133554e0f955c21365271661cd92a29d06a847a18554Mike Reed if (count > 0) { 133654e0f955c21365271661cd92a29d06a847a18554Mike Reed uint16_t src_scale = SkAlpha255To256(alpha); 133754e0f955c21365271661cd92a29d06a847a18554Mike Reed uint16_t dst_scale = 256 - src_scale; 133854e0f955c21365271661cd92a29d06a847a18554Mike Reed 133954e0f955c21365271661cd92a29d06a847a18554Mike Reed /* run them N at a time through the NEON unit */ 134054e0f955c21365271661cd92a29d06a847a18554Mike Reed /* note that each 1 is 4 bytes, each treated exactly the same, 134154e0f955c21365271661cd92a29d06a847a18554Mike Reed * so we can work under that guise. We *do* know that the src&dst 134254e0f955c21365271661cd92a29d06a847a18554Mike Reed * will be 32-bit aligned quantities, so we can specify that on 134354e0f955c21365271661cd92a29d06a847a18554Mike Reed * the load/store ops and do a neon 'reinterpret' to get us to 134454e0f955c21365271661cd92a29d06a847a18554Mike Reed * byte-sized (pun intended) pieces that we widen/multiply/shift 134554e0f955c21365271661cd92a29d06a847a18554Mike Reed * we're limited at 128 bits in the wide ops, which is 8x16bits 134654e0f955c21365271661cd92a29d06a847a18554Mike Reed * or a pair of 32 bit src/dsts. 134754e0f955c21365271661cd92a29d06a847a18554Mike Reed */ 134854e0f955c21365271661cd92a29d06a847a18554Mike Reed /* we *could* manually unroll this loop so that we load 128 bits 134954e0f955c21365271661cd92a29d06a847a18554Mike Reed * (as a pair of 64s) from each of src and dst, processing them 135054e0f955c21365271661cd92a29d06a847a18554Mike Reed * in pieces. This might give us a little better management of 135154e0f955c21365271661cd92a29d06a847a18554Mike Reed * the memory latency, but my initial attempts here did not 135254e0f955c21365271661cd92a29d06a847a18554Mike Reed * produce an instruction stream that looked all that nice. 135354e0f955c21365271661cd92a29d06a847a18554Mike Reed */ 135454e0f955c21365271661cd92a29d06a847a18554Mike Reed#define UNROLL 2 135554e0f955c21365271661cd92a29d06a847a18554Mike Reed while (count >= UNROLL) { 135654e0f955c21365271661cd92a29d06a847a18554Mike Reed uint8x8_t src_raw, dst_raw, dst_final; 135754e0f955c21365271661cd92a29d06a847a18554Mike Reed uint16x8_t src_wide, dst_wide; 135854e0f955c21365271661cd92a29d06a847a18554Mike Reed 135954e0f955c21365271661cd92a29d06a847a18554Mike Reed /* get 64 bits of src, widen it, multiply by src_scale */ 136054e0f955c21365271661cd92a29d06a847a18554Mike Reed src_raw = vreinterpret_u8_u32(vld1_u32(src)); 136154e0f955c21365271661cd92a29d06a847a18554Mike Reed src_wide = vmovl_u8(src_raw); 13622274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */ 136354e0f955c21365271661cd92a29d06a847a18554Mike Reed src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale)); 136454e0f955c21365271661cd92a29d06a847a18554Mike Reed 136554e0f955c21365271661cd92a29d06a847a18554Mike Reed /* ditto with dst */ 136654e0f955c21365271661cd92a29d06a847a18554Mike Reed dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 136754e0f955c21365271661cd92a29d06a847a18554Mike Reed dst_wide = vmovl_u8(dst_raw); 136854e0f955c21365271661cd92a29d06a847a18554Mike Reed 13692274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* combine add with dst multiply into mul-accumulate */ 13702274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale)); 137154e0f955c21365271661cd92a29d06a847a18554Mike Reed 13722274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst_final = vshrn_n_u16(dst_wide, 8); 137354e0f955c21365271661cd92a29d06a847a18554Mike Reed vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 137454e0f955c21365271661cd92a29d06a847a18554Mike Reed 137554e0f955c21365271661cd92a29d06a847a18554Mike Reed src += UNROLL; 137654e0f955c21365271661cd92a29d06a847a18554Mike Reed dst += UNROLL; 137754e0f955c21365271661cd92a29d06a847a18554Mike Reed count -= UNROLL; 137854e0f955c21365271661cd92a29d06a847a18554Mike Reed } 137954e0f955c21365271661cd92a29d06a847a18554Mike Reed /* RBE: well, i don't like how gcc manages src/dst across the above 138054e0f955c21365271661cd92a29d06a847a18554Mike Reed * loop it's constantly calculating src+bias, dst+bias and it only 138154e0f955c21365271661cd92a29d06a847a18554Mike Reed * adjusts the real ones when we leave the loop. Not sure why 138254e0f955c21365271661cd92a29d06a847a18554Mike Reed * it's "hoisting down" (hoisting implies above in my lexicon ;)) 138354e0f955c21365271661cd92a29d06a847a18554Mike Reed * the adjustments to src/dst/count, but it does... 138454e0f955c21365271661cd92a29d06a847a18554Mike Reed * (might be SSA-style internal logic... 138554e0f955c21365271661cd92a29d06a847a18554Mike Reed */ 138654e0f955c21365271661cd92a29d06a847a18554Mike Reed 138754e0f955c21365271661cd92a29d06a847a18554Mike Reed#if UNROLL == 2 138854e0f955c21365271661cd92a29d06a847a18554Mike Reed if (count == 1) { 138954e0f955c21365271661cd92a29d06a847a18554Mike Reed *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 139054e0f955c21365271661cd92a29d06a847a18554Mike Reed } 139154e0f955c21365271661cd92a29d06a847a18554Mike Reed#else 139254e0f955c21365271661cd92a29d06a847a18554Mike Reed if (count > 0) { 139354e0f955c21365271661cd92a29d06a847a18554Mike Reed do { 139454e0f955c21365271661cd92a29d06a847a18554Mike Reed *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 139554e0f955c21365271661cd92a29d06a847a18554Mike Reed src += 1; 139654e0f955c21365271661cd92a29d06a847a18554Mike Reed dst += 1; 139754e0f955c21365271661cd92a29d06a847a18554Mike Reed } while (--count > 0); 139854e0f955c21365271661cd92a29d06a847a18554Mike Reed } 139954e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif 140054e0f955c21365271661cd92a29d06a847a18554Mike Reed 140154e0f955c21365271661cd92a29d06a847a18554Mike Reed#undef UNROLL 140254e0f955c21365271661cd92a29d06a847a18554Mike Reed } 140354e0f955c21365271661cd92a29d06a847a18554Mike Reed} 140454e0f955c21365271661cd92a29d06a847a18554Mike Reed 140554e0f955c21365271661cd92a29d06a847a18554Mike Reed#define S32_Blend_BlitRow32_PROC S32_Blend_BlitRow32_neon 140654e0f955c21365271661cd92a29d06a847a18554Mike Reed#else 140754e0f955c21365271661cd92a29d06a847a18554Mike Reed#define S32_Blend_BlitRow32_PROC NULL 140854e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif 140954e0f955c21365271661cd92a29d06a847a18554Mike Reed 141054e0f955c21365271661cd92a29d06a847a18554Mike Reed/////////////////////////////////////////////////////////////////////////////// 141154e0f955c21365271661cd92a29d06a847a18554Mike Reed 141254e0f955c21365271661cd92a29d06a847a18554Mike Reed#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) 14132274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 14142274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#undef DEBUG_OPAQUE_DITHER 14152274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 14162274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#if defined(DEBUG_OPAQUE_DITHER) 14172274ddecab6242780c010b52ae90b2c06ce38d66Mike Reedstatic void showme8(char *str, void *p, int len) 14182274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed{ 14192274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed static char buf[256]; 14202274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed char tbuf[32]; 14212274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed int i; 14222274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed char *pc = (char*) p; 14232274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed sprintf(buf,"%8s:", str); 14242274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed for(i=0;i<len;i++) { 14252274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed sprintf(tbuf, " %02x", pc[i]); 14262274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed strcat(buf, tbuf); 14272274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed } 14282274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed SkDebugf("%s\n", buf); 14292274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed} 14302274ddecab6242780c010b52ae90b2c06ce38d66Mike Reedstatic void showme16(char *str, void *p, int len) 14312274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed{ 14322274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed static char buf[256]; 14332274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed char tbuf[32]; 14342274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed int i; 14352274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed uint16_t *pc = (uint16_t*) p; 14362274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed sprintf(buf,"%8s:", str); 14372274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed len = (len / sizeof(uint16_t)); /* passed as bytes */ 14382274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed for(i=0;i<len;i++) { 14392274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed sprintf(tbuf, " %04x", pc[i]); 14402274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed strcat(buf, tbuf); 14412274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed } 14422274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed SkDebugf("%s\n", buf); 14432274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed} 14442274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#endif 14452274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 14462274ddecab6242780c010b52ae90b2c06ce38d66Mike Reedstatic void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, 144754e0f955c21365271661cd92a29d06a847a18554Mike Reed const SkPMColor* SK_RESTRICT src, 144854e0f955c21365271661cd92a29d06a847a18554Mike Reed int count, U8CPU alpha, int x, int y) { 144954e0f955c21365271661cd92a29d06a847a18554Mike Reed SkASSERT(255 == alpha); 14502274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 14512274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#define UNROLL 8 14522274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 14532274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed if (count >= UNROLL) { 14542274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed uint8x8_t dbase; 14552274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 14562274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#if defined(DEBUG_OPAQUE_DITHER) 14572274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed uint16_t tmpbuf[UNROLL]; 14582274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed int td[UNROLL]; 14592274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed int tdv[UNROLL]; 14602274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed int ta[UNROLL]; 14612274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed int tap[UNROLL]; 14622274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed uint16_t in_dst[UNROLL]; 14632274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed int offset = 0; 14642274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed int noisy = 0; 14652274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#endif 14662274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 14672274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 14682274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dbase = vld1_u8(dstart); 14692274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 14702274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed do { 14712274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed uint8x8_t sr, sg, sb, sa, d; 1472bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed uint16x8_t dst8, scale8, alpha8; 14732274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed uint16x8_t dst_r, dst_g, dst_b; 14742274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 14752274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#if defined(DEBUG_OPAQUE_DITHER) 14762274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* calculate 8 elements worth into a temp buffer */ 14772274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed { 14782274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed int my_y = y; 14792274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed int my_x = x; 14802274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed SkPMColor* my_src = (SkPMColor*)src; 14812274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed uint16_t* my_dst = dst; 14822274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed int i; 14832274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 14842274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed DITHER_565_SCAN(my_y); 14852274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed for(i=0;i<UNROLL;i++) { 14862274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed SkPMColor c = *my_src++; 14872274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed SkPMColorAssert(c); 14882274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed if (c) { 14892274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed unsigned a = SkGetPackedA32(c); 14902274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 14912274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); 14922274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed tdv[i] = DITHER_VALUE(my_x); 14932274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed ta[i] = a; 14942274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed tap[i] = SkAlpha255To256(a); 14952274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed td[i] = d; 14962274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 14972274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed unsigned sr = SkGetPackedR32(c); 14982274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed unsigned sg = SkGetPackedG32(c); 14992274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed unsigned sb = SkGetPackedB32(c); 15002274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed sr = SkDITHER_R32_FOR_565(sr, d); 15012274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed sg = SkDITHER_G32_FOR_565(sg, d); 15022274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed sb = SkDITHER_B32_FOR_565(sb, d); 15032274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 15042274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 15052274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed uint32_t dst_expanded = SkExpand_rgb_16(*my_dst); 15062274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 15072274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed // now src and dst expanded are in g:11 r:10 x:1 b:10 15082274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 15092274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed td[i] = d; 15102274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 15112274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed } else { 15122274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed tmpbuf[i] = *my_dst; 15132274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed ta[i] = tdv[i] = td[i] = 0xbeef; 15142274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed } 15152274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed in_dst[i] = *my_dst; 15162274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed my_dst += 1; 15172274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed DITHER_INC_X(my_x); 15182274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed } 15192274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed } 15202274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#endif 15212274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 15222274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* source is in ABGR */ 15232274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed { 15242274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed register uint8x8_t d0 asm("d0"); 15252274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed register uint8x8_t d1 asm("d1"); 15262274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed register uint8x8_t d2 asm("d2"); 15272274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed register uint8x8_t d3 asm("d3"); 15282274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 15292274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" 15302274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) 15312274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed : "r" (src) 15322274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed ); 15332274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed sr = d0; sg = d1; sb = d2; sa = d3; 15342274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed } 15352274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 15362274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* calculate 'd', which will be 0..7 */ 15372274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */ 15381cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger#if defined(SK_BUILD_FOR_ANDROID) 1539bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ 1540bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1)); 1541bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#else 1542bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7)); 1543bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#endif 1544bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase)); 1545bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed d = vshrn_n_u16(alpha8, 8); /* narrowing too */ 15462274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 15472274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* sr = sr - (sr>>5) + d */ 15482274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* watching for 8-bit overflow. d is 0..7; risky range of 15492274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed * sr is >248; and then (sr>>5) is 7 so it offsets 'd'; 15502274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed * safe as long as we do ((sr-sr>>5) + d) */ 15512274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 15522274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed sr = vadd_u8(sr, d); 15532274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 15542274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* sb = sb - (sb>>5) + d */ 15552274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 15562274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed sb = vadd_u8(sb, d); 15572274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 15582274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ 15592274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 15602274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed sg = vadd_u8(sg, vshr_n_u8(d,1)); 15612274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 15622274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* need to pick up 8 dst's -- at 16 bits each, 128 bits */ 15632274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst8 = vld1q_u16(dst); 15642274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F)); 15652274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F)); 15662274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */ 15672274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 15682274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* blend */ 1569bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#if 1 1570bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ 1571bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed /* originally 255-sa + 1 */ 1572bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed scale8 = vsubw_u8(vdupq_n_u16(256), sa); 1573bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#else 15742274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed scale8 = vsubw_u8(vdupq_n_u16(255), sa); 15752274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7)); 1576bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#endif 1577bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed 1578bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#if 1 1579bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed /* combine the addq and mul, save 3 insns */ 1580bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed scale8 = vshrq_n_u16(scale8, 3); 1581bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8); 1582bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); 1583bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); 1584bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#else 1585bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed /* known correct, but +3 insns over above */ 15862274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed scale8 = vshrq_n_u16(scale8, 3); 15872274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst_b = vmulq_u16(dst_b, scale8); 15882274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst_g = vmulq_u16(dst_g, scale8); 15892274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst_r = vmulq_u16(dst_r, scale8); 15902274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 15912274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* combine */ 15922274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* NB: vshll widens, need to preserve those bits */ 15932274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2)); 15942274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3)); 15952274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2)); 1596bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed#endif 15972274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 15982274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* repack to store */ 15992274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F)); 16002274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); 16012274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); 16022274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 16032274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed vst1q_u16(dst, dst8); 16042274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 16052274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#if defined(DEBUG_OPAQUE_DITHER) 16062274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* verify my 8 elements match the temp buffer */ 16072274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed { 16082274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed int i, bad=0; 16092274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed static int invocation; 16102274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 16112274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed for (i=0;i<UNROLL;i++) 16122274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed if (tmpbuf[i] != dst[i]) bad=1; 1613bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed if (bad) { 16142274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n", 16152274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed invocation, offset); 1616bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed SkDebugf(" alpha 0x%x\n", alpha); 16172274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed for (i=0;i<UNROLL;i++) 16182274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n", 16192274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed i, ((tmpbuf[i] != dst[i])?"BAD":"got"), 16202274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]); 16212274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 1622bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed showme16("alpha8", &alpha8, sizeof(alpha8)); 1623bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed showme16("scale8", &scale8, sizeof(scale8)); 1624bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed showme8("d", &d, sizeof(d)); 1625bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed showme16("dst8", &dst8, sizeof(dst8)); 1626bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed showme16("dst_b", &dst_b, sizeof(dst_b)); 1627bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed showme16("dst_g", &dst_g, sizeof(dst_g)); 1628bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed showme16("dst_r", &dst_r, sizeof(dst_r)); 1629bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed showme8("sb", &sb, sizeof(sb)); 1630bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed showme8("sg", &sg, sizeof(sg)); 1631bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed showme8("sr", &sr, sizeof(sr)); 1632bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed 16332274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* cop out */ 16342274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed return; 16352274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed } 16362274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed offset += UNROLL; 16372274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed invocation++; 16382274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed } 16392274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#endif 16402274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 16412274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed dst += UNROLL; 16422274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed src += UNROLL; 16432274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed count -= UNROLL; 16442274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* skip x += UNROLL, since it's unchanged mod-4 */ 16452274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed } while (count >= UNROLL); 16462274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed } 16472274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed#undef UNROLL 16482274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed 16492274ddecab6242780c010b52ae90b2c06ce38d66Mike Reed /* residuals */ 165054e0f955c21365271661cd92a29d06a847a18554Mike Reed if (count > 0) { 165154e0f955c21365271661cd92a29d06a847a18554Mike Reed DITHER_565_SCAN(y); 165254e0f955c21365271661cd92a29d06a847a18554Mike Reed do { 165354e0f955c21365271661cd92a29d06a847a18554Mike Reed SkPMColor c = *src++; 165454e0f955c21365271661cd92a29d06a847a18554Mike Reed SkPMColorAssert(c); 165554e0f955c21365271661cd92a29d06a847a18554Mike Reed if (c) { 165654e0f955c21365271661cd92a29d06a847a18554Mike Reed unsigned a = SkGetPackedA32(c); 165754e0f955c21365271661cd92a29d06a847a18554Mike Reed 165887a2a317c4e99c547ecbfa81f40f7bd7f2932433Mike Reed // dither and alpha are just temporary variables to work-around 165987a2a317c4e99c547ecbfa81f40f7bd7f2932433Mike Reed // an ICE in debug. 166087a2a317c4e99c547ecbfa81f40f7bd7f2932433Mike Reed unsigned dither = DITHER_VALUE(x); 166187a2a317c4e99c547ecbfa81f40f7bd7f2932433Mike Reed unsigned alpha = SkAlpha255To256(a); 166287a2a317c4e99c547ecbfa81f40f7bd7f2932433Mike Reed int d = SkAlphaMul(dither, alpha); 166354e0f955c21365271661cd92a29d06a847a18554Mike Reed 166454e0f955c21365271661cd92a29d06a847a18554Mike Reed unsigned sr = SkGetPackedR32(c); 166554e0f955c21365271661cd92a29d06a847a18554Mike Reed unsigned sg = SkGetPackedG32(c); 166654e0f955c21365271661cd92a29d06a847a18554Mike Reed unsigned sb = SkGetPackedB32(c); 166754e0f955c21365271661cd92a29d06a847a18554Mike Reed sr = SkDITHER_R32_FOR_565(sr, d); 166854e0f955c21365271661cd92a29d06a847a18554Mike Reed sg = SkDITHER_G32_FOR_565(sg, d); 166954e0f955c21365271661cd92a29d06a847a18554Mike Reed sb = SkDITHER_B32_FOR_565(sb, d); 167054e0f955c21365271661cd92a29d06a847a18554Mike Reed 167154e0f955c21365271661cd92a29d06a847a18554Mike Reed uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 167254e0f955c21365271661cd92a29d06a847a18554Mike Reed uint32_t dst_expanded = SkExpand_rgb_16(*dst); 167354e0f955c21365271661cd92a29d06a847a18554Mike Reed dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 167454e0f955c21365271661cd92a29d06a847a18554Mike Reed // now src and dst expanded are in g:11 r:10 x:1 b:10 167554e0f955c21365271661cd92a29d06a847a18554Mike Reed *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 167654e0f955c21365271661cd92a29d06a847a18554Mike Reed } 167754e0f955c21365271661cd92a29d06a847a18554Mike Reed dst += 1; 167854e0f955c21365271661cd92a29d06a847a18554Mike Reed DITHER_INC_X(x); 167954e0f955c21365271661cd92a29d06a847a18554Mike Reed } while (--count != 0); 168054e0f955c21365271661cd92a29d06a847a18554Mike Reed } 168154e0f955c21365271661cd92a29d06a847a18554Mike Reed} 168254e0f955c21365271661cd92a29d06a847a18554Mike Reed 168354e0f955c21365271661cd92a29d06a847a18554Mike Reed#define S32A_D565_Opaque_Dither_PROC S32A_D565_Opaque_Dither_neon 168454e0f955c21365271661cd92a29d06a847a18554Mike Reed#else 168554e0f955c21365271661cd92a29d06a847a18554Mike Reed#define S32A_D565_Opaque_Dither_PROC NULL 168654e0f955c21365271661cd92a29d06a847a18554Mike Reed#endif 168754e0f955c21365271661cd92a29d06a847a18554Mike Reed 168854e0f955c21365271661cd92a29d06a847a18554Mike Reed/////////////////////////////////////////////////////////////////////////////// 168954e0f955c21365271661cd92a29d06a847a18554Mike Reed 16901ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) 16911ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed/* 2009/10/27: RBE says "a work in progress"; debugging says ok; 16921ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed * speedup untested, but ARM version is 26 insns/iteration and 16931ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed * this NEON version is 21 insns/iteration-of-8 (2.62insns/element) 16941ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed * which is 10x the native version; that's pure instruction counts, 16951ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed * not accounting for any instruction or memory latencies. 16961ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed */ 16971ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 16981ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#undef DEBUG_S32_OPAQUE_DITHER 16991ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 17001ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reedstatic void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, 17011ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed const SkPMColor* SK_RESTRICT src, 17021ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed int count, U8CPU alpha, int x, int y) { 17031ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed SkASSERT(255 == alpha); 17041ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 17051ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#define UNROLL 8 17061ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed if (count >= UNROLL) { 17071ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed uint8x8_t d; 17081ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 17091ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed d = vld1_u8(dstart); 17101ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 17111ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed while (count >= UNROLL) { 17121ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed uint8x8_t sr, sg, sb, sa; 17131ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed uint16x8_t dr, dg, db, da; 17141ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed uint16x8_t dst8; 17151ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 17161ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed /* source is in ABGR ordering (R == lsb) */ 17171ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed { 17181ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed register uint8x8_t d0 asm("d0"); 17191ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed register uint8x8_t d1 asm("d1"); 17201ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed register uint8x8_t d2 asm("d2"); 17211ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed register uint8x8_t d3 asm("d3"); 17221ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 17231ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" 17241ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) 17251ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed : "r" (src) 17261ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed ); 17271ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed sr = d0; sg = d1; sb = d2; sa = d3; 17281ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed } 17291ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed /* XXX: if we want to prefetch, hide it in the above asm() 17301ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed * using the gcc __builtin_prefetch(), the prefetch will 17311ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed * fall to the bottom of the loop -- it won't stick up 17321ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed * at the top of the loop, just after the vld4. 17331ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed */ 17341ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 17351ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed /* sr = sr - (sr>>5) + d */ 17361ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 17371ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed dr = vaddl_u8(sr, d); 17381ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 17391ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed /* sb = sb - (sb>>5) + d */ 17401ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 17411ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed db = vaddl_u8(sb, d); 17421ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 17431ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ 17441ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 17451ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed dg = vaddl_u8(sg, vshr_n_u8(d,1)); 17461ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed /* XXX: check that the "d>>1" here is hoisted */ 17471ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 17481ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed /* pack high bits of each into 565 format (rgb, b is lsb) */ 17491ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed dst8 = vshrq_n_u16(db, 3); 17501ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); 17511ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11); 17521ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 17531ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed /* store it */ 17541ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed vst1q_u16(dst, dst8); 17551ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 17561ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#if defined(DEBUG_S32_OPAQUE_DITHER) 17571ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed /* always good to know if we generated good results */ 17581ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed { 17591ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed int i, myx = x, myy = y; 17601ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed DITHER_565_SCAN(myy); 17611ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed for (i=0;i<UNROLL;i++) { 17621ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed SkPMColor c = src[i]; 17631ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed unsigned dither = DITHER_VALUE(myx); 17641ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed uint16_t val = SkDitherRGB32To565(c, dither); 17651ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed if (val != dst[i]) { 17661ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n", 17671ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed c, dither, val, dst[i], dstart[i]); 17681ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed } 17691ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed DITHER_INC_X(myx); 17701ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed } 17711ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed } 17721ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#endif 17731ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 17741ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed dst += UNROLL; 17751ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed src += UNROLL; 17761ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed count -= UNROLL; 17771ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed x += UNROLL; /* probably superfluous */ 17781ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed } 17791ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed } 17801ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#undef UNROLL 17811ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 17821ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed /* residuals */ 17831ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed if (count > 0) { 17841ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed DITHER_565_SCAN(y); 17851ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed do { 17861ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed SkPMColor c = *src++; 17871ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed SkPMColorAssert(c); 17881ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed SkASSERT(SkGetPackedA32(c) == 255); 17891ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 17901ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed unsigned dither = DITHER_VALUE(x); 17911ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed *dst++ = SkDitherRGB32To565(c, dither); 17921ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed DITHER_INC_X(x); 17931ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed } while (--count != 0); 17941ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed } 17951ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed} 17961ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 17971ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#define S32_D565_Opaque_Dither_PROC S32_D565_Opaque_Dither_neon 17981ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#else 17991ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#define S32_D565_Opaque_Dither_PROC NULL 18001ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed#endif 18011ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 18021ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed/////////////////////////////////////////////////////////////////////////////// 18031ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed 180424fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reedstatic const SkBlitRow::Proc platform_565_procs[] = { 180596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // no dither 180696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed S32_D565_Opaque_PROC, 180796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed S32_D565_Blend_PROC, 180896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed S32A_D565_Opaque_PROC, 18093d54018fa5ed403ecff0e5ef6177fbf660d6025bMike Reed S32A_D565_Blend_PROC, 181096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 181196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // dither 18121ba87c2b414250b98cf5580cf105a80bbe7d311dMike Reed S32_D565_Opaque_Dither_PROC, 181396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed S32_D565_Blend_Dither_PROC, 18143d54018fa5ed403ecff0e5ef6177fbf660d6025bMike Reed S32A_D565_Opaque_Dither_PROC, 181596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed NULL, // S32A_D565_Blend_Dither 181696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed}; 181796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 181824fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reedstatic const SkBlitRow::Proc platform_4444_procs[] = { 181996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // no dither 182096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed NULL, // S32_D4444_Opaque, 182196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed NULL, // S32_D4444_Blend, 182296e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed NULL, // S32A_D4444_Opaque, 182396e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed NULL, // S32A_D4444_Blend, 182496e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 182596e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed // dither 182696e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed NULL, // S32_D4444_Opaque_Dither, 182796e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed NULL, // S32_D4444_Blend_Dither, 182896e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed NULL, // S32A_D4444_Opaque_Dither, 182996e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed NULL, // S32A_D4444_Blend_Dither 183096e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed}; 183196e6157bf8ddd0ab1ee75d3bf56d1443d3571d45Mike Reed 183224fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reedstatic const SkBlitRow::Proc32 platform_32_procs[] = { 1833d0195f840fa964da51f7a1192b432954794e660cMike Reed NULL, // S32_Opaque, 183454e0f955c21365271661cd92a29d06a847a18554Mike Reed S32_Blend_BlitRow32_PROC, // S32_Blend, 183554e0f955c21365271661cd92a29d06a847a18554Mike Reed S32A_Opaque_BlitRow32_PROC, // S32A_Opaque, 183635e2e62b55598210f6999fc2ea26ff8f41446ffeDerek Sollenberger S32A_Blend_BlitRow32_PROC // S32A_Blend 1837d0195f840fa964da51f7a1192b432954794e660cMike Reed}; 1838d0195f840fa964da51f7a1192b432954794e660cMike Reed 183924fb8c7cc7b76134a25914d8f6346c89c359c621Mike ReedSkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) { 184024fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reed return platform_4444_procs[flags]; 184124fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reed} 184224fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reed 184324fb8c7cc7b76134a25914d8f6346c89c359c621Mike ReedSkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) { 184424fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reed return platform_565_procs[flags]; 184524fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reed} 184624fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reed 184724fb8c7cc7b76134a25914d8f6346c89c359c621Mike ReedSkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { 184824fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reed return platform_32_procs[flags]; 184924fb8c7cc7b76134a25914d8f6346c89c359c621Mike Reed} 1850bebe09a7f530f0d80fb3da8674153813a8ba6edaMike Reed 185140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek SollenbergerSkBlitRow::ColorProc SkBlitRow::PlatformColorProc() { 185240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger return NULL; 185340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger} 185405b6b4d746867a9fb02e14edfe1bf3685abeb813Derek Sollenberger 18551cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger/////////////////////////////////////////////////////////////////////////////// 185605b6b4d746867a9fb02e14edfe1bf3685abeb813Derek Sollenberger 18571cab2921ab279367f8206cdadc9259d12e603548Derek SollenbergerSkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig, 18581cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger SkMask::Format maskFormat, 18591cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger SkColor color) { 18601cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger return NULL; 18611cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger} 18621cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger 18634f1dae40e24d57d647db01443b8bf2410514b8b5Derek SollenbergerSkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) { 18644f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger return NULL; 18654f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger} 18664f1dae40e24d57d647db01443b8bf2410514b8b5Derek Sollenberger 18671cab2921ab279367f8206cdadc9259d12e603548Derek SollenbergerSkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig, 18681cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger SkMask::Format maskFormat, 18691cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger RowFlags flags) { 18701cab2921ab279367f8206cdadc9259d12e603548Derek Sollenberger return NULL; 187105b6b4d746867a9fb02e14edfe1bf3685abeb813Derek Sollenberger} 1872