14a37d08382a16717cde52c3d2687b021c5413464mtklein/* 24a37d08382a16717cde52c3d2687b021c5413464mtklein * Copyright 2015 Google Inc. 34a37d08382a16717cde52c3d2687b021c5413464mtklein * 44a37d08382a16717cde52c3d2687b021c5413464mtklein * Use of this source code is governed by a BSD-style license that can be 54a37d08382a16717cde52c3d2687b021c5413464mtklein * found in the LICENSE file. 64a37d08382a16717cde52c3d2687b021c5413464mtklein */ 74a37d08382a16717cde52c3d2687b021c5413464mtklein 84a37d08382a16717cde52c3d2687b021c5413464mtklein#ifndef SkBlitRow_opts_DEFINED 94a37d08382a16717cde52c3d2687b021c5413464mtklein#define SkBlitRow_opts_DEFINED 104a37d08382a16717cde52c3d2687b021c5413464mtklein 114a37d08382a16717cde52c3d2687b021c5413464mtklein#include "Sk4px.h" 12a4083c97d48e8a4f88e2797d7363f141e3d42553Cary Clark#include "SkColorData.h" 13b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein#include "SkMSAN.h" 14b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 15b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 16b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein #include "SkColor_opts_SSE2.h" 17d8e2b13ad4d45d8224c2e0b6ef77bfaff404c5b5Herbert Derby #include <immintrin.h> 18b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein#endif 194a37d08382a16717cde52c3d2687b021c5413464mtklein 204a37d08382a16717cde52c3d2687b021c5413464mtkleinnamespace SK_OPTS_NS { 214a37d08382a16717cde52c3d2687b021c5413464mtklein 224a37d08382a16717cde52c3d2687b021c5413464mtklein// Color32 uses the blend_256_round_alt algorithm from tests/BlendTest.cpp. 234a37d08382a16717cde52c3d2687b021c5413464mtklein// It's not quite perfect, but it's never wrong in the interesting edge cases, 244a37d08382a16717cde52c3d2687b021c5413464mtklein// and it's quite a bit faster than blend_perfect. 254a37d08382a16717cde52c3d2687b021c5413464mtklein// 264a37d08382a16717cde52c3d2687b021c5413464mtklein// blend_256_round_alt is our currently blessed algorithm. Please use it or an analogous one. 27b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtkleinstatic inline 28b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtkleinvoid blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) { 294a37d08382a16717cde52c3d2687b021c5413464mtklein unsigned invA = 255 - SkGetPackedA32(color); 304a37d08382a16717cde52c3d2687b021c5413464mtklein invA += invA >> 7; 314a37d08382a16717cde52c3d2687b021c5413464mtklein SkASSERT(invA < 256); // We've should have already handled alpha == 0 externally. 324a37d08382a16717cde52c3d2687b021c5413464mtklein 334a37d08382a16717cde52c3d2687b021c5413464mtklein Sk16h colorHighAndRound = Sk4px::DupPMColor(color).widenHi() + Sk16h(128); 344a37d08382a16717cde52c3d2687b021c5413464mtklein Sk16b invA_16x(invA); 354a37d08382a16717cde52c3d2687b021c5413464mtklein 364a37d08382a16717cde52c3d2687b021c5413464mtklein Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px { 374a37d08382a16717cde52c3d2687b021c5413464mtklein return (src4 * invA_16x).addNarrowHi(colorHighAndRound); 384a37d08382a16717cde52c3d2687b021c5413464mtklein }); 394a37d08382a16717cde52c3d2687b021c5413464mtklein} 404a37d08382a16717cde52c3d2687b021c5413464mtklein 41a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin#if defined(SK_ARM_HAS_NEON) 42a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin 43a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// Return a uint8x8_t value, r, computed as r[i] = SkMulDiv255Round(x[i], y[i]), where r[i], x[i], 44a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// y[i] are the i-th lanes of the corresponding NEON vectors. 45a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchinstatic inline uint8x8_t SkMulDiv255Round_neon8(uint8x8_t x, uint8x8_t y) { 46a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin uint16x8_t prod = vmull_u8(x, y); 47a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin return vraddhn_u16(prod, vrshrq_n_u16(prod, 8)); 48a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin} 49a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin 50a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// The implementations of SkPMSrcOver below perform alpha blending consistently with 51a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// SkMulDiv255Round. They compute the color components (numbers in the interval [0, 255]) as: 52a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// 53a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// result_i = src_i + rint(g(src_alpha, dst_i)) 54a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// 55a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// where g(x, y) = ((255.0 - x) * y) / 255.0 and rint rounds to the nearest integer. 56a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin 57a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// In this variant of SkPMSrcOver each NEON register, dst.val[i], src.val[i], contains the value 58a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// of the same color component for 8 consecutive pixels. The result of this function follows the 59a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// same convention. 60a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchinstatic inline uint8x8x4_t SkPMSrcOver_neon8(uint8x8x4_t dst, uint8x8x4_t src) { 61a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin uint8x8_t nalphas = vmvn_u8(src.val[3]); 62a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin uint8x8x4_t result; 63a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin result.val[0] = vadd_u8(src.val[0], SkMulDiv255Round_neon8(nalphas, dst.val[0])); 64a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin result.val[1] = vadd_u8(src.val[1], SkMulDiv255Round_neon8(nalphas, dst.val[1])); 65a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin result.val[2] = vadd_u8(src.val[2], SkMulDiv255Round_neon8(nalphas, dst.val[2])); 66a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin result.val[3] = vadd_u8(src.val[3], SkMulDiv255Round_neon8(nalphas, dst.val[3])); 67a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin return result; 68a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin} 69a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin 70a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// In this variant of SkPMSrcOver dst and src contain the color components of two consecutive 71a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// pixels. The return value follows the same convention. 72a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchinstatic inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) { 73a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin const uint8x8_t alpha_indices = vcreate_u8(0x0707070703030303); 74a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin uint8x8_t nalphas = vmvn_u8(vtbl1_u8(src, alpha_indices)); 75a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin return vadd_u8(src, SkMulDiv255Round_neon8(nalphas, dst)); 76a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin} 77a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin 78a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin#endif 79a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin 80cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline 81b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtkleinvoid blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) { 82b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein SkASSERT(alpha == 0xFF); 83b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein sk_msan_assert_initialized(src, src+len); 84b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 85b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 86b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein while (len >= 16) { 87b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein // Load 16 source pixels. 88b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0), 89b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein s1 = _mm_loadu_si128((const __m128i*)(src) + 1), 90b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein s2 = _mm_loadu_si128((const __m128i*)(src) + 2), 91b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein s3 = _mm_loadu_si128((const __m128i*)(src) + 3); 92b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 93b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein const auto alphaMask = _mm_set1_epi32(0xFF000000); 94b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 95b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0))); 96b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein if (_mm_testz_si128(ORed, alphaMask)) { 97b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein // All 16 source pixels are transparent. Nothing to do. 98b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein src += 16; 99b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein dst += 16; 100b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein len -= 16; 101b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein continue; 102b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein } 103b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 104b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein auto d0 = (__m128i*)(dst) + 0, 105b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein d1 = (__m128i*)(dst) + 1, 106b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein d2 = (__m128i*)(dst) + 2, 107b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein d3 = (__m128i*)(dst) + 3; 108b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 109b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0))); 110b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein if (_mm_testc_si128(ANDed, alphaMask)) { 111b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein // All 16 source pixels are opaque. SrcOver becomes Src. 112b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein _mm_storeu_si128(d0, s0); 113b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein _mm_storeu_si128(d1, s1); 114b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein _mm_storeu_si128(d2, s2); 115b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein _mm_storeu_si128(d3, s3); 116b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein src += 16; 117b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein dst += 16; 118b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein len -= 16; 119b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein continue; 120b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein } 121b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 122b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein // TODO: This math is wrong. 123b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein // Do SrcOver. 124b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0))); 125b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1))); 126b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2))); 127b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3))); 128b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein src += 16; 129b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein dst += 16; 130b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein len -= 16; 131b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein } 132b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 133b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 134b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein while (len >= 16) { 135b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein // Load 16 source pixels. 136b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0), 137b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein s1 = _mm_loadu_si128((const __m128i*)(src) + 1), 138b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein s2 = _mm_loadu_si128((const __m128i*)(src) + 2), 139b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein s3 = _mm_loadu_si128((const __m128i*)(src) + 3); 140b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 141b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein const auto alphaMask = _mm_set1_epi32(0xFF000000); 142b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 143b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0))); 144b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), 145b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein _mm_setzero_si128()))) { 146b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein // All 16 source pixels are transparent. Nothing to do. 147b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein src += 16; 148b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein dst += 16; 149b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein len -= 16; 150b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein continue; 151b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein } 152b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 153b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein auto d0 = (__m128i*)(dst) + 0, 154b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein d1 = (__m128i*)(dst) + 1, 155b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein d2 = (__m128i*)(dst) + 2, 156b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein d3 = (__m128i*)(dst) + 3; 157b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 158b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0))); 159b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), 160b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein alphaMask))) { 161b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein // All 16 source pixels are opaque. SrcOver becomes Src. 162b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein _mm_storeu_si128(d0, s0); 163b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein _mm_storeu_si128(d1, s1); 164b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein _mm_storeu_si128(d2, s2); 165b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein _mm_storeu_si128(d3, s3); 166b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein src += 16; 167b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein dst += 16; 168b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein len -= 16; 169b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein continue; 170b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein } 171b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 172b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein // TODO: This math is wrong. 173b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein // Do SrcOver. 174b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0))); 175b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1))); 176b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2))); 177b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3))); 178b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 179b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein src += 16; 180b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein dst += 16; 181b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein len -= 16; 182b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein } 183b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 184b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein#elif defined(SK_ARM_HAS_NEON) 185a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin // Do 8-pixels at a time. A 16-pixels at a time version of this code was also tested, but it 186a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin // underperformed on some of the platforms under test for inputs with frequent transitions of 187a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin // alpha (corresponding to changes of the conditions [~]alpha_u64 == 0 below). It may be worth 188a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin // revisiting the situation in the future. 189a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin while (len >= 8) { 190a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin // Load 8 pixels in 4 NEON registers. src_col.val[i] will contain the same color component 191a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin // for 8 consecutive pixels (e.g. src_col.val[3] will contain all alpha components of 8 192a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin // pixels). 193a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin uint8x8x4_t src_col = vld4_u8(reinterpret_cast<const uint8_t*>(src)); 194a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin src += 8; 195a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin len -= 8; 196a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin 197a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin // We now detect 2 special cases: the first occurs when all alphas are zero (the 8 pixels 198a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin // are all transparent), the second when all alphas are fully set (they are all opaque). 199a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin uint8x8_t alphas = src_col.val[3]; 200a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin uint64_t alphas_u64 = vget_lane_u64(vreinterpret_u64_u8(alphas), 0); 201a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin if (alphas_u64 == 0) { 202a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin // All pixels transparent. 203a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin dst += 8; 204b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein continue; 205b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein } 206b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 207a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin if (~alphas_u64 == 0) { 208a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin // All pixels opaque. 209a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin vst4_u8(reinterpret_cast<uint8_t*>(dst), src_col); 210a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin dst += 8; 211b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein continue; 212b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein } 213b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 214a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin uint8x8x4_t dst_col = vld4_u8(reinterpret_cast<uint8_t*>(dst)); 215a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin vst4_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon8(dst_col, src_col)); 216a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin dst += 8; 217a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin } 218b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 219a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin // Deal with leftover pixels. 220a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin for (; len >= 2; len -= 2, src += 2, dst += 2) { 221a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin uint8x8_t src2 = vld1_u8(reinterpret_cast<const uint8_t*>(src)); 222a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin uint8x8_t dst2 = vld1_u8(reinterpret_cast<const uint8_t*>(dst)); 223a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin vst1_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon2(dst2, src2)); 224a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin } 225b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 226a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin if (len != 0) { 227a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8(*dst), vcreate_u8(*src)); 228a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0); 229b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein } 230a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin return; 231b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein#endif 232b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 233b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein while (len-- > 0) { 2343e3181263c034db3ba657c35cce9ae29c411b2b6mtklein // This 0xFF000000 is not semantically necessary, but for compatibility 2353e3181263c034db3ba657c35cce9ae29c411b2b6mtklein // with chromium:611002 we need to keep it until we figure out where 2363e3181263c034db3ba657c35cce9ae29c411b2b6mtklein // the non-premultiplied src values (like 0x00FFFFFF) are coming from. 2373e3181263c034db3ba657c35cce9ae29c411b2b6mtklein // TODO(mtklein): sort this out and assert *src is premul here. 2383e3181263c034db3ba657c35cce9ae29c411b2b6mtklein if (*src & 0xFF000000) { 239b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein *dst = (*src >= 0xFF000000) ? *src : SkPMSrcOver(*src, *dst); 240b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein } 241b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein src++; 242b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein dst++; 243b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein } 244b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein} 245b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein 2464a37d08382a16717cde52c3d2687b021c5413464mtklein} // SK_OPTS_NS 2474a37d08382a16717cde52c3d2687b021c5413464mtklein 2484a37d08382a16717cde52c3d2687b021c5413464mtklein#endif//SkBlitRow_opts_DEFINED 249