/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
74a37d08382a16717cde52c3d2687b021c5413464mtklein
84a37d08382a16717cde52c3d2687b021c5413464mtklein#ifndef SkBlitRow_opts_DEFINED
94a37d08382a16717cde52c3d2687b021c5413464mtklein#define SkBlitRow_opts_DEFINED
104a37d08382a16717cde52c3d2687b021c5413464mtklein
114a37d08382a16717cde52c3d2687b021c5413464mtklein#include "Sk4px.h"
12a4083c97d48e8a4f88e2797d7363f141e3d42553Cary Clark#include "SkColorData.h"
13b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein#include "SkMSAN.h"
14b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
15b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
16b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein    #include "SkColor_opts_SSE2.h"
17d8e2b13ad4d45d8224c2e0b6ef77bfaff404c5b5Herbert Derby    #include <immintrin.h>
18b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein#endif
194a37d08382a16717cde52c3d2687b021c5413464mtklein
204a37d08382a16717cde52c3d2687b021c5413464mtkleinnamespace SK_OPTS_NS {
214a37d08382a16717cde52c3d2687b021c5413464mtklein
224a37d08382a16717cde52c3d2687b021c5413464mtklein// Color32 uses the blend_256_round_alt algorithm from tests/BlendTest.cpp.
234a37d08382a16717cde52c3d2687b021c5413464mtklein// It's not quite perfect, but it's never wrong in the interesting edge cases,
244a37d08382a16717cde52c3d2687b021c5413464mtklein// and it's quite a bit faster than blend_perfect.
254a37d08382a16717cde52c3d2687b021c5413464mtklein//
264a37d08382a16717cde52c3d2687b021c5413464mtklein// blend_256_round_alt is our currently blessed algorithm.  Please use it or an analogous one.
27b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtkleinstatic inline
28b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtkleinvoid blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
294a37d08382a16717cde52c3d2687b021c5413464mtklein    unsigned invA = 255 - SkGetPackedA32(color);
304a37d08382a16717cde52c3d2687b021c5413464mtklein    invA += invA >> 7;
314a37d08382a16717cde52c3d2687b021c5413464mtklein    SkASSERT(invA < 256);  // We've should have already handled alpha == 0 externally.
324a37d08382a16717cde52c3d2687b021c5413464mtklein
334a37d08382a16717cde52c3d2687b021c5413464mtklein    Sk16h colorHighAndRound = Sk4px::DupPMColor(color).widenHi() + Sk16h(128);
344a37d08382a16717cde52c3d2687b021c5413464mtklein    Sk16b invA_16x(invA);
354a37d08382a16717cde52c3d2687b021c5413464mtklein
364a37d08382a16717cde52c3d2687b021c5413464mtklein    Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px {
374a37d08382a16717cde52c3d2687b021c5413464mtklein        return (src4 * invA_16x).addNarrowHi(colorHighAndRound);
384a37d08382a16717cde52c3d2687b021c5413464mtklein    });
394a37d08382a16717cde52c3d2687b021c5413464mtklein}
404a37d08382a16717cde52c3d2687b021c5413464mtklein
41a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin#if defined(SK_ARM_HAS_NEON)
42a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin
43a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// Return a uint8x8_t value, r, computed as r[i] = SkMulDiv255Round(x[i], y[i]), where r[i], x[i],
44a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// y[i] are the i-th lanes of the corresponding NEON vectors.
45a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchinstatic inline uint8x8_t SkMulDiv255Round_neon8(uint8x8_t x, uint8x8_t y) {
46a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    uint16x8_t prod = vmull_u8(x, y);
47a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    return vraddhn_u16(prod, vrshrq_n_u16(prod, 8));
48a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin}
49a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin
// The implementations of SkPMSrcOver below perform alpha blending consistently with
// SkMulDiv255Round. They compute the color components (numbers in the interval [0, 255]) as:
//
//   result_i = src_i + rint(g(src_alpha, dst_i))
//
// where g(x, y) = ((255.0 - x) * y) / 255.0 and rint rounds to the nearest integer.
56a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin
57a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// In this variant of SkPMSrcOver each NEON register, dst.val[i], src.val[i], contains the value
58a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// of the same color component for 8 consecutive pixels. The result of this function follows the
59a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// same convention.
60a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchinstatic inline uint8x8x4_t SkPMSrcOver_neon8(uint8x8x4_t dst, uint8x8x4_t src) {
61a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    uint8x8_t nalphas = vmvn_u8(src.val[3]);
62a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    uint8x8x4_t result;
63a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    result.val[0] = vadd_u8(src.val[0], SkMulDiv255Round_neon8(nalphas,  dst.val[0]));
64a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    result.val[1] = vadd_u8(src.val[1], SkMulDiv255Round_neon8(nalphas,  dst.val[1]));
65a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    result.val[2] = vadd_u8(src.val[2], SkMulDiv255Round_neon8(nalphas,  dst.val[2]));
66a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    result.val[3] = vadd_u8(src.val[3], SkMulDiv255Round_neon8(nalphas,  dst.val[3]));
67a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    return result;
68a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin}
69a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin
70a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// In this variant of SkPMSrcOver dst and src contain the color components of two consecutive
71a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin// pixels. The return value follows the same convention.
72a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchinstatic inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
73a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    const uint8x8_t alpha_indices = vcreate_u8(0x0707070703030303);
74a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    uint8x8_t nalphas = vmvn_u8(vtbl1_u8(src, alpha_indices));
75a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    return vadd_u8(src, SkMulDiv255Round_neon8(nalphas, dst));
76a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin}
77a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin
78a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin#endif
79a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin
80cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline
81b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtkleinvoid blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
82b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein    SkASSERT(alpha == 0xFF);
83b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein    sk_msan_assert_initialized(src, src+len);
84b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
85b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
86b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein    while (len >= 16) {
87b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        // Load 16 source pixels.
88b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
89b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein             s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
90b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein             s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
91b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein             s3 = _mm_loadu_si128((const __m128i*)(src) + 3);
92b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
93b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        const auto alphaMask = _mm_set1_epi32(0xFF000000);
94b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
95b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
96b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        if (_mm_testz_si128(ORed, alphaMask)) {
97b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            // All 16 source pixels are transparent.  Nothing to do.
98b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            src += 16;
99b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            dst += 16;
100b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            len -= 16;
101b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            continue;
102b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        }
103b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
104b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        auto d0 = (__m128i*)(dst) + 0,
105b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein             d1 = (__m128i*)(dst) + 1,
106b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein             d2 = (__m128i*)(dst) + 2,
107b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein             d3 = (__m128i*)(dst) + 3;
108b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
109b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
110b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        if (_mm_testc_si128(ANDed, alphaMask)) {
111b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            // All 16 source pixels are opaque.  SrcOver becomes Src.
112b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            _mm_storeu_si128(d0, s0);
113b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            _mm_storeu_si128(d1, s1);
114b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            _mm_storeu_si128(d2, s2);
115b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            _mm_storeu_si128(d3, s3);
116b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            src += 16;
117b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            dst += 16;
118b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            len -= 16;
119b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            continue;
120b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        }
121b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
122b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        // TODO: This math is wrong.
123b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        // Do SrcOver.
124b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
125b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
126b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
127b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
128b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        src += 16;
129b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        dst += 16;
130b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        len -= 16;
131b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein    }
132b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
133b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
134b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein    while (len >= 16) {
135b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        // Load 16 source pixels.
136b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
137b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein             s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
138b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein             s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
139b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein             s3 = _mm_loadu_si128((const __m128i*)(src) + 3);
140b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
141b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        const auto alphaMask = _mm_set1_epi32(0xFF000000);
142b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
143b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
144b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask),
145b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein                                                       _mm_setzero_si128()))) {
146b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            // All 16 source pixels are transparent.  Nothing to do.
147b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            src += 16;
148b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            dst += 16;
149b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            len -= 16;
150b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            continue;
151b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        }
152b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
153b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        auto d0 = (__m128i*)(dst) + 0,
154b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein             d1 = (__m128i*)(dst) + 1,
155b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein             d2 = (__m128i*)(dst) + 2,
156b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein             d3 = (__m128i*)(dst) + 3;
157b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
158b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
159b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask),
160b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein                                                       alphaMask))) {
161b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            // All 16 source pixels are opaque.  SrcOver becomes Src.
162b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            _mm_storeu_si128(d0, s0);
163b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            _mm_storeu_si128(d1, s1);
164b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            _mm_storeu_si128(d2, s2);
165b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            _mm_storeu_si128(d3, s3);
166b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            src += 16;
167b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            dst += 16;
168b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            len -= 16;
169b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            continue;
170b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        }
171b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
172b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        // TODO: This math is wrong.
173b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        // Do SrcOver.
174b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
175b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
176b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
177b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
178b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
179b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        src += 16;
180b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        dst += 16;
181b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        len -= 16;
182b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein    }
183b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
184b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein#elif defined(SK_ARM_HAS_NEON)
185a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    // Do 8-pixels at a time. A 16-pixels at a time version of this code was also tested, but it
186a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    // underperformed on some of the platforms under test for inputs with frequent transitions of
187a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    // alpha (corresponding to changes of the conditions [~]alpha_u64 == 0 below). It may be worth
188a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    // revisiting the situation in the future.
189a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    while (len >= 8) {
190a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        // Load 8 pixels in 4 NEON registers. src_col.val[i] will contain the same color component
191a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        // for 8 consecutive pixels (e.g. src_col.val[3] will contain all alpha components of 8
192a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        // pixels).
193a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        uint8x8x4_t src_col = vld4_u8(reinterpret_cast<const uint8_t*>(src));
194a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        src += 8;
195a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        len -= 8;
196a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin
197a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        // We now detect 2 special cases: the first occurs when all alphas are zero (the 8 pixels
198a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        // are all transparent), the second when all alphas are fully set (they are all opaque).
199a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        uint8x8_t alphas = src_col.val[3];
200a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        uint64_t alphas_u64 = vget_lane_u64(vreinterpret_u64_u8(alphas), 0);
201a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        if (alphas_u64 == 0) {
202a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin            // All pixels transparent.
203a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin            dst += 8;
204b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            continue;
205b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        }
206b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
207a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        if (~alphas_u64 == 0) {
208a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin            // All pixels opaque.
209a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin            vst4_u8(reinterpret_cast<uint8_t*>(dst), src_col);
210a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin            dst += 8;
211b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            continue;
212b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        }
213b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
214a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        uint8x8x4_t dst_col = vld4_u8(reinterpret_cast<uint8_t*>(dst));
215a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        vst4_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon8(dst_col, src_col));
216a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        dst += 8;
217a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    }
218b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
219a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    // Deal with leftover pixels.
220a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    for (; len >= 2; len -= 2, src += 2, dst += 2) {
221a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        uint8x8_t src2 = vld1_u8(reinterpret_cast<const uint8_t*>(src));
222a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        uint8x8_t dst2 = vld1_u8(reinterpret_cast<const uint8_t*>(dst));
223a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        vst1_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon2(dst2, src2));
224a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    }
225b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
226a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    if (len != 0) {
227a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8(*dst), vcreate_u8(*src));
228a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin        vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0);
229b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein    }
230a132c3869fcffb350d7a5ca7256496ab977bdd0cMatteo Franchin    return;
231b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein#endif
232b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
233b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein    while (len-- > 0) {
2343e3181263c034db3ba657c35cce9ae29c411b2b6mtklein        // This 0xFF000000 is not semantically necessary, but for compatibility
2353e3181263c034db3ba657c35cce9ae29c411b2b6mtklein        // with chromium:611002 we need to keep it until we figure out where
2363e3181263c034db3ba657c35cce9ae29c411b2b6mtklein        // the non-premultiplied src values (like 0x00FFFFFF) are coming from.
2373e3181263c034db3ba657c35cce9ae29c411b2b6mtklein        // TODO(mtklein): sort this out and assert *src is premul here.
2383e3181263c034db3ba657c35cce9ae29c411b2b6mtklein        if (*src & 0xFF000000) {
239b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein            *dst = (*src >= 0xFF000000) ? *src : SkPMSrcOver(*src, *dst);
240b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        }
241b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        src++;
242b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein        dst++;
243b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein    }
244b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein}
245b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein
2464a37d08382a16717cde52c3d2687b021c5413464mtklein}  // SK_OPTS_NS
2474a37d08382a16717cde52c3d2687b021c5413464mtklein
2484a37d08382a16717cde52c3d2687b021c5413464mtklein#endif//SkBlitRow_opts_DEFINED
249