1/*
2 * Copyright 2015 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#ifndef SkBlitRow_opts_DEFINED
9#define SkBlitRow_opts_DEFINED
10
11#include "Sk4px.h"
12#include "SkColorData.h"
13#include "SkMSAN.h"
14
15#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
16    #include "SkColor_opts_SSE2.h"
17    #include <immintrin.h>
18#endif
19
20namespace SK_OPTS_NS {
21
22// Color32 uses the blend_256_round_alt algorithm from tests/BlendTest.cpp.
23// It's not quite perfect, but it's never wrong in the interesting edge cases,
24// and it's quite a bit faster than blend_perfect.
25//
26// blend_256_round_alt is our currently blessed algorithm.  Please use it or an analogous one.
27static inline
28void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
29    unsigned invA = 255 - SkGetPackedA32(color);
30    invA += invA >> 7;
31    SkASSERT(invA < 256);  // We've should have already handled alpha == 0 externally.
32
33    Sk16h colorHighAndRound = Sk4px::DupPMColor(color).widenHi() + Sk16h(128);
34    Sk16b invA_16x(invA);
35
36    Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px {
37        return (src4 * invA_16x).addNarrowHi(colorHighAndRound);
38    });
39}
40
41#if defined(SK_ARM_HAS_NEON)
42
// Lane-wise SkMulDiv255Round: returns r with r[i] = SkMulDiv255Round(x[i], y[i]) for each of
// the 8 byte lanes of the NEON vectors x and y.
static inline uint8x8_t SkMulDiv255Round_neon8(uint8x8_t x, uint8x8_t y) {
    // Full 16-bit products, one per lane.
    const uint16x8_t product = vmull_u8(x, y);
    // (product + 128) >> 8, i.e. the product divided by 256 with rounding.
    const uint16x8_t productOver256 = vrshrq_n_u16(product, 8);
    // Rounding add + narrow to the high byte: (product + productOver256 + 128) >> 8.
    return vraddhn_u16(product, productOver256);
}
49
50// The implementations of SkPMSrcOver below perform alpha blending consistently with
51// SkMulDiv255Round. They compute the color components (numbers in the interval [0, 255]) as:
52//
53//   result_i = src_i + rint(g(src_alpha, dst_i))
54//
55// where g(x, y) = ((255.0 - x) * y) / 255.0 and rint rounds to the nearest integer.
56
// Planar variant of SkPMSrcOver working on 8 pixels at once. Each register dst.val[i] /
// src.val[i] holds one color component for 8 consecutive pixels, with the alphas in val[3].
// The returned value uses the same planar layout.
static inline uint8x8x4_t SkPMSrcOver_neon8(uint8x8x4_t dst, uint8x8x4_t src) {
    // Bitwise NOT of a byte is 255 - byte, so this is the inverse source alpha per pixel.
    const uint8x8_t invSrcAlpha = vmvn_u8(src.val[3]);

    uint8x8x4_t blended;
    // Every plane (alpha included) gets the same treatment: src + dst * (255 - srcA) / 255.
    for (int plane = 0; plane < 4; ++plane) {
        const uint8x8_t scaledDst = SkMulDiv255Round_neon8(invSrcAlpha, dst.val[plane]);
        blended.val[plane] = vadd_u8(src.val[plane], scaledDst);
    }
    return blended;
}
69
// Interleaved variant of SkPMSrcOver working on 2 pixels at once. dst and src each hold the
// 8 color-component bytes of two consecutive pixels; the result uses the same layout.
static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
    // Table indices that replicate byte 3 across lanes 0-3 and byte 7 across lanes 4-7,
    // i.e. broadcast each pixel's alpha byte over that pixel's four components.
    const uint8x8_t alphaLanes = vcreate_u8(0x0707070703030303);
    const uint8x8_t srcAlphas = vtbl1_u8(src, alphaLanes);

    // Bitwise NOT of a byte is 255 - byte.
    const uint8x8_t invSrcAlphas = vmvn_u8(srcAlphas);
    const uint8x8_t scaledDst = SkMulDiv255Round_neon8(invSrcAlphas, dst);
    return vadd_u8(src, scaledDst);
}
77
78#endif
79
80/*not static*/ inline
81void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
82    SkASSERT(alpha == 0xFF);
83    sk_msan_assert_initialized(src, src+len);
84
85#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
86    while (len >= 16) {
87        // Load 16 source pixels.
88        auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
89             s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
90             s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
91             s3 = _mm_loadu_si128((const __m128i*)(src) + 3);
92
93        const auto alphaMask = _mm_set1_epi32(0xFF000000);
94
95        auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
96        if (_mm_testz_si128(ORed, alphaMask)) {
97            // All 16 source pixels are transparent.  Nothing to do.
98            src += 16;
99            dst += 16;
100            len -= 16;
101            continue;
102        }
103
104        auto d0 = (__m128i*)(dst) + 0,
105             d1 = (__m128i*)(dst) + 1,
106             d2 = (__m128i*)(dst) + 2,
107             d3 = (__m128i*)(dst) + 3;
108
109        auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
110        if (_mm_testc_si128(ANDed, alphaMask)) {
111            // All 16 source pixels are opaque.  SrcOver becomes Src.
112            _mm_storeu_si128(d0, s0);
113            _mm_storeu_si128(d1, s1);
114            _mm_storeu_si128(d2, s2);
115            _mm_storeu_si128(d3, s3);
116            src += 16;
117            dst += 16;
118            len -= 16;
119            continue;
120        }
121
122        // TODO: This math is wrong.
123        // Do SrcOver.
124        _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
125        _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
126        _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
127        _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
128        src += 16;
129        dst += 16;
130        len -= 16;
131    }
132
133#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
134    while (len >= 16) {
135        // Load 16 source pixels.
136        auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
137             s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
138             s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
139             s3 = _mm_loadu_si128((const __m128i*)(src) + 3);
140
141        const auto alphaMask = _mm_set1_epi32(0xFF000000);
142
143        auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
144        if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask),
145                                                       _mm_setzero_si128()))) {
146            // All 16 source pixels are transparent.  Nothing to do.
147            src += 16;
148            dst += 16;
149            len -= 16;
150            continue;
151        }
152
153        auto d0 = (__m128i*)(dst) + 0,
154             d1 = (__m128i*)(dst) + 1,
155             d2 = (__m128i*)(dst) + 2,
156             d3 = (__m128i*)(dst) + 3;
157
158        auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
159        if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask),
160                                                       alphaMask))) {
161            // All 16 source pixels are opaque.  SrcOver becomes Src.
162            _mm_storeu_si128(d0, s0);
163            _mm_storeu_si128(d1, s1);
164            _mm_storeu_si128(d2, s2);
165            _mm_storeu_si128(d3, s3);
166            src += 16;
167            dst += 16;
168            len -= 16;
169            continue;
170        }
171
172        // TODO: This math is wrong.
173        // Do SrcOver.
174        _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
175        _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
176        _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
177        _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
178
179        src += 16;
180        dst += 16;
181        len -= 16;
182    }
183
184#elif defined(SK_ARM_HAS_NEON)
185    // Do 8-pixels at a time. A 16-pixels at a time version of this code was also tested, but it
186    // underperformed on some of the platforms under test for inputs with frequent transitions of
187    // alpha (corresponding to changes of the conditions [~]alpha_u64 == 0 below). It may be worth
188    // revisiting the situation in the future.
189    while (len >= 8) {
190        // Load 8 pixels in 4 NEON registers. src_col.val[i] will contain the same color component
191        // for 8 consecutive pixels (e.g. src_col.val[3] will contain all alpha components of 8
192        // pixels).
193        uint8x8x4_t src_col = vld4_u8(reinterpret_cast<const uint8_t*>(src));
194        src += 8;
195        len -= 8;
196
197        // We now detect 2 special cases: the first occurs when all alphas are zero (the 8 pixels
198        // are all transparent), the second when all alphas are fully set (they are all opaque).
199        uint8x8_t alphas = src_col.val[3];
200        uint64_t alphas_u64 = vget_lane_u64(vreinterpret_u64_u8(alphas), 0);
201        if (alphas_u64 == 0) {
202            // All pixels transparent.
203            dst += 8;
204            continue;
205        }
206
207        if (~alphas_u64 == 0) {
208            // All pixels opaque.
209            vst4_u8(reinterpret_cast<uint8_t*>(dst), src_col);
210            dst += 8;
211            continue;
212        }
213
214        uint8x8x4_t dst_col = vld4_u8(reinterpret_cast<uint8_t*>(dst));
215        vst4_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon8(dst_col, src_col));
216        dst += 8;
217    }
218
219    // Deal with leftover pixels.
220    for (; len >= 2; len -= 2, src += 2, dst += 2) {
221        uint8x8_t src2 = vld1_u8(reinterpret_cast<const uint8_t*>(src));
222        uint8x8_t dst2 = vld1_u8(reinterpret_cast<const uint8_t*>(dst));
223        vst1_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon2(dst2, src2));
224    }
225
226    if (len != 0) {
227        uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8(*dst), vcreate_u8(*src));
228        vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0);
229    }
230    return;
231#endif
232
233    while (len-- > 0) {
234        // This 0xFF000000 is not semantically necessary, but for compatibility
235        // with chromium:611002 we need to keep it until we figure out where
236        // the non-premultiplied src values (like 0x00FFFFFF) are coming from.
237        // TODO(mtklein): sort this out and assert *src is premul here.
238        if (*src & 0xFF000000) {
239            *dst = (*src >= 0xFF000000) ? *src : SkPMSrcOver(*src, *dst);
240        }
241        src++;
242        dst++;
243    }
244}
245
246}  // SK_OPTS_NS
247
248#endif//SkBlitRow_opts_DEFINED
249