/*
 * Copyright 2014 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkColor_opts_SSE2_DEFINED
#define SkColor_opts_SSE2_DEFINED

#include <emmintrin.h>

#define ASSERT_EQ(a,b) SkASSERT(0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8((a), (b))))

// There is no packed 32-bit multiply in SSE2 (_mm_mullo_epi32 arrived with
// SSE4.1), so emulate it here.
// Multiplies 4 32-bit integers from a by 4 32-bit integers from b.
// Each of the 4 products must fit in a 32-bit integer, otherwise it overflows.
static inline __m128i Multiply32_SSE2(const __m128i& a, const __m128i& b) {
    // Calculate results of a0 * b0 and a2 * b2.
    __m128i r1 = _mm_mul_epu32(a, b);
    // Calculate results of a1 * b1 and a3 * b3.
    __m128i r2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
    // Move the low 32 bits of each 64-bit product into the low half of each
    // vector and interleave them back into lane order.
    __m128i r = _mm_unpacklo_epi32(_mm_shuffle_epi32(r1, _MM_SHUFFLE(0,0,2,0)),
                                   _mm_shuffle_epi32(r2, _MM_SHUFFLE(0,0,2,0)));
    return r;
}
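
// A minimal sanity sketch of what Multiply32_SSE2 computes, assuming a 32-bit
// unsigned as on the x86 targets this header serves (the helper name is
// illustrative, not Skia API): each 32-bit lane is multiplied independently
// and only the low 32 bits of the product are kept, i.e. the behavior of
// SSE4.1's _mm_mullo_epi32.
static inline bool Multiply32_matches_scalar_sketch(const unsigned a[4],
                                                    const unsigned b[4]) {
    __m128i va = _mm_loadu_si128(reinterpret_cast<const __m128i*>(a));
    __m128i vb = _mm_loadu_si128(reinterpret_cast<const __m128i*>(b));
    unsigned out[4];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(out), Multiply32_SSE2(va, vb));
    for (int i = 0; i < 4; i++) {
        if (out[i] != a[i] * b[i]) {  // scalar reference; unsigned math wraps to 32 bits
            return false;
        }
    }
    return true;
}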

static inline __m128i SkAlpha255To256_SSE2(const __m128i& alpha) {
    return _mm_add_epi32(alpha, _mm_set1_epi32(1));
}

// See #define SkAlphaMulAlpha(a, b)  SkMulDiv255Round(a, b) in SkXfermode.cpp.
static inline __m128i SkAlphaMulAlpha_SSE2(const __m128i& a,
                                           const __m128i& b) {
    __m128i prod = _mm_mullo_epi16(a, b);
    prod = _mm_add_epi32(prod, _mm_set1_epi32(128));
    prod = _mm_add_epi32(prod, _mm_srli_epi32(prod, 8));
    prod = _mm_srli_epi32(prod, 8);

    return prod;
}
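
// Scalar sketch of the per-lane rounding above; it mirrors the portable
// SkMulDiv255Round referenced in the comment (the helper name is illustrative,
// not Skia API).
static inline unsigned alpha_mul_alpha_lane_sketch(unsigned a, unsigned b) {
    unsigned prod = a * b + 128;
    return (prod + (prod >> 8)) >> 8;  // approximates (a * b) / 255 with rounding
}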

static const __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
static const __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);

// Portable version SkAlphaMulQ is in SkColorPriv.h.
static inline __m128i SkAlphaMulQ_SSE2(const __m128i& c, const __m128i& scale) {
    __m128i s = _mm_or_si128(_mm_slli_epi32(scale, 16), scale);

    // uint32_t rb = ((c & mask) * scale) >> 8
    __m128i rb = _mm_and_si128(rb_mask, c);
    rb = _mm_mullo_epi16(rb, s);
    rb = _mm_srli_epi16(rb, 8);

    // uint32_t ag = ((c >> 8) & mask) * scale
    __m128i ag = _mm_srli_epi16(c, 8);
    ASSERT_EQ(ag, _mm_and_si128(rb_mask, ag));  // ag = _mm_srli_epi16(c, 8) did this for us.
    ag = _mm_mullo_epi16(ag, s);

    // (rb & mask) | (ag & ~mask)
    ASSERT_EQ(rb, _mm_and_si128(rb_mask, rb));  // rb = _mm_srli_epi16(rb, 8) did this for us.
    ag = _mm_and_si128(ag_mask, ag);
    return _mm_or_si128(rb, ag);
}
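
// Scalar sketch of the same per-pixel scaling (compare the portable
// SkAlphaMulQ in SkColorPriv.h); the helper name is illustrative and the
// packed pixel is assumed to be a 32-bit unsigned.
static inline unsigned alpha_mul_q_sketch(unsigned c, unsigned scale) {
    const unsigned mask = 0x00FF00FF;
    unsigned rb = ((c & mask) * scale) >> 8;   // scale red and blue
    unsigned ag = ((c >> 8) & mask) * scale;   // scale alpha and green, left shifted by 8
    return (rb & mask) | (ag & ~mask);
}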

// Fast path for SkAlphaMulQ_SSE2 with a constant scale factor.
static inline __m128i SkAlphaMulQ_SSE2(const __m128i& c, const unsigned scale) {
    __m128i s = _mm_set1_epi16(scale << 8);  // Move scale factor to upper byte of word.

    // With mulhi, red and blue values are already in the right place and
    // don't need to be divided by 256.
    __m128i rb = _mm_and_si128(rb_mask, c);
    rb = _mm_mulhi_epu16(rb, s);

    __m128i ag = _mm_and_si128(ag_mask, c);
    ag = _mm_mulhi_epu16(ag, s);     // Alpha and green values are in the higher byte of each word.
    ag = _mm_and_si128(ag_mask, ag);

    return _mm_or_si128(rb, ag);
}
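
// The fast path above relies on a per-lane identity: as long as scale << 8
// still fits in 16 bits,
//     _mm_mulhi_epu16(x, scale << 8) == (x * (scale << 8)) >> 16
//                                    == (x * scale) >> 8,
// so the explicit shift right by 8 from the general path is folded into the
// multiply itself.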

static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) {
    __m128i a = _mm_slli_epi32(src, (24 - SK_A32_SHIFT));
    return _mm_srli_epi32(a, 24);
}

static inline __m128i SkGetPackedR32_SSE2(const __m128i& src) {
    __m128i r = _mm_slli_epi32(src, (24 - SK_R32_SHIFT));
    return _mm_srli_epi32(r, 24);
}

static inline __m128i SkGetPackedG32_SSE2(const __m128i& src) {
    __m128i g = _mm_slli_epi32(src, (24 - SK_G32_SHIFT));
    return _mm_srli_epi32(g, 24);
}

static inline __m128i SkGetPackedB32_SSE2(const __m128i& src) {
    __m128i b = _mm_slli_epi32(src, (24 - SK_B32_SHIFT));
    return _mm_srli_epi32(b, 24);
}

static inline __m128i SkMul16ShiftRound_SSE2(const __m128i& a,
                                             const __m128i& b, int shift) {
    __m128i prod = _mm_mullo_epi16(a, b);
    prod = _mm_add_epi16(prod, _mm_set1_epi16(1 << (shift - 1)));
    prod = _mm_add_epi16(prod, _mm_srli_epi16(prod, shift));
    prod = _mm_srli_epi16(prod, shift);

    return prod;
}
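
// Scalar sketch of one lane of the rounded multiply-and-shift above (the
// helper name is illustrative, not Skia API); with shift == 8 this is the
// usual rounded approximation of (a * b) / 255. In the vector code each
// intermediate lives in a 16-bit lane, so a * b must fit in 16 bits.
static inline unsigned mul16_shift_round_lane_sketch(unsigned a, unsigned b,
                                                     int shift) {
    unsigned prod = a * b + (1u << (shift - 1));
    return (prod + (prod >> shift)) >> shift;
}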

static inline __m128i SkPackRGB16_SSE2(const __m128i& r,
                                       const __m128i& g, const __m128i& b) {
    __m128i dr = _mm_slli_epi16(r, SK_R16_SHIFT);
    __m128i dg = _mm_slli_epi16(g, SK_G16_SHIFT);
    __m128i db = _mm_slli_epi16(b, SK_B16_SHIFT);

    __m128i c = _mm_or_si128(dr, dg);
    return _mm_or_si128(c, db);
}

static inline __m128i SkPackARGB32_SSE2(const __m128i& a, const __m128i& r,
                                        const __m128i& g, const __m128i& b) {
    __m128i da = _mm_slli_epi32(a, SK_A32_SHIFT);
    __m128i dr = _mm_slli_epi32(r, SK_R32_SHIFT);
    __m128i dg = _mm_slli_epi32(g, SK_G32_SHIFT);
    __m128i db = _mm_slli_epi32(b, SK_B32_SHIFT);

    __m128i c = _mm_or_si128(da, dr);
    c = _mm_or_si128(c, dg);
    return _mm_or_si128(c, db);
}

static inline __m128i SkPacked16ToR32_SSE2(const __m128i& src) {
    __m128i r = _mm_srli_epi32(src, SK_R16_SHIFT);
    r = _mm_and_si128(r, _mm_set1_epi32(SK_R16_MASK));
    r = _mm_or_si128(_mm_slli_epi32(r, (8 - SK_R16_BITS)),
                     _mm_srli_epi32(r, (2 * SK_R16_BITS - 8)));

    return r;
}

static inline __m128i SkPacked16ToG32_SSE2(const __m128i& src) {
    __m128i g = _mm_srli_epi32(src, SK_G16_SHIFT);
    g = _mm_and_si128(g, _mm_set1_epi32(SK_G16_MASK));
    g = _mm_or_si128(_mm_slli_epi32(g, (8 - SK_G16_BITS)),
                     _mm_srli_epi32(g, (2 * SK_G16_BITS - 8)));

    return g;
}

static inline __m128i SkPacked16ToB32_SSE2(const __m128i& src) {
    __m128i b = _mm_srli_epi32(src, SK_B16_SHIFT);
    b = _mm_and_si128(b, _mm_set1_epi32(SK_B16_MASK));
    b = _mm_or_si128(_mm_slli_epi32(b, (8 - SK_B16_BITS)),
                     _mm_srli_epi32(b, (2 * SK_B16_BITS - 8)));

    return b;
}
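
// The three helpers above widen a 5- or 6-bit channel to 8 bits by shifting it
// up and replicating its top bits into the low bits, e.g. for a 5-bit value v:
//     (v << 3) | (v >> 2),  so 0x1F -> 0xFF and 0x00 -> 0x00.
// This keeps both endpoints exact, which a plain multiply by a power of two
// would not.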

static inline __m128i SkPixel16ToPixel32_SSE2(const __m128i& src) {
    __m128i r = SkPacked16ToR32_SSE2(src);
    __m128i g = SkPacked16ToG32_SSE2(src);
    __m128i b = SkPacked16ToB32_SSE2(src);

    return SkPackARGB32_SSE2(_mm_set1_epi32(0xFF), r, g, b);
}

static inline __m128i SkPixel32ToPixel16_ToU16_SSE2(const __m128i& src_pixel1,
                                                    const __m128i& src_pixel2) {
    // Calculate result r.
    __m128i r1 = _mm_srli_epi32(src_pixel1,
                                SK_R32_SHIFT + (8 - SK_R16_BITS));
    r1 = _mm_and_si128(r1, _mm_set1_epi32(SK_R16_MASK));
    __m128i r2 = _mm_srli_epi32(src_pixel2,
                                SK_R32_SHIFT + (8 - SK_R16_BITS));
    r2 = _mm_and_si128(r2, _mm_set1_epi32(SK_R16_MASK));
    __m128i r = _mm_packs_epi32(r1, r2);

    // Calculate result g.
    __m128i g1 = _mm_srli_epi32(src_pixel1,
                                SK_G32_SHIFT + (8 - SK_G16_BITS));
    g1 = _mm_and_si128(g1, _mm_set1_epi32(SK_G16_MASK));
    __m128i g2 = _mm_srli_epi32(src_pixel2,
                                SK_G32_SHIFT + (8 - SK_G16_BITS));
    g2 = _mm_and_si128(g2, _mm_set1_epi32(SK_G16_MASK));
    __m128i g = _mm_packs_epi32(g1, g2);

    // Calculate result b.
    __m128i b1 = _mm_srli_epi32(src_pixel1,
                                SK_B32_SHIFT + (8 - SK_B16_BITS));
    b1 = _mm_and_si128(b1, _mm_set1_epi32(SK_B16_MASK));
    __m128i b2 = _mm_srli_epi32(src_pixel2,
                                SK_B32_SHIFT + (8 - SK_B16_BITS));
    b2 = _mm_and_si128(b2, _mm_set1_epi32(SK_B16_MASK));
    __m128i b = _mm_packs_epi32(b1, b2);

    // Pack the 8 16-bit colors into one vector and return it.
    __m128i d_pixel = SkPackRGB16_SSE2(r, g, b);

    return d_pixel;
}

// Portable version SkBlendARGB32 is in SkColorPriv.h.
static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,
                                         const __m128i& aa) {
    __m128i src_scale = SkAlpha255To256_SSE2(aa);
    // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))
    __m128i dst_scale = SkGetPackedA32_SSE2(src);
    dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
    dst_scale = _mm_srli_epi16(dst_scale, 8);
    dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);

    __m128i result = SkAlphaMulQ_SSE2(src, src_scale);
    return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));
}
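
// Scalar sketch of the blend above for one pixel, phrased with the portable
// helpers from SkColorPriv.h (assumed to be included before this header, as
// the SK_*_SHIFT macros above already require); the helper name is
// illustrative, not Skia API.
static inline SkPMColor blend_argb32_pixel_sketch(SkPMColor src, SkPMColor dst,
                                                  unsigned aa) {
    unsigned src_scale = SkAlpha255To256(aa);
    unsigned dst_scale = 256 - ((SkGetPackedA32(src) * src_scale) >> 8);
    return SkAlphaMulQ(src, src_scale) + SkAlphaMulQ(dst, dst_scale);
}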

// Fast path for SkBlendARGB32_SSE2 with a constant alpha factor.
static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, const __m128i& dst,
                                         const unsigned aa) {
    unsigned alpha = SkAlpha255To256(aa);
    __m128i src_scale = _mm_set1_epi32(alpha);
    // SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale))
    __m128i dst_scale = SkGetPackedA32_SSE2(src);
    dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
    dst_scale = _mm_srli_epi16(dst_scale, 8);
    dst_scale = _mm_sub_epi32(_mm_set1_epi32(256), dst_scale);

    __m128i result = SkAlphaMulQ_SSE2(src, alpha);
    return _mm_add_epi8(result, SkAlphaMulQ_SSE2(dst, dst_scale));
}

#undef ASSERT_EQ
#endif // SkColor_opts_SSE2_DEFINED