// SkNx_sse.h, revision e4c0beed744d09dae4757c1893d8caa64ee09cd2
/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkNx_sse_DEFINED
#define SkNx_sse_DEFINED

// This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent.
// If you do, make sure this is in a static inline function... anywhere else risks violating ODR.
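//
// For example, a newer instruction would be guarded roughly like this (an illustrative
// sketch only -- the helper name and the SSE4.1 branch are hypothetical, not used below):
//
//     static inline __m128i blend_example(__m128i a, __m128i b, __m128i mask) {
//     #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
//         return _mm_blendv_epi8(a, b, mask);              // SSE4.1 byte-wise select
//     #else
//         return _mm_or_si128(_mm_and_si128   (mask, b),   // SSE2 fallback select
//                             _mm_andnot_si128(mask, a));
//     #endif
//     }
//
// Because it's static inline, each translation unit gets its own copy, so files compiled
// at different SSE levels can't hand the linker conflicting definitions (the ODR risk above).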

#define SKNX_IS_FAST

template <>
class SkNx<2, float> {
public:
    SkNx(const __m128& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(float val) : fVec(_mm_set1_ps(val)) {}
    static SkNx Load(const void* ptr) {
        return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)ptr));
    }
    SkNx(float a, float b) : fVec(_mm_setr_ps(a,b,0,0)) {}

    void store(void* ptr) const { _mm_storel_pi((__m64*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
    SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }

    SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
    SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
    SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
    SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
    SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
    SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }

    static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
    static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }

    SkNx  sqrt () const { return _mm_sqrt_ps (fVec);  }
    SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); }
    SkNx rsqrt1() const { return this->rsqrt0(); }
    SkNx rsqrt2() const { return this->rsqrt1(); }

    SkNx       invert() const { return SkNx(1) / *this; }
    SkNx approxInvert() const { return _mm_rcp_ps(fVec); }

    float operator[](int k) const {
        SkASSERT(0 <= k && k < 2);
        union { __m128 v; float fs[4]; } pun = {fVec};
        return pun.fs[k&1];
    }
    template <int k> float kth() const { return (*this)[k]; }

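    // A comparison result is all-1 or all-0 bits in each lane, and _mm_movemask_epi8 gathers
    // the top bit of all 16 bytes.  Masking with 0xff keeps just the low 8 bytes, i.e. our
    // two floats: 0xff means both lanes were true, any set bit means at least one was.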
    bool allTrue() const { return 0xff == (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); }
    bool anyTrue() const { return 0x00 != (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); }

    __m128 fVec;
};

template <>
class SkNx<4, float> {
public:
    SkNx(const __m128& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(float val)           : fVec( _mm_set1_ps(val) ) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr); }

    SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {}

    void store(void* ptr) const { _mm_storeu_ps((float*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
    SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }

    SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
    SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
    SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
    SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
    SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
    SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }

    static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
    static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }

    SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); }

    SkNx  sqrt () const { return _mm_sqrt_ps (fVec);  }
    SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); }
    SkNx rsqrt1() const { return this->rsqrt0(); }
    SkNx rsqrt2() const { return this->rsqrt1(); }

    SkNx       invert() const { return SkNx(1) / *this; }
    SkNx approxInvert() const { return _mm_rcp_ps(fVec); }

    float operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128 v; float fs[4]; } pun = {fVec};
        return pun.fs[k&3];
    }
    template <int k> float kth() const { return (*this)[k]; }

    bool allTrue() const { return 0xffff == _mm_movemask_epi8(_mm_castps_si128(fVec)); }
    bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(fVec)); }

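    // Bitwise select: fVec is expected to hold comparison results (all-1 or all-0 bits per
    // lane), so (mask & t) | (~mask & e) returns t where the mask is true and e elsewhere.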
    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_ps(_mm_and_ps   (fVec, t.fVec),
                         _mm_andnot_ps(fVec, e.fVec));
    }

    __m128 fVec;
};

template <>
class SkNx<4, uint16_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
    SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec(_mm_setr_epi16(a,b,c,d,0,0,0,0)) {}

    void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }

    SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
    SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }

    uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128i v; uint16_t us[8]; } pun = {fVec};
        return pun.us[k&3];
    }
    template <int k> uint16_t kth() const { return (*this)[k]; }

    __m128i fVec;
};

template <>
class SkNx<8, uint16_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
         uint16_t e, uint16_t f, uint16_t g, uint16_t h) : fVec(_mm_setr_epi16(a,b,c,d,e,f,g,h)) {}

    void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }

    SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
    SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }

    static SkNx Min(const SkNx& a, const SkNx& b) {
        // No unsigned _mm_min_epu16, so we'll shift into a space where we can use the
        // signed version, _mm_min_epi16, then shift back.
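        // Adding or subtracting 0x8000 in each lane just flips the top bit, which maps
        // unsigned ordering onto signed ordering.  The byte-wise _mm_sub_epi8/_mm_add_epi8
        // below do exactly that: the bias's low byte is zero, so no carries cross bytes.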
        const uint16_t top = 0x8000; // Keep this separate from _mm_set1_epi16 or MSVC will whine.
        const __m128i top_8x = _mm_set1_epi16(top);
        return _mm_add_epi8(top_8x, _mm_min_epi16(_mm_sub_epi8(a.fVec, top_8x),
                                                  _mm_sub_epi8(b.fVec, top_8x)));
    }

    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    }

    uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 8);
        union { __m128i v; uint16_t us[8]; } pun = {fVec};
        return pun.us[k&7];
    }
    template <int k> uint16_t kth() const { return (*this)[k]; }

    __m128i fVec;
};

template <>
class SkNx<4, uint8_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
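    // The four bytes travel through a 32-bit scalar: _mm_cvtsi32_si128 places them in the
    // low 32 bits of the register (zeroing the rest), and _mm_cvtsi128_si32 reads them back.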
    static SkNx Load(const void* ptr) { return _mm_cvtsi32_si128(*(const int*)ptr); }
    void store(void* ptr) const { *(int*)ptr = _mm_cvtsi128_si32(fVec); }

    // TODO as needed

    __m128i fVec;
};

template <>
class SkNx<16, uint8_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
         uint8_t e, uint8_t f, uint8_t g, uint8_t h,
         uint8_t i, uint8_t j, uint8_t k, uint8_t l,
         uint8_t m, uint8_t n, uint8_t o, uint8_t p)
        : fVec(_mm_setr_epi8(a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p)) {}

    void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    SkNx saturatedAdd(const SkNx& o) const { return _mm_adds_epu8(fVec, o.fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi8(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi8(fVec, o.fVec); }

    static SkNx Min(const SkNx& a, const SkNx& b) { return _mm_min_epu8(a.fVec, b.fVec); }
    SkNx operator < (const SkNx& o) const {
        // There's no unsigned _mm_cmplt_epu8, so we flip the sign bits then use a signed compare.
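        // e.g. 0x01 < 0xff as unsigned; after flipping they read as signed -127 < 127,
        // so the ordering is preserved.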
        auto flip = _mm_set1_epi8(char(0x80));
        return _mm_cmplt_epi8(_mm_xor_si128(flip, fVec), _mm_xor_si128(flip, o.fVec));
    }

    uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 16);
        union { __m128i v; uint8_t us[16]; } pun = {fVec};
        return pun.us[k&15];
    }
    template <int k> uint8_t kth() const { return (*this)[k]; }

    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    }

    __m128i fVec;
};


template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
    auto _32 = _mm_cvttps_epi32(src.fVec);
    // Ideally we'd use _mm_packus_epi32 here.  But that's SSE4.1+.
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place.
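    // (_mm_shuffle_epi8 zeroes any destination byte whose control byte has its high bit
    //  set, so the ~0 entries clear the unused upper bytes.)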
    const int _ = ~0;
    return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_));
#else
    // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32:
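    // Bias the 32-bit lanes down by 0x8000 so values in [0, 65535] fall in the signed
    // 16-bit range, pack with signed saturation, then add the bias back (mod 2^16).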
    _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000));
    return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000));
#endif
}

template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
    auto _32 = _mm_cvttps_epi32(src.fVec);
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    const int _ = ~0;
    return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_));
#else
    auto _16 = _mm_packus_epi16(_32, _32);
    return     _mm_packus_epi16(_16, _16);
#endif
}

template<> /*static*/ inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    const int _ = ~0;
    auto _32 = _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));
#else
    auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()),
         _32 = _mm_unpacklo_epi16(_16,     _mm_setzero_si128());
#endif
    return _mm_cvtepi32_ps(_32);
}

template<> /*static*/ inline Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {
    auto _32 = _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());
    return _mm_cvtepi32_ps(_32);
}

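// Truncates each float vector to 32-bit ints and packs them, with unsigned saturation,
// down to 16 bytes laid out a,b,c,d in order.  Intended for values already in the
// usual [0,255] byte range.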
static inline void Sk4f_ToBytes(uint8_t bytes[16],
                                const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) {
    _mm_storeu_si128((__m128i*)bytes,
                     _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec),
                                                       _mm_cvttps_epi32(b.fVec)),
                                      _mm_packus_epi16(_mm_cvttps_epi32(c.fVec),
                                                       _mm_cvttps_epi32(d.fVec))));
}

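// Interleaving with zeros widens u8 -> u16 losslessly; the reverse cast packs back down
// with unsigned saturation (_mm_packus_epi16 clamps each lane to [0, 255]).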
template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {
    return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128());
}

template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
    return _mm_packus_epi16(src.fVec, src.fVec);
}

#endif//SkNx_sse_DEFINED