SkNx_sse.h revision 7c249e531900929c2fe2cdde76619fa6d2538c49
/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkNx_sse_DEFINED
#define SkNx_sse_DEFINED

// This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent.
// If you do, make sure this is in a static inline function... anywhere else risks violating ODR.

#define SKNX_IS_FAST

// SSE 4.1 has _mm_floor_ps to floor 4 floats.  We emulate it:
//   - roundtrip through integers via truncation
//   - subtract 1 if that's too big (possible for negative values).
// This restricts the domain of our inputs to a maximum somewhere around 2^31.  Seems plenty big.
static inline __m128 sse2_mm_floor_ps(__m128 v) {
    __m128 roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
    __m128 too_big = _mm_cmpgt_ps(roundtrip, v);
    return _mm_sub_ps(roundtrip, _mm_and_ps(too_big, _mm_set1_ps(1.0f)));
}
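// For intuition, a worked example of the correction step: for v = -1.25, truncation
// gives -1, so roundtrip = -1.0f, which is greater than v; too_big is then all-ones
// for that lane and we subtract 1.0f, yielding floor(-1.25) = -2.0f.  For v = +1.25,
// roundtrip (1.0f) is not greater than v, so nothing is subtracted.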

template <>
class SkNx<2, float> {
public:
    SkNx(const __m128& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(float val) : fVec(_mm_set1_ps(val)) {}
    static SkNx Load(const void* ptr) {
        return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)ptr));
    }
    SkNx(float a, float b) : fVec(_mm_setr_ps(a,b,0,0)) {}

    void store(void* ptr) const { _mm_storel_pi((__m64*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
    SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }

    SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
    SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
    SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
    SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
    SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
    SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }

    static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
    static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }

    SkNx  sqrt() const { return _mm_sqrt_ps (fVec); }
    SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); }
    SkNx rsqrt1() const { return this->rsqrt0(); }
    SkNx rsqrt2() const { return this->rsqrt1(); }

    SkNx       invert() const { return SkNx(1) / *this; }
    SkNx approxInvert() const { return _mm_rcp_ps(fVec); }

    float operator[](int k) const {
        SkASSERT(0 <= k && k < 2);
        union { __m128 v; float fs[4]; } pun = {fVec};
        return pun.fs[k&1];
    }

    bool allTrue() const { return 0xff == (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); }
    bool anyTrue() const { return 0x00 != (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); }

    __m128 fVec;
};

template <>
class SkNx<4, float> {
public:
    SkNx(const __m128& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(float val) : fVec( _mm_set1_ps(val) ) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr); }

    SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {}

    void store(void* ptr) const { _mm_storeu_ps((float*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
    SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }

    SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
    SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
    SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
    SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
    SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
    SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }

    static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
    static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }

    SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); }
    SkNx floor() const { return sse2_mm_floor_ps(fVec); }

    SkNx  sqrt() const { return _mm_sqrt_ps (fVec); }
    SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); }
    SkNx rsqrt1() const { return this->rsqrt0(); }
    SkNx rsqrt2() const { return this->rsqrt1(); }

    SkNx       invert() const { return SkNx(1) / *this; }
    SkNx approxInvert() const { return _mm_rcp_ps(fVec); }

    float operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128 v; float fs[4]; } pun = {fVec};
        return pun.fs[k&3];
    }

    bool allTrue() const { return 0xffff == _mm_movemask_epi8(_mm_castps_si128(fVec)); }
    bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(fVec)); }

    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_ps(_mm_and_ps   (fVec, t.fVec),
                         _mm_andnot_ps(fVec, e.fVec));
    }

    __m128 fVec;
};

template <>
class SkNx<4, int> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(int val) : fVec(_mm_set1_epi32(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    SkNx(int a, int b, int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {}

    void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const {
        // SSE2 has no 32x32 -> 32 multiply, so multiply the even and odd lanes as
        // 64-bit products with _mm_mul_epu32, then interleave their low 32 bits.
        __m128i mul20 = _mm_mul_epu32(fVec, o.fVec),
                mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4));
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)),
                                  _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
    }

    SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
    SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); }

    int operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128i v; int is[4]; } pun = {fVec};
        return pun.is[k&3];
    }

    __m128i fVec;
};

template <>
class SkNx<4, uint16_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
    SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec(_mm_setr_epi16(a,b,c,d,0,0,0,0)) {}

    void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }

    SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
    SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }

    uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128i v; uint16_t us[8]; } pun = {fVec};
        return pun.us[k&3];
    }

    __m128i fVec;
};

template <>
class SkNx<8, uint16_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
         uint16_t e, uint16_t f, uint16_t g, uint16_t h) : fVec(_mm_setr_epi16(a,b,c,d,e,f,g,h)) {}

    void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }

    SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
    SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }

    static SkNx Min(const SkNx& a, const SkNx& b) {
        // No unsigned _mm_min_epu16, so we'll shift into a space where we can use the
        // signed version, _mm_min_epi16, then shift back.
        const uint16_t top = 0x8000; // Keep this separate from _mm_set1_epi16 or MSVC will whine.
        const __m128i top_8x = _mm_set1_epi16(top);
        return _mm_add_epi8(top_8x, _mm_min_epi16(_mm_sub_epi8(a.fVec, top_8x),
                                                  _mm_sub_epi8(b.fVec, top_8x)));
    }
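    // (A note on Min above: as scalars, min_u16(a,b) == min_i16(a-0x8000, b-0x8000) + 0x8000
    // with arithmetic mod 2^16, i.e. biasing by 0x8000 maps unsigned order onto signed order.
    // The byte-wide _mm_sub_epi8/_mm_add_epi8 safely stand in for 16-bit ops here because
    // the low byte of the 0x8000 bias is zero, so no borrow or carry crosses a byte lane.)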

    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    }

    uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 8);
        union { __m128i v; uint16_t us[8]; } pun = {fVec};
        return pun.us[k&7];
    }

    __m128i fVec;
};

template <>
class SkNx<4, uint8_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    static SkNx Load(const void* ptr) { return _mm_cvtsi32_si128(*(const int*)ptr); }
    void store(void* ptr) const { *(int*)ptr = _mm_cvtsi128_si32(fVec); }

    // TODO as needed

    __m128i fVec;
};

template <>
class SkNx<16, uint8_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
         uint8_t e, uint8_t f, uint8_t g, uint8_t h,
         uint8_t i, uint8_t j, uint8_t k, uint8_t l,
         uint8_t m, uint8_t n, uint8_t o, uint8_t p)
        : fVec(_mm_setr_epi8(a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p)) {}

    void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    SkNx saturatedAdd(const SkNx& o) const { return _mm_adds_epu8(fVec, o.fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi8(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi8(fVec, o.fVec); }

    static SkNx Min(const SkNx& a, const SkNx& b) { return _mm_min_epu8(a.fVec, b.fVec); }
    SkNx operator < (const SkNx& o) const {
        // There's no unsigned _mm_cmplt_epu8, so we flip the sign bits then use a signed compare.
        auto flip = _mm_set1_epi8(char(0x80));
        return _mm_cmplt_epi8(_mm_xor_si128(flip, fVec), _mm_xor_si128(flip, o.fVec));
    }

    uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 16);
        union { __m128i v; uint8_t us[16]; } pun = {fVec};
        return pun.us[k&15];
    }

    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    }

    __m128i fVec;
};

template<> /*static*/ inline Sk4f SkNx_cast<float, int>(const Sk4i& src) {
    return _mm_cvtepi32_ps(src.fVec);
}

template <> /*static*/ inline Sk4i SkNx_cast<int, float>(const Sk4f& src) {
    return _mm_cvttps_epi32(src.fVec);
}

template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
    auto _32 = _mm_cvttps_epi32(src.fVec);
    // Ideally we'd use _mm_packus_epi32 here.  But that's SSE4.1+.
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place.
    const int _ = ~0;
    return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_));
#else
    // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32:
    _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000));
    return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000));
#endif
}
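// A scalar sanity check of the SSE2 bias trick above, packing the value 40000:
//   40000 - 0x8000 = 7232, which fits in int16, so _mm_packs_epi32 keeps it intact;
//   7232 + 0x8000 then wraps back to 40000 when read as a uint16_t.
// Without the bias, _mm_packs_epi32 would clamp 40000 to 32767.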

template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
    auto _32 = _mm_cvttps_epi32(src.fVec);
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    const int _ = ~0;
    return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_));
#else
    auto _16 = _mm_packus_epi16(_32, _32);
    return _mm_packus_epi16(_16, _16);
#endif
}

template<> /*static*/ inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    const int _ = ~0;
    auto _32 = _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));
#else
    auto _16 = _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128()),
         _32 = _mm_unpacklo_epi16(_16, _mm_setzero_si128());
#endif
    return _mm_cvtepi32_ps(_32);
}

template<> /*static*/ inline Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {
    auto _32 = _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());
    return _mm_cvtepi32_ps(_32);
}

static inline void Sk4f_ToBytes(uint8_t bytes[16],
                                const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) {
    _mm_storeu_si128((__m128i*)bytes,
                     _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec),
                                                       _mm_cvttps_epi32(b.fVec)),
                                      _mm_packus_epi16(_mm_cvttps_epi32(c.fVec),
                                                       _mm_cvttps_epi32(d.fVec))));
}

template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {
    return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128());
}

template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
    return _mm_packus_epi16(src.fVec, src.fVec);
}

#endif//SkNx_sse_DEFINED
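
// An illustrative usage sketch (not part of the header proper).  It assumes the
// Sk4f/Sk4b aliases and the SkNx_cast<D>() wrapper that SkNx.h layers over the
// specializations above:
//
//     float rgba[4] = { 0.5f, 300.0f, -1.0f, 255.0f };
//     Sk4f v = Sk4f::Load(rgba);
//     v = Sk4f::Min(Sk4f::Max(v, Sk4f(0.0f)), Sk4f(255.0f));  // clamp to [0,255]
//     Sk4b b = SkNx_cast<uint8_t>(v);                         // truncate to bytes
//     uint8_t out[4];
//     b.store(out);                                           // out = {0, 255, 0, 255}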