SkNx_sse.h revision e4c0beed744d09dae4757c1893d8caa64ee09cd2
/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkNx_sse_DEFINED
#define SkNx_sse_DEFINED

// This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent.
// If you do, make sure this is in a static inline function... anywhere else risks violating ODR.

#define SKNX_IS_FAST

template <>
class SkNx<2, float> {
public:
    SkNx(const __m128& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(float val) : fVec(_mm_set1_ps(val)) {}
    static SkNx Load(const void* ptr) {
        return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)ptr));
    }
    SkNx(float a, float b) : fVec(_mm_setr_ps(a,b,0,0)) {}

    void store(void* ptr) const { _mm_storel_pi((__m64*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
    SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }

    SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
    SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
    SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
    SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
    SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
    SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }

    static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
    static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }

    SkNx  sqrt () const { return _mm_sqrt_ps (fVec); }
    SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); }
    SkNx rsqrt1() const { return this->rsqrt0(); }
    SkNx rsqrt2() const { return this->rsqrt1(); }

    SkNx       invert() const { return SkNx(1) / *this; }
    SkNx approxInvert() const { return _mm_rcp_ps(fVec); }

    float operator[](int k) const {
        SkASSERT(0 <= k && k < 2);
        union { __m128 v; float fs[4]; } pun = {fVec};
        return pun.fs[k&1];
    }
    template <int k> float kth() const { return (*this)[k]; }

    bool allTrue() const { return 0xff == (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); }
    bool anyTrue() const { return 0x00 != (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); }

    __m128 fVec;
};
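// Editor's note (not part of the original header): rsqrt0() and approxInvert() return the
// raw ~12-bit hardware estimates from RSQRTPS/RCPPS, and rsqrt1()/rsqrt2() add no extra
// refinement on SSE. A minimal sketch of one Newton-Raphson step that would sharpen the
// reciprocal estimate, using only intrinsics this file already assumes; the helper name
// is hypothetical:
//
//   static inline __m128 refined_rcp(__m128 x) {
//       __m128 e = _mm_rcp_ps(x);   // ~12-bit estimate of 1/x
//       // One step of e' = e * (2 - x*e) roughly doubles the number of correct bits.
//       return _mm_mul_ps(e, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(x, e)));
//   }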
template <>
class SkNx<4, float> {
public:
    SkNx(const __m128& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(float val) : fVec( _mm_set1_ps(val) ) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_ps((const float*)ptr); }

    SkNx(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {}

    void store(void* ptr) const { _mm_storeu_ps((float*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_ps(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mul_ps(fVec, o.fVec); }
    SkNx operator / (const SkNx& o) const { return _mm_div_ps(fVec, o.fVec); }

    SkNx operator == (const SkNx& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
    SkNx operator != (const SkNx& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
    SkNx operator  < (const SkNx& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
    SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
    SkNx operator <= (const SkNx& o) const { return _mm_cmple_ps (fVec, o.fVec); }
    SkNx operator >= (const SkNx& o) const { return _mm_cmpge_ps (fVec, o.fVec); }

    static SkNx Min(const SkNx& l, const SkNx& r) { return _mm_min_ps(l.fVec, r.fVec); }
    static SkNx Max(const SkNx& l, const SkNx& r) { return _mm_max_ps(l.fVec, r.fVec); }

    SkNx abs() const { return _mm_andnot_ps(_mm_set1_ps(-0.0f), fVec); }

    SkNx  sqrt () const { return _mm_sqrt_ps (fVec); }
    SkNx rsqrt0() const { return _mm_rsqrt_ps(fVec); }
    SkNx rsqrt1() const { return this->rsqrt0(); }
    SkNx rsqrt2() const { return this->rsqrt1(); }

    SkNx       invert() const { return SkNx(1) / *this; }
    SkNx approxInvert() const { return _mm_rcp_ps(fVec); }

    float operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128 v; float fs[4]; } pun = {fVec};
        return pun.fs[k&3];
    }
    template <int k> float kth() const { return (*this)[k]; }

    bool allTrue() const { return 0xffff == _mm_movemask_epi8(_mm_castps_si128(fVec)); }
    bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(fVec)); }

    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_ps(_mm_and_ps   (fVec, t.fVec),
                         _mm_andnot_ps(fVec, e.fVec));
    }

    __m128 fVec;
};
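// Editor's sketch (not part of the original header): thenElse() is a branch-free lane
// select, (mask & t) | (~mask & e). With a comparison result as the mask, each lane
// picks t where the compare was true and e elsewhere. A hypothetical helper:
//
//   static inline Sk4f clamp_floor(const Sk4f& v, const Sk4f& floor) {
//       return (v < floor).thenElse(floor, v);   // lanes where v < floor become floor
//   }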
template <>
class SkNx<4, uint16_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadl_epi64((const __m128i*)ptr); }
    SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec(_mm_setr_epi16(a,b,c,d,0,0,0,0)) {}

    void store(void* ptr) const { _mm_storel_epi64((__m128i*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }

    SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
    SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }

    uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { __m128i v; uint16_t us[8]; } pun = {fVec};
        return pun.us[k&3];
    }
    template <int k> uint16_t kth() const { return (*this)[k]; }

    __m128i fVec;
};

template <>
class SkNx<8, uint16_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
         uint16_t e, uint16_t f, uint16_t g, uint16_t h) : fVec(_mm_setr_epi16(a,b,c,d,e,f,g,h)) {}

    void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi16(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi16(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return _mm_mullo_epi16(fVec, o.fVec); }

    SkNx operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
    SkNx operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }

    static SkNx Min(const SkNx& a, const SkNx& b) {
        // No unsigned _mm_min_epu16, so we'll shift into a space where we can use the
        // signed version, _mm_min_epi16, then shift back.
        const uint16_t top = 0x8000;   // Keep this separate from _mm_set1_epi16 or MSVC will whine.
        const __m128i top_8x = _mm_set1_epi16(top);
        return _mm_add_epi8(top_8x, _mm_min_epi16(_mm_sub_epi8(a.fVec, top_8x),
                                                  _mm_sub_epi8(b.fVec, top_8x)));
    }

    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    }

    uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 8);
        union { __m128i v; uint16_t us[8]; } pun = {fVec};
        return pun.us[k&7];
    }
    template <int k> uint16_t kth() const { return (*this)[k]; }

    __m128i fVec;
};

template <>
class SkNx<4, uint8_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    static SkNx Load(const void* ptr) { return _mm_cvtsi32_si128(*(const int*)ptr); }
    void store(void* ptr) const { *(int*)ptr = _mm_cvtsi128_si32(fVec); }

    // TODO as needed

    __m128i fVec;
};

template <>
class SkNx<16, uint8_t> {
public:
    SkNx(const __m128i& vec) : fVec(vec) {}

    SkNx() {}
    SkNx(uint8_t val) : fVec(_mm_set1_epi8(val)) {}
    static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
    SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
         uint8_t e, uint8_t f, uint8_t g, uint8_t h,
         uint8_t i, uint8_t j, uint8_t k, uint8_t l,
         uint8_t m, uint8_t n, uint8_t o, uint8_t p)
        : fVec(_mm_setr_epi8(a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p)) {}

    void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }

    SkNx saturatedAdd(const SkNx& o) const { return _mm_adds_epu8(fVec, o.fVec); }

    SkNx operator + (const SkNx& o) const { return _mm_add_epi8(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return _mm_sub_epi8(fVec, o.fVec); }

    static SkNx Min(const SkNx& a, const SkNx& b) { return _mm_min_epu8(a.fVec, b.fVec); }
    SkNx operator < (const SkNx& o) const {
        // There's no unsigned _mm_cmplt_epu8, so we flip the sign bits then use a signed compare.
        auto flip = _mm_set1_epi8(char(0x80));
        return _mm_cmplt_epi8(_mm_xor_si128(flip, fVec), _mm_xor_si128(flip, o.fVec));
    }

    uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 16);
        union { __m128i v; uint8_t us[16]; } pun = {fVec};
        return pun.us[k&15];
    }
    template <int k> uint8_t kth() const { return (*this)[k]; }

    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
                            _mm_andnot_si128(fVec, e.fVec));
    }

    __m128i fVec;
};
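// Editor's note (not part of the original header): Sk8h::Min() and Sk16b::operator<
// above lean on the same identity: flipping the top bit maps unsigned order onto
// signed order. For any 16-bit a and b,
//
//   a < b  (unsigned)   ==   (int16_t)(a ^ 0x8000) < (int16_t)(b ^ 0x8000)
//
// The byte-wise _mm_sub_epi8/_mm_add_epi8 against 0x8000 lanes in Min() is that same
// bit flip, since x - 0x80 == x ^ 0x80 (mod 256) and the low byte of each 16-bit lane
// is untouched, so no borrow can cross a byte boundary.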
template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
    auto _32 = _mm_cvttps_epi32(src.fVec);
    // Ideally we'd use _mm_packus_epi32 here.  But that's SSE4.1+.
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    // With SSSE3, we can just shuffle the low 2 bytes from each lane right into place.
    const int _ = ~0;
    return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_));
#else
    // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32:
    _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000));
    return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000));
#endif
}

template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
    auto _32 = _mm_cvttps_epi32(src.fVec);
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    const int _ = ~0;
    return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,4,8,12, _,_,_,_, _,_,_,_, _,_,_,_));
#else
    // With SSE2, pack twice: for in-range values the high half of each 32-bit lane is
    // zero, so two rounds of _mm_packus_epi16 funnel the values into the low four bytes.
    auto _16 = _mm_packus_epi16(_32, _32);
    return _mm_packus_epi16(_16, _16);
#endif
}

template<> /*static*/ inline Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    const int _ = ~0;
    auto _32 = _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_));
#else
    auto _16 = _mm_unpacklo_epi8 (src.fVec, _mm_setzero_si128()),
         _32 = _mm_unpacklo_epi16(_16,      _mm_setzero_si128());
#endif
    return _mm_cvtepi32_ps(_32);
}

template<> /*static*/ inline Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {
    auto _32 = _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());
    return _mm_cvtepi32_ps(_32);
}

static inline void Sk4f_ToBytes(uint8_t bytes[16],
                                const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) {
    _mm_storeu_si128((__m128i*)bytes,
                     _mm_packus_epi16(_mm_packus_epi16(_mm_cvttps_epi32(a.fVec),
                                                       _mm_cvttps_epi32(b.fVec)),
                                      _mm_packus_epi16(_mm_cvttps_epi32(c.fVec),
                                                       _mm_cvttps_epi32(d.fVec))));
}

template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {
    return _mm_unpacklo_epi8(src.fVec, _mm_setzero_si128());
}

template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
    return _mm_packus_epi16(src.fVec, src.fVec);
}

#endif//SkNx_sse_DEFINED
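// Usage sketch (editor's illustration, not part of the original header), assuming the
// SkNx_cast<D>(src) call style declared in SkNx.h and float lanes already in [0, 255]:
//
//   Sk4f    f = Sk4f::Load(floats);        // four floats
//   Sk4b    b = SkNx_cast<uint8_t>(f);     // truncate each lane to a byte
//   Sk4f back = SkNx_cast<float>(b);       // widen the bytes back to floats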