/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkNx_sse_DEFINED
#define SkNx_sse_DEFINED

// This file may assume <= SSE2, but must check SK_CPU_SSE_LEVEL for anything more recent.
#include <immintrin.h>

template <>
class SkNb<2, 4> {
public:
    SkNb(const __m128i& vec) : fVec(vec) {}

    SkNb() {}
    bool allTrue() const { return 0xff == (_mm_movemask_epi8(fVec) & 0xff); }
    bool anyTrue() const { return 0x00 != (_mm_movemask_epi8(fVec) & 0xff); }

    __m128i fVec;
};

template <>
class SkNb<4, 4> {
public:
    SkNb(const __m128i& vec) : fVec(vec) {}

    SkNb() {}
    bool allTrue() const { return 0xffff == _mm_movemask_epi8(fVec); }
    bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(fVec); }

    __m128i fVec;
};

template <>
class SkNb<2, 8> {
public:
    SkNb(const __m128i& vec) : fVec(vec) {}

    SkNb() {}
    bool allTrue() const { return 0xffff == _mm_movemask_epi8(fVec); }
    bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(fVec); }

    __m128i fVec;
};
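
// How these boolean classes work: each lane of fVec is all-0 or all-1 bits (the
// result of an SSE compare), and _mm_movemask_epi8 packs the top bit of each of
// the 16 bytes into an int.  SkNb<2, 4> describes two 4-byte lanes, which occupy
// only the low 8 bytes of the register, hence the & 0xff; the 4x4 and 2x8 cases
// fill all 16 bytes, so they test the full 0xffff.  An illustrative sketch (not
// part of this file):
//   SkNf<2, float> a(1,2), b(1,3);
//   (a == b).anyTrue();  // lanes {~0, 0} -> movemask & 0xff == 0x0f -> true
//   (a == b).allTrue();  // 0x0f != 0xff                             -> false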

template <>
class SkNf<2, float> {
    typedef SkNb<2, 4> Nb;
public:
    SkNf(const __m128& vec) : fVec(vec) {}

    SkNf() {}
    explicit SkNf(float val) : fVec(_mm_set1_ps(val)) {}
    static SkNf Load(const float vals[2]) {
        return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)vals));
    }
    SkNf(float a, float b) : fVec(_mm_setr_ps(a,b,0,0)) {}

    void store(float vals[2]) const { _mm_storel_pi((__m64*)vals, fVec); }

    SkNf operator + (const SkNf& o) const { return _mm_add_ps(fVec, o.fVec); }
    SkNf operator - (const SkNf& o) const { return _mm_sub_ps(fVec, o.fVec); }
    SkNf operator * (const SkNf& o) const { return _mm_mul_ps(fVec, o.fVec); }
    SkNf operator / (const SkNf& o) const { return _mm_div_ps(fVec, o.fVec); }

    Nb operator == (const SkNf& o) const { return _mm_castps_si128(_mm_cmpeq_ps (fVec, o.fVec)); }
    Nb operator != (const SkNf& o) const { return _mm_castps_si128(_mm_cmpneq_ps(fVec, o.fVec)); }
    Nb operator <  (const SkNf& o) const { return _mm_castps_si128(_mm_cmplt_ps (fVec, o.fVec)); }
    Nb operator >  (const SkNf& o) const { return _mm_castps_si128(_mm_cmpgt_ps (fVec, o.fVec)); }
    Nb operator <= (const SkNf& o) const { return _mm_castps_si128(_mm_cmple_ps (fVec, o.fVec)); }
    Nb operator >= (const SkNf& o) const { return _mm_castps_si128(_mm_cmpge_ps (fVec, o.fVec)); }

    static SkNf Min(const SkNf& l, const SkNf& r) { return _mm_min_ps(l.fVec, r.fVec); }
    static SkNf Max(const SkNf& l, const SkNf& r) { return _mm_max_ps(l.fVec, r.fVec); }

    SkNf  sqrt() const { return _mm_sqrt_ps (fVec); }
    SkNf rsqrt0() const { return _mm_rsqrt_ps(fVec); }
    SkNf rsqrt1() const { return this->rsqrt0(); }
    SkNf rsqrt2() const { return this->rsqrt1(); }

    SkNf       invert() const { return SkNf(1) / *this; }
    SkNf approxInvert() const { return _mm_rcp_ps(fVec); }

    template <int k> float kth() const {
        SkASSERT(0 <= k && k < 2);
        union { __m128 v; float fs[4]; } pun = {fVec};
        return pun.fs[k&1];
    }

    __m128 fVec;
};

template <>
class SkNf<2, double> {
    typedef SkNb<2, 8> Nb;
public:
    SkNf(const __m128d& vec) : fVec(vec) {}

    SkNf() {}
    explicit SkNf(double val) : fVec(_mm_set1_pd(val)) {}
    static SkNf Load(const double vals[2]) { return _mm_loadu_pd(vals); }
    SkNf(double a, double b) : fVec(_mm_setr_pd(a,b)) {}

    void store(double vals[2]) const { _mm_storeu_pd(vals, fVec); }

    SkNf operator + (const SkNf& o) const { return _mm_add_pd(fVec, o.fVec); }
    SkNf operator - (const SkNf& o) const { return _mm_sub_pd(fVec, o.fVec); }
    SkNf operator * (const SkNf& o) const { return _mm_mul_pd(fVec, o.fVec); }
    SkNf operator / (const SkNf& o) const { return _mm_div_pd(fVec, o.fVec); }

    Nb operator == (const SkNf& o) const { return _mm_castpd_si128(_mm_cmpeq_pd (fVec, o.fVec)); }
    Nb operator != (const SkNf& o) const { return _mm_castpd_si128(_mm_cmpneq_pd(fVec, o.fVec)); }
    Nb operator <  (const SkNf& o) const { return _mm_castpd_si128(_mm_cmplt_pd (fVec, o.fVec)); }
    Nb operator >  (const SkNf& o) const { return _mm_castpd_si128(_mm_cmpgt_pd (fVec, o.fVec)); }
    Nb operator <= (const SkNf& o) const { return _mm_castpd_si128(_mm_cmple_pd (fVec, o.fVec)); }
    Nb operator >= (const SkNf& o) const { return _mm_castpd_si128(_mm_cmpge_pd (fVec, o.fVec)); }

    static SkNf Min(const SkNf& l, const SkNf& r) { return _mm_min_pd(l.fVec, r.fVec); }
    static SkNf Max(const SkNf& l, const SkNf& r) { return _mm_max_pd(l.fVec, r.fVec); }

    SkNf  sqrt() const { return _mm_sqrt_pd(fVec); }
    SkNf rsqrt0() const { return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(fVec))); }
    SkNf rsqrt1() const { return this->rsqrt0(); }
    SkNf rsqrt2() const { return this->rsqrt1(); }

    SkNf       invert() const { return SkNf(1) / *this; }
    SkNf approxInvert() const { return _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(fVec))); }

    template <int k> double kth() const {
        SkASSERT(0 <= k && k < 2);
        union { __m128d v; double ds[2]; } pun = {fVec};
        return pun.ds[k&1];
    }

    __m128d fVec;
};
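
// A note on the rsqrt0/rsqrt1/rsqrt2 trio: in the generic SkNx interface these
// appear to name increasingly-refined estimates, but SSE's ~12-bit hardware
// estimate is treated as good enough here, so all three tiers return the same
// value and no Newton-Raphson refinement step is added.  SSE2 also has no
// double-precision estimate instructions, which is why the double rsqrt0() and
// approxInvert() above round-trip through float (_mm_cvtpd_ps -> estimate ->
// _mm_cvtps_pd) and so carry only single-precision accuracy.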

template <>
class SkNi<4, int> {
public:
    SkNi(const __m128i& vec) : fVec(vec) {}

    SkNi() {}
    explicit SkNi(int val) : fVec(_mm_set1_epi32(val)) {}
    static SkNi Load(const int vals[4]) { return _mm_loadu_si128((const __m128i*)vals); }
    SkNi(int a, int b, int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {}

    void store(int vals[4]) const { _mm_storeu_si128((__m128i*)vals, fVec); }

    SkNi operator + (const SkNi& o) const { return _mm_add_epi32(fVec, o.fVec); }
    SkNi operator - (const SkNi& o) const { return _mm_sub_epi32(fVec, o.fVec); }
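    // SSE2 has no 4x32-bit "mullo" (SSE4.1 adds _mm_mullo_epi32 for that), so
    // the multiply below is pieced together from _mm_mul_epu32, which widens
    // lanes 0 and 2 into two 64-bit products.  mul20 holds a0*b0 and a2*b2;
    // shifting both inputs right by 4 bytes moves lanes 1 and 3 into the even
    // slots, so mul31 holds a1*b1 and a3*b3.  The shuffles then pick out the
    // low 32 bits of each product, and the unpack interleaves them back into
    // {a0*b0, a1*b1, a2*b2, a3*b3}.  The low 32 bits of a product are the same
    // for signed and unsigned inputs, so the unsigned epu32 multiply is fine.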
    SkNi operator * (const SkNi& o) const {
        __m128i mul20 = _mm_mul_epu32(fVec, o.fVec),
                mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4));
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)),
                                  _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
    }

    SkNi operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
    SkNi operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); }

    template <int k> int kth() const {
        SkASSERT(0 <= k && k < 4);
        switch (k) {
            case 0: return _mm_cvtsi128_si32(fVec);
            case 1: return _mm_cvtsi128_si32(_mm_srli_si128(fVec,  4));
            case 2: return _mm_cvtsi128_si32(_mm_srli_si128(fVec,  8));
            case 3: return _mm_cvtsi128_si32(_mm_srli_si128(fVec, 12));
            default: SkASSERT(false); return 0;
        }
    }

    __m128i fVec;
};

template <>
class SkNf<4, float> {
    typedef SkNb<4, 4> Nb;
public:
    SkNf(const __m128& vec) : fVec(vec) {}

    SkNf() {}
    explicit SkNf(float val) : fVec(_mm_set1_ps(val)) {}
    static SkNf Load(const float vals[4]) { return _mm_loadu_ps(vals); }
    SkNf(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {}

    void store(float vals[4]) const { _mm_storeu_ps(vals, fVec); }

    SkNi<4, int> castTrunc() const { return _mm_cvttps_epi32(fVec); }

    SkNf operator + (const SkNf& o) const { return _mm_add_ps(fVec, o.fVec); }
    SkNf operator - (const SkNf& o) const { return _mm_sub_ps(fVec, o.fVec); }
    SkNf operator * (const SkNf& o) const { return _mm_mul_ps(fVec, o.fVec); }
    SkNf operator / (const SkNf& o) const { return _mm_div_ps(fVec, o.fVec); }

    Nb operator == (const SkNf& o) const { return _mm_castps_si128(_mm_cmpeq_ps (fVec, o.fVec)); }
    Nb operator != (const SkNf& o) const { return _mm_castps_si128(_mm_cmpneq_ps(fVec, o.fVec)); }
    Nb operator <  (const SkNf& o) const { return _mm_castps_si128(_mm_cmplt_ps (fVec, o.fVec)); }
    Nb operator >  (const SkNf& o) const { return _mm_castps_si128(_mm_cmpgt_ps (fVec, o.fVec)); }
    Nb operator <= (const SkNf& o) const { return _mm_castps_si128(_mm_cmple_ps (fVec, o.fVec)); }
    Nb operator >= (const SkNf& o) const { return _mm_castps_si128(_mm_cmpge_ps (fVec, o.fVec)); }

    static SkNf Min(const SkNf& l, const SkNf& r) { return _mm_min_ps(l.fVec, r.fVec); }
    static SkNf Max(const SkNf& l, const SkNf& r) { return _mm_max_ps(l.fVec, r.fVec); }

    SkNf  sqrt() const { return _mm_sqrt_ps (fVec); }
    SkNf rsqrt0() const { return _mm_rsqrt_ps(fVec); }
    SkNf rsqrt1() const { return this->rsqrt0(); }
    SkNf rsqrt2() const { return this->rsqrt1(); }

    SkNf       invert() const { return SkNf(1) / *this; }
    SkNf approxInvert() const { return _mm_rcp_ps(fVec); }

    template <int k> float kth() const {
        SkASSERT(0 <= k && k < 4);
        union { __m128 v; float fs[4]; } pun = {fVec};
        return pun.fs[k&3];
    }

    __m128 fVec;
};

template <>
class SkNi<4, uint16_t> {
public:
    SkNi(const __m128i& vec) : fVec(vec) {}

    SkNi() {}
    explicit SkNi(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
    static SkNi Load(const uint16_t vals[4]) { return _mm_loadl_epi64((const __m128i*)vals); }
    SkNi(uint16_t a, uint16_t b, uint16_t c, uint16_t d) : fVec(_mm_setr_epi16(a,b,c,d,0,0,0,0)) {}

    void store(uint16_t vals[4]) const { _mm_storel_epi64((__m128i*)vals, fVec); }

    SkNi operator + (const SkNi& o) const { return _mm_add_epi16(fVec, o.fVec); }
    SkNi operator - (const SkNi& o) const { return _mm_sub_epi16(fVec, o.fVec); }
    SkNi operator * (const SkNi& o) const { return _mm_mullo_epi16(fVec, o.fVec); }

    SkNi operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
    SkNi operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }

    template <int k> uint16_t kth() const {
        SkASSERT(0 <= k && k < 4);
        return _mm_extract_epi16(fVec, k);
    }

    __m128i fVec;
};

template <>
class SkNi<8, uint16_t> {
public:
    SkNi(const __m128i& vec) : fVec(vec) {}

    SkNi() {}
    explicit SkNi(uint16_t val) : fVec(_mm_set1_epi16(val)) {}
    static SkNi Load(const uint16_t vals[8]) { return _mm_loadu_si128((const __m128i*)vals); }
    SkNi(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
         uint16_t e, uint16_t f, uint16_t g, uint16_t h) : fVec(_mm_setr_epi16(a,b,c,d,e,f,g,h)) {}

    void store(uint16_t vals[8]) const { _mm_storeu_si128((__m128i*)vals, fVec); }

    SkNi operator + (const SkNi& o) const { return _mm_add_epi16(fVec, o.fVec); }
    SkNi operator - (const SkNi& o) const { return _mm_sub_epi16(fVec, o.fVec); }
    SkNi operator * (const SkNi& o) const { return _mm_mullo_epi16(fVec, o.fVec); }

    SkNi operator << (int bits) const { return _mm_slli_epi16(fVec, bits); }
    SkNi operator >> (int bits) const { return _mm_srli_epi16(fVec, bits); }

    static SkNi Min(const SkNi& a, const SkNi& b) {
        // No unsigned _mm_min_epu16, so we'll shift into a space where we can use the
        // signed version, _mm_min_epi16, then shift back.  (The byte-wise epi8 add/sub
        // are safe here: the low byte of the 0x8000 bias is zero, so no carry or borrow
        // ever crosses a byte boundary, and they compute the same result as epi16.)
        const uint16_t top = 0x8000; // Keep this separate from _mm_set1_epi16 or MSVC will whine.
        const __m128i top_8x = _mm_set1_epi16(top);
        return _mm_add_epi8(top_8x, _mm_min_epi16(_mm_sub_epi8(a.fVec, top_8x),
                                                  _mm_sub_epi8(b.fVec, top_8x)));
    }

    template <int k> uint16_t kth() const {
        SkASSERT(0 <= k && k < 8);
        return _mm_extract_epi16(fVec, k);
    }

    __m128i fVec;
};

template <>
class SkNi<16, uint8_t> {
public:
    SkNi(const __m128i& vec) : fVec(vec) {}

    SkNi() {}
    explicit SkNi(uint8_t val) : fVec(_mm_set1_epi8(val)) {}
    static SkNi Load(const uint8_t vals[16]) { return _mm_loadu_si128((const __m128i*)vals); }
    SkNi(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
         uint8_t e, uint8_t f, uint8_t g, uint8_t h,
         uint8_t i, uint8_t j, uint8_t k, uint8_t l,
         uint8_t m, uint8_t n, uint8_t o, uint8_t p)
        : fVec(_mm_setr_epi8(a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p)) {}

    void store(uint8_t vals[16]) const { _mm_storeu_si128((__m128i*)vals, fVec); }

    SkNi saturatedAdd(const SkNi& o) const { return _mm_adds_epu8(fVec, o.fVec); }

    SkNi operator + (const SkNi& o) const { return _mm_add_epi8(fVec, o.fVec); }
    SkNi operator - (const SkNi& o) const { return _mm_sub_epi8(fVec, o.fVec); }

    // SSE has no instructions to multiply or shift vectors of uint8_t.
    SkNi operator * (const SkNi& o) const { SkASSERT(false); return fVec; }
    SkNi operator << (int bits) const { SkASSERT(false); return fVec; }
    SkNi operator >> (int bits) const { SkASSERT(false); return fVec; }

    static SkNi Min(const SkNi& a, const SkNi& b) { return _mm_min_epu8(a.fVec, b.fVec); }

    template <int k> uint8_t kth() const {
        SkASSERT(0 <= k && k < 16);
        // SSE4.1 would just `return _mm_extract_epi8(fVec, k)`.  Here we have to read
        // 16 bits at a time with _mm_extract_epi16, then pick out the byte we want.
        int pair = _mm_extract_epi16(fVec, k/2);
        return k % 2 == 0 ? pair : (pair >> 8);
    }

    __m128i fVec;
};

#endif//SkNx_sse_DEFINED