SkJumper_stages_lowp.cpp revision 5883a11862771c3d5abb3a26c8c4b6bb0570de23
/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkJumper.h"
#include "SkJumper_misc.h"
#include <immintrin.h>

#if !defined(__SSSE3__) || !defined(__clang__) || !defined(__x86_64__)
    #error "We're starting with just SSSE3 x86-64 for now, and will always require Clang."
#endif

#define WRAP(name) sk_##name##_ssse3_lowp

using K = const SkJumper_constants;
static const size_t kStride = 8;

template <typename T> using V = T __attribute__((ext_vector_type(8)));
using U8  = V<uint8_t>;
using U16 = V<uint16_t>;
using U32 = V<uint32_t>;

// See SkFixed15.h for details on this format and its operations.
struct F {
    U16 vec;

    F() = default;
    F(float f) {
        // After adding 256.0f, the SkFixed15 value is the bottom two bytes of the float.
        f += 256.0f;
        vec = unaligned_load<uint16_t>(&f);
    }

    F(U16 v) : vec(v) {}
    operator U16() const { return vec; }
};

SI F operator+(F x, F y) { return x.vec + y.vec; }
SI F operator-(F x, F y) { return x.vec - y.vec; }
SI F operator*(F x, F y) { return _mm_abs_epi16(_mm_mulhrs_epi16(x.vec, y.vec)); }
SI F mad(F f, F m, F a) { return f*m+a; }
SI F inv(F v) { return 1.0f - v; }
SI F two(F v) { return v + v; }
SI F lerp(F from, F to, F t) { return to*t + from*inv(t); }

SI F operator<<(F x, int bits) { return x.vec << bits; }
SI F operator>>(F x, int bits) { return x.vec >> bits; }

using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F);

MAYBE_MSABI
extern "C" size_t WRAP(start_pipeline)(size_t x, size_t y, size_t limit, void** program, K* k) {
    F v{};
    auto start = (Stage*)load_and_inc(program);
    while (x + kStride <= limit) {
        start(k,program,x,y,0, v,v,v,v, v,v,v,v);
        x += kStride;
    }
    if (size_t tail = limit - x) {
        start(k,program,x,y,tail, v,v,v,v, v,v,v,v);
    }
    return limit;
}
extern "C" void WRAP(just_return)(K*, void**, size_t,size_t,size_t, F,F,F,F, F,F,F,F) {}

#define STAGE(name)                                                                   \
    SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail,              \
                     F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);             \
    extern "C" void WRAP(name)(K* k, void** program, size_t x, size_t y, size_t tail, \
                               F r, F g, F b, F a, F dr, F dg, F db, F da) {          \
        LazyCtx ctx(program);                                                         \
        name##_k(k,ctx,x,y,tail, r,g,b,a, dr,dg,db,da);                               \
        auto next = (Stage*)load_and_inc(program);                                    \
        next(k,program,x,y,tail, r,g,b,a, dr,dg,db,da);                               \
    }                                                                                 \
    SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail,              \
                     F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)


// Helper functions used by multiple stages.

template <typename V, typename T>
SI V load(const T* src, size_t tail) {
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        V v{};  // Any inactive lanes are zeroed.
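        // No break statements: each case falls through to the next, so exactly
        // the `tail` active lanes (tail is 1..7 here) get copied from src.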
        switch (tail-1) {
            case 6: v[6] = src[6];
            case 5: v[5] = src[5];
            case 4: v[4] = src[4];
            case 3: v[3] = src[3];
            case 2: v[2] = src[2];
            case 1: v[1] = src[1];
            case 0: v[0] = src[0];
        }
        return v;
    }
    return unaligned_load<V>(src);
}

template <typename V, typename T>
SI void store(T* dst, V v, size_t tail) {
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        switch (tail-1) {
            case 6: dst[6] = v[6];
            case 5: dst[5] = v[5];
            case 4: dst[4] = v[4];
            case 3: dst[3] = v[3];
            case 2: dst[2] = v[2];
            case 1: dst[1] = v[1];
            case 0: dst[0] = v[0];
        }
        return;
    }
    unaligned_store(dst, v);
}

SI void from_8888(U32 rgba, F* r, F* g, F* b, F* a) {
    // Split the 8 pixels into low and high halves, and reinterpret as vectors of 16-bit values.
    U16 lo = unaligned_load<U16>((const uint32_t*)&rgba + 0),
        hi = unaligned_load<U16>((const uint32_t*)&rgba + 4);

    // Shuffle so that the 4 bytes of each color channel are contiguous...
    lo = _mm_shuffle_epi8(lo, _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15));
    hi = _mm_shuffle_epi8(hi, _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15));

    // ...then get all 8 bytes of each color channel together into a single register.
    U16 rg = _mm_unpacklo_epi32(lo,hi),
        ba = _mm_unpackhi_epi32(lo,hi);

    // Unpack as 16-bit values into the high half of each 16-bit lane, to get a free *256.
    U16 R = _mm_unpacklo_epi8(U16(0), rg),
        G = _mm_unpackhi_epi8(U16(0), rg),
        B = _mm_unpacklo_epi8(U16(0), ba),
        A = _mm_unpackhi_epi8(U16(0), ba);

    // Now we scale from [0,255] to [0,32768].  Ideally that's 32768/255 = 128.50196,
    // but we can approximate that very cheaply as 256*32897/65536 = 128.50391.
    // 0 and 255 map to 0 and 32768 correctly, and nothing else is off by more than 1.
    *r = _mm_mulhi_epu16(R, U16(32897));
    *g = _mm_mulhi_epu16(G, U16(32897));
    *b = _mm_mulhi_epu16(B, U16(32897));
    *a = _mm_mulhi_epu16(A, U16(32897));
}
SI F from_byte(U8 bytes) {
    // See from_8888() just above.
    U16 hi = _mm_unpacklo_epi8(U16(0), widen_cast<__m128i>(bytes));
    return (F)_mm_mulhi_epu16(hi, U16(32897));
}

SI U32 to_8888(F r, F g, F b, F a) {
    // We want to interlace and pack these values from [0,32768] to [0,255].
    // Luckily the simplest possible thing works great: >>7, then saturate.
    // The 'u' in packus handles the saturation to [0,255] we need.
    U16 rb = _mm_packus_epi16(r>>7,b>>7),  // r0 r1 r2 r3 r4 r5 r6 r7 b0 b1 b2 b3 b4 b5 b6 b7
        ga = _mm_packus_epi16(g>>7,a>>7);

    U16 rg = _mm_unpacklo_epi8(rb, ga),  // r0 g0 r1 g1 ... r7 g7
        ba = _mm_unpackhi_epi8(rb, ga);  // b0 a0 ... b7 a7

    U16 lo = _mm_unpacklo_epi16(rg, ba),  // r0 g0 b0 a0 ... r3 g3 b3 a3
        hi = _mm_unpackhi_epi16(rg, ba);  // r4 g4 b4 a4 ... r7 g7 b7 a7

    U32 px;
    memcpy((uint32_t*)&px + 0, &lo, sizeof(lo));
    memcpy((uint32_t*)&px + 4, &hi, sizeof(hi));
    return px;
}
SI U8 to_byte(F v) {
    // See to_8888() just above.
    U16 packed = _mm_packus_epi16(v>>7, v>>7);  // Doesn't really matter what we pack on top.
    return unaligned_load<U8>(&packed);
}

// Stages!

STAGE(constant_color) {
    // We're converting to fixed point, which lets us play some IEEE representation tricks,
    // replacing a naive *32768 and float->int conversion with a simple float add.
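    // E.g. 0.5f + 256.0f = 256.5f, whose bits are 0x43804000; the low two bytes
    // 0x4000 = 16384 = 0.5 * 32768, exactly the SkFixed15 value we want.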
    __m128i bits = _mm_loadu_ps((const float*)ctx) + _mm_set1_ps(256.0f);
    r = _mm_shuffle_epi8(bits, _mm_set1_epi16(0x0100));
    g = _mm_shuffle_epi8(bits, _mm_set1_epi16(0x0504));
    b = _mm_shuffle_epi8(bits, _mm_set1_epi16(0x0908));
    a = _mm_shuffle_epi8(bits, _mm_set1_epi16(0x0d0c));
}

STAGE(set_rgb) {
    auto rgb = (const float*)ctx;
    r = rgb[0];
    g = rgb[1];
    b = rgb[2];
}

STAGE(premul) {
    r = r * a;
    g = g * a;
    b = b * a;
}

STAGE(load_8888) {
    auto ptr = *(const uint32_t**)ctx + x;
    from_8888(load<U32>(ptr, tail), &r,&g,&b,&a);
}
STAGE(store_8888) {
    auto ptr = *(uint32_t**)ctx + x;
    store(ptr, to_8888(r,g,b,a), tail);
}

STAGE(load_a8) {
    auto ptr = *(const uint8_t**)ctx + x;
    r = g = b = 0.0f;
    a = from_byte(load<U8>(ptr, tail));
}
STAGE(store_a8) {
    auto ptr = *(uint8_t**)ctx + x;
    store(ptr, to_byte(a), tail);
}

STAGE(load_g8) {
    auto ptr = *(const uint8_t**)ctx + x;
    r = g = b = from_byte(load<U8>(ptr, tail));
    a = 1.0f;
}

STAGE(srcover_rgba_8888) {
    auto ptr = *(uint32_t**)ctx + x;

    from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da);

    r = mad(dr, inv(a), r);
    g = mad(dg, inv(a), g);
    b = mad(db, inv(a), b);
    a = mad(da, inv(a), a);

    store(ptr, to_8888(r,g,b,a), tail);
}

STAGE(scale_1_float) {
    float c = *(const float*)ctx;

    r = r * c;
    g = g * c;
    b = b * c;
    a = a * c;
}
STAGE(scale_u8) {
    auto ptr = *(const uint8_t**)ctx + x;

    U8 scales = load<U8>(ptr, tail);
    F c = from_byte(scales);

    r = r * c;
    g = g * c;
    b = b * c;
    a = a * c;
}

STAGE(lerp_1_float) {
    float c = *(const float*)ctx;

    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}
STAGE(lerp_u8) {
    auto ptr = *(const uint8_t**)ctx + x;

    U8 scales = load<U8>(ptr, tail);
    F c = from_byte(scales);

    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

STAGE(swap_rb) {
    auto tmp = r;
    r = b;
    b = tmp;
}

STAGE(swap) {
    auto swap = [](F& v, F& dv) {
        auto tmp = v;
        v = dv;
        dv = tmp;
    };
    swap(r, dr);
    swap(g, dg);
    swap(b, db);
    swap(a, da);
}
STAGE(move_src_dst) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}
STAGE(move_dst_src) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}

// Most blend modes apply the same logic to each channel.
#define BLEND_MODE(name)                       \
    SI F name##_channel(F s, F d, F sa, F da); \
    STAGE(name) {                              \
        r = name##_channel(r,dr,a,da);         \
        g = name##_channel(g,dg,a,da);         \
        b = name##_channel(b,db,a,da);         \
        a = name##_channel(a,da,a,da);         \
    }                                          \
    SI F name##_channel(F s, F d, F sa, F da)

BLEND_MODE(clear)    { return 0.0f; }
BLEND_MODE(srcatop)  { return s*da + d*inv(sa); }
BLEND_MODE(dstatop)  { return d*sa + s*inv(da); }
BLEND_MODE(srcin)    { return s * da; }
BLEND_MODE(dstin)    { return d * sa; }
BLEND_MODE(srcout)   { return s * inv(da); }
BLEND_MODE(dstout)   { return d * inv(sa); }
BLEND_MODE(srcover)  { return mad(d, inv(sa), s); }
BLEND_MODE(dstover)  { return mad(s, inv(da), d); }

BLEND_MODE(modulate) { return s*d; }
BLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; }
BLEND_MODE(screen)   { return s + inv(s)*d; }
BLEND_MODE(xor_)     { return s*inv(da) + d*inv(sa); }

#undef BLEND_MODE
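
// Illustrative only, not called by any stage: a scalar model of what the
// SkFixed15 operator* above computes per lane, assuming the documented
// semantics of _mm_mulhrs_epi16 and _mm_abs_epi16.  fixed15_mul is a
// hypothetical name used just for this sketch.
static inline uint16_t fixed15_mul(uint16_t x, uint16_t y) {
    // _mm_mulhrs_epi16: multiply as int16, add the rounding bit, then drop
    // the 15 fraction bits.  (>> on a negative int32 is arithmetic in practice.)
    int16_t hrs = (int16_t)(((int32_t)(int16_t)x * (int16_t)y + (1<<14)) >> 15);
    // _mm_abs_epi16: 32768 (0x8000) reads as -32768 in the signed multiply, so
    // products involving it come out negated; abs folds them back to positive.
    // Note abs(-32768) wraps to 0x8000, which as unsigned is exactly 32768.
    return (uint16_t)(hrs < 0 ? -hrs : hrs);
}
// E.g. fixed15_mul(32768, 16384) == 16384, i.e. 1.0 * 0.5 == 0.5 in SkFixed15.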