/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkRasterPipeline_opts_DEFINED
#define SkRasterPipeline_opts_DEFINED

#include "SkColorPriv.h"
#include "SkColorLookUpTable.h"
#include "SkColorSpaceXform_A2B.h"
#include "SkColorSpaceXformPriv.h"
#include "SkHalf.h"
#include "SkImageShaderContext.h"
#include "SkMSAN.h"
#include "SkPM4f.h"
#include "SkPM4fPriv.h"
#include "SkRasterPipeline.h"
#include "SkShader.h"
#include "SkSRGB.h"

namespace {

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    static constexpr int N = 8;
#else
    static constexpr int N = 4;
#endif

    using SkNf = SkNx<N, float>;
    using SkNi = SkNx<N, int32_t>;
    using SkNu = SkNx<N, uint32_t>;
    using SkNh = SkNx<N, uint16_t>;
    using SkNb = SkNx<N, uint8_t>;

    using Fn = void(SK_VECTORCALL *)(size_t x_tail, void** p, SkNf,SkNf,SkNf,SkNf,
                                                              SkNf,SkNf,SkNf,SkNf);
    // x_tail encodes two values x and tail as x*N+tail, where 0 <= tail < N.
    // x is the induction variable we're walking along, incrementing by N each step.
    // tail == 0 means work with a full N pixels; otherwise use only the low tail pixels.
    //
    // p is our program, a sequence of Fn to call interlaced with any void* context pointers. E.g.
    //    &load_8888
    //    (src ptr)
    //    &from_srgb
    //    &move_src_dst
    //    &load_f16
    //    (dst ptr)
    //    &swap
    //    &srcover
    //    &store_f16
    //    (dst ptr)
    //    &just_return

}  // namespace

#define SI static inline

// Basically, return *(*ptr)++, maybe faster than the compiler can do it.
SI void* load_and_increment(void*** ptr) {
    // We do this often enough that it's worth hyper-optimizing.
    // x86 can do this in one instruction if ptr is in rsi.
    // (This is why p is the second argument to Fn: it's passed in rsi.)
#if defined(__GNUC__) && defined(__x86_64__)
    void* rax;
    __asm__("lodsq" : "=a"(rax), "+S"(*ptr));
    return rax;
#else
    return *(*ptr)++;
#endif
}

// Stages are logically a pipeline, and physically are contiguous in an array.
// To get to the next stage, we just increment our pointer to the next array element.
SI void SK_VECTORCALL next(size_t x_tail, void** p, SkNf r, SkNf g, SkNf b, SkNf a,
                                                    SkNf dr, SkNf dg, SkNf db, SkNf da) {
    auto next = (Fn)load_and_increment(&p);
    next(x_tail,p, r,g,b,a, dr,dg,db,da);
}
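// A quick worked example of the x_tail encoding: with N == 4, a full batch starting at
// pixel x == 20 is passed as x_tail == 80 (20*4 + 0), while a final partial batch of 3
// pixels at that same x is x_tail == 83 (20*4 + 3). The STAGE macros below recover the
// two pieces with x_tail/N and x_tail%N.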

// Stages defined below always call next.
// This is always the last stage, a backstop that actually returns to the caller when done.
SI void SK_VECTORCALL just_return(size_t, void**, SkNf, SkNf, SkNf, SkNf,
                                                  SkNf, SkNf, SkNf, SkNf) {}

#define STAGE(name)                                                                      \
    static SK_ALWAYS_INLINE void name##_kernel(size_t x, size_t tail,                    \
                                               SkNf& r, SkNf& g, SkNf& b, SkNf& a,       \
                                               SkNf& dr, SkNf& dg, SkNf& db, SkNf& da);  \
    SI void SK_VECTORCALL name(size_t x_tail, void** p,                                  \
                               SkNf r, SkNf g, SkNf b, SkNf a,                           \
                               SkNf dr, SkNf dg, SkNf db, SkNf da) {                     \
        name##_kernel(x_tail/N, x_tail%N, r,g,b,a, dr,dg,db,da);                         \
        next(x_tail,p, r,g,b,a, dr,dg,db,da);                                            \
    }                                                                                    \
    static SK_ALWAYS_INLINE void name##_kernel(size_t x, size_t tail,                    \
                                               SkNf& r, SkNf& g, SkNf& b, SkNf& a,       \
                                               SkNf& dr, SkNf& dg, SkNf& db, SkNf& da)

#define STAGE_CTX(name, Ctx)                                                             \
    static SK_ALWAYS_INLINE void name##_kernel(Ctx ctx, size_t x, size_t tail,           \
                                               SkNf& r, SkNf& g, SkNf& b, SkNf& a,       \
                                               SkNf& dr, SkNf& dg, SkNf& db, SkNf& da);  \
    SI void SK_VECTORCALL name(size_t x_tail, void** p,                                  \
                               SkNf r, SkNf g, SkNf b, SkNf a,                           \
                               SkNf dr, SkNf dg, SkNf db, SkNf da) {                     \
        auto ctx = (Ctx)load_and_increment(&p);                                          \
        name##_kernel(ctx, x_tail/N, x_tail%N, r,g,b,a, dr,dg,db,da);                    \
        next(x_tail,p, r,g,b,a, dr,dg,db,da);                                            \
    }                                                                                    \
    static SK_ALWAYS_INLINE void name##_kernel(Ctx ctx, size_t x, size_t tail,           \
                                               SkNf& r, SkNf& g, SkNf& b, SkNf& a,       \
                                               SkNf& dr, SkNf& dg, SkNf& db, SkNf& da)

// Many xfermodes apply the same logic to each channel.
#define RGBA_XFERMODE(name)                                                              \
    static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa,            \
                                               const SkNf& d, const SkNf& da);           \
    SI void SK_VECTORCALL name(size_t x_tail, void** p,                                  \
                               SkNf r, SkNf g, SkNf b, SkNf a,                           \
                               SkNf dr, SkNf dg, SkNf db, SkNf da) {                     \
        r = name##_kernel(r,a,dr,da);                                                    \
        g = name##_kernel(g,a,dg,da);                                                    \
        b = name##_kernel(b,a,db,da);                                                    \
        a = name##_kernel(a,a,da,da);                                                    \
        next(x_tail,p, r,g,b,a, dr,dg,db,da);                                            \
    }                                                                                    \
    static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa,            \
                                               const SkNf& d, const SkNf& da)
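// As a rough sketch (not literal preprocessor output), RGBA_XFERMODE(srcover) -- defined
// further below -- produces a per-channel kernel plus a stage that applies it to each
// channel pair and then chains on:
//
//     SkNf srcover_kernel(const SkNf& s, const SkNf& sa, const SkNf& d, const SkNf& da);
//     SI void SK_VECTORCALL srcover(size_t x_tail, void** p,
//                                   SkNf r, SkNf g, SkNf b, SkNf a,
//                                   SkNf dr, SkNf dg, SkNf db, SkNf da) {
//         r = srcover_kernel(r,a,dr,da);
//         g = srcover_kernel(g,a,dg,da);
//         b = srcover_kernel(b,a,db,da);
//         a = srcover_kernel(a,a,da,da);
//         next(x_tail,p, r,g,b,a, dr,dg,db,da);
//     }
//     SkNf srcover_kernel(const SkNf& s, const SkNf& sa, const SkNf& d, const SkNf& da) {
//         return SkNf_fma(d, inv(sa), s);   // the body written at the macro's use site
//     }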

// Most of the rest apply the same logic to color channels and use srcover's alpha logic.
#define RGB_XFERMODE(name)                                                               \
    static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa,            \
                                               const SkNf& d, const SkNf& da);           \
    SI void SK_VECTORCALL name(size_t x_tail, void** p,                                  \
                               SkNf r, SkNf g, SkNf b, SkNf a,                           \
                               SkNf dr, SkNf dg, SkNf db, SkNf da) {                     \
        r = name##_kernel(r,a,dr,da);                                                    \
        g = name##_kernel(g,a,dg,da);                                                    \
        b = name##_kernel(b,a,db,da);                                                    \
        a = a + (da * (1.0f-a));                                                         \
        next(x_tail,p, r,g,b,a, dr,dg,db,da);                                            \
    }                                                                                    \
    static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa,            \
                                               const SkNf& d, const SkNf& da)

template <typename T>
SI SkNx<N,T> load(size_t tail, const T* src) {
    if (tail) {
        T buf[8];
        memset(buf, 0, 8*sizeof(T));
        switch (tail & (N-1)) {
            case 7: buf[6] = src[6];
            case 6: buf[5] = src[5];
            case 5: buf[4] = src[4];
            case 4: buf[3] = src[3];
            case 3: buf[2] = src[2];
            case 2: buf[1] = src[1];
        }
        buf[0] = src[0];
        return SkNx<N,T>::Load(buf);
    }
    return SkNx<N,T>::Load(src);
}
template <typename T>
SI SkNx<N,T> gather(size_t tail, const T* src, const SkNi& offset) {
    if (tail) {
        T buf[8] = {0};
        switch (tail & (N-1)) {
            case 7: buf[6] = src[offset[6]];
            case 6: buf[5] = src[offset[5]];
            case 5: buf[4] = src[offset[4]];
            case 4: buf[3] = src[offset[3]];
            case 3: buf[2] = src[offset[2]];
            case 2: buf[1] = src[offset[1]];
        }
        buf[0] = src[offset[0]];
        return SkNx<N,T>::Load(buf);
    }
    T buf[8];
    for (size_t i = 0; i < N; i++) {
        buf[i] = src[offset[i]];
    }
    return SkNx<N,T>::Load(buf);
}
template <typename T>
SI void store(size_t tail, const SkNx<N,T>& v, T* dst) {
    if (tail) {
        switch (tail & (N-1)) {
            case 7: dst[6] = v[6];
            case 6: dst[5] = v[5];
            case 5: dst[4] = v[4];
            case 4: dst[3] = v[3];
            case 3: dst[2] = v[2];
            case 2: dst[1] = v[1];
        }
        dst[0] = v[0];
        return;
    }
    v.store(dst);
}
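// For example, with N == 4 and tail == 3, load() fills lanes 0..2 from src and leaves lane 3
// zeroed, gather() does the same indirected through offset[], and store() writes back only
// dst[0..2]. tail == 0 always means "a full N pixels", never "no pixels".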

#if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
    SI __m256i mask(size_t tail) {
        static const int masks[][8] = {
            {~0,~0,~0,~0, ~0,~0,~0,~0 },  // remember, tail == 0 ~~> load all N
            {~0, 0, 0, 0,  0, 0, 0, 0 },
            {~0,~0, 0, 0,  0, 0, 0, 0 },
            {~0,~0,~0, 0,  0, 0, 0, 0 },
            {~0,~0,~0,~0,  0, 0, 0, 0 },
            {~0,~0,~0,~0, ~0, 0, 0, 0 },
            {~0,~0,~0,~0, ~0,~0, 0, 0 },
            {~0,~0,~0,~0, ~0,~0,~0, 0 },
        };
        return SkNi::Load(masks + tail).fVec;
    }

    SI SkNi load(size_t tail, const int32_t* src) {
        return tail ? _mm256_maskload_epi32((const int*)src, mask(tail))
                    : SkNi::Load(src);
    }
    SI SkNu load(size_t tail, const uint32_t* src) {
        return tail ? _mm256_maskload_epi32((const int*)src, mask(tail))
                    : SkNu::Load(src);
    }
    SI SkNf load(size_t tail, const float* src) {
        return tail ? _mm256_maskload_ps((const float*)src, mask(tail))
                    : SkNf::Load(src);
    }
    SI SkNi gather(size_t tail, const int32_t* src, const SkNi& offset) {
        auto m = mask(tail);
        return _mm256_mask_i32gather_epi32(SkNi(0).fVec, (const int*)src, offset.fVec, m, 4);
    }
    SI SkNu gather(size_t tail, const uint32_t* src, const SkNi& offset) {
        auto m = mask(tail);
        return _mm256_mask_i32gather_epi32(SkNi(0).fVec, (const int*)src, offset.fVec, m, 4);
    }
    SI SkNf gather(size_t tail, const float* src, const SkNi& offset) {
        auto m = _mm256_castsi256_ps(mask(tail));
        return _mm256_mask_i32gather_ps(SkNf(0).fVec, (const float*)src, offset.fVec, m, 4);
    }

    static const char* bug = "I don't think MSAN understands maskstore.";

    SI void store(size_t tail, const SkNi& v, int32_t* dst) {
        if (tail) {
            _mm256_maskstore_epi32((int*)dst, mask(tail), v.fVec);
            return sk_msan_mark_initialized(dst, dst+tail, bug);
        }
        v.store(dst);
    }
    SI void store(size_t tail, const SkNu& v, uint32_t* dst) {
        if (tail) {
            _mm256_maskstore_epi32((int*)dst, mask(tail), v.fVec);
            return sk_msan_mark_initialized(dst, dst+tail, bug);
        }
        v.store(dst);
    }
    SI void store(size_t tail, const SkNf& v, float* dst) {
        if (tail) {
            _mm256_maskstore_ps((float*)dst, mask(tail), v.fVec);
            return sk_msan_mark_initialized(dst, dst+tail, bug);
        }
        v.store(dst);
    }
#endif

SI SkNf SkNf_fma(const SkNf& f, const SkNf& m, const SkNf& a) { return SkNx_fma(f,m,a); }

SI SkNi SkNf_round(const SkNf& x, const SkNf& scale) {
    // Every time I try, _mm_cvtps_epi32 benches as slower than using FMA and _mm_cvttps_epi32.  :/
    return SkNx_cast<int>(SkNf_fma(x,scale, 0.5f));
}

SI SkNf SkNf_from_byte(const SkNi& x) {
    // Same trick as in store_8888: 0x470000BB == 32768.0f + BB/256.0f for all bytes BB.
    auto v = 0x47000000 | x;
    // Read this as (pun_float(v) - 32768.0f) * (256/255.0f), redistributed to be an FMA.
    return SkNf_fma(SkNf::Load(&v), 256/255.0f, -32768*256/255.0f);
}
SI SkNf SkNf_from_byte(const SkNu& x) { return SkNf_from_byte(SkNi::Load(&x)); }
SI SkNf SkNf_from_byte(const SkNb& x) { return SkNf_from_byte(SkNx_cast<int>(x)); }
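// Worked example of the bit trick above: the byte 0xBB (187) becomes 0x470000BB, which as a
// float is 32768 + 187/256. Subtracting 32768.0f and scaling by 256/255.0f gives 187/255,
// i.e. the byte mapped onto [0,1]. The FMA simply folds the subtraction into the constant:
// v*(256/255.0f) - 32768*256/255.0f.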

SI void from_8888(const SkNu& _8888, SkNf* r, SkNf* g, SkNf* b, SkNf* a) {
    *r = SkNf_from_byte((_8888      ) & 0xff);
    *g = SkNf_from_byte((_8888 >>  8) & 0xff);
    *b = SkNf_from_byte((_8888 >> 16) & 0xff);
    *a = SkNf_from_byte((_8888 >> 24)       );
}
SI void from_4444(const SkNh& _4444, SkNf* r, SkNf* g, SkNf* b, SkNf* a) {
    auto _32_bit = SkNx_cast<int>(_4444);

    *r = SkNx_cast<float>(_32_bit & (0xF << SK_R4444_SHIFT)) * (1.0f / (0xF << SK_R4444_SHIFT));
    *g = SkNx_cast<float>(_32_bit & (0xF << SK_G4444_SHIFT)) * (1.0f / (0xF << SK_G4444_SHIFT));
    *b = SkNx_cast<float>(_32_bit & (0xF << SK_B4444_SHIFT)) * (1.0f / (0xF << SK_B4444_SHIFT));
    *a = SkNx_cast<float>(_32_bit & (0xF << SK_A4444_SHIFT)) * (1.0f / (0xF << SK_A4444_SHIFT));
}
SI void from_565(const SkNh& _565, SkNf* r, SkNf* g, SkNf* b) {
    auto _32_bit = SkNx_cast<int>(_565);

    *r = SkNx_cast<float>(_32_bit & SK_R16_MASK_IN_PLACE) * (1.0f / SK_R16_MASK_IN_PLACE);
    *g = SkNx_cast<float>(_32_bit & SK_G16_MASK_IN_PLACE) * (1.0f / SK_G16_MASK_IN_PLACE);
    *b = SkNx_cast<float>(_32_bit & SK_B16_MASK_IN_PLACE) * (1.0f / SK_B16_MASK_IN_PLACE);
}
SI void from_f16(const void* px, SkNf* r, SkNf* g, SkNf* b, SkNf* a) {
    SkNh rh, gh, bh, ah;
    SkNh::Load4(px, &rh, &gh, &bh, &ah);

    *r = SkHalfToFloat_finite_ftz(rh);
    *g = SkHalfToFloat_finite_ftz(gh);
    *b = SkHalfToFloat_finite_ftz(bh);
    *a = SkHalfToFloat_finite_ftz(ah);
}

STAGE_CTX(trace, const char*) {
    SkDebugf("%s\n", ctx);
}
STAGE(registers) {
    auto print = [](const char* name, const SkNf& v) {
        SkDebugf("%s:", name);
        for (int i = 0; i < N; i++) {
            SkDebugf(" %g", v[i]);
        }
        SkDebugf("\n");
    };
    print(" r", r);
    print(" g", g);
    print(" b", b);
    print(" a", a);
    print("dr", dr);
    print("dg", dg);
    print("db", db);
    print("da", da);
}

STAGE(clamp_0) {
    a = SkNf::Max(a, 0.0f);
    r = SkNf::Max(r, 0.0f);
    g = SkNf::Max(g, 0.0f);
    b = SkNf::Max(b, 0.0f);
}
STAGE(clamp_1) {
    a = SkNf::Min(a, 1.0f);
    r = SkNf::Min(r, 1.0f);
    g = SkNf::Min(g, 1.0f);
    b = SkNf::Min(b, 1.0f);
}
STAGE(clamp_a) {
    a = SkNf::Min(a, 1.0f);
    r = SkNf::Min(r, a);
    g = SkNf::Min(g, a);
    b = SkNf::Min(b, a);
}

STAGE(unpremul) {
    auto scale = (a == 0.0f).thenElse(0.0f, 1.0f/a);
    r *= scale;
    g *= scale;
    b *= scale;
}
STAGE(premul) {
    r *= a;
    g *= a;
    b *= a;
}

STAGE_CTX(set_rgb, const float*) {
    r = ctx[0];
    g = ctx[1];
    b = ctx[2];
}
STAGE(swap_rb) { SkTSwap(r,b); }

STAGE(move_src_dst) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}
STAGE(move_dst_src) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}
STAGE(swap) {
    SkTSwap(r,dr);
    SkTSwap(g,dg);
    SkTSwap(b,db);
    SkTSwap(a,da);
}

STAGE(from_srgb) {
    r = sk_linear_from_srgb_math(r);
    g = sk_linear_from_srgb_math(g);
    b = sk_linear_from_srgb_math(b);
}
STAGE(to_srgb) {
    r = sk_linear_to_srgb_needs_round(r);
    g = sk_linear_to_srgb_needs_round(g);
    b = sk_linear_to_srgb_needs_round(b);
}

STAGE(from_2dot2) {
    auto from_2dot2 = [](const SkNf& x) {
        // x^(141/64) = x^(2.20312) is a great approximation of the true value, x^(2.2).
        // (note: x^(35/16) = x^(2.1875) is an okay one as well and would be quicker)
        auto x16 = x.rsqrt().rsqrt().rsqrt().rsqrt();   // x^(1/16) = x^(4/64);
        auto x64 = x16.rsqrt().rsqrt();                 // x^(1/64)

        // x^(141/64) = x^(128/64) * x^(12/64) * x^(1/64)
        return SkNf::Max((x*x) * (x16*x16*x16) * (x64), 0.0f);
    };

    r = from_2dot2(r);
    g = from_2dot2(g);
    b = from_2dot2(b);
}
STAGE(to_2dot2) {
    auto to_2dot2 = [](const SkNf& x) {
        // x^(29/64) is a very good approximation of the true value, x^(1/2.2).
        auto x2  = x.rsqrt(),                            // x^(-1/2)
             x32 = x2.rsqrt().rsqrt().rsqrt().rsqrt(),   // x^(-1/32)
             x64 = x32.rsqrt();                          // x^(+1/64)

        // 29 = 32 - 2 - 1
        return SkNf::Max(x2.invert() * x32 * x64.invert(), 0.0f);  // Watch out for NaN.
    };

    r = to_2dot2(r);
    g = to_2dot2(g);
    b = to_2dot2(b);
}

// The default shader produces a constant color (from the SkPaint).
STAGE_CTX(constant_color, const SkPM4f*) {
    r = ctx->r();
    g = ctx->g();
    b = ctx->b();
    a = ctx->a();
}

// Set up registers with values relevant to shaders.
STAGE_CTX(seed_shader, const int*) {
    int y = *ctx;

    static const float dx[] = { 0,1,2,3,4,5,6,7 };
    r = x + 0.5f + SkNf::Load(dx);  // dst pixel center x coordinates
    g = y + 0.5f;                   // dst pixel center y coordinate(s)
    b = 1.0f;
    a = 0.0f;
    dr = dg = db = da = 0.0f;
}

// s' = sc for a scalar c.
STAGE_CTX(scale_1_float, const float*) {
    SkNf c = *ctx;

    r *= c;
    g *= c;
    b *= c;
    a *= c;
}
// s' = sc for 8-bit c.
STAGE_CTX(scale_u8, const uint8_t**) {
    auto ptr = *ctx + x;
    SkNf c = SkNf_from_byte(load(tail, ptr));

    r = r*c;
    g = g*c;
    b = b*c;
    a = a*c;
}

SI SkNf lerp(const SkNf& from, const SkNf& to, const SkNf& cov) {
    return SkNf_fma(to-from, cov, from);
}

// s' = d(1-c) + sc, for a scalar c.
STAGE_CTX(lerp_1_float, const float*) {
    SkNf c = *ctx;

    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

// s' = d(1-c) + sc for 8-bit c.
STAGE_CTX(lerp_u8, const uint8_t**) {
    auto ptr = *ctx + x;
    SkNf c = SkNf_from_byte(load(tail, ptr));

    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}
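// Note on lerp(): s' = d(1-c) + sc rearranges to d + (s-d)*c, which is a single FMA.
// That is exactly lerp(from = d, to = s, cov = c) == SkNf_fma(to-from, cov, from) above.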

// s' = d(1-c) + sc for 565 c.
STAGE_CTX(lerp_565, const uint16_t**) {
    auto ptr = *ctx + x;
    SkNf cr, cg, cb;
    from_565(load(tail, ptr), &cr, &cg, &cb);

    r = lerp(dr, r, cr);
    g = lerp(dg, g, cg);
    b = lerp(db, b, cb);
    a = 1.0f;
}

STAGE_CTX(load_a8, const uint8_t**) {
    auto ptr = *ctx + x;
    r = g = b = 0.0f;
    a = SkNf_from_byte(load(tail, ptr));
}
STAGE_CTX(store_a8, uint8_t**) {
    auto ptr = *ctx + x;
    store(tail, SkNx_cast<uint8_t>(SkNf_round(255.0f, a)), ptr);
}

STAGE_CTX(load_g8, const uint8_t**) {
    auto ptr = *ctx + x;
    r = g = b = SkNf_from_byte(load(tail, ptr));
    a = 1.0f;
}

STAGE_CTX(load_565, const uint16_t**) {
    auto ptr = *ctx + x;
    from_565(load(tail, ptr), &r,&g,&b);
    a = 1.0f;
}
STAGE_CTX(store_565, uint16_t**) {
    auto ptr = *ctx + x;
    store(tail, SkNx_cast<uint16_t>( SkNf_round(r, SK_R16_MASK) << SK_R16_SHIFT
                                   | SkNf_round(g, SK_G16_MASK) << SK_G16_SHIFT
                                   | SkNf_round(b, SK_B16_MASK) << SK_B16_SHIFT), ptr);
}

STAGE_CTX(load_4444, const uint16_t**) {
    auto ptr = *ctx + x;
    from_4444(load(tail, ptr), &r,&g,&b,&a);
}
STAGE_CTX(store_4444, uint16_t**) {
    auto ptr = *ctx + x;
    store(tail, SkNx_cast<uint16_t>( SkNf_round(r, 0xF) << SK_R4444_SHIFT
                                   | SkNf_round(g, 0xF) << SK_G4444_SHIFT
                                   | SkNf_round(b, 0xF) << SK_B4444_SHIFT
                                   | SkNf_round(a, 0xF) << SK_A4444_SHIFT), ptr);
}

STAGE_CTX(load_f16, const uint64_t**) {
    auto ptr = *ctx + x;

    const void* src = ptr;
    SkNx<N, uint64_t> px;
    if (tail) {
        px = load(tail, ptr);
        src = &px;
    }
    from_f16(src, &r, &g, &b, &a);
}
STAGE_CTX(store_f16, uint64_t**) {
    auto ptr = *ctx + x;

    SkNx<N, uint64_t> px;
    SkNh::Store4(tail ? (void*)&px : (void*)ptr, SkFloatToHalf_finite_ftz(r),
                                                 SkFloatToHalf_finite_ftz(g),
                                                 SkFloatToHalf_finite_ftz(b),
                                                 SkFloatToHalf_finite_ftz(a));
    if (tail) {
        store(tail, px, ptr);
    }
}

STAGE_CTX(load_f32, const SkPM4f**) {
    auto ptr = *ctx + x;

    const void* src = ptr;
    SkNx<N, SkPM4f> px;
    if (tail) {
        px = load(tail, ptr);
        src = &px;
    }
    SkNf::Load4(src, &r, &g, &b, &a);
}
STAGE_CTX(store_f32, SkPM4f**) {
    auto ptr = *ctx + x;

    SkNx<N, SkPM4f> px;
    SkNf::Store4(tail ? (void*)&px : (void*)ptr, r,g,b,a);
    if (tail) {
        store(tail, px, ptr);
    }
}

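// The f16/f32 stages above share a staging-buffer pattern for partial batches: Load4()/Store4()
// always transpose a full N pixels, so with a nonzero tail we bounce through a local
// SkNx<N, ...> px -- load() just the valid pixels into px before converting, or convert into
// px and then store() only the valid pixels back out.
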
STAGE_CTX(load_8888, const uint32_t**) {
    auto ptr = *ctx + x;
    from_8888(load(tail, ptr), &r, &g, &b, &a);
}
STAGE_CTX(store_8888, uint32_t**) {
    auto byte = [](const SkNf& x, int ix) {
        // Here's a neat trick: 0x47000000 == 32768.0f, and 0x470000ff == 32768.0f + (255/256.0f).
        auto v = SkNf_fma(255/256.0f, x, 32768.0f);
        switch (ix) {
            case 0: return SkNi::Load(&v) & 0xff;  // R
            case 3: return SkNi::Load(&v) << 24;   // A
        }
        return (SkNi::Load(&v) & 0xff) << (8*ix);  // B or G
    };

    auto ptr = *ctx + x;
    store(tail, byte(r,0)|byte(g,1)|byte(b,2)|byte(a,3), (int*)ptr);
}

STAGE_CTX(load_u16_be, const uint64_t**) {
    auto ptr = *ctx + x;
    const void* src = ptr;
    SkNx<N, uint64_t> px;
    if (tail) {
        px = load(tail, ptr);
        src = &px;
    }

    SkNh rh, gh, bh, ah;
    SkNh::Load4(src, &rh, &gh, &bh, &ah);
    r = (1.0f / 65535.0f) * SkNx_cast<float>((rh << 8) | (rh >> 8));
    g = (1.0f / 65535.0f) * SkNx_cast<float>((gh << 8) | (gh >> 8));
    b = (1.0f / 65535.0f) * SkNx_cast<float>((bh << 8) | (bh >> 8));
    a = (1.0f / 65535.0f) * SkNx_cast<float>((ah << 8) | (ah >> 8));
}

STAGE_CTX(load_rgb_u16_be, const uint16_t**) {
    auto ptr = *ctx + 3*x;
    const void* src = ptr;
    uint16_t buf[N*3] = {0};
    if (tail) {
        memcpy(buf, src, tail*3*sizeof(uint16_t));
        src = buf;
    }

    SkNh rh, gh, bh;
    SkNh::Load3(src, &rh, &gh, &bh);
    r = (1.0f / 65535.0f) * SkNx_cast<float>((rh << 8) | (rh >> 8));
    g = (1.0f / 65535.0f) * SkNx_cast<float>((gh << 8) | (gh >> 8));
    b = (1.0f / 65535.0f) * SkNx_cast<float>((bh << 8) | (bh >> 8));
    a = 1.0f;
}

STAGE_CTX(store_u16_be, uint64_t**) {
    auto to_u16_be = [](const SkNf& x) {
        SkNh x16 = SkNx_cast<uint16_t>(65535.0f * x);
        return (x16 << 8) | (x16 >> 8);
    };

    auto ptr = *ctx + x;
    SkNx<N, uint64_t> px;
    SkNh::Store4(tail ? (void*)&px : (void*)ptr, to_u16_be(r),
                                                 to_u16_be(g),
                                                 to_u16_be(b),
                                                 to_u16_be(a));
    if (tail) {
        store(tail, px, ptr);
    }
}

STAGE_CTX(load_tables, const LoadTablesContext*) {
    auto ptr = (const uint32_t*)ctx->fSrc + x;

    SkNu rgba = load(tail, ptr);
    auto to_int = [](const SkNu& v) { return SkNi::Load(&v); };
    r = gather(tail, ctx->fR, to_int((rgba >>  0) & 0xff));
    g = gather(tail, ctx->fG, to_int((rgba >>  8) & 0xff));
    b = gather(tail, ctx->fB, to_int((rgba >> 16) & 0xff));
    a = SkNf_from_byte(rgba >> 24);
}

STAGE_CTX(load_tables_u16_be, const LoadTablesContext*) {
    auto ptr = (const uint64_t*)ctx->fSrc + x;
    const void* src = ptr;
    SkNx<N, uint64_t> px;
    if (tail) {
        px = load(tail, ptr);
        src = &px;
    }

    SkNh rh, gh, bh, ah;
    SkNh::Load4(src, &rh, &gh, &bh, &ah);

    // ctx->fSrc is big-endian, so "& 0xff" grabs the 8 most significant bits of each component.
    r = gather(tail, ctx->fR, SkNx_cast<int>(rh & 0xff));
    g = gather(tail, ctx->fG, SkNx_cast<int>(gh & 0xff));
    b = gather(tail, ctx->fB, SkNx_cast<int>(bh & 0xff));
    a = (1.0f / 65535.0f) * SkNx_cast<float>((ah << 8) | (ah >> 8));
}

STAGE_CTX(load_tables_rgb_u16_be, const LoadTablesContext*) {
    auto ptr = (const uint16_t*)ctx->fSrc + 3*x;
    const void* src = ptr;
    uint16_t buf[N*3] = {0};
    if (tail) {
        memcpy(buf, src, tail*3*sizeof(uint16_t));
        src = buf;
    }

    SkNh rh, gh, bh;
    SkNh::Load3(src, &rh, &gh, &bh);

    // ctx->fSrc is big-endian, so "& 0xff" grabs the 8 most significant bits of each component.
    r = gather(tail, ctx->fR, SkNx_cast<int>(rh & 0xff));
    g = gather(tail, ctx->fG, SkNx_cast<int>(gh & 0xff));
    b = gather(tail, ctx->fB, SkNx_cast<int>(bh & 0xff));
    a = 1.0f;
}

SI SkNf inv(const SkNf& x) { return 1.0f - x; }

RGBA_XFERMODE(clear)    { return 0.0f; }
RGBA_XFERMODE(srcatop)  { return s*da + d*inv(sa); }
RGBA_XFERMODE(srcin)    { return s * da; }
RGBA_XFERMODE(srcout)   { return s * inv(da); }
RGBA_XFERMODE(srcover)  { return SkNf_fma(d, inv(sa), s); }
RGBA_XFERMODE(dstatop)  { return srcatop_kernel(d,da,s,sa); }
RGBA_XFERMODE(dstin)    { return srcin_kernel  (d,da,s,sa); }
RGBA_XFERMODE(dstout)   { return srcout_kernel (d,da,s,sa); }
RGBA_XFERMODE(dstover)  { return srcover_kernel(d,da,s,sa); }

RGBA_XFERMODE(modulate) { return s*d; }
RGBA_XFERMODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; }
RGBA_XFERMODE(plus_)    { return s + d; }
RGBA_XFERMODE(screen)   { return s + d - s*d; }
RGBA_XFERMODE(xor_)     { return s*inv(da) + d*inv(sa); }

RGB_XFERMODE(colorburn) {
    return (d == da  ).thenElse(d + s*inv(da),
           (s == 0.0f).thenElse(s + d*inv(sa),
                                sa*(da - SkNf::Min(da, (da-d)*sa/s)) + s*inv(da) + d*inv(sa)));
}
RGB_XFERMODE(colordodge) {
    return (d == 0.0f).thenElse(d + s*inv(da),
           (s == sa  ).thenElse(s + d*inv(sa),
                                sa*SkNf::Min(da, (d*sa)/(sa - s)) + s*inv(da) + d*inv(sa)));
}
RGB_XFERMODE(darken)     { return s + d - SkNf::Max(s*da, d*sa); }
RGB_XFERMODE(difference) { return s + d - 2.0f*SkNf::Min(s*da,d*sa); }
RGB_XFERMODE(exclusion)  { return s + d - 2.0f*s*d; }
RGB_XFERMODE(hardlight) {
    return s*inv(da) + d*inv(sa)
         + (2.0f*s <= sa).thenElse(2.0f*s*d, sa*da - 2.0f*(da-d)*(sa-s));
}
RGB_XFERMODE(lighten) { return s + d - SkNf::Min(s*da, d*sa); }
RGB_XFERMODE(overlay) { return hardlight_kernel(d,da,s,sa); }
RGB_XFERMODE(softlight) {
    SkNf m  = (da > 0.0f).thenElse(d / da, 0.0f),
         s2 = 2.0f*s,
         m4 = 4.0f*m;

    // The logic forks three ways:
    //    1. dark src?
    //    2. light src, dark dst?
    //    3. light src, light dst?
    SkNf darkSrc = d*(sa + (s2 - sa)*(1.0f - m)),      // Used in case 1.
         darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m,   // Used in case 2.
         liteDst = m.rsqrt().invert() - m,             // Used in case 3.
         liteSrc = d*sa + da*(s2 - sa) * (4.0f*d <= da).thenElse(darkDst, liteDst);  // 2 or 3?
    return s*inv(da) + d*inv(sa) + (s2 <= sa).thenElse(darkSrc, liteSrc);            // 1 or (2 or 3)?
}

STAGE(luminance_to_alpha) {
    a = SK_LUM_COEFF_R*r + SK_LUM_COEFF_G*g + SK_LUM_COEFF_B*b;
    r = g = b = 0;
}

STAGE(rgb_to_hsl) {
    auto max = SkNf::Max(SkNf::Max(r, g), b);
    auto min = SkNf::Min(SkNf::Min(r, g), b);
    auto l = 0.5f * (max + min);

    auto d = max - min;
    auto d_inv = 1.0f/d;
    auto s = (max == min).thenElse(0.0f,
        d/(l > 0.5f).thenElse(2.0f - max - min, max + min));
    SkNf h = (max != r).thenElse(0.0f,
        (g - b)*d_inv + (g < b).thenElse(6.0f, 0.0f));
    h = (max == g).thenElse((b - r)*d_inv + 2.0f, h);
    h = (max == b).thenElse((r - g)*d_inv + 4.0f, h);
    h *= (1/6.0f);

    h = (max == min).thenElse(0.0f, h);

    r = h;
    g = s;
    b = l;
}

STAGE(hsl_to_rgb) {
    auto h = r;
    auto s = g;
    auto l = b;
    auto q = (l < 0.5f).thenElse(l*(1.0f + s), l + s - l*s);
    auto p = 2.0f*l - q;

    auto hue_to_rgb = [](const SkNf& p, const SkNf& q, const SkNf& t) {
        auto t2 = (t < 0.0f).thenElse(t + 1.0f, (t > 1.0f).thenElse(t - 1.0f, t));
        return (t2 < (1/6.0f)).thenElse(
            p + (q - p)*6.0f*t, (t2 < (3/6.0f)).thenElse(
                q, (t2 < (4/6.0f)).thenElse(
                    p + (q - p)*((4/6.0f) - t2)*6.0f, p)));
    };

    r = (s == 0.f).thenElse(l, hue_to_rgb(p, q, h + (1/3.0f)));
    g = (s == 0.f).thenElse(l, hue_to_rgb(p, q, h));
    b = (s == 0.f).thenElse(l, hue_to_rgb(p, q, h - (1/3.0f)));
}

STAGE_CTX(matrix_2x3, const float*) {
    auto m = ctx;

    auto R = SkNf_fma(r,m[0], SkNf_fma(g,m[2], m[4])),
         G = SkNf_fma(r,m[1], SkNf_fma(g,m[3], m[5]));
    r = R;
    g = G;
}
STAGE_CTX(matrix_3x4, const float*) {
    auto m = ctx;

    auto R = SkNf_fma(r,m[0], SkNf_fma(g,m[3], SkNf_fma(b,m[6], m[ 9]))),
         G = SkNf_fma(r,m[1], SkNf_fma(g,m[4], SkNf_fma(b,m[7], m[10]))),
         B = SkNf_fma(r,m[2], SkNf_fma(g,m[5], SkNf_fma(b,m[8], m[11])));
    r = R;
    g = G;
    b = B;
}
STAGE_CTX(matrix_4x5, const float*) {
    auto m = ctx;

    auto R = SkNf_fma(r,m[0], SkNf_fma(g,m[4], SkNf_fma(b,m[ 8], SkNf_fma(a,m[12], m[16])))),
         G = SkNf_fma(r,m[1], SkNf_fma(g,m[5], SkNf_fma(b,m[ 9], SkNf_fma(a,m[13], m[17])))),
         B = SkNf_fma(r,m[2], SkNf_fma(g,m[6], SkNf_fma(b,m[10], SkNf_fma(a,m[14], m[18])))),
         A = SkNf_fma(r,m[3], SkNf_fma(g,m[7], SkNf_fma(b,m[11], SkNf_fma(a,m[15], m[19]))));
    r = R;
    g = G;
    b = B;
    a = A;
}
STAGE_CTX(matrix_perspective, const float*) {
    // N.B. unlike the matrix_NxM stages, this takes a row-major matrix.
    auto m = ctx;

    auto R = SkNf_fma(r,m[0], SkNf_fma(g,m[1], m[2])),
         G = SkNf_fma(r,m[3], SkNf_fma(g,m[4], m[5])),
         Z = SkNf_fma(r,m[6], SkNf_fma(g,m[7], m[8]));
    r = R * Z.invert();
    g = G * Z.invert();
}

SI SkNf parametric(const SkNf& v, const SkColorSpaceTransferFn& p) {
    float result[N];   // Unconstrained powf() doesn't vectorize well...
    for (int i = 0; i < N; i++) {
        float s = v[i];
        result[i] = (s <= p.fD) ? p.fC * s + p.fF
                                : powf(s * p.fA + p.fB, p.fG) + p.fE;
    }
    // Clamp the output to [0, 1].
    // Max(NaN, 0) = 0, but Max(0, NaN) = NaN, so we want this exact order to ensure NaN => 0
    return SkNf::Min(SkNf::Max(SkNf::Load(result), 0.0f), 1.0f);
}
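// As an illustrative data point (values not defined in this file): the sRGB transfer function
// fits this parametric form with G=2.4, A=1/1.055, B=0.055/1.055, C=1/12.92, D=0.04045,
// E=F=0, i.e. v/12.92 below the breakpoint and ((v+0.055)/1.055)^2.4 above it.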
STAGE_CTX(parametric_r, const SkColorSpaceTransferFn*) { r = parametric(r, *ctx); }
STAGE_CTX(parametric_g, const SkColorSpaceTransferFn*) { g = parametric(g, *ctx); }
STAGE_CTX(parametric_b, const SkColorSpaceTransferFn*) { b = parametric(b, *ctx); }
STAGE_CTX(parametric_a, const SkColorSpaceTransferFn*) { a = parametric(a, *ctx); }

SI SkNf table(const SkNf& v, const SkTableTransferFn& table) {
    float result[N];
    for (int i = 0; i < N; i++) {
        result[i] = interp_lut(v[i], table.fData, table.fSize);
    }
    // no need to clamp - tables are by-design [0,1] -> [0,1]
    return SkNf::Load(result);
}
STAGE_CTX(table_r, const SkTableTransferFn*) { r = table(r, *ctx); }
STAGE_CTX(table_g, const SkTableTransferFn*) { g = table(g, *ctx); }
STAGE_CTX(table_b, const SkTableTransferFn*) { b = table(b, *ctx); }
STAGE_CTX(table_a, const SkTableTransferFn*) { a = table(a, *ctx); }

STAGE_CTX(color_lookup_table, const SkColorLookUpTable*) {
    const SkColorLookUpTable* colorLUT = ctx;
    SkASSERT(3 == colorLUT->inputChannels() || 4 == colorLUT->inputChannels());
    SkASSERT(3 == colorLUT->outputChannels());
    float result[3][N];
    for (int i = 0; i < N; ++i) {
        const float in[4] = { r[i], g[i], b[i], a[i] };
        float out[3];
        colorLUT->interp(out, in);
        for (int j = 0; j < colorLUT->outputChannels(); ++j) {
            result[j][i] = out[j];
        }
    }
    r = SkNf::Load(result[0]);
    g = SkNf::Load(result[1]);
    b = SkNf::Load(result[2]);
    if (4 == colorLUT->inputChannels()) {
        // we must set the pixel to opaque, as the alpha channel was used
        // as input before this.
        a = 1.f;
    }
}

STAGE(lab_to_xyz) {
    const auto lab_l = r * 100.0f;
    const auto lab_a = g * 255.0f - 128.0f;
    const auto lab_b = b * 255.0f - 128.0f;
    auto Y = (lab_l + 16.0f) * (1/116.0f);
    auto X = lab_a * (1/500.0f) + Y;
    auto Z = Y - (lab_b * (1/200.0f));

    const auto X3 = X*X*X;
    X = (X3 > 0.008856f).thenElse(X3, (X - (16/116.0f)) * (1/7.787f));
    const auto Y3 = Y*Y*Y;
    Y = (Y3 > 0.008856f).thenElse(Y3, (Y - (16/116.0f)) * (1/7.787f));
    const auto Z3 = Z*Z*Z;
    Z = (Z3 > 0.008856f).thenElse(Z3, (Z - (16/116.0f)) * (1/7.787f));

    // adjust to D50 illuminant
    X *= 0.96422f;
    Y *= 1.00000f;
    Z *= 0.82521f;

    r = X;
    g = Y;
    b = Z;
}

SI SkNf assert_in_tile(const SkNf& v, float limit) {
    for (int i = 0; i < N; i++) {
        SkASSERT(0 <= v[i] && v[i] < limit);
    }
    return v;
}

SI SkNf ulp_before(float v) {
    SkASSERT(v > 0);
    SkNf vs(v);
    SkNu uvs = SkNu::Load(&vs) - 1;
    return SkNf::Load(&uvs);
}

SI SkNf clamp(const SkNf& v, float limit) {
    SkNf result = SkNf::Max(0, SkNf::Min(v, ulp_before(limit)));
    return assert_in_tile(result, limit);
}
SI SkNf repeat(const SkNf& v, float limit) {
    SkNf result = v - (v/limit).floor()*limit;
    // For small negative v, (v/limit).floor()*limit can dominate v in the subtraction,
    // which leaves result == limit.  We want result < limit, so clamp it one ULP.
    result = SkNf::Min(result, ulp_before(limit));
    return assert_in_tile(result, limit);
}
SI SkNf mirror(const SkNf& v, float l/*imit*/) {
    SkNf result = ((v - l) - ((v - l) / (2*l)).floor()*(2*l) - l).abs();
    // Same deal as repeat.
    result = SkNf::Min(result, ulp_before(l));
    return assert_in_tile(result, l);
}
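// Quick sanity checks with limit == 4: clamp(5.5, 4) pins to just under 4.0,
// repeat(5.5, 4) == 1.5 (5.5 - 1*4), and mirror(5.5, 4) == 2.5 (reflected back from 8).
// Negative inputs work too: repeat(-0.5, 4) == 3.5 and mirror(-0.5, 4) == 0.5.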
STAGE_CTX( clamp_x, const float*) { r = clamp (r, *ctx); }
STAGE_CTX(repeat_x, const float*) { r = repeat(r, *ctx); }
STAGE_CTX(mirror_x, const float*) { r = mirror(r, *ctx); }
STAGE_CTX( clamp_y, const float*) { g = clamp (g, *ctx); }
STAGE_CTX(repeat_y, const float*) { g = repeat(g, *ctx); }
STAGE_CTX(mirror_y, const float*) { g = mirror(g, *ctx); }

STAGE_CTX(save_xy, SkImageShaderContext*) {
    r.store(ctx->x);
    g.store(ctx->y);

    // Whether bilinear or bicubic, all sample points have the same fractional offset (fx,fy).
    // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid
    // surrounding (x,y), all (0.5,0.5) off-center.
    auto fract = [](const SkNf& v) { return v - v.floor(); };
    fract(r + 0.5f).store(ctx->fx);
    fract(g + 0.5f).store(ctx->fy);
}

STAGE_CTX(accumulate, const SkImageShaderContext*) {
    // Bilinear and bicubic filtering are both separable, so we'll end up with independent
    // scale contributions in x and y that we multiply together to get each pixel's scale factor.
    auto scale = SkNf::Load(ctx->scalex) * SkNf::Load(ctx->scaley);
    dr = SkNf_fma(scale, r, dr);
    dg = SkNf_fma(scale, g, dg);
    db = SkNf_fma(scale, b, db);
    da = SkNf_fma(scale, a, da);
}

// In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
// are combined in direct proportion to their area overlapping that logical query pixel.
// At positive offsets, the x-axis contribution to that rectangular area is fx; (1-fx)
// at negative x offsets.  The y-axis is treated symmetrically.
template <int Scale>
SI void bilinear_x(SkImageShaderContext* ctx, SkNf* x) {
    *x = SkNf::Load(ctx->x) + Scale*0.5f;
    auto fx = SkNf::Load(ctx->fx);
    (Scale > 0 ? fx : (1.0f - fx)).store(ctx->scalex);
}
template <int Scale>
SI void bilinear_y(SkImageShaderContext* ctx, SkNf* y) {
    *y = SkNf::Load(ctx->y) + Scale*0.5f;
    auto fy = SkNf::Load(ctx->fy);
    (Scale > 0 ? fy : (1.0f - fy)).store(ctx->scaley);
}
STAGE_CTX(bilinear_nx, SkImageShaderContext*) { bilinear_x<-1>(ctx, &r); }
STAGE_CTX(bilinear_px, SkImageShaderContext*) { bilinear_x<+1>(ctx, &r); }
STAGE_CTX(bilinear_ny, SkImageShaderContext*) { bilinear_y<-1>(ctx, &g); }
STAGE_CTX(bilinear_py, SkImageShaderContext*) { bilinear_y<+1>(ctx, &g); }

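// For example, with fx == 0.25 and fy == 0.75 the four bilinear weights are
//    (+x,+y) 0.25*0.75, (+x,-y) 0.25*0.25, (-x,+y) 0.75*0.75, (-x,-y) 0.75*0.25,
// which sum to 1, so accumulate() builds an unscaled weighted average.
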
// In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
// pixel center are combined with a non-uniform cubic filter, with high filter values near
// the center and lower values farther away.
//
// We break this filter function into two parts, one for near +/- 0.5 offsets,
// and one for far +/- 1.5 offsets.
//
// See GrBicubicEffect for details about this particular Mitchell-Netravali filter.
SI SkNf bicubic_near(const SkNf& t) {
    // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18
    return SkNf_fma(t, SkNf_fma(t, SkNf_fma(-21/18.0f, t, 27/18.0f), 9/18.0f), 1/18.0f);
}
SI SkNf bicubic_far(const SkNf& t) {
    // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18)
    return (t*t)*SkNf_fma(7/18.0f, t, -6/18.0f);
}

template <int Scale>
SI void bicubic_x(SkImageShaderContext* ctx, SkNf* x) {
    *x = SkNf::Load(ctx->x) + Scale*0.5f;
    auto fx = SkNf::Load(ctx->fx);
    if (Scale == -3) { return bicubic_far (1.0f - fx).store(ctx->scalex); }
    if (Scale == -1) { return bicubic_near(1.0f - fx).store(ctx->scalex); }
    if (Scale == +1) { return bicubic_near(       fx).store(ctx->scalex); }
    if (Scale == +3) { return bicubic_far (       fx).store(ctx->scalex); }
    SkDEBUGFAIL("unreachable");
}
template <int Scale>
SI void bicubic_y(SkImageShaderContext* ctx, SkNf* y) {
    *y = SkNf::Load(ctx->y) + Scale*0.5f;
    auto fy = SkNf::Load(ctx->fy);
    if (Scale == -3) { return bicubic_far (1.0f - fy).store(ctx->scaley); }
    if (Scale == -1) { return bicubic_near(1.0f - fy).store(ctx->scaley); }
    if (Scale == +1) { return bicubic_near(       fy).store(ctx->scaley); }
    if (Scale == +3) { return bicubic_far (       fy).store(ctx->scaley); }
    SkDEBUGFAIL("unreachable");
}
STAGE_CTX(bicubic_n3x, SkImageShaderContext*) { bicubic_x<-3>(ctx, &r); }
STAGE_CTX(bicubic_n1x, SkImageShaderContext*) { bicubic_x<-1>(ctx, &r); }
STAGE_CTX(bicubic_p1x, SkImageShaderContext*) { bicubic_x<+1>(ctx, &r); }
STAGE_CTX(bicubic_p3x, SkImageShaderContext*) { bicubic_x<+3>(ctx, &r); }

STAGE_CTX(bicubic_n3y, SkImageShaderContext*) { bicubic_y<-3>(ctx, &g); }
STAGE_CTX(bicubic_n1y, SkImageShaderContext*) { bicubic_y<-1>(ctx, &g); }
STAGE_CTX(bicubic_p1y, SkImageShaderContext*) { bicubic_y<+1>(ctx, &g); }
STAGE_CTX(bicubic_p3y, SkImageShaderContext*) { bicubic_y<+3>(ctx, &g); }


template <typename T>
SI SkNi offset_and_ptr(T** ptr, const SkImageShaderContext* ctx, const SkNf& x, const SkNf& y) {
    SkNi ix = SkNx_cast<int>(x),
         iy = SkNx_cast<int>(y);
    SkNi offset = iy*ctx->stride + ix;

    *ptr = (const T*)ctx->pixels;
    return offset;
}

STAGE_CTX(gather_a8, const SkImageShaderContext*) {
    const uint8_t* p;
    SkNi offset = offset_and_ptr(&p, ctx, r, g);

    r = g = b = 0.0f;
    a = SkNf_from_byte(gather(tail, p, offset));
}
STAGE_CTX(gather_i8, const SkImageShaderContext*) {
    const uint8_t* p;
    SkNi offset = offset_and_ptr(&p, ctx, r, g);

    SkNi ix = SkNx_cast<int>(gather(tail, p, offset));
    from_8888(gather(tail, ctx->ctable->readColors(), ix), &r, &g, &b, &a);
}
STAGE_CTX(gather_g8, const SkImageShaderContext*) {
    const uint8_t* p;
    SkNi offset = offset_and_ptr(&p, ctx, r, g);

    r = g = b = SkNf_from_byte(gather(tail, p, offset));
    a = 1.0f;
}
STAGE_CTX(gather_565, const SkImageShaderContext*) {
    const uint16_t* p;
    SkNi offset = offset_and_ptr(&p, ctx, r, g);

    from_565(gather(tail, p, offset), &r, &g, &b);
    a = 1.0f;
}
STAGE_CTX(gather_4444, const SkImageShaderContext*) {
    const uint16_t* p;
    SkNi offset = offset_and_ptr(&p, ctx, r, g);

    from_4444(gather(tail, p, offset), &r, &g, &b, &a);
}
STAGE_CTX(gather_8888, const SkImageShaderContext*) {
    const uint32_t* p;
    SkNi offset = offset_and_ptr(&p, ctx, r, g);

    from_8888(gather(tail, p, offset), &r, &g, &b, &a);
}
STAGE_CTX(gather_f16, const SkImageShaderContext*) {
    const uint64_t* p;
    SkNi offset = offset_and_ptr(&p, ctx, r, g);

    auto px = gather(tail, p, offset);
    from_f16(&px, &r, &g, &b, &a);
}

STAGE_CTX(linear_gradient_2stops, const SkPM4f*) {
    auto t = r;
    SkPM4f c0 = ctx[0],
           dc = ctx[1];

    r = SkNf_fma(t, dc.r(), c0.r());
    g = SkNf_fma(t, dc.g(), c0.g());
    b = SkNf_fma(t, dc.b(), c0.b());
    a = SkNf_fma(t, dc.a(), c0.a());
}

STAGE_CTX(byte_tables, const void*) {
    struct Tables { const uint8_t *r, *g, *b, *a; };
    auto tables = (const Tables*)ctx;

    r = SkNf_from_byte(gather(tail, tables->r, SkNf_round(255.0f, r)));
    g = SkNf_from_byte(gather(tail, tables->g, SkNf_round(255.0f, g)));
    b = SkNf_from_byte(gather(tail, tables->b, SkNf_round(255.0f, b)));
    a = SkNf_from_byte(gather(tail, tables->a, SkNf_round(255.0f, a)));
}

STAGE_CTX(byte_tables_rgb, const void*) {
    struct Tables { const uint8_t *r, *g, *b; int n; };
    auto tables = (const Tables*)ctx;

    float scale = tables->n - 1;
    r = SkNf_from_byte(gather(tail, tables->r, SkNf_round(scale, r)));
    g = SkNf_from_byte(gather(tail, tables->g, SkNf_round(scale, g)));
    b = SkNf_from_byte(gather(tail, tables->b, SkNf_round(scale, b)));
}

STAGE_CTX(shader_adapter, SkShader::Context*) {
    SkPM4f buf[N];
    static_assert(sizeof(buf) == sizeof(r) + sizeof(g) + sizeof(b) + sizeof(a), "");
    ctx->shadeSpan4f(x, (int)g[0], buf, N);
    SkNf::Load4(buf, &r, &g, &b, &a);
}

SI Fn enum_to_Fn(SkRasterPipeline::StockStage st) {
    switch (st) {
    #define M(stage) case SkRasterPipeline::stage: return stage;
        SK_RASTER_PIPELINE_STAGES(M)
    #undef M
    }
    SkASSERT(false);
    return just_return;
}

namespace {

    static void build_program(void** program, const SkRasterPipeline::Stage* stages, int nstages) {
        for (int i = 0; i < nstages; i++) {
            *program++ = (void*)enum_to_Fn(stages[i].stage);
            if (stages[i].ctx) {
                *program++ = stages[i].ctx;
            }
        }
        *program++ = (void*)just_return;
    }

    static void run_program(void** program, size_t x, size_t n) {
        SkNf u;  // fastest to start uninitialized.

        auto start = (Fn)load_and_increment(&program);
        while (n >= N) {
            start(x*N, program, u,u,u,u, u,u,u,u);
            x += N;
            n -= N;
        }
        if (n) {
            start(x*N+n, program, u,u,u,u, u,u,u,u);
        }
    }
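    // A small worked example (names illustrative): a one-stage pipeline { load_8888 with a
    // src context } builds the three-slot program { (void*)load_8888, src_ctx, (void*)just_return }.
    // Running it over x == 0, n == 10 with N == 4 calls start() with x_tail == 0 and 16 for the
    // two full batches, then x_tail == 8*4 + 2 == 34 for the final two pixels.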

    // Compiled manages its memory manually because it's not safe to use
    // std::vector, SkTDArray, etc without setting us up for big ODR violations.
    struct Compiled {
        Compiled(const SkRasterPipeline::Stage* stages, int nstages) {
            int slots = nstages + 1;  // One extra for just_return.
            for (int i = 0; i < nstages; i++) {
                if (stages[i].ctx) {
                    slots++;
                }
            }
            fProgram = (void**)sk_malloc_throw(slots * sizeof(void*));
            build_program(fProgram, stages, nstages);
        }
        ~Compiled() { sk_free(fProgram); }

        Compiled(const Compiled& o) {
            int slots = 0;
            while (o.fProgram[slots++] != (void*)just_return);

            fProgram = (void**)sk_malloc_throw(slots * sizeof(void*));
            memcpy(fProgram, o.fProgram, slots * sizeof(void*));
        }

        void operator()(size_t x, size_t n) {
            run_program(fProgram, x, n);
        }

        void** fProgram;
    };
}  // namespace

namespace SK_OPTS_NS {

    SI void run_pipeline(size_t x, size_t n,
                         const SkRasterPipeline::Stage* stages, int nstages) {
        static const int kStackMax = 256;
        // Worst case is nstages stages with nstages context pointers, and just_return.
        if (2*nstages+1 <= kStackMax) {
            void* program[kStackMax];
            build_program(program, stages, nstages);
            run_program(program, x,n);
        } else {
            Compiled{stages,nstages}(x,n);
        }
    }

}  // namespace SK_OPTS_NS

#undef SI
#undef STAGE
#undef STAGE_CTX
#undef RGBA_XFERMODE
#undef RGB_XFERMODE

#endif//SkRasterPipeline_opts_DEFINED