SkJumper_stages_lowp.cpp revision e95a62faa0e615af3971981040fe0f90e8a489f5
/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

// This restricted SkJumper backend works on 8-bit per channel pixels stored in
// 16-bit channels.  This is a last attempt to write a performant low-precision
// backend with stage definitions that can be shared by x86 and ARM.

#include "SkJumper.h"
#include "SkJumper_misc.h"

#if defined(__clang__)  // This file is empty when not compiled by Clang.

#if defined(__ARM_NEON)
    #include <arm_neon.h>
    #if defined(__arm__)
        #define ABI __attribute__((pcs("aapcs-vfp")))
    #else
        #define ABI
    #endif
#elif defined(__SSE2__)
    #include <immintrin.h>
    #define ABI
#else
    #include <math.h>
    #define ABI
#endif

#if !defined(JUMPER_IS_OFFLINE)
    #define WRAP(name) sk_##name##_lowp
#elif defined(__AVX2__)
    #define WRAP(name) sk_##name##_hsw_lowp
#elif defined(__SSE4_1__)
    #define WRAP(name) sk_##name##_sse41_lowp
#elif defined(__SSE2__)
    #define WRAP(name) sk_##name##_sse2_lowp
#endif

#if defined(__AVX2__)
    using U8  = uint8_t  __attribute__((ext_vector_type(16)));
    using U16 = uint16_t __attribute__((ext_vector_type(16)));
    using I16 =  int16_t __attribute__((ext_vector_type(16)));
    using I32 =  int32_t __attribute__((ext_vector_type(16)));
    using U32 = uint32_t __attribute__((ext_vector_type(16)));
    using F   = float    __attribute__((ext_vector_type(16)));
#else
    using U8  = uint8_t  __attribute__((ext_vector_type(8)));
    using U16 = uint16_t __attribute__((ext_vector_type(8)));
    using I16 =  int16_t __attribute__((ext_vector_type(8)));
    using I32 =  int32_t __attribute__((ext_vector_type(8)));
    using U32 = uint32_t __attribute__((ext_vector_type(8)));
    using F   = float    __attribute__((ext_vector_type(8)));
#endif

static const size_t N = sizeof(U16) / sizeof(uint16_t);

// We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64.
using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
                          U16  r, U16  g, U16  b, U16  a,
                          U16 dr, U16 dg, U16 db, U16 da);

MAYBE_MSABI
ABI extern "C" void WRAP(start_pipeline)(const size_t x0,
                                         const size_t y0,
                                         const size_t xlimit,
                                         const size_t ylimit,
                                         void** program) {
    auto start = (Stage)load_and_inc(program);
    for (size_t dy = y0; dy < ylimit; dy++) {
        size_t dx = x0;
        for (; dx + N <= xlimit; dx += N) {
            start(   0,program,dx,dy, 0,0,0,0, 0,0,0,0);
        }
        if (size_t tail = xlimit - dx) {
            start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
        }
    }
}

ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
                                      U16,U16,U16,U16, U16,U16,U16,U16) {}

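// Roughly how a program is laid out and driven: an array of void*, alternating a stage's
// function pointer with its context pointer (stages declared with Ctx::None take no context
// slot), terminated by just_return.  A sketch only -- the names load_ctx and store_ctx are
// made up here, and the real array is assembled by the SkRasterPipeline machinery:
//
//     void* program[] = {
//         (void*)WRAP(load_8888),  (void*)&load_ctx,   // an SkJumper_MemoryCtx for the source
//         (void*)WRAP(swap_rb),                        // no context needed
//         (void*)WRAP(store_8888), (void*)&store_ctx,
//         (void*)WRAP(just_return),
//     };
//     WRAP(start_pipeline)(x0,y0, xlimit,ylimit, program);
//
// Each stage pops what it needs with load_and_inc() and then tail-calls the next stage.
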
// All stages use the same function call ABI to chain into each other, but there are three types:
//   GG: geometry in, geometry out  -- think, a matrix
//   GP: geometry in, pixels out.   -- think, a memory gather
//   PP: pixels in, pixels out.     -- think, a blend mode
//
// (Some stages ignore their inputs or produce no logical output.  That's perfectly fine.)
//
// These three STAGE_ macros let you define each type of stage,
// and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.

#define STAGE_GG(name, ...)                                                            \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y);      \
    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        auto x = join<F>(r,g),                                                         \
             y = join<F>(b,a);                                                         \
        name##_k(Ctx{program}, dx,dy,tail, x,y);                                       \
        split(x, &r,&g);                                                               \
        split(y, &b,&a);                                                               \
        auto next = (Stage)load_and_inc(program);                                      \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
    }                                                                                  \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)

#define STAGE_GP(name, ...)                                                            \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da);                              \
    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        auto x = join<F>(r,g),                                                         \
             y = join<F>(b,a);                                                         \
        name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da);                 \
        auto next = (Stage)load_and_inc(program);                                      \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
    }                                                                                  \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da)

#define STAGE_PP(name, ...)                                                            \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da);                              \
    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da);                      \
        auto next = (Stage)load_and_inc(program);                                      \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
    }                                                                                  \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da)

// ~~~~~~ Commonly used helper functions ~~~~~~ //

SI U16 div255(U16 v) {
#if 0
    return (v+127)/255;  // The ideal rounding divide by 255.
#else
    return (v+255)/256;  // A good approximation of (v+127)/255.
#endif
}

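// A quick way to convince yourself the approximation above is fine: over the whole range of
// 16-bit products (0 through 255*255) it is never more than 1 off from the exact rounding
// divide, and it agrees exactly at 0 and at 255*255.  A throwaway check, not part of the
// pipeline itself:
//
//     for (int v = 0; v <= 255*255; v++) {
//         int exact  = (v + 127) / 255,
//             approx = (v + 255) / 256;
//         assert(abs(approx - exact) <= 1);
//     }
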
SI U16 inv(U16 v) { return 255-v; }

SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); }
SI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) | (e & ~c); }

SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); }
SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); }
SI U16 max(U16 x, U16 y, U16 z) { return max(x, max(y, z)); }
SI U16 min(U16 x, U16 y, U16 z) { return min(x, min(y, z)); }

SI U16 from_float(float f) { return f * 255.0f + 0.5f; }

SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); }

template <typename D, typename S>
SI D cast(S src) {
    return __builtin_convertvector(src, D);
}

template <typename D, typename S>
SI void split(S v, D* lo, D* hi) {
    static_assert(2*sizeof(D) == sizeof(S), "");
    memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D));
    memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D));
}
template <typename D, typename S>
SI D join(S lo, S hi) {
    static_assert(sizeof(D) == 2*sizeof(S), "");
    D v;
    memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S));
    memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
    return v;
}
template <typename V, typename H>
SI V map(V v, H (*fn)(H)) {
    H lo,hi;
    split(v, &lo,&hi);
    lo = fn(lo);
    hi = fn(hi);
    return join<V>(lo,hi);
}

// TODO: do we need platform-specific intrinsics for any of these?
SI F if_then_else(I32 c, F t, F e) {
    return bit_cast<F>( (bit_cast<I32>(t) & c) | (bit_cast<I32>(e) & ~c) );
}
SI F max(F x, F y) { return if_then_else(x < y, y, x); }
SI F min(F x, F y) { return if_then_else(x < y, x, y); }

SI F mad(F f, F m, F a) { return f*m+a; }
SI U32 trunc_(F x) { return (U32)cast<I32>(x); }

SI F rcp(F x) {
#if defined(__AVX2__)
    return map(x, _mm256_rcp_ps);
#elif defined(__SSE__)
    return map(x, _mm_rcp_ps);
#elif defined(__ARM_NEON)
    return map(x, +[](float32x4_t v) {
        auto est = vrecpeq_f32(v);
        return vrecpsq_f32(v,est)*est;
    });
#else
    return 1.0f / x;
#endif
}
SI F sqrt_(F x) {
#if defined(__AVX2__)
    return map(x, _mm256_sqrt_ps);
#elif defined(__SSE__)
    return map(x, _mm_sqrt_ps);
#elif defined(__aarch64__)
    return map(x, vsqrtq_f32);
#elif defined(__ARM_NEON)
    return map(x, +[](float32x4_t v) {
        auto est = vrsqrteq_f32(v);  // Estimate and two refinement steps for est = rsqrt(v).
        est *= vrsqrtsq_f32(v,est*est);
        est *= vrsqrtsq_f32(v,est*est);
        return v*est;                // sqrt(v) == v*rsqrt(v).
    });
#else
    return F{
        sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]),
        sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]),
    };
#endif
}

SI F floor_(F x) {
#if defined(__aarch64__)
    return map(x, vrndmq_f32);
#elif defined(__AVX2__)
    return map(x, +[](__m256 v){ return _mm256_floor_ps(v); });  // _mm256_floor_ps is a macro...
#elif defined(__SSE4_1__)
    return map(x, +[](__m128 v){ return _mm_floor_ps(v); });     // _mm_floor_ps() is a macro too.
#else
    F roundtrip = cast<F>(cast<I32>(x));
    return roundtrip - if_then_else(roundtrip > x, F(1), F(0));
#endif
}
SI F abs_(F x) { return bit_cast<F>( bit_cast<I32>(x) & 0x7fffffff ); }

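// The fallback branch of floor_() above works because cast<I32>() truncates toward zero: for
// example x = -0.5 round-trips to 0.0, which is greater than x, so we subtract 1 and land on
// the correct floor of -1.0; for non-negative x the round-trip already is the floor.
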
// ~~~~~~ Basic / misc. stages ~~~~~~ //

STAGE_GG(seed_shader, const float* iota) {
    x = cast<F>(I32(dx)) + unaligned_load<F>(iota);
    y = cast<F>(I32(dy)) + 0.5f;
}

STAGE_GG(matrix_translate, const float* m) {
    x += m[0];
    y += m[1];
}
STAGE_GG(matrix_scale_translate, const float* m) {
    x = mad(x,m[0], m[2]);
    y = mad(y,m[1], m[3]);
}
STAGE_GG(matrix_2x3, const float* m) {
    auto X = mad(x,m[0], mad(y,m[2], m[4])),
         Y = mad(x,m[1], mad(y,m[3], m[5]));
    x = X;
    y = Y;
}
STAGE_GG(matrix_perspective, const float* m) {
    // N.B. Unlike the other matrix_ stages, this matrix is row-major.
    auto X = mad(x,m[0], mad(y,m[1], m[2])),
         Y = mad(x,m[3], mad(y,m[4], m[5])),
         Z = mad(x,m[6], mad(y,m[7], m[8]));
    x = X * rcp(Z);
    y = Y * rcp(Z);
}

STAGE_PP(uniform_color, const SkJumper_UniformColorCtx* c) {
    r = c->rgba[0];
    g = c->rgba[1];
    b = c->rgba[2];
    a = c->rgba[3];
}
STAGE_PP(black_color, Ctx::None) { r = g = b =   0; a = 255; }
STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; }

STAGE_PP(set_rgb, const float rgb[3]) {
    r = from_float(rgb[0]);
    g = from_float(rgb[1]);
    b = from_float(rgb[2]);
}

STAGE_PP(clamp_a, Ctx::None) {
    r = min(r, a);
    g = min(g, a);
    b = min(b, a);
}
STAGE_PP(clamp_a_dst, Ctx::None) {
    dr = min(dr, da);
    dg = min(dg, da);
    db = min(db, da);
}

STAGE_PP(premul, Ctx::None) {
    r = div255(r * a);
    g = div255(g * a);
    b = div255(b * a);
}
STAGE_PP(premul_dst, Ctx::None) {
    dr = div255(dr * da);
    dg = div255(dg * da);
    db = div255(db * da);
}

STAGE_PP(swap_rb, Ctx::None) {
    auto tmp = r;
    r = b;
    b = tmp;
}

STAGE_PP(move_src_dst, Ctx::None) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}

STAGE_PP(move_dst_src, Ctx::None) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}

STAGE_PP(invert, Ctx::None) {
    r = inv(r);
    g = inv(g);
    b = inv(b);
    a = inv(a);
}

// ~~~~~~ Blend modes ~~~~~~ //

// The same logic applied to all 4 channels.
#define BLEND_MODE(name)                                 \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
    STAGE_PP(name, Ctx::None) {                          \
        r = name##_channel(r,dr,a,da);                   \
        g = name##_channel(g,dg,a,da);                   \
        b = name##_channel(b,db,a,da);                   \
        a = name##_channel(a,da,a,da);                   \
    }                                                    \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)

    BLEND_MODE(clear)    { return 0; }
    BLEND_MODE(srcatop)  { return div255( s*da + d*inv(sa) ); }
    BLEND_MODE(dstatop)  { return div255( d*sa + s*inv(da) ); }
    BLEND_MODE(srcin)    { return div255( s*da ); }
    BLEND_MODE(dstin)    { return div255( d*sa ); }
    BLEND_MODE(srcout)   { return div255( s*inv(da) ); }
    BLEND_MODE(dstout)   { return div255( d*inv(sa) ); }
    BLEND_MODE(srcover)  { return s + div255( d*inv(sa) ); }
    BLEND_MODE(dstover)  { return d + div255( s*inv(da) ); }
    BLEND_MODE(modulate) { return div255( s*d ); }
    BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
    BLEND_MODE(plus_)    { return min(s+d, 255); }
    BLEND_MODE(screen)   { return s + d - div255( s*d ); }
    BLEND_MODE(xor_)     { return div255( s*inv(da) + d*inv(sa) ); }
#undef BLEND_MODE

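// A note on why this all fits in 16-bit lanes: every channel is at most 255, so a single
// product like d*inv(sa) is at most 255*255 = 65025, comfortably inside a uint16_t, and
// div255() brings it back into byte range.  For example srcover with s=128, d=60, sa=128
// gives 128 + div255(60*127) = 128 + 30 = 158, which matches s + d*(1 - sa/255) rounded.
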
// The same logic applied to color, and srcover for alpha.
#define BLEND_MODE(name)                                 \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
    STAGE_PP(name, Ctx::None) {                          \
        r = name##_channel(r,dr,a,da);                   \
        g = name##_channel(g,dg,a,da);                   \
        b = name##_channel(b,db,a,da);                   \
        a = a + div255( da*inv(a) );                     \
    }                                                    \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)

    BLEND_MODE(darken)     { return s + d -   div255( max(s*da, d*sa) ); }
    BLEND_MODE(lighten)    { return s + d -   div255( min(s*da, d*sa) ); }
    BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); }
    BLEND_MODE(exclusion)  { return s + d - 2*div255( s*d ); }

    BLEND_MODE(hardlight) {
        return div255( s*inv(da) + d*inv(sa) +
                       if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
    }
    BLEND_MODE(overlay) {
        return div255( s*inv(da) + d*inv(sa) +
                       if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
    }
#undef BLEND_MODE

// ~~~~~~ Helpers for interacting with memory ~~~~~~ //

template <typename T>
SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, size_t dx, size_t dy) {
    return (T*)ctx->pixels + dy*ctx->stride + dx;
}

template <typename T>
SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
    auto clamp = [](F v, F limit) {
        limit = bit_cast<F>( bit_cast<U32>(limit) - 1 );  // Exclusive -> inclusive.
        return min(max(0, v), limit);
    };
    x = clamp(x, ctx->width);
    y = clamp(y, ctx->height);

    *ptr = (const T*)ctx->pixels;
    return trunc_(y)*ctx->stride + trunc_(x);
}

template <typename V, typename T>
SI V load(const T* ptr, size_t tail) {
    V v = 0;
    switch (tail & (N-1)) {
        case  0: memcpy(&v, ptr, sizeof(v)); break;
    #if defined(__AVX2__)
        case 15: v[14] = ptr[14];
        case 14: v[13] = ptr[13];
        case 13: v[12] = ptr[12];
        case 12: memcpy(&v, ptr, 12*sizeof(T)); break;
        case 11: v[10] = ptr[10];
        case 10: v[ 9] = ptr[ 9];
        case  9: v[ 8] = ptr[ 8];
        case  8: memcpy(&v, ptr,  8*sizeof(T)); break;
    #endif
        case  7: v[ 6] = ptr[ 6];
        case  6: v[ 5] = ptr[ 5];
        case  5: v[ 4] = ptr[ 4];
        case  4: memcpy(&v, ptr,  4*sizeof(T)); break;
        case  3: v[ 2] = ptr[ 2];
        case  2: memcpy(&v, ptr,  2*sizeof(T)); break;
        case  1: v[ 0] = ptr[ 0];
    }
    return v;
}
template <typename V, typename T>
SI void store(T* ptr, size_t tail, V v) {
    switch (tail & (N-1)) {
        case  0: memcpy(ptr, &v, sizeof(v)); break;
    #if defined(__AVX2__)
        case 15: ptr[14] = v[14];
        case 14: ptr[13] = v[13];
        case 13: ptr[12] = v[12];
        case 12: memcpy(ptr, &v, 12*sizeof(T)); break;
        case 11: ptr[10] = v[10];
        case 10: ptr[ 9] = v[ 9];
        case  9: ptr[ 8] = v[ 8];
        case  8: memcpy(ptr, &v,  8*sizeof(T)); break;
    #endif
        case  7: ptr[ 6] = v[ 6];
        case  6: ptr[ 5] = v[ 5];
        case  5: ptr[ 4] = v[ 4];
        case  4: memcpy(ptr, &v,  4*sizeof(T)); break;
        case  3: ptr[ 2] = v[ 2];
        case  2: memcpy(ptr, &v,  2*sizeof(T)); break;
        case  1: ptr[ 0] = v[ 0];
    }
}

template <typename V, typename T>
SI V gather(const T* ptr, U32 ix) {
#if defined(__AVX2__)
    return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
              ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
              ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
              ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
#else
    return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
              ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
#endif
}
// TODO: AVX2 gather instructions where possible

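// The load()/store() switches above lean on deliberate case fall-through to handle a partial
// (tail) batch of pixels.  For example, with tail == 3 we enter at case 3, copy element 2 as a
// scalar, fall into case 2, and memcpy the first two elements -- so exactly 3 values move and
// we never touch memory past the end of the row.
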
// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //

SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
#if 1 && defined(__AVX2__)
    // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
    __m256i _01,_23;
    split(rgba, &_01, &_23);
    __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
            _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
    rgba = join<U32>(_02, _13);

    auto cast_U16 = [](U32 v) -> U16 {
        __m256i _02,_13;
        split(v, &_02,&_13);
        return _mm256_packus_epi32(_02,_13);
    };
#else
    auto cast_U16 = [](U32 v) -> U16 {
        return cast<U16>(v);
    };
#endif
    *r = cast_U16(rgba & 65535) & 255;
    *g = cast_U16(rgba & 65535) >>  8;
    *b = cast_U16(rgba >>   16) & 255;
    *a = cast_U16(rgba >>   16) >>  8;
}

SI void load_8888(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
#if 1 && defined(__ARM_NEON)
    uint8x8x4_t rgba;
    switch (tail & (N-1)) {
        case 0: rgba = vld4_u8     ((const uint8_t*)(ptr+0)         ); break;
        case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6);
        case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5);
        case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4);
        case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3);
        case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2);
        case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1);
        case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0);
    }
    *r = cast<U16>(rgba.val[0]);
    *g = cast<U16>(rgba.val[1]);
    *b = cast<U16>(rgba.val[2]);
    *a = cast<U16>(rgba.val[3]);
#else
    from_8888(load<U32>(ptr, tail), r,g,b,a);
#endif
}
SI void store_8888(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
#if 1 && defined(__ARM_NEON)
    uint8x8x4_t rgba = {{
        cast<U8>(r),
        cast<U8>(g),
        cast<U8>(b),
        cast<U8>(a),
    }};
    switch (tail & (N-1)) {
        case 0: vst4_u8     ((uint8_t*)(ptr+0), rgba   ); break;
        case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6);
        case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5);
        case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4);
        case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3);
        case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2);
        case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1);
        case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0);
    }
#else
    store(ptr, tail, cast<U32>(r | (g<<8)) <<  0
                   | cast<U32>(b | (a<<8)) << 16);
#endif
}

STAGE_PP(load_8888, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
STAGE_PP(load_8888_dst, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
STAGE_PP(store_8888, const SkJumper_MemoryCtx* ctx) {
    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
}

STAGE_PP(load_bgra, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &b,&g,&r,&a);
}
STAGE_PP(load_bgra_dst, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &db,&dg,&dr,&da);
}
STAGE_PP(store_bgra, const SkJumper_MemoryCtx* ctx) {
    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, b,g,r,a);
}

STAGE_GP(gather_8888, const SkJumper_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
}
STAGE_GP(gather_bgra, const SkJumper_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_8888(gather<U32>(ptr, ix), &b, &g, &r, &a);
}

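// To picture the unpacking in from_8888(): RGBA_8888 stores R in the lowest byte, so on a
// little-endian machine a pixel read as the uint32_t 0x80FF40C0 splits into r=0xC0, g=0x40,
// b=0xFF, a=0x80 -- the low 16 bits carry r and g, the high 16 bits carry b and a, and one
// more mask or shift separates each pair.
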
// ~~~~~~ 16-bit memory loads and stores ~~~~~~ //

SI void from_565(U16 rgb, U16* r, U16* g, U16* b) {
    // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0
    U16 R = (rgb >> 11) & 31,
        G = (rgb >>  5) & 63,
        B = (rgb >>  0) & 31;

    // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit.
    *r = (R << 3) | (R >> 2);
    *g = (G << 2) | (G >> 4);
    *b = (B << 3) | (B >> 2);
}
SI void load_565(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
    from_565(load<U16>(ptr, tail), r,g,b);
}
SI void store_565(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
    // Select the top 5,6,5 bits.
    U16 R = r >> 3,
        G = g >> 2,
        B = b >> 3;
    // Pack them back into 15|rrrrr gggggg bbbbb|0.
    store(ptr, tail, R << 11
                   | G <<  5
                   | B <<  0);
}

STAGE_PP(load_565, const SkJumper_MemoryCtx* ctx) {
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
    a = 255;
}
STAGE_PP(load_565_dst, const SkJumper_MemoryCtx* ctx) {
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
    da = 255;
}
STAGE_PP(store_565, const SkJumper_MemoryCtx* ctx) {
    store_565(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
}
STAGE_GP(gather_565, const SkJumper_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_565(gather<U16>(ptr, ix), &r, &g, &b);
    a = 255;
}

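// A quick sanity check on the bit replication in from_565(): a full 5-bit value of 31 becomes
// (31<<3)|(31>>2) = 248|7 = 255, and a mid value like 16 becomes 128|4 = 132, matching
// round(16 * 255/31) = 132.  The same idea scales the 6-bit green channel by 255/63.
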
SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) {
    // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0.
    U16 R = (rgba >> 12) & 15,
        G = (rgba >>  8) & 15,
        B = (rgba >>  4) & 15,
        A = (rgba >>  0) & 15;

    // Scale [0,15] to [0,255].
    *r = (R << 4) | R;
    *g = (G << 4) | G;
    *b = (B << 4) | B;
    *a = (A << 4) | A;
}
SI void load_4444(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
    from_4444(load<U16>(ptr, tail), r,g,b,a);
}
SI void store_4444(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
    // Select the top 4 bits of each.
    U16 R = r >> 4,
        G = g >> 4,
        B = b >> 4,
        A = a >> 4;
    // Pack them back into 15|rrrr gggg bbbb aaaa|0.
    store(ptr, tail, R << 12
                   | G <<  8
                   | B <<  4
                   | A <<  0);
}

STAGE_PP(load_4444, const SkJumper_MemoryCtx* ctx) {
    load_4444(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
STAGE_PP(load_4444_dst, const SkJumper_MemoryCtx* ctx) {
    load_4444(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
STAGE_PP(store_4444, const SkJumper_MemoryCtx* ctx) {
    store_4444(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a);
}
STAGE_GP(gather_4444, const SkJumper_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a);
}

// ~~~~~~ 8-bit memory loads and stores ~~~~~~ //

SI U16 load_8(const uint8_t* ptr, size_t tail) {
    return cast<U16>(load<U8>(ptr, tail));
}
SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
    store(ptr, tail, cast<U8>(v));
}

STAGE_PP(load_a8, const SkJumper_MemoryCtx* ctx) {
    r = g = b = 0;
    a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(load_a8_dst, const SkJumper_MemoryCtx* ctx) {
    dr = dg = db = 0;
    da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(store_a8, const SkJumper_MemoryCtx* ctx) {
    store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
}
STAGE_GP(gather_a8, const SkJumper_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    r = g = b = 0;
    a = cast<U16>(gather<U8>(ptr, ix));
}

STAGE_PP(load_g8, const SkJumper_MemoryCtx* ctx) {
    r = g = b = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    a = 255;
}
STAGE_PP(load_g8_dst, const SkJumper_MemoryCtx* ctx) {
    dr = dg = db = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    da = 255;
}
STAGE_PP(luminance_to_alpha, Ctx::None) {
    a = (r*54 + g*183 + b*19)/256;  // 0.2126, 0.7152, 0.0722 with 256 denominator.
    r = g = b = 0;
}
STAGE_GP(gather_g8, const SkJumper_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    r = g = b = cast<U16>(gather<U8>(ptr, ix));
    a = 255;
}

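// The weights in luminance_to_alpha above are the BT.709 luma coefficients rounded to a /256
// denominator: 0.2126*256 ~= 54, 0.7152*256 ~= 183, 0.0722*256 ~= 19.  They sum to exactly
// 256, so an all-white input (255,255,255) maps to 255 and the 16-bit lanes never overflow.
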
// ~~~~~~ Coverage scales / lerps ~~~~~~ //

STAGE_PP(scale_1_float, const float* f) {
    U16 c = from_float(*f);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_1_float, const float* f) {
    U16 c = from_float(*f);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

STAGE_PP(scale_u8, const SkJumper_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_u8, const SkJumper_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
    return if_then_else(a < da, min(cr,cg,cb)
                              , max(cr,cg,cb));
}
STAGE_PP(scale_565, const SkJumper_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = div255( r * cr );
    g = div255( g * cg );
    b = div255( b * cb );
    a = div255( a * ca );
}
STAGE_PP(lerp_565, const SkJumper_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = lerp(dr, r, cr);
    g = lerp(dg, g, cg);
    b = lerp(db, b, cb);
    a = lerp(da, a, ca);
}

// ~~~~~~ Gradient stages ~~~~~~ //

// Clamp x to [0,1], both sides inclusive (think, gradients).
// Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN.
SI F clamp_01(F v) { return min(max(0, v), 1); }

STAGE_GG(clamp_x_1 , Ctx::None) { x = clamp_01(x); }
STAGE_GG(repeat_x_1, Ctx::None) { x = clamp_01(x - floor_(x)); }
STAGE_GG(mirror_x_1, Ctx::None) {
    auto two = [](F x){ return x+x; };
    x = clamp_01(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f ));
}

SI U16 round_F_to_U16(F x) { return cast<U16>(x * 255.0f + 0.5f); }

SI void gradient_lookup(const SkJumper_GradientCtx* c, U32 idx, F t,
                        U16* r, U16* g, U16* b, U16* a) {
    F fr = gather<F>(c->fs[0], idx),
      fg = gather<F>(c->fs[1], idx),
      fb = gather<F>(c->fs[2], idx),
      fa = gather<F>(c->fs[3], idx),
      br = gather<F>(c->bs[0], idx),
      bg = gather<F>(c->bs[1], idx),
      bb = gather<F>(c->bs[2], idx),
      ba = gather<F>(c->bs[3], idx);

    *r = round_F_to_U16(mad(t, fr, br));
    *g = round_F_to_U16(mad(t, fg, bg));
    *b = round_F_to_U16(mad(t, fb, bb));
    *a = round_F_to_U16(mad(t, fa, ba));
}

STAGE_GP(gradient, const SkJumper_GradientCtx* c) {
    auto t = x;
    U32 idx = 0;

    // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
    for (size_t i = 1; i < c->stopCount; i++) {
        idx += if_then_else(t >= c->ts[i], U32(1), U32(0));
    }

    gradient_lookup(c, idx, t, &r, &g, &b, &a);
}

STAGE_GP(evenly_spaced_gradient, const SkJumper_GradientCtx* c) {
    auto t = x;
    auto idx = trunc_(t * (c->stopCount-1));
    gradient_lookup(c, idx, t, &r, &g, &b, &a);
}

STAGE_GP(evenly_spaced_2_stop_gradient, const void* ctx) {
    // TODO: Rename Ctx SkJumper_EvenlySpaced2StopGradientCtx.
    struct Ctx { float f[4], b[4]; };
    auto c = (const Ctx*)ctx;

    auto t = x;
    r = round_F_to_U16(mad(t, c->f[0], c->b[0]));
    g = round_F_to_U16(mad(t, c->f[1], c->b[1]));
    b = round_F_to_U16(mad(t, c->f[2], c->b[2]));
    a = round_F_to_U16(mad(t, c->f[3], c->b[3]));
}

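// To picture the stop search in the gradient stage above: with stops at ts = {0, 0.5, 1} and
// t = 0.7, the loop adds 1 for ts[1] = 0.5 (since 0.7 >= 0.5) and 0 for ts[2] = 1.0, so idx
// lands on 1 and gradient_lookup() evaluates that interval's ramp as f*t + b per channel.
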
STAGE_GG(xy_to_unit_angle, Ctx::None) {
    F xabs = abs_(x),
      yabs = abs_(y);

    F slope = min(xabs, yabs)/max(xabs, yabs);
    F s = slope * slope;

    // Use a 7th degree polynomial to approximate atan.
    // This was generated using sollya.gforge.inria.fr.
    // A float optimized polynomial was generated using the following command.
    // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
    F phi = slope
             * (0.15912117063999176025390625f + s
             * (-5.185396969318389892578125e-2f + s
             * (2.476101927459239959716796875e-2f + s
             * (-7.0547382347285747528076171875e-3f))));

    phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
    phi = if_then_else(x < 0.0f   , 1.0f/2.0f - phi, phi);
    phi = if_then_else(y < 0.0f   , 1.0f - phi     , phi);
    phi = if_then_else(phi != phi , 0              , phi);  // Check for NaN.
    x = phi;
}
STAGE_GG(xy_to_radius, Ctx::None) {
    x = sqrt_(x*x + y*y);
}

// ~~~~~~ Compound stages ~~~~~~ //

STAGE_PP(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    load_8888(ptr, tail, &dr,&dg,&db,&da);
    r = r + div255( dr*inv(a) );
    g = g + div255( dg*inv(a) );
    b = b + div255( db*inv(a) );
    a = a + div255( da*inv(a) );
    store_8888(ptr, tail, r,g,b,a);
}
STAGE_PP(srcover_bgra_8888, const SkJumper_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    load_8888(ptr, tail, &db,&dg,&dr,&da);
    r = r + div255( dr*inv(a) );
    g = g + div255( dg*inv(a) );
    b = b + div255( db*inv(a) );
    a = a + div255( da*inv(a) );
    store_8888(ptr, tail, b,g,r,a);
}

#endif//defined(__clang__)