SkJumper_stages_lowp.cpp revision be0bd925614bcfdea859416177b527294a6c92b1
/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

// This restricted SkJumper backend works on 8-bit per channel pixels stored in
// 16-bit channels.  This is a last attempt to write a performant low-precision
// backend with stage definitions that can be shared by x86 and ARM.

#include "SkJumper.h"
#include "SkJumper_misc.h"

#if defined(__clang__)  // This file is empty when not compiled by Clang.

#if defined(__ARM_NEON)
    #include <arm_neon.h>
    #if defined(__arm__)
        #define ABI __attribute__((pcs("aapcs-vfp")))
    #else
        #define ABI
    #endif
#elif defined(__SSE2__)
    #include <immintrin.h>
    #define ABI
#else
    #define ABI
#endif

#if !defined(JUMPER_IS_OFFLINE)
    #define WRAP(name) sk_##name##_lowp
#elif defined(__AVX2__)
    #define WRAP(name) sk_##name##_hsw_lowp
#elif defined(__SSE4_1__)
    #define WRAP(name) sk_##name##_sse41_lowp
#elif defined(__SSE2__)
    #define WRAP(name) sk_##name##_sse2_lowp
#endif

#if defined(__AVX2__)
    using U8  = uint8_t  __attribute__((ext_vector_type(16)));
    using U16 = uint16_t __attribute__((ext_vector_type(16)));
    using I16 =  int16_t __attribute__((ext_vector_type(16)));
    using I32 =  int32_t __attribute__((ext_vector_type(16)));
    using U32 = uint32_t __attribute__((ext_vector_type(16)));
    using F   =    float __attribute__((ext_vector_type(16)));
#else
    using U8  = uint8_t  __attribute__((ext_vector_type(8)));
    using U16 = uint16_t __attribute__((ext_vector_type(8)));
    using I16 =  int16_t __attribute__((ext_vector_type(8)));
    using I32 =  int32_t __attribute__((ext_vector_type(8)));
    using U32 = uint32_t __attribute__((ext_vector_type(8)));
    using F   =    float __attribute__((ext_vector_type(8)));
#endif

static const size_t N = sizeof(U16) / sizeof(uint16_t);

// We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64.
using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
                          U16  r, U16  g, U16  b, U16  a,
                          U16 dr, U16 dg, U16 db, U16 da);

MAYBE_MSABI
ABI extern "C" void WRAP(start_pipeline)(const size_t x0,
                                         const size_t y0,
                                         const size_t xlimit,
                                         const size_t ylimit,
                                         void** program) {
    auto start = (Stage)load_and_inc(program);
    for (size_t dy = y0; dy < ylimit; dy++) {
        size_t dx = x0;
        for (; dx + N <= xlimit; dx += N) {
            start(   0,program,dx,dy, 0,0,0,0, 0,0,0,0);
        }
        if (size_t tail = xlimit - dx) {
            start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
        }
    }
}

ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
                                      U16,U16,U16,U16, U16,U16,U16,U16) {}
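
// A rough sketch of how a caller drives this backend (not part of this file; it assumes the
// usual SkJumper convention that "program" is a flat array interleaving each stage's function
// pointer with its context pointer, with Ctx::None stages contributing no context slot, and
// ending with just_return).  The stage symbols below use the non-offline WRAP() names, and
// srcCtx/dstCtx stand in for hypothetical SkJumper_MemoryCtx instances:
//
//     void* program[] = {
//         (void*)sk_load_8888_lowp,          (void*)&srcCtx,
//         (void*)sk_srcover_rgba_8888_lowp,  (void*)&dstCtx,
//         (void*)sk_just_return_lowp,
//     };
//     sk_start_pipeline_lowp(x0,y0, xlimit,ylimit, program);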

// All stages use the same function call ABI to chain into each other, but there are three types:
//   GG: geometry in, geometry out  -- think, a matrix
//   GP: geometry in, pixels out.   -- think, a memory gather
//   PP: pixels in, pixels out.     -- think, a blend mode
//
// (Some stages ignore their inputs or produce no logical output.  That's perfectly fine.)
//
// These three STAGE_ macros let you define each type of stage,
// and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.

#define STAGE_GG(name, ...)                                                             \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y);       \
    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,   \
                                   U16  r, U16  g, U16  b, U16  a,                      \
                                   U16 dr, U16 dg, U16 db, U16 da) {                    \
        auto x = join<F>(r,g),                                                          \
             y = join<F>(b,a);                                                          \
        name##_k(Ctx{program}, dx,dy,tail, x,y);                                        \
        split(x, &r,&g);                                                                \
        split(y, &b,&a);                                                                \
        auto next = (Stage)load_and_inc(program);                                       \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                 \
    }                                                                                   \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)

#define STAGE_GP(name, ...)                                                             \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,          \
                     U16&  r, U16&  g, U16&  b, U16&  a,                                \
                     U16& dr, U16& dg, U16& db, U16& da);                               \
    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,   \
                                   U16  r, U16  g, U16  b, U16  a,                      \
                                   U16 dr, U16 dg, U16 db, U16 da) {                    \
        auto x = join<F>(r,g),                                                          \
             y = join<F>(b,a);                                                          \
        name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da);                  \
        auto next = (Stage)load_and_inc(program);                                       \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                 \
    }                                                                                   \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,          \
                     U16&  r, U16&  g, U16&  b, U16&  a,                                \
                     U16& dr, U16& dg, U16& db, U16& da)

#define STAGE_PP(name, ...)                                                             \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                    \
                     U16&  r, U16&  g, U16&  b, U16&  a,                                \
                     U16& dr, U16& dg, U16& db, U16& da);                               \
    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,   \
                                   U16  r, U16  g, U16  b, U16  a,                      \
                                   U16 dr, U16 dg, U16 db, U16 da) {                    \
        name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da);                       \
        auto next = (Stage)load_and_inc(program);                                       \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                 \
    }                                                                                   \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                    \
                     U16&  r, U16&  g, U16&  b, U16&  a,                                \
                     U16& dr, U16& dg, U16& db, U16& da)

// ~~~~~~ Commonly used helper functions ~~~~~~ //

SI U16 div255(U16 v) {
#if 0
    return (v+127)/255;  // The ideal rounding divide by 255.
#else
    return (v+255)/256;  // A good approximation of (v+127)/255.
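    // (Sanity check on the approximation: for any v in [0, 255*255] the two forms differ by
    // at most 1, and they agree exactly at multiples of 255, e.g. v = 255*255 -> 255 either way.)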
#endif
}

SI U16 inv(U16 v) { return 255-v; }

SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); }

SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); }
SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); }
SI U16 max(U16 x, U16 y, U16 z) { return max(x, max(y, z)); }
SI U16 min(U16 x, U16 y, U16 z) { return min(x, min(y, z)); }

SI U16 from_float(float f) { return f * 255.0f + 0.5f; }

SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); }

template <typename D, typename S>
SI D cast(S src) {
    return __builtin_convertvector(src, D);
}

template <typename D, typename S>
SI void split(S v, D* lo, D* hi) {
    static_assert(2*sizeof(D) == sizeof(S), "");
    memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D));
    memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D));
}
template <typename D, typename S>
SI D join(S lo, S hi) {
    static_assert(sizeof(D) == 2*sizeof(S), "");
    D v;
    memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S));
    memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
    return v;
}
template <typename V, typename H>
SI V map(V v, H (*fn)(H)) {
    H lo,hi;
    split(v, &lo,&hi);
    lo = fn(lo);
    hi = fn(hi);
    return join<V>(lo,hi);
}

// TODO: do we need platform-specific intrinsics for any of these?
SI F if_then_else(I32 c, F t, F e) {
    return bit_cast<F>( (bit_cast<I32>(t) & c) | (bit_cast<I32>(e) & ~c) );
}
SI F max(F x, F y) { return if_then_else(x < y, y, x); }
SI F min(F x, F y) { return if_then_else(x < y, x, y); }

SI F mad(F f, F m, F a) { return f*m+a; }
SI U32 trunc_(F x) { return (U32)cast<I32>(x); }

SI F rcp(F x) {
#if defined(__AVX2__)
    return map(x, _mm256_rcp_ps);
#elif defined(__SSE__)
    return map(x, _mm_rcp_ps);
#elif defined(__ARM_NEON)
    return map(x, +[](float32x4_t v) {
        auto est = vrecpeq_f32(v);
        return vrecpsq_f32(v,est)*est;
    });
#else
    return 1.0f / x;
#endif
}

// ~~~~~~ Basic / misc. stages ~~~~~~ //

STAGE_GG(seed_shader, const float* iota) {
    x = cast<F>(I32(dx)) + unaligned_load<F>(iota);
    y = cast<F>(I32(dy)) + 0.5f;
}

STAGE_GG(matrix_translate, const float* m) {
    x += m[0];
    y += m[1];
}
STAGE_GG(matrix_scale_translate, const float* m) {
    x = mad(x,m[0], m[2]);
    y = mad(y,m[1], m[3]);
}
STAGE_GG(matrix_2x3, const float* m) {
    auto X = mad(x,m[0], mad(y,m[2], m[4])),
         Y = mad(x,m[1], mad(y,m[3], m[5]));
    x = X;
    y = Y;
}
STAGE_GG(matrix_perspective, const float* m) {
    // N.B. Unlike the other matrix_ stages, this matrix is row-major.
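    // Conceptually this maps (x,y) through the full 3x3 matrix, (X,Y,Z) = M * (x,y,1),
    // then divides through by Z, using rcp() as an approximate reciprocal.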
    auto X = mad(x,m[0], mad(y,m[1], m[2])),
         Y = mad(x,m[3], mad(y,m[4], m[5])),
         Z = mad(x,m[6], mad(y,m[7], m[8]));
    x = X * rcp(Z);
    y = Y * rcp(Z);
}

STAGE_PP(uniform_color, const SkJumper_UniformColorCtx* c) {
    r = c->rgba[0];
    g = c->rgba[1];
    b = c->rgba[2];
    a = c->rgba[3];
}
STAGE_PP(black_color, Ctx::None) { r = g = b =   0; a = 255; }
STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; }

STAGE_PP(set_rgb, const float rgb[3]) {
    r = from_float(rgb[0]);
    g = from_float(rgb[1]);
    b = from_float(rgb[2]);
}

STAGE_PP(premul, Ctx::None) {
    r = div255(r * a);
    g = div255(g * a);
    b = div255(b * a);
}

STAGE_PP(swap_rb, Ctx::None) {
    auto tmp = r;
    r = b;
    b = tmp;
}

STAGE_PP(move_src_dst, Ctx::None) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}

STAGE_PP(move_dst_src, Ctx::None) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}

STAGE_PP(invert, Ctx::None) {
    r = inv(r);
    g = inv(g);
    b = inv(b);
    a = inv(a);
}

// ~~~~~~ Blend modes ~~~~~~ //

// The same logic applied to all 4 channels.
#define BLEND_MODE(name)                                   \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da);   \
    STAGE_PP(name, Ctx::None) {                            \
        r = name##_channel(r,dr,a,da);                     \
        g = name##_channel(g,dg,a,da);                     \
        b = name##_channel(b,db,a,da);                     \
        a = name##_channel(a,da,a,da);                     \
    }                                                      \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)

    BLEND_MODE(clear)    { return 0; }
    BLEND_MODE(srcatop)  { return div255( s*da + d*inv(sa) ); }
    BLEND_MODE(dstatop)  { return div255( d*sa + s*inv(da) ); }
    BLEND_MODE(srcin)    { return div255( s*da ); }
    BLEND_MODE(dstin)    { return div255( d*sa ); }
    BLEND_MODE(srcout)   { return div255( s*inv(da) ); }
    BLEND_MODE(dstout)   { return div255( d*inv(sa) ); }
    BLEND_MODE(srcover)  { return s + div255( d*inv(sa) ); }
    BLEND_MODE(dstover)  { return d + div255( s*inv(da) ); }
    BLEND_MODE(modulate) { return div255( s*d ); }
    BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
    BLEND_MODE(plus_)    { return min(s+d, 255); }
    BLEND_MODE(screen)   { return s + d - div255( s*d ); }
    BLEND_MODE(xor_)     { return div255( s*inv(da) + d*inv(sa) ); }
#undef BLEND_MODE

// The same logic applied to color, and srcover for alpha.
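// (That is, r,g,b get the mode-specific channel math below, while alpha always composites
// as srcover: a = a + div255( da*inv(a) ).)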
#define BLEND_MODE(name)                                   \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da);   \
    STAGE_PP(name, Ctx::None) {                            \
        r = name##_channel(r,dr,a,da);                     \
        g = name##_channel(g,dg,a,da);                     \
        b = name##_channel(b,db,a,da);                     \
        a = a + div255( da*inv(a) );                       \
    }                                                      \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)

    BLEND_MODE(darken)     { return s + d -   div255( max(s*da, d*sa) ); }
    BLEND_MODE(lighten)    { return s + d -   div255( min(s*da, d*sa) ); }
    BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); }
    BLEND_MODE(exclusion)  { return s + d - 2*div255( s*d ); }

    BLEND_MODE(hardlight) {
        return div255( s*inv(da) + d*inv(sa) +
                       if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
    }
    BLEND_MODE(overlay) {
        return div255( s*inv(da) + d*inv(sa) +
                       if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
    }
#undef BLEND_MODE

// ~~~~~~ Helpers for interacting with memory ~~~~~~ //

template <typename T>
SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, size_t dx, size_t dy) {
    return (T*)ctx->pixels + dy*ctx->stride + dx;
}

template <typename T>
SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
    auto clamp = [](F v, F limit) {
        limit = bit_cast<F>( bit_cast<U32>(limit) - 1 );  // Exclusive -> inclusive.
        return min(max(0, v), limit);
    };
    x = clamp(x, ctx->width);
    y = clamp(y, ctx->height);

    *ptr = (const T*)ctx->pixels;
    return trunc_(y)*ctx->stride + trunc_(x);
}

template <typename V, typename T>
SI V load(const T* ptr, size_t tail) {
    V v = 0;
    switch (tail & (N-1)) {
        case  0: memcpy(&v, ptr, sizeof(v)); break;
    #if defined(__AVX2__)
        case 15: v[14] = ptr[14];
        case 14: v[13] = ptr[13];
        case 13: v[12] = ptr[12];
        case 12: memcpy(&v, ptr, 12*sizeof(T)); break;
        case 11: v[10] = ptr[10];
        case 10: v[ 9] = ptr[ 9];
        case  9: v[ 8] = ptr[ 8];
        case  8: memcpy(&v, ptr,  8*sizeof(T)); break;
    #endif
        case  7: v[ 6] = ptr[ 6];
        case  6: v[ 5] = ptr[ 5];
        case  5: v[ 4] = ptr[ 4];
        case  4: memcpy(&v, ptr,  4*sizeof(T)); break;
        case  3: v[ 2] = ptr[ 2];
        case  2: memcpy(&v, ptr,  2*sizeof(T)); break;
        case  1: v[ 0] = ptr[ 0];
    }
    return v;
}
template <typename V, typename T>
SI void store(T* ptr, size_t tail, V v) {
    switch (tail & (N-1)) {
        case  0: memcpy(ptr, &v, sizeof(v)); break;
    #if defined(__AVX2__)
        case 15: ptr[14] = v[14];
        case 14: ptr[13] = v[13];
        case 13: ptr[12] = v[12];
        case 12: memcpy(ptr, &v, 12*sizeof(T)); break;
        case 11: ptr[10] = v[10];
        case 10: ptr[ 9] = v[ 9];
        case  9: ptr[ 8] = v[ 8];
        case  8: memcpy(ptr, &v,  8*sizeof(T)); break;
    #endif
        case  7: ptr[ 6] = v[ 6];
        case  6: ptr[ 5] = v[ 5];
        case  5: ptr[ 4] = v[ 4];
        case  4: memcpy(ptr, &v,  4*sizeof(T)); break;
        case  3: ptr[ 2] = v[ 2];
        case  2: memcpy(ptr, &v,  2*sizeof(T)); break;
        case  1: ptr[ 0] = v[ 0];
    }
}

template <typename V, typename T>
SI V gather(const T* ptr, U32 ix) {
#if defined(__AVX2__)
    return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
              ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
              ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
              ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
#else
    return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
              ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
#endif
}
// TODO: AVX2 gather instructions where possible
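
// A note on the tail convention used by load(), store(), and the loads/stores below:
// tail == 0 means a full vector of N pixels; otherwise only the low `tail` lanes are
// touched, and the switch cases above intentionally fall through, peeling one element
// at a time down to the widest memcpy that fits.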

// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //

SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
#if 1 && defined(__AVX2__)
    // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
    __m256i _01,_23;
    split(rgba, &_01, &_23);
    __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
            _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
    rgba = join<U32>(_02, _13);

    auto cast_U16 = [](U32 v) -> U16 {
        __m256i _02,_13;
        split(v, &_02,&_13);
        return _mm256_packus_epi32(_02,_13);
    };
#else
    auto cast_U16 = [](U32 v) -> U16 {
        return cast<U16>(v);
    };
#endif
    *r = cast_U16(rgba & 65535) & 255;
    *g = cast_U16(rgba & 65535) >>  8;
    *b = cast_U16(rgba >>   16) & 255;
    *a = cast_U16(rgba >>   16) >>  8;
}

SI void load_8888(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
#if 1 && defined(__ARM_NEON)
    uint8x8x4_t rgba;
    switch (tail & (N-1)) {
        case 0: rgba = vld4_u8     ((const uint8_t*)(ptr+0)         ); break;
        case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6);
        case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5);
        case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4);
        case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3);
        case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2);
        case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1);
        case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0);
    }
    *r = cast<U16>(rgba.val[0]);
    *g = cast<U16>(rgba.val[1]);
    *b = cast<U16>(rgba.val[2]);
    *a = cast<U16>(rgba.val[3]);
#else
    from_8888(load<U32>(ptr, tail), r,g,b,a);
#endif
}
SI void store_8888(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
#if 1 && defined(__ARM_NEON)
    uint8x8x4_t rgba = {{
        cast<U8>(r),
        cast<U8>(g),
        cast<U8>(b),
        cast<U8>(a),
    }};
    switch (tail & (N-1)) {
        case 0: vst4_u8     ((uint8_t*)(ptr+0), rgba   ); break;
        case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6);
        case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5);
        case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4);
        case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3);
        case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2);
        case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1);
        case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0);
    }
#else
    store(ptr, tail, cast<U32>(r | (g<<8)) <<  0
                   | cast<U32>(b | (a<<8)) << 16);
#endif
}

STAGE_PP(load_8888, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
STAGE_PP(load_8888_dst, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
STAGE_PP(store_8888, const SkJumper_MemoryCtx* ctx) {
    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
}

STAGE_PP(load_bgra, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &b,&g,&r,&a);
}
STAGE_PP(load_bgra_dst, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &db,&dg,&dr,&da);
}
STAGE_PP(store_bgra, const SkJumper_MemoryCtx* ctx) {
    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, b,g,r,a);
}

STAGE_GP(gather_8888, const SkJumper_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
}
STAGE_GP(gather_bgra, const SkJumper_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_8888(gather<U32>(ptr, ix), &b, &g, &r, &a);
}

// ~~~~~~ 16-bit memory loads and stores ~~~~~~ //

SI void from_565(U16 rgb, U16* r, U16* g, U16* b) {
    // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0
    U16 R = (rgb >> 11) & 31,
        G = (rgb >>  5) & 63,
        B = (rgb >>  0) & 31;

    // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit.
    *r = (R << 3) | (R >> 2);
    *g = (G << 2) | (G >> 4);
    *b = (B << 3) | (B >> 2);
}
SI void load_565(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
    from_565(load<U16>(ptr, tail), r,g,b);
}
SI void store_565(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
    // Select the top 5,6,5 bits.
    U16 R = r >> 3,
        G = g >> 2,
        B = b >> 3;
    // Pack them back into 15|rrrrr gggggg bbbbb|0.
    store(ptr, tail, R << 11
                   | G <<  5
                   | B <<  0);
}

STAGE_PP(load_565, const SkJumper_MemoryCtx* ctx) {
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
    a = 255;
}
STAGE_PP(load_565_dst, const SkJumper_MemoryCtx* ctx) {
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
    da = 255;
}
STAGE_PP(store_565, const SkJumper_MemoryCtx* ctx) {
    store_565(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
}
STAGE_GP(gather_565, const SkJumper_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_565(gather<U16>(ptr, ix), &r, &g, &b);
    a = 255;
}

// ~~~~~~ 8-bit memory loads and stores ~~~~~~ //

SI U16 load_8(const uint8_t* ptr, size_t tail) {
    return cast<U16>(load<U8>(ptr, tail));
}
SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
    store(ptr, tail, cast<U8>(v));
}

STAGE_PP(load_a8, const SkJumper_MemoryCtx* ctx) {
    r = g = b = 0;
    a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(load_a8_dst, const SkJumper_MemoryCtx* ctx) {
    dr = dg = db = 0;
    da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(store_a8, const SkJumper_MemoryCtx* ctx) {
    store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
}
STAGE_GP(gather_a8, const SkJumper_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    r = g = b = 0;
    a = cast<U16>(gather<U8>(ptr, ix));
}

STAGE_PP(load_g8, const SkJumper_MemoryCtx* ctx) {
    r = g = b = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    a = 255;
}
STAGE_PP(load_g8_dst, const SkJumper_MemoryCtx* ctx) {
    dr = dg = db = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    da = 255;
}
STAGE_PP(luminance_to_alpha, Ctx::None) {
    a = (r*54 + g*183 + b*19)/256;  // 0.2126, 0.7152, 0.0722 with 256 denominator.
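    // (The weights sum to 256, with 54/256 ~ 0.211, 183/256 ~ 0.715, 19/256 ~ 0.074,
    // so the result stays within 8 bits.)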
    r = g = b = 0;
}
STAGE_GP(gather_g8, const SkJumper_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    r = g = b = cast<U16>(gather<U8>(ptr, ix));
    a = 255;
}

// ~~~~~~ Coverage scales / lerps ~~~~~~ //

STAGE_PP(scale_1_float, const float* f) {
    U16 c = from_float(*f);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_1_float, const float* f) {
    U16 c = from_float(*f);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

STAGE_PP(scale_u8, const SkJumper_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_u8, const SkJumper_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
    return if_then_else(a < da, min(cr,cg,cb)
                              , max(cr,cg,cb));
}
STAGE_PP(scale_565, const SkJumper_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = div255( r * cr );
    g = div255( g * cg );
    b = div255( b * cb );
    a = div255( a * ca );
}
STAGE_PP(lerp_565, const SkJumper_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = lerp(dr, r, cr);
    g = lerp(dg, g, cg);
    b = lerp(db, b, cb);
    a = lerp(da, a, ca);
}

// ~~~~~~ Compound stages ~~~~~~ //

STAGE_PP(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    load_8888(ptr, tail, &dr,&dg,&db,&da);
    r = r + div255( dr*inv(a) );
    g = g + div255( dg*inv(a) );
    b = b + div255( db*inv(a) );
    a = a + div255( da*inv(a) );
    store_8888(ptr, tail, r,g,b,a);
}

#endif//defined(__clang__)