SkJumper_stages_lowp.cpp revision be0bd925614bcfdea859416177b527294a6c92b1
/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

// This restricted SkJumper backend works on 8-bit per channel pixels stored in
// 16-bit channels.  This is a last attempt to write a performant low-precision
// backend with stage definitions that can be shared by x86 and ARM.

#include "SkJumper.h"
#include "SkJumper_misc.h"

#if defined(__clang__)  // This file is empty when not compiled by Clang.

#if defined(__ARM_NEON)
    #include <arm_neon.h>
    #if defined(__arm__)
        #define ABI __attribute__((pcs("aapcs-vfp")))
    #else
        #define ABI
    #endif
#elif defined(__SSE2__)
    #include <immintrin.h>
    #define ABI
#else
    #define ABI
#endif

#if !defined(JUMPER_IS_OFFLINE)
    #define WRAP(name) sk_##name##_lowp
#elif defined(__AVX2__)
    #define WRAP(name) sk_##name##_hsw_lowp
#elif defined(__SSE4_1__)
    #define WRAP(name) sk_##name##_sse41_lowp
#elif defined(__SSE2__)
    #define WRAP(name) sk_##name##_sse2_lowp
#endif

#if defined(__AVX2__)
    using U8  = uint8_t  __attribute__((ext_vector_type(16)));
    using U16 = uint16_t __attribute__((ext_vector_type(16)));
    using I16 =  int16_t __attribute__((ext_vector_type(16)));
    using I32 =  int32_t __attribute__((ext_vector_type(16)));
    using U32 = uint32_t __attribute__((ext_vector_type(16)));
    using F   = float    __attribute__((ext_vector_type(16)));
#else
    using U8  = uint8_t  __attribute__((ext_vector_type(8)));
    using U16 = uint16_t __attribute__((ext_vector_type(8)));
    using I16 =  int16_t __attribute__((ext_vector_type(8)));
    using I32 =  int32_t __attribute__((ext_vector_type(8)));
    using U32 = uint32_t __attribute__((ext_vector_type(8)));
    using F   = float    __attribute__((ext_vector_type(8)));
#endif

static const size_t N = sizeof(U16) / sizeof(uint16_t);

// We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64.
using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
                          U16  r, U16  g, U16  b, U16  a,
                          U16 dr, U16 dg, U16 db, U16 da);

MAYBE_MSABI
ABI extern "C" void WRAP(start_pipeline)(const size_t x0,
                                         const size_t y0,
                                         const size_t xlimit,
                                         const size_t ylimit,
                                         void** program) {
    auto start = (Stage)load_and_inc(program);
    for (size_t dy = y0; dy < ylimit; dy++) {
        size_t dx = x0;
        for (; dx + N <= xlimit; dx += N) {
            start(   0,program,dx,dy, 0,0,0,0, 0,0,0,0);
        }
        if (size_t tail = xlimit - dx) {
            start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
        }
    }
}
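
// A rough sketch of how a caller might drive this pipeline.  This is illustrative only, and
// assumes the Ctx/load_and_inc helpers from SkJumper_misc.h each consume one void* per use:
// the program is a flat array of void*, each stage's function pointer optionally followed by
// its context pointer, terminated by just_return.
//
//     void* program[] = {
//         (void*)WRAP(load_8888),  &load_ctx,    // stages that take a context
//         (void*)WRAP(premul),                   // Ctx::None stages take no context slot
//         (void*)WRAP(store_8888), &store_ctx,
//         (void*)WRAP(just_return),
//     };
//     WRAP(start_pipeline)(x0,y0, xlimit,ylimit, program);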

ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
                                      U16,U16,U16,U16, U16,U16,U16,U16) {}

// All stages use the same function call ABI to chain into each other, but there are three types:
//   GG: geometry in, geometry out   -- think, a matrix
//   GP: geometry in, pixels out     -- think, a memory gather
//   PP: pixels in, pixels out       -- think, a blend mode
//
// (Some stages ignore their inputs or produce no logical output.  That's perfectly fine.)
//
// These three STAGE_ macros let you define each type of stage,
// and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.

#define STAGE_GG(name, ...)                                                            \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y);      \
    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        auto x = join<F>(r,g),                                                         \
             y = join<F>(b,a);                                                         \
        name##_k(Ctx{program}, dx,dy,tail, x,y);                                       \
        split(x, &r,&g);                                                               \
        split(y, &b,&a);                                                               \
        auto next = (Stage)load_and_inc(program);                                      \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
    }                                                                                  \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)

#define STAGE_GP(name, ...)                                                            \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da);                              \
    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        auto x = join<F>(r,g),                                                         \
             y = join<F>(b,a);                                                         \
        name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da);                 \
        auto next = (Stage)load_and_inc(program);                                      \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
    }                                                                                  \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da)

#define STAGE_PP(name, ...)                                                            \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da);                              \
    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da);                      \
        auto next = (Stage)load_and_inc(program);                                      \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
    }                                                                                  \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da)
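
// For illustration only, a hypothetical stage written with STAGE_PP,
//
//     STAGE_PP(force_opaque, Ctx::None) { a = 255; }
//
// expands into the exported WRAP(force_opaque) entry point, which runs the body via
// force_opaque_k() and then calls the next stage fetched with load_and_inc().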

// ~~~~~~ Commonly used helper functions ~~~~~~ //

SI U16 div255(U16 v) {
#if 0
    return (v+127)/255;  // The ideal rounding divide by 255.
#else
    return (v+255)/256;  // A good approximation of (v+127)/255.
#endif
}
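
// A quick worked example of that approximation: for v = 200*100 = 20000, the ideal rounding
// divide gives (20000+127)/255 = 78, while (20000+255)/256 = 79.  For products of two 8-bit
// values the two never differ by more than 1, and dividing by 256 is just an add and a shift.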

SI U16 inv(U16 v) { return 255-v; }

SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); }

SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); }
SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); }
SI U16 max(U16 x, U16 y, U16 z) { return max(x, max(y, z)); }
SI U16 min(U16 x, U16 y, U16 z) { return min(x, min(y, z)); }

SI U16 from_float(float f) { return f * 255.0f + 0.5f; }

SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); }
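
// A quick sanity check on lerp(): lerp(from,to,0) == from and lerp(from,to,255) == to exactly
// (even with the /256 approximation in div255()), and t = 128 lands within one step of halfway,
// e.g. lerp(0,255,128) == 128.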

template <typename D, typename S>
SI D cast(S src) {
    return __builtin_convertvector(src, D);
}

template <typename D, typename S>
SI void split(S v, D* lo, D* hi) {
    static_assert(2*sizeof(D) == sizeof(S), "");
    memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D));
    memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D));
}
template <typename D, typename S>
SI D join(S lo, S hi) {
    static_assert(sizeof(D) == 2*sizeof(S), "");
    D v;
    memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S));
    memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
    return v;
}
template <typename V, typename H>
SI V map(V v, H (*fn)(H)) {
    H lo,hi;
    split(v, &lo,&hi);
    lo = fn(lo);
    hi = fn(hi);
    return join<V>(lo,hi);
}

// TODO: do we need platform-specific intrinsics for any of these?
SI F if_then_else(I32 c, F t, F e) {
    return bit_cast<F>( (bit_cast<I32>(t) & c) | (bit_cast<I32>(e) & ~c) );
}
SI F max(F x, F y) { return if_then_else(x < y, y, x); }
SI F min(F x, F y) { return if_then_else(x < y, x, y); }

SI F mad(F f, F m, F a) { return f*m+a; }
SI U32 trunc_(F x) { return (U32)cast<I32>(x); }

SI F rcp(F x) {
#if defined(__AVX2__)
    return map(x, _mm256_rcp_ps);
#elif defined(__SSE__)
    return map(x, _mm_rcp_ps);
#elif defined(__ARM_NEON)
    return map(x, +[](float32x4_t v) {
        // Polish the hardware estimate with one Newton-Raphson step:
        // vrecpsq_f32(v,est) computes (2 - v*est).
        auto est = vrecpeq_f32(v);
        return vrecpsq_f32(v,est)*est;
    });
#else
    return 1.0f / x;
#endif
}

// ~~~~~~ Basic / misc. stages ~~~~~~ //

STAGE_GG(seed_shader, const float* iota) {
    x = cast<F>(I32(dx)) + unaligned_load<F>(iota);
    y = cast<F>(I32(dy)) + 0.5f;
}

STAGE_GG(matrix_translate, const float* m) {
    x += m[0];
    y += m[1];
}
STAGE_GG(matrix_scale_translate, const float* m) {
    x = mad(x,m[0], m[2]);
    y = mad(y,m[1], m[3]);
}
STAGE_GG(matrix_2x3, const float* m) {
    auto X = mad(x,m[0], mad(y,m[2], m[4])),
         Y = mad(x,m[1], mad(y,m[3], m[5]));
    x = X;
    y = Y;
}
STAGE_GG(matrix_perspective, const float* m) {
    // N.B. Unlike the other matrix_ stages, this matrix is row-major.
    auto X = mad(x,m[0], mad(y,m[1], m[2])),
         Y = mad(x,m[3], mad(y,m[4], m[5])),
         Z = mad(x,m[6], mad(y,m[7], m[8]));
    x = X * rcp(Z);
    y = Y * rcp(Z);
}

STAGE_PP(uniform_color, const SkJumper_UniformColorCtx* c) {
    r = c->rgba[0];
    g = c->rgba[1];
    b = c->rgba[2];
    a = c->rgba[3];
}
STAGE_PP(black_color, Ctx::None) { r = g = b =   0; a = 255; }
STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; }

STAGE_PP(set_rgb, const float rgb[3]) {
    r = from_float(rgb[0]);
    g = from_float(rgb[1]);
    b = from_float(rgb[2]);
}

STAGE_PP(premul, Ctx::None) {
    r = div255(r * a);
    g = div255(g * a);
    b = div255(b * a);
}

STAGE_PP(swap_rb, Ctx::None) {
    auto tmp = r;
    r = b;
    b = tmp;
}

STAGE_PP(move_src_dst, Ctx::None) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}

STAGE_PP(move_dst_src, Ctx::None) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}

STAGE_PP(invert, Ctx::None) {
    r = inv(r);
    g = inv(g);
    b = inv(b);
    a = inv(a);
}

// ~~~~~~ Blend modes ~~~~~~ //

// The same logic applied to all 4 channels.
#define BLEND_MODE(name)                                 \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
    STAGE_PP(name, Ctx::None) {                          \
        r = name##_channel(r,dr,a,da);                   \
        g = name##_channel(g,dg,a,da);                   \
        b = name##_channel(b,db,a,da);                   \
        a = name##_channel(a,da,a,da);                   \
    }                                                    \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)

    BLEND_MODE(clear)    { return 0; }
    BLEND_MODE(srcatop)  { return div255( s*da + d*inv(sa) ); }
    BLEND_MODE(dstatop)  { return div255( d*sa + s*inv(da) ); }
    BLEND_MODE(srcin)    { return div255( s*da ); }
    BLEND_MODE(dstin)    { return div255( d*sa ); }
    BLEND_MODE(srcout)   { return div255( s*inv(da) ); }
    BLEND_MODE(dstout)   { return div255( d*inv(sa) ); }
    BLEND_MODE(srcover)  { return s + div255( d*inv(sa) ); }
    BLEND_MODE(dstover)  { return d + div255( s*inv(da) ); }
    BLEND_MODE(modulate) { return div255( s*d ); }
    BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
    BLEND_MODE(plus_)    { return min(s+d, 255); }
    BLEND_MODE(screen)   { return s + d - div255( s*d ); }
    BLEND_MODE(xor_)     { return div255( s*inv(da) + d*inv(sa) ); }
#undef BLEND_MODE
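
// A worked example for srcover, all in this 8-bit fixed point: with s=128, sa=128, d=200,
// srcover = s + div255(d*inv(sa)) = 128 + div255(200*127) = 128 + 100 = 228, which matches
// round(128 + 200*(1 - 128/255)) from the usual float formula.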

// The same logic applied to color, and srcover for alpha.
#define BLEND_MODE(name)                                 \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
    STAGE_PP(name, Ctx::None) {                          \
        r = name##_channel(r,dr,a,da);                   \
        g = name##_channel(g,dg,a,da);                   \
        b = name##_channel(b,db,a,da);                   \
        a = a + div255( da*inv(a) );                     \
    }                                                    \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)

    BLEND_MODE(darken)     { return s + d -   div255( max(s*da, d*sa) ); }
    BLEND_MODE(lighten)    { return s + d -   div255( min(s*da, d*sa) ); }
    BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); }
    BLEND_MODE(exclusion)  { return s + d - 2*div255( s*d ); }

    BLEND_MODE(hardlight) {
        return div255( s*inv(da) + d*inv(sa) +
                       if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
    }
    BLEND_MODE(overlay) {
        return div255( s*inv(da) + d*inv(sa) +
                       if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
    }
#undef BLEND_MODE

// ~~~~~~ Helpers for interacting with memory ~~~~~~ //

template <typename T>
SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, size_t dx, size_t dy) {
    return (T*)ctx->pixels + dy*ctx->stride + dx;
}

template <typename T>
SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
    auto clamp = [](F v, F limit) {
        limit = bit_cast<F>( bit_cast<U32>(limit) - 1 );  // Exclusive -> inclusive.
        return min(max(0, v), limit);
    };
    x = clamp(x, ctx->width);
    y = clamp(y, ctx->height);

    *ptr = (const T*)ctx->pixels;
    return trunc_(y)*ctx->stride + trunc_(x);
}
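
// The bit trick in clamp() above works because subtracting 1 from a positive float's bit
// pattern gives the next representable float toward zero: e.g. for a width of 8.0f
// (0x41000000), subtracting 1 yields 0x40ffffff ~= 7.9999995, so after trunc_() the index
// is at most width-1 = 7.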

template <typename V, typename T>
SI V load(const T* ptr, size_t tail) {
    V v = 0;
    switch (tail & (N-1)) {
        case  0: memcpy(&v, ptr, sizeof(v)); break;
    #if defined(__AVX2__)
        case 15: v[14] = ptr[14];
        case 14: v[13] = ptr[13];
        case 13: v[12] = ptr[12];
        case 12: memcpy(&v, ptr, 12*sizeof(T)); break;
        case 11: v[10] = ptr[10];
        case 10: v[ 9] = ptr[ 9];
        case  9: v[ 8] = ptr[ 8];
        case  8: memcpy(&v, ptr,  8*sizeof(T)); break;
    #endif
        case  7: v[ 6] = ptr[ 6];
        case  6: v[ 5] = ptr[ 5];
        case  5: v[ 4] = ptr[ 4];
        case  4: memcpy(&v, ptr,  4*sizeof(T)); break;
        case  3: v[ 2] = ptr[ 2];
        case  2: memcpy(&v, ptr,  2*sizeof(T)); break;
        case  1: v[ 0] = ptr[ 0];
    }
    return v;
}
template <typename V, typename T>
SI void store(T* ptr, size_t tail, V v) {
    switch (tail & (N-1)) {
        case  0: memcpy(ptr, &v, sizeof(v)); break;
    #if defined(__AVX2__)
        case 15: ptr[14] = v[14];
        case 14: ptr[13] = v[13];
        case 13: ptr[12] = v[12];
        case 12: memcpy(ptr, &v, 12*sizeof(T)); break;
        case 11: ptr[10] = v[10];
        case 10: ptr[ 9] = v[ 9];
        case  9: ptr[ 8] = v[ 8];
        case  8: memcpy(ptr, &v,  8*sizeof(T)); break;
    #endif
        case  7: ptr[ 6] = v[ 6];
        case  6: ptr[ 5] = v[ 5];
        case  5: ptr[ 4] = v[ 4];
        case  4: memcpy(ptr, &v,  4*sizeof(T)); break;
        case  3: ptr[ 2] = v[ 2];
        case  2: memcpy(ptr, &v,  2*sizeof(T)); break;
        case  1: ptr[ 0] = v[ 0];
    }
}
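
// Note the deliberate fall-through in the switches above.  For example, with N == 8 and
// tail == 3, load() enters at case 3, copies ptr[2] into lane 2, then falls into case 2's
// memcpy of the first two elements, leaving lanes 3..7 zero.  tail == 0 means a full
// N-wide load or store.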

template <typename V, typename T>
SI V gather(const T* ptr, U32 ix) {
#if defined(__AVX2__)
    return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
              ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
              ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
              ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
#else
    return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
              ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
#endif
}
// TODO: AVX2 gather instructions where possible


// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //

SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
#if 1 && defined(__AVX2__)
    // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
    __m256i _01,_23;
    split(rgba, &_01, &_23);
    __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
            _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
    rgba = join<U32>(_02, _13);

    auto cast_U16 = [](U32 v) -> U16 {
        __m256i _02,_13;
        split(v, &_02,&_13);
        return _mm256_packus_epi32(_02,_13);
    };
#else
    auto cast_U16 = [](U32 v) -> U16 {
        return cast<U16>(v);
    };
#endif
    *r = cast_U16(rgba & 65535) & 255;
    *g = cast_U16(rgba & 65535) >>  8;
    *b = cast_U16(rgba >>   16) & 255;
    *a = cast_U16(rgba >>   16) >>  8;
}
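
// Example of the channel extraction above: for a packed pixel 0x80402010
// (r=0x10, g=0x20, b=0x40, a=0x80), rgba & 65535 == 0x2010, whose low byte is r and high
// byte is g; rgba >> 16 == 0x8040 likewise yields b and a.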

SI void load_8888(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
#if 1 && defined(__ARM_NEON)
    uint8x8x4_t rgba;
    switch (tail & (N-1)) {
        case 0: rgba = vld4_u8     ((const uint8_t*)(ptr+0)         ); break;
        case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6);
        case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5);
        case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4);
        case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3);
        case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2);
        case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1);
        case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0);
    }
    *r = cast<U16>(rgba.val[0]);
    *g = cast<U16>(rgba.val[1]);
    *b = cast<U16>(rgba.val[2]);
    *a = cast<U16>(rgba.val[3]);
#else
    from_8888(load<U32>(ptr, tail), r,g,b,a);
#endif
}
SI void store_8888(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
#if 1 && defined(__ARM_NEON)
    uint8x8x4_t rgba = {{
        cast<U8>(r),
        cast<U8>(g),
        cast<U8>(b),
        cast<U8>(a),
    }};
    switch (tail & (N-1)) {
        case 0: vst4_u8     ((uint8_t*)(ptr+0), rgba   ); break;
        case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6);
        case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5);
        case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4);
        case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3);
        case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2);
        case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1);
        case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0);
    }
#else
    store(ptr, tail, cast<U32>(r | (g<<8)) <<  0
                   | cast<U32>(b | (a<<8)) << 16);
#endif
}

STAGE_PP(load_8888, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
STAGE_PP(load_8888_dst, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
STAGE_PP(store_8888, const SkJumper_MemoryCtx* ctx) {
    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
}

STAGE_PP(load_bgra, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &b,&g,&r,&a);
}
STAGE_PP(load_bgra_dst, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &db,&dg,&dr,&da);
}
STAGE_PP(store_bgra, const SkJumper_MemoryCtx* ctx) {
    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, b,g,r,a);
}

STAGE_GP(gather_8888, const SkJumper_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
}
STAGE_GP(gather_bgra, const SkJumper_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_8888(gather<U32>(ptr, ix), &b, &g, &r, &a);
}

// ~~~~~~ 16-bit memory loads and stores ~~~~~~ //

SI void from_565(U16 rgb, U16* r, U16* g, U16* b) {
    // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0
    U16 R = (rgb >> 11) & 31,
        G = (rgb >>  5) & 63,
        B = (rgb >>  0) & 31;

    // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit.
    *r = (R << 3) | (R >> 2);
    *g = (G << 2) | (G >> 4);
    *b = (B << 3) | (B >> 2);
}
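
// Example of that bit replication: a 5-bit red of 31 becomes (31<<3)|(31>>2) = 255, and
// 16 becomes (16<<3)|(16>>2) = 132, matching round(16 * 255/31).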
SI void load_565(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
    from_565(load<U16>(ptr, tail), r,g,b);
}
SI void store_565(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
    // Select the top 5,6,5 bits.
    U16 R = r >> 3,
        G = g >> 2,
        B = b >> 3;
    // Pack them back into 15|rrrrr gggggg bbbbb|0.
    store(ptr, tail, R << 11
                   | G <<  5
                   | B <<  0);
}

STAGE_PP(load_565, const SkJumper_MemoryCtx* ctx) {
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
    a = 255;
}
STAGE_PP(load_565_dst, const SkJumper_MemoryCtx* ctx) {
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
    da = 255;
}
STAGE_PP(store_565, const SkJumper_MemoryCtx* ctx) {
    store_565(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
}
STAGE_GP(gather_565, const SkJumper_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_565(gather<U16>(ptr, ix), &r, &g, &b);
    a = 255;
}

// ~~~~~~ 8-bit memory loads and stores ~~~~~~ //

SI U16 load_8(const uint8_t* ptr, size_t tail) {
    return cast<U16>(load<U8>(ptr, tail));
}
SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
    store(ptr, tail, cast<U8>(v));
}

STAGE_PP(load_a8, const SkJumper_MemoryCtx* ctx) {
    r = g = b = 0;
    a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(load_a8_dst, const SkJumper_MemoryCtx* ctx) {
    dr = dg = db = 0;
    da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(store_a8, const SkJumper_MemoryCtx* ctx) {
    store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
}
STAGE_GP(gather_a8, const SkJumper_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    r = g = b = 0;
    a = cast<U16>(gather<U8>(ptr, ix));
}

STAGE_PP(load_g8, const SkJumper_MemoryCtx* ctx) {
    r = g = b = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    a = 255;
}
STAGE_PP(load_g8_dst, const SkJumper_MemoryCtx* ctx) {
    dr = dg = db = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    da = 255;
}
STAGE_PP(luminance_to_alpha, Ctx::None) {
    a = (r*54 + g*183 + b*19)/256;  // 0.2126, 0.7152, 0.0722 with 256 denominator.
    r = g = b = 0;
}
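
// About luminance_to_alpha()'s weights: 54, 183 and 19 are 0.2126, 0.7152 and 0.0722 scaled
// by 256, with the last rounded up so they sum to 256; a grey input (r == g == b == v) then
// maps back to exactly v.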
STAGE_GP(gather_g8, const SkJumper_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    r = g = b = cast<U16>(gather<U8>(ptr, ix));
    a = 255;
}

// ~~~~~~ Coverage scales / lerps ~~~~~~ //

STAGE_PP(scale_1_float, const float* f) {
    U16 c = from_float(*f);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_1_float, const float* f) {
    U16 c = from_float(*f);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

STAGE_PP(scale_u8, const SkJumper_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_u8, const SkJumper_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
    return if_then_else(a < da, min(cr,cg,cb)
                              , max(cr,cg,cb));
}
STAGE_PP(scale_565, const SkJumper_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = div255( r * cr );
    g = div255( g * cg );
    b = div255( b * cb );
    a = div255( a * ca );
}
STAGE_PP(lerp_565, const SkJumper_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = lerp(dr, r, cr);
    g = lerp(dg, g, cg);
    b = lerp(db, b, cb);
    a = lerp(da, a, ca);
}

// ~~~~~~ Compound stages ~~~~~~ //

STAGE_PP(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    load_8888(ptr, tail, &dr,&dg,&db,&da);
    r = r + div255( dr*inv(a) );
    g = g + div255( dg*inv(a) );
    b = b + div255( db*inv(a) );
    a = a + div255( da*inv(a) );
    store_8888(ptr, tail, r,g,b,a);
}
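
// This is the same math as running load_8888_dst, srcover, and store_8888 back to back,
// fused into a single stage, presumably to skip two trips through the stage-calling convention.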

#endif//defined(__clang__)