SkJumper_stages_lowp.cpp revision e95a62faa0e615af3971981040fe0f90e8a489f5
/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

// This restricted SkJumper backend works on 8-bit per channel pixels stored in
// 16-bit channels.  This is a last attempt to write a performant low-precision
// backend with stage definitions that can be shared by x86 and ARM.

#include "SkJumper.h"
#include "SkJumper_misc.h"

#if defined(__clang__)  // This file is empty when not compiled by Clang.

#if defined(__ARM_NEON)
    #include <arm_neon.h>
    #if defined(__arm__)
        #define ABI __attribute__((pcs("aapcs-vfp")))
    #else
        #define ABI
    #endif
#elif defined(__SSE2__)
    #include <immintrin.h>
    #define ABI
#else
    #include <math.h>
    #define ABI
#endif

#if !defined(JUMPER_IS_OFFLINE)
    #define WRAP(name) sk_##name##_lowp
#elif defined(__AVX2__)
    #define WRAP(name) sk_##name##_hsw_lowp
#elif defined(__SSE4_1__)
    #define WRAP(name) sk_##name##_sse41_lowp
#elif defined(__SSE2__)
    #define WRAP(name) sk_##name##_sse2_lowp
#endif

#if defined(__AVX2__)
    using U8  = uint8_t  __attribute__((ext_vector_type(16)));
    using U16 = uint16_t __attribute__((ext_vector_type(16)));
    using I16 =  int16_t __attribute__((ext_vector_type(16)));
    using I32 =  int32_t __attribute__((ext_vector_type(16)));
    using U32 = uint32_t __attribute__((ext_vector_type(16)));
    using F   = float    __attribute__((ext_vector_type(16)));
#else
    using U8  = uint8_t  __attribute__((ext_vector_type(8)));
    using U16 = uint16_t __attribute__((ext_vector_type(8)));
    using I16 =  int16_t __attribute__((ext_vector_type(8)));
    using I32 =  int32_t __attribute__((ext_vector_type(8)));
    using U32 = uint32_t __attribute__((ext_vector_type(8)));
    using F   = float    __attribute__((ext_vector_type(8)));
#endif

static const size_t N = sizeof(U16) / sizeof(uint16_t);
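// N is the number of pixels each stage handles per call: 16 under AVX2
// (16-lane vectors of 16-bit channels), 8 everywhere else.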

// We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64.
using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
                          U16  r, U16  g, U16  b, U16  a,
                          U16 dr, U16 dg, U16 db, U16 da);

MAYBE_MSABI
ABI extern "C" void WRAP(start_pipeline)(const size_t x0,
                                         const size_t y0,
                                         const size_t xlimit,
                                         const size_t ylimit,
                                         void** program) {
    auto start = (Stage)load_and_inc(program);
    for (size_t dy = y0; dy < ylimit; dy++) {
        size_t dx = x0;
        for (; dx + N <= xlimit; dx += N) {
            start(   0,program,dx,dy, 0,0,0,0, 0,0,0,0);
        }
        if (size_t tail = xlimit - dx) {
            start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
        }
    }
}

ABI extern "C" void WRAP(just_return)(size_t,void**,size_t,size_t,
                                      U16,U16,U16,U16, U16,U16,U16,U16) {}
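
// A rough sketch of how a pipeline program is laid out (illustrative only; the
// exact encoding lives in SkRasterPipeline / SkJumper proper): program is a flat
// array of pointers where each stage's function pointer is followed by its context
// pointer if it takes one, and the chain ends with just_return.  Something like
//
//     void* program[] = {
//         (void*)WRAP(load_8888),  &load_ctx,    // hypothetical contexts
//         (void*)WRAP(premul),                   // Ctx::None stages take no slot
//         (void*)WRAP(store_8888), &store_ctx,
//         (void*)WRAP(just_return),
//     };
//
// start_pipeline() then calls the first stage once per N-pixel chunk of each row,
// passing tail = 0 for full chunks and tail = xlimit - dx for the ragged end.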

// All stages use the same function call ABI to chain into each other, but there are three types:
//   GG: geometry in, geometry out.  -- think, a matrix
//   GP: geometry in, pixels out.    -- think, a memory gather
//   PP: pixels in, pixels out.      -- think, a blend mode
//
// (Some stages ignore their inputs or produce no logical output.  That's perfectly fine.)
//
// These three STAGE_ macros let you define each type of stage; the stage body gets
// (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.
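//
// For example (taken from later in this file), a pixels-in/pixels-out stage reads
//
//     STAGE_PP(premul, Ctx::None) {
//         r = div255(r * a);
//         g = div255(g * a);
//         b = div255(b * a);
//     }
//
// and the macro supplies the ABI wrapper that unpacks the arguments, runs the body,
// and then calls the next stage in the program.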

#define STAGE_GG(name, ...)                                                            \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y);      \
    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        auto x = join<F>(r,g),                                                         \
             y = join<F>(b,a);                                                         \
        name##_k(Ctx{program}, dx,dy,tail, x,y);                                       \
        split(x, &r,&g);                                                               \
        split(y, &b,&a);                                                               \
        auto next = (Stage)load_and_inc(program);                                      \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
    }                                                                                  \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)

#define STAGE_GP(name, ...)                                                            \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da);                              \
    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        auto x = join<F>(r,g),                                                         \
             y = join<F>(b,a);                                                         \
        name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da);                 \
        auto next = (Stage)load_and_inc(program);                                      \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
    }                                                                                  \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da)

#define STAGE_PP(name, ...)                                                            \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da);                              \
    ABI extern "C" void WRAP(name)(size_t tail, void** program, size_t dx, size_t dy,  \
                                   U16  r, U16  g, U16  b, U16  a,                     \
                                   U16 dr, U16 dg, U16 db, U16 da) {                   \
        name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da);                      \
        auto next = (Stage)load_and_inc(program);                                      \
        next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
    }                                                                                  \
    SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
                     U16&  r, U16&  g, U16&  b, U16&  a,                               \
                     U16& dr, U16& dg, U16& db, U16& da)

// ~~~~~~ Commonly used helper functions ~~~~~~ //

SI U16 div255(U16 v) {
#if 0
    return (v+127)/255;  // The ideal rounding divide by 255.
#else
    return (v+255)/256;  // A good approximation of (v+127)/255.
#endif
}

SI U16 inv(U16 v) { return 255-v; }

SI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); }
SI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) | (e & ~c); }

SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); }
SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); }
SI U16 max(U16 x, U16 y, U16 z) { return max(x, max(y, z)); }
SI U16 min(U16 x, U16 y, U16 z) { return min(x, min(y, z)); }

SI U16 from_float(float f) { return f * 255.0f + 0.5f; }

SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); }
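
// For instance, lerp(from=0, to=255, t=128) = div255(0*127 + 255*128) = 128; t acts
// as 8-bit fixed-point coverage, so t=0 keeps `from` and t=255 takes `to` exactly.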

template <typename D, typename S>
SI D cast(S src) {
    return __builtin_convertvector(src, D);
}

template <typename D, typename S>
SI void split(S v, D* lo, D* hi) {
    static_assert(2*sizeof(D) == sizeof(S), "");
    memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D));
    memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D));
}
template <typename D, typename S>
SI D join(S lo, S hi) {
    static_assert(sizeof(D) == 2*sizeof(S), "");
    D v;
    memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S));
    memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
    return v;
}
template <typename V, typename H>
SI V map(V v, H (*fn)(H)) {
    H lo,hi;
    split(v, &lo,&hi);
    lo = fn(lo);
    hi = fn(hi);
    return join<V>(lo,hi);
}

// TODO: do we need platform-specific intrinsics for any of these?
SI F if_then_else(I32 c, F t, F e) {
    return bit_cast<F>( (bit_cast<I32>(t) & c) | (bit_cast<I32>(e) & ~c) );
}
SI F max(F x, F y) { return if_then_else(x < y, y, x); }
SI F min(F x, F y) { return if_then_else(x < y, x, y); }

SI F mad(F f, F m, F a) { return f*m+a; }
SI U32 trunc_(F x) { return (U32)cast<I32>(x); }

SI F rcp(F x) {
#if defined(__AVX2__)
    return map(x, _mm256_rcp_ps);
#elif defined(__SSE__)
    return map(x, _mm_rcp_ps);
#elif defined(__ARM_NEON)
    return map(x, +[](float32x4_t v) {
        auto est = vrecpeq_f32(v);
        return vrecpsq_f32(v,est)*est;
    });
#else
    return 1.0f / x;
#endif
}
SI F sqrt_(F x) {
#if defined(__AVX2__)
    return map(x, _mm256_sqrt_ps);
#elif defined(__SSE__)
    return map(x, _mm_sqrt_ps);
#elif defined(__aarch64__)
    return map(x, vsqrtq_f32);
#elif defined(__ARM_NEON)
    return map(x, +[](float32x4_t v) {
        auto est = vrsqrteq_f32(v);  // Estimate and two refinement steps for est = rsqrt(v).
        est *= vrsqrtsq_f32(v,est*est);
        est *= vrsqrtsq_f32(v,est*est);
        return v*est;                // sqrt(v) == v*rsqrt(v).
    });
#else
    return F{
        sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]),
        sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]),
    };
#endif
}

SI F floor_(F x) {
#if defined(__aarch64__)
    return map(x, vrndmq_f32);
#elif defined(__AVX2__)
    return map(x, +[](__m256 v){ return _mm256_floor_ps(v); });  // _mm256_floor_ps is a macro...
#elif defined(__SSE4_1__)
    return map(x, +[](__m128 v){ return    _mm_floor_ps(v); });  // _mm_floor_ps() is a macro too.
#else
    F roundtrip = cast<F>(cast<I32>(x));
    return roundtrip - if_then_else(roundtrip > x, F(1), F(0));
#endif
}
SI F abs_(F x) { return bit_cast<F>( bit_cast<I32>(x) & 0x7fffffff ); }

// ~~~~~~ Basic / misc. stages ~~~~~~ //

STAGE_GG(seed_shader, const float* iota) {
    x = cast<F>(I32(dx)) + unaligned_load<F>(iota);
    y = cast<F>(I32(dy)) + 0.5f;
}

STAGE_GG(matrix_translate, const float* m) {
    x += m[0];
    y += m[1];
}
STAGE_GG(matrix_scale_translate, const float* m) {
    x = mad(x,m[0], m[2]);
    y = mad(y,m[1], m[3]);
}
STAGE_GG(matrix_2x3, const float* m) {
    auto X = mad(x,m[0], mad(y,m[2], m[4])),
         Y = mad(x,m[1], mad(y,m[3], m[5]));
    x = X;
    y = Y;
}
STAGE_GG(matrix_perspective, const float* m) {
    // N.B. Unlike the other matrix_ stages, this matrix is row-major.
    auto X = mad(x,m[0], mad(y,m[1], m[2])),
         Y = mad(x,m[3], mad(y,m[4], m[5])),
         Z = mad(x,m[6], mad(y,m[7], m[8]));
    x = X * rcp(Z);
    y = Y * rcp(Z);
}

STAGE_PP(uniform_color, const SkJumper_UniformColorCtx* c) {
    r = c->rgba[0];
    g = c->rgba[1];
    b = c->rgba[2];
    a = c->rgba[3];
}
STAGE_PP(black_color, Ctx::None) { r = g = b =   0; a = 255; }
STAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; }

STAGE_PP(set_rgb, const float rgb[3]) {
    r = from_float(rgb[0]);
    g = from_float(rgb[1]);
    b = from_float(rgb[2]);
}

STAGE_PP(clamp_a, Ctx::None) {
    r = min(r, a);
    g = min(g, a);
    b = min(b, a);
}
STAGE_PP(clamp_a_dst, Ctx::None) {
    dr = min(dr, da);
    dg = min(dg, da);
    db = min(db, da);
}

STAGE_PP(premul, Ctx::None) {
    r = div255(r * a);
    g = div255(g * a);
    b = div255(b * a);
}
STAGE_PP(premul_dst, Ctx::None) {
    dr = div255(dr * da);
    dg = div255(dg * da);
    db = div255(db * da);
}

STAGE_PP(swap_rb, Ctx::None) {
    auto tmp = r;
    r = b;
    b = tmp;
}

STAGE_PP(move_src_dst, Ctx::None) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}

STAGE_PP(move_dst_src, Ctx::None) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}

STAGE_PP(invert, Ctx::None) {
    r = inv(r);
    g = inv(g);
    b = inv(b);
    a = inv(a);
}

// ~~~~~~ Blend modes ~~~~~~ //

// The same logic applied to all 4 channels.
#define BLEND_MODE(name)                                 \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
    STAGE_PP(name, Ctx::None) {                          \
        r = name##_channel(r,dr,a,da);                   \
        g = name##_channel(g,dg,a,da);                   \
        b = name##_channel(b,db,a,da);                   \
        a = name##_channel(a,da,a,da);                   \
    }                                                    \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)

    BLEND_MODE(clear)    { return 0; }
    BLEND_MODE(srcatop)  { return div255( s*da + d*inv(sa) ); }
    BLEND_MODE(dstatop)  { return div255( d*sa + s*inv(da) ); }
    BLEND_MODE(srcin)    { return div255( s*da ); }
    BLEND_MODE(dstin)    { return div255( d*sa ); }
    BLEND_MODE(srcout)   { return div255( s*inv(da) ); }
    BLEND_MODE(dstout)   { return div255( d*inv(sa) ); }
    BLEND_MODE(srcover)  { return s + div255( d*inv(sa) ); }
    BLEND_MODE(dstover)  { return d + div255( s*inv(da) ); }
    BLEND_MODE(modulate) { return div255( s*d ); }
    BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
    BLEND_MODE(plus_)    { return min(s+d, 255); }
    BLEND_MODE(screen)   { return s + d - div255( s*d ); }
    BLEND_MODE(xor_)     { return div255( s*inv(da) + d*inv(sa) ); }
#undef BLEND_MODE
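
// These are the usual Porter-Duff formulas with alpha in [0,255] instead of [0,1]:
// a product like (1-sa)*d becomes div255(d*inv(sa)), so e.g. srcover's s + (1-sa)*d
// turns into s + div255(d*inv(sa)) above.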

// The same logic applied to color, and srcover for alpha.
#define BLEND_MODE(name)                                 \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
    STAGE_PP(name, Ctx::None) {                          \
        r = name##_channel(r,dr,a,da);                   \
        g = name##_channel(g,dg,a,da);                   \
        b = name##_channel(b,db,a,da);                   \
        a = a + div255( da*inv(a) );                     \
    }                                                    \
    SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)

    BLEND_MODE(darken)     { return s + d -   div255( max(s*da, d*sa) ); }
    BLEND_MODE(lighten)    { return s + d -   div255( min(s*da, d*sa) ); }
    BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); }
    BLEND_MODE(exclusion)  { return s + d - 2*div255( s*d ); }

    BLEND_MODE(hardlight) {
        return div255( s*inv(da) + d*inv(sa) +
                       if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
    }
    BLEND_MODE(overlay) {
        return div255( s*inv(da) + d*inv(sa) +
                       if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
    }
#undef BLEND_MODE
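
// Note overlay is hardlight with source and destination swapped; the only textual
// difference above is the test (2*d <= da instead of 2*s <= sa), matching the usual
// identity overlay(s,d) == hardlight(d,s).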

// ~~~~~~ Helpers for interacting with memory ~~~~~~ //

template <typename T>
SI T* ptr_at_xy(const SkJumper_MemoryCtx* ctx, size_t dx, size_t dy) {
    return (T*)ctx->pixels + dy*ctx->stride + dx;
}

template <typename T>
SI U32 ix_and_ptr(T** ptr, const SkJumper_GatherCtx* ctx, F x, F y) {
    auto clamp = [](F v, F limit) {
        limit = bit_cast<F>( bit_cast<U32>(limit) - 1 );  // Exclusive -> inclusive.
        return min(max(0, v), limit);
    };
    x = clamp(x, ctx->width);
    y = clamp(y, ctx->height);

    *ptr = (const T*)ctx->pixels;
    return trunc_(y)*ctx->stride + trunc_(x);
}

template <typename V, typename T>
SI V load(const T* ptr, size_t tail) {
    V v = 0;
    switch (tail & (N-1)) {
        case  0: memcpy(&v, ptr, sizeof(v)); break;
    #if defined(__AVX2__)
        case 15: v[14] = ptr[14];
        case 14: v[13] = ptr[13];
        case 13: v[12] = ptr[12];
        case 12: memcpy(&v, ptr, 12*sizeof(T)); break;
        case 11: v[10] = ptr[10];
        case 10: v[ 9] = ptr[ 9];
        case  9: v[ 8] = ptr[ 8];
        case  8: memcpy(&v, ptr,  8*sizeof(T)); break;
    #endif
        case  7: v[ 6] = ptr[ 6];
        case  6: v[ 5] = ptr[ 5];
        case  5: v[ 4] = ptr[ 4];
        case  4: memcpy(&v, ptr,  4*sizeof(T)); break;
        case  3: v[ 2] = ptr[ 2];
        case  2: memcpy(&v, ptr,  2*sizeof(T)); break;
        case  1: v[ 0] = ptr[ 0];
    }
    return v;
}
template <typename V, typename T>
SI void store(T* ptr, size_t tail, V v) {
    switch (tail & (N-1)) {
        case  0: memcpy(ptr, &v, sizeof(v)); break;
    #if defined(__AVX2__)
        case 15: ptr[14] = v[14];
        case 14: ptr[13] = v[13];
        case 13: ptr[12] = v[12];
        case 12: memcpy(ptr, &v, 12*sizeof(T)); break;
        case 11: ptr[10] = v[10];
        case 10: ptr[ 9] = v[ 9];
        case  9: ptr[ 8] = v[ 8];
        case  8: memcpy(ptr, &v,  8*sizeof(T)); break;
    #endif
        case  7: ptr[ 6] = v[ 6];
        case  6: ptr[ 5] = v[ 5];
        case  5: ptr[ 4] = v[ 4];
        case  4: memcpy(ptr, &v,  4*sizeof(T)); break;
        case  3: ptr[ 2] = v[ 2];
        case  2: memcpy(ptr, &v,  2*sizeof(T)); break;
        case  1: ptr[ 0] = v[ 0];
    }
}

template <typename V, typename T>
SI V gather(const T* ptr, U32 ix) {
#if defined(__AVX2__)
    return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
              ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
              ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
              ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
#else
    return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
              ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
#endif
}
// TODO: AVX2 gather instructions where possible


// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //

SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
#if 1 && defined(__AVX2__)
    // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
    __m256i _01,_23;
    split(rgba, &_01, &_23);
    __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
            _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
    rgba = join<U32>(_02, _13);

    auto cast_U16 = [](U32 v) -> U16 {
        __m256i _02,_13;
        split(v, &_02,&_13);
        return _mm256_packus_epi32(_02,_13);
    };
#else
    auto cast_U16 = [](U32 v) -> U16 {
        return cast<U16>(v);
    };
#endif
    *r = cast_U16(rgba & 65535) & 255;
    *g = cast_U16(rgba & 65535) >>  8;
    *b = cast_U16(rgba >>   16) & 255;
    *a = cast_U16(rgba >>   16) >>  8;
}

SI void load_8888(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
#if 1 && defined(__ARM_NEON)
    uint8x8x4_t rgba;
    switch (tail & (N-1)) {
        case 0: rgba = vld4_u8     ((const uint8_t*)(ptr+0)         ); break;
        case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6);
        case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5);
        case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4);
        case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3);
        case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2);
        case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1);
        case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0);
    }
    *r = cast<U16>(rgba.val[0]);
    *g = cast<U16>(rgba.val[1]);
    *b = cast<U16>(rgba.val[2]);
    *a = cast<U16>(rgba.val[3]);
#else
    from_8888(load<U32>(ptr, tail), r,g,b,a);
#endif
}
SI void store_8888(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
#if 1 && defined(__ARM_NEON)
    uint8x8x4_t rgba = {{
        cast<U8>(r),
        cast<U8>(g),
        cast<U8>(b),
        cast<U8>(a),
    }};
    switch (tail & (N-1)) {
        case 0: vst4_u8     ((uint8_t*)(ptr+0), rgba   ); break;
        case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6);
        case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5);
        case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4);
        case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3);
        case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2);
        case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1);
        case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0);
    }
#else
    store(ptr, tail, cast<U32>(r | (g<<8)) <<  0
                   | cast<U32>(b | (a<<8)) << 16);
#endif
}

STAGE_PP(load_8888, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
STAGE_PP(load_8888_dst, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
STAGE_PP(store_8888, const SkJumper_MemoryCtx* ctx) {
    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
}

STAGE_PP(load_bgra, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &b,&g,&r,&a);
}
STAGE_PP(load_bgra_dst, const SkJumper_MemoryCtx* ctx) {
    load_8888(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &db,&dg,&dr,&da);
}
STAGE_PP(store_bgra, const SkJumper_MemoryCtx* ctx) {
    store_8888(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, b,g,r,a);
}

STAGE_GP(gather_8888, const SkJumper_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
}
STAGE_GP(gather_bgra, const SkJumper_GatherCtx* ctx) {
    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_8888(gather<U32>(ptr, ix), &b, &g, &r, &a);
}

// ~~~~~~ 16-bit memory loads and stores ~~~~~~ //

SI void from_565(U16 rgb, U16* r, U16* g, U16* b) {
    // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0
    U16 R = (rgb >> 11) & 31,
        G = (rgb >>  5) & 63,
        B = (rgb >>  0) & 31;

    // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit.
    *r = (R << 3) | (R >> 2);
    *g = (G << 2) | (G >> 4);
    *b = (B << 3) | (B >> 2);
}
SI void load_565(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
    from_565(load<U16>(ptr, tail), r,g,b);
}
SI void store_565(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
    // Select the top 5,6,5 bits.
    U16 R = r >> 3,
        G = g >> 2,
        B = b >> 3;
    // Pack them back into 15|rrrrr gggggg bbbbb|0.
    store(ptr, tail, R << 11
                   | G <<  5
                   | B <<  0);
}

STAGE_PP(load_565, const SkJumper_MemoryCtx* ctx) {
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
    a = 255;
}
STAGE_PP(load_565_dst, const SkJumper_MemoryCtx* ctx) {
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
    da = 255;
}
STAGE_PP(store_565, const SkJumper_MemoryCtx* ctx) {
    store_565(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
}
STAGE_GP(gather_565, const SkJumper_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_565(gather<U16>(ptr, ix), &r, &g, &b);
    a = 255;
}

SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) {
    // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0.
    U16 R = (rgba >> 12) & 15,
        G = (rgba >>  8) & 15,
        B = (rgba >>  4) & 15,
        A = (rgba >>  0) & 15;

    // Scale [0,15] to [0,255].
    *r = (R << 4) | R;
    *g = (G << 4) | G;
    *b = (B << 4) | B;
    *a = (A << 4) | A;
}
SI void load_4444(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
    from_4444(load<U16>(ptr, tail), r,g,b,a);
}
SI void store_4444(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
    // Select the top 4 bits of each.
    U16 R = r >> 4,
        G = g >> 4,
        B = b >> 4,
        A = a >> 4;
    // Pack them back into 15|rrrr gggg bbbb aaaa|0.
    store(ptr, tail, R << 12
                   | G <<  8
                   | B <<  4
                   | A <<  0);
}

STAGE_PP(load_4444, const SkJumper_MemoryCtx* ctx) {
    load_4444(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
}
STAGE_PP(load_4444_dst, const SkJumper_MemoryCtx* ctx) {
    load_4444(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
}
STAGE_PP(store_4444, const SkJumper_MemoryCtx* ctx) {
    store_4444(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a);
}
STAGE_GP(gather_4444, const SkJumper_GatherCtx* ctx) {
    const uint16_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a);
}

// ~~~~~~ 8-bit memory loads and stores ~~~~~~ //

SI U16 load_8(const uint8_t* ptr, size_t tail) {
    return cast<U16>(load<U8>(ptr, tail));
}
SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
    store(ptr, tail, cast<U8>(v));
}

STAGE_PP(load_a8, const SkJumper_MemoryCtx* ctx) {
    r = g = b = 0;
    a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(load_a8_dst, const SkJumper_MemoryCtx* ctx) {
    dr = dg = db = 0;
    da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
}
STAGE_PP(store_a8, const SkJumper_MemoryCtx* ctx) {
    store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
}
STAGE_GP(gather_a8, const SkJumper_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    r = g = b = 0;
    a = cast<U16>(gather<U8>(ptr, ix));
}

STAGE_PP(load_g8, const SkJumper_MemoryCtx* ctx) {
    r = g = b = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    a = 255;
}
STAGE_PP(load_g8_dst, const SkJumper_MemoryCtx* ctx) {
    dr = dg = db = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    da = 255;
}
STAGE_PP(luminance_to_alpha, Ctx::None) {
    a = (r*54 + g*183 + b*19)/256;  // 0.2126, 0.7152, 0.0722 with 256 denominator.
    r = g = b = 0;
}
STAGE_GP(gather_g8, const SkJumper_GatherCtx* ctx) {
    const uint8_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, x,y);
    r = g = b = cast<U16>(gather<U8>(ptr, ix));
    a = 255;
}

// ~~~~~~ Coverage scales / lerps ~~~~~~ //

STAGE_PP(scale_1_float, const float* f) {
    U16 c = from_float(*f);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_1_float, const float* f) {
    U16 c = from_float(*f);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

STAGE_PP(scale_u8, const SkJumper_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = div255( r * c );
    g = div255( g * c );
    b = div255( b * c );
    a = div255( a * c );
}
STAGE_PP(lerp_u8, const SkJumper_MemoryCtx* ctx) {
    U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

// Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
    return if_then_else(a < da, min(cr,cg,cb)
                              , max(cr,cg,cb));
}
STAGE_PP(scale_565, const SkJumper_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = div255( r * cr );
    g = div255( g * cg );
    b = div255( b * cb );
    a = div255( a * ca );
}
STAGE_PP(lerp_565, const SkJumper_MemoryCtx* ctx) {
    U16 cr,cg,cb;
    load_565(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
    U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);

    r = lerp(dr, r, cr);
    g = lerp(dg, g, cg);
    b = lerp(db, b, cb);
    a = lerp(da, a, ca);
}

// ~~~~~~ Gradient stages ~~~~~~ //

// Clamp x to [0,1], both sides inclusive (think, gradients).
// Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN.
SI F clamp_01(F v) { return min(max(0, v), 1); }

STAGE_GG(clamp_x_1 , Ctx::None) { x = clamp_01(x); }
STAGE_GG(repeat_x_1, Ctx::None) { x = clamp_01(x - floor_(x)); }
STAGE_GG(mirror_x_1, Ctx::None) {
    auto two = [](F x){ return x+x; };
    x = clamp_01(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f ));
}

SI U16 round_F_to_U16(F x) { return cast<U16>(x * 255.0f + 0.5f); }

SI void gradient_lookup(const SkJumper_GradientCtx* c, U32 idx, F t,
                        U16* r, U16* g, U16* b, U16* a) {
    F fr = gather<F>(c->fs[0], idx),
      fg = gather<F>(c->fs[1], idx),
      fb = gather<F>(c->fs[2], idx),
      fa = gather<F>(c->fs[3], idx),
      br = gather<F>(c->bs[0], idx),
      bg = gather<F>(c->bs[1], idx),
      bb = gather<F>(c->bs[2], idx),
      ba = gather<F>(c->bs[3], idx);

    *r = round_F_to_U16(mad(t, fr, br));
    *g = round_F_to_U16(mad(t, fg, bg));
    *b = round_F_to_U16(mad(t, fb, bb));
    *a = round_F_to_U16(mad(t, fa, ba));
}

STAGE_GP(gradient, const SkJumper_GradientCtx* c) {
    auto t = x;
    U32 idx = 0;

    // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
    for (size_t i = 1; i < c->stopCount; i++) {
        idx += if_then_else(t >= c->ts[i], U32(1), U32(0));
    }

    gradient_lookup(c, idx, t, &r, &g, &b, &a);
}

STAGE_GP(evenly_spaced_gradient, const SkJumper_GradientCtx* c) {
    auto t = x;
    auto idx = trunc_(t * (c->stopCount-1));
    gradient_lookup(c, idx, t, &r, &g, &b, &a);
}

STAGE_GP(evenly_spaced_2_stop_gradient, const void* ctx) {
    // TODO: Rename Ctx SkJumper_EvenlySpaced2StopGradientCtx.
    struct Ctx { float f[4], b[4]; };
    auto c = (const Ctx*)ctx;

    auto t = x;
    r = round_F_to_U16(mad(t, c->f[0], c->b[0]));
    g = round_F_to_U16(mad(t, c->f[1], c->b[1]));
    b = round_F_to_U16(mad(t, c->f[2], c->b[2]));
    a = round_F_to_U16(mad(t, c->f[3], c->b[3]));
}

STAGE_GG(xy_to_unit_angle, Ctx::None) {
    F xabs = abs_(x),
      yabs = abs_(y);

    F slope = min(xabs, yabs)/max(xabs, yabs);
    F s = slope * slope;

    // Use a 7th degree polynomial to approximate atan.
    // This was generated using sollya.gforge.inria.fr.
    // A float optimized polynomial was generated using the following command.
    // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
    F phi = slope
             * (0.15912117063999176025390625f     + s
             * (-5.185396969318389892578125e-2f   + s
             * (2.476101927459239959716796875e-2f + s
             * (-7.0547382347285747528076171875e-3f))));

    phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
    phi = if_then_else(x < 0.0f   , 1.0f/2.0f - phi, phi);
    phi = if_then_else(y < 0.0f   , 1.0f - phi     , phi);
    phi = if_then_else(phi != phi , 0              , phi);  // Check for NaN.
    x = phi;
}
STAGE_GG(xy_to_radius, Ctx::None) {
    x = sqrt_(x*x + y*y);
}

// ~~~~~~ Compound stages ~~~~~~ //

STAGE_PP(srcover_rgba_8888, const SkJumper_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    load_8888(ptr, tail, &dr,&dg,&db,&da);
    r = r + div255( dr*inv(a) );
    g = g + div255( dg*inv(a) );
    b = b + div255( db*inv(a) );
    a = a + div255( da*inv(a) );
    store_8888(ptr, tail, r,g,b,a);
}
STAGE_PP(srcover_bgra_8888, const SkJumper_MemoryCtx* ctx) {
    auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);

    load_8888(ptr, tail, &db,&dg,&dr,&da);
    r = r + div255( dr*inv(a) );
    g = g + div255( dg*inv(a) );
    b = b + div255( db*inv(a) );
    a = a + div255( da*inv(a) );
    store_8888(ptr, tail, b,g,r,a);
}

#endif//defined(__clang__)