SkJumper_stages_lowp.cpp revision 5883a11862771c3d5abb3a26c8c4b6bb0570de23
/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkJumper.h"
#include "SkJumper_misc.h"
#include <immintrin.h>

#if !defined(__SSSE3__) || !defined(__clang__) || !defined(__x86_64__)
    #error "We're starting with just SSSE3 x86-64 for now, and will always require Clang."
#endif

#define WRAP(name) sk_##name##_ssse3_lowp

using K = const SkJumper_constants;
static const size_t kStride = 8;

template <typename T> using V = T __attribute__((ext_vector_type(8)));
using U8  = V<uint8_t>;
using U16 = V<uint16_t>;
using U32 = V<uint32_t>;

// See SkFixed15.h for details on this format and its operations.
struct F {
    U16 vec;

    F() = default;
    F(float f) {
        // After adding 256.0f, the SkFixed15 value is the bottom two bytes of the float.
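        // (For f in [0,1], 256.0f + f lies in [256,257), so the float's mantissa stores
        //  f/256 * 2^23 = f * 32768; e.g. f == 0.5f leaves 16384 in those two bytes.)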
        f += 256.0f;
        vec = unaligned_load<uint16_t>(&f);
    }

    F(U16 v) : vec(v) {}
    operator U16() const { return vec; }
};

SI F operator+(F x, F y) { return x.vec + y.vec; }
SI F operator-(F x, F y) { return x.vec - y.vec; }
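// SkFixed15 multiply: _mm_mulhrs_epi16() computes a rounded (x*y)>>15, which is exactly the
// fixed-point product, except that it treats 0x8000 (1.0 here) as -32768.  Products involving
// 1.0 therefore come out negated, and _mm_abs_epi16() flips them back into range.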
SI F operator*(F x, F y) { return _mm_abs_epi16(_mm_mulhrs_epi16(x.vec, y.vec)); }
SI F mad(F f, F m, F a) { return f*m+a; }
SI F inv(F v) { return 1.0f - v; }
SI F two(F v) { return v + v; }
SI F lerp(F from, F to, F t) { return to*t + from*inv(t); }

SI F operator<<(F x, int bits) { return x.vec << bits; }
SI F operator>>(F x, int bits) { return x.vec >> bits; }

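// The eight F arguments carry the pipeline state (r,g,b,a then dr,dg,db,da) in vector
// registers, so stages can hand it to each other without touching the stack.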
using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F);

MAYBE_MSABI
extern "C" size_t WRAP(start_pipeline)(size_t x, size_t y, size_t limit, void** program, K* k) {
    F v{};
    auto start = (Stage*)load_and_inc(program);
    while (x + kStride <= limit) {
        start(k,program,x,y,0,    v,v,v,v, v,v,v,v);
        x += kStride;
    }
    if (size_t tail = limit - x) {
        start(k,program,x,y,tail, v,v,v,v, v,v,v,v);
    }
    return limit;
}
extern "C" void WRAP(just_return)(K*, void**, size_t,size_t,size_t, F,F,F,F, F,F,F,F) {}

#define STAGE(name)                                                                   \
    SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail,              \
                     F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);             \
    extern "C" void WRAP(name)(K* k, void** program, size_t x, size_t y, size_t tail, \
                               F r, F g, F b, F a, F dr, F dg, F db, F da) {          \
        LazyCtx ctx(program);                                                         \
        name##_k(k,ctx,x,y,tail, r,g,b,a, dr,dg,db,da);                               \
        auto next = (Stage*)load_and_inc(program);                                    \
        next(k,program,x,y,tail, r,g,b,a, dr,dg,db,da);                               \
    }                                                                                 \
    SI void name##_k(K* k, LazyCtx ctx, size_t x, size_t y, size_t tail,              \
                     F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
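// For example, STAGE(premul) declares premul_k(), defines the exported stage
// sk_premul_ssse3_lowp() that runs it and then calls the next stage in the program,
// and opens the body of premul_k() for the code that follows the macro.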


// Helper functions used by multiple stages.
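// In load() and store(), tail == 0 means a full stride of kStride (8) pixels; tail == 1..7
// touches only the first 'tail' lanes, so the end of a scanline never reads or writes
// past the buffer.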

template <typename V, typename T>
SI V load(const T* src, size_t tail) {
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        V v{};  // Any inactive lanes are zeroed.
        switch (tail-1) {
            case 6: v[6] = src[6];
            case 5: v[5] = src[5];
            case 4: v[4] = src[4];
            case 3: v[3] = src[3];
            case 2: v[2] = src[2];
            case 1: v[1] = src[1];
            case 0: v[0] = src[0];
        }
        return v;
    }
    return unaligned_load<V>(src);
}

template <typename V, typename T>
SI void store(T* dst, V v, size_t tail) {
    __builtin_assume(tail < kStride);
    if (__builtin_expect(tail, 0)) {
        switch (tail-1) {
            case 6: dst[6] = v[6];
            case 5: dst[5] = v[5];
            case 4: dst[4] = v[4];
            case 3: dst[3] = v[3];
            case 2: dst[2] = v[2];
            case 1: dst[1] = v[1];
            case 0: dst[0] = v[0];
        }
        return;
    }
    unaligned_store(dst, v);
}

SI void from_8888(U32 rgba, F* r, F* g, F* b, F* a) {
    // Split the 8 pixels into low and high halves, and reinterpret as vectors of 16-bit values.
    U16 lo = unaligned_load<U16>((const uint32_t*)&rgba + 0),
        hi = unaligned_load<U16>((const uint32_t*)&rgba + 4);

    // Shuffle so that the 4 bytes of each color channel are contiguous...
    lo = _mm_shuffle_epi8(lo, _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15));
    hi = _mm_shuffle_epi8(hi, _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15));

    // ...then get all 8 bytes of each color channel together into a single register.
    U16 rg = _mm_unpacklo_epi32(lo,hi),
        ba = _mm_unpackhi_epi32(lo,hi);

    // Unpack the bytes into the high half of each 16-bit lane, which multiplies each by 256 for free.
    U16 R = _mm_unpacklo_epi8(U16(0), rg),
        G = _mm_unpackhi_epi8(U16(0), rg),
        B = _mm_unpacklo_epi8(U16(0), ba),
        A = _mm_unpackhi_epi8(U16(0), ba);

    // Now we scale from [0,255] to [0,32768].  Ideally that's 32768/255 = 128.50196,
    // but we can approximate that very cheaply as 256*32897/65536 = 128.50391.
    // 0 and 255 map to 0 and 32768 correctly, and nothing else is off by more than 1.
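    // (e.g. byte 255 arrives here as 255*256 = 65280, and (65280*32897)>>16 == 32768 exactly.)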
    *r = _mm_mulhi_epu16(R, U16(32897));
    *g = _mm_mulhi_epu16(G, U16(32897));
    *b = _mm_mulhi_epu16(B, U16(32897));
    *a = _mm_mulhi_epu16(A, U16(32897));
}
SI F from_byte(U8 bytes) {
    // See from_8888() just above.
    U16 hi = _mm_unpacklo_epi8(U16(0), widen_cast<__m128i>(bytes));
    return (F)_mm_mulhi_epu16(hi, U16(32897));
}

SI U32 to_8888(F r, F g, F b, F a) {
    // We want to interleave and pack these values from [0,32768] down to [0,255].
    // Luckily the simplest possible thing works great: >>7, then saturate.
    // The 'u' in packus handles the saturation to [0,255] we need.
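    // (e.g. 32768>>7 == 256, which packus clamps to 255, while 0 stays 0.)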
    U16 rb = _mm_packus_epi16(r>>7,b>>7), // r0 r1 r2 r3 r4 r5 r6 r7 b0 b1 b2 b3 b4 b5 b6 b7
        ga = _mm_packus_epi16(g>>7,a>>7);

    U16 rg = _mm_unpacklo_epi8(rb, ga),   // r0 g0 r1 g1 ...                           r7 g7
        ba = _mm_unpackhi_epi8(rb, ga);   // b0 a0       ...                           b7 a7

    U16 lo = _mm_unpacklo_epi16(rg, ba),  // r0 g0 b0 a0 ...                     r3 g3 b3 a3
        hi = _mm_unpackhi_epi16(rg, ba);  // r4 g4 b4 a4 ...                     r7 g7 b7 a7

    U32 px;
    memcpy((uint32_t*)&px + 0, &lo, sizeof(lo));
    memcpy((uint32_t*)&px + 4, &hi, sizeof(hi));
    return px;
}
SI U8 to_byte(F v) {
    // See to_8888() just above.
    U16 packed = _mm_packus_epi16(v>>7, v>>7);  // Doesn't really matter what we pack on top.
    return unaligned_load<U8>(&packed);
}

// Stages!

STAGE(constant_color) {
    // We're converting to fixed point, which lets us play some IEEE representation tricks,
    // replacing a naive *32768 and float->int conversion with a simple float add.
    __m128i bits = _mm_loadu_ps((const float*)ctx) + _mm_set1_ps(256.0f);
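    // Each shuffle mask below broadcasts the two low (SkFixed15) bytes of one float to every
    // 16-bit lane: 0x0100 picks bytes 0 and 1 (r), 0x0504 bytes 4 and 5 (g), and so on.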
    r = _mm_shuffle_epi8(bits, _mm_set1_epi16(0x0100));
    g = _mm_shuffle_epi8(bits, _mm_set1_epi16(0x0504));
    b = _mm_shuffle_epi8(bits, _mm_set1_epi16(0x0908));
    a = _mm_shuffle_epi8(bits, _mm_set1_epi16(0x0d0c));
}

STAGE(set_rgb) {
    auto rgb = (const float*)ctx;
    r = rgb[0];
    g = rgb[1];
    b = rgb[2];
}

STAGE(premul) {
    r = r * a;
    g = g * a;
    b = b * a;
}

STAGE(load_8888) {
    auto ptr = *(const uint32_t**)ctx + x;
    from_8888(load<U32>(ptr, tail), &r,&g,&b,&a);
}
STAGE(store_8888) {
    auto ptr = *(uint32_t**)ctx + x;
    store(ptr, to_8888(r,g,b,a), tail);
}

STAGE(load_a8) {
    auto ptr = *(const uint8_t**)ctx + x;
    r = g = b = 0.0f;
    a = from_byte(load<U8>(ptr, tail));
}
STAGE(store_a8) {
    auto ptr = *(uint8_t**)ctx + x;
    store(ptr, to_byte(a), tail);
}

STAGE(load_g8) {
    auto ptr = *(const uint8_t**)ctx + x;
    r = g = b = from_byte(load<U8>(ptr, tail));
    a = 1.0f;
}

STAGE(srcover_rgba_8888) {
    auto ptr = *(uint32_t**)ctx + x;

    from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da);

    r = mad(dr, inv(a), r);
    g = mad(dg, inv(a), g);
    b = mad(db, inv(a), b);
    a = mad(da, inv(a), a);

    store(ptr, to_8888(r,g,b,a), tail);
}

STAGE(scale_1_float) {
    float c = *(const float*)ctx;

    r = r * c;
    g = g * c;
    b = b * c;
    a = a * c;
}
STAGE(scale_u8) {
    auto ptr = *(const uint8_t**)ctx + x;

    U8 scales = load<U8>(ptr, tail);
    F c = from_byte(scales);

    r = r * c;
    g = g * c;
    b = b * c;
    a = a * c;
}

STAGE(lerp_1_float) {
    float c = *(const float*)ctx;

    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}
STAGE(lerp_u8) {
    auto ptr = *(const uint8_t**)ctx + x;

    U8 scales = load<U8>(ptr, tail);
    F c = from_byte(scales);

    r = lerp(dr, r, c);
    g = lerp(dg, g, c);
    b = lerp(db, b, c);
    a = lerp(da, a, c);
}

STAGE(swap_rb) {
    auto tmp = r;
    r = b;
    b = tmp;
}

STAGE(swap) {
    auto swap = [](F& v, F& dv) {
        auto tmp = v;
        v = dv;
        dv = tmp;
    };
    swap(r, dr);
    swap(g, dg);
    swap(b, db);
    swap(a, da);
}
STAGE(move_src_dst) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}
STAGE(move_dst_src) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}

// Most blend modes apply the same logic to each channel.
#define BLEND_MODE(name)                       \
    SI F name##_channel(F s, F d, F sa, F da); \
    STAGE(name) {                              \
        r = name##_channel(r,dr,a,da);         \
        g = name##_channel(g,dg,a,da);         \
        b = name##_channel(b,db,a,da);         \
        a = name##_channel(a,da,a,da);         \
    }                                          \
    SI F name##_channel(F s, F d, F sa, F da)

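// For example, BLEND_MODE(srcover) below both declares srcover_channel() and, via STAGE(),
// defines the sk_srcover_ssse3_lowp stage that applies it to r,g,b,a against dr,dg,db,da.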
BLEND_MODE(clear)    { return 0.0f; }
BLEND_MODE(srcatop)  { return s*da + d*inv(sa); }
BLEND_MODE(dstatop)  { return d*sa + s*inv(da); }
BLEND_MODE(srcin)    { return s * da; }
BLEND_MODE(dstin)    { return d * sa; }
BLEND_MODE(srcout)   { return s * inv(da); }
BLEND_MODE(dstout)   { return d * inv(sa); }
BLEND_MODE(srcover)  { return mad(d, inv(sa), s); }
BLEND_MODE(dstover)  { return mad(s, inv(da), d); }

BLEND_MODE(modulate) { return s*d; }
BLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; }
BLEND_MODE(screen)   { return s + inv(s)*d; }
BLEND_MODE(xor_)     { return s*inv(da) + d*inv(sa); }

#undef BLEND_MODE

339