1/*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include <emmintrin.h>
9#include "SkBitmapProcState_opts_SSE2.h"
10#include "SkBlitRow_opts_SSE2.h"
11#include "SkColorData.h"
12#include "SkColor_opts_SSE2.h"
13#include "SkDither.h"
14#include "SkMSAN.h"
15#include "SkUtils.h"
16
17/* SSE2 version of S32_Blend_BlitRow32()
18 * portable version is in core/SkBlitRow_D32.cpp
19 */
20void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
21                              const SkPMColor* SK_RESTRICT src,
22                              int count, U8CPU alpha) {
23    SkASSERT(alpha <= 255);
24    if (count <= 0) {
25        return;
26    }
27
28    uint32_t src_scale = SkAlpha255To256(alpha);
29
30    if (count >= 4) {
31        SkASSERT(((size_t)dst & 0x03) == 0);
32        while (((size_t)dst & 0x0F) != 0) {
33            *dst = SkPMLerp(*src, *dst, src_scale);
34            src++;
35            dst++;
36            count--;
37        }
38
39        const __m128i *s = reinterpret_cast<const __m128i*>(src);
40        __m128i *d = reinterpret_cast<__m128i*>(dst);
41
42        while (count >= 4) {
43            // Load 4 pixels each of src and dest.
44            __m128i src_pixel = _mm_loadu_si128(s);
45            __m128i dst_pixel = _mm_load_si128(d);
46
47            __m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale);
48            _mm_store_si128(d, result);
49            s++;
50            d++;
51            count -= 4;
52        }
53        src = reinterpret_cast<const SkPMColor*>(s);
54        dst = reinterpret_cast<SkPMColor*>(d);
55    }
56
57    while (count > 0) {
58        *dst = SkPMLerp(*src, *dst, src_scale);
59        src++;
60        dst++;
61        count--;
62    }
63}
64
65void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
66                               const SkPMColor* SK_RESTRICT src,
67                               int count, U8CPU alpha) {
68    SkASSERT(alpha <= 255);
69    if (count <= 0) {
70        return;
71    }
72
73    if (count >= 4) {
74        while (((size_t)dst & 0x0F) != 0) {
75            *dst = SkBlendARGB32(*src, *dst, alpha);
76            src++;
77            dst++;
78            count--;
79        }
80
81        const __m128i *s = reinterpret_cast<const __m128i*>(src);
82        __m128i *d = reinterpret_cast<__m128i*>(dst);
83        while (count >= 4) {
84            // Load 4 pixels each of src and dest.
85            __m128i src_pixel = _mm_loadu_si128(s);
86            __m128i dst_pixel = _mm_load_si128(d);
87
88            __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
89            _mm_store_si128(d, result);
90            s++;
91            d++;
92            count -= 4;
93        }
94        src = reinterpret_cast<const SkPMColor*>(s);
95        dst = reinterpret_cast<SkPMColor*>(d);
96    }
97
98    while (count > 0) {
99        *dst = SkBlendARGB32(*src, *dst, alpha);
100        src++;
101        dst++;
102        count--;
103    }
104}
105
// Per-channel shift amounts that move the top 5 bits of an RGB16 mask
// component onto the low 5 bits of the matching SkPMColor component.
// A positive value means shift left, a negative value means shift right.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// The helpers below apply those shifts to every 32-bit lane of x. They do not
// mask off neighbouring bits; callers AND the result with (0x1F << shift).

// Red channel.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green channel.
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue channel.
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
136
137static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
138                                 __m128i &mask, __m128i &srcA) {
139    // In the following comments, the components of src, dst and mask are
140    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
141    // by an R, G, B, or A suffix. Components of one of the four pixels that
142    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
143    // example is the blue channel of the second destination pixel. Memory
144    // layout is shown for an ARGB byte order in a color value.
145
146    // src and srcA store 8-bit values interleaved with zeros.
147    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
148    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
149    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
150    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
151    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
152    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
153    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
154
155    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
156    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
157    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
158                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
159
160    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
161    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
162                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
163
164    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
165    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
166                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
167
168    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
169    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
170    // 8-bit position
171    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
172    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
173    mask = _mm_or_si128(_mm_or_si128(r, g), b);
174
175    // Interleave R,G,B into the lower byte of word.
176    // i.e. split the sixteen 8-bit values from mask into two sets of eight
177    // 16-bit values, padded by zero.
178    __m128i maskLo, maskHi;
179    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
180    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
181    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
182    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
183
184    // Upscale from 0..31 to 0..32
185    // (allows to replace division by left-shift further down)
186    // Left-shift each component by 4 and add the result back to that component,
187    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
188    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
189    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
190
191    // Multiply each component of maskLo and maskHi by srcA
192    maskLo = _mm_mullo_epi16(maskLo, srcA);
193    maskHi = _mm_mullo_epi16(maskHi, srcA);
194
195    // Left shift mask components by 8 (divide by 256)
196    maskLo = _mm_srli_epi16(maskLo, 8);
197    maskHi = _mm_srli_epi16(maskHi, 8);
198
199    // Interleave R,G,B into the lower byte of the word
200    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
201    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
202    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
203    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
204
205    // mask = (src - dst) * mask
206    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
207    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
208
209    // mask = (src - dst) * mask >> 5
210    maskLo = _mm_srai_epi16(maskLo, 5);
211    maskHi = _mm_srai_epi16(maskHi, 5);
212
213    // Add two pixels into result.
214    // result = dst + ((src - dst) * mask >> 5)
215    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
216    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
217
218    // Pack into 4 32bit dst pixels.
219    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
220    // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
221    // clamping to 255 if necessary.
222    return _mm_packus_epi16(resultLo, resultHi);
223}
224
225static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
226                                       __m128i &mask) {
227    // In the following comments, the components of src, dst and mask are
228    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
229    // by an R, G, B, or A suffix. Components of one of the four pixels that
230    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
231    // example is the blue channel of the second destination pixel. Memory
232    // layout is shown for an ARGB byte order in a color value.
233
234    // src and srcA store 8-bit values interleaved with zeros.
235    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
236    // mask stores 16-bit values (shown as high and low bytes) interleaved with
237    // zeros
238    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
239    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
240
241    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
242    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
243    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
244                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
245
246    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
247    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
248                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
249
250    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
251    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
252                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
253
254    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
255    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
256    // 8-bit position
257    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
258    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
259    mask = _mm_or_si128(_mm_or_si128(r, g), b);
260
261    // Interleave R,G,B into the lower byte of word.
262    // i.e. split the sixteen 8-bit values from mask into two sets of eight
263    // 16-bit values, padded by zero.
264    __m128i maskLo, maskHi;
265    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
266    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
267    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
268    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
269
270    // Upscale from 0..31 to 0..32
271    // (allows to replace division by left-shift further down)
272    // Left-shift each component by 4 and add the result back to that component,
273    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
274    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
275    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
276
277    // Interleave R,G,B into the lower byte of the word
278    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
279    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
280    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
281    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
282
283    // mask = (src - dst) * mask
284    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
285    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
286
287    // mask = (src - dst) * mask >> 5
288    maskLo = _mm_srai_epi16(maskLo, 5);
289    maskHi = _mm_srai_epi16(maskHi, 5);
290
291    // Add two pixels into result.
292    // result = dst + ((src - dst) * mask >> 5)
293    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
294    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
295
296    // Pack into 4 32bit dst pixels and force opaque.
297    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
298    // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
299    // clamping to 255 if necessary. Set alpha components to 0xFF.
300    return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
301                        _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
302}
303
304void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
305                         SkColor src, int width, SkPMColor) {
306    if (width <= 0) {
307        return;
308    }
309
310    int srcA = SkColorGetA(src);
311    int srcR = SkColorGetR(src);
312    int srcG = SkColorGetG(src);
313    int srcB = SkColorGetB(src);
314
315    srcA = SkAlpha255To256(srcA);
316
317    if (width >= 4) {
318        SkASSERT(((size_t)dst & 0x03) == 0);
319        while (((size_t)dst & 0x0F) != 0) {
320            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
321            mask++;
322            dst++;
323            width--;
324        }
325
326        __m128i *d = reinterpret_cast<__m128i*>(dst);
327        // Set alpha to 0xFF and replicate source four times in SSE register.
328        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
329        // Interleave with zeros to get two sets of four 16-bit values.
330        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
331        // Set srcA_sse to contain eight copies of srcA, padded with zero.
332        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
333        __m128i srcA_sse = _mm_set1_epi16(srcA);
334        while (width >= 4) {
335            // Load four destination pixels into dst_sse.
336            __m128i dst_sse = _mm_load_si128(d);
337            // Load four 16-bit masks into lower half of mask_sse.
338            __m128i mask_sse = _mm_loadl_epi64(
339                                   reinterpret_cast<const __m128i*>(mask));
340
341            // Check whether masks are equal to 0 and get the highest bit
342            // of each byte of result, if masks are all zero, we will get
343            // pack_cmp to 0xFFFF
344            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
345                                             _mm_setzero_si128()));
346
347            // if mask pixels are not all zero, we will blend the dst pixels
348            if (pack_cmp != 0xFFFF) {
349                // Unpack 4 16bit mask pixels to
350                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
351                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
352                mask_sse = _mm_unpacklo_epi16(mask_sse,
353                                              _mm_setzero_si128());
354
355                // Process 4 32bit dst pixels
356                __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
357                                                   mask_sse, srcA_sse);
358                _mm_store_si128(d, result);
359            }
360
361            d++;
362            mask += 4;
363            width -= 4;
364        }
365
366        dst = reinterpret_cast<SkPMColor*>(d);
367    }
368
369    while (width > 0) {
370        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
371        mask++;
372        dst++;
373        width--;
374    }
375}
376
377void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
378                               SkColor src, int width, SkPMColor opaqueDst) {
379    if (width <= 0) {
380        return;
381    }
382
383    int srcR = SkColorGetR(src);
384    int srcG = SkColorGetG(src);
385    int srcB = SkColorGetB(src);
386
387    if (width >= 4) {
388        SkASSERT(((size_t)dst & 0x03) == 0);
389        while (((size_t)dst & 0x0F) != 0) {
390            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
391            mask++;
392            dst++;
393            width--;
394        }
395
396        __m128i *d = reinterpret_cast<__m128i*>(dst);
397        // Set alpha to 0xFF and replicate source four times in SSE register.
398        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
399        // Set srcA_sse to contain eight copies of srcA, padded with zero.
400        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
401        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
402        while (width >= 4) {
403            // Load four destination pixels into dst_sse.
404            __m128i dst_sse = _mm_load_si128(d);
405            // Load four 16-bit masks into lower half of mask_sse.
406            __m128i mask_sse = _mm_loadl_epi64(
407                                   reinterpret_cast<const __m128i*>(mask));
408
409            // Check whether masks are equal to 0 and get the highest bit
410            // of each byte of result, if masks are all zero, we will get
411            // pack_cmp to 0xFFFF
412            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
413                                             _mm_setzero_si128()));
414
415            // if mask pixels are not all zero, we will blend the dst pixels
416            if (pack_cmp != 0xFFFF) {
417                // Unpack 4 16bit mask pixels to
418                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
419                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
420                mask_sse = _mm_unpacklo_epi16(mask_sse,
421                                              _mm_setzero_si128());
422
423                // Process 4 32bit dst pixels
424                __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
425                                                         mask_sse);
426                _mm_store_si128(d, result);
427            }
428
429            d++;
430            mask += 4;
431            width -= 4;
432        }
433
434        dst = reinterpret_cast<SkPMColor*>(d);
435    }
436
437    while (width > 0) {
438        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
439        mask++;
440        dst++;
441        width--;
442    }
443}
444