1/*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8
9#include "SkBlitRow_opts_SSE2.h"
10#include "SkBitmapProcState_opts_SSE2.h"
11#include "SkColorPriv.h"
12#include "SkUtils.h"
13
14#include <emmintrin.h>
15
16/* SSE2 version of S32_Blend_BlitRow32()
17 * portable version is in core/SkBlitRow_D32.cpp
18 */
19void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
20                              const SkPMColor* SK_RESTRICT src,
21                              int count, U8CPU alpha) {
22    SkASSERT(alpha <= 255);
23    if (count <= 0) {
24        return;
25    }
26
27    uint32_t src_scale = SkAlpha255To256(alpha);
28    uint32_t dst_scale = 256 - src_scale;
29
30    if (count >= 4) {
31        SkASSERT(((size_t)dst & 0x03) == 0);
32        while (((size_t)dst & 0x0F) != 0) {
33            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
34            src++;
35            dst++;
36            count--;
37        }
38
39        const __m128i *s = reinterpret_cast<const __m128i*>(src);
40        __m128i *d = reinterpret_cast<__m128i*>(dst);
41        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
42        __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
43
44        // Move scale factors to upper byte of word
45        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
46        __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
47        while (count >= 4) {
48            // Load 4 pixels each of src and dest.
49            __m128i src_pixel = _mm_loadu_si128(s);
50            __m128i dst_pixel = _mm_load_si128(d);
51
52            // Interleave Atom port 0/1 operations based on the execution port
53            // constraints that multiply can only be executed on port 0 (while
54            // boolean operations can be executed on either port 0 or port 1)
55            // because GCC currently doesn't do a good job scheduling
56            // instructions based on these constraints.
57
58            // Get red and blue pixels into lower byte of each word.
59            // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
60            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
61
62            // Multiply by scale.
63            // (4 x (0, rs.h, 0, bs.h))
64            // where rs.h stands for the higher byte of r * scale, and
65            // bs.h the higher byte of b * scale.
66            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
67
68            // Get alpha and green pixels into higher byte of each word.
69            // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
70            __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
71
72            // Multiply by scale.
73            // (4 x (as.h, as.l, gs.h, gs.l))
74            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
75
76            // Clear the lower byte of the a*scale and g*scale results
77            // (4 x (as.h, 0, gs.h, 0))
78            src_ag = _mm_and_si128(src_ag, ag_mask);
79
80            // Operations the destination pixels are the same as on the
81            // source pixels. See the comments above.
82            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
83            dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
84            __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
85            dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
86            dst_ag = _mm_and_si128(dst_ag, ag_mask);
87
88            // Combine back into RGBA.
89            // (4 x (as.h, rs.h, gs.h, bs.h))
90            src_pixel = _mm_or_si128(src_rb, src_ag);
91            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
92
93            // Add result
94            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
95            _mm_store_si128(d, result);
96            s++;
97            d++;
98            count -= 4;
99        }
100        src = reinterpret_cast<const SkPMColor*>(s);
101        dst = reinterpret_cast<SkPMColor*>(d);
102    }
103
104    while (count > 0) {
105        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
106        src++;
107        dst++;
108        count--;
109    }
110}
111
112void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
113                                const SkPMColor* SK_RESTRICT src,
114                                int count, U8CPU alpha) {
115    SkASSERT(alpha == 255);
116    if (count <= 0) {
117        return;
118    }
119
120    if (count >= 4) {
121        SkASSERT(((size_t)dst & 0x03) == 0);
122        while (((size_t)dst & 0x0F) != 0) {
123            *dst = SkPMSrcOver(*src, *dst);
124            src++;
125            dst++;
126            count--;
127        }
128
129        const __m128i *s = reinterpret_cast<const __m128i*>(src);
130        __m128i *d = reinterpret_cast<__m128i*>(dst);
131#ifdef SK_USE_ACCURATE_BLENDING
132        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
133        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
134        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
135        while (count >= 4) {
136            // Load 4 pixels
137            __m128i src_pixel = _mm_loadu_si128(s);
138            __m128i dst_pixel = _mm_load_si128(d);
139
140            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
141            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
142            // Shift alphas down to lower 8 bits of each quad.
143            __m128i alpha = _mm_srli_epi32(src_pixel, 24);
144
145            // Copy alpha to upper 3rd byte of each quad
146            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
147
148            // Subtract alphas from 255, to get 0..255
149            alpha = _mm_sub_epi16(c_255, alpha);
150
151            // Multiply by red and blue by src alpha.
152            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
153            // Multiply by alpha and green by src alpha.
154            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
155
156            // dst_rb_low = (dst_rb >> 8)
157            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
158            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
159
160            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
161            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
162            dst_rb = _mm_add_epi16(dst_rb, c_128);
163            dst_rb = _mm_srli_epi16(dst_rb, 8);
164
165            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
166            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
167            dst_ag = _mm_add_epi16(dst_ag, c_128);
168            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
169
170            // Combine back into RGBA.
171            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
172
173            // Add result
174            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
175            _mm_store_si128(d, result);
176            s++;
177            d++;
178            count -= 4;
179        }
180    #else
181        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
182        __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
183        while (count >= 4) {
184            // Load 4 pixels
185            __m128i src_pixel = _mm_loadu_si128(s);
186            __m128i dst_pixel = _mm_load_si128(d);
187
188            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
189            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
190
191            // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
192            __m128i alpha = _mm_srli_epi16(src_pixel, 8);
193
194            // (a0, a0, a1, a1, a2, g2, a3, g3)
195            alpha = _mm_shufflehi_epi16(alpha, 0xF5);
196
197            // (a0, a0, a1, a1, a2, a2, a3, a3)
198            alpha = _mm_shufflelo_epi16(alpha, 0xF5);
199
200            // Subtract alphas from 256, to get 1..256
201            alpha = _mm_sub_epi16(c_256, alpha);
202
203            // Multiply by red and blue by src alpha.
204            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
205            // Multiply by alpha and green by src alpha.
206            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
207
208            // Divide by 256.
209            dst_rb = _mm_srli_epi16(dst_rb, 8);
210
211            // Mask out high bits (already in the right place)
212            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
213
214            // Combine back into RGBA.
215            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
216
217            // Add result
218            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
219            _mm_store_si128(d, result);
220            s++;
221            d++;
222            count -= 4;
223        }
224#endif
225        src = reinterpret_cast<const SkPMColor*>(s);
226        dst = reinterpret_cast<SkPMColor*>(d);
227    }
228
229    while (count > 0) {
230        *dst = SkPMSrcOver(*src, *dst);
231        src++;
232        dst++;
233        count--;
234    }
235}
236
237void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
238                               const SkPMColor* SK_RESTRICT src,
239                               int count, U8CPU alpha) {
240    SkASSERT(alpha <= 255);
241    if (count <= 0) {
242        return;
243    }
244
245    if (count >= 4) {
246        while (((size_t)dst & 0x0F) != 0) {
247            *dst = SkBlendARGB32(*src, *dst, alpha);
248            src++;
249            dst++;
250            count--;
251        }
252
253        uint32_t src_scale = SkAlpha255To256(alpha);
254
255        const __m128i *s = reinterpret_cast<const __m128i*>(src);
256        __m128i *d = reinterpret_cast<__m128i*>(dst);
257        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
258        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
259        __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
260        while (count >= 4) {
261            // Load 4 pixels each of src and dest.
262            __m128i src_pixel = _mm_loadu_si128(s);
263            __m128i dst_pixel = _mm_load_si128(d);
264
265            // Get red and blue pixels into lower byte of each word.
266            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
267            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
268
269            // Get alpha and green into lower byte of each word.
270            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
271            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
272
273            // Put per-pixel alpha in low byte of each word.
274            // After the following two statements, the dst_alpha looks like
275            // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
276            __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
277            dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
278
279            // dst_alpha = dst_alpha * src_scale
280            // Because src_scales are in the higher byte of each word and
281            // we use mulhi here, the resulting alpha values are already
282            // in the right place and don't need to be divided by 256.
283            // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
284            dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
285
286            // Subtract alphas from 256, to get 1..256
287            dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
288
289            // Multiply red and blue by dst pixel alpha.
290            dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
291            // Multiply alpha and green by dst pixel alpha.
292            dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
293
294            // Multiply red and blue by global alpha.
295            // (4 x (0, rs.h, 0, bs.h))
296            // where rs.h stands for the higher byte of r * src_scale,
297            // and bs.h the higher byte of b * src_scale.
298            // Again, because we use mulhi, the resuling red and blue
299            // values are already in the right place and don't need to
300            // be divided by 256.
301            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
302            // Multiply alpha and green by global alpha.
303            // (4 x (0, as.h, 0, gs.h))
304            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
305
306            // Divide by 256.
307            dst_rb = _mm_srli_epi16(dst_rb, 8);
308
309            // Mask out low bits (goodies already in the right place; no need to divide)
310            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
311            // Shift alpha and green to higher byte of each word.
312            // (4 x (as.h, 0, gs.h, 0))
313            src_ag = _mm_slli_epi16(src_ag, 8);
314
315            // Combine back into RGBA.
316            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
317            src_pixel = _mm_or_si128(src_rb, src_ag);
318
319            // Add two pixels into result.
320            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
321            _mm_store_si128(d, result);
322            s++;
323            d++;
324            count -= 4;
325        }
326        src = reinterpret_cast<const SkPMColor*>(s);
327        dst = reinterpret_cast<SkPMColor*>(d);
328    }
329
330    while (count > 0) {
331        *dst = SkBlendARGB32(*src, *dst, alpha);
332        src++;
333        dst++;
334        count--;
335    }
336}
337
338/* SSE2 version of Color32()
339 * portable version is in core/SkBlitRow_D32.cpp
340 */
341void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
342                  SkPMColor color) {
343
344    if (count <= 0) {
345        return;
346    }
347
348    if (0 == color) {
349        if (src != dst) {
350            memcpy(dst, src, count * sizeof(SkPMColor));
351        }
352        return;
353    }
354
355    unsigned colorA = SkGetPackedA32(color);
356    if (255 == colorA) {
357        sk_memset32(dst, color, count);
358    } else {
359        unsigned scale = 256 - SkAlpha255To256(colorA);
360
361        if (count >= 4) {
362            SkASSERT(((size_t)dst & 0x03) == 0);
363            while (((size_t)dst & 0x0F) != 0) {
364                *dst = color + SkAlphaMulQ(*src, scale);
365                src++;
366                dst++;
367                count--;
368            }
369
370            const __m128i *s = reinterpret_cast<const __m128i*>(src);
371            __m128i *d = reinterpret_cast<__m128i*>(dst);
372            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
373            __m128i src_scale_wide = _mm_set1_epi16(scale);
374            __m128i color_wide = _mm_set1_epi32(color);
375            while (count >= 4) {
376                // Load 4 pixels each of src and dest.
377                __m128i src_pixel = _mm_loadu_si128(s);
378
379                // Get red and blue pixels into lower byte of each word.
380                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
381
382                // Get alpha and green into lower byte of each word.
383                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
384
385                // Multiply by scale.
386                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
387                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
388
389                // Divide by 256.
390                src_rb = _mm_srli_epi16(src_rb, 8);
391                src_ag = _mm_andnot_si128(rb_mask, src_ag);
392
393                // Combine back into RGBA.
394                src_pixel = _mm_or_si128(src_rb, src_ag);
395
396                // Add color to result.
397                __m128i result = _mm_add_epi8(color_wide, src_pixel);
398
399                // Store result.
400                _mm_store_si128(d, result);
401                s++;
402                d++;
403                count -= 4;
404            }
405            src = reinterpret_cast<const SkPMColor*>(s);
406            dst = reinterpret_cast<SkPMColor*>(d);
407         }
408
409        while (count > 0) {
410            *dst = color + SkAlphaMulQ(*src, scale);
411            src += 1;
412            dst += 1;
413            count--;
414        }
415    }
416}
417
418void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
419                               size_t maskRB, SkColor origColor,
420                               int width, int height) {
421    SkPMColor color = SkPreMultiplyColor(origColor);
422    size_t dstOffset = dstRB - (width << 2);
423    size_t maskOffset = maskRB - width;
424    SkPMColor* dst = (SkPMColor *)device;
425    const uint8_t* mask = (const uint8_t*)maskPtr;
426    do {
427        int count = width;
428        if (count >= 4) {
429            while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
430                *dst = SkBlendARGB32(color, *dst, *mask);
431                mask++;
432                dst++;
433                count--;
434            }
435            __m128i *d = reinterpret_cast<__m128i*>(dst);
436            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
437            __m128i c_256 = _mm_set1_epi16(256);
438            __m128i c_1 = _mm_set1_epi16(1);
439            __m128i src_pixel = _mm_set1_epi32(color);
440            while (count >= 4) {
441                // Load 4 pixels each of src and dest.
442                __m128i dst_pixel = _mm_load_si128(d);
443
444                //set the aphla value
445                __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
446                                0, *(mask+3),0, \
447                                *(mask+2),0, *(mask+2),\
448                                0,*(mask+1), 0,*(mask+1),\
449                                0, *mask,0,*mask);
450
451                //call SkAlpha255To256()
452                src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
453
454                // Get red and blue pixels into lower byte of each word.
455                __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
456                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
457
458                // Get alpha and green into lower byte of each word.
459                __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
460                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
461
462                // Put per-pixel alpha in low byte of each word.
463                __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
464                dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
465
466                // dst_alpha = dst_alpha * src_scale
467                dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
468
469                // Divide by 256.
470                dst_alpha = _mm_srli_epi16(dst_alpha, 8);
471
472                // Subtract alphas from 256, to get 1..256
473                dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
474                // Multiply red and blue by dst pixel alpha.
475                dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
476                // Multiply alpha and green by dst pixel alpha.
477                dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
478
479                // Multiply red and blue by global alpha.
480                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
481                // Multiply alpha and green by global alpha.
482                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
483                // Divide by 256.
484                dst_rb = _mm_srli_epi16(dst_rb, 8);
485                src_rb = _mm_srli_epi16(src_rb, 8);
486
487                // Mask out low bits (goodies already in the right place; no need to divide)
488                dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
489                src_ag = _mm_andnot_si128(rb_mask, src_ag);
490
491                // Combine back into RGBA.
492                dst_pixel = _mm_or_si128(dst_rb, dst_ag);
493                __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
494
495                // Add two pixels into result.
496                __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
497                _mm_store_si128(d, result);
498                // load the next 4 pixel
499                mask = mask + 4;
500                d++;
501                count -= 4;
502            }
503            dst = reinterpret_cast<SkPMColor *>(d);
504        }
505        while(count > 0) {
506            *dst= SkBlendARGB32(color, *dst, *mask);
507            dst += 1;
508            mask++;
509            count --;
510        }
511        dst = (SkPMColor *)((char*)dst + dstOffset);
512        mask += maskOffset;
513    } while (--height != 0);
514}
515
// Bit distance between the top 5 bits of each component in a packed 16-bit
// mask value and the top 5 bits of the same component in an SkPMColor.
// A positive value means the mask component has to move left, a negative
// value means it has to move right. The component order of the 16-bit mask
// may differ from the SkPMColor order, hence one constant per channel.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Red: shift every 32-bit lane of x by the distance above. The result is
// not masked; callers isolate the 5 component bits themselves.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green: same scheme as red.
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue: same scheme as red.
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
546
547static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
548                                 __m128i &mask, __m128i &srcA) {
549    // In the following comments, the components of src, dst and mask are
550    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
551    // by an R, G, B, or A suffix. Components of one of the four pixels that
552    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
553    // example is the blue channel of the second destination pixel. Memory
554    // layout is shown for an ARGB byte order in a color value.
555
556    // src and srcA store 8-bit values interleaved with zeros.
557    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
558    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
559    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
560    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
561    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
562    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
563    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
564
565    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
566    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
567    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
568                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
569
570    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
571    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
572                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
573
574    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
575    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
576                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
577
578    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
579    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
580    // 8-bit position
581    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
582    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
583    mask = _mm_or_si128(_mm_or_si128(r, g), b);
584
585    // Interleave R,G,B into the lower byte of word.
586    // i.e. split the sixteen 8-bit values from mask into two sets of eight
587    // 16-bit values, padded by zero.
588    __m128i maskLo, maskHi;
589    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
590    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
591    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
592    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
593
594    // Upscale from 0..31 to 0..32
595    // (allows to replace division by left-shift further down)
596    // Left-shift each component by 4 and add the result back to that component,
597    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
598    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
599    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
600
601    // Multiply each component of maskLo and maskHi by srcA
602    maskLo = _mm_mullo_epi16(maskLo, srcA);
603    maskHi = _mm_mullo_epi16(maskHi, srcA);
604
605    // Left shift mask components by 8 (divide by 256)
606    maskLo = _mm_srli_epi16(maskLo, 8);
607    maskHi = _mm_srli_epi16(maskHi, 8);
608
609    // Interleave R,G,B into the lower byte of the word
610    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
611    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
612    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
613    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
614
615    // mask = (src - dst) * mask
616    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
617    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
618
619    // mask = (src - dst) * mask >> 5
620    maskLo = _mm_srai_epi16(maskLo, 5);
621    maskHi = _mm_srai_epi16(maskHi, 5);
622
623    // Add two pixels into result.
624    // result = dst + ((src - dst) * mask >> 5)
625    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
626    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
627
628    // Pack into 4 32bit dst pixels.
629    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
630    // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
631    // clamping to 255 if necessary.
632    return _mm_packus_epi16(resultLo, resultHi);
633}
634
635static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
636                                       __m128i &mask) {
637    // In the following comments, the components of src, dst and mask are
638    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
639    // by an R, G, B, or A suffix. Components of one of the four pixels that
640    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
641    // example is the blue channel of the second destination pixel. Memory
642    // layout is shown for an ARGB byte order in a color value.
643
644    // src and srcA store 8-bit values interleaved with zeros.
645    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
646    // mask stores 16-bit values (shown as high and low bytes) interleaved with
647    // zeros
648    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
649    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
650
651    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
652    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
653    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
654                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
655
656    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
657    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
658                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
659
660    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
661    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
662                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
663
664    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
665    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
666    // 8-bit position
667    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
668    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
669    mask = _mm_or_si128(_mm_or_si128(r, g), b);
670
671    // Interleave R,G,B into the lower byte of word.
672    // i.e. split the sixteen 8-bit values from mask into two sets of eight
673    // 16-bit values, padded by zero.
674    __m128i maskLo, maskHi;
675    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
676    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
677    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
678    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
679
680    // Upscale from 0..31 to 0..32
681    // (allows to replace division by left-shift further down)
682    // Left-shift each component by 4 and add the result back to that component,
683    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
684    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
685    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
686
687    // Interleave R,G,B into the lower byte of the word
688    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
689    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
690    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
691    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
692
693    // mask = (src - dst) * mask
694    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
695    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
696
697    // mask = (src - dst) * mask >> 5
698    maskLo = _mm_srai_epi16(maskLo, 5);
699    maskHi = _mm_srai_epi16(maskHi, 5);
700
701    // Add two pixels into result.
702    // result = dst + ((src - dst) * mask >> 5)
703    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
704    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
705
706    // Pack into 4 32bit dst pixels and force opaque.
707    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
708    // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
709    // clamping to 255 if necessary. Set alpha components to 0xFF.
710    return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
711                        _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
712}
713
714void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
715                         SkColor src, int width, SkPMColor) {
716    if (width <= 0) {
717        return;
718    }
719
720    int srcA = SkColorGetA(src);
721    int srcR = SkColorGetR(src);
722    int srcG = SkColorGetG(src);
723    int srcB = SkColorGetB(src);
724
725    srcA = SkAlpha255To256(srcA);
726
727    if (width >= 4) {
728        SkASSERT(((size_t)dst & 0x03) == 0);
729        while (((size_t)dst & 0x0F) != 0) {
730            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
731            mask++;
732            dst++;
733            width--;
734        }
735
736        __m128i *d = reinterpret_cast<__m128i*>(dst);
737        // Set alpha to 0xFF and replicate source four times in SSE register.
738        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
739        // Interleave with zeros to get two sets of four 16-bit values.
740        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
741        // Set srcA_sse to contain eight copies of srcA, padded with zero.
742        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
743        __m128i srcA_sse = _mm_set1_epi16(srcA);
744        while (width >= 4) {
745            // Load four destination pixels into dst_sse.
746            __m128i dst_sse = _mm_load_si128(d);
747            // Load four 16-bit masks into lower half of mask_sse.
748            __m128i mask_sse = _mm_loadl_epi64(
749                                   reinterpret_cast<const __m128i*>(mask));
750
751            // Check whether masks are equal to 0 and get the highest bit
752            // of each byte of result, if masks are all zero, we will get
753            // pack_cmp to 0xFFFF
754            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
755                                             _mm_setzero_si128()));
756
757            // if mask pixels are not all zero, we will blend the dst pixels
758            if (pack_cmp != 0xFFFF) {
759                // Unpack 4 16bit mask pixels to
760                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
761                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
762                mask_sse = _mm_unpacklo_epi16(mask_sse,
763                                              _mm_setzero_si128());
764
765                // Process 4 32bit dst pixels
766                __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
767                                                   mask_sse, srcA_sse);
768                _mm_store_si128(d, result);
769            }
770
771            d++;
772            mask += 4;
773            width -= 4;
774        }
775
776        dst = reinterpret_cast<SkPMColor*>(d);
777    }
778
779    while (width > 0) {
780        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
781        mask++;
782        dst++;
783        width--;
784    }
785}
786
787void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
788                               SkColor src, int width, SkPMColor opaqueDst) {
789    if (width <= 0) {
790        return;
791    }
792
793    int srcR = SkColorGetR(src);
794    int srcG = SkColorGetG(src);
795    int srcB = SkColorGetB(src);
796
797    if (width >= 4) {
798        SkASSERT(((size_t)dst & 0x03) == 0);
799        while (((size_t)dst & 0x0F) != 0) {
800            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
801            mask++;
802            dst++;
803            width--;
804        }
805
806        __m128i *d = reinterpret_cast<__m128i*>(dst);
807        // Set alpha to 0xFF and replicate source four times in SSE register.
808        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
809        // Set srcA_sse to contain eight copies of srcA, padded with zero.
810        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
811        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
812        while (width >= 4) {
813            // Load four destination pixels into dst_sse.
814            __m128i dst_sse = _mm_load_si128(d);
815            // Load four 16-bit masks into lower half of mask_sse.
816            __m128i mask_sse = _mm_loadl_epi64(
817                                   reinterpret_cast<const __m128i*>(mask));
818
819            // Check whether masks are equal to 0 and get the highest bit
820            // of each byte of result, if masks are all zero, we will get
821            // pack_cmp to 0xFFFF
822            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
823                                             _mm_setzero_si128()));
824
825            // if mask pixels are not all zero, we will blend the dst pixels
826            if (pack_cmp != 0xFFFF) {
827                // Unpack 4 16bit mask pixels to
828                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
829                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
830                mask_sse = _mm_unpacklo_epi16(mask_sse,
831                                              _mm_setzero_si128());
832
833                // Process 4 32bit dst pixels
834                __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
835                                                         mask_sse);
836                _mm_store_si128(d, result);
837            }
838
839            d++;
840            mask += 4;
841            width -= 4;
842        }
843
844        dst = reinterpret_cast<SkPMColor*>(d);
845    }
846
847    while (width > 0) {
848        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
849        mask++;
850        dst++;
851        width--;
852    }
853}
854