1/*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8
9#include "SkBlitRow_opts_SSE2.h"
10#include "SkColorPriv.h"
11#include "SkUtils.h"
12
13#include <emmintrin.h>
14
15/* SSE2 version of S32_Blend_BlitRow32()
16 * portable version is in core/SkBlitRow_D32.cpp
17 */
18void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
19                              const SkPMColor* SK_RESTRICT src,
20                              int count, U8CPU alpha) {
21    SkASSERT(alpha <= 255);
22    if (count <= 0) {
23        return;
24    }
25
26    uint32_t src_scale = SkAlpha255To256(alpha);
27    uint32_t dst_scale = 256 - src_scale;
28
29    if (count >= 4) {
30        SkASSERT(((size_t)dst & 0x03) == 0);
31        while (((size_t)dst & 0x0F) != 0) {
32            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
33            src++;
34            dst++;
35            count--;
36        }
37
38        const __m128i *s = reinterpret_cast<const __m128i*>(src);
39        __m128i *d = reinterpret_cast<__m128i*>(dst);
40        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
41        __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
42
43        // Move scale factors to upper byte of word
44        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
45        __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
46        while (count >= 4) {
47            // Load 4 pixels each of src and dest.
48            __m128i src_pixel = _mm_loadu_si128(s);
49            __m128i dst_pixel = _mm_load_si128(d);
50
51            // Interleave Atom port 0/1 operations based on the execution port
52            // constraints that multiply can only be executed on port 0 (while
53            // boolean operations can be executed on either port 0 or port 1)
54            // because GCC currently doesn't do a good job scheduling
55            // instructions based on these constraints.
56
57            // Get red and blue pixels into lower byte of each word.
58            // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
59            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
60
61            // Multiply by scale.
62            // (4 x (0, rs.h, 0, bs.h))
63            // where rs.h stands for the higher byte of r * scale, and
64            // bs.h the higher byte of b * scale.
65            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
66
67            // Get alpha and green pixels into higher byte of each word.
68            // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
69            __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
70
71            // Multiply by scale.
72            // (4 x (as.h, as.l, gs.h, gs.l))
73            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
74
75            // Clear the lower byte of the a*scale and g*scale results
76            // (4 x (as.h, 0, gs.h, 0))
77            src_ag = _mm_and_si128(src_ag, ag_mask);
78
79            // Operations the destination pixels are the same as on the
80            // source pixels. See the comments above.
81            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
82            dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
83            __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
84            dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
85            dst_ag = _mm_and_si128(dst_ag, ag_mask);
86
87            // Combine back into RGBA.
88            // (4 x (as.h, rs.h, gs.h, bs.h))
89            src_pixel = _mm_or_si128(src_rb, src_ag);
90            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
91
92            // Add result
93            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
94            _mm_store_si128(d, result);
95            s++;
96            d++;
97            count -= 4;
98        }
99        src = reinterpret_cast<const SkPMColor*>(s);
100        dst = reinterpret_cast<SkPMColor*>(d);
101    }
102
103    while (count > 0) {
104        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
105        src++;
106        dst++;
107        count--;
108    }
109}
110
111void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
112                                const SkPMColor* SK_RESTRICT src,
113                                int count, U8CPU alpha) {
114    SkASSERT(alpha == 255);
115    if (count <= 0) {
116        return;
117    }
118
119    if (count >= 4) {
120        SkASSERT(((size_t)dst & 0x03) == 0);
121        while (((size_t)dst & 0x0F) != 0) {
122            *dst = SkPMSrcOver(*src, *dst);
123            src++;
124            dst++;
125            count--;
126        }
127
128        const __m128i *s = reinterpret_cast<const __m128i*>(src);
129        __m128i *d = reinterpret_cast<__m128i*>(dst);
130#ifdef SK_USE_ACCURATE_BLENDING
131        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
132        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
133        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
134        while (count >= 4) {
135            // Load 4 pixels
136            __m128i src_pixel = _mm_loadu_si128(s);
137            __m128i dst_pixel = _mm_load_si128(d);
138
139            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
140            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
141            // Shift alphas down to lower 8 bits of each quad.
142            __m128i alpha = _mm_srli_epi32(src_pixel, 24);
143
144            // Copy alpha to upper 3rd byte of each quad
145            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
146
147            // Subtract alphas from 255, to get 0..255
148            alpha = _mm_sub_epi16(c_255, alpha);
149
150            // Multiply by red and blue by src alpha.
151            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
152            // Multiply by alpha and green by src alpha.
153            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
154
155            // dst_rb_low = (dst_rb >> 8)
156            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
157            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
158
159            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
160            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
161            dst_rb = _mm_add_epi16(dst_rb, c_128);
162            dst_rb = _mm_srli_epi16(dst_rb, 8);
163
164            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
165            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
166            dst_ag = _mm_add_epi16(dst_ag, c_128);
167            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
168
169            // Combine back into RGBA.
170            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
171
172            // Add result
173            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
174            _mm_store_si128(d, result);
175            s++;
176            d++;
177            count -= 4;
178        }
179    #else
180        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
181        __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
182        while (count >= 4) {
183            // Load 4 pixels
184            __m128i src_pixel = _mm_loadu_si128(s);
185            __m128i dst_pixel = _mm_load_si128(d);
186
187            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
188            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
189
190            // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
191            __m128i alpha = _mm_srli_epi16(src_pixel, 8);
192
193            // (a0, a0, a1, a1, a2, g2, a3, g3)
194            alpha = _mm_shufflehi_epi16(alpha, 0xF5);
195
196            // (a0, a0, a1, a1, a2, a2, a3, a3)
197            alpha = _mm_shufflelo_epi16(alpha, 0xF5);
198
199            // Subtract alphas from 256, to get 1..256
200            alpha = _mm_sub_epi16(c_256, alpha);
201
202            // Multiply by red and blue by src alpha.
203            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
204            // Multiply by alpha and green by src alpha.
205            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
206
207            // Divide by 256.
208            dst_rb = _mm_srli_epi16(dst_rb, 8);
209
210            // Mask out high bits (already in the right place)
211            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
212
213            // Combine back into RGBA.
214            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
215
216            // Add result
217            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
218            _mm_store_si128(d, result);
219            s++;
220            d++;
221            count -= 4;
222        }
223#endif
224        src = reinterpret_cast<const SkPMColor*>(s);
225        dst = reinterpret_cast<SkPMColor*>(d);
226    }
227
228    while (count > 0) {
229        *dst = SkPMSrcOver(*src, *dst);
230        src++;
231        dst++;
232        count--;
233    }
234}
235
236void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
237                               const SkPMColor* SK_RESTRICT src,
238                               int count, U8CPU alpha) {
239    SkASSERT(alpha <= 255);
240    if (count <= 0) {
241        return;
242    }
243
244    if (count >= 4) {
245        while (((size_t)dst & 0x0F) != 0) {
246            *dst = SkBlendARGB32(*src, *dst, alpha);
247            src++;
248            dst++;
249            count--;
250        }
251
252        uint32_t src_scale = SkAlpha255To256(alpha);
253
254        const __m128i *s = reinterpret_cast<const __m128i*>(src);
255        __m128i *d = reinterpret_cast<__m128i*>(dst);
256        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
257        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
258        __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
259        while (count >= 4) {
260            // Load 4 pixels each of src and dest.
261            __m128i src_pixel = _mm_loadu_si128(s);
262            __m128i dst_pixel = _mm_load_si128(d);
263
264            // Get red and blue pixels into lower byte of each word.
265            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
266            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
267
268            // Get alpha and green into lower byte of each word.
269            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
270            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
271
272            // Put per-pixel alpha in low byte of each word.
273            // After the following two statements, the dst_alpha looks like
274            // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
275            __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
276            dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
277
278            // dst_alpha = dst_alpha * src_scale
279            // Because src_scales are in the higher byte of each word and
280            // we use mulhi here, the resulting alpha values are already
281            // in the right place and don't need to be divided by 256.
282            // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
283            dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
284
285            // Subtract alphas from 256, to get 1..256
286            dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
287
288            // Multiply red and blue by dst pixel alpha.
289            dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
290            // Multiply alpha and green by dst pixel alpha.
291            dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
292
293            // Multiply red and blue by global alpha.
294            // (4 x (0, rs.h, 0, bs.h))
295            // where rs.h stands for the higher byte of r * src_scale,
296            // and bs.h the higher byte of b * src_scale.
297            // Again, because we use mulhi, the resuling red and blue
298            // values are already in the right place and don't need to
299            // be divided by 256.
300            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
301            // Multiply alpha and green by global alpha.
302            // (4 x (0, as.h, 0, gs.h))
303            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
304
305            // Divide by 256.
306            dst_rb = _mm_srli_epi16(dst_rb, 8);
307
308            // Mask out low bits (goodies already in the right place; no need to divide)
309            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
310            // Shift alpha and green to higher byte of each word.
311            // (4 x (as.h, 0, gs.h, 0))
312            src_ag = _mm_slli_epi16(src_ag, 8);
313
314            // Combine back into RGBA.
315            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
316            src_pixel = _mm_or_si128(src_rb, src_ag);
317
318            // Add two pixels into result.
319            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
320            _mm_store_si128(d, result);
321            s++;
322            d++;
323            count -= 4;
324        }
325        src = reinterpret_cast<const SkPMColor*>(s);
326        dst = reinterpret_cast<SkPMColor*>(d);
327    }
328
329    while (count > 0) {
330        *dst = SkBlendARGB32(*src, *dst, alpha);
331        src++;
332        dst++;
333        count--;
334    }
335}
336
337/* SSE2 version of Color32()
338 * portable version is in core/SkBlitRow_D32.cpp
339 */
340void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
341                  SkPMColor color) {
342
343    if (count <= 0) {
344        return;
345    }
346
347    if (0 == color) {
348        if (src != dst) {
349            memcpy(dst, src, count * sizeof(SkPMColor));
350        }
351        return;
352    }
353
354    unsigned colorA = SkGetPackedA32(color);
355    if (255 == colorA) {
356        sk_memset32(dst, color, count);
357    } else {
358        unsigned scale = 256 - SkAlpha255To256(colorA);
359
360        if (count >= 4) {
361            SkASSERT(((size_t)dst & 0x03) == 0);
362            while (((size_t)dst & 0x0F) != 0) {
363                *dst = color + SkAlphaMulQ(*src, scale);
364                src++;
365                dst++;
366                count--;
367            }
368
369            const __m128i *s = reinterpret_cast<const __m128i*>(src);
370            __m128i *d = reinterpret_cast<__m128i*>(dst);
371            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
372            __m128i src_scale_wide = _mm_set1_epi16(scale);
373            __m128i color_wide = _mm_set1_epi32(color);
374            while (count >= 4) {
375                // Load 4 pixels each of src and dest.
376                __m128i src_pixel = _mm_loadu_si128(s);
377
378                // Get red and blue pixels into lower byte of each word.
379                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
380
381                // Get alpha and green into lower byte of each word.
382                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
383
384                // Multiply by scale.
385                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
386                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
387
388                // Divide by 256.
389                src_rb = _mm_srli_epi16(src_rb, 8);
390                src_ag = _mm_andnot_si128(rb_mask, src_ag);
391
392                // Combine back into RGBA.
393                src_pixel = _mm_or_si128(src_rb, src_ag);
394
395                // Add color to result.
396                __m128i result = _mm_add_epi8(color_wide, src_pixel);
397
398                // Store result.
399                _mm_store_si128(d, result);
400                s++;
401                d++;
402                count -= 4;
403            }
404            src = reinterpret_cast<const SkPMColor*>(s);
405            dst = reinterpret_cast<SkPMColor*>(d);
406         }
407
408        while (count > 0) {
409            *dst = color + SkAlphaMulQ(*src, scale);
410            src += 1;
411            dst += 1;
412            count--;
413        }
414    }
415}
416
417void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
418                               size_t maskRB, SkColor origColor,
419                               int width, int height) {
420    SkPMColor color = SkPreMultiplyColor(origColor);
421    size_t dstOffset = dstRB - (width << 2);
422    size_t maskOffset = maskRB - width;
423    SkPMColor* dst = (SkPMColor *)device;
424    const uint8_t* mask = (const uint8_t*)maskPtr;
425    do {
426        int count = width;
427        if (count >= 4) {
428            while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
429                *dst = SkBlendARGB32(color, *dst, *mask);
430                mask++;
431                dst++;
432                count--;
433            }
434            __m128i *d = reinterpret_cast<__m128i*>(dst);
435            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
436            __m128i c_256 = _mm_set1_epi16(256);
437            __m128i c_1 = _mm_set1_epi16(1);
438            __m128i src_pixel = _mm_set1_epi32(color);
439            while (count >= 4) {
440                // Load 4 pixels each of src and dest.
441                __m128i dst_pixel = _mm_load_si128(d);
442
443                //set the aphla value
444                __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
445                                0, *(mask+3),0, \
446                                *(mask+2),0, *(mask+2),\
447                                0,*(mask+1), 0,*(mask+1),\
448                                0, *mask,0,*mask);
449
450                //call SkAlpha255To256()
451                src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
452
453                // Get red and blue pixels into lower byte of each word.
454                __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
455                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
456
457                // Get alpha and green into lower byte of each word.
458                __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
459                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
460
461                // Put per-pixel alpha in low byte of each word.
462                __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
463                dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
464
465                // dst_alpha = dst_alpha * src_scale
466                dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
467
468                // Divide by 256.
469                dst_alpha = _mm_srli_epi16(dst_alpha, 8);
470
471                // Subtract alphas from 256, to get 1..256
472                dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
473                // Multiply red and blue by dst pixel alpha.
474                dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
475                // Multiply alpha and green by dst pixel alpha.
476                dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
477
478                // Multiply red and blue by global alpha.
479                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
480                // Multiply alpha and green by global alpha.
481                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
482                // Divide by 256.
483                dst_rb = _mm_srli_epi16(dst_rb, 8);
484                src_rb = _mm_srli_epi16(src_rb, 8);
485
486                // Mask out low bits (goodies already in the right place; no need to divide)
487                dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
488                src_ag = _mm_andnot_si128(rb_mask, src_ag);
489
490                // Combine back into RGBA.
491                dst_pixel = _mm_or_si128(dst_rb, dst_ag);
492                __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
493
494                // Add two pixels into result.
495                __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
496                _mm_store_si128(d, result);
497                // load the next 4 pixel
498                mask = mask + 4;
499                d++;
500                count -= 4;
501            }
502            dst = reinterpret_cast<SkPMColor *>(d);
503        }
504        while(count > 0) {
505            *dst= SkBlendARGB32(color, *dst, *mask);
506            dst += 1;
507            mask++;
508            count --;
509        }
510        dst = (SkPMColor *)((char*)dst + dstOffset);
511        mask += maskOffset;
512    } while (--height != 0);
513}
514
515static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst,
516                                 __m128i &mask, __m128i &scale) {
517    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
518    __m128i r = _mm_and_si128(_mm_slli_epi32(mask,
519                              16-SK_R16_SHIFT-(SK_R16_BITS-5)),
520                              _mm_set1_epi32(0x001F0000));
521
522    __m128i g = _mm_and_si128(_mm_slli_epi32(mask,
523                              8-SK_G16_SHIFT-(SK_G16_BITS-5)),
524                              _mm_set1_epi32(0x00001F00));
525
526    __m128i b = _mm_and_si128(_mm_slli_epi32(mask,
527                              SK_B16_BITS-5),
528                              _mm_set1_epi32(0x0000001F));
529
530    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
531    mask = _mm_or_si128(_mm_or_si128(r, g), b);
532
533    // Interleave R,G,B into the lower byte of word.
534    __m128i maskLo, maskHi;
535    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
536    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
537
538    // Upscale to 0..32
539    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
540    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
541
542    maskLo = _mm_mullo_epi16(maskLo, scale);
543    maskHi = _mm_mullo_epi16(maskHi, scale);
544
545    maskLo = _mm_srli_epi16(maskLo, 8);
546    maskHi = _mm_srli_epi16(maskHi, 8);
547
548    // Interleave R,G,B into the lower byte of the word.
549    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
550    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
551
552    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
553    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
554
555    maskLo = _mm_srai_epi16(maskLo, 5);
556    maskHi = _mm_srai_epi16(maskHi, 5);
557
558    // Add two pixels into result.
559    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
560    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
561
562    // Pack into 4 32bit dst pixels
563    return _mm_packus_epi16(resultLo, resultHi);
564}
565
566static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst,
567                                       __m128i &mask) {
568    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
569    __m128i r = _mm_and_si128(_mm_slli_epi32(mask,
570                              16-SK_R16_SHIFT-(SK_R16_BITS-5)),
571                              _mm_set1_epi32(0x001F0000));
572
573    __m128i g = _mm_and_si128(_mm_slli_epi32(mask,
574                              8-SK_G16_SHIFT-(SK_G16_BITS-5)),
575                              _mm_set1_epi32(0x00001F00));
576
577    __m128i b = _mm_and_si128(_mm_slli_epi32(mask, SK_B16_BITS-5),
578                              _mm_set1_epi32(0x0000001F));
579
580    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
581    mask = _mm_or_si128(_mm_or_si128(r, g), b);
582
583    // Interleave R,G,B into the lower byte of word.
584    __m128i maskLo, maskHi;
585    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
586    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
587
588    // Upscale to 0..32
589    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
590    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
591
592    // Interleave R,G,B into the lower byte of the word.
593    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
594    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
595
596    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
597    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
598
599    maskLo = _mm_srai_epi16(maskLo, 5);
600    maskHi = _mm_srai_epi16(maskHi, 5);
601
602    // Add two pixels into result.
603    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
604    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
605
606    // Pack into 4 32bit dst pixels
607    return _mm_packus_epi16(resultLo, resultHi);
608}
609
610void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
611                         SkColor color, int width, SkPMColor) {
612    if (width <= 0) {
613        return;
614    }
615
616    int srcA = SkColorGetA(color);
617    int srcR = SkColorGetR(color);
618    int srcG = SkColorGetG(color);
619    int srcB = SkColorGetB(color);
620
621    srcA = SkAlpha255To256(srcA);
622
623    if (width >= 4) {
624        SkASSERT(((size_t)dst & 0x03) == 0);
625        while (((size_t)dst & 0x0F) != 0) {
626            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
627            src++;
628            dst++;
629            width--;
630        }
631
632        __m128i *d = reinterpret_cast<__m128i*>(dst);
633        __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
634        srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
635        __m128i scale = _mm_set1_epi16(srcA);
636        while (width >= 4) {
637            __m128i dst_pixel = _mm_load_si128(d);
638            __m128i mask_pixel = _mm_loadl_epi64(
639                                     reinterpret_cast<const __m128i*>(src));
640
641            // Check whether mask_pixels are equal to 0 and get the highest bit
642            // of each byte of result, if mask pixes are all zero, we will get
643            // pack_cmp to 0xFFFF
644            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
645                                             _mm_setzero_si128()));
646
647            // if mask pixels are not all zero, we will blend the dst pixels
648            if (pack_cmp != 0xFFFF) {
649                // Unpack 4 16bit mask pixels to
650                // (p0, 0, p1, 0, p2, 0, p3, 0)
651                mask_pixel = _mm_unpacklo_epi16(mask_pixel,
652                                                _mm_setzero_si128());
653
654                // Process 4 32bit dst pixels
655                __m128i result = SkBlendLCD16_SSE2(srci, dst_pixel,
656                                                   mask_pixel, scale);
657                _mm_store_si128(d, result);
658            }
659
660            d++;
661            src += 4;
662            width -= 4;
663        }
664
665        dst = reinterpret_cast<SkPMColor*>(d);
666    }
667
668    while (width > 0) {
669        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
670        src++;
671        dst++;
672        width--;
673    }
674}
675
676void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
677                               SkColor color, int width, SkPMColor opaqueDst) {
678    if (width <= 0) {
679        return;
680    }
681
682    int srcR = SkColorGetR(color);
683    int srcG = SkColorGetG(color);
684    int srcB = SkColorGetB(color);
685
686    if (width >= 4) {
687        SkASSERT(((size_t)dst & 0x03) == 0);
688        while (((size_t)dst & 0x0F) != 0) {
689            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
690            src++;
691            dst++;
692            width--;
693        }
694
695        __m128i *d = reinterpret_cast<__m128i*>(dst);
696        __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
697        srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
698        while (width >= 4) {
699            __m128i dst_pixel = _mm_load_si128(d);
700            __m128i mask_pixel = _mm_loadl_epi64(
701                                     reinterpret_cast<const __m128i*>(src));
702
703            // Check whether mask_pixels are equal to 0 and get the highest bit
704            // of each byte of result, if mask pixes are all zero, we will get
705            // pack_cmp to 0xFFFF
706            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
707                                             _mm_setzero_si128()));
708
709            // if mask pixels are not all zero, we will blend the dst pixels
710            if (pack_cmp != 0xFFFF) {
711                // Unpack 4 16bit mask pixels to
712                // (p0, 0, p1, 0, p2, 0, p3, 0)
713                mask_pixel = _mm_unpacklo_epi16(mask_pixel,
714                                                _mm_setzero_si128());
715
716                // Process 4 32bit dst pixels
717                __m128i result = SkBlendLCD16Opaque_SSE2(srci, dst_pixel,
718                                                         mask_pixel);
719                _mm_store_si128(d, result);
720            }
721
722            d++;
723            src += 4;
724            width -= 4;
725        }
726
727        dst = reinterpret_cast<SkPMColor*>(d);
728    }
729
730    while (width > 0) {
731        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
732        src++;
733        dst++;
734        width--;
735    }
736}
737