SkBlitRow_opts_SSE2.cpp revision 39ce33a1facae795eb2f02e35674702de7eb23b5
1/*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8
9#include "SkBlitRow_opts_SSE2.h"
10#include "SkBitmapProcState_opts_SSE2.h"
11#include "SkColorPriv.h"
12#include "SkColor_opts_SSE2.h"
13#include "SkUtils.h"
14
15#include <emmintrin.h>
16
17/* SSE2 version of S32_Blend_BlitRow32()
18 * portable version is in core/SkBlitRow_D32.cpp
19 */
20void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
21                              const SkPMColor* SK_RESTRICT src,
22                              int count, U8CPU alpha) {
23    SkASSERT(alpha <= 255);
24    if (count <= 0) {
25        return;
26    }
27
28    uint32_t src_scale = SkAlpha255To256(alpha);
29    uint32_t dst_scale = 256 - src_scale;
30
31    if (count >= 4) {
32        SkASSERT(((size_t)dst & 0x03) == 0);
33        while (((size_t)dst & 0x0F) != 0) {
34            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
35            src++;
36            dst++;
37            count--;
38        }
39
40        const __m128i *s = reinterpret_cast<const __m128i*>(src);
41        __m128i *d = reinterpret_cast<__m128i*>(dst);
42        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
43        __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
44
45        // Move scale factors to upper byte of word
46        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
47        __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
48        while (count >= 4) {
49            // Load 4 pixels each of src and dest.
50            __m128i src_pixel = _mm_loadu_si128(s);
51            __m128i dst_pixel = _mm_load_si128(d);
52
53            // Interleave Atom port 0/1 operations based on the execution port
54            // constraints that multiply can only be executed on port 0 (while
55            // boolean operations can be executed on either port 0 or port 1)
56            // because GCC currently doesn't do a good job scheduling
57            // instructions based on these constraints.
58
59            // Get red and blue pixels into lower byte of each word.
60            // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
61            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
62
63            // Multiply by scale.
64            // (4 x (0, rs.h, 0, bs.h))
65            // where rs.h stands for the higher byte of r * scale, and
66            // bs.h the higher byte of b * scale.
67            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
68
69            // Get alpha and green pixels into higher byte of each word.
70            // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
71            __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
72
73            // Multiply by scale.
74            // (4 x (as.h, as.l, gs.h, gs.l))
75            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
76
77            // Clear the lower byte of the a*scale and g*scale results
78            // (4 x (as.h, 0, gs.h, 0))
79            src_ag = _mm_and_si128(src_ag, ag_mask);
80
81            // Operations the destination pixels are the same as on the
82            // source pixels. See the comments above.
83            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
84            dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
85            __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
86            dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
87            dst_ag = _mm_and_si128(dst_ag, ag_mask);
88
89            // Combine back into RGBA.
90            // (4 x (as.h, rs.h, gs.h, bs.h))
91            src_pixel = _mm_or_si128(src_rb, src_ag);
92            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
93
94            // Add result
95            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
96            _mm_store_si128(d, result);
97            s++;
98            d++;
99            count -= 4;
100        }
101        src = reinterpret_cast<const SkPMColor*>(s);
102        dst = reinterpret_cast<SkPMColor*>(d);
103    }
104
105    while (count > 0) {
106        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
107        src++;
108        dst++;
109        count--;
110    }
111}
112
113void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
114                                const SkPMColor* SK_RESTRICT src,
115                                int count, U8CPU alpha) {
116    SkASSERT(alpha == 255);
117    if (count <= 0) {
118        return;
119    }
120
121    if (count >= 4) {
122        SkASSERT(((size_t)dst & 0x03) == 0);
123        while (((size_t)dst & 0x0F) != 0) {
124            *dst = SkPMSrcOver(*src, *dst);
125            src++;
126            dst++;
127            count--;
128        }
129
130        const __m128i *s = reinterpret_cast<const __m128i*>(src);
131        __m128i *d = reinterpret_cast<__m128i*>(dst);
132#ifdef SK_USE_ACCURATE_BLENDING
133        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
134        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
135        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
136        while (count >= 4) {
137            // Load 4 pixels
138            __m128i src_pixel = _mm_loadu_si128(s);
139            __m128i dst_pixel = _mm_load_si128(d);
140
141            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
142            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
143            // Shift alphas down to lower 8 bits of each quad.
144            __m128i alpha = _mm_srli_epi32(src_pixel, 24);
145
146            // Copy alpha to upper 3rd byte of each quad
147            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
148
149            // Subtract alphas from 255, to get 0..255
150            alpha = _mm_sub_epi16(c_255, alpha);
151
152            // Multiply by red and blue by src alpha.
153            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
154            // Multiply by alpha and green by src alpha.
155            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
156
157            // dst_rb_low = (dst_rb >> 8)
158            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
159            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
160
161            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
162            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
163            dst_rb = _mm_add_epi16(dst_rb, c_128);
164            dst_rb = _mm_srli_epi16(dst_rb, 8);
165
166            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
167            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
168            dst_ag = _mm_add_epi16(dst_ag, c_128);
169            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
170
171            // Combine back into RGBA.
172            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
173
174            // Add result
175            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
176            _mm_store_si128(d, result);
177            s++;
178            d++;
179            count -= 4;
180        }
181    #else
182        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
183        __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
184        while (count >= 4) {
185            // Load 4 pixels
186            __m128i src_pixel = _mm_loadu_si128(s);
187            __m128i dst_pixel = _mm_load_si128(d);
188
189            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
190            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
191
192            // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
193            __m128i alpha = _mm_srli_epi16(src_pixel, 8);
194
195            // (a0, a0, a1, a1, a2, g2, a3, g3)
196            alpha = _mm_shufflehi_epi16(alpha, 0xF5);
197
198            // (a0, a0, a1, a1, a2, a2, a3, a3)
199            alpha = _mm_shufflelo_epi16(alpha, 0xF5);
200
201            // Subtract alphas from 256, to get 1..256
202            alpha = _mm_sub_epi16(c_256, alpha);
203
204            // Multiply by red and blue by src alpha.
205            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
206            // Multiply by alpha and green by src alpha.
207            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
208
209            // Divide by 256.
210            dst_rb = _mm_srli_epi16(dst_rb, 8);
211
212            // Mask out high bits (already in the right place)
213            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
214
215            // Combine back into RGBA.
216            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
217
218            // Add result
219            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
220            _mm_store_si128(d, result);
221            s++;
222            d++;
223            count -= 4;
224        }
225#endif
226        src = reinterpret_cast<const SkPMColor*>(s);
227        dst = reinterpret_cast<SkPMColor*>(d);
228    }
229
230    while (count > 0) {
231        *dst = SkPMSrcOver(*src, *dst);
232        src++;
233        dst++;
234        count--;
235    }
236}
237
238void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
239                               const SkPMColor* SK_RESTRICT src,
240                               int count, U8CPU alpha) {
241    SkASSERT(alpha <= 255);
242    if (count <= 0) {
243        return;
244    }
245
246    if (count >= 4) {
247        while (((size_t)dst & 0x0F) != 0) {
248            *dst = SkBlendARGB32(*src, *dst, alpha);
249            src++;
250            dst++;
251            count--;
252        }
253
254        uint32_t src_scale = SkAlpha255To256(alpha);
255
256        const __m128i *s = reinterpret_cast<const __m128i*>(src);
257        __m128i *d = reinterpret_cast<__m128i*>(dst);
258        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
259        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
260        __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
261        while (count >= 4) {
262            // Load 4 pixels each of src and dest.
263            __m128i src_pixel = _mm_loadu_si128(s);
264            __m128i dst_pixel = _mm_load_si128(d);
265
266            // Get red and blue pixels into lower byte of each word.
267            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
268            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
269
270            // Get alpha and green into lower byte of each word.
271            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
272            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
273
274            // Put per-pixel alpha in low byte of each word.
275            // After the following two statements, the dst_alpha looks like
276            // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
277            __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
278            dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
279
280            // dst_alpha = dst_alpha * src_scale
281            // Because src_scales are in the higher byte of each word and
282            // we use mulhi here, the resulting alpha values are already
283            // in the right place and don't need to be divided by 256.
284            // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
285            dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
286
287            // Subtract alphas from 256, to get 1..256
288            dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
289
290            // Multiply red and blue by dst pixel alpha.
291            dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
292            // Multiply alpha and green by dst pixel alpha.
293            dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
294
295            // Multiply red and blue by global alpha.
296            // (4 x (0, rs.h, 0, bs.h))
297            // where rs.h stands for the higher byte of r * src_scale,
298            // and bs.h the higher byte of b * src_scale.
299            // Again, because we use mulhi, the resuling red and blue
300            // values are already in the right place and don't need to
301            // be divided by 256.
302            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
303            // Multiply alpha and green by global alpha.
304            // (4 x (0, as.h, 0, gs.h))
305            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
306
307            // Divide by 256.
308            dst_rb = _mm_srli_epi16(dst_rb, 8);
309
310            // Mask out low bits (goodies already in the right place; no need to divide)
311            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
312            // Shift alpha and green to higher byte of each word.
313            // (4 x (as.h, 0, gs.h, 0))
314            src_ag = _mm_slli_epi16(src_ag, 8);
315
316            // Combine back into RGBA.
317            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
318            src_pixel = _mm_or_si128(src_rb, src_ag);
319
320            // Add two pixels into result.
321            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
322            _mm_store_si128(d, result);
323            s++;
324            d++;
325            count -= 4;
326        }
327        src = reinterpret_cast<const SkPMColor*>(s);
328        dst = reinterpret_cast<SkPMColor*>(d);
329    }
330
331    while (count > 0) {
332        *dst = SkBlendARGB32(*src, *dst, alpha);
333        src++;
334        dst++;
335        count--;
336    }
337}
338
339/* SSE2 version of Color32()
340 * portable version is in core/SkBlitRow_D32.cpp
341 */
342void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
343                  SkPMColor color) {
344
345    if (count <= 0) {
346        return;
347    }
348
349    if (0 == color) {
350        if (src != dst) {
351            memcpy(dst, src, count * sizeof(SkPMColor));
352        }
353        return;
354    }
355
356    unsigned colorA = SkGetPackedA32(color);
357    if (255 == colorA) {
358        sk_memset32(dst, color, count);
359    } else {
360        unsigned scale = 256 - SkAlpha255To256(colorA);
361
362        if (count >= 4) {
363            SkASSERT(((size_t)dst & 0x03) == 0);
364            while (((size_t)dst & 0x0F) != 0) {
365                *dst = color + SkAlphaMulQ(*src, scale);
366                src++;
367                dst++;
368                count--;
369            }
370
371            const __m128i *s = reinterpret_cast<const __m128i*>(src);
372            __m128i *d = reinterpret_cast<__m128i*>(dst);
373            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
374            __m128i src_scale_wide = _mm_set1_epi16(scale);
375            __m128i color_wide = _mm_set1_epi32(color);
376            while (count >= 4) {
377                // Load 4 pixels each of src and dest.
378                __m128i src_pixel = _mm_loadu_si128(s);
379
380                // Get red and blue pixels into lower byte of each word.
381                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
382
383                // Get alpha and green into lower byte of each word.
384                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
385
386                // Multiply by scale.
387                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
388                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
389
390                // Divide by 256.
391                src_rb = _mm_srli_epi16(src_rb, 8);
392                src_ag = _mm_andnot_si128(rb_mask, src_ag);
393
394                // Combine back into RGBA.
395                src_pixel = _mm_or_si128(src_rb, src_ag);
396
397                // Add color to result.
398                __m128i result = _mm_add_epi8(color_wide, src_pixel);
399
400                // Store result.
401                _mm_store_si128(d, result);
402                s++;
403                d++;
404                count -= 4;
405            }
406            src = reinterpret_cast<const SkPMColor*>(s);
407            dst = reinterpret_cast<SkPMColor*>(d);
408         }
409
410        while (count > 0) {
411            *dst = color + SkAlphaMulQ(*src, scale);
412            src += 1;
413            dst += 1;
414            count--;
415        }
416    }
417}
418
419void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
420                               size_t maskRB, SkColor origColor,
421                               int width, int height) {
422    SkPMColor color = SkPreMultiplyColor(origColor);
423    size_t dstOffset = dstRB - (width << 2);
424    size_t maskOffset = maskRB - width;
425    SkPMColor* dst = (SkPMColor *)device;
426    const uint8_t* mask = (const uint8_t*)maskPtr;
427    do {
428        int count = width;
429        if (count >= 4) {
430            while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
431                *dst = SkBlendARGB32(color, *dst, *mask);
432                mask++;
433                dst++;
434                count--;
435            }
436            __m128i *d = reinterpret_cast<__m128i*>(dst);
437            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
438            __m128i c_256 = _mm_set1_epi16(256);
439            __m128i c_1 = _mm_set1_epi16(1);
440            __m128i src_pixel = _mm_set1_epi32(color);
441            while (count >= 4) {
442                // Load 4 pixels each of src and dest.
443                __m128i dst_pixel = _mm_load_si128(d);
444
445                //set the aphla value
446                __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
447                                0, *(mask+3),0, \
448                                *(mask+2),0, *(mask+2),\
449                                0,*(mask+1), 0,*(mask+1),\
450                                0, *mask,0,*mask);
451
452                //call SkAlpha255To256()
453                src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
454
455                // Get red and blue pixels into lower byte of each word.
456                __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
457                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
458
459                // Get alpha and green into lower byte of each word.
460                __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
461                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
462
463                // Put per-pixel alpha in low byte of each word.
464                __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
465                dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
466
467                // dst_alpha = dst_alpha * src_scale
468                dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
469
470                // Divide by 256.
471                dst_alpha = _mm_srli_epi16(dst_alpha, 8);
472
473                // Subtract alphas from 256, to get 1..256
474                dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
475                // Multiply red and blue by dst pixel alpha.
476                dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
477                // Multiply alpha and green by dst pixel alpha.
478                dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
479
480                // Multiply red and blue by global alpha.
481                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
482                // Multiply alpha and green by global alpha.
483                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
484                // Divide by 256.
485                dst_rb = _mm_srli_epi16(dst_rb, 8);
486                src_rb = _mm_srli_epi16(src_rb, 8);
487
488                // Mask out low bits (goodies already in the right place; no need to divide)
489                dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
490                src_ag = _mm_andnot_si128(rb_mask, src_ag);
491
492                // Combine back into RGBA.
493                dst_pixel = _mm_or_si128(dst_rb, dst_ag);
494                __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
495
496                // Add two pixels into result.
497                __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
498                _mm_store_si128(d, result);
499                // load the next 4 pixel
500                mask = mask + 4;
501                d++;
502                count -= 4;
503            }
504            dst = reinterpret_cast<SkPMColor *>(d);
505        }
506        while(count > 0) {
507            *dst= SkBlendARGB32(color, *dst, *mask);
508            dst += 1;
509            mask++;
510            count --;
511        }
512        dst = (SkPMColor *)((char*)dst + dstOffset);
513        mask += maskOffset;
514    } while (--height != 0);
515}
516
// Shift counts that move the top 5 bits of each RGB16 mask component so that
// they start at the bit position of the matching SkPMColor component
// (SK_x32_SHIFT). A positive count means a left shift, a negative count a
// right shift. Note that the mask's RGB16 order may differ from the SkPMColor
// order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// SkPacked?16x5ToUnmasked?32x5_SSE2(x) applies the shift above to every
// 32-bit lane of x. Bits belonging to the other components are not cleared
// ("unmasked"); users AND the result with (0x1F << SK_?32_SHIFT).

#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
547
548static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
549                                 __m128i &mask, __m128i &srcA) {
550    // In the following comments, the components of src, dst and mask are
551    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
552    // by an R, G, B, or A suffix. Components of one of the four pixels that
553    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
554    // example is the blue channel of the second destination pixel. Memory
555    // layout is shown for an ARGB byte order in a color value.
556
557    // src and srcA store 8-bit values interleaved with zeros.
558    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
559    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
560    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
561    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
562    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
563    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
564    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
565
566    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
567    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
568    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
569                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
570
571    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
572    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
573                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
574
575    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
576    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
577                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
578
579    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
580    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
581    // 8-bit position
582    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
583    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
584    mask = _mm_or_si128(_mm_or_si128(r, g), b);
585
586    // Interleave R,G,B into the lower byte of word.
587    // i.e. split the sixteen 8-bit values from mask into two sets of eight
588    // 16-bit values, padded by zero.
589    __m128i maskLo, maskHi;
590    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
591    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
592    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
593    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
594
595    // Upscale from 0..31 to 0..32
596    // (allows to replace division by left-shift further down)
597    // Left-shift each component by 4 and add the result back to that component,
598    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
599    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
600    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
601
602    // Multiply each component of maskLo and maskHi by srcA
603    maskLo = _mm_mullo_epi16(maskLo, srcA);
604    maskHi = _mm_mullo_epi16(maskHi, srcA);
605
606    // Left shift mask components by 8 (divide by 256)
607    maskLo = _mm_srli_epi16(maskLo, 8);
608    maskHi = _mm_srli_epi16(maskHi, 8);
609
610    // Interleave R,G,B into the lower byte of the word
611    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
612    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
613    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
614    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
615
616    // mask = (src - dst) * mask
617    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
618    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
619
620    // mask = (src - dst) * mask >> 5
621    maskLo = _mm_srai_epi16(maskLo, 5);
622    maskHi = _mm_srai_epi16(maskHi, 5);
623
624    // Add two pixels into result.
625    // result = dst + ((src - dst) * mask >> 5)
626    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
627    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
628
629    // Pack into 4 32bit dst pixels.
630    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
631    // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
632    // clamping to 255 if necessary.
633    return _mm_packus_epi16(resultLo, resultHi);
634}
635
636static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
637                                       __m128i &mask) {
638    // In the following comments, the components of src, dst and mask are
639    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
640    // by an R, G, B, or A suffix. Components of one of the four pixels that
641    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
642    // example is the blue channel of the second destination pixel. Memory
643    // layout is shown for an ARGB byte order in a color value.
644
645    // src and srcA store 8-bit values interleaved with zeros.
646    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
647    // mask stores 16-bit values (shown as high and low bytes) interleaved with
648    // zeros
649    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
650    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
651
652    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
653    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
654    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
655                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
656
657    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
658    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
659                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
660
661    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
662    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
663                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
664
665    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
666    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
667    // 8-bit position
668    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
669    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
670    mask = _mm_or_si128(_mm_or_si128(r, g), b);
671
672    // Interleave R,G,B into the lower byte of word.
673    // i.e. split the sixteen 8-bit values from mask into two sets of eight
674    // 16-bit values, padded by zero.
675    __m128i maskLo, maskHi;
676    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
677    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
678    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
679    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
680
681    // Upscale from 0..31 to 0..32
682    // (allows to replace division by left-shift further down)
683    // Left-shift each component by 4 and add the result back to that component,
684    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
685    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
686    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
687
688    // Interleave R,G,B into the lower byte of the word
689    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
690    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
691    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
692    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
693
694    // mask = (src - dst) * mask
695    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
696    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
697
698    // mask = (src - dst) * mask >> 5
699    maskLo = _mm_srai_epi16(maskLo, 5);
700    maskHi = _mm_srai_epi16(maskHi, 5);
701
702    // Add two pixels into result.
703    // result = dst + ((src - dst) * mask >> 5)
704    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
705    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
706
707    // Pack into 4 32bit dst pixels and force opaque.
708    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
709    // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
710    // clamping to 255 if necessary. Set alpha components to 0xFF.
711    return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
712                        _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
713}
714
715void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
716                         SkColor src, int width, SkPMColor) {
717    if (width <= 0) {
718        return;
719    }
720
721    int srcA = SkColorGetA(src);
722    int srcR = SkColorGetR(src);
723    int srcG = SkColorGetG(src);
724    int srcB = SkColorGetB(src);
725
726    srcA = SkAlpha255To256(srcA);
727
728    if (width >= 4) {
729        SkASSERT(((size_t)dst & 0x03) == 0);
730        while (((size_t)dst & 0x0F) != 0) {
731            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
732            mask++;
733            dst++;
734            width--;
735        }
736
737        __m128i *d = reinterpret_cast<__m128i*>(dst);
738        // Set alpha to 0xFF and replicate source four times in SSE register.
739        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
740        // Interleave with zeros to get two sets of four 16-bit values.
741        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
742        // Set srcA_sse to contain eight copies of srcA, padded with zero.
743        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
744        __m128i srcA_sse = _mm_set1_epi16(srcA);
745        while (width >= 4) {
746            // Load four destination pixels into dst_sse.
747            __m128i dst_sse = _mm_load_si128(d);
748            // Load four 16-bit masks into lower half of mask_sse.
749            __m128i mask_sse = _mm_loadl_epi64(
750                                   reinterpret_cast<const __m128i*>(mask));
751
752            // Check whether masks are equal to 0 and get the highest bit
753            // of each byte of result, if masks are all zero, we will get
754            // pack_cmp to 0xFFFF
755            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
756                                             _mm_setzero_si128()));
757
758            // if mask pixels are not all zero, we will blend the dst pixels
759            if (pack_cmp != 0xFFFF) {
760                // Unpack 4 16bit mask pixels to
761                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
762                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
763                mask_sse = _mm_unpacklo_epi16(mask_sse,
764                                              _mm_setzero_si128());
765
766                // Process 4 32bit dst pixels
767                __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
768                                                   mask_sse, srcA_sse);
769                _mm_store_si128(d, result);
770            }
771
772            d++;
773            mask += 4;
774            width -= 4;
775        }
776
777        dst = reinterpret_cast<SkPMColor*>(d);
778    }
779
780    while (width > 0) {
781        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
782        mask++;
783        dst++;
784        width--;
785    }
786}
787
788void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
789                               SkColor src, int width, SkPMColor opaqueDst) {
790    if (width <= 0) {
791        return;
792    }
793
794    int srcR = SkColorGetR(src);
795    int srcG = SkColorGetG(src);
796    int srcB = SkColorGetB(src);
797
798    if (width >= 4) {
799        SkASSERT(((size_t)dst & 0x03) == 0);
800        while (((size_t)dst & 0x0F) != 0) {
801            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
802            mask++;
803            dst++;
804            width--;
805        }
806
807        __m128i *d = reinterpret_cast<__m128i*>(dst);
808        // Set alpha to 0xFF and replicate source four times in SSE register.
809        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
810        // Set srcA_sse to contain eight copies of srcA, padded with zero.
811        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
812        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
813        while (width >= 4) {
814            // Load four destination pixels into dst_sse.
815            __m128i dst_sse = _mm_load_si128(d);
816            // Load four 16-bit masks into lower half of mask_sse.
817            __m128i mask_sse = _mm_loadl_epi64(
818                                   reinterpret_cast<const __m128i*>(mask));
819
820            // Check whether masks are equal to 0 and get the highest bit
821            // of each byte of result, if masks are all zero, we will get
822            // pack_cmp to 0xFFFF
823            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
824                                             _mm_setzero_si128()));
825
826            // if mask pixels are not all zero, we will blend the dst pixels
827            if (pack_cmp != 0xFFFF) {
828                // Unpack 4 16bit mask pixels to
829                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
830                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
831                mask_sse = _mm_unpacklo_epi16(mask_sse,
832                                              _mm_setzero_si128());
833
834                // Process 4 32bit dst pixels
835                __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
836                                                         mask_sse);
837                _mm_store_si128(d, result);
838            }
839
840            d++;
841            mask += 4;
842            width -= 4;
843        }
844
845        dst = reinterpret_cast<SkPMColor*>(d);
846    }
847
848    while (width > 0) {
849        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
850        mask++;
851        dst++;
852        width--;
853    }
854}
855
856/* SSE2 version of S32_D565_Opaque()
857 * portable version is in core/SkBlitRow_D16.cpp
858 */
859void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
860                          const SkPMColor* SK_RESTRICT src, int count,
861                          U8CPU alpha, int /*x*/, int /*y*/) {
862    SkASSERT(255 == alpha);
863
864    if (count <= 0) {
865        return;
866    }
867
868    if (count >= 8) {
869        while (((size_t)dst & 0x0F) != 0) {
870            SkPMColor c = *src++;
871            SkPMColorAssert(c);
872
873            *dst++ = SkPixel32ToPixel16_ToU16(c);
874            count--;
875        }
876
877        const __m128i* s = reinterpret_cast<const __m128i*>(src);
878        __m128i* d = reinterpret_cast<__m128i*>(dst);
879        __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
880        __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
881        __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);
882
883        while (count >= 8) {
884            // Load 8 pixels of src.
885            __m128i src_pixel1 = _mm_loadu_si128(s++);
886            __m128i src_pixel2 = _mm_loadu_si128(s++);
887
888            // Calculate result r.
889            __m128i r1 = _mm_srli_epi32(src_pixel1,
890                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
891            r1 = _mm_and_si128(r1, r16_mask);
892            __m128i r2 = _mm_srli_epi32(src_pixel2,
893                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
894            r2 = _mm_and_si128(r2, r16_mask);
895            __m128i r = _mm_packs_epi32(r1, r2);
896
897            // Calculate result g.
898            __m128i g1 = _mm_srli_epi32(src_pixel1,
899                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
900            g1 = _mm_and_si128(g1, g16_mask);
901            __m128i g2 = _mm_srli_epi32(src_pixel2,
902                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
903            g2 = _mm_and_si128(g2, g16_mask);
904            __m128i g = _mm_packs_epi32(g1, g2);
905
906            // Calculate result b.
907            __m128i b1 = _mm_srli_epi32(src_pixel1,
908                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
909            b1 = _mm_and_si128(b1, b16_mask);
910            __m128i b2 = _mm_srli_epi32(src_pixel2,
911                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
912            b2 = _mm_and_si128(b2, b16_mask);
913            __m128i b = _mm_packs_epi32(b1, b2);
914
915            // Store 8 16-bit colors in dst.
916            __m128i d_pixel = SkPackRGB16_SSE(r, g, b);
917            _mm_store_si128(d++, d_pixel);
918            count -= 8;
919        }
920        src = reinterpret_cast<const SkPMColor*>(s);
921        dst = reinterpret_cast<uint16_t*>(d);
922    }
923
924    if (count > 0) {
925        do {
926            SkPMColor c = *src++;
927            SkPMColorAssert(c);
928            *dst++ = SkPixel32ToPixel16_ToU16(c);
929        } while (--count != 0);
930    }
931}
932
933/* SSE2 version of S32A_D565_Opaque()
934 * portable version is in core/SkBlitRow_D16.cpp
935 */
936void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
937                           const SkPMColor* SK_RESTRICT src,
938                           int count, U8CPU alpha, int /*x*/, int /*y*/) {
939    SkASSERT(255 == alpha);
940
941    if (count <= 0) {
942        return;
943    }
944
945    if (count >= 8) {
946        // Make dst 16 bytes alignment
947        while (((size_t)dst & 0x0F) != 0) {
948            SkPMColor c = *src++;
949            if (c) {
950              *dst = SkSrcOver32To16(c, *dst);
951            }
952            dst += 1;
953            count--;
954        }
955
956        const __m128i* s = reinterpret_cast<const __m128i*>(src);
957        __m128i* d = reinterpret_cast<__m128i*>(dst);
958        __m128i var255 = _mm_set1_epi16(255);
959        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
960        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
961        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
962
963        while (count >= 8) {
964            // Load 8 pixels of src.
965            __m128i src_pixel1 = _mm_loadu_si128(s++);
966            __m128i src_pixel2 = _mm_loadu_si128(s++);
967
968            // Check whether src pixels are equal to 0 and get the highest bit
969            // of each byte of result, if src pixels are all zero, src_cmp1 and
970            // src_cmp2 will be 0xFFFF.
971            int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
972                                             _mm_setzero_si128()));
973            int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
974                                             _mm_setzero_si128()));
975            if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
976                d++;
977                count -= 8;
978                continue;
979            }
980
981            // Load 8 pixels of dst.
982            __m128i dst_pixel = _mm_load_si128(d);
983
984            // Extract A from src.
985            __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));
986            sa1 = _mm_srli_epi32(sa1, 24);
987            __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));
988            sa2 = _mm_srli_epi32(sa2, 24);
989            __m128i sa = _mm_packs_epi32(sa1, sa2);
990
991            // Extract R from src.
992            __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT));
993            sr1 = _mm_srli_epi32(sr1, 24);
994            __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT));
995            sr2 = _mm_srli_epi32(sr2, 24);
996            __m128i sr = _mm_packs_epi32(sr1, sr2);
997
998            // Extract G from src.
999            __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT));
1000            sg1 = _mm_srli_epi32(sg1, 24);
1001            __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT));
1002            sg2 = _mm_srli_epi32(sg2, 24);
1003            __m128i sg = _mm_packs_epi32(sg1, sg2);
1004
1005            // Extract B from src.
1006            __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT));
1007            sb1 = _mm_srli_epi32(sb1, 24);
1008            __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT));
1009            sb2 = _mm_srli_epi32(sb2, 24);
1010            __m128i sb = _mm_packs_epi32(sb1, sb2);
1011
1012            // Extract R G B from dst.
1013            __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT);
1014            dr = _mm_and_si128(dr, r16_mask);
1015            __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT);
1016            dg = _mm_and_si128(dg, g16_mask);
1017            __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT);
1018            db = _mm_and_si128(db, b16_mask);
1019
1020            __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
1021
1022            // Calculate R G B of result.
1023            // Original algorithm is in SkSrcOver32To16().
1024            dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS));
1025            dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
1026            dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS));
1027            dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
1028            db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS));
1029            db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
1030
1031            // Pack R G B into 16-bit color.
1032            __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);
1033
1034            // Store 8 16-bit colors in dst.
1035            _mm_store_si128(d++, d_pixel);
1036            count -= 8;
1037        }
1038
1039        src = reinterpret_cast<const SkPMColor*>(s);
1040        dst = reinterpret_cast<uint16_t*>(d);
1041    }
1042
1043    if (count > 0) {
1044        do {
1045            SkPMColor c = *src++;
1046            SkPMColorAssert(c);
1047            if (c) {
1048                *dst = SkSrcOver32To16(c, *dst);
1049            }
1050            dst += 1;
1051        } while (--count != 0);
1052    }
1053}
1054