1/*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include <emmintrin.h>
9#include "SkBitmapProcState_opts_SSE2.h"
10#include "SkBlitRow_opts_SSE2.h"
11#include "SkColorPriv.h"
12#include "SkColor_opts_SSE2.h"
13#include "SkDither.h"
14#include "SkUtils.h"
15
16/* SSE2 version of S32_Blend_BlitRow32()
17 * portable version is in core/SkBlitRow_D32.cpp
18 */
19void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
20                              const SkPMColor* SK_RESTRICT src,
21                              int count, U8CPU alpha) {
22    SkASSERT(alpha <= 255);
23    if (count <= 0) {
24        return;
25    }
26
27    uint32_t src_scale = SkAlpha255To256(alpha);
28    uint32_t dst_scale = 256 - src_scale;
29
30    if (count >= 4) {
31        SkASSERT(((size_t)dst & 0x03) == 0);
32        while (((size_t)dst & 0x0F) != 0) {
33            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
34            src++;
35            dst++;
36            count--;
37        }
38
39        const __m128i *s = reinterpret_cast<const __m128i*>(src);
40        __m128i *d = reinterpret_cast<__m128i*>(dst);
41        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
42        __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
43
44        // Move scale factors to upper byte of word
45        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
46        __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
47        while (count >= 4) {
48            // Load 4 pixels each of src and dest.
49            __m128i src_pixel = _mm_loadu_si128(s);
50            __m128i dst_pixel = _mm_load_si128(d);
51
52            // Interleave Atom port 0/1 operations based on the execution port
53            // constraints that multiply can only be executed on port 0 (while
54            // boolean operations can be executed on either port 0 or port 1)
55            // because GCC currently doesn't do a good job scheduling
56            // instructions based on these constraints.
57
58            // Get red and blue pixels into lower byte of each word.
59            // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
60            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
61
62            // Multiply by scale.
63            // (4 x (0, rs.h, 0, bs.h))
64            // where rs.h stands for the higher byte of r * scale, and
65            // bs.h the higher byte of b * scale.
66            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
67
68            // Get alpha and green pixels into higher byte of each word.
69            // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
70            __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
71
72            // Multiply by scale.
73            // (4 x (as.h, as.l, gs.h, gs.l))
74            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
75
76            // Clear the lower byte of the a*scale and g*scale results
77            // (4 x (as.h, 0, gs.h, 0))
78            src_ag = _mm_and_si128(src_ag, ag_mask);
79
80            // Operations the destination pixels are the same as on the
81            // source pixels. See the comments above.
82            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
83            dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
84            __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
85            dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
86            dst_ag = _mm_and_si128(dst_ag, ag_mask);
87
88            // Combine back into RGBA.
89            // (4 x (as.h, rs.h, gs.h, bs.h))
90            src_pixel = _mm_or_si128(src_rb, src_ag);
91            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
92
93            // Add result
94            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
95            _mm_store_si128(d, result);
96            s++;
97            d++;
98            count -= 4;
99        }
100        src = reinterpret_cast<const SkPMColor*>(s);
101        dst = reinterpret_cast<SkPMColor*>(d);
102    }
103
104    while (count > 0) {
105        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
106        src++;
107        dst++;
108        count--;
109    }
110}
111
/* Composites count src pixels over dst; the scalar reference for each pixel
 * is SkPMSrcOver(src, dst). alpha must be 255 (asserted).
 *
 * dst is expected to be 4-byte aligned (asserted); src is read with
 * unaligned loads. Pixels are handled one at a time until dst reaches a
 * 16-byte boundary, then four at a time, then one at a time for the rest.
 * SK_USE_ACCURATE_BLENDING selects between two vector loops that differ in
 * how the scaled destination is divided back down to 8 bits.
 */
void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(alpha == 255);
    if (count <= 0) {
        return;
    }

    if (count >= 4) {
        SkASSERT(((size_t)dst & 0x03) == 0);
        // Single pixels until dst is 16-byte aligned.
        while (((size_t)dst & 0x0F) != 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src++;
            dst++;
            count--;
        }

        const __m128i *s = reinterpret_cast<const __m128i*>(src);
        __m128i *d = reinterpret_cast<__m128i*>(dst);
#ifdef SK_USE_ACCURATE_BLENDING
        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
        while (count >= 4) {
            // Load 4 pixels
            __m128i src_pixel = _mm_loadu_si128(s);
            __m128i dst_pixel = _mm_load_si128(d);

            // Split dst into red/blue and alpha/green, each component in the
            // low byte of its 16-bit word.
            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
            // Shift alphas down to lower 8 bits of each quad.
            __m128i alpha = _mm_srli_epi32(src_pixel, 24);

            // Copy alpha to upper 3rd byte of each quad
            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));

            // Subtract alphas from 255, to get 0..255
            alpha = _mm_sub_epi16(c_255, alpha);

            // Multiply red and blue by the inverse src alpha (255 - a).
            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
            // Multiply alpha and green by the inverse src alpha (255 - a).
            dst_ag = _mm_mullo_epi16(dst_ag, alpha);

            // dst_rb_low = (dst_rb >> 8)
            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);

            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
            // (approximates a divide by 255 instead of a divide by 256)
            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
            dst_rb = _mm_add_epi16(dst_rb, c_128);
            dst_rb = _mm_srli_epi16(dst_rb, 8);

            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
            // (the high byte is already where alpha/green belong, so the
            // low byte is masked off instead of shifting)
            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
            dst_ag = _mm_add_epi16(dst_ag, c_128);
            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

            // Combine back into RGBA.
            dst_pixel = _mm_or_si128(dst_rb, dst_ag);

            // Add result
            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
            _mm_store_si128(d, result);
            s++;
            d++;
            count -= 4;
        }
#else
        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
        __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
        while (count >= 4) {
            // Load 4 pixels
            __m128i src_pixel = _mm_loadu_si128(s);
            __m128i dst_pixel = _mm_load_si128(d);

            // Split dst into red/blue and alpha/green, each component in the
            // low byte of its 16-bit word.
            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

            // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
            __m128i alpha = _mm_srli_epi16(src_pixel, 8);

            // shufflehi only rewrites the upper four words (pixels 2 and 3):
            // (a0, g0, a1, g1, a2, a2, a3, a3)
            alpha = _mm_shufflehi_epi16(alpha, 0xF5);

            // shufflelo does the same for the lower four words (pixels 0, 1):
            // (a0, a0, a1, a1, a2, a2, a3, a3)
            alpha = _mm_shufflelo_epi16(alpha, 0xF5);

            // Subtract alphas from 256, to get 1..256
            alpha = _mm_sub_epi16(c_256, alpha);

            // Multiply red and blue by the inverse src alpha (256 - a).
            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
            // Multiply alpha and green by the inverse src alpha (256 - a).
            dst_ag = _mm_mullo_epi16(dst_ag, alpha);

            // Divide by 256.
            dst_rb = _mm_srli_epi16(dst_rb, 8);

            // Mask out the low byte of each word; the high byte is already
            // in the right place, so no shift is needed.
            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

            // Combine back into RGBA.
            dst_pixel = _mm_or_si128(dst_rb, dst_ag);

            // Add result
            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
            _mm_store_si128(d, result);
            s++;
            d++;
            count -= 4;
        }
#endif
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar tail for the remaining (fewer than four) pixels, or for short rows.
    while (count > 0) {
        *dst = SkPMSrcOver(*src, *dst);
        src++;
        dst++;
        count--;
    }
}
236
237void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
238                               const SkPMColor* SK_RESTRICT src,
239                               int count, U8CPU alpha) {
240    SkASSERT(alpha <= 255);
241    if (count <= 0) {
242        return;
243    }
244
245    if (count >= 4) {
246        while (((size_t)dst & 0x0F) != 0) {
247            *dst = SkBlendARGB32(*src, *dst, alpha);
248            src++;
249            dst++;
250            count--;
251        }
252
253        uint32_t src_scale = SkAlpha255To256(alpha);
254
255        const __m128i *s = reinterpret_cast<const __m128i*>(src);
256        __m128i *d = reinterpret_cast<__m128i*>(dst);
257        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
258        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
259        __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
260        while (count >= 4) {
261            // Load 4 pixels each of src and dest.
262            __m128i src_pixel = _mm_loadu_si128(s);
263            __m128i dst_pixel = _mm_load_si128(d);
264
265            // Get red and blue pixels into lower byte of each word.
266            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
267            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
268
269            // Get alpha and green into lower byte of each word.
270            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
271            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
272
273            // Put per-pixel alpha in low byte of each word.
274            // After the following two statements, the dst_alpha looks like
275            // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
276            __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
277            dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
278
279            // dst_alpha = dst_alpha * src_scale
280            // Because src_scales are in the higher byte of each word and
281            // we use mulhi here, the resulting alpha values are already
282            // in the right place and don't need to be divided by 256.
283            // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
284            dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
285
286            // Subtract alphas from 256, to get 1..256
287            dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
288
289            // Multiply red and blue by dst pixel alpha.
290            dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
291            // Multiply alpha and green by dst pixel alpha.
292            dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
293
294            // Multiply red and blue by global alpha.
295            // (4 x (0, rs.h, 0, bs.h))
296            // where rs.h stands for the higher byte of r * src_scale,
297            // and bs.h the higher byte of b * src_scale.
298            // Again, because we use mulhi, the resuling red and blue
299            // values are already in the right place and don't need to
300            // be divided by 256.
301            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
302            // Multiply alpha and green by global alpha.
303            // (4 x (0, as.h, 0, gs.h))
304            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
305
306            // Divide by 256.
307            dst_rb = _mm_srli_epi16(dst_rb, 8);
308
309            // Mask out low bits (goodies already in the right place; no need to divide)
310            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
311            // Shift alpha and green to higher byte of each word.
312            // (4 x (as.h, 0, gs.h, 0))
313            src_ag = _mm_slli_epi16(src_ag, 8);
314
315            // Combine back into RGBA.
316            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
317            src_pixel = _mm_or_si128(src_rb, src_ag);
318
319            // Add two pixels into result.
320            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
321            _mm_store_si128(d, result);
322            s++;
323            d++;
324            count -= 4;
325        }
326        src = reinterpret_cast<const SkPMColor*>(s);
327        dst = reinterpret_cast<SkPMColor*>(d);
328    }
329
330    while (count > 0) {
331        *dst = SkBlendARGB32(*src, *dst, alpha);
332        src++;
333        dst++;
334        count--;
335    }
336}
337
338/* SSE2 version of Color32()
339 * portable version is in core/SkBlitRow_D32.cpp
340 */
341void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
342                  SkPMColor color) {
343    if (count <= 0) {
344        return;
345    }
346
347    if (0 == color) {
348        if (src != dst) {
349            memcpy(dst, src, count * sizeof(SkPMColor));
350        }
351        return;
352    }
353
354    unsigned colorA = SkGetPackedA32(color);
355    if (255 == colorA) {
356        sk_memset32(dst, color, count);
357    } else {
358        unsigned scale = 256 - SkAlpha255To256(colorA);
359
360        if (count >= 4) {
361            SkASSERT(((size_t)dst & 0x03) == 0);
362            while (((size_t)dst & 0x0F) != 0) {
363                *dst = color + SkAlphaMulQ(*src, scale);
364                src++;
365                dst++;
366                count--;
367            }
368
369            const __m128i *s = reinterpret_cast<const __m128i*>(src);
370            __m128i *d = reinterpret_cast<__m128i*>(dst);
371            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
372            __m128i src_scale_wide = _mm_set1_epi16(scale);
373            __m128i color_wide = _mm_set1_epi32(color);
374            while (count >= 4) {
375                // Load 4 pixels each of src and dest.
376                __m128i src_pixel = _mm_loadu_si128(s);
377
378                // Get red and blue pixels into lower byte of each word.
379                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
380
381                // Get alpha and green into lower byte of each word.
382                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
383
384                // Multiply by scale.
385                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
386                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
387
388                // Divide by 256.
389                src_rb = _mm_srli_epi16(src_rb, 8);
390                src_ag = _mm_andnot_si128(rb_mask, src_ag);
391
392                // Combine back into RGBA.
393                src_pixel = _mm_or_si128(src_rb, src_ag);
394
395                // Add color to result.
396                __m128i result = _mm_add_epi8(color_wide, src_pixel);
397
398                // Store result.
399                _mm_store_si128(d, result);
400                s++;
401                d++;
402                count -= 4;
403            }
404            src = reinterpret_cast<const SkPMColor*>(s);
405            dst = reinterpret_cast<SkPMColor*>(d);
406        }
407
408        while (count > 0) {
409            *dst = color + SkAlphaMulQ(*src, scale);
410            src += 1;
411            dst += 1;
412            count--;
413        }
414    }
415}
416
417void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
418                               size_t maskRB, SkColor origColor,
419                               int width, int height) {
420    SkPMColor color = SkPreMultiplyColor(origColor);
421    size_t dstOffset = dstRB - (width << 2);
422    size_t maskOffset = maskRB - width;
423    SkPMColor* dst = (SkPMColor *)device;
424    const uint8_t* mask = (const uint8_t*)maskPtr;
425    do {
426        int count = width;
427        if (count >= 4) {
428            while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
429                *dst = SkBlendARGB32(color, *dst, *mask);
430                mask++;
431                dst++;
432                count--;
433            }
434            __m128i *d = reinterpret_cast<__m128i*>(dst);
435            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
436            __m128i c_256 = _mm_set1_epi16(256);
437            __m128i c_1 = _mm_set1_epi16(1);
438            __m128i src_pixel = _mm_set1_epi32(color);
439            while (count >= 4) {
440                // Load 4 pixels each of src and dest.
441                __m128i dst_pixel = _mm_load_si128(d);
442
443                //set the aphla value
444                __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
445                                0, *(mask+3),0, \
446                                *(mask+2),0, *(mask+2),\
447                                0,*(mask+1), 0,*(mask+1),\
448                                0, *mask,0,*mask);
449
450                //call SkAlpha255To256()
451                src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
452
453                // Get red and blue pixels into lower byte of each word.
454                __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
455                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
456
457                // Get alpha and green into lower byte of each word.
458                __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
459                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
460
461                // Put per-pixel alpha in low byte of each word.
462                __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
463                dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
464
465                // dst_alpha = dst_alpha * src_scale
466                dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
467
468                // Divide by 256.
469                dst_alpha = _mm_srli_epi16(dst_alpha, 8);
470
471                // Subtract alphas from 256, to get 1..256
472                dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
473                // Multiply red and blue by dst pixel alpha.
474                dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
475                // Multiply alpha and green by dst pixel alpha.
476                dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
477
478                // Multiply red and blue by global alpha.
479                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
480                // Multiply alpha and green by global alpha.
481                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
482                // Divide by 256.
483                dst_rb = _mm_srli_epi16(dst_rb, 8);
484                src_rb = _mm_srli_epi16(src_rb, 8);
485
486                // Mask out low bits (goodies already in the right place; no need to divide)
487                dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
488                src_ag = _mm_andnot_si128(rb_mask, src_ag);
489
490                // Combine back into RGBA.
491                dst_pixel = _mm_or_si128(dst_rb, dst_ag);
492                __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
493
494                // Add two pixels into result.
495                __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
496                _mm_store_si128(d, result);
497                // load the next 4 pixel
498                mask = mask + 4;
499                d++;
500                count -= 4;
501            }
502            dst = reinterpret_cast<SkPMColor *>(d);
503        }
504        while (count > 0) {
505            *dst= SkBlendARGB32(color, *dst, *mask);
506            dst += 1;
507            mask++;
508            count --;
509        }
510        dst = (SkPMColor *)((char*)dst + dstOffset);
511        mask += maskOffset;
512    } while (--height != 0);
513}
514
// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
// A negative value means the component has to be shifted right instead.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each SkPacked?16x5ToUnmasked?32x5_SSE2(x) macro applies the matching shift
// to every 32-bit lane of x. The direction (none, left, or right) is chosen at
// preprocessing time from the sign of the shift. The result is not masked:
// users AND it with (0x1F << SK_?32_SHIFT) to isolate the 5-bit component.

// Red component.
#if SK_R16x5_R32x5_SHIFT == 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#elif SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#endif

// Green component.
#if SK_G16x5_G32x5_SHIFT == 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#elif SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#endif

// Blue component.
#if SK_B16x5_B32x5_SHIFT == 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#elif SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#endif
545
// Blends four destination pixels towards src using four 16-bit LCD masks,
// with the mask coverage additionally scaled by srcA:
//     result = dst + (((src - dst) * ((mask * srcA) >> 8)) >> 5)
// Returns the four blended pixels packed back to 8 bits per component.
// Note that `mask` is overwritten (it is passed by reference and reassigned
// below); src, dst and srcA are only read.
static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
                                 __m128i &mask, __m128i &srcA) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix. Components of one of the four pixels that
    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    // example is the blue channel of the second destination pixel. Memory
    // layout is shown for an ARGB byte order in a color value.

    // src and srcA store 8-bit values interleaved with zeros.
    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));

    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));

    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));

    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    // 8-bit position
    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    mask = _mm_or_si128(_mm_or_si128(r, g), b);

    // Interleave R,G,B into the lower byte of word.
    // i.e. split the sixteen 8-bit values from mask into two sets of eight
    // 16-bit values, padded by zero.
    __m128i maskLo, maskHi;
    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

    // Upscale from 0..31 to 0..32
    // (allows to replace division by a shift further down)
    // Right-shift each component by 4 and add the result back to that component,
    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

    // Multiply each component of maskLo and maskHi by srcA
    maskLo = _mm_mullo_epi16(maskLo, srcA);
    maskHi = _mm_mullo_epi16(maskHi, srcA);

    // Right-shift mask components by 8 (divide by 256)
    maskLo = _mm_srli_epi16(maskLo, 8);
    maskHi = _mm_srli_epi16(maskHi, 8);

    // Interleave R,G,B into the lower byte of the word
    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

    // mask = (src - dst) * mask
    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

    // mask = (src - dst) * mask >> 5
    // (arithmetic shift: the difference, and so the product, may be negative)
    maskLo = _mm_srai_epi16(maskLo, 5);
    maskHi = _mm_srai_epi16(maskHi, 5);

    // Add two pixels into result.
    // result = dst + ((src - dst) * mask >> 5)
    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

    // Pack into 4 32bit dst pixels.
    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    // Merge into one SSE register with sixteen 8-bit values (four pixels),
    // clamping to 255 if necessary.
    return _mm_packus_epi16(resultLo, resultHi);
}
633
634static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
635                                       __m128i &mask) {
636    // In the following comments, the components of src, dst and mask are
637    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
638    // by an R, G, B, or A suffix. Components of one of the four pixels that
639    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
640    // example is the blue channel of the second destination pixel. Memory
641    // layout is shown for an ARGB byte order in a color value.
642
643    // src and srcA store 8-bit values interleaved with zeros.
644    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
645    // mask stores 16-bit values (shown as high and low bytes) interleaved with
646    // zeros
647    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
648    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
649
650    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
651    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
652    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
653                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
654
655    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
656    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
657                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
658
659    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
660    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
661                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
662
663    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
664    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
665    // 8-bit position
666    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
667    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
668    mask = _mm_or_si128(_mm_or_si128(r, g), b);
669
670    // Interleave R,G,B into the lower byte of word.
671    // i.e. split the sixteen 8-bit values from mask into two sets of eight
672    // 16-bit values, padded by zero.
673    __m128i maskLo, maskHi;
674    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
675    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
676    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
677    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
678
679    // Upscale from 0..31 to 0..32
680    // (allows to replace division by left-shift further down)
681    // Left-shift each component by 4 and add the result back to that component,
682    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
683    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
684    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
685
686    // Interleave R,G,B into the lower byte of the word
687    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
688    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
689    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
690    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
691
692    // mask = (src - dst) * mask
693    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
694    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
695
696    // mask = (src - dst) * mask >> 5
697    maskLo = _mm_srai_epi16(maskLo, 5);
698    maskHi = _mm_srai_epi16(maskHi, 5);
699
700    // Add two pixels into result.
701    // result = dst + ((src - dst) * mask >> 5)
702    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
703    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
704
705    // Pack into 4 32bit dst pixels and force opaque.
706    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
707    // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
708    // clamping to 255 if necessary. Set alpha components to 0xFF.
709    return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
710                        _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
711}
712
713void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
714                         SkColor src, int width, SkPMColor) {
715    if (width <= 0) {
716        return;
717    }
718
719    int srcA = SkColorGetA(src);
720    int srcR = SkColorGetR(src);
721    int srcG = SkColorGetG(src);
722    int srcB = SkColorGetB(src);
723
724    srcA = SkAlpha255To256(srcA);
725
726    if (width >= 4) {
727        SkASSERT(((size_t)dst & 0x03) == 0);
728        while (((size_t)dst & 0x0F) != 0) {
729            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
730            mask++;
731            dst++;
732            width--;
733        }
734
735        __m128i *d = reinterpret_cast<__m128i*>(dst);
736        // Set alpha to 0xFF and replicate source four times in SSE register.
737        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
738        // Interleave with zeros to get two sets of four 16-bit values.
739        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
740        // Set srcA_sse to contain eight copies of srcA, padded with zero.
741        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
742        __m128i srcA_sse = _mm_set1_epi16(srcA);
743        while (width >= 4) {
744            // Load four destination pixels into dst_sse.
745            __m128i dst_sse = _mm_load_si128(d);
746            // Load four 16-bit masks into lower half of mask_sse.
747            __m128i mask_sse = _mm_loadl_epi64(
748                                   reinterpret_cast<const __m128i*>(mask));
749
750            // Check whether masks are equal to 0 and get the highest bit
751            // of each byte of result, if masks are all zero, we will get
752            // pack_cmp to 0xFFFF
753            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
754                                             _mm_setzero_si128()));
755
756            // if mask pixels are not all zero, we will blend the dst pixels
757            if (pack_cmp != 0xFFFF) {
758                // Unpack 4 16bit mask pixels to
759                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
760                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
761                mask_sse = _mm_unpacklo_epi16(mask_sse,
762                                              _mm_setzero_si128());
763
764                // Process 4 32bit dst pixels
765                __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
766                                                   mask_sse, srcA_sse);
767                _mm_store_si128(d, result);
768            }
769
770            d++;
771            mask += 4;
772            width -= 4;
773        }
774
775        dst = reinterpret_cast<SkPMColor*>(d);
776    }
777
778    while (width > 0) {
779        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
780        mask++;
781        dst++;
782        width--;
783    }
784}
785
786void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
787                               SkColor src, int width, SkPMColor opaqueDst) {
788    if (width <= 0) {
789        return;
790    }
791
792    int srcR = SkColorGetR(src);
793    int srcG = SkColorGetG(src);
794    int srcB = SkColorGetB(src);
795
796    if (width >= 4) {
797        SkASSERT(((size_t)dst & 0x03) == 0);
798        while (((size_t)dst & 0x0F) != 0) {
799            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
800            mask++;
801            dst++;
802            width--;
803        }
804
805        __m128i *d = reinterpret_cast<__m128i*>(dst);
806        // Set alpha to 0xFF and replicate source four times in SSE register.
807        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
808        // Set srcA_sse to contain eight copies of srcA, padded with zero.
809        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
810        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
811        while (width >= 4) {
812            // Load four destination pixels into dst_sse.
813            __m128i dst_sse = _mm_load_si128(d);
814            // Load four 16-bit masks into lower half of mask_sse.
815            __m128i mask_sse = _mm_loadl_epi64(
816                                   reinterpret_cast<const __m128i*>(mask));
817
818            // Check whether masks are equal to 0 and get the highest bit
819            // of each byte of result, if masks are all zero, we will get
820            // pack_cmp to 0xFFFF
821            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
822                                             _mm_setzero_si128()));
823
824            // if mask pixels are not all zero, we will blend the dst pixels
825            if (pack_cmp != 0xFFFF) {
826                // Unpack 4 16bit mask pixels to
827                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
828                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
829                mask_sse = _mm_unpacklo_epi16(mask_sse,
830                                              _mm_setzero_si128());
831
832                // Process 4 32bit dst pixels
833                __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
834                                                         mask_sse);
835                _mm_store_si128(d, result);
836            }
837
838            d++;
839            mask += 4;
840            width -= 4;
841        }
842
843        dst = reinterpret_cast<SkPMColor*>(d);
844    }
845
846    while (width > 0) {
847        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
848        mask++;
849        dst++;
850        width--;
851    }
852}
853
854/* SSE2 version of S32_D565_Opaque()
855 * portable version is in core/SkBlitRow_D16.cpp
856 */
857void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
858                          const SkPMColor* SK_RESTRICT src, int count,
859                          U8CPU alpha, int /*x*/, int /*y*/) {
860    SkASSERT(255 == alpha);
861
862    if (count <= 0) {
863        return;
864    }
865
866    if (count >= 8) {
867        while (((size_t)dst & 0x0F) != 0) {
868            SkPMColor c = *src++;
869            SkPMColorAssert(c);
870
871            *dst++ = SkPixel32ToPixel16_ToU16(c);
872            count--;
873        }
874
875        const __m128i* s = reinterpret_cast<const __m128i*>(src);
876        __m128i* d = reinterpret_cast<__m128i*>(dst);
877        __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
878        __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
879        __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);
880
881        while (count >= 8) {
882            // Load 8 pixels of src.
883            __m128i src_pixel1 = _mm_loadu_si128(s++);
884            __m128i src_pixel2 = _mm_loadu_si128(s++);
885
886            // Calculate result r.
887            __m128i r1 = _mm_srli_epi32(src_pixel1,
888                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
889            r1 = _mm_and_si128(r1, r16_mask);
890            __m128i r2 = _mm_srli_epi32(src_pixel2,
891                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
892            r2 = _mm_and_si128(r2, r16_mask);
893            __m128i r = _mm_packs_epi32(r1, r2);
894
895            // Calculate result g.
896            __m128i g1 = _mm_srli_epi32(src_pixel1,
897                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
898            g1 = _mm_and_si128(g1, g16_mask);
899            __m128i g2 = _mm_srli_epi32(src_pixel2,
900                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
901            g2 = _mm_and_si128(g2, g16_mask);
902            __m128i g = _mm_packs_epi32(g1, g2);
903
904            // Calculate result b.
905            __m128i b1 = _mm_srli_epi32(src_pixel1,
906                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
907            b1 = _mm_and_si128(b1, b16_mask);
908            __m128i b2 = _mm_srli_epi32(src_pixel2,
909                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
910            b2 = _mm_and_si128(b2, b16_mask);
911            __m128i b = _mm_packs_epi32(b1, b2);
912
913            // Store 8 16-bit colors in dst.
914            __m128i d_pixel = SkPackRGB16_SSE2(r, g, b);
915            _mm_store_si128(d++, d_pixel);
916            count -= 8;
917        }
918        src = reinterpret_cast<const SkPMColor*>(s);
919        dst = reinterpret_cast<uint16_t*>(d);
920    }
921
922    if (count > 0) {
923        do {
924            SkPMColor c = *src++;
925            SkPMColorAssert(c);
926            *dst++ = SkPixel32ToPixel16_ToU16(c);
927        } while (--count != 0);
928    }
929}
930
931/* SSE2 version of S32A_D565_Opaque()
932 * portable version is in core/SkBlitRow_D16.cpp
933 */
934void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
935                           const SkPMColor* SK_RESTRICT src,
936                           int count, U8CPU alpha, int /*x*/, int /*y*/) {
937    SkASSERT(255 == alpha);
938
939    if (count <= 0) {
940        return;
941    }
942
943    if (count >= 8) {
944        // Make dst 16 bytes alignment
945        while (((size_t)dst & 0x0F) != 0) {
946            SkPMColor c = *src++;
947            if (c) {
948              *dst = SkSrcOver32To16(c, *dst);
949            }
950            dst += 1;
951            count--;
952        }
953
954        const __m128i* s = reinterpret_cast<const __m128i*>(src);
955        __m128i* d = reinterpret_cast<__m128i*>(dst);
956        __m128i var255 = _mm_set1_epi16(255);
957        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
958        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
959        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
960
961        while (count >= 8) {
962            // Load 8 pixels of src.
963            __m128i src_pixel1 = _mm_loadu_si128(s++);
964            __m128i src_pixel2 = _mm_loadu_si128(s++);
965
966            // Check whether src pixels are equal to 0 and get the highest bit
967            // of each byte of result, if src pixels are all zero, src_cmp1 and
968            // src_cmp2 will be 0xFFFF.
969            int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
970                                             _mm_setzero_si128()));
971            int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
972                                             _mm_setzero_si128()));
973            if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
974                d++;
975                count -= 8;
976                continue;
977            }
978
979            // Load 8 pixels of dst.
980            __m128i dst_pixel = _mm_load_si128(d);
981
982            // Extract A from src.
983            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
984            sa1 = _mm_srli_epi32(sa1, 24);
985            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
986            sa2 = _mm_srli_epi32(sa2, 24);
987            __m128i sa = _mm_packs_epi32(sa1, sa2);
988
989            // Extract R from src.
990            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
991            sr1 = _mm_srli_epi32(sr1, 24);
992            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
993            sr2 = _mm_srli_epi32(sr2, 24);
994            __m128i sr = _mm_packs_epi32(sr1, sr2);
995
996            // Extract G from src.
997            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
998            sg1 = _mm_srli_epi32(sg1, 24);
999            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1000            sg2 = _mm_srli_epi32(sg2, 24);
1001            __m128i sg = _mm_packs_epi32(sg1, sg2);
1002
1003            // Extract B from src.
1004            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1005            sb1 = _mm_srli_epi32(sb1, 24);
1006            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1007            sb2 = _mm_srli_epi32(sb2, 24);
1008            __m128i sb = _mm_packs_epi32(sb1, sb2);
1009
1010            // Extract R G B from dst.
1011            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
1012            dr = _mm_and_si128(dr, r16_mask);
1013            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
1014            dg = _mm_and_si128(dg, g16_mask);
1015            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
1016            db = _mm_and_si128(db, b16_mask);
1017
1018            __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
1019
1020            // Calculate R G B of result.
1021            // Original algorithm is in SkSrcOver32To16().
1022            dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
1023            dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
1024            dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
1025            dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
1026            db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
1027            db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
1028
1029            // Pack R G B into 16-bit color.
1030            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
1031
1032            // Store 8 16-bit colors in dst.
1033            _mm_store_si128(d++, d_pixel);
1034            count -= 8;
1035        }
1036
1037        src = reinterpret_cast<const SkPMColor*>(s);
1038        dst = reinterpret_cast<uint16_t*>(d);
1039    }
1040
1041    if (count > 0) {
1042        do {
1043            SkPMColor c = *src++;
1044            SkPMColorAssert(c);
1045            if (c) {
1046                *dst = SkSrcOver32To16(c, *dst);
1047            }
1048            dst += 1;
1049        } while (--count != 0);
1050    }
1051}
1052
1053void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
1054                                 const SkPMColor* SK_RESTRICT src,
1055                                 int count, U8CPU alpha, int x, int y) {
1056    SkASSERT(255 == alpha);
1057
1058    if (count <= 0) {
1059        return;
1060    }
1061
1062    if (count >= 8) {
1063        while (((size_t)dst & 0x0F) != 0) {
1064            DITHER_565_SCAN(y);
1065            SkPMColor c = *src++;
1066            SkPMColorAssert(c);
1067
1068            unsigned dither = DITHER_VALUE(x);
1069            *dst++ = SkDitherRGB32To565(c, dither);
1070            DITHER_INC_X(x);
1071            count--;
1072        }
1073
1074        unsigned short dither_value[8];
1075        __m128i dither;
1076#ifdef ENABLE_DITHER_MATRIX_4X4
1077        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
1078        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
1079        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
1080        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
1081        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
1082#else
1083        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
1084        dither_value[0] = dither_value[4] = (dither_scan
1085                                             >> (((x) & 3) << 2)) & 0xF;
1086        dither_value[1] = dither_value[5] = (dither_scan
1087                                             >> (((x + 1) & 3) << 2)) & 0xF;
1088        dither_value[2] = dither_value[6] = (dither_scan
1089                                             >> (((x + 2) & 3) << 2)) & 0xF;
1090        dither_value[3] = dither_value[7] = (dither_scan
1091                                             >> (((x + 3) & 3) << 2)) & 0xF;
1092#endif
1093        dither = _mm_loadu_si128((__m128i*) dither_value);
1094
1095        const __m128i* s = reinterpret_cast<const __m128i*>(src);
1096        __m128i* d = reinterpret_cast<__m128i*>(dst);
1097
1098        while (count >= 8) {
1099            // Load 8 pixels of src.
1100            __m128i src_pixel1 = _mm_loadu_si128(s++);
1101            __m128i src_pixel2 = _mm_loadu_si128(s++);
1102
1103            // Extract R from src.
1104            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1105            sr1 = _mm_srli_epi32(sr1, 24);
1106            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1107            sr2 = _mm_srli_epi32(sr2, 24);
1108            __m128i sr = _mm_packs_epi32(sr1, sr2);
1109
1110            // SkDITHER_R32To565(sr, dither)
1111            __m128i sr_offset = _mm_srli_epi16(sr, 5);
1112            sr = _mm_add_epi16(sr, dither);
1113            sr = _mm_sub_epi16(sr, sr_offset);
1114            sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
1115
1116            // Extract G from src.
1117            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1118            sg1 = _mm_srli_epi32(sg1, 24);
1119            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1120            sg2 = _mm_srli_epi32(sg2, 24);
1121            __m128i sg = _mm_packs_epi32(sg1, sg2);
1122
1123            // SkDITHER_R32To565(sg, dither)
1124            __m128i sg_offset = _mm_srli_epi16(sg, 6);
1125            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
1126            sg = _mm_sub_epi16(sg, sg_offset);
1127            sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
1128
1129            // Extract B from src.
1130            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1131            sb1 = _mm_srli_epi32(sb1, 24);
1132            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1133            sb2 = _mm_srli_epi32(sb2, 24);
1134            __m128i sb = _mm_packs_epi32(sb1, sb2);
1135
1136            // SkDITHER_R32To565(sb, dither)
1137            __m128i sb_offset = _mm_srli_epi16(sb, 5);
1138            sb = _mm_add_epi16(sb, dither);
1139            sb = _mm_sub_epi16(sb, sb_offset);
1140            sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
1141
1142            // Pack and store 16-bit dst pixel.
1143            __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
1144            _mm_store_si128(d++, d_pixel);
1145
1146            count -= 8;
1147            x += 8;
1148        }
1149
1150        src = reinterpret_cast<const SkPMColor*>(s);
1151        dst = reinterpret_cast<uint16_t*>(d);
1152    }
1153
1154    if (count > 0) {
1155        DITHER_565_SCAN(y);
1156        do {
1157            SkPMColor c = *src++;
1158            SkPMColorAssert(c);
1159
1160            unsigned dither = DITHER_VALUE(x);
1161            *dst++ = SkDitherRGB32To565(c, dither);
1162            DITHER_INC_X(x);
1163        } while (--count != 0);
1164    }
1165}
1166
1167/* SSE2 version of S32A_D565_Opaque_Dither()
1168 * portable version is in core/SkBlitRow_D16.cpp
1169 */
1170void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
1171                                  const SkPMColor* SK_RESTRICT src,
1172                                  int count, U8CPU alpha, int x, int y) {
1173    SkASSERT(255 == alpha);
1174
1175    if (count <= 0) {
1176        return;
1177    }
1178
1179    if (count >= 8) {
1180        while (((size_t)dst & 0x0F) != 0) {
1181            DITHER_565_SCAN(y);
1182            SkPMColor c = *src++;
1183            SkPMColorAssert(c);
1184            if (c) {
1185                unsigned a = SkGetPackedA32(c);
1186
1187                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
1188
1189                unsigned sr = SkGetPackedR32(c);
1190                unsigned sg = SkGetPackedG32(c);
1191                unsigned sb = SkGetPackedB32(c);
1192                sr = SkDITHER_R32_FOR_565(sr, d);
1193                sg = SkDITHER_G32_FOR_565(sg, d);
1194                sb = SkDITHER_B32_FOR_565(sb, d);
1195
1196                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1197                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1198                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1199                // now src and dst expanded are in g:11 r:10 x:1 b:10
1200                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1201            }
1202            dst += 1;
1203            DITHER_INC_X(x);
1204            count--;
1205        }
1206
1207        unsigned short dither_value[8];
1208        __m128i dither, dither_cur;
1209#ifdef ENABLE_DITHER_MATRIX_4X4
1210        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
1211        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
1212        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
1213        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
1214        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
1215#else
1216        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
1217        dither_value[0] = dither_value[4] = (dither_scan
1218                                             >> (((x) & 3) << 2)) & 0xF;
1219        dither_value[1] = dither_value[5] = (dither_scan
1220                                             >> (((x + 1) & 3) << 2)) & 0xF;
1221        dither_value[2] = dither_value[6] = (dither_scan
1222                                             >> (((x + 2) & 3) << 2)) & 0xF;
1223        dither_value[3] = dither_value[7] = (dither_scan
1224                                             >> (((x + 3) & 3) << 2)) & 0xF;
1225#endif
1226        dither = _mm_loadu_si128((__m128i*) dither_value);
1227
1228        const __m128i* s = reinterpret_cast<const __m128i*>(src);
1229        __m128i* d = reinterpret_cast<__m128i*>(dst);
1230        __m128i var256 = _mm_set1_epi16(256);
1231        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
1232        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
1233        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
1234
1235        while (count >= 8) {
1236            // Load 8 pixels of src and dst.
1237            __m128i src_pixel1 = _mm_loadu_si128(s++);
1238            __m128i src_pixel2 = _mm_loadu_si128(s++);
1239            __m128i dst_pixel = _mm_load_si128(d);
1240
1241            // Extract A from src.
1242            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
1243            sa1 = _mm_srli_epi32(sa1, 24);
1244            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
1245            sa2 = _mm_srli_epi32(sa2, 24);
1246            __m128i sa = _mm_packs_epi32(sa1, sa2);
1247
1248            // Calculate current dither value.
1249            dither_cur = _mm_mullo_epi16(dither,
1250                                         _mm_add_epi16(sa, _mm_set1_epi16(1)));
1251            dither_cur = _mm_srli_epi16(dither_cur, 8);
1252
1253            // Extract R from src.
1254            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1255            sr1 = _mm_srli_epi32(sr1, 24);
1256            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1257            sr2 = _mm_srli_epi32(sr2, 24);
1258            __m128i sr = _mm_packs_epi32(sr1, sr2);
1259
1260            // SkDITHER_R32_FOR_565(sr, d)
1261            __m128i sr_offset = _mm_srli_epi16(sr, 5);
1262            sr = _mm_add_epi16(sr, dither_cur);
1263            sr = _mm_sub_epi16(sr, sr_offset);
1264
1265            // Expand sr.
1266            sr = _mm_slli_epi16(sr, 2);
1267
1268            // Extract G from src.
1269            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1270            sg1 = _mm_srli_epi32(sg1, 24);
1271            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1272            sg2 = _mm_srli_epi32(sg2, 24);
1273            __m128i sg = _mm_packs_epi32(sg1, sg2);
1274
1275            // sg = SkDITHER_G32_FOR_565(sg, d).
1276            __m128i sg_offset = _mm_srli_epi16(sg, 6);
1277            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
1278            sg = _mm_sub_epi16(sg, sg_offset);
1279
1280            // Expand sg.
1281            sg = _mm_slli_epi16(sg, 3);
1282
1283            // Extract B from src.
1284            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1285            sb1 = _mm_srli_epi32(sb1, 24);
1286            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1287            sb2 = _mm_srli_epi32(sb2, 24);
1288            __m128i sb = _mm_packs_epi32(sb1, sb2);
1289
1290            // sb = SkDITHER_B32_FOR_565(sb, d).
1291            __m128i sb_offset = _mm_srli_epi16(sb, 5);
1292            sb = _mm_add_epi16(sb, dither_cur);
1293            sb = _mm_sub_epi16(sb, sb_offset);
1294
1295            // Expand sb.
1296            sb = _mm_slli_epi16(sb, 2);
1297
1298            // Extract R G B from dst.
1299            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
1300            dr = _mm_and_si128(dr, r16_mask);
1301            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
1302            dg = _mm_and_si128(dg, g16_mask);
1303            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
1304            db = _mm_and_si128(db, b16_mask);
1305
1306            // SkAlpha255To256(255 - a) >> 3
1307            __m128i isa = _mm_sub_epi16(var256, sa);
1308            isa = _mm_srli_epi16(isa, 3);
1309
1310            dr = _mm_mullo_epi16(dr, isa);
1311            dr = _mm_add_epi16(dr, sr);
1312            dr = _mm_srli_epi16(dr, 5);
1313
1314            dg = _mm_mullo_epi16(dg, isa);
1315            dg = _mm_add_epi16(dg, sg);
1316            dg = _mm_srli_epi16(dg, 5);
1317
1318            db = _mm_mullo_epi16(db, isa);
1319            db = _mm_add_epi16(db, sb);
1320            db = _mm_srli_epi16(db, 5);
1321
1322            // Package and store dst pixel.
1323            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
1324            _mm_store_si128(d++, d_pixel);
1325
1326            count -= 8;
1327            x += 8;
1328        }
1329
1330        src = reinterpret_cast<const SkPMColor*>(s);
1331        dst = reinterpret_cast<uint16_t*>(d);
1332    }
1333
1334    if (count > 0) {
1335        DITHER_565_SCAN(y);
1336        do {
1337            SkPMColor c = *src++;
1338            SkPMColorAssert(c);
1339            if (c) {
1340                unsigned a = SkGetPackedA32(c);
1341
1342                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
1343
1344                unsigned sr = SkGetPackedR32(c);
1345                unsigned sg = SkGetPackedG32(c);
1346                unsigned sb = SkGetPackedB32(c);
1347                sr = SkDITHER_R32_FOR_565(sr, d);
1348                sg = SkDITHER_G32_FOR_565(sg, d);
1349                sb = SkDITHER_B32_FOR_565(sb, d);
1350
1351                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1352                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1353                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1354                // now src and dst expanded are in g:11 r:10 x:1 b:10
1355                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1356            }
1357            dst += 1;
1358            DITHER_INC_X(x);
1359        } while (--count != 0);
1360    }
1361}
1362