1/*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include <emmintrin.h>
9#include "SkBitmapProcState_opts_SSE2.h"
10#include "SkBlitRow_opts_SSE2.h"
11#include "SkColorPriv.h"
12#include "SkColor_opts_SSE2.h"
13#include "SkDither.h"
14#include "SkMSAN.h"
15#include "SkUtils.h"
16
17/* SSE2 version of S32_Blend_BlitRow32()
18 * portable version is in core/SkBlitRow_D32.cpp
19 */
20void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
21                              const SkPMColor* SK_RESTRICT src,
22                              int count, U8CPU alpha) {
23    SkASSERT(alpha <= 255);
24    if (count <= 0) {
25        return;
26    }
27
28    uint32_t src_scale = SkAlpha255To256(alpha);
29    uint32_t dst_scale = 256 - src_scale;
30
31    if (count >= 4) {
32        SkASSERT(((size_t)dst & 0x03) == 0);
33        while (((size_t)dst & 0x0F) != 0) {
34            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
35            src++;
36            dst++;
37            count--;
38        }
39
40        const __m128i *s = reinterpret_cast<const __m128i*>(src);
41        __m128i *d = reinterpret_cast<__m128i*>(dst);
42
43        while (count >= 4) {
44            // Load 4 pixels each of src and dest.
45            __m128i src_pixel = _mm_loadu_si128(s);
46            __m128i dst_pixel = _mm_load_si128(d);
47
48            src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale);
49            dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale);
50
51            // Add result
52            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
53            _mm_store_si128(d, result);
54            s++;
55            d++;
56            count -= 4;
57        }
58        src = reinterpret_cast<const SkPMColor*>(s);
59        dst = reinterpret_cast<SkPMColor*>(d);
60    }
61
62    while (count > 0) {
63        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
64        src++;
65        dst++;
66        count--;
67    }
68}
69
70void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
71                                const SkPMColor* SK_RESTRICT src,
72                                int count, U8CPU alpha) {
73    sk_msan_assert_initialized(src, src+count);
74
75    SkASSERT(alpha == 255);
76    if (count <= 0) {
77        return;
78    }
79
80#ifdef SK_USE_ACCURATE_BLENDING
81    if (count >= 4) {
82        SkASSERT(((size_t)dst & 0x03) == 0);
83        while (((size_t)dst & 0x0F) != 0) {
84            *dst = SkPMSrcOver(*src, *dst);
85            src++;
86            dst++;
87            count--;
88        }
89
90        const __m128i *s = reinterpret_cast<const __m128i*>(src);
91        __m128i *d = reinterpret_cast<__m128i*>(dst);
92        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
93        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
94        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
95        while (count >= 4) {
96            // Load 4 pixels
97            __m128i src_pixel = _mm_loadu_si128(s);
98            __m128i dst_pixel = _mm_load_si128(d);
99
100            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
101            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
102            // Shift alphas down to lower 8 bits of each quad.
103            __m128i alpha = _mm_srli_epi32(src_pixel, 24);
104
105            // Copy alpha to upper 3rd byte of each quad
106            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
107
108            // Subtract alphas from 255, to get 0..255
109            alpha = _mm_sub_epi16(c_255, alpha);
110
111            // Multiply by red and blue by src alpha.
112            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
113            // Multiply by alpha and green by src alpha.
114            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
115
116            // dst_rb_low = (dst_rb >> 8)
117            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
118            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
119
120            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
121            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
122            dst_rb = _mm_add_epi16(dst_rb, c_128);
123            dst_rb = _mm_srli_epi16(dst_rb, 8);
124
125            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
126            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
127            dst_ag = _mm_add_epi16(dst_ag, c_128);
128            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
129
130            // Combine back into RGBA.
131            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
132
133            // Add result
134            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
135            _mm_store_si128(d, result);
136            s++;
137            d++;
138            count -= 4;
139        }
140        src = reinterpret_cast<const SkPMColor*>(s);
141        dst = reinterpret_cast<SkPMColor*>(d);
142    }
143
144    while (count > 0) {
145        *dst = SkPMSrcOver(*src, *dst);
146        src++;
147        dst++;
148        count--;
149    }
150#else
151    int count16 = count / 16;
152    __m128i* dst4 = (__m128i*)dst;
153    const __m128i* src4 = (const __m128i*)src;
154
155    for (int i = 0; i < count16 * 4; i += 4) {
156        // Load 16 source pixels.
157        __m128i s0 = _mm_loadu_si128(src4+i+0),
158                s1 = _mm_loadu_si128(src4+i+1),
159                s2 = _mm_loadu_si128(src4+i+2),
160                s3 = _mm_loadu_si128(src4+i+3);
161
162        const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
163        const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
164        __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());
165        if (0xffff == _mm_movemask_epi8(cmp)) {
166            // All 16 source pixels are fully transparent. There's nothing to do!
167            continue;
168        }
169        const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
170        cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
171        if (0xffff == _mm_movemask_epi8(cmp)) {
172            // All 16 source pixels are fully opaque. There's no need to read dst or blend it.
173            _mm_storeu_si128(dst4+i+0, s0);
174            _mm_storeu_si128(dst4+i+1, s1);
175            _mm_storeu_si128(dst4+i+2, s2);
176            _mm_storeu_si128(dst4+i+3, s3);
177            continue;
178        }
179        // The general slow case: do the blend for all 16 pixels.
180        _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
181        _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
182        _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
183        _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
184    }
185
186    // Wrap up the last <= 15 pixels.
187    SkASSERT(count - (count16*16) <= 15);
188    for (int i = count16*16; i < count; i++) {
189        // This check is not really necessarily, but it prevents pointless autovectorization.
190        if (src[i] & 0xFF000000) {
191            dst[i] = SkPMSrcOver(src[i], dst[i]);
192        }
193    }
194#endif
195}
196
197void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
198                               const SkPMColor* SK_RESTRICT src,
199                               int count, U8CPU alpha) {
200    SkASSERT(alpha <= 255);
201    if (count <= 0) {
202        return;
203    }
204
205    if (count >= 4) {
206        while (((size_t)dst & 0x0F) != 0) {
207            *dst = SkBlendARGB32(*src, *dst, alpha);
208            src++;
209            dst++;
210            count--;
211        }
212
213        const __m128i *s = reinterpret_cast<const __m128i*>(src);
214        __m128i *d = reinterpret_cast<__m128i*>(dst);
215        while (count >= 4) {
216            // Load 4 pixels each of src and dest.
217            __m128i src_pixel = _mm_loadu_si128(s);
218            __m128i dst_pixel = _mm_load_si128(d);
219
220            __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
221            _mm_store_si128(d, result);
222            s++;
223            d++;
224            count -= 4;
225        }
226        src = reinterpret_cast<const SkPMColor*>(s);
227        dst = reinterpret_cast<SkPMColor*>(d);
228    }
229
230    while (count > 0) {
231        *dst = SkBlendARGB32(*src, *dst, alpha);
232        src++;
233        dst++;
234        count--;
235    }
236}
237
238void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
239    SkASSERT(count > 0);
240
241    uint32_t src_expand = (SkGetPackedG32(src) << 24) |
242                          (SkGetPackedR32(src) << 13) |
243                          (SkGetPackedB32(src) << 2);
244    unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
245
246    // Check if we have enough pixels to run SIMD
247    if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
248        __m128i* dst_wide;
249        const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
250        const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
251        const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
252        const __m128i scale_wide = _mm_set1_epi16(scale);
253        const __m128i mask_blue  = _mm_set1_epi16(SK_B16_MASK);
254        const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);
255
256        // Align dst to an even 16 byte address (0-7 pixels)
257        while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
258            *dst = SkBlend32_RGB16(src_expand, *dst, scale);
259            dst += 1;
260            count--;
261        }
262
263        dst_wide = reinterpret_cast<__m128i*>(dst);
264        do {
265            // Load eight RGB565 pixels
266            __m128i pixels = _mm_load_si128(dst_wide);
267
268            // Mask out sub-pixels
269            __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
270            __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
271            pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
272            __m128i pixel_B = _mm_and_si128(pixels, mask_blue);
273
274            // Scale with alpha
275            pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
276            pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
277            pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);
278
279            // Add src_X_wide and shift down again
280            pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
281            pixel_R = _mm_srli_epi16(pixel_R, 5);
282            pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
283            pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
284            pixel_B = _mm_srli_epi16(pixel_B, 5);
285
286            // Combine into RGB565 and store
287            pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
288            pixel_G = _mm_and_si128(pixel_G, mask_green);
289            pixels = _mm_or_si128(pixel_R, pixel_G);
290            pixels = _mm_or_si128(pixels, pixel_B);
291            _mm_store_si128(dst_wide, pixels);
292            count -= 8;
293            dst_wide++;
294        } while (count >= 8);
295
296        dst = reinterpret_cast<uint16_t*>(dst_wide);
297    }
298
299    // Small loop to handle remaining pixels.
300    while (count > 0) {
301        *dst = SkBlend32_RGB16(src_expand, *dst, scale);
302        dst += 1;
303        count--;
304    }
305}
306
// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
//
// For a channel X, the top five bits of the 16-bit component start at bit
// (SK_X16_SHIFT + SK_X16_BITS - 5) and the 32-bit component starts at bit
// SK_X32_SHIFT, so the shift is the difference between the two. A positive
// value means shift left, a negative value means shift right.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each SkPackedX16x5ToUnmaskedX32x5_SSE2(x) applies the channel's shift to
// every 32-bit lane of x, picking a left shift, a right shift, or no shift at
// preprocessing time according to the sign of the shift amount. "Unmasked":
// the other bits of the lane are moved along and not cleared, so the result
// still has to be ANDed with (0x1F << SK_X32_SHIFT) by the user of the macro.

// Red.
#if SK_R16x5_R32x5_SHIFT == 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#elif SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#endif

// Green.
#if SK_G16x5_G32x5_SHIFT == 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#elif SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#endif

// Blue.
#if SK_B16x5_B32x5_SHIFT == 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#elif SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#endif
337
338static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
339                                 __m128i &mask, __m128i &srcA) {
340    // In the following comments, the components of src, dst and mask are
341    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
342    // by an R, G, B, or A suffix. Components of one of the four pixels that
343    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
344    // example is the blue channel of the second destination pixel. Memory
345    // layout is shown for an ARGB byte order in a color value.
346
347    // src and srcA store 8-bit values interleaved with zeros.
348    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
349    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
350    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
351    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
352    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
353    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
354    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
355
356    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
357    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
358    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
359                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
360
361    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
362    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
363                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
364
365    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
366    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
367                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
368
369    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
370    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
371    // 8-bit position
372    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
373    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
374    mask = _mm_or_si128(_mm_or_si128(r, g), b);
375
376    // Interleave R,G,B into the lower byte of word.
377    // i.e. split the sixteen 8-bit values from mask into two sets of eight
378    // 16-bit values, padded by zero.
379    __m128i maskLo, maskHi;
380    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
381    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
382    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
383    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
384
385    // Upscale from 0..31 to 0..32
386    // (allows to replace division by left-shift further down)
387    // Left-shift each component by 4 and add the result back to that component,
388    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
389    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
390    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
391
392    // Multiply each component of maskLo and maskHi by srcA
393    maskLo = _mm_mullo_epi16(maskLo, srcA);
394    maskHi = _mm_mullo_epi16(maskHi, srcA);
395
396    // Left shift mask components by 8 (divide by 256)
397    maskLo = _mm_srli_epi16(maskLo, 8);
398    maskHi = _mm_srli_epi16(maskHi, 8);
399
400    // Interleave R,G,B into the lower byte of the word
401    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
402    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
403    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
404    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
405
406    // mask = (src - dst) * mask
407    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
408    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
409
410    // mask = (src - dst) * mask >> 5
411    maskLo = _mm_srai_epi16(maskLo, 5);
412    maskHi = _mm_srai_epi16(maskHi, 5);
413
414    // Add two pixels into result.
415    // result = dst + ((src - dst) * mask >> 5)
416    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
417    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
418
419    // Pack into 4 32bit dst pixels.
420    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
421    // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
422    // clamping to 255 if necessary.
423    return _mm_packus_epi16(resultLo, resultHi);
424}
425
426static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
427                                       __m128i &mask) {
428    // In the following comments, the components of src, dst and mask are
429    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
430    // by an R, G, B, or A suffix. Components of one of the four pixels that
431    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
432    // example is the blue channel of the second destination pixel. Memory
433    // layout is shown for an ARGB byte order in a color value.
434
435    // src and srcA store 8-bit values interleaved with zeros.
436    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
437    // mask stores 16-bit values (shown as high and low bytes) interleaved with
438    // zeros
439    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
440    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
441
442    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
443    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
444    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
445                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
446
447    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
448    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
449                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
450
451    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
452    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
453                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
454
455    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
456    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
457    // 8-bit position
458    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
459    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
460    mask = _mm_or_si128(_mm_or_si128(r, g), b);
461
462    // Interleave R,G,B into the lower byte of word.
463    // i.e. split the sixteen 8-bit values from mask into two sets of eight
464    // 16-bit values, padded by zero.
465    __m128i maskLo, maskHi;
466    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
467    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
468    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
469    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
470
471    // Upscale from 0..31 to 0..32
472    // (allows to replace division by left-shift further down)
473    // Left-shift each component by 4 and add the result back to that component,
474    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
475    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
476    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
477
478    // Interleave R,G,B into the lower byte of the word
479    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
480    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
481    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
482    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
483
484    // mask = (src - dst) * mask
485    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
486    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
487
488    // mask = (src - dst) * mask >> 5
489    maskLo = _mm_srai_epi16(maskLo, 5);
490    maskHi = _mm_srai_epi16(maskHi, 5);
491
492    // Add two pixels into result.
493    // result = dst + ((src - dst) * mask >> 5)
494    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
495    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
496
497    // Pack into 4 32bit dst pixels and force opaque.
498    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
499    // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
500    // clamping to 255 if necessary. Set alpha components to 0xFF.
501    return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
502                        _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
503}
504
505void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
506                         SkColor src, int width, SkPMColor) {
507    if (width <= 0) {
508        return;
509    }
510
511    int srcA = SkColorGetA(src);
512    int srcR = SkColorGetR(src);
513    int srcG = SkColorGetG(src);
514    int srcB = SkColorGetB(src);
515
516    srcA = SkAlpha255To256(srcA);
517
518    if (width >= 4) {
519        SkASSERT(((size_t)dst & 0x03) == 0);
520        while (((size_t)dst & 0x0F) != 0) {
521            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
522            mask++;
523            dst++;
524            width--;
525        }
526
527        __m128i *d = reinterpret_cast<__m128i*>(dst);
528        // Set alpha to 0xFF and replicate source four times in SSE register.
529        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
530        // Interleave with zeros to get two sets of four 16-bit values.
531        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
532        // Set srcA_sse to contain eight copies of srcA, padded with zero.
533        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
534        __m128i srcA_sse = _mm_set1_epi16(srcA);
535        while (width >= 4) {
536            // Load four destination pixels into dst_sse.
537            __m128i dst_sse = _mm_load_si128(d);
538            // Load four 16-bit masks into lower half of mask_sse.
539            __m128i mask_sse = _mm_loadl_epi64(
540                                   reinterpret_cast<const __m128i*>(mask));
541
542            // Check whether masks are equal to 0 and get the highest bit
543            // of each byte of result, if masks are all zero, we will get
544            // pack_cmp to 0xFFFF
545            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
546                                             _mm_setzero_si128()));
547
548            // if mask pixels are not all zero, we will blend the dst pixels
549            if (pack_cmp != 0xFFFF) {
550                // Unpack 4 16bit mask pixels to
551                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
552                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
553                mask_sse = _mm_unpacklo_epi16(mask_sse,
554                                              _mm_setzero_si128());
555
556                // Process 4 32bit dst pixels
557                __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
558                                                   mask_sse, srcA_sse);
559                _mm_store_si128(d, result);
560            }
561
562            d++;
563            mask += 4;
564            width -= 4;
565        }
566
567        dst = reinterpret_cast<SkPMColor*>(d);
568    }
569
570    while (width > 0) {
571        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
572        mask++;
573        dst++;
574        width--;
575    }
576}
577
578void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
579                               SkColor src, int width, SkPMColor opaqueDst) {
580    if (width <= 0) {
581        return;
582    }
583
584    int srcR = SkColorGetR(src);
585    int srcG = SkColorGetG(src);
586    int srcB = SkColorGetB(src);
587
588    if (width >= 4) {
589        SkASSERT(((size_t)dst & 0x03) == 0);
590        while (((size_t)dst & 0x0F) != 0) {
591            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
592            mask++;
593            dst++;
594            width--;
595        }
596
597        __m128i *d = reinterpret_cast<__m128i*>(dst);
598        // Set alpha to 0xFF and replicate source four times in SSE register.
599        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
600        // Set srcA_sse to contain eight copies of srcA, padded with zero.
601        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
602        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
603        while (width >= 4) {
604            // Load four destination pixels into dst_sse.
605            __m128i dst_sse = _mm_load_si128(d);
606            // Load four 16-bit masks into lower half of mask_sse.
607            __m128i mask_sse = _mm_loadl_epi64(
608                                   reinterpret_cast<const __m128i*>(mask));
609
610            // Check whether masks are equal to 0 and get the highest bit
611            // of each byte of result, if masks are all zero, we will get
612            // pack_cmp to 0xFFFF
613            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
614                                             _mm_setzero_si128()));
615
616            // if mask pixels are not all zero, we will blend the dst pixels
617            if (pack_cmp != 0xFFFF) {
618                // Unpack 4 16bit mask pixels to
619                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
620                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
621                mask_sse = _mm_unpacklo_epi16(mask_sse,
622                                              _mm_setzero_si128());
623
624                // Process 4 32bit dst pixels
625                __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
626                                                         mask_sse);
627                _mm_store_si128(d, result);
628            }
629
630            d++;
631            mask += 4;
632            width -= 4;
633        }
634
635        dst = reinterpret_cast<SkPMColor*>(d);
636    }
637
638    while (width > 0) {
639        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
640        mask++;
641        dst++;
642        width--;
643    }
644}
645
646/* SSE2 version of S32_D565_Opaque()
647 * portable version is in core/SkBlitRow_D16.cpp
648 */
649void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
650                          const SkPMColor* SK_RESTRICT src, int count,
651                          U8CPU alpha, int /*x*/, int /*y*/) {
652    SkASSERT(255 == alpha);
653
654    if (count <= 0) {
655        return;
656    }
657
658    if (count >= 8) {
659        while (((size_t)dst & 0x0F) != 0) {
660            SkPMColor c = *src++;
661            SkPMColorAssert(c);
662
663            *dst++ = SkPixel32ToPixel16_ToU16(c);
664            count--;
665        }
666
667        const __m128i* s = reinterpret_cast<const __m128i*>(src);
668        __m128i* d = reinterpret_cast<__m128i*>(dst);
669
670        while (count >= 8) {
671            // Load 8 pixels of src.
672            __m128i src_pixel1 = _mm_loadu_si128(s++);
673            __m128i src_pixel2 = _mm_loadu_si128(s++);
674
675            __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2);
676            _mm_store_si128(d++, d_pixel);
677            count -= 8;
678        }
679        src = reinterpret_cast<const SkPMColor*>(s);
680        dst = reinterpret_cast<uint16_t*>(d);
681    }
682
683    if (count > 0) {
684        do {
685            SkPMColor c = *src++;
686            SkPMColorAssert(c);
687            *dst++ = SkPixel32ToPixel16_ToU16(c);
688        } while (--count != 0);
689    }
690}
691
692/* SSE2 version of S32A_D565_Opaque()
693 * portable version is in core/SkBlitRow_D16.cpp
694 */
695void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
696                           const SkPMColor* SK_RESTRICT src,
697                           int count, U8CPU alpha, int /*x*/, int /*y*/) {
698    SkASSERT(255 == alpha);
699
700    if (count <= 0) {
701        return;
702    }
703
704    if (count >= 8) {
705        // Make dst 16 bytes alignment
706        while (((size_t)dst & 0x0F) != 0) {
707            SkPMColor c = *src++;
708            if (c) {
709              *dst = SkSrcOver32To16(c, *dst);
710            }
711            dst += 1;
712            count--;
713        }
714
715        const __m128i* s = reinterpret_cast<const __m128i*>(src);
716        __m128i* d = reinterpret_cast<__m128i*>(dst);
717        __m128i var255 = _mm_set1_epi16(255);
718        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
719        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
720        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
721
722        while (count >= 8) {
723            // Load 8 pixels of src.
724            __m128i src_pixel1 = _mm_loadu_si128(s++);
725            __m128i src_pixel2 = _mm_loadu_si128(s++);
726
727            // Check whether src pixels are equal to 0 and get the highest bit
728            // of each byte of result, if src pixels are all zero, src_cmp1 and
729            // src_cmp2 will be 0xFFFF.
730            int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
731                                             _mm_setzero_si128()));
732            int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
733                                             _mm_setzero_si128()));
734            if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
735                d++;
736                count -= 8;
737                continue;
738            }
739
740            // Load 8 pixels of dst.
741            __m128i dst_pixel = _mm_load_si128(d);
742
743            // Extract A from src.
744            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
745            sa1 = _mm_srli_epi32(sa1, 24);
746            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
747            sa2 = _mm_srli_epi32(sa2, 24);
748            __m128i sa = _mm_packs_epi32(sa1, sa2);
749
750            // Extract R from src.
751            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
752            sr1 = _mm_srli_epi32(sr1, 24);
753            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
754            sr2 = _mm_srli_epi32(sr2, 24);
755            __m128i sr = _mm_packs_epi32(sr1, sr2);
756
757            // Extract G from src.
758            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
759            sg1 = _mm_srli_epi32(sg1, 24);
760            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
761            sg2 = _mm_srli_epi32(sg2, 24);
762            __m128i sg = _mm_packs_epi32(sg1, sg2);
763
764            // Extract B from src.
765            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
766            sb1 = _mm_srli_epi32(sb1, 24);
767            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
768            sb2 = _mm_srli_epi32(sb2, 24);
769            __m128i sb = _mm_packs_epi32(sb1, sb2);
770
771            // Extract R G B from dst.
772            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
773            dr = _mm_and_si128(dr, r16_mask);
774            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
775            dg = _mm_and_si128(dg, g16_mask);
776            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
777            db = _mm_and_si128(db, b16_mask);
778
779            __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
780
781            // Calculate R G B of result.
782            // Original algorithm is in SkSrcOver32To16().
783            dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
784            dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
785            dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
786            dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
787            db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
788            db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
789
790            // Pack R G B into 16-bit color.
791            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
792
793            // Store 8 16-bit colors in dst.
794            _mm_store_si128(d++, d_pixel);
795            count -= 8;
796        }
797
798        src = reinterpret_cast<const SkPMColor*>(s);
799        dst = reinterpret_cast<uint16_t*>(d);
800    }
801
802    if (count > 0) {
803        do {
804            SkPMColor c = *src++;
805            SkPMColorAssert(c);
806            if (c) {
807                *dst = SkSrcOver32To16(c, *dst);
808            }
809            dst += 1;
810        } while (--count != 0);
811    }
812}
813
814void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
815                                 const SkPMColor* SK_RESTRICT src,
816                                 int count, U8CPU alpha, int x, int y) {
817    SkASSERT(255 == alpha);
818
819    if (count <= 0) {
820        return;
821    }
822
823    if (count >= 8) {
824        while (((size_t)dst & 0x0F) != 0) {
825            DITHER_565_SCAN(y);
826            SkPMColor c = *src++;
827            SkPMColorAssert(c);
828
829            unsigned dither = DITHER_VALUE(x);
830            *dst++ = SkDitherRGB32To565(c, dither);
831            DITHER_INC_X(x);
832            count--;
833        }
834
835        unsigned short dither_value[8];
836        __m128i dither;
837#ifdef ENABLE_DITHER_MATRIX_4X4
838        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
839        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
840        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
841        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
842        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
843#else
844        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
845        dither_value[0] = dither_value[4] = (dither_scan
846                                             >> (((x) & 3) << 2)) & 0xF;
847        dither_value[1] = dither_value[5] = (dither_scan
848                                             >> (((x + 1) & 3) << 2)) & 0xF;
849        dither_value[2] = dither_value[6] = (dither_scan
850                                             >> (((x + 2) & 3) << 2)) & 0xF;
851        dither_value[3] = dither_value[7] = (dither_scan
852                                             >> (((x + 3) & 3) << 2)) & 0xF;
853#endif
854        dither = _mm_loadu_si128((__m128i*) dither_value);
855
856        const __m128i* s = reinterpret_cast<const __m128i*>(src);
857        __m128i* d = reinterpret_cast<__m128i*>(dst);
858
859        while (count >= 8) {
860            // Load 8 pixels of src.
861            __m128i src_pixel1 = _mm_loadu_si128(s++);
862            __m128i src_pixel2 = _mm_loadu_si128(s++);
863
864            // Extract R from src.
865            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
866            sr1 = _mm_srli_epi32(sr1, 24);
867            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
868            sr2 = _mm_srli_epi32(sr2, 24);
869            __m128i sr = _mm_packs_epi32(sr1, sr2);
870
871            // SkDITHER_R32To565(sr, dither)
872            __m128i sr_offset = _mm_srli_epi16(sr, 5);
873            sr = _mm_add_epi16(sr, dither);
874            sr = _mm_sub_epi16(sr, sr_offset);
875            sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
876
877            // Extract G from src.
878            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
879            sg1 = _mm_srli_epi32(sg1, 24);
880            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
881            sg2 = _mm_srli_epi32(sg2, 24);
882            __m128i sg = _mm_packs_epi32(sg1, sg2);
883
884            // SkDITHER_R32To565(sg, dither)
885            __m128i sg_offset = _mm_srli_epi16(sg, 6);
886            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
887            sg = _mm_sub_epi16(sg, sg_offset);
888            sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
889
890            // Extract B from src.
891            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
892            sb1 = _mm_srli_epi32(sb1, 24);
893            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
894            sb2 = _mm_srli_epi32(sb2, 24);
895            __m128i sb = _mm_packs_epi32(sb1, sb2);
896
897            // SkDITHER_R32To565(sb, dither)
898            __m128i sb_offset = _mm_srli_epi16(sb, 5);
899            sb = _mm_add_epi16(sb, dither);
900            sb = _mm_sub_epi16(sb, sb_offset);
901            sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
902
903            // Pack and store 16-bit dst pixel.
904            __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
905            _mm_store_si128(d++, d_pixel);
906
907            count -= 8;
908            x += 8;
909        }
910
911        src = reinterpret_cast<const SkPMColor*>(s);
912        dst = reinterpret_cast<uint16_t*>(d);
913    }
914
915    if (count > 0) {
916        DITHER_565_SCAN(y);
917        do {
918            SkPMColor c = *src++;
919            SkPMColorAssert(c);
920
921            unsigned dither = DITHER_VALUE(x);
922            *dst++ = SkDitherRGB32To565(c, dither);
923            DITHER_INC_X(x);
924        } while (--count != 0);
925    }
926}
927
928/* SSE2 version of S32A_D565_Opaque_Dither()
929 * portable version is in core/SkBlitRow_D16.cpp
930 */
931void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
932                                  const SkPMColor* SK_RESTRICT src,
933                                  int count, U8CPU alpha, int x, int y) {
934    SkASSERT(255 == alpha);
935
936    if (count <= 0) {
937        return;
938    }
939
940    if (count >= 8) {
941        while (((size_t)dst & 0x0F) != 0) {
942            DITHER_565_SCAN(y);
943            SkPMColor c = *src++;
944            SkPMColorAssert(c);
945            if (c) {
946                unsigned a = SkGetPackedA32(c);
947
948                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
949
950                unsigned sr = SkGetPackedR32(c);
951                unsigned sg = SkGetPackedG32(c);
952                unsigned sb = SkGetPackedB32(c);
953                sr = SkDITHER_R32_FOR_565(sr, d);
954                sg = SkDITHER_G32_FOR_565(sg, d);
955                sb = SkDITHER_B32_FOR_565(sb, d);
956
957                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
958                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
959                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
960                // now src and dst expanded are in g:11 r:10 x:1 b:10
961                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
962            }
963            dst += 1;
964            DITHER_INC_X(x);
965            count--;
966        }
967
968        unsigned short dither_value[8];
969        __m128i dither, dither_cur;
970#ifdef ENABLE_DITHER_MATRIX_4X4
971        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
972        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
973        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
974        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
975        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
976#else
977        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
978        dither_value[0] = dither_value[4] = (dither_scan
979                                             >> (((x) & 3) << 2)) & 0xF;
980        dither_value[1] = dither_value[5] = (dither_scan
981                                             >> (((x + 1) & 3) << 2)) & 0xF;
982        dither_value[2] = dither_value[6] = (dither_scan
983                                             >> (((x + 2) & 3) << 2)) & 0xF;
984        dither_value[3] = dither_value[7] = (dither_scan
985                                             >> (((x + 3) & 3) << 2)) & 0xF;
986#endif
987        dither = _mm_loadu_si128((__m128i*) dither_value);
988
989        const __m128i* s = reinterpret_cast<const __m128i*>(src);
990        __m128i* d = reinterpret_cast<__m128i*>(dst);
991        __m128i var256 = _mm_set1_epi16(256);
992        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
993        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
994        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
995
996        while (count >= 8) {
997            // Load 8 pixels of src and dst.
998            __m128i src_pixel1 = _mm_loadu_si128(s++);
999            __m128i src_pixel2 = _mm_loadu_si128(s++);
1000            __m128i dst_pixel = _mm_load_si128(d);
1001
1002            // Extract A from src.
1003            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
1004            sa1 = _mm_srli_epi32(sa1, 24);
1005            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
1006            sa2 = _mm_srli_epi32(sa2, 24);
1007            __m128i sa = _mm_packs_epi32(sa1, sa2);
1008
1009            // Calculate current dither value.
1010            dither_cur = _mm_mullo_epi16(dither,
1011                                         _mm_add_epi16(sa, _mm_set1_epi16(1)));
1012            dither_cur = _mm_srli_epi16(dither_cur, 8);
1013
1014            // Extract R from src.
1015            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1016            sr1 = _mm_srli_epi32(sr1, 24);
1017            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1018            sr2 = _mm_srli_epi32(sr2, 24);
1019            __m128i sr = _mm_packs_epi32(sr1, sr2);
1020
1021            // SkDITHER_R32_FOR_565(sr, d)
1022            __m128i sr_offset = _mm_srli_epi16(sr, 5);
1023            sr = _mm_add_epi16(sr, dither_cur);
1024            sr = _mm_sub_epi16(sr, sr_offset);
1025
1026            // Expand sr.
1027            sr = _mm_slli_epi16(sr, 2);
1028
1029            // Extract G from src.
1030            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1031            sg1 = _mm_srli_epi32(sg1, 24);
1032            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1033            sg2 = _mm_srli_epi32(sg2, 24);
1034            __m128i sg = _mm_packs_epi32(sg1, sg2);
1035
1036            // sg = SkDITHER_G32_FOR_565(sg, d).
1037            __m128i sg_offset = _mm_srli_epi16(sg, 6);
1038            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
1039            sg = _mm_sub_epi16(sg, sg_offset);
1040
1041            // Expand sg.
1042            sg = _mm_slli_epi16(sg, 3);
1043
1044            // Extract B from src.
1045            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1046            sb1 = _mm_srli_epi32(sb1, 24);
1047            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1048            sb2 = _mm_srli_epi32(sb2, 24);
1049            __m128i sb = _mm_packs_epi32(sb1, sb2);
1050
1051            // sb = SkDITHER_B32_FOR_565(sb, d).
1052            __m128i sb_offset = _mm_srli_epi16(sb, 5);
1053            sb = _mm_add_epi16(sb, dither_cur);
1054            sb = _mm_sub_epi16(sb, sb_offset);
1055
1056            // Expand sb.
1057            sb = _mm_slli_epi16(sb, 2);
1058
1059            // Extract R G B from dst.
1060            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
1061            dr = _mm_and_si128(dr, r16_mask);
1062            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
1063            dg = _mm_and_si128(dg, g16_mask);
1064            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
1065            db = _mm_and_si128(db, b16_mask);
1066
1067            // SkAlpha255To256(255 - a) >> 3
1068            __m128i isa = _mm_sub_epi16(var256, sa);
1069            isa = _mm_srli_epi16(isa, 3);
1070
1071            dr = _mm_mullo_epi16(dr, isa);
1072            dr = _mm_add_epi16(dr, sr);
1073            dr = _mm_srli_epi16(dr, 5);
1074
1075            dg = _mm_mullo_epi16(dg, isa);
1076            dg = _mm_add_epi16(dg, sg);
1077            dg = _mm_srli_epi16(dg, 5);
1078
1079            db = _mm_mullo_epi16(db, isa);
1080            db = _mm_add_epi16(db, sb);
1081            db = _mm_srli_epi16(db, 5);
1082
1083            // Package and store dst pixel.
1084            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
1085            _mm_store_si128(d++, d_pixel);
1086
1087            count -= 8;
1088            x += 8;
1089        }
1090
1091        src = reinterpret_cast<const SkPMColor*>(s);
1092        dst = reinterpret_cast<uint16_t*>(d);
1093    }
1094
1095    if (count > 0) {
1096        DITHER_565_SCAN(y);
1097        do {
1098            SkPMColor c = *src++;
1099            SkPMColorAssert(c);
1100            if (c) {
1101                unsigned a = SkGetPackedA32(c);
1102
1103                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
1104
1105                unsigned sr = SkGetPackedR32(c);
1106                unsigned sg = SkGetPackedG32(c);
1107                unsigned sb = SkGetPackedB32(c);
1108                sr = SkDITHER_R32_FOR_565(sr, d);
1109                sg = SkDITHER_G32_FOR_565(sg, d);
1110                sb = SkDITHER_B32_FOR_565(sb, d);
1111
1112                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1113                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1114                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1115                // now src and dst expanded are in g:11 r:10 x:1 b:10
1116                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1117            }
1118            dst += 1;
1119            DITHER_INC_X(x);
1120        } while (--count != 0);
1121    }
1122}
1123