1/*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include <emmintrin.h>
9#include "SkBitmapProcState_opts_SSE2.h"
10#include "SkBlitRow_opts_SSE2.h"
11#include "SkColorPriv.h"
12#include "SkColor_opts_SSE2.h"
13#include "SkDither.h"
14#include "SkUtils.h"
15
16/* SSE2 version of S32_Blend_BlitRow32()
17 * portable version is in core/SkBlitRow_D32.cpp
18 */
19void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
20                              const SkPMColor* SK_RESTRICT src,
21                              int count, U8CPU alpha) {
22    SkASSERT(alpha <= 255);
23    if (count <= 0) {
24        return;
25    }
26
27    uint32_t src_scale = SkAlpha255To256(alpha);
28    uint32_t dst_scale = 256 - src_scale;
29
30    if (count >= 4) {
31        SkASSERT(((size_t)dst & 0x03) == 0);
32        while (((size_t)dst & 0x0F) != 0) {
33            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
34            src++;
35            dst++;
36            count--;
37        }
38
39        const __m128i *s = reinterpret_cast<const __m128i*>(src);
40        __m128i *d = reinterpret_cast<__m128i*>(dst);
41
42        while (count >= 4) {
43            // Load 4 pixels each of src and dest.
44            __m128i src_pixel = _mm_loadu_si128(s);
45            __m128i dst_pixel = _mm_load_si128(d);
46
47            src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale);
48            dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale);
49
50            // Add result
51            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
52            _mm_store_si128(d, result);
53            s++;
54            d++;
55            count -= 4;
56        }
57        src = reinterpret_cast<const SkPMColor*>(s);
58        dst = reinterpret_cast<SkPMColor*>(d);
59    }
60
61    while (count > 0) {
62        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
63        src++;
64        dst++;
65        count--;
66    }
67}
68
69void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
70                                const SkPMColor* SK_RESTRICT src,
71                                int count, U8CPU alpha) {
72    SkASSERT(alpha == 255);
73    if (count <= 0) {
74        return;
75    }
76
77#ifdef SK_USE_ACCURATE_BLENDING
78    if (count >= 4) {
79        SkASSERT(((size_t)dst & 0x03) == 0);
80        while (((size_t)dst & 0x0F) != 0) {
81            *dst = SkPMSrcOver(*src, *dst);
82            src++;
83            dst++;
84            count--;
85        }
86
87        const __m128i *s = reinterpret_cast<const __m128i*>(src);
88        __m128i *d = reinterpret_cast<__m128i*>(dst);
89        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
90        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
91        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
92        while (count >= 4) {
93            // Load 4 pixels
94            __m128i src_pixel = _mm_loadu_si128(s);
95            __m128i dst_pixel = _mm_load_si128(d);
96
97            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
98            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
99            // Shift alphas down to lower 8 bits of each quad.
100            __m128i alpha = _mm_srli_epi32(src_pixel, 24);
101
102            // Copy alpha to upper 3rd byte of each quad
103            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
104
105            // Subtract alphas from 255, to get 0..255
106            alpha = _mm_sub_epi16(c_255, alpha);
107
108            // Multiply by red and blue by src alpha.
109            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
110            // Multiply by alpha and green by src alpha.
111            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
112
113            // dst_rb_low = (dst_rb >> 8)
114            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
115            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
116
117            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
118            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
119            dst_rb = _mm_add_epi16(dst_rb, c_128);
120            dst_rb = _mm_srli_epi16(dst_rb, 8);
121
122            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
123            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
124            dst_ag = _mm_add_epi16(dst_ag, c_128);
125            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
126
127            // Combine back into RGBA.
128            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
129
130            // Add result
131            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
132            _mm_store_si128(d, result);
133            s++;
134            d++;
135            count -= 4;
136        }
137        src = reinterpret_cast<const SkPMColor*>(s);
138        dst = reinterpret_cast<SkPMColor*>(d);
139    }
140
141    while (count > 0) {
142        *dst = SkPMSrcOver(*src, *dst);
143        src++;
144        dst++;
145        count--;
146    }
147#else
148    int count16 = count / 16;
149    __m128i* dst4 = (__m128i*)dst;
150    const __m128i* src4 = (const __m128i*)src;
151
152    for (int i = 0; i < count16 * 4; i += 4) {
153        // Load 16 source pixels.
154        __m128i s0 = _mm_loadu_si128(src4+i+0),
155                s1 = _mm_loadu_si128(src4+i+1),
156                s2 = _mm_loadu_si128(src4+i+2),
157                s3 = _mm_loadu_si128(src4+i+3);
158
159        const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
160        const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
161        __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());
162        if (0xffff == _mm_movemask_epi8(cmp)) {
163            // All 16 source pixels are fully transparent. There's nothing to do!
164            continue;
165        }
166        const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
167        cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
168        if (0xffff == _mm_movemask_epi8(cmp)) {
169            // All 16 source pixels are fully opaque. There's no need to read dst or blend it.
170            _mm_storeu_si128(dst4+i+0, s0);
171            _mm_storeu_si128(dst4+i+1, s1);
172            _mm_storeu_si128(dst4+i+2, s2);
173            _mm_storeu_si128(dst4+i+3, s3);
174            continue;
175        }
176        // The general slow case: do the blend for all 16 pixels.
177        _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
178        _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
179        _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
180        _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
181    }
182
183    // Wrap up the last <= 15 pixels.
184    SkASSERT(count - (count16*16) <= 15);
185    for (int i = count16*16; i < count; i++) {
186        // This check is not really necessarily, but it prevents pointless autovectorization.
187        if (src[i] & 0xFF000000) {
188            dst[i] = SkPMSrcOver(src[i], dst[i]);
189        }
190    }
191#endif
192}
193
194void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
195                               const SkPMColor* SK_RESTRICT src,
196                               int count, U8CPU alpha) {
197    SkASSERT(alpha <= 255);
198    if (count <= 0) {
199        return;
200    }
201
202    if (count >= 4) {
203        while (((size_t)dst & 0x0F) != 0) {
204            *dst = SkBlendARGB32(*src, *dst, alpha);
205            src++;
206            dst++;
207            count--;
208        }
209
210        const __m128i *s = reinterpret_cast<const __m128i*>(src);
211        __m128i *d = reinterpret_cast<__m128i*>(dst);
212        while (count >= 4) {
213            // Load 4 pixels each of src and dest.
214            __m128i src_pixel = _mm_loadu_si128(s);
215            __m128i dst_pixel = _mm_load_si128(d);
216
217            __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
218            _mm_store_si128(d, result);
219            s++;
220            d++;
221            count -= 4;
222        }
223        src = reinterpret_cast<const SkPMColor*>(s);
224        dst = reinterpret_cast<SkPMColor*>(d);
225    }
226
227    while (count > 0) {
228        *dst = SkBlendARGB32(*src, *dst, alpha);
229        src++;
230        dst++;
231        count--;
232    }
233}
234
235void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
236    SkASSERT(count > 0);
237
238    uint32_t src_expand = (SkGetPackedG32(src) << 24) |
239                          (SkGetPackedR32(src) << 13) |
240                          (SkGetPackedB32(src) << 2);
241    unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
242
243    // Check if we have enough pixels to run SIMD
244    if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
245        __m128i* dst_wide;
246        const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
247        const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
248        const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
249        const __m128i scale_wide = _mm_set1_epi16(scale);
250        const __m128i mask_blue  = _mm_set1_epi16(SK_B16_MASK);
251        const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);
252
253        // Align dst to an even 16 byte address (0-7 pixels)
254        while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
255            *dst = SkBlend32_RGB16(src_expand, *dst, scale);
256            dst += 1;
257            count--;
258        }
259
260        dst_wide = reinterpret_cast<__m128i*>(dst);
261        do {
262            // Load eight RGB565 pixels
263            __m128i pixels = _mm_load_si128(dst_wide);
264
265            // Mask out sub-pixels
266            __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
267            __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
268            pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
269            __m128i pixel_B = _mm_and_si128(pixels, mask_blue);
270
271            // Scale with alpha
272            pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
273            pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
274            pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);
275
276            // Add src_X_wide and shift down again
277            pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
278            pixel_R = _mm_srli_epi16(pixel_R, 5);
279            pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
280            pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
281            pixel_B = _mm_srli_epi16(pixel_B, 5);
282
283            // Combine into RGB565 and store
284            pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
285            pixel_G = _mm_and_si128(pixel_G, mask_green);
286            pixels = _mm_or_si128(pixel_R, pixel_G);
287            pixels = _mm_or_si128(pixels, pixel_B);
288            _mm_store_si128(dst_wide, pixels);
289            count -= 8;
290            dst_wide++;
291        } while (count >= 8);
292
293        dst = reinterpret_cast<uint16_t*>(dst_wide);
294    }
295
296    // Small loop to handle remaining pixels.
297    while (count > 0) {
298        *dst = SkBlend32_RGB16(src_expand, *dst, scale);
299        dst += 1;
300        count--;
301    }
302}
303
304void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
305                               size_t maskRB, SkColor origColor,
306                               int width, int height) {
307    SkPMColor color = SkPreMultiplyColor(origColor);
308    size_t dstOffset = dstRB - (width << 2);
309    size_t maskOffset = maskRB - width;
310    SkPMColor* dst = (SkPMColor *)device;
311    const uint8_t* mask = (const uint8_t*)maskPtr;
312    do {
313        int count = width;
314        if (count >= 4) {
315            while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
316                *dst = SkBlendARGB32(color, *dst, *mask);
317                mask++;
318                dst++;
319                count--;
320            }
321            __m128i *d = reinterpret_cast<__m128i*>(dst);
322            __m128i src_pixel = _mm_set1_epi32(color);
323            while (count >= 4) {
324                // Load 4 dst pixels
325                __m128i dst_pixel = _mm_load_si128(d);
326
327                // Set the alpha value
328                __m128i alpha_wide = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask));
329                alpha_wide = _mm_unpacklo_epi8(alpha_wide, _mm_setzero_si128());
330                alpha_wide = _mm_unpacklo_epi16(alpha_wide, _mm_setzero_si128());
331
332                __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha_wide);
333                _mm_store_si128(d, result);
334                // Load the next 4 dst pixels and alphas
335                mask = mask + 4;
336                d++;
337                count -= 4;
338            }
339            dst = reinterpret_cast<SkPMColor*>(d);
340        }
341        while (count > 0) {
342            *dst= SkBlendARGB32(color, *dst, *mask);
343            dst += 1;
344            mask++;
345            count --;
346        }
347        dst = (SkPMColor *)((char*)dst + dstOffset);
348        mask += maskOffset;
349    } while (--height != 0);
350}
351
// Shift amounts that move the top 5 bits of each RGB16 mask component onto
// the position of the matching component in an SkPMColor. A positive value
// means shift left, a negative value means shift right, zero means the bits
// already line up. The mask's RGB16 channel order may differ from the
// SkPMColor order, so each channel gets its own amount.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Per-channel helpers applying the shift above to four 32-bit lanes at once.
// The result is not masked; callers AND it with the channel's 5-bit mask.

// Red
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
382
383static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
384                                 __m128i &mask, __m128i &srcA) {
385    // In the following comments, the components of src, dst and mask are
386    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
387    // by an R, G, B, or A suffix. Components of one of the four pixels that
388    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
389    // example is the blue channel of the second destination pixel. Memory
390    // layout is shown for an ARGB byte order in a color value.
391
392    // src and srcA store 8-bit values interleaved with zeros.
393    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
394    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
395    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
396    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
397    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
398    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
399    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
400
401    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
402    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
403    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
404                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
405
406    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
407    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
408                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
409
410    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
411    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
412                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
413
414    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
415    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
416    // 8-bit position
417    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
418    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
419    mask = _mm_or_si128(_mm_or_si128(r, g), b);
420
421    // Interleave R,G,B into the lower byte of word.
422    // i.e. split the sixteen 8-bit values from mask into two sets of eight
423    // 16-bit values, padded by zero.
424    __m128i maskLo, maskHi;
425    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
426    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
427    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
428    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
429
430    // Upscale from 0..31 to 0..32
431    // (allows to replace division by left-shift further down)
432    // Left-shift each component by 4 and add the result back to that component,
433    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
434    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
435    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
436
437    // Multiply each component of maskLo and maskHi by srcA
438    maskLo = _mm_mullo_epi16(maskLo, srcA);
439    maskHi = _mm_mullo_epi16(maskHi, srcA);
440
441    // Left shift mask components by 8 (divide by 256)
442    maskLo = _mm_srli_epi16(maskLo, 8);
443    maskHi = _mm_srli_epi16(maskHi, 8);
444
445    // Interleave R,G,B into the lower byte of the word
446    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
447    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
448    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
449    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
450
451    // mask = (src - dst) * mask
452    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
453    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
454
455    // mask = (src - dst) * mask >> 5
456    maskLo = _mm_srai_epi16(maskLo, 5);
457    maskHi = _mm_srai_epi16(maskHi, 5);
458
459    // Add two pixels into result.
460    // result = dst + ((src - dst) * mask >> 5)
461    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
462    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
463
464    // Pack into 4 32bit dst pixels.
465    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
466    // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
467    // clamping to 255 if necessary.
468    return _mm_packus_epi16(resultLo, resultHi);
469}
470
471static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
472                                       __m128i &mask) {
473    // In the following comments, the components of src, dst and mask are
474    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
475    // by an R, G, B, or A suffix. Components of one of the four pixels that
476    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
477    // example is the blue channel of the second destination pixel. Memory
478    // layout is shown for an ARGB byte order in a color value.
479
480    // src and srcA store 8-bit values interleaved with zeros.
481    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
482    // mask stores 16-bit values (shown as high and low bytes) interleaved with
483    // zeros
484    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
485    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
486
487    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
488    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
489    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
490                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
491
492    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
493    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
494                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
495
496    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
497    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
498                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
499
500    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
501    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
502    // 8-bit position
503    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
504    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
505    mask = _mm_or_si128(_mm_or_si128(r, g), b);
506
507    // Interleave R,G,B into the lower byte of word.
508    // i.e. split the sixteen 8-bit values from mask into two sets of eight
509    // 16-bit values, padded by zero.
510    __m128i maskLo, maskHi;
511    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
512    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
513    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
514    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
515
516    // Upscale from 0..31 to 0..32
517    // (allows to replace division by left-shift further down)
518    // Left-shift each component by 4 and add the result back to that component,
519    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
520    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
521    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
522
523    // Interleave R,G,B into the lower byte of the word
524    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
525    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
526    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
527    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
528
529    // mask = (src - dst) * mask
530    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
531    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
532
533    // mask = (src - dst) * mask >> 5
534    maskLo = _mm_srai_epi16(maskLo, 5);
535    maskHi = _mm_srai_epi16(maskHi, 5);
536
537    // Add two pixels into result.
538    // result = dst + ((src - dst) * mask >> 5)
539    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
540    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
541
542    // Pack into 4 32bit dst pixels and force opaque.
543    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
544    // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
545    // clamping to 255 if necessary. Set alpha components to 0xFF.
546    return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
547                        _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
548}
549
550void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
551                         SkColor src, int width, SkPMColor) {
552    if (width <= 0) {
553        return;
554    }
555
556    int srcA = SkColorGetA(src);
557    int srcR = SkColorGetR(src);
558    int srcG = SkColorGetG(src);
559    int srcB = SkColorGetB(src);
560
561    srcA = SkAlpha255To256(srcA);
562
563    if (width >= 4) {
564        SkASSERT(((size_t)dst & 0x03) == 0);
565        while (((size_t)dst & 0x0F) != 0) {
566            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
567            mask++;
568            dst++;
569            width--;
570        }
571
572        __m128i *d = reinterpret_cast<__m128i*>(dst);
573        // Set alpha to 0xFF and replicate source four times in SSE register.
574        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
575        // Interleave with zeros to get two sets of four 16-bit values.
576        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
577        // Set srcA_sse to contain eight copies of srcA, padded with zero.
578        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
579        __m128i srcA_sse = _mm_set1_epi16(srcA);
580        while (width >= 4) {
581            // Load four destination pixels into dst_sse.
582            __m128i dst_sse = _mm_load_si128(d);
583            // Load four 16-bit masks into lower half of mask_sse.
584            __m128i mask_sse = _mm_loadl_epi64(
585                                   reinterpret_cast<const __m128i*>(mask));
586
587            // Check whether masks are equal to 0 and get the highest bit
588            // of each byte of result, if masks are all zero, we will get
589            // pack_cmp to 0xFFFF
590            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
591                                             _mm_setzero_si128()));
592
593            // if mask pixels are not all zero, we will blend the dst pixels
594            if (pack_cmp != 0xFFFF) {
595                // Unpack 4 16bit mask pixels to
596                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
597                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
598                mask_sse = _mm_unpacklo_epi16(mask_sse,
599                                              _mm_setzero_si128());
600
601                // Process 4 32bit dst pixels
602                __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
603                                                   mask_sse, srcA_sse);
604                _mm_store_si128(d, result);
605            }
606
607            d++;
608            mask += 4;
609            width -= 4;
610        }
611
612        dst = reinterpret_cast<SkPMColor*>(d);
613    }
614
615    while (width > 0) {
616        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
617        mask++;
618        dst++;
619        width--;
620    }
621}
622
623void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
624                               SkColor src, int width, SkPMColor opaqueDst) {
625    if (width <= 0) {
626        return;
627    }
628
629    int srcR = SkColorGetR(src);
630    int srcG = SkColorGetG(src);
631    int srcB = SkColorGetB(src);
632
633    if (width >= 4) {
634        SkASSERT(((size_t)dst & 0x03) == 0);
635        while (((size_t)dst & 0x0F) != 0) {
636            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
637            mask++;
638            dst++;
639            width--;
640        }
641
642        __m128i *d = reinterpret_cast<__m128i*>(dst);
643        // Set alpha to 0xFF and replicate source four times in SSE register.
644        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
645        // Set srcA_sse to contain eight copies of srcA, padded with zero.
646        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
647        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
648        while (width >= 4) {
649            // Load four destination pixels into dst_sse.
650            __m128i dst_sse = _mm_load_si128(d);
651            // Load four 16-bit masks into lower half of mask_sse.
652            __m128i mask_sse = _mm_loadl_epi64(
653                                   reinterpret_cast<const __m128i*>(mask));
654
655            // Check whether masks are equal to 0 and get the highest bit
656            // of each byte of result, if masks are all zero, we will get
657            // pack_cmp to 0xFFFF
658            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
659                                             _mm_setzero_si128()));
660
661            // if mask pixels are not all zero, we will blend the dst pixels
662            if (pack_cmp != 0xFFFF) {
663                // Unpack 4 16bit mask pixels to
664                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
665                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
666                mask_sse = _mm_unpacklo_epi16(mask_sse,
667                                              _mm_setzero_si128());
668
669                // Process 4 32bit dst pixels
670                __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
671                                                         mask_sse);
672                _mm_store_si128(d, result);
673            }
674
675            d++;
676            mask += 4;
677            width -= 4;
678        }
679
680        dst = reinterpret_cast<SkPMColor*>(d);
681    }
682
683    while (width > 0) {
684        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
685        mask++;
686        dst++;
687        width--;
688    }
689}
690
691/* SSE2 version of S32_D565_Opaque()
692 * portable version is in core/SkBlitRow_D16.cpp
693 */
694void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
695                          const SkPMColor* SK_RESTRICT src, int count,
696                          U8CPU alpha, int /*x*/, int /*y*/) {
697    SkASSERT(255 == alpha);
698
699    if (count <= 0) {
700        return;
701    }
702
703    if (count >= 8) {
704        while (((size_t)dst & 0x0F) != 0) {
705            SkPMColor c = *src++;
706            SkPMColorAssert(c);
707
708            *dst++ = SkPixel32ToPixel16_ToU16(c);
709            count--;
710        }
711
712        const __m128i* s = reinterpret_cast<const __m128i*>(src);
713        __m128i* d = reinterpret_cast<__m128i*>(dst);
714
715        while (count >= 8) {
716            // Load 8 pixels of src.
717            __m128i src_pixel1 = _mm_loadu_si128(s++);
718            __m128i src_pixel2 = _mm_loadu_si128(s++);
719
720            __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2);
721            _mm_store_si128(d++, d_pixel);
722            count -= 8;
723        }
724        src = reinterpret_cast<const SkPMColor*>(s);
725        dst = reinterpret_cast<uint16_t*>(d);
726    }
727
728    if (count > 0) {
729        do {
730            SkPMColor c = *src++;
731            SkPMColorAssert(c);
732            *dst++ = SkPixel32ToPixel16_ToU16(c);
733        } while (--count != 0);
734    }
735}
736
737/* SSE2 version of S32A_D565_Opaque()
738 * portable version is in core/SkBlitRow_D16.cpp
739 */
740void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
741                           const SkPMColor* SK_RESTRICT src,
742                           int count, U8CPU alpha, int /*x*/, int /*y*/) {
743    SkASSERT(255 == alpha);
744
745    if (count <= 0) {
746        return;
747    }
748
749    if (count >= 8) {
750        // Make dst 16 bytes alignment
751        while (((size_t)dst & 0x0F) != 0) {
752            SkPMColor c = *src++;
753            if (c) {
754              *dst = SkSrcOver32To16(c, *dst);
755            }
756            dst += 1;
757            count--;
758        }
759
760        const __m128i* s = reinterpret_cast<const __m128i*>(src);
761        __m128i* d = reinterpret_cast<__m128i*>(dst);
762        __m128i var255 = _mm_set1_epi16(255);
763        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
764        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
765        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
766
767        while (count >= 8) {
768            // Load 8 pixels of src.
769            __m128i src_pixel1 = _mm_loadu_si128(s++);
770            __m128i src_pixel2 = _mm_loadu_si128(s++);
771
772            // Check whether src pixels are equal to 0 and get the highest bit
773            // of each byte of result, if src pixels are all zero, src_cmp1 and
774            // src_cmp2 will be 0xFFFF.
775            int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
776                                             _mm_setzero_si128()));
777            int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
778                                             _mm_setzero_si128()));
779            if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
780                d++;
781                count -= 8;
782                continue;
783            }
784
785            // Load 8 pixels of dst.
786            __m128i dst_pixel = _mm_load_si128(d);
787
788            // Extract A from src.
789            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
790            sa1 = _mm_srli_epi32(sa1, 24);
791            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
792            sa2 = _mm_srli_epi32(sa2, 24);
793            __m128i sa = _mm_packs_epi32(sa1, sa2);
794
795            // Extract R from src.
796            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
797            sr1 = _mm_srli_epi32(sr1, 24);
798            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
799            sr2 = _mm_srli_epi32(sr2, 24);
800            __m128i sr = _mm_packs_epi32(sr1, sr2);
801
802            // Extract G from src.
803            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
804            sg1 = _mm_srli_epi32(sg1, 24);
805            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
806            sg2 = _mm_srli_epi32(sg2, 24);
807            __m128i sg = _mm_packs_epi32(sg1, sg2);
808
809            // Extract B from src.
810            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
811            sb1 = _mm_srli_epi32(sb1, 24);
812            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
813            sb2 = _mm_srli_epi32(sb2, 24);
814            __m128i sb = _mm_packs_epi32(sb1, sb2);
815
816            // Extract R G B from dst.
817            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
818            dr = _mm_and_si128(dr, r16_mask);
819            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
820            dg = _mm_and_si128(dg, g16_mask);
821            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
822            db = _mm_and_si128(db, b16_mask);
823
824            __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
825
826            // Calculate R G B of result.
827            // Original algorithm is in SkSrcOver32To16().
828            dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
829            dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
830            dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
831            dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
832            db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
833            db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
834
835            // Pack R G B into 16-bit color.
836            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
837
838            // Store 8 16-bit colors in dst.
839            _mm_store_si128(d++, d_pixel);
840            count -= 8;
841        }
842
843        src = reinterpret_cast<const SkPMColor*>(s);
844        dst = reinterpret_cast<uint16_t*>(d);
845    }
846
847    if (count > 0) {
848        do {
849            SkPMColor c = *src++;
850            SkPMColorAssert(c);
851            if (c) {
852                *dst = SkSrcOver32To16(c, *dst);
853            }
854            dst += 1;
855        } while (--count != 0);
856    }
857}
858
859void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
860                                 const SkPMColor* SK_RESTRICT src,
861                                 int count, U8CPU alpha, int x, int y) {
862    SkASSERT(255 == alpha);
863
864    if (count <= 0) {
865        return;
866    }
867
868    if (count >= 8) {
869        while (((size_t)dst & 0x0F) != 0) {
870            DITHER_565_SCAN(y);
871            SkPMColor c = *src++;
872            SkPMColorAssert(c);
873
874            unsigned dither = DITHER_VALUE(x);
875            *dst++ = SkDitherRGB32To565(c, dither);
876            DITHER_INC_X(x);
877            count--;
878        }
879
880        unsigned short dither_value[8];
881        __m128i dither;
882#ifdef ENABLE_DITHER_MATRIX_4X4
883        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
884        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
885        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
886        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
887        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
888#else
889        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
890        dither_value[0] = dither_value[4] = (dither_scan
891                                             >> (((x) & 3) << 2)) & 0xF;
892        dither_value[1] = dither_value[5] = (dither_scan
893                                             >> (((x + 1) & 3) << 2)) & 0xF;
894        dither_value[2] = dither_value[6] = (dither_scan
895                                             >> (((x + 2) & 3) << 2)) & 0xF;
896        dither_value[3] = dither_value[7] = (dither_scan
897                                             >> (((x + 3) & 3) << 2)) & 0xF;
898#endif
899        dither = _mm_loadu_si128((__m128i*) dither_value);
900
901        const __m128i* s = reinterpret_cast<const __m128i*>(src);
902        __m128i* d = reinterpret_cast<__m128i*>(dst);
903
904        while (count >= 8) {
905            // Load 8 pixels of src.
906            __m128i src_pixel1 = _mm_loadu_si128(s++);
907            __m128i src_pixel2 = _mm_loadu_si128(s++);
908
909            // Extract R from src.
910            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
911            sr1 = _mm_srli_epi32(sr1, 24);
912            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
913            sr2 = _mm_srli_epi32(sr2, 24);
914            __m128i sr = _mm_packs_epi32(sr1, sr2);
915
916            // SkDITHER_R32To565(sr, dither)
917            __m128i sr_offset = _mm_srli_epi16(sr, 5);
918            sr = _mm_add_epi16(sr, dither);
919            sr = _mm_sub_epi16(sr, sr_offset);
920            sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
921
922            // Extract G from src.
923            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
924            sg1 = _mm_srli_epi32(sg1, 24);
925            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
926            sg2 = _mm_srli_epi32(sg2, 24);
927            __m128i sg = _mm_packs_epi32(sg1, sg2);
928
929            // SkDITHER_R32To565(sg, dither)
930            __m128i sg_offset = _mm_srli_epi16(sg, 6);
931            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
932            sg = _mm_sub_epi16(sg, sg_offset);
933            sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
934
935            // Extract B from src.
936            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
937            sb1 = _mm_srli_epi32(sb1, 24);
938            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
939            sb2 = _mm_srli_epi32(sb2, 24);
940            __m128i sb = _mm_packs_epi32(sb1, sb2);
941
942            // SkDITHER_R32To565(sb, dither)
943            __m128i sb_offset = _mm_srli_epi16(sb, 5);
944            sb = _mm_add_epi16(sb, dither);
945            sb = _mm_sub_epi16(sb, sb_offset);
946            sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
947
948            // Pack and store 16-bit dst pixel.
949            __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
950            _mm_store_si128(d++, d_pixel);
951
952            count -= 8;
953            x += 8;
954        }
955
956        src = reinterpret_cast<const SkPMColor*>(s);
957        dst = reinterpret_cast<uint16_t*>(d);
958    }
959
960    if (count > 0) {
961        DITHER_565_SCAN(y);
962        do {
963            SkPMColor c = *src++;
964            SkPMColorAssert(c);
965
966            unsigned dither = DITHER_VALUE(x);
967            *dst++ = SkDitherRGB32To565(c, dither);
968            DITHER_INC_X(x);
969        } while (--count != 0);
970    }
971}
972
973/* SSE2 version of S32A_D565_Opaque_Dither()
974 * portable version is in core/SkBlitRow_D16.cpp
975 */
976void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
977                                  const SkPMColor* SK_RESTRICT src,
978                                  int count, U8CPU alpha, int x, int y) {
979    SkASSERT(255 == alpha);
980
981    if (count <= 0) {
982        return;
983    }
984
985    if (count >= 8) {
986        while (((size_t)dst & 0x0F) != 0) {
987            DITHER_565_SCAN(y);
988            SkPMColor c = *src++;
989            SkPMColorAssert(c);
990            if (c) {
991                unsigned a = SkGetPackedA32(c);
992
993                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
994
995                unsigned sr = SkGetPackedR32(c);
996                unsigned sg = SkGetPackedG32(c);
997                unsigned sb = SkGetPackedB32(c);
998                sr = SkDITHER_R32_FOR_565(sr, d);
999                sg = SkDITHER_G32_FOR_565(sg, d);
1000                sb = SkDITHER_B32_FOR_565(sb, d);
1001
1002                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1003                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1004                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1005                // now src and dst expanded are in g:11 r:10 x:1 b:10
1006                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1007            }
1008            dst += 1;
1009            DITHER_INC_X(x);
1010            count--;
1011        }
1012
1013        unsigned short dither_value[8];
1014        __m128i dither, dither_cur;
1015#ifdef ENABLE_DITHER_MATRIX_4X4
1016        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
1017        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
1018        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
1019        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
1020        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
1021#else
1022        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
1023        dither_value[0] = dither_value[4] = (dither_scan
1024                                             >> (((x) & 3) << 2)) & 0xF;
1025        dither_value[1] = dither_value[5] = (dither_scan
1026                                             >> (((x + 1) & 3) << 2)) & 0xF;
1027        dither_value[2] = dither_value[6] = (dither_scan
1028                                             >> (((x + 2) & 3) << 2)) & 0xF;
1029        dither_value[3] = dither_value[7] = (dither_scan
1030                                             >> (((x + 3) & 3) << 2)) & 0xF;
1031#endif
1032        dither = _mm_loadu_si128((__m128i*) dither_value);
1033
1034        const __m128i* s = reinterpret_cast<const __m128i*>(src);
1035        __m128i* d = reinterpret_cast<__m128i*>(dst);
1036        __m128i var256 = _mm_set1_epi16(256);
1037        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
1038        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
1039        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
1040
1041        while (count >= 8) {
1042            // Load 8 pixels of src and dst.
1043            __m128i src_pixel1 = _mm_loadu_si128(s++);
1044            __m128i src_pixel2 = _mm_loadu_si128(s++);
1045            __m128i dst_pixel = _mm_load_si128(d);
1046
1047            // Extract A from src.
1048            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
1049            sa1 = _mm_srli_epi32(sa1, 24);
1050            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
1051            sa2 = _mm_srli_epi32(sa2, 24);
1052            __m128i sa = _mm_packs_epi32(sa1, sa2);
1053
1054            // Calculate current dither value.
1055            dither_cur = _mm_mullo_epi16(dither,
1056                                         _mm_add_epi16(sa, _mm_set1_epi16(1)));
1057            dither_cur = _mm_srli_epi16(dither_cur, 8);
1058
1059            // Extract R from src.
1060            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1061            sr1 = _mm_srli_epi32(sr1, 24);
1062            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1063            sr2 = _mm_srli_epi32(sr2, 24);
1064            __m128i sr = _mm_packs_epi32(sr1, sr2);
1065
1066            // SkDITHER_R32_FOR_565(sr, d)
1067            __m128i sr_offset = _mm_srli_epi16(sr, 5);
1068            sr = _mm_add_epi16(sr, dither_cur);
1069            sr = _mm_sub_epi16(sr, sr_offset);
1070
1071            // Expand sr.
1072            sr = _mm_slli_epi16(sr, 2);
1073
1074            // Extract G from src.
1075            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1076            sg1 = _mm_srli_epi32(sg1, 24);
1077            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1078            sg2 = _mm_srli_epi32(sg2, 24);
1079            __m128i sg = _mm_packs_epi32(sg1, sg2);
1080
1081            // sg = SkDITHER_G32_FOR_565(sg, d).
1082            __m128i sg_offset = _mm_srli_epi16(sg, 6);
1083            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
1084            sg = _mm_sub_epi16(sg, sg_offset);
1085
1086            // Expand sg.
1087            sg = _mm_slli_epi16(sg, 3);
1088
1089            // Extract B from src.
1090            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1091            sb1 = _mm_srli_epi32(sb1, 24);
1092            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1093            sb2 = _mm_srli_epi32(sb2, 24);
1094            __m128i sb = _mm_packs_epi32(sb1, sb2);
1095
1096            // sb = SkDITHER_B32_FOR_565(sb, d).
1097            __m128i sb_offset = _mm_srli_epi16(sb, 5);
1098            sb = _mm_add_epi16(sb, dither_cur);
1099            sb = _mm_sub_epi16(sb, sb_offset);
1100
1101            // Expand sb.
1102            sb = _mm_slli_epi16(sb, 2);
1103
1104            // Extract R G B from dst.
1105            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
1106            dr = _mm_and_si128(dr, r16_mask);
1107            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
1108            dg = _mm_and_si128(dg, g16_mask);
1109            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
1110            db = _mm_and_si128(db, b16_mask);
1111
1112            // SkAlpha255To256(255 - a) >> 3
1113            __m128i isa = _mm_sub_epi16(var256, sa);
1114            isa = _mm_srli_epi16(isa, 3);
1115
1116            dr = _mm_mullo_epi16(dr, isa);
1117            dr = _mm_add_epi16(dr, sr);
1118            dr = _mm_srli_epi16(dr, 5);
1119
1120            dg = _mm_mullo_epi16(dg, isa);
1121            dg = _mm_add_epi16(dg, sg);
1122            dg = _mm_srli_epi16(dg, 5);
1123
1124            db = _mm_mullo_epi16(db, isa);
1125            db = _mm_add_epi16(db, sb);
1126            db = _mm_srli_epi16(db, 5);
1127
1128            // Package and store dst pixel.
1129            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
1130            _mm_store_si128(d++, d_pixel);
1131
1132            count -= 8;
1133            x += 8;
1134        }
1135
1136        src = reinterpret_cast<const SkPMColor*>(s);
1137        dst = reinterpret_cast<uint16_t*>(d);
1138    }
1139
1140    if (count > 0) {
1141        DITHER_565_SCAN(y);
1142        do {
1143            SkPMColor c = *src++;
1144            SkPMColorAssert(c);
1145            if (c) {
1146                unsigned a = SkGetPackedA32(c);
1147
1148                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
1149
1150                unsigned sr = SkGetPackedR32(c);
1151                unsigned sg = SkGetPackedG32(c);
1152                unsigned sb = SkGetPackedB32(c);
1153                sr = SkDITHER_R32_FOR_565(sr, d);
1154                sg = SkDITHER_G32_FOR_565(sg, d);
1155                sb = SkDITHER_B32_FOR_565(sb, d);
1156
1157                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1158                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1159                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1160                // now src and dst expanded are in g:11 r:10 x:1 b:10
1161                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1162            }
1163            dst += 1;
1164            DITHER_INC_X(x);
1165        } while (--count != 0);
1166    }
1167}
1168