19272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/*
298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com * Copyright 2012 The Android Open Source Project
3ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com *
4ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * Use of this source code is governed by a BSD-style license that can be
5ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * found in the LICENSE file.
69272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */
79272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
88c4953c6f176469ad287c3270ab146e292b23badcommit-bot@chromium.org#include <emmintrin.h>
983ecdc3ac69c9208493c4c3fc8ea9f84b1350535caryclark@google.com#include "SkBitmapProcState_opts_SSE2.h"
108c4953c6f176469ad287c3270ab146e292b23badcommit-bot@chromium.org#include "SkBlitRow_opts_SSE2.h"
119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include "SkColorPriv.h"
12475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org#include "SkColor_opts_SSE2.h"
13275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org#include "SkDither.h"
14c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org#include "SkUtils.h"
159272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/* SSE2 version of S32_Blend_BlitRow32()
179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp
189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */
194e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
204e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                              const SkPMColor* SK_RESTRICT src,
214e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                              int count, U8CPU alpha) {
229272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    SkASSERT(alpha <= 255);
239272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    if (count <= 0) {
249272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        return;
259272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
269272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
279272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    uint32_t src_scale = SkAlpha255To256(alpha);
289272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    uint32_t dst_scale = 256 - src_scale;
299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
30dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    if (count >= 4) {
31dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        SkASSERT(((size_t)dst & 0x03) == 0);
32dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
33dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
34dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src++;
35dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst++;
36dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count--;
37dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
38dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
39dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
40dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
41dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
4298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com        __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
4398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com
4498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com        // Move scale factors to upper byte of word
4598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
4698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com        __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
47dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
48dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels each of src and dest.
49dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
50dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
51dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
5298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Interleave Atom port 0/1 operations based on the execution port
5398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // constraints that multiply can only be executed on port 0 (while
5498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // boolean operations can be executed on either port 0 or port 1)
5598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // because GCC currently doesn't do a good job scheduling
5698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // instructions based on these constraints.
5798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com
58dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Get red and blue pixels into lower byte of each word.
5998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
60dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
61dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
6298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Multiply by scale.
6398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (0, rs.h, 0, bs.h))
6498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // where rs.h stands for the higher byte of r * scale, and
6598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // bs.h the higher byte of b * scale.
6698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
6798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com
6898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Get alpha and green pixels into higher byte of each word.
6998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
7098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
71dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
72dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by scale.
7398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (as.h, as.l, gs.h, gs.l))
7498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
75dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
7698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Clear the lower byte of the a*scale and g*scale results
7798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (as.h, 0, gs.h, 0))
7898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_ag = _mm_and_si128(src_ag, ag_mask);
7998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com
8098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Operations the destination pixels are the same as on the
8198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // source pixels. See the comments above.
8298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
8398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
8498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
8598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
8698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            dst_ag = _mm_and_si128(dst_ag, ag_mask);
87dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
88dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
8998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (as.h, rs.h, gs.h, bs.h))
90dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_pixel = _mm_or_si128(src_rb, src_ag);
91dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
92dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
93dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add result
94dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
95dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
96dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
97dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
98dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
99dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
100dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
101dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
1029272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
1039272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
1044e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org    while (count > 0) {
1059272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
1069272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        src++;
1079272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        dst++;
1089272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        count--;
1099272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
1109272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org}
1119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
1124e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
1134e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                                const SkPMColor* SK_RESTRICT src,
1144e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                                int count, U8CPU alpha) {
1159272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    SkASSERT(alpha == 255);
1169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    if (count <= 0) {
1179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        return;
1189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
119dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
120dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    if (count >= 4) {
121dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        SkASSERT(((size_t)dst & 0x03) == 0);
122dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
123dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            *dst = SkPMSrcOver(*src, *dst);
124dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src++;
125dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst++;
126dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count--;
127dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
128dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
129dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
130dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
1319272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#ifdef SK_USE_ACCURATE_BLENDING
132dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
133dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
134dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
135dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
136dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels
137dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
138dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
139dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
140dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
141f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
142dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Shift alphas down to lower 8 bits of each quad.
143dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i alpha = _mm_srli_epi32(src_pixel, 24);
144dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
145dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Copy alpha to upper 3rd byte of each quad
146dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
147dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
148dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Subtract alphas from 255, to get 0..255
149dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            alpha = _mm_sub_epi16(c_255, alpha);
150dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
151dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by red and blue by src alpha.
152dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
153dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by alpha and green by src alpha.
154dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
155dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
156dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_rb_low = (dst_rb >> 8)
157dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
158dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
159dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
160dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
161dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
162dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_add_epi16(dst_rb, c_128);
163dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
164dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
165dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
166dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
167dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_add_epi16(dst_ag, c_128);
168dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
169dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
170dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
171dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
172dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
173dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add result
174dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
175dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
176dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
177dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
178dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
179dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
1808c4953c6f176469ad287c3270ab146e292b23badcommit-bot@chromium.org#else
181dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
182dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
183dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
184dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels
185dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
186dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
187dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
188dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
189f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
190dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
191f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
192f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            __m128i alpha = _mm_srli_epi16(src_pixel, 8);
193f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
194f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            // (a0, a0, a1, a1, a2, g2, a3, g3)
195f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            alpha = _mm_shufflehi_epi16(alpha, 0xF5);
196f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org
197f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            // (a0, a0, a1, a1, a2, a2, a3, a3)
198f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org            alpha = _mm_shufflelo_epi16(alpha, 0xF5);
199dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
200dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Subtract alphas from 256, to get 1..256
201dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            alpha = _mm_sub_epi16(c_256, alpha);
202dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
203dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by red and blue by src alpha.
204dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
205dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply by alpha and green by src alpha.
206dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, alpha);
207dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
208dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Divide by 256.
209dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
210dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
211dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Mask out high bits (already in the right place)
212dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
213dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
214dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
215dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
216dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
217dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add result
218dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
219dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
220dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
221dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
222dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
223dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
2249272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#endif
225dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
226dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
227dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    }
2289272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
2299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    while (count > 0) {
2309272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        *dst = SkPMSrcOver(*src, *dst);
2319272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        src++;
2329272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        dst++;
2339272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        count--;
2349272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
2359272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org}
2369272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
2374e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
2384e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                               const SkPMColor* SK_RESTRICT src,
2394e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org                               int count, U8CPU alpha) {
2409272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    SkASSERT(alpha <= 255);
2419272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    if (count <= 0) {
2429272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        return;
2439272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
2449272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org
245dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org    if (count >= 4) {
246dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (((size_t)dst & 0x0F) != 0) {
247dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            *dst = SkBlendARGB32(*src, *dst, alpha);
248dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src++;
249dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst++;
250dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count--;
251dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
252dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
253dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        uint32_t src_scale = SkAlpha255To256(alpha);
254dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
255dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        const __m128i *s = reinterpret_cast<const __m128i*>(src);
256dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i *d = reinterpret_cast<__m128i*>(dst);
25798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
258dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
259dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
260dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        while (count >= 4) {
261dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Load 4 pixels each of src and dest.
262dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_pixel = _mm_loadu_si128(s);
263dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
264dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
265dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Get red and blue pixels into lower byte of each word.
266dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
267dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
268dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
269dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Get alpha and green into lower byte of each word.
270dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
271dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
272dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
273dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Put per-pixel alpha in low byte of each word.
27498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // After the following two statements, the dst_alpha looks like
27598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
276dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
277dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
278dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
279dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // dst_alpha = dst_alpha * src_scale
28098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Because src_scales are in the higher byte of each word and
28198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // we use mulhi here, the resulting alpha values are already
28298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // in the right place and don't need to be divided by 256.
28398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
28498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
285dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
286dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Subtract alphas from 256, to get 1..256
287dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
288dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
289dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply red and blue by dst pixel alpha.
290dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
291dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply alpha and green by dst pixel alpha.
292dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
293dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
294dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply red and blue by global alpha.
29598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (0, rs.h, 0, bs.h))
29698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // where rs.h stands for the higher byte of r * src_scale,
29798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // and bs.h the higher byte of b * src_scale.
29898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Again, because we use mulhi, the resuling red and blue
29998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // values are already in the right place and don't need to
30098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // be divided by 256.
30198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
302dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Multiply alpha and green by global alpha.
30398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (0, as.h, 0, gs.h))
30498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
305dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
306dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Divide by 256.
307dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_rb = _mm_srli_epi16(dst_rb, 8);
308dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
309dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Mask out low bits (goodies already in the right place; no need to divide)
310dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
31198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // Shift alpha and green to higher byte of each word.
31298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            // (4 x (as.h, 0, gs.h, 0))
31398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com            src_ag = _mm_slli_epi16(src_ag, 8);
314dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
315dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Combine back into RGBA.
316dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
317dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            src_pixel = _mm_or_si128(src_rb, src_ag);
318dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
319dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            // Add two pixels into result.
320dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
321dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            _mm_store_si128(d, result);
322dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            s++;
323dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            d++;
324dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org            count -= 4;
325dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        }
326dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
327dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org        dst = reinterpret_cast<SkPMColor*>(d);
3289272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
329dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org
3309272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    while (count > 0) {
3319272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        *dst = SkBlendARGB32(*src, *dst, alpha);
3329272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        src++;
3339272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        dst++;
3349272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org        count--;
3359272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org    }
3369272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org}
337c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
338c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org/* SSE2 version of Color32()
339c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp
 *
 * Writes count pixels to dst. Depending on color:
 *   - color == 0:    dst receives a copy of src (skipped when src == dst).
 *   - alpha == 255:  dst is filled with color.
 *   - otherwise:     dst[i] = color + SkAlphaMulQ(src[i], scale), with
 *                    scale = 256 - SkAlpha255To256(alpha of color).
 * A count <= 0 is a no-op. When count >= 4, pixels are processed four at a
 * time with SSE2 once dst has been advanced to a 16-byte boundary; any
 * remaining pixels go through the scalar loop at the end.
340c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org */
341c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.orgvoid Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
342c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                  SkPMColor color) {
343c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    if (count <= 0) {
344c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        return;
345c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    }
346c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
    // color == 0 is handled as a plain copy of src; nothing to do in place.
347c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    if (0 == color) {
348c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        if (src != dst) {
349c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            memcpy(dst, src, count * sizeof(SkPMColor));
350c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        }
351c909a1ecadd422d91ff97d10ce08865290223b14reed@google.com        return;
352c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    }
353c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
354c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    unsigned colorA = SkGetPackedA32(color);
355c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    if (255 == colorA) {
        // Opaque color: src does not contribute, so just fill dst.
356c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        sk_memset32(dst, color, count);
357c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    } else {
358c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        unsigned scale = 256 - SkAlpha255To256(colorA);
359c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
360c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        if (count >= 4) {
            // Peel off leading pixels one at a time until dst sits on a
            // 16-byte boundary, so the vector loop can use aligned stores.
            // With dst 4-byte aligned (asserted below) this takes at most 3
            // iterations, and count >= 4 here, so count stays positive.
361c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            SkASSERT(((size_t)dst & 0x03) == 0);
362c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            while (((size_t)dst & 0x0F) != 0) {
363c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                *dst = color + SkAlphaMulQ(*src, scale);
364c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src++;
365c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                dst++;
366c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                count--;
367c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            }
368c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
369c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            const __m128i *s = reinterpret_cast<const __m128i*>(src);
370c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            __m128i *d = reinterpret_cast<__m128i*>(dst);
            // Loop-invariant vectors: the byte mask selecting the low byte of
            // every 16-bit lane, the scale in every 16-bit lane, and the
            // color in every 32-bit lane.
371c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
372c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            __m128i src_scale_wide = _mm_set1_epi16(scale);
373c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            __m128i color_wide = _mm_set1_epi32(color);
374c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            while (count >= 4) {
375c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Load 4 src pixels (only dst was aligned, so use an unaligned load).
376c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                __m128i src_pixel = _mm_loadu_si128(s);
377c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
378c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Get red and blue pixels into lower byte of each word.
379c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
380981d4798007b91e2e19c13b171583927a56df63breed@google.com
381c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Get alpha and green into lower byte of each word.
382c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
383c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
384c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Multiply by scale.
385c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
386c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
387c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
388c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Divide by 256.
                // For red/blue the quotient is shifted down into the low byte.
                // For alpha/green the quotient is the high byte of each 16-bit
                // product, which is already where those channels live in the
                // pixel, so clearing the low byte is enough.
389c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_rb = _mm_srli_epi16(src_rb, 8);
390c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_ag = _mm_andnot_si128(rb_mask, src_ag);
391c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
392c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Combine back into RGBA.
393c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                src_pixel = _mm_or_si128(src_rb, src_ag);
394c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
395c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Add color to result.
396c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                __m128i result = _mm_add_epi8(color_wide, src_pixel);
397c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
398c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                // Store result.
399c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                _mm_store_si128(d, result);
400c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                s++;
401c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                d++;
402c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org                count -= 4;
403c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            }
            // Hand the advanced positions back to the scalar tail loop.
404c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            src = reinterpret_cast<const SkPMColor*>(s);
405c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            dst = reinterpret_cast<SkPMColor*>(d);
4068c4953c6f176469ad287c3270ab146e292b23badcommit-bot@chromium.org        }
407c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org
        // Scalar tail: fewer than 4 pixels left (or count was < 4 to begin with).
408c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org        while (count > 0) {
409c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            *dst = color + SkAlphaMulQ(*src, scale);
410c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            src += 1;
411c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            dst += 1;
412c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org            count--;
413981d4798007b91e2e19c13b171583927a56df63breed@google.com        }
414c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org    }
415c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org}
416981d4798007b91e2e19c13b171583927a56df63breed@google.com
/* Blends a single color into 32-bit destination pixels through an 8-bit mask.
 *
 * device, dstRB:    destination pixels (read and written as SkPMColor) and
 *                   the destination row stride in bytes.
 * maskPtr, maskRB:  one 8-bit mask value per pixel and the mask row stride
 *                   in bytes.
 * origColor:        color to blend; it is premultiplied here before use.
 * width, height:    size of the blitted rectangle in pixels.
 *
 * Scalar pixels are written as SkBlendARGB32(color, *dst, *mask). Rows with
 * at least 4 pixels are additionally processed four pixels at a time with
 * SSE2 once dst has been advanced to a 16-byte boundary.
 *
 * NOTE: the row loop is a do/while that tests --height after the body, so a
 * row is processed before height is examined; callers must pass height >= 1.
 */
417edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.comvoid SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
418edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com                               size_t maskRB, SkColor origColor,
419d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                               int width, int height) {
420ee467ee79d449ebe6ae7f7946e613cc70a479c69reed@google.com    SkPMColor color = SkPreMultiplyColor(origColor);
    // Bytes to skip from the end of one processed row to the start of the
    // next (4 bytes per destination pixel, 1 byte per mask entry).
421981d4798007b91e2e19c13b171583927a56df63breed@google.com    size_t dstOffset = dstRB - (width << 2);
422981d4798007b91e2e19c13b171583927a56df63breed@google.com    size_t maskOffset = maskRB - width;
423981d4798007b91e2e19c13b171583927a56df63breed@google.com    SkPMColor* dst = (SkPMColor *)device;
424edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com    const uint8_t* mask = (const uint8_t*)maskPtr;
425981d4798007b91e2e19c13b171583927a56df63breed@google.com    do {
426981d4798007b91e2e19c13b171583927a56df63breed@google.com        int count = width;
427981d4798007b91e2e19c13b171583927a56df63breed@google.com        if (count >= 4) {
            // Blend single pixels until dst is 16-byte aligned, as required
            // by the aligned load/store in the vector loop below.
428981d4798007b91e2e19c13b171583927a56df63breed@google.com            while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
429981d4798007b91e2e19c13b171583927a56df63breed@google.com                *dst = SkBlendARGB32(color, *dst, *mask);
430981d4798007b91e2e19c13b171583927a56df63breed@google.com                mask++;
431981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst++;
432981d4798007b91e2e19c13b171583927a56df63breed@google.com                count--;
433981d4798007b91e2e19c13b171583927a56df63breed@google.com            }
434981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i *d = reinterpret_cast<__m128i*>(dst);
435981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
436981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i c_256 = _mm_set1_epi16(256);
437981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i c_1 = _mm_set1_epi16(1);
            // The source is the same premultiplied color for every pixel.
438981d4798007b91e2e19c13b171583927a56df63breed@google.com            __m128i src_pixel = _mm_set1_epi32(color);
439981d4798007b91e2e19c13b171583927a56df63breed@google.com            while (count >= 4) {
440981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Load 4 dest pixels (the source is the constant src_pixel).
441981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i dst_pixel = _mm_load_si128(d);
442981d4798007b91e2e19c13b171583927a56df63breed@google.com
443981d4798007b91e2e19c13b171583927a56df63breed@google.com                //set the alpha value
                // _mm_set_epi8 takes its arguments from the most significant
                // byte to the least, so each mask byte ends up zero-extended
                // in both 16-bit lanes of its pixel, with mask[0] in the
                // lowest pixel.
444981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
445981d4798007b91e2e19c13b171583927a56df63breed@google.com                                0, *(mask+3),0, \
446981d4798007b91e2e19c13b171583927a56df63breed@google.com                                *(mask+2),0, *(mask+2),\
447981d4798007b91e2e19c13b171583927a56df63breed@google.com                                0,*(mask+1), 0,*(mask+1),\
448981d4798007b91e2e19c13b171583927a56df63breed@google.com                                0, *mask,0,*mask);
449981d4798007b91e2e19c13b171583927a56df63breed@google.com
450981d4798007b91e2e19c13b171583927a56df63breed@google.com                //call SkAlpha255To256() (done here by adding 1 to every 16-bit lane)
451981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
452981d4798007b91e2e19c13b171583927a56df63breed@google.com
453981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Get red and blue pixels into lower byte of each word.
454981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
455981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
456981d4798007b91e2e19c13b171583927a56df63breed@google.com
457981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Get alpha and green into lower byte of each word.
458981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
459981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
460981d4798007b91e2e19c13b171583927a56df63breed@google.com
461981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Put per-pixel alpha in low byte of each word.
                // 0xF5 selects words (1, 1, 3, 3): the upper 16-bit word of
                // each pixel in src_ag is copied over that pixel's lower word.
462981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
463981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
464981d4798007b91e2e19c13b171583927a56df63breed@google.com
465981d4798007b91e2e19c13b171583927a56df63breed@google.com                // dst_alpha = dst_alpha * src_scale
466981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
467981d4798007b91e2e19c13b171583927a56df63breed@google.com
468981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Divide by 256.
469981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_alpha = _mm_srli_epi16(dst_alpha, 8);
470981d4798007b91e2e19c13b171583927a56df63breed@google.com
471981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Subtract alphas from 256, to get 1..256
472981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
473981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Multiply red and blue by dst pixel alpha.
474981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
475981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Multiply alpha and green by dst pixel alpha.
476981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
477981d4798007b91e2e19c13b171583927a56df63breed@google.com
478981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Multiply red and blue by the per-pixel mask scale.
479981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
480981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Multiply alpha and green by the per-pixel mask scale.
481981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
482981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Divide by 256.
483981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_rb = _mm_srli_epi16(dst_rb, 8);
484981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_rb = _mm_srli_epi16(src_rb, 8);
485981d4798007b91e2e19c13b171583927a56df63breed@google.com
486981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Mask out low bits (goodies already in the right place; no need to divide)
487981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
488981d4798007b91e2e19c13b171583927a56df63breed@google.com                src_ag = _mm_andnot_si128(rb_mask, src_ag);
489981d4798007b91e2e19c13b171583927a56df63breed@google.com
490981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Combine back into RGBA.
                // The scaled source goes into a temporary so that src_pixel
                // keeps the unscaled color for the next iteration.
491981d4798007b91e2e19c13b171583927a56df63breed@google.com                dst_pixel = _mm_or_si128(dst_rb, dst_ag);
492981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
493981d4798007b91e2e19c13b171583927a56df63breed@google.com
494981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Add two pixels into result.
495981d4798007b91e2e19c13b171583927a56df63breed@google.com                __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
496981d4798007b91e2e19c13b171583927a56df63breed@google.com                _mm_store_si128(d, result);
497981d4798007b91e2e19c13b171583927a56df63breed@google.com                // Advance to the next 4 pixels.
498981d4798007b91e2e19c13b171583927a56df63breed@google.com                mask = mask + 4;
499981d4798007b91e2e19c13b171583927a56df63breed@google.com                d++;
500981d4798007b91e2e19c13b171583927a56df63breed@google.com                count -= 4;
501981d4798007b91e2e19c13b171583927a56df63breed@google.com            }
502981d4798007b91e2e19c13b171583927a56df63breed@google.com            dst = reinterpret_cast<SkPMColor *>(d);
503981d4798007b91e2e19c13b171583927a56df63breed@google.com        }
        // Scalar tail: whatever is left of this row (the whole row if width < 4).
5048c4953c6f176469ad287c3270ab146e292b23badcommit-bot@chromium.org        while (count > 0) {
505981d4798007b91e2e19c13b171583927a56df63breed@google.com            *dst= SkBlendARGB32(color, *dst, *mask);
506981d4798007b91e2e19c13b171583927a56df63breed@google.com            dst += 1;
507981d4798007b91e2e19c13b171583927a56df63breed@google.com            mask++;
508981d4798007b91e2e19c13b171583927a56df63breed@google.com            count --;
509981d4798007b91e2e19c13b171583927a56df63breed@google.com        }
        // Step both pointers over the unused remainder of their rows.
510981d4798007b91e2e19c13b171583927a56df63breed@google.com        dst = (SkPMColor *)((char*)dst + dstOffset);
511981d4798007b91e2e19c13b171583927a56df63breed@google.com        mask += maskOffset;
512981d4798007b91e2e19c13b171583927a56df63breed@google.com    } while (--height != 0);
513981d4798007b91e2e19c13b171583927a56df63breed@google.com}
514d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
5158cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com// The following (left) shifts cause the top 5 bits of the mask components to
5168cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com// line up with the corresponding components in an SkPMColor.
5178cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com// Note that the mask's RGB16 order may differ from the SkPMColor order.
// A positive value means "shift left", a negative value means "shift right by
// its magnitude", and zero means the bits are already in place.
5188cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
5198cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
5208cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
5218cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com
// Each SkPacked?16x5ToUnmasked?32x5_SSE2(x) macro below applies the shift for
// its channel to every 32-bit lane of x, choosing at preprocessing time
// between no-op, _mm_slli_epi32 and _mm_srli_epi32 according to the sign of
// the shift. "Unmasked": bits belonging to other channels are not cleared
// here, so the result still has to be ANDed with that channel's 5-bit mask.
5228cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#if SK_R16x5_R32x5_SHIFT == 0
5238cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
5248cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#elif SK_R16x5_R32x5_SHIFT > 0
5258cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
5268cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#else
5278cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
5288cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#endif
5298cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com
5308cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#if SK_G16x5_G32x5_SHIFT == 0
5318cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
5328cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#elif SK_G16x5_G32x5_SHIFT > 0
5338cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
5348cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#else
5358cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
5368cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#endif
5378cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com
5388cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#if SK_B16x5_B32x5_SHIFT == 0
5398cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
5408cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#elif SK_B16x5_B32x5_SHIFT > 0
5418cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
5428cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#else
5438cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
5448cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#endif
5458cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com
54676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgstatic __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
54776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                 __m128i &mask, __m128i &srcA) {
54876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // In the following comments, the components of src, dst and mask are
54976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
55076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // by an R, G, B, or A suffix. Components of one of the four pixels that
55176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
55276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // example is the blue channel of the second destination pixel. Memory
55376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // layout is shown for an ARGB byte order in a color value.
55476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
55576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // src and srcA store 8-bit values interleaved with zeros.
55676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
55776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
55876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
55976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
56076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
56176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
56276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
56376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
564d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
56576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
5668cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
5678cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
5688cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com
56976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
5708cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
5718cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
572fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
57376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
5748cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
5758cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
576fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
577d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
57876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
57976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // 8-bit position
58076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
58176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
582d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    mask = _mm_or_si128(_mm_or_si128(r, g), b);
583d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
584fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    // Interleave R,G,B into the lower byte of word.
58576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // i.e. split the sixteen 8-bit values from mask into two sets of eight
58676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // 16-bit values, padded by zero.
587d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i maskLo, maskHi;
58876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
589d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
59076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
591d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
592d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
59376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Upscale from 0..31 to 0..32
59476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // (allows to replace division by left-shift further down)
59576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Left-shift each component by 4 and add the result back to that component,
59676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
597d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
598d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
599d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
60076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Multiply each component of maskLo and maskHi by srcA
60176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskLo = _mm_mullo_epi16(maskLo, srcA);
60276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskHi = _mm_mullo_epi16(maskHi, srcA);
603d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
60476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Left shift mask components by 8 (divide by 256)
605d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_srli_epi16(maskLo, 8);
606d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_srli_epi16(maskHi, 8);
607d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
60876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Interleave R,G,B into the lower byte of the word
60976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
610d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
61176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
612d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
613d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
61476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (src - dst) * mask
61576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
61676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
617d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
61876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (src - dst) * mask >> 5
619d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_srai_epi16(maskLo, 5);
620d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_srai_epi16(maskHi, 5);
621d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
622d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Add two pixels into result.
62376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // result = dst + ((src - dst) * mask >> 5)
624d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
625d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
626d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
62776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Pack into 4 32bit dst pixels.
62876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    // Merge into one SSE register with sixteen 8-bit values (four pixels),
63076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // clamping to 255 if necessary.
631d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    return _mm_packus_epi16(resultLo, resultHi);
632d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com}
633d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
63476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgstatic __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
635d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                                       __m128i &mask) {
63676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // In the following comments, the components of src, dst and mask are
63776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
63876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // by an R, G, B, or A suffix. Components of one of the four pixels that
63976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
64076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // example is the blue channel of the second destination pixel. Memory
64176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // layout is shown for an ARGB byte order in a color value.
64276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
64376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // src and srcA store 8-bit values interleaved with zeros.
64476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
64576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask stores 16-bit values (shown as high and low bytes) interleaved with
64676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // zeros
64776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
64876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
64976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
650d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
65176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
6528cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
6538cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
654d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
65576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
6568cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
6578cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
658fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
65976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
6608cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
6618cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
662d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
663d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
66476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
66576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // 8-bit position
66676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
66776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
668d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    mask = _mm_or_si128(_mm_or_si128(r, g), b);
669d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
670fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    // Interleave R,G,B into the lower byte of word.
67176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // i.e. split the sixteen 8-bit values from mask into two sets of eight
67276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // 16-bit values, padded by zero.
673d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i maskLo, maskHi;
67476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
675d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
67676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
677d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
678d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
67976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Upscale from 0..31 to 0..32
68076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // (allows to replace division by left-shift further down)
68176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Left-shift each component by 4 and add the result back to that component,
68276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
683d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
684d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
685d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
68676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Interleave R,G,B into the lower byte of the word
68776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
688d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
68976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
690d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
691d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
69276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (src - dst) * mask
69376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
69476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
695d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
69676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // mask = (src - dst) * mask >> 5
697d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskLo = _mm_srai_epi16(maskLo, 5);
698d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    maskHi = _mm_srai_epi16(maskHi, 5);
699d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
700d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    // Add two pixels into result.
70176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // result = dst + ((src - dst) * mask >> 5)
702d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
703d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
704d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
70527123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com    // Pack into 4 32bit dst pixels and force opaque.
70676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
70776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
70876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    // clamping to 255 if necessary. Set alpha components to 0xFF.
70927123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com    return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
71027123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com                        _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
711d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com}
712d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
71376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgvoid SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
71476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                         SkColor src, int width, SkPMColor) {
715d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    if (width <= 0) {
716d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        return;
717d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
718d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
71976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcA = SkColorGetA(src);
72076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcR = SkColorGetR(src);
72176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcG = SkColorGetG(src);
72276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcB = SkColorGetB(src);
723fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
724d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    srcA = SkAlpha255To256(srcA);
725d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
726d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    if (width >= 4) {
727d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        SkASSERT(((size_t)dst & 0x03) == 0);
728d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        while (((size_t)dst & 0x0F) != 0) {
72976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
73076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            mask++;
731d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            dst++;
732d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            width--;
733d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        }
734d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
735d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        __m128i *d = reinterpret_cast<__m128i*>(dst);
73676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // Set alpha to 0xFF and replicate source four times in SSE register.
73776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
73876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // Interleave with zeros to get two sets of four 16-bit values.
73976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
74076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // Set srcA_sse to contain eight copies of srcA, padded with zero.
74176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
74276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        __m128i srcA_sse = _mm_set1_epi16(srcA);
743d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        while (width >= 4) {
74476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Load four destination pixels into dst_sse.
74576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            __m128i dst_sse = _mm_load_si128(d);
74676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Load four 16-bit masks into lower half of mask_sse.
74776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            __m128i mask_sse = _mm_loadl_epi64(
74876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                   reinterpret_cast<const __m128i*>(mask));
74976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
75076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Check whether masks are equal to 0 and get the highest bit
75176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // of each byte of result, if masks are all zero, we will get
752d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            // pack_cmp to 0xFFFF
75376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
754d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                                             _mm_setzero_si128()));
755d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
756d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            // if mask pixels are not all zero, we will blend the dst pixels
757d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            if (pack_cmp != 0xFFFF) {
758fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com                // Unpack 4 16bit mask pixels to
75976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
76076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
76176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                mask_sse = _mm_unpacklo_epi16(mask_sse,
76276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                              _mm_setzero_si128());
763d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
764d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                // Process 4 32bit dst pixels
76576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
76676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                                   mask_sse, srcA_sse);
767d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                _mm_store_si128(d, result);
768d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            }
769d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
770d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            d++;
77176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            mask += 4;
772d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            width -= 4;
773d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        }
774d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
775d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        dst = reinterpret_cast<SkPMColor*>(d);
776d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
777d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
778d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    while (width > 0) {
77976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
78076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        mask++;
781d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        dst++;
782fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        width--;
783d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
784d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com}
785d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
78676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgvoid SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
78776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                               SkColor src, int width, SkPMColor opaqueDst) {
788d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    if (width <= 0) {
789d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        return;
790d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
791d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
79276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcR = SkColorGetR(src);
79376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcG = SkColorGetG(src);
79476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org    int srcB = SkColorGetB(src);
795d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
796d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    if (width >= 4) {
797d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        SkASSERT(((size_t)dst & 0x03) == 0);
798d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        while (((size_t)dst & 0x0F) != 0) {
79976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
80076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            mask++;
801d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            dst++;
802d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            width--;
803d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        }
804d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
805d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        __m128i *d = reinterpret_cast<__m128i*>(dst);
80676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // Set alpha to 0xFF and replicate source four times in SSE register.
80776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
80876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // Set srcA_sse to contain eight copies of srcA, padded with zero.
80976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
81076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
811d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        while (width >= 4) {
81276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Load four destination pixels into dst_sse.
81376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            __m128i dst_sse = _mm_load_si128(d);
81476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Load four 16-bit masks into lower half of mask_sse.
81576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            __m128i mask_sse = _mm_loadl_epi64(
81676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                   reinterpret_cast<const __m128i*>(mask));
81776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org
81876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // Check whether masks are equal to 0 and get the highest bit
81976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            // of each byte of result, if masks are all zero, we will get
820d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            // pack_cmp to 0xFFFF
82176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
822d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                                             _mm_setzero_si128()));
823d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
824d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            // if mask pixels are not all zero, we will blend the dst pixels
825d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            if (pack_cmp != 0xFFFF) {
826fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com                // Unpack 4 16bit mask pixels to
82776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
82876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
82976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                mask_sse = _mm_unpacklo_epi16(mask_sse,
83076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                              _mm_setzero_si128());
831d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
832d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                // Process 4 32bit dst pixels
83376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
83476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org                                                         mask_sse);
835d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com                _mm_store_si128(d, result);
836d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            }
837d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
838d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            d++;
83976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org            mask += 4;
840d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com            width -= 4;
841d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        }
842d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
843d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        dst = reinterpret_cast<SkPMColor*>(d);
844d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
845d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com
846d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    while (width > 0) {
84776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
84876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org        mask++;
849d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com        dst++;
850fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        width--;
851d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com    }
852d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com}
853475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
85439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org/* SSE2 version of S32_D565_Opaque()
85539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org * portable version is in core/SkBlitRow_D16.cpp
85639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org */
85739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.orgvoid S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
85839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                          const SkPMColor* SK_RESTRICT src, int count,
85939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                          U8CPU alpha, int /*x*/, int /*y*/) {
86039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    SkASSERT(255 == alpha);
86139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
86239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    if (count <= 0) {
86339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        return;
86439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    }
86539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
86639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    if (count >= 8) {
86739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        while (((size_t)dst & 0x0F) != 0) {
86839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            SkPMColor c = *src++;
86939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            SkPMColorAssert(c);
87039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
87139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            *dst++ = SkPixel32ToPixel16_ToU16(c);
87239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            count--;
87339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        }
87439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
87539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        const __m128i* s = reinterpret_cast<const __m128i*>(src);
87639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        __m128i* d = reinterpret_cast<__m128i*>(dst);
87739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
87839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
87939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);
88039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
88139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        while (count >= 8) {
88239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            // Load 8 pixels of src.
88339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i src_pixel1 = _mm_loadu_si128(s++);
88439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i src_pixel2 = _mm_loadu_si128(s++);
88539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
88639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            // Calculate result r.
88739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i r1 = _mm_srli_epi32(src_pixel1,
88839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
88939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            r1 = _mm_and_si128(r1, r16_mask);
89039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i r2 = _mm_srli_epi32(src_pixel2,
89139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
89239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            r2 = _mm_and_si128(r2, r16_mask);
89339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i r = _mm_packs_epi32(r1, r2);
89439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
89539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            // Calculate result g.
89639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i g1 = _mm_srli_epi32(src_pixel1,
89739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
89839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            g1 = _mm_and_si128(g1, g16_mask);
89939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i g2 = _mm_srli_epi32(src_pixel2,
90039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
90139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            g2 = _mm_and_si128(g2, g16_mask);
90239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i g = _mm_packs_epi32(g1, g2);
90339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
90439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            // Calculate result b.
90539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i b1 = _mm_srli_epi32(src_pixel1,
90639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
90739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            b1 = _mm_and_si128(b1, b16_mask);
90839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i b2 = _mm_srli_epi32(src_pixel2,
90939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
91039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            b2 = _mm_and_si128(b2, b16_mask);
91139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            __m128i b = _mm_packs_epi32(b1, b2);
91239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
91339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            // Store 8 16-bit colors in dst.
914c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            __m128i d_pixel = SkPackRGB16_SSE2(r, g, b);
91539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            _mm_store_si128(d++, d_pixel);
91639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            count -= 8;
91739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        }
91839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
91939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        dst = reinterpret_cast<uint16_t*>(d);
92039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    }
92139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
92239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    if (count > 0) {
92339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        do {
92439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            SkPMColor c = *src++;
92539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            SkPMColorAssert(c);
92639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org            *dst++ = SkPixel32ToPixel16_ToU16(c);
92739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org        } while (--count != 0);
92839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org    }
92939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org}
93039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org
931475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org/* SSE2 version of S32A_D565_Opaque()
932475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org * portable version is in core/SkBlitRow_D16.cpp
933475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org */
934475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.orgvoid S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
935475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                           const SkPMColor* SK_RESTRICT src,
936475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                           int count, U8CPU alpha, int /*x*/, int /*y*/) {
937475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    SkASSERT(255 == alpha);
938475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
939475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    if (count <= 0) {
940475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        return;
941475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    }
942475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
943475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    if (count >= 8) {
944475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        // Make dst 16 bytes alignment
945475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        while (((size_t)dst & 0x0F) != 0) {
946475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            SkPMColor c = *src++;
947475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            if (c) {
948475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org              *dst = SkSrcOver32To16(c, *dst);
949475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            }
950475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dst += 1;
951475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            count--;
952475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        }
953475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
954475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        const __m128i* s = reinterpret_cast<const __m128i*>(src);
955475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        __m128i* d = reinterpret_cast<__m128i*>(dst);
956475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        __m128i var255 = _mm_set1_epi16(255);
957475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
958475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
959475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
960475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
961475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        while (count >= 8) {
962475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Load 8 pixels of src.
963475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i src_pixel1 = _mm_loadu_si128(s++);
964475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i src_pixel2 = _mm_loadu_si128(s++);
965475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
966475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Check whether src pixels are equal to 0 and get the highest bit
967475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // of each byte of result, if src pixels are all zero, src_cmp1 and
968475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // src_cmp2 will be 0xFFFF.
969475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
970475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                                             _mm_setzero_si128()));
971475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
972475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                                             _mm_setzero_si128()));
973475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
974475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                d++;
975475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                count -= 8;
976475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                continue;
977475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            }
978475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
979475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Load 8 pixels of dst.
980475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
981475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
982475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Extract A from src.
983c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
984475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sa1 = _mm_srli_epi32(sa1, 24);
985c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
986475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sa2 = _mm_srli_epi32(sa2, 24);
987475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sa = _mm_packs_epi32(sa1, sa2);
988475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
989475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Extract R from src.
990c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
991475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sr1 = _mm_srli_epi32(sr1, 24);
992c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
993475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sr2 = _mm_srli_epi32(sr2, 24);
994475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sr = _mm_packs_epi32(sr1, sr2);
995475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
996475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Extract G from src.
997c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
998475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sg1 = _mm_srli_epi32(sg1, 24);
999c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1000475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sg2 = _mm_srli_epi32(sg2, 24);
1001475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sg = _mm_packs_epi32(sg1, sg2);
1002475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1003475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Extract B from src.
1004c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1005475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sb1 = _mm_srli_epi32(sb1, 24);
1006c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1007475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            sb2 = _mm_srli_epi32(sb2, 24);
1008475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i sb = _mm_packs_epi32(sb1, sb2);
1009475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1010475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Extract R G B from dst.
1011c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
1012475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dr = _mm_and_si128(dr, r16_mask);
1013c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
1014475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dg = _mm_and_si128(dg, g16_mask);
1015c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
1016475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            db = _mm_and_si128(db, b16_mask);
1017475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1018475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
1019475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1020475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Calculate R G B of result.
1021475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Original algorithm is in SkSrcOver32To16().
1022c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
1023475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
1024c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
1025475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
1026c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
1027475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
1028475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1029475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Pack R G B into 16-bit color.
1030c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
1031475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1032475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            // Store 8 16-bit colors in dst.
1033475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            _mm_store_si128(d++, d_pixel);
1034475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            count -= 8;
1035475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        }
1036475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1037475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
1038475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        dst = reinterpret_cast<uint16_t*>(d);
1039475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    }
1040475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org
1041475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    if (count > 0) {
1042475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        do {
1043475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            SkPMColor c = *src++;
1044475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            SkPMColorAssert(c);
1045475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            if (c) {
1046475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org                *dst = SkSrcOver32To16(c, *dst);
1047475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            }
1048475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org            dst += 1;
1049475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org        } while (--count != 0);
1050475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org    }
1051475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org}
1052275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1053275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.orgvoid S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
1054275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org                                 const SkPMColor* SK_RESTRICT src,
1055275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org                                 int count, U8CPU alpha, int x, int y) {
1056275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org    SkASSERT(255 == alpha);
1057275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1058275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org    if (count <= 0) {
1059275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        return;
1060275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org    }
1061275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1062275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org    if (count >= 8) {
1063275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        while (((size_t)dst & 0x0F) != 0) {
1064275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            DITHER_565_SCAN(y);
1065275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            SkPMColor c = *src++;
1066275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            SkPMColorAssert(c);
1067275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1068275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            unsigned dither = DITHER_VALUE(x);
1069275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            *dst++ = SkDitherRGB32To565(c, dither);
1070275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            DITHER_INC_X(x);
1071275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            count--;
1072275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        }
1073275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1074275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        unsigned short dither_value[8];
1075275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        __m128i dither;
1076275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org#ifdef ENABLE_DITHER_MATRIX_4X4
1077275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
1078275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
1079275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
1080275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
1081275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
1082275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org#else
1083275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
1084275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither_value[0] = dither_value[4] = (dither_scan
1085275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org                                             >> (((x) & 3) << 2)) & 0xF;
1086275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither_value[1] = dither_value[5] = (dither_scan
1087275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org                                             >> (((x + 1) & 3) << 2)) & 0xF;
1088275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither_value[2] = dither_value[6] = (dither_scan
1089275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org                                             >> (((x + 2) & 3) << 2)) & 0xF;
1090275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither_value[3] = dither_value[7] = (dither_scan
1091275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org                                             >> (((x + 3) & 3) << 2)) & 0xF;
1092275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org#endif
1093275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dither = _mm_loadu_si128((__m128i*) dither_value);
1094275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1095275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        const __m128i* s = reinterpret_cast<const __m128i*>(src);
1096275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        __m128i* d = reinterpret_cast<__m128i*>(dst);
1097275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1098275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        while (count >= 8) {
1099275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            // Load 8 pixels of src.
1100275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i src_pixel1 = _mm_loadu_si128(s++);
1101275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i src_pixel2 = _mm_loadu_si128(s++);
1102275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1103275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            // Extract R from src.
1104275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1105275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sr1 = _mm_srli_epi32(sr1, 24);
1106275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1107275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sr2 = _mm_srli_epi32(sr2, 24);
1108275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sr = _mm_packs_epi32(sr1, sr2);
1109275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1110275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            // SkDITHER_R32To565(sr, dither)
1111275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sr_offset = _mm_srli_epi16(sr, 5);
1112275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sr = _mm_add_epi16(sr, dither);
1113275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sr = _mm_sub_epi16(sr, sr_offset);
1114275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
1115275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1116275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            // Extract G from src.
1117275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1118275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sg1 = _mm_srli_epi32(sg1, 24);
1119275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1120275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sg2 = _mm_srli_epi32(sg2, 24);
1121275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sg = _mm_packs_epi32(sg1, sg2);
1122275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1123275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            // SkDITHER_R32To565(sg, dither)
1124275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sg_offset = _mm_srli_epi16(sg, 6);
1125275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
1126275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sg = _mm_sub_epi16(sg, sg_offset);
1127275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
1128275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1129275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            // Extract B from src.
1130275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1131275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sb1 = _mm_srli_epi32(sb1, 24);
1132275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1133275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sb2 = _mm_srli_epi32(sb2, 24);
1134275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sb = _mm_packs_epi32(sb1, sb2);
1135275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1136275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            // SkDITHER_R32To565(sb, dither)
1137275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            __m128i sb_offset = _mm_srli_epi16(sb, 5);
1138275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sb = _mm_add_epi16(sb, dither);
1139275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sb = _mm_sub_epi16(sb, sb_offset);
1140275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
1141275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1142275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            // Pack and store 16-bit dst pixel.
1143c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
1144275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            _mm_store_si128(d++, d_pixel);
1145275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1146275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            count -= 8;
1147275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            x += 8;
1148275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        }
1149275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1150275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
1151275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        dst = reinterpret_cast<uint16_t*>(d);
1152275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org    }
1153275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1154275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org    if (count > 0) {
1155275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        DITHER_565_SCAN(y);
1156275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        do {
1157275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            SkPMColor c = *src++;
1158275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            SkPMColorAssert(c);
1159275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org
1160275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            unsigned dither = DITHER_VALUE(x);
1161275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            *dst++ = SkDitherRGB32To565(c, dither);
1162275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org            DITHER_INC_X(x);
1163275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org        } while (--count != 0);
1164275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org    }
1165275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org}
1166fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1167fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org/* SSE2 version of S32A_D565_Opaque_Dither()
1168fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org * portable version is in core/SkBlitRow_D16.cpp
1169fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org */
1170fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.orgvoid S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
1171fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                                  const SkPMColor* SK_RESTRICT src,
1172fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                                  int count, U8CPU alpha, int x, int y) {
1173fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org    SkASSERT(255 == alpha);
1174fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1175fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org    if (count <= 0) {
1176fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        return;
1177fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org    }
1178fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1179fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org    if (count >= 8) {
1180fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        while (((size_t)dst & 0x0F) != 0) {
1181fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            DITHER_565_SCAN(y);
1182fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            SkPMColor c = *src++;
1183fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            SkPMColorAssert(c);
1184fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            if (c) {
1185fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                unsigned a = SkGetPackedA32(c);
1186fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1187fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
1188fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1189fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                unsigned sr = SkGetPackedR32(c);
1190fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                unsigned sg = SkGetPackedG32(c);
1191fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                unsigned sb = SkGetPackedB32(c);
1192fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                sr = SkDITHER_R32_FOR_565(sr, d);
1193fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                sg = SkDITHER_G32_FOR_565(sg, d);
1194fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                sb = SkDITHER_B32_FOR_565(sb, d);
1195fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1196fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1197fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1198fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1199fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                // now src and dst expanded are in g:11 r:10 x:1 b:10
1200fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1201fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            }
1202fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            dst += 1;
1203fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            DITHER_INC_X(x);
1204fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            count--;
1205fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        }
1206fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1207fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        unsigned short dither_value[8];
1208fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        __m128i dither, dither_cur;
1209fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org#ifdef ENABLE_DITHER_MATRIX_4X4
1210fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
1211fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
1212fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
1213fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
1214fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
1215fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org#else
1216fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
1217fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        dither_value[0] = dither_value[4] = (dither_scan
1218fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                                             >> (((x) & 3) << 2)) & 0xF;
1219fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        dither_value[1] = dither_value[5] = (dither_scan
1220fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                                             >> (((x + 1) & 3) << 2)) & 0xF;
1221fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        dither_value[2] = dither_value[6] = (dither_scan
1222fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                                             >> (((x + 2) & 3) << 2)) & 0xF;
1223fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        dither_value[3] = dither_value[7] = (dither_scan
1224fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                                             >> (((x + 3) & 3) << 2)) & 0xF;
1225fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org#endif
1226fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        dither = _mm_loadu_si128((__m128i*) dither_value);
1227fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1228fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        const __m128i* s = reinterpret_cast<const __m128i*>(src);
1229fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        __m128i* d = reinterpret_cast<__m128i*>(dst);
1230fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        __m128i var256 = _mm_set1_epi16(256);
1231fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
1232fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
1233fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
1234fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1235fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        while (count >= 8) {
1236fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            // Load 8 pixels of src and dst.
1237fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i src_pixel1 = _mm_loadu_si128(s++);
1238fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i src_pixel2 = _mm_loadu_si128(s++);
1239fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i dst_pixel = _mm_load_si128(d);
1240fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1241fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            // Extract A from src.
1242c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
1243fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            sa1 = _mm_srli_epi32(sa1, 24);
1244c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
1245fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            sa2 = _mm_srli_epi32(sa2, 24);
1246fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i sa = _mm_packs_epi32(sa1, sa2);
1247fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1248fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            // Calculate current dither value.
1249fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            dither_cur = _mm_mullo_epi16(dither,
1250fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                                         _mm_add_epi16(sa, _mm_set1_epi16(1)));
1251fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            dither_cur = _mm_srli_epi16(dither_cur, 8);
1252fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1253fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            // Extract R from src.
1254fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1255fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            sr1 = _mm_srli_epi32(sr1, 24);
1256fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1257fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            sr2 = _mm_srli_epi32(sr2, 24);
1258fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i sr = _mm_packs_epi32(sr1, sr2);
1259fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1260fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            // sr = SkDITHER_R32_FOR_565(sr, d).
1261fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i sr_offset = _mm_srli_epi16(sr, 5);
1262fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            sr = _mm_add_epi16(sr, dither_cur);
1263fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            sr = _mm_sub_epi16(sr, sr_offset);
1264fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1265fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            // Expand sr.
1266fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            sr = _mm_slli_epi16(sr, 2);
1267fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1268fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            // Extract G from src.
1269fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1270fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            sg1 = _mm_srli_epi32(sg1, 24);
1271fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1272fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            sg2 = _mm_srli_epi32(sg2, 24);
1273fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i sg = _mm_packs_epi32(sg1, sg2);
1274fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1275fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            // sg = SkDITHER_G32_FOR_565(sg, d).
1276fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i sg_offset = _mm_srli_epi16(sg, 6);
1277fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
1278fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            sg = _mm_sub_epi16(sg, sg_offset);
1279fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1280fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            // Expand sg.
1281fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            sg = _mm_slli_epi16(sg, 3);
1282fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1283fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            // Extract B from src.
1284fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1285fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            sb1 = _mm_srli_epi32(sb1, 24);
1286fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1287fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            sb2 = _mm_srli_epi32(sb2, 24);
1288fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i sb = _mm_packs_epi32(sb1, sb2);
1289fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1290fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            // sb = SkDITHER_B32_FOR_565(sb, d).
1291fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i sb_offset = _mm_srli_epi16(sb, 5);
1292fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            sb = _mm_add_epi16(sb, dither_cur);
1293fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            sb = _mm_sub_epi16(sb, sb_offset);
1294fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1295fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            // Expand sb.
1296fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            sb = _mm_slli_epi16(sb, 2);
1297fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1298fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            // Extract R G B from dst.
1299fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
1300fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            dr = _mm_and_si128(dr, r16_mask);
1301fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
1302fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            dg = _mm_and_si128(dg, g16_mask);
1303fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
1304fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            db = _mm_and_si128(db, b16_mask);
1305fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1306fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            // SkAlpha255To256(255 - a) >> 3
1307fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            __m128i isa = _mm_sub_epi16(var256, sa);
1308fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            isa = _mm_srli_epi16(isa, 3);
1309fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1310fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            dr = _mm_mullo_epi16(dr, isa);
1311fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            dr = _mm_add_epi16(dr, sr);
1312fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            dr = _mm_srli_epi16(dr, 5);
1313fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1314fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            dg = _mm_mullo_epi16(dg, isa);
1315fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            dg = _mm_add_epi16(dg, sg);
1316fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            dg = _mm_srli_epi16(dg, 5);
1317fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1318fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            db = _mm_mullo_epi16(db, isa);
1319fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            db = _mm_add_epi16(db, sb);
1320fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            db = _mm_srli_epi16(db, 5);
1321fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1322fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            // Package and store dst pixel.
1323c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
1324fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            _mm_store_si128(d++, d_pixel);
1325fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1326fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            count -= 8;
1327fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            x += 8;
1328fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        }
1329fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1330fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        src = reinterpret_cast<const SkPMColor*>(s);
1331fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        dst = reinterpret_cast<uint16_t*>(d);
1332fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org    }
1333fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1334fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org    if (count > 0) {
1335fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        DITHER_565_SCAN(y);
1336fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        do {
1337fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            SkPMColor c = *src++;
1338fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            SkPMColorAssert(c);
1339fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            if (c) {
1340fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                unsigned a = SkGetPackedA32(c);
1341fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1342fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
1343fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1344fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                unsigned sr = SkGetPackedR32(c);
1345fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                unsigned sg = SkGetPackedG32(c);
1346fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                unsigned sb = SkGetPackedB32(c);
1347fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                sr = SkDITHER_R32_FOR_565(sr, d);
1348fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                sg = SkDITHER_G32_FOR_565(sg, d);
1349fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                sb = SkDITHER_B32_FOR_565(sb, d);
1350fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org
1351fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1352fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1353fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1354fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                // now src and dst expanded are in g:11 r:10 x:1 b:10
1355fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1356fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            }
1357fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            dst += 1;
1358fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org            DITHER_INC_X(x);
1359fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org        } while (--count != 0);
1360fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org    }
1361fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org}
1362